[superlu-dist] 01/01: New upstream version 5.1.3

Wed Sep 13 04:02:11 UTC 2017

This is an automated email from the git hooks/post-receive script.

dparsons pushed a commit to branch master
in repository superlu-dist.

commit 18be632b19046f72c256af8150b09e1a462d22c1
Author: Drew Parsons <dparsons at emerall.com>
Date:   Wed Sep 13 12:01:26 2017 +0800

    New upstream version 5.1.3
---
 .gitignore                           |     9 +
 CBLAS/CMakeLists.txt                 |    50 +
 CBLAS/Makefile                       |    93 +
 CBLAS/caxpy.c                        |    90 +
 CBLAS/ccopy.c                        |    74 +
 CBLAS/cdotc.c                        |    87 +
 CBLAS/cgemv.c                        |   398 ++
 CBLAS/cgerc.c                        |   205 +
 CBLAS/cgeru.c                        |   202 +
 CBLAS/chemv.c                        |   420 ++
 CBLAS/cher2.c                        |   435 ++
 CBLAS/cscal.c                        |    70 +
 CBLAS/ctrsv.c                        |   508 ++
 CBLAS/dasum.c                        |    88 +
 CBLAS/daxpy.c                        |    94 +
 CBLAS/dcabs1.c                       |    28 +
 CBLAS/dcopy.c                        |    94 +
 CBLAS/ddot.c                         |    97 +
 CBLAS/dgemm.c                        |   393 ++
 CBLAS/dgemv.c                        |   298 +
 CBLAS/dger.c                         |   182 +
 CBLAS/dnrm2.c                        |    83 +
 CBLAS/drot.c                         |    76 +
 CBLAS/dscal.c                        |    83 +
 CBLAS/dsymv.c                        |   299 +
 CBLAS/dsyr2.c                        |   263 +
 CBLAS/dtrsm.c                        |   481 ++
 CBLAS/dtrsv.c                        |   337 +
 CBLAS/dzasum.c                       |    68 +
 CBLAS/dznrm2.c                       |    96 +
 CBLAS/f2c.h                          |    41 +
 CBLAS/icamax.c                       |    72 +
 CBLAS/idamax.c                       |    80 +
 CBLAS/input_error_dist.c             |    39 +
 CBLAS/isamax.c                       |    80 +
 CBLAS/izamax.c                       |    81 +
 CBLAS/sasum.c                        |    89 +
 CBLAS/saxpy.c                        |    94 +
 CBLAS/scasum.c                       |    74 +
 CBLAS/scnrm2.c                       |    96 +
 CBLAS/scopy.c                        |    94 +
 CBLAS/sdot.c                         |    96 +
 CBLAS/sgemv.c                        |   298 +
 CBLAS/sger.c                         |   181 +
 CBLAS/snrm2.c                        |    83 +
 CBLAS/srot.c                         |    76 +
 CBLAS/sscal.c                        |    82 +
 CBLAS/ssymv.c                        |   299 +
 CBLAS/ssyr2.c                        |   262 +
 CBLAS/strsv.c                        |   337 +
 CBLAS/superlu_f2c.h                  |    43 +
 CBLAS/z_internal.c                   |    45 +
 CBLAS/zaxpy.c                        |    87 +
 CBLAS/zcopy.c                        |    74 +
 CBLAS/zdotc.c                        |    85 +
 CBLAS/zgemm.c                        |   689 ++
 CBLAS/zgemv.c                        |   399 ++
 CBLAS/zgerc.c                        |   206 +
 CBLAS/zgeru.c                        |   203 +
 CBLAS/zhemv.c                        |   420 ++
 CBLAS/zher2.c                        |   436 ++
 CBLAS/zscal.c                        |    70 +
 CBLAS/ztrsm.c                        |   691 ++
 CBLAS/ztrsv.c                        |   509 ++
 CMakeLists.txt                       |   217 +
 DOC/ug.pdf                           |   Bin 0 -> 687318 bytes
 DoxyConfig                           |  1356 ++++
 EXAMPLE/CMakeLists.txt               |   120 +
 EXAMPLE/Makefile                     |   144 +
 EXAMPLE/README                       |    52 +
 EXAMPLE/big.rua                      | 11496 +++++++++++++++++++++++++++++++++
 EXAMPLE/cg20.cua                     |   918 +++
 EXAMPLE/dcreate_matrix.c             |   230 +
 EXAMPLE/dcreate_matrix_perturbed.c   |   230 +
 EXAMPLE/dreadhb.c                    |   389 ++
 EXAMPLE/dreadtriple.c                |   180 +
 EXAMPLE/g20.rua                      |   534 ++
 EXAMPLE/g4.rua                       |    21 +
 EXAMPLE/pddrive.c                    |   234 +
 EXAMPLE/pddrive1.c                   |   247 +
 EXAMPLE/pddrive1_ABglobal.c          |   285 +
 EXAMPLE/pddrive2.c                   |   273 +
 EXAMPLE/pddrive2_ABglobal.c          |   305 +
 EXAMPLE/pddrive3.c                   |   277 +
 EXAMPLE/pddrive3_ABglobal.c          |   310 +
 EXAMPLE/pddrive4.c                   |   288 +
 EXAMPLE/pddrive4_ABglobal.c          |   364 ++
 EXAMPLE/pddrive_ABglobal.c           |   264 +
 EXAMPLE/pzdrive.c                    |   233 +
 EXAMPLE/pzdrive1.c                   |   246 +
 EXAMPLE/pzdrive1_ABglobal.c          |   284 +
 EXAMPLE/pzdrive2.c                   |   272 +
 EXAMPLE/pzdrive2_ABglobal.c          |   304 +
 EXAMPLE/pzdrive3.c                   |   276 +
 EXAMPLE/pzdrive3_ABglobal.c          |   309 +
 EXAMPLE/pzdrive4.c                   |   287 +
 EXAMPLE/pzdrive4_ABglobal.c          |   363 ++
 EXAMPLE/pzdrive_ABglobal.c           |   260 +
 EXAMPLE/pzgsmv.c                     |   374 ++
 EXAMPLE/pzgstrs_Bglobal_Bsend.c      |  1031 +++
 EXAMPLE/pzgstrs_lsum_Bsend.c         |   423 ++
 EXAMPLE/pzutil.c                     |   549 ++
 EXAMPLE/sp_ienv.c                    |   119 +
 EXAMPLE/zcreate_matrix.c             |   229 +
 EXAMPLE/zcreate_matrix_perturbed.c   |   229 +
 EXAMPLE/zlook_ahead_update.c         |   230 +
 EXAMPLE/zreadhb.c                    |   292 +
 EXAMPLE/zreadtriple.c                |   177 +
 FORTRAN/Makefile                     |    48 +
 FORTRAN/README                       |    28 +
 FORTRAN/c_fortran_pdgssvx_ABglobal.c |   215 +
 FORTRAN/c_fortran_slugrid.c          |    56 +
 FORTRAN/dcreate_dist_matrix.c        |   206 +
 FORTRAN/dhbcode1.f90                 |    50 +
 FORTRAN/f_5x5.f90                    |   226 +
 FORTRAN/f_pddrive.f90                |   161 +
 FORTRAN/f_pddrive_ABglobal.f         |    76 +
 FORTRAN/f_pddrive_old.f90            |   159 +
 FORTRAN/f_pzdrive.f90                |   160 +
 FORTRAN/hbcode1.f.bak                |    46 +
 FORTRAN/sp_ienv.c                    |   121 +
 FORTRAN/superlu_c2f_dwrap.c          |   332 +
 FORTRAN/superlu_c2f_zwrap.c          |   331 +
 FORTRAN/superlu_mod.f90              |   163 +
 FORTRAN/superlupara.f90              |    91 +
 FORTRAN/zcreate_dist_matrix.c        |   205 +
 FORTRAN/zhbcode1.f90                 |    50 +
 INSTALL/Makefile                     |    26 +
 INSTALL/dmachtst.c                   |    34 +
 INSTALL/install.csh                  |    14 +
 INSTALL/smachtst.c                   |    34 +
 INSTALL/superlu_timer.c              |    54 +
 INSTALL/timertst.c                   |    72 +
 License.txt                          |    29 +
 MAKE_INC/make.altix                  |    77 +
 MAKE_INC/make.carver                 |    91 +
 MAKE_INC/make.cuda_gpu               |    91 +
 MAKE_INC/make.i386_linux             |    78 +
 MAKE_INC/make.mpich                  |    48 +
 MAKE_INC/make.opteron                |    78 +
 MAKE_INC/make.origin                 |    80 +
 MAKE_INC/make.sp                     |    80 +
 MAKE_INC/make.sp.64bit               |    85 +
 MAKE_INC/make.t3e                    |    73 +
 MAKE_INC/make.xc30                   |    83 +
 MAKE_INC/make.xe6                    |    79 +
 MAKE_INC/make.xt4                    |    66 +
 MAKE_INC/make.xt4.64bit              |    75 +
 MAKE_INC/make.xt4_pathscale          |    75 +
 MAKE_INC/make.xt4_pgi                |    75 +
 MAKE_INC/make.xt5                    |    78 +
 Makefile                             |    45 +
 README                               |   251 +
 SRC/CMakeLists.txt                   |   127 +
 SRC/Cnames.h                         |   365 ++
 SRC/Makefile                         |    91 +
 SRC/comm.c                           |   124 +
 SRC/cublas_utils.c                   |   109 +
 SRC/cublas_utils.h                   |    34 +
 SRC/dSchCompUdt-2Ddynamic.c          |   525 ++
 SRC/dSchCompUdt-cuda.c               |   550 ++
 SRC/dcomplex.h                       |    81 +
 SRC/dcomplex_dist.c                  |    94 +
 SRC/ddistribute.c                    |   750 +++
 SRC/dgsequ_dist.c                    |   193 +
 SRC/dlangs_dist.c                    |   121 +
 SRC/dlaqgs_dist.c                    |   143 +
 SRC/dldperm_dist.c                   |   172 +
 SRC/dlook_ahead_update.c             |   251 +
 SRC/dmach_dist.c                     |    94 +
 SRC/dmemory.patch                    |     8 +
 SRC/dmemory_dist.c                   |   169 +
 SRC/dmyblas2_dist.c                  |   248 +
 SRC/dreadMM.c                        |   243 +
 SRC/dreadhb.c                        |   389 ++
 SRC/dreadrb.c                        |   347 +
 SRC/dreadtriple.c                    |   180 +
 SRC/dreadtriple_noheader.c           |   199 +
 SRC/dscatter.c                       |   516 ++
 SRC/dsp_blas2_dist.c                 |   502 ++
 SRC/dsp_blas3_dist.c                 |   135 +
 SRC/dutil_dist.c                     |   614 ++
 SRC/etree.c                          |   431 ++
 SRC/get_perm_c.c                     |   544 ++
 SRC/get_perm_c_parmetis.c            |   920 +++
 SRC/html_mainpage.h                  |    20 +
 SRC/machines.h                       |    63 +
 SRC/mc64ad_dist.c                    |  2654 ++++++++
 SRC/memory.c                         |   580 ++
 SRC/memory.patch                     |    10 +
 SRC/mmd.c                            |  1025 +++
 SRC/old_colamd.c                     |  2596 ++++++++
 SRC/old_colamd.h                     |    86 +
 SRC/pdGetDiagU.c                     |   121 +
 SRC/pddistribute.c                   |  1071 +++
 SRC/pdgsequ.c                        |   244 +
 SRC/pdgsmv.c                         |   383 ++
 SRC/pdgsmv_AXglobal.c                |   324 +
 SRC/pdgsrfs.c                        |   262 +
 SRC/pdgsrfs_ABXglobal.c              |   465 ++
 SRC/pdgssvx.c                        |  1463 +++++
 SRC/pdgssvx_ABglobal.c               |  1105 ++++
 SRC/pdgstrf.c                        |  1820 ++++++
 SRC/pdgstrf2.c                       |   375 ++
 SRC/pdgstrf_X1.c                     |  1347 ++++
 SRC/pdgstrf_irecv.c                  |  1345 ++++
 SRC/pdgstrf_sherry.c                 |  1389 ++++
 SRC/pdgstrs.c                        |  1341 ++++
 SRC/pdgstrs1.c                       |   910 +++
 SRC/pdgstrsL.c                       |   848 +++
 SRC/pdgstrs_Bglobal.c                |  1040 +++
 SRC/pdgstrs_Bglobal_Bsend.c          |  1017 +++
 SRC/pdgstrs_lsum.c                   |   374 ++
 SRC/pdlangs.c                        |   145 +
 SRC/pdlaqgs.c                        |   151 +
 SRC/pdsymbfact_distdata.c            |  1974 ++++++
 SRC/pdutil.c                         |   538 ++
 SRC/psymbfact.c                      |  5225 +++++++++++++++
 SRC/psymbfact.h                      |   302 +
 SRC/psymbfact_util.c                 |   552 ++
 SRC/pxerr_dist.c                     |    32 +
 SRC/pzGetDiagU.c                     |   120 +
 SRC/pzdistribute.c                   |  1070 +++
 SRC/pzgsequ.c                        |   243 +
 SRC/pzgsmv.c                         |   385 ++
 SRC/pzgsmv_AXglobal.c                |   327 +
 SRC/pzgsrfs.c                        |   263 +
 SRC/pzgsrfs_ABXglobal.c              |   470 ++
 SRC/pzgssvx.c                        |  1464 +++++
 SRC/pzgssvx_ABglobal.c               |  1104 ++++
 SRC/pzgstrf.c                        |  1820 ++++++
 SRC/pzgstrf2.c                       |   376 ++
 SRC/pzgstrf_irecv.c                  |  1296 ++++
 SRC/pzgstrs.c                        |  1350 ++++
 SRC/pzgstrs1.c                       |   913 +++
 SRC/pzgstrs_Bglobal.c                |  1050 +++
 SRC/pzgstrs_lsum.c                   |   385 ++
 SRC/pzlangs.c                        |   144 +
 SRC/pzlaqgs.c                        |   152 +
 SRC/pzsymbfact_distdata.c            |  1973 ++++++
 SRC/pzutil.c                         |   539 ++
 SRC/smach_dist.c                     |    94 +
 SRC/sp_colorder.c                    |   243 +
 SRC/sp_ienv.c                        |   121 +
 SRC/static_schedule.c                |   968 +++
 SRC/superlu_ddefs.h                  |   382 ++
 SRC/superlu_defs.h                   |   764 +++
 SRC/superlu_enum_consts.h            |    81 +
 SRC/superlu_grid.c                   |   178 +
 SRC/superlu_timer.c                  |    78 +
 SRC/superlu_zdefs.h                  |   385 ++
 SRC/supermatrix.h                    |   191 +
 SRC/symbfact.c                       |   901 +++
 SRC/util.c                           |  1181 ++++
 SRC/util_dist.h                      |   147 +
 SRC/xerr_dist.c                      |    33 +
 SRC/zSchCompUdt-2Ddynamic.c          |   524 ++
 SRC/zSchCompUdt-cuda.c               |   553 ++
 SRC/zdistribute.c                    |   749 +++
 SRC/zdistribute_mark.c               |   711 ++
 SRC/zgsequ_dist.c                    |   193 +
 SRC/zlangs_dist.c                    |   118 +
 SRC/zlaqgs_dist.c                    |   145 +
 SRC/zldperm_dist.c                   |   174 +
 SRC/zlook_ahead_update.c             |   250 +
 SRC/zmemory_dist.c                   |   168 +
 SRC/zmyblas2_dist.c                  |   208 +
 SRC/zreadMM.c                        |   240 +
 SRC/zreadhb.c                        |   292 +
 SRC/zreadrb.c                        |   355 +
 SRC/zreadtriple.c                    |   177 +
 SRC/zreadtriple_noheader.c           |   198 +
 SRC/zscatter.c                       |   516 ++
 SRC/zsp_blas2_dist.c                 |   515 ++
 SRC/zsp_blas3_dist.c                 |   136 +
 SRC/zutil_dist.c                     |   497 ++
 cmake/XSDKDefaults.cmake             |   182 +
 make.inc.in                          |    39 +
 run_cmake_build.csh                  |    56 +
 279 files changed, 112681 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2eb65d5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+*~
+
+# You have to ignore this generated file or git will complain that it is an
+# unknown file!
+/make.inc
+
+# If the instructions are telling people to create this build dir under the
+# source tree, you had better put in an ignore for this.
+/build/
diff --git a/CBLAS/CMakeLists.txt b/CBLAS/CMakeLists.txt
new file mode 100644
index 0000000..3d259fe
--- /dev/null
+++ b/CBLAS/CMakeLists.txt
@@ -0,0 +1,50 @@
+set(headers  # headers installed next to the library (see install(FILES) below)
+    f2c.h
+)
+set(sources input_error_dist.c)  # always built, independent of the precision switches
+
+if (enable_double)  # double-precision real routines
+    list(APPEND sources
+      idamax.c
+      dasum.c
+      daxpy.c
+      dcopy.c
+      ddot.c
+      dnrm2.c
+      drot.c
+      dscal.c
+      dgemv.c
+      dsymv.c
+      dtrsv.c
+      dger.c
+      dsyr2.c
+      dgemm.c
+      dtrsm.c
+    )
+endif()
+
+if (enable_complex16)  # double-precision complex routines
+    list(APPEND sources
+      izamax.c
+      dzasum.c
+      zaxpy.c
+      zcopy.c
+      dznrm2.c
+      zscal.c
+      dcabs1.c
+      z_internal.c
+      zgemv.c
+      zhemv.c
+      ztrsv.c
+      zgerc.c
+      zgeru.c
+      zher2.c
+      zgemm.c
+      ztrsm.c
+    )
+endif()
+
+add_library(blas ${sources} ${headers})  # variable names are case-sensitive: the list is "headers"
+
+install(TARGETS blas DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
+install(FILES ${headers} DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
diff --git a/CBLAS/Makefile b/CBLAS/Makefile
new file mode 100644
index 0000000..5812c03
--- /dev/null
+++ b/CBLAS/Makefile
@@ -0,0 +1,93 @@
+include ../make.inc
+#HEADER = ../SRC
+# NOTE(review): $(HEADER) is still used by the .c.o rule below; presumably make.inc sets it -- confirm.
+#######################################################################
+#  This is the makefile to create a library for C-BLAS.
+#  The files are organized as follows:
+#
+#       SBLAS1 -- Single precision real BLAS routines
+#       CBLAS1 -- Single precision complex BLAS routines
+#       DBLAS1 -- Double precision real BLAS routines
+#       ZBLAS1 -- Double precision complex BLAS routines
+#
+#       CB1AUX -- Real BLAS routines called by complex routines
+#       ZB1AUX -- D.P. real BLAS routines called by d.p. complex
+#                 routines
+#
+#      ALLBLAS -- Auxiliary routines for Level 2 and 3 BLAS
+#
+#       SBLAS2 -- Single precision real BLAS2 routines
+#       CBLAS2 -- Single precision complex BLAS2 routines
+#       DBLAS2 -- Double precision real BLAS2 routines
+#       ZBLAS2 -- Double precision complex BLAS2 routines
+#
+#       SBLAS3 -- Single precision real BLAS3 routines
+#       CBLAS3 -- Single precision complex BLAS3 routines
+#       DBLAS3 -- Double precision real BLAS3 routines
+#       ZBLAS3 -- Double precision complex BLAS3 routines
+#
+#  The library can be set up to include routines for any combination
+#  of the four precisions.  To create or add to the library, enter make
+#  followed by one or more of the precisions desired.  Some examples:
+#       make single
+#       make single complex
+#       make single double complex complex16
+#  Alternatively, the command
+#       make
+#  without any arguments creates a library of all four precisions.
+#  The library is called
+#       blas.a
+#  and is created at the next higher directory level.
+#
+#  To remove the object files after the library is created, enter
+#       make clean
+#
+#######################################################################
+# NOTE(review): SBLAS3 and CBLAS3 are never assigned in this file, so they expand to nothing unless make.inc sets them.
+SBLAS1 = isamax.o sasum.o saxpy.o scopy.o sdot.o snrm2.o \
+	 srot.o sscal.o
+SBLAS2 = sgemv.o ssymv.o strsv.o sger.o ssyr2.o
+
+DBLAS1 = idamax.o dasum.o daxpy.o dcopy.o ddot.o dnrm2.o \
+	 drot.o dscal.o
+DBLAS2 = dgemv.o dsymv.o dtrsv.o dger.o dsyr2.o
+DBLAS3 = dgemm.o dtrsm.o
+
+CBLAS1 = icamax.o scasum.o caxpy.o ccopy.o scnrm2.o \
+	 cscal.o
+CBLAS2 = cgemv.o chemv.o ctrsv.o cgerc.o cgeru.o cher2.o
+
+ZBLAS1 = izamax.o dzasum.o zaxpy.o zcopy.o dznrm2.o \
+	 zscal.o dcabs1.o z_internal.o
+ZBLAS2 = zgemv.o zhemv.o ztrsv.o zgerc.o zgeru.o zher2.o
+ZBLAS3 = zgemm.o ztrsm.o
+
+ALLBLAS = input_error_dist.o
+
+all: single double complex complex16
+
+single: $(SBLAS1) $(SBLAS2) $(SBLAS3)
+	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(SBLAS1) $(ALLBLAS) \
+	$(SBLAS2) $(SBLAS3)
+	$(RANLIB) $(BLASLIB)
+
+double: $(DBLAS1) $(DBLAS2) $(DBLAS3)
+	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(DBLAS1) $(ALLBLAS) \
+	$(DBLAS2) $(DBLAS3)
+	$(RANLIB) $(BLASLIB)
+
+complex: $(CBLAS1) $(CBLAS2) $(CBLAS3)
+	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(CBLAS1) $(ALLBLAS) \
+	$(CBLAS2) $(CBLAS3)
+	$(RANLIB) $(BLASLIB)
+
+complex16: $(ZBLAS1) $(ZBLAS2) $(ZBLAS3)
+	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(ZBLAS1) $(ALLBLAS) \
+	$(ZBLAS2) $(ZBLAS3)
+	$(RANLIB) $(BLASLIB)
+
+.c.o:
+	$(CC) $(CFLAGS) $(CDEFS) -I$(HEADER) -c $< $(VERBOSE)
+
+clean:	
+	rm -f *.o
diff --git a/CBLAS/caxpy.c b/CBLAS/caxpy.c
new file mode 100644
index 0000000..96aa7db
--- /dev/null
+++ b/CBLAS/caxpy.c
@@ -0,0 +1,90 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int caxpy_(integer *n, complex *ca, complex *cx, integer *
+	incx, complex *cy, integer *incy)
+{
+
+
+    /* System generated locals */
+    integer i__1, i__2, i__3, i__4;
+    real r__1, r__2;
+    complex q__1, q__2;
+
+    /* Builtin functions */
+    double r_imag(complex *);
+
+    /* Local variables (automatic, not static, so the routine is reentrant) */
+    integer i, ix, iy;
+
+
+/*     Computes cy := ca*cx + cy over n complex elements (constant times
+       a vector plus a vector) and returns 0.  Returns immediately when
+       n <= 0 or when |ca->r| + |r_imag(ca)| == 0.  incx and incy are
+       the strides through cx and cy; for a negative stride the walk
+       starts at index (1-n)*inc + 1.
+       jack dongarra, linpack, 3/11/78.
+       modified 12/3/93, array(1) declarations changed to array(*)
+       CX/CY below give 1-based (Fortran-style) indexing. */
+#define CY(I) cy[(I)-1]
+#define CX(I) cx[(I)-1]
+
+
+    if (*n <= 0) {
+	return 0;
+    }
+    if ((r__1 = ca->r, dabs(r__1)) + (r__2 = r_imag(ca), dabs(r__2)) == 0.f) {
+	return 0;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*        code for unequal increments or equal increments   
+            not equal to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    i__1 = *n;
+    for (i = 1; i <= *n; ++i) {
+	i__2 = iy;
+	i__3 = iy;
+	i__4 = ix;
+	q__2.r = ca->r * CX(ix).r - ca->i * CX(ix).i, q__2.i = ca->r * CX(
+		ix).i + ca->i * CX(ix).r;
+	q__1.r = CY(iy).r + q__2.r, q__1.i = CY(iy).i + q__2.i;
+	CY(iy).r = q__1.r, CY(iy).i = q__1.i;
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    return 0;
+
+/*        code for both increments equal to 1 */
+
+L20:
+    i__1 = *n;
+    for (i = 1; i <= *n; ++i) {
+	i__2 = i;
+	i__3 = i;
+	i__4 = i;
+	q__2.r = ca->r * CX(i).r - ca->i * CX(i).i, q__2.i = ca->r * CX(
+		i).i + ca->i * CX(i).r;
+	q__1.r = CY(i).r + q__2.r, q__1.i = CY(i).i + q__2.i;
+	CY(i).r = q__1.r, CY(i).i = q__1.i;
+/* L30: */
+    }
+    return 0;
+} /* caxpy_ */
+
diff --git a/CBLAS/ccopy.c b/CBLAS/ccopy.c
new file mode 100644
index 0000000..dbfd87b
--- /dev/null
+++ b/CBLAS/ccopy.c
@@ -0,0 +1,74 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int ccopy_(integer *n, complex *cx, integer *incx, complex *
+	cy, integer *incy)
+{
+
+
+    /* System generated locals */
+    integer i__1, i__2, i__3;
+
+    /* Local variables (automatic, not static, so the routine is reentrant) */
+    integer i, ix, iy;
+
+
+/*     Copies the n-element complex vector cx into cy and returns 0.
+       Nothing is copied when n <= 0.  incx and incy are the strides
+       through cx and cy; for a negative stride the walk starts at
+       index (1-n)*inc + 1.
+       jack dongarra, linpack, 3/11/78.
+       modified 12/3/93, array(1) declarations changed to array(*)
+
+       CX/CY below give 1-based (Fortran-style) indexing. */
+#define CY(I) cy[(I)-1]
+#define CX(I) cx[(I)-1]
+
+
+    if (*n <= 0) {
+	return 0;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*        code for unequal increments or equal increments   
+            not equal to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    i__1 = *n;
+    for (i = 1; i <= *n; ++i) {
+	i__2 = iy;
+	i__3 = ix;
+	CY(iy).r = CX(ix).r, CY(iy).i = CX(ix).i;
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    return 0;
+
+/*        code for both increments equal to 1 */
+
+L20:
+    i__1 = *n;
+    for (i = 1; i <= *n; ++i) {
+	i__2 = i;
+	i__3 = i;
+	CY(i).r = CX(i).r, CY(i).i = CX(i).i;
+/* L30: */
+    }
+    return 0;
+} /* ccopy_ */
+
diff --git a/CBLAS/cdotc.c b/CBLAS/cdotc.c
new file mode 100644
index 0000000..c7a153f
--- /dev/null
+++ b/CBLAS/cdotc.c
@@ -0,0 +1,87 @@
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Complex */ VOID cdotc_(complex * ret_val, integer *n, complex *cx, integer 
+	*incx, complex *cy, integer *incy)
+{
+    /* System generated locals */
+    integer i__1, i__2;
+    complex q__1, q__2, q__3;
+
+    /* Builtin functions */
+    void r_cnjg(complex *, complex *);
+
+    /* Local variables (automatic, not static, so the routine is reentrant) */
+    integer i;
+    complex ctemp;
+    integer ix, iy;
+
+
+/*     Forms the dot product of two n-element complex vectors, conjugating
+       the first (cx), and stores it in *ret_val; *ret_val is (0,0) when
+       n <= 0.  incx/incy are the strides; a negative stride starts the
+       walk at index (1-n)*inc + 1.  r_cnjg is the libf2c helper, assumed
+       to write the conjugate of its second argument into its first.
+       jack dongarra, linpack,  3/11/78.
+       modified 12/3/93, array(1) declarations changed to array(*)
+   Parameter adjustments (shift both vectors to 1-based indexing) */
+    --cy;
+    --cx;
+
+    /* Function Body */
+    ctemp.r = 0.f, ctemp.i = 0.f;
+     ret_val->r = 0.f,  ret_val->i = 0.f;
+    if (*n <= 0) {
+	return ;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*        code for unequal increments or equal increments   
+            not equal to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    i__1 = *n;
+    for (i = 1; i <= *n; ++i) {
+	r_cnjg(&q__3, &cx[ix]);
+	i__2 = iy;
+	q__2.r = q__3.r * cy[iy].r - q__3.i * cy[iy].i, q__2.i = q__3.r * 
+		cy[iy].i + q__3.i * cy[iy].r;
+	q__1.r = ctemp.r + q__2.r, q__1.i = ctemp.i + q__2.i;
+	ctemp.r = q__1.r, ctemp.i = q__1.i;
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+     ret_val->r = ctemp.r,  ret_val->i = ctemp.i;
+    return ;
+
+/*        code for both increments equal to 1 */
+
+L20:
+    i__1 = *n;
+    for (i = 1; i <= *n; ++i) {
+	r_cnjg(&q__3, &cx[i]);
+	i__2 = i;
+	q__2.r = q__3.r * cy[i].r - q__3.i * cy[i].i, q__2.i = q__3.r * 
+		cy[i].i + q__3.i * cy[i].r;
+	q__1.r = ctemp.r + q__2.r, q__1.i = ctemp.i + q__2.i;
+	ctemp.r = q__1.r, ctemp.i = q__1.i;
+/* L30: */
+    }
+     ret_val->r = ctemp.r,  ret_val->i = ctemp.i;
+    return ;
+} /* cdotc_ */
+
diff --git a/CBLAS/cgemv.c b/CBLAS/cgemv.c
new file mode 100644
index 0000000..5cfa9ae
--- /dev/null
+++ b/CBLAS/cgemv.c
@@ -0,0 +1,398 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int cgemv_(char *trans, integer *m, integer *n, complex *
+	alpha, complex *a, integer *lda, complex *x, integer *incx, complex *
+	beta, complex *y, integer *incy)
+{
+
+
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+    complex q__1, q__2, q__3;
+
+    /* Builtin functions */
+    void r_cnjg(complex *, complex *);
+
+    /* Local variables (automatic, not static, so the routine is reentrant) */
+    integer info;
+    complex temp;
+    integer lenx, leny, i, j;
+    integer ix, iy, jx, jy, kx, ky;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+    logical noconj;
+
+
+/*  Purpose
+    =======
+
+    CGEMV  performs one of the matrix-vector operations
+
+       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   or
+
+       y := alpha*conjg( A' )*x + beta*y,
+
+    where alpha and beta are scalars, x and y are vectors and A is an
+    m by n matrix.
+
+    Parameters
+    ==========
+
+    TRANS  - CHARACTER*1.
+             On entry, TRANS specifies the operation to be performed as
+             follows:
+
+                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.
+
+                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.
+
+                TRANS = 'C' or 'c'   y := alpha*conjg( A' )*x + beta*y.
+
+             Unchanged on exit.
+
+    M      - INTEGER.
+             On entry, M specifies the number of rows of the matrix A.
+             M must be at least zero.
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the number of columns of the matrix A.
+
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - COMPLEX         .
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    A      - COMPLEX          array of DIMENSION ( LDA, n ).
+             Before entry, the leading m by n part of the array A must
+             contain the matrix of coefficients.
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+
+             in the calling (sub) program. LDA must be at least
+             max( 1, m ).
+             Unchanged on exit.
+
+    X      - COMPLEX          array of DIMENSION at least
+             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+             and at least
+             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+             Before entry, the incremented array X must contain the
+             vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    BETA   - COMPLEX         .
+             On entry, BETA specifies the scalar beta. When BETA is
+             supplied as zero then Y need not be set on input.
+             Unchanged on exit.
+
+    Y      - COMPLEX          array of DIMENSION at least
+             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+             and at least
+             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+             Before entry with BETA non-zero, the incremented array Y
+             must contain the vector y. On exit, Y is overwritten by the
+
+             updated vector y.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+    Returns 0 in every case.  An invalid argument is reported through
+    input_error_dist("CGEMV ", &info), info being the argument position.
+       Test the input parameters.
+
+
+   Parameter adjustments
+       Function Body */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+    a_dim1 = *lda; /* only feeds the unused f2c index temporaries below */
+    info = 0;
+    if (*trans != 'N' && *trans != 'n' && *trans != 'T' && *trans != 't' &&
+	*trans != 'C' && *trans != 'c') {
+	info = 1;
+    } else if (*m < 0) {
+	info = 2;
+    } else if (*n < 0) {
+	info = 3;
+    } else if (*lda < max(1,*m)) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    } else if (*incy == 0) {
+	info = 11;
+    }
+    if (info != 0) {
+	input_error_dist("CGEMV ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 || (alpha->r == 0.f && alpha->i == 0.f && beta->r
+	    == 1.f && beta->i == 0.f)) {
+	return 0;
+    }
+
+    noconj = (*trans == 'T' || *trans == 't');
+
+/*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set
+
+       up the start points in  X  and  Y. */
+
+    if (*trans == 'N' || *trans == 'n') {
+	lenx = *n;
+	leny = *m;
+    } else {
+	lenx = *m;
+	leny = *n;
+    }
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (lenx - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (leny - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of A are   
+       accessed sequentially with one pass through A.   
+
+       First form  y := beta*y. */
+
+    if (beta->r != 1.f || beta->i != 0.f) {
+	if (*incy == 1) {
+	    if (beta->r == 0.f && beta->i == 0.f) {
+		i__1 = leny;
+		for (i = 1; i <= leny; ++i) {
+		    i__2 = i;
+		    Y(i).r = 0.f, Y(i).i = 0.f;
+/* L10: */
+		}
+	    } else {
+		i__1 = leny;
+		for (i = 1; i <= leny; ++i) {
+		    i__2 = i;
+		    i__3 = i;
+		    q__1.r = beta->r * Y(i).r - beta->i * Y(i).i, 
+			    q__1.i = beta->r * Y(i).i + beta->i * Y(i)
+			    .r;
+		    Y(i).r = q__1.r, Y(i).i = q__1.i;
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (beta->r == 0.f && beta->i == 0.f) {
+		i__1 = leny;
+		for (i = 1; i <= leny; ++i) {
+		    i__2 = iy;
+		    Y(iy).r = 0.f, Y(iy).i = 0.f;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		i__1 = leny;
+		for (i = 1; i <= leny; ++i) {
+		    i__2 = iy;
+		    i__3 = iy;
+		    q__1.r = beta->r * Y(iy).r - beta->i * Y(iy).i, 
+			    q__1.i = beta->r * Y(iy).i + beta->i * Y(iy)
+			    .r;
+		    Y(iy).r = q__1.r, Y(iy).i = q__1.i;
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (alpha->r == 0.f && alpha->i == 0.f) {
+	return 0;
+    }
+    if (*trans == 'N' || *trans == 'n') {
+
+/*        Form  y := alpha*A*x + y. */
+
+	jx = kx;
+	if (*incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = jx;
+		if (X(jx).r != 0.f || X(jx).i != 0.f) {
+		    i__2 = jx;
+		    q__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, 
+			    q__1.i = alpha->r * X(jx).i + alpha->i * X(jx)
+			    .r;
+		    temp.r = q__1.r, temp.i = q__1.i;
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			i__3 = i;
+			i__4 = i;
+			i__5 = i + j * a_dim1;
+			q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
+				q__2.i = temp.r * A(i,j).i + temp.i * A(i,j)
+				.r;
+			q__1.r = Y(i).r + q__2.r, q__1.i = Y(i).i + 
+				q__2.i;
+			Y(i).r = q__1.r, Y(i).i = q__1.i;
+/* L50: */
+		    }
+		}
+		jx += *incx;
+/* L60: */
+	    }
+	} else {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = jx;
+		if (X(jx).r != 0.f || X(jx).i != 0.f) {
+		    i__2 = jx;
+		    q__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, 
+			    q__1.i = alpha->r * X(jx).i + alpha->i * X(jx)
+			    .r;
+		    temp.r = q__1.r, temp.i = q__1.i;
+		    iy = ky;
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			i__3 = iy;
+			i__4 = iy;
+			i__5 = i + j * a_dim1;
+			q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
+				q__2.i = temp.r * A(i,j).i + temp.i * A(i,j)
+				.r;
+			q__1.r = Y(iy).r + q__2.r, q__1.i = Y(iy).i + 
+				q__2.i;
+			Y(iy).r = q__1.r, Y(iy).i = q__1.i;
+			iy += *incy;
+/* L70: */
+		    }
+		}
+		jx += *incx;
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y := alpha*A'*x + y  or  y := alpha*conjg( A' )*x + y.
+ */
+
+	jy = ky;
+	if (*incx == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		temp.r = 0.f, temp.i = 0.f;
+		if (noconj) {
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			i__3 = i + j * a_dim1;
+			i__4 = i;
+			q__2.r = A(i,j).r * X(i).r - A(i,j).i * X(i)
+				.i, q__2.i = A(i,j).r * X(i).i + A(i,j)
+				.i * X(i).r;
+			q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+			temp.r = q__1.r, temp.i = q__1.i;
+/* L90: */
+		    }
+		} else {
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			r_cnjg(&q__3, &A(i,j));
+			i__3 = i;
+			q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, 
+				q__2.i = q__3.r * X(i).i + q__3.i * X(i)
+				.r;
+			q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+			temp.r = q__1.r, temp.i = q__1.i;
+/* L100: */
+		    }
+		}
+		i__2 = jy;
+		i__3 = jy;
+		q__2.r = alpha->r * temp.r - alpha->i * temp.i, q__2.i = 
+			alpha->r * temp.i + alpha->i * temp.r;
+		q__1.r = Y(jy).r + q__2.r, q__1.i = Y(jy).i + q__2.i;
+		Y(jy).r = q__1.r, Y(jy).i = q__1.i;
+		jy += *incy;
+/* L110: */
+	    }
+	} else {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		temp.r = 0.f, temp.i = 0.f;
+		ix = kx;
+		if (noconj) {
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			i__3 = i + j * a_dim1;
+			i__4 = ix;
+			q__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(ix)
+				.i, q__2.i = A(i,j).r * X(ix).i + A(i,j)
+				.i * X(ix).r;
+			q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+			temp.r = q__1.r, temp.i = q__1.i;
+			ix += *incx;
+/* L120: */
+		    }
+		} else {
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			r_cnjg(&q__3, &A(i,j));
+			i__3 = ix;
+			q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, 
+				q__2.i = q__3.r * X(ix).i + q__3.i * X(ix)
+				.r;
+			q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+			temp.r = q__1.r, temp.i = q__1.i;
+			ix += *incx;
+/* L130: */
+		    }
+		}
+		i__2 = jy;
+		i__3 = jy;
+		q__2.r = alpha->r * temp.r - alpha->i * temp.i, q__2.i = 
+			alpha->r * temp.i + alpha->i * temp.r;
+		q__1.r = Y(jy).r + q__2.r, q__1.i = Y(jy).i + q__2.i;
+		Y(jy).r = q__1.r, Y(jy).i = q__1.i;
+		jy += *incy;
+/* L140: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of CGEMV . */
+
+} /* cgemv_ */
+
diff --git a/CBLAS/cgerc.c b/CBLAS/cgerc.c
new file mode 100644
index 0000000..eec23f3
--- /dev/null
+++ b/CBLAS/cgerc.c
@@ -0,0 +1,205 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int cgerc_(integer *m, integer *n, complex *alpha, complex *
+	x, integer *incx, complex *y, integer *incy, complex *a, integer *lda)
+{
+
+
+    /* Builtin functions */
+    void r_cnjg(complex *, complex *);
+
+    /* Local variables.  They are automatic rather than static (as f2c
+       emitted them), so the routine keeps no state between calls. */
+    integer info;
+    complex temp, ycnj, prod;
+    integer i, j, ix, jy, kx;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+
+/*  Purpose
+    =======
+
+    CGERC  performs the rank 1 operation
+
+       A := alpha*x*conjg( y' ) + A,
+
+    where alpha is a scalar, x is an m element vector, y is an n element
+    vector and A is an m by n matrix.
+
+    Parameters
+    ==========
+
+    M      - INTEGER.
+             On entry, M specifies the number of rows of the matrix A.
+             M must be at least zero.
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the number of columns of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - COMPLEX         .
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    X      - COMPLEX          array of dimension at least
+             ( 1 + ( m - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the m
+             element vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    Y      - COMPLEX          array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCY ) ).
+             Before entry, the incremented array Y must contain the n
+             element vector y.
+             Unchanged on exit.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+    A      - COMPLEX          array of DIMENSION ( LDA, n ).
+             Before entry, the leading m by n part of the array A must
+             contain the matrix of coefficients. On exit, A is
+             overwritten by the updated matrix.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+             in the calling (sub) program. LDA must be at least
+             max( 1, m ).
+             Unchanged on exit.
+
+    Return value
+    ============
+
+    Always 0.  If an argument is invalid, input_error_dist() is called
+    with the routine name and the 1-based position of the first offending
+    argument, and A is left untouched.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+
+
+       Test the input parameters.
+
+
+   Parameter adjustments
+       Function Body */
+/* 1-based, Fortran-style accessors.  A is addressed column by column
+   with *lda elements between the starts of consecutive columns. */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+    info = 0;
+    if (*m < 0) {
+        info = 1;
+    } else if (*n < 0) {
+        info = 2;
+    } else if (*incx == 0) {
+        info = 5;
+    } else if (*incy == 0) {
+        info = 7;
+    } else if (*lda < max(1,*m)) {
+        info = 9;
+    }
+    if (info != 0) {
+        input_error_dist("CGERC ", &info);
+        return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 || (alpha->r == 0.f && alpha->i == 0.f)) {
+        return 0;
+    }
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through A. */
+
+    /* A negative increment walks the vector backwards, so the first
+       logical element then sits at the far end of the storage. */
+    if (*incy > 0) {
+        jy = 1;
+    } else {
+        jy = 1 - (*n - 1) * *incy;
+    }
+    if (*incx == 1) {
+        /* Unit stride in x: index it directly with the row number. */
+        for (j = 1; j <= *n; ++j) {
+            /* Skip column j when y(j) is exactly zero. */
+            if (Y(jy).r != 0.f || Y(jy).i != 0.f) {
+                /* temp = alpha * conjg( y(j) ) */
+                r_cnjg(&ycnj, &Y(jy));
+                temp.r = alpha->r * ycnj.r - alpha->i * ycnj.i;
+                temp.i = alpha->r * ycnj.i + alpha->i * ycnj.r;
+                for (i = 1; i <= *m; ++i) {
+                    /* A(i,j) += x(i) * temp */
+                    prod.r = X(i).r * temp.r - X(i).i * temp.i;
+                    prod.i = X(i).r * temp.i + X(i).i * temp.r;
+                    A(i,j).r += prod.r;
+                    A(i,j).i += prod.i;
+/* L10: */
+                }
+            }
+            jy += *incy;
+/* L20: */
+        }
+    } else {
+        /* General stride in x: kx is the position of its first element. */
+        if (*incx > 0) {
+            kx = 1;
+        } else {
+            kx = 1 - (*m - 1) * *incx;
+        }
+        for (j = 1; j <= *n; ++j) {
+            /* Skip column j when y(j) is exactly zero. */
+            if (Y(jy).r != 0.f || Y(jy).i != 0.f) {
+                /* temp = alpha * conjg( y(j) ) */
+                r_cnjg(&ycnj, &Y(jy));
+                temp.r = alpha->r * ycnj.r - alpha->i * ycnj.i;
+                temp.i = alpha->r * ycnj.i + alpha->i * ycnj.r;
+                /* Restart x at its first element for every column. */
+                ix = kx;
+                for (i = 1; i <= *m; ++i) {
+                    /* A(i,j) += x(i) * temp */
+                    prod.r = X(ix).r * temp.r - X(ix).i * temp.i;
+                    prod.i = X(ix).r * temp.i + X(ix).i * temp.r;
+                    A(i,j).r += prod.r;
+                    A(i,j).i += prod.i;
+                    ix += *incx;
+/* L30: */
+                }
+            }
+            jy += *incy;
+/* L40: */
+        }
+    }
+
+    return 0;
+
+/*     End of CGERC . */
+
+} /* cgerc_ */
+
diff --git a/CBLAS/cgeru.c b/CBLAS/cgeru.c
new file mode 100644
index 0000000..dc43c59
--- /dev/null
+++ b/CBLAS/cgeru.c
@@ -0,0 +1,202 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int cgeru_(integer *m, integer *n, complex *alpha, complex *
+	x, integer *incx, complex *y, integer *incy, complex *a, integer *lda)
+{
+
+
+    /* Local variables.  They are automatic rather than static (as f2c
+       emitted them), so the routine keeps no state between calls. */
+    integer info;
+    complex temp, prod;
+    integer i, j, ix, jy, kx;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+
+/*  Purpose
+    =======
+
+    CGERU  performs the rank 1 operation
+
+       A := alpha*x*y' + A,
+
+    where alpha is a scalar, x is an m element vector, y is an n element
+    vector and A is an m by n matrix.
+    y' is the plain transpose of y; no conjugation is applied.
+
+    Parameters
+    ==========
+
+    M      - INTEGER.
+             On entry, M specifies the number of rows of the matrix A.
+             M must be at least zero.
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the number of columns of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - COMPLEX         .
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    X      - COMPLEX          array of dimension at least
+             ( 1 + ( m - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the m
+             element vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    Y      - COMPLEX          array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCY ) ).
+             Before entry, the incremented array Y must contain the n
+             element vector y.
+             Unchanged on exit.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+    A      - COMPLEX          array of DIMENSION ( LDA, n ).
+             Before entry, the leading m by n part of the array A must
+             contain the matrix of coefficients. On exit, A is
+             overwritten by the updated matrix.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+             in the calling (sub) program. LDA must be at least
+             max( 1, m ).
+             Unchanged on exit.
+
+    Return value
+    ============
+
+    Always 0.  If an argument is invalid, input_error_dist() is called
+    with the routine name and the 1-based position of the first offending
+    argument, and A is left untouched.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+
+
+       Test the input parameters.
+
+
+   Parameter adjustments
+       Function Body */
+/* 1-based, Fortran-style accessors.  A is addressed column by column
+   with *lda elements between the starts of consecutive columns. */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+    info = 0;
+    if (*m < 0) {
+        info = 1;
+    } else if (*n < 0) {
+        info = 2;
+    } else if (*incx == 0) {
+        info = 5;
+    } else if (*incy == 0) {
+        info = 7;
+    } else if (*lda < max(1,*m)) {
+        info = 9;
+    }
+    if (info != 0) {
+        input_error_dist("CGERU ", &info);
+        return 0;
+    }
+
+/*     Quick return if possible: with no rows, no columns or alpha == 0
+       A is returned as it was passed in. */
+
+    if (*m == 0 || *n == 0 || (alpha->r == 0.f && alpha->i == 0.f)) {
+        return 0;
+    }
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through A. */
+
+    /* A negative increment walks the vector backwards, so the first
+       logical element then sits at the far end of the storage. */
+    if (*incy > 0) {
+        jy = 1;
+    } else {
+        jy = 1 - (*n - 1) * *incy;
+    }
+    if (*incx == 1) {
+        /* Unit stride in x: index it directly with the row number. */
+        for (j = 1; j <= *n; ++j) {
+            /* Skip column j when y(j) is exactly zero. */
+            if (Y(jy).r != 0.f || Y(jy).i != 0.f) {
+                /* temp = alpha * y(j) */
+                temp.r = alpha->r * Y(jy).r - alpha->i * Y(jy).i;
+                temp.i = alpha->r * Y(jy).i + alpha->i * Y(jy).r;
+                for (i = 1; i <= *m; ++i) {
+                    /* A(i,j) += x(i) * temp */
+                    prod.r = X(i).r * temp.r - X(i).i * temp.i;
+                    prod.i = X(i).r * temp.i + X(i).i * temp.r;
+                    A(i,j).r += prod.r;
+                    A(i,j).i += prod.i;
+/* L10: */
+                }
+            }
+            jy += *incy;
+/* L20: */
+        }
+    } else {
+        /* General stride in x: kx is the position of its first element. */
+        if (*incx > 0) {
+            kx = 1;
+        } else {
+            kx = 1 - (*m - 1) * *incx;
+        }
+        for (j = 1; j <= *n; ++j) {
+            /* Skip column j when y(j) is exactly zero. */
+            if (Y(jy).r != 0.f || Y(jy).i != 0.f) {
+                /* temp = alpha * y(j) */
+                temp.r = alpha->r * Y(jy).r - alpha->i * Y(jy).i;
+                temp.i = alpha->r * Y(jy).i + alpha->i * Y(jy).r;
+                /* Restart x at its first element for every column. */
+                ix = kx;
+                for (i = 1; i <= *m; ++i) {
+                    /* A(i,j) += x(i) * temp */
+                    prod.r = X(ix).r * temp.r - X(ix).i * temp.i;
+                    prod.i = X(ix).r * temp.i + X(ix).i * temp.r;
+                    A(i,j).r += prod.r;
+                    A(i,j).i += prod.i;
+                    ix += *incx;
+/* L30: */
+                }
+            }
+            jy += *incy;
+/* L40: */
+        }
+    }
+
+    return 0;
+
+/*     End of CGERU . */
+
+} /* cgeru_ */
+
diff --git a/CBLAS/chemv.c b/CBLAS/chemv.c
new file mode 100644
index 0000000..a61a061
--- /dev/null
+++ b/CBLAS/chemv.c
@@ -0,0 +1,420 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int chemv_(char *uplo, integer *n, complex *alpha, complex *
+	a, integer *lda, complex *x, integer *incx, complex *beta, complex *y,
+	 integer *incy)
+{
+
+
+    /* Builtin functions */
+    void r_cnjg(complex *, complex *);
+
+    /* Local variables.  They are automatic rather than static (as f2c
+       emitted them), so the routine keeps no state between calls. */
+    integer info;
+    int upper, lower;
+    doublereal ajj;
+    complex temp1, temp2, acnj, prod, sum;
+    integer i, j;
+    integer ix, iy, jx, jy, kx, ky;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+
+/*  Purpose
+    =======
+
+    CHEMV  performs the matrix-vector  operation
+
+       y := alpha*A*x + beta*y,
+
+    where alpha and beta are scalars, x and y are n element vectors and
+    A is an n by n hermitian matrix.
+
+    Parameters
+    ==========
+
+    UPLO   - CHARACTER*1.
+             On entry, UPLO specifies whether the upper or lower
+             triangular part of the array A is to be referenced as
+             follows:
+
+                UPLO = 'U' or 'u'   Only the upper triangular part of A
+                                    is to be referenced.
+
+                UPLO = 'L' or 'l'   Only the lower triangular part of A
+                                    is to be referenced.
+
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the order of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - COMPLEX         .
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    A      - COMPLEX          array of DIMENSION ( LDA, n ).
+             Before entry with  UPLO = 'U' or 'u', the leading n by n
+             upper triangular part of the array A must contain the upper
+             triangular part of the hermitian matrix and the strictly
+             lower triangular part of A is not referenced.
+             Before entry with UPLO = 'L' or 'l', the leading n by n
+             lower triangular part of the array A must contain the lower
+             triangular part of the hermitian matrix and the strictly
+             upper triangular part of A is not referenced.
+             Note that the imaginary parts of the diagonal elements need
+             not be set and are assumed to be zero.
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+             in the calling (sub) program. LDA must be at least
+             max( 1, n ).
+             Unchanged on exit.
+
+    X      - COMPLEX          array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the n
+             element vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    BETA   - COMPLEX         .
+             On entry, BETA specifies the scalar beta. When BETA is
+             supplied as zero then Y need not be set on input.
+             Unchanged on exit.
+
+    Y      - COMPLEX          array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCY ) ).
+             Before entry, the incremented array Y must contain the n
+             element vector y. On exit, Y is overwritten by the updated
+             vector y.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+    Return value
+    ============
+
+    Always 0.  If an argument is invalid, input_error_dist() is called
+    with the routine name and the 1-based position of the first offending
+    argument, and Y is left untouched.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+
+
+       Test the input parameters.
+
+
+   Parameter adjustments
+       Function Body */
+/* 1-based, Fortran-style accessors.  A is addressed column by column
+   with *lda elements between the starts of consecutive columns. */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+    /* Accept either case for UPLO ('U'/'u' or 'L'/'l'), as documented. */
+    upper = (*uplo == 'U' || *uplo == 'u');
+    lower = (*uplo == 'L' || *uplo == 'l');
+
+    info = 0;
+    if (!upper && !lower) {
+        info = 1;
+    } else if (*n < 0) {
+        info = 2;
+    } else if (*lda < max(1,*n)) {
+        info = 5;
+    } else if (*incx == 0) {
+        info = 7;
+    } else if (*incy == 0) {
+        info = 10;
+    }
+    if (info != 0) {
+        input_error_dist("CHEMV ", &info);
+        return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f &&
+	    beta->r == 1.f && beta->i == 0.f)) {
+        return 0;
+    }
+
+/*     Set up the start points in  X  and  Y. */
+
+    if (*incx > 0) {
+        kx = 1;
+    } else {
+        kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+        ky = 1;
+    } else {
+        ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through the triangular part
+       of A.
+
+       First form  y := beta*y. */
+
+    if (beta->r != 1.f || beta->i != 0.f) {
+        if (*incy == 1) {
+            if (beta->r == 0.f && beta->i == 0.f) {
+                /* Store zeros outright instead of multiplying, so the
+                   incoming contents of y are never read. */
+                for (i = 1; i <= *n; ++i) {
+                    Y(i).r = 0.f;
+                    Y(i).i = 0.f;
+/* L10: */
+                }
+            } else {
+                for (i = 1; i <= *n; ++i) {
+                    /* y(i) = beta * y(i) */
+                    prod.r = beta->r * Y(i).r - beta->i * Y(i).i;
+                    prod.i = beta->r * Y(i).i + beta->i * Y(i).r;
+                    Y(i).r = prod.r;
+                    Y(i).i = prod.i;
+/* L20: */
+                }
+            }
+        } else {
+            /* Same two cases, stepping through y with stride *incy. */
+            iy = ky;
+            if (beta->r == 0.f && beta->i == 0.f) {
+                for (i = 1; i <= *n; ++i) {
+                    Y(iy).r = 0.f;
+                    Y(iy).i = 0.f;
+                    iy += *incy;
+/* L30: */
+                }
+            } else {
+                for (i = 1; i <= *n; ++i) {
+                    prod.r = beta->r * Y(iy).r - beta->i * Y(iy).i;
+                    prod.i = beta->r * Y(iy).i + beta->i * Y(iy).r;
+                    Y(iy).r = prod.r;
+                    Y(iy).i = prod.i;
+                    iy += *incy;
+/* L40: */
+                }
+            }
+        }
+    }
+    /* With alpha == 0 the result is just beta*y, which is already formed. */
+    if (alpha->r == 0.f && alpha->i == 0.f) {
+        return 0;
+    }
+    if (upper) {
+
+/*        Form  y  when A is stored in upper triangle. */
+
+        if (*incx == 1 && *incy == 1) {
+            /* Unit strides: x and y are indexed directly by row/column. */
+            for (j = 1; j <= *n; ++j) {
+                /* temp1 = alpha * x(j); temp2 accumulates the sum of
+                   conjg( A(i,j) ) * x(i) over the rows above the diagonal. */
+                temp1.r = alpha->r * X(j).r - alpha->i * X(j).i;
+                temp1.i = alpha->r * X(j).i + alpha->i * X(j).r;
+                temp2.r = 0.f;
+                temp2.i = 0.f;
+
+                for (i = 1; i <= j-1; ++i) {
+                    /* y(i) += temp1 * A(i,j) */
+                    prod.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i;
+                    prod.i = temp1.r * A(i,j).i + temp1.i * A(i,j).r;
+                    Y(i).r += prod.r;
+                    Y(i).i += prod.i;
+                    /* temp2 += conjg( A(i,j) ) * x(i) */
+                    r_cnjg(&acnj, &A(i,j));
+                    prod.r = acnj.r * X(i).r - acnj.i * X(i).i;
+                    prod.i = acnj.r * X(i).i + acnj.i * X(i).r;
+                    temp2.r += prod.r;
+                    temp2.i += prod.i;
+/* L50: */
+                }
+
+                /* y(j) += real( A(j,j) ) * temp1 + alpha * temp2; only the
+                   real part of the diagonal element is read. */
+                ajj = A(j,j).r;
+                prod.r = ajj * temp1.r;
+                prod.i = ajj * temp1.i;
+                sum.r = Y(j).r + prod.r;
+                sum.i = Y(j).i + prod.i;
+                prod.r = alpha->r * temp2.r - alpha->i * temp2.i;
+                prod.i = alpha->r * temp2.i + alpha->i * temp2.r;
+                Y(j).r = sum.r + prod.r;
+                Y(j).i = sum.i + prod.i;
+/* L60: */
+            }
+        } else {
+            /* General strides: jx/jy track element j, ix/iy element i. */
+            jx = kx;
+            jy = ky;
+            for (j = 1; j <= *n; ++j) {
+                /* temp1 = alpha * x(j); temp2 accumulates the sum of
+                   conjg( A(i,j) ) * x(i) over the rows above the diagonal. */
+                temp1.r = alpha->r * X(jx).r - alpha->i * X(jx).i;
+                temp1.i = alpha->r * X(jx).i + alpha->i * X(jx).r;
+                temp2.r = 0.f;
+                temp2.i = 0.f;
+                ix = kx;
+                iy = ky;
+
+                for (i = 1; i <= j-1; ++i) {
+                    /* y(i) += temp1 * A(i,j) */
+                    prod.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i;
+                    prod.i = temp1.r * A(i,j).i + temp1.i * A(i,j).r;
+                    Y(iy).r += prod.r;
+                    Y(iy).i += prod.i;
+                    /* temp2 += conjg( A(i,j) ) * x(i) */
+                    r_cnjg(&acnj, &A(i,j));
+                    prod.r = acnj.r * X(ix).r - acnj.i * X(ix).i;
+                    prod.i = acnj.r * X(ix).i + acnj.i * X(ix).r;
+                    temp2.r += prod.r;
+                    temp2.i += prod.i;
+                    ix += *incx;
+                    iy += *incy;
+/* L70: */
+                }
+
+                /* y(j) += real( A(j,j) ) * temp1 + alpha * temp2; only the
+                   real part of the diagonal element is read. */
+                ajj = A(j,j).r;
+                prod.r = ajj * temp1.r;
+                prod.i = ajj * temp1.i;
+                sum.r = Y(jy).r + prod.r;
+                sum.i = Y(jy).i + prod.i;
+                prod.r = alpha->r * temp2.r - alpha->i * temp2.i;
+                prod.i = alpha->r * temp2.i + alpha->i * temp2.r;
+                Y(jy).r = sum.r + prod.r;
+                Y(jy).i = sum.i + prod.i;
+                jx += *incx;
+                jy += *incy;
+/* L80: */
+            }
+        }
+    } else {
+
+/*        Form  y  when A is stored in lower triangle. */
+
+        if (*incx == 1 && *incy == 1) {
+            /* Unit strides: x and y are indexed directly by row/column. */
+            for (j = 1; j <= *n; ++j) {
+                /* temp1 = alpha * x(j); temp2 accumulates the sum of
+                   conjg( A(i,j) ) * x(i) over the rows below the diagonal. */
+                temp1.r = alpha->r * X(j).r - alpha->i * X(j).i;
+                temp1.i = alpha->r * X(j).i + alpha->i * X(j).r;
+                temp2.r = 0.f;
+                temp2.i = 0.f;
+                /* y(j) += real( A(j,j) ) * temp1; only the real part of the
+                   diagonal element is read. */
+                ajj = A(j,j).r;
+                prod.r = ajj * temp1.r;
+                prod.i = ajj * temp1.i;
+                Y(j).r += prod.r;
+                Y(j).i += prod.i;
+
+                for (i = j + 1; i <= *n; ++i) {
+                    /* y(i) += temp1 * A(i,j) */
+                    prod.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i;
+                    prod.i = temp1.r * A(i,j).i + temp1.i * A(i,j).r;
+                    Y(i).r += prod.r;
+                    Y(i).i += prod.i;
+                    /* temp2 += conjg( A(i,j) ) * x(i) */
+                    r_cnjg(&acnj, &A(i,j));
+                    prod.r = acnj.r * X(i).r - acnj.i * X(i).i;
+                    prod.i = acnj.r * X(i).i + acnj.i * X(i).r;
+                    temp2.r += prod.r;
+                    temp2.i += prod.i;
+/* L90: */
+                }
+
+                /* y(j) += alpha * temp2 */
+                prod.r = alpha->r * temp2.r - alpha->i * temp2.i;
+                prod.i = alpha->r * temp2.i + alpha->i * temp2.r;
+                Y(j).r += prod.r;
+                Y(j).i += prod.i;
+/* L100: */
+            }
+        } else {
+            /* General strides: jx/jy track element j, ix/iy element i. */
+            jx = kx;
+            jy = ky;
+            for (j = 1; j <= *n; ++j) {
+                /* temp1 = alpha * x(j); temp2 accumulates the sum of
+                   conjg( A(i,j) ) * x(i) over the rows below the diagonal. */
+                temp1.r = alpha->r * X(jx).r - alpha->i * X(jx).i;
+                temp1.i = alpha->r * X(jx).i + alpha->i * X(jx).r;
+                temp2.r = 0.f;
+                temp2.i = 0.f;
+                /* y(j) += real( A(j,j) ) * temp1 */
+                ajj = A(j,j).r;
+                prod.r = ajj * temp1.r;
+                prod.i = ajj * temp1.i;
+                Y(jy).r += prod.r;
+                Y(jy).i += prod.i;
+                ix = jx;
+                iy = jy;
+
+                for (i = j + 1; i <= *n; ++i) {
+                    /* ix/iy start at element j, so step to element i first. */
+                    ix += *incx;
+                    iy += *incy;
+                    /* y(i) += temp1 * A(i,j) */
+                    prod.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i;
+                    prod.i = temp1.r * A(i,j).i + temp1.i * A(i,j).r;
+                    Y(iy).r += prod.r;
+                    Y(iy).i += prod.i;
+                    /* temp2 += conjg( A(i,j) ) * x(i) */
+                    r_cnjg(&acnj, &A(i,j));
+                    prod.r = acnj.r * X(ix).r - acnj.i * X(ix).i;
+                    prod.i = acnj.r * X(ix).i + acnj.i * X(ix).r;
+                    temp2.r += prod.r;
+                    temp2.i += prod.i;
+/* L110: */
+                }
+
+                /* y(j) += alpha * temp2 */
+                prod.r = alpha->r * temp2.r - alpha->i * temp2.i;
+                prod.i = alpha->r * temp2.i + alpha->i * temp2.r;
+                Y(jy).r += prod.r;
+                Y(jy).i += prod.i;
+                jx += *incx;
+                jy += *incy;
+/* L120: */
+            }
+        }
+    }
+
+    return 0;
+
+/*     End of CHEMV . */
+
+} /* chemv_ */
+
diff --git a/CBLAS/cher2.c b/CBLAS/cher2.c
new file mode 100644
index 0000000..f9329cb
--- /dev/null
+++ b/CBLAS/cher2.c
@@ -0,0 +1,435 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int cher2_(char *uplo, integer *n, complex *alpha, complex *
+	x, integer *incx, complex *y, integer *incy, complex *a, integer *lda)
+{
+
+
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
+    doublereal d__1;
+    complex q__1, q__2, q__3, q__4;
+
+    /* Builtin functions */
+    void r_cnjg(complex *, complex *);
+
+    /* Local variables */
+    static integer info;
+    static complex temp1, temp2;
+    static integer i, j;
+    static integer ix, iy, jx, jy, kx, ky;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+
+/*  Purpose   
+    =======   
+
+    CHER2  performs the hermitian rank 2 operation   
+
+       A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A,   
+
+    where alpha is a scalar, x and y are n element vectors and A is an n 
+  
+    by n hermitian matrix.   
+
+    Parameters   
+    ==========   
+
+    UPLO   - CHARACTER*1.   
+             On entry, UPLO specifies whether the upper or lower   
+             triangular part of the array A is to be referenced as   
+             follows:   
+
+                UPLO = 'U' or 'u'   Only the upper triangular part of A   
+                                    is to be referenced.   
+
+                UPLO = 'L' or 'l'   Only the lower triangular part of A   
+                                    is to be referenced.   
+
+             Unchanged on exit.   
+
+    N      - INTEGER.   
+             On entry, N specifies the order of the matrix A.   
+             N must be at least zero.   
+             Unchanged on exit.   
+
+    ALPHA  - COMPLEX         .   
+             On entry, ALPHA specifies the scalar alpha.   
+             Unchanged on exit.   
+
+    X      - COMPLEX          array of dimension at least   
+             ( 1 + ( n - 1 )*abs( INCX ) ).   
+             Before entry, the incremented array X must contain the n   
+             element vector x.   
+             Unchanged on exit.   
+
+    INCX   - INTEGER.   
+             On entry, INCX specifies the increment for the elements of   
+             X. INCX must not be zero.   
+             Unchanged on exit.   
+
+    Y      - COMPLEX          array of dimension at least   
+             ( 1 + ( n - 1 )*abs( INCY ) ).   
+             Before entry, the incremented array Y must contain the n   
+             element vector y.   
+             Unchanged on exit.   
+
+    INCY   - INTEGER.   
+             On entry, INCY specifies the increment for the elements of   
+             Y. INCY must not be zero.   
+             Unchanged on exit.   
+
+    A      - COMPLEX          array of DIMENSION ( LDA, n ).   
+             Before entry with  UPLO = 'U' or 'u', the leading n by n   
+             upper triangular part of the array A must contain the upper 
+  
+             triangular part of the hermitian matrix and the strictly   
+             lower triangular part of A is not referenced. On exit, the   
+             upper triangular part of the array A is overwritten by the   
+             upper triangular part of the updated matrix.   
+             Before entry with UPLO = 'L' or 'l', the leading n by n   
+             lower triangular part of the array A must contain the lower 
+  
+             triangular part of the hermitian matrix and the strictly   
+             upper triangular part of A is not referenced. On exit, the   
+             lower triangular part of the array A is overwritten by the   
+             lower triangular part of the updated matrix.   
+             Note that the imaginary parts of the diagonal elements need 
+  
+             not be set, they are assumed to be zero, and on exit they   
+             are set to zero.   
+
+    LDA    - INTEGER.   
+             On entry, LDA specifies the first dimension of A as declared 
+  
+             in the calling (sub) program. LDA must be at least   
+             max( 1, n ).   
+             Unchanged on exit.   
+
+
+    Level 2 Blas routine.   
+
+    -- Written on 22-October-1986.   
+       Jack Dongarra, Argonne National Lab.   
+       Jeremy Du Croz, Nag Central Office.   
+       Sven Hammarling, Nag Central Office.   
+       Richard Hanson, Sandia National Labs.   
+
+
+
+       Test the input parameters.   
+
+    
+   Parameter adjustments   
+       Function Body */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+    info = 0;
+    if (strncmp(uplo, "U", 1)!=0 && strncmp(uplo, "L", 1) != 0) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*incx == 0) {
+	info = 5;
+    } else if (*incy == 0) {
+	info = 7;
+    } else if (*lda < max(1,*n)) {
+	info = 9;
+    }
+    if (info != 0) {
+	input_error_dist("CHER2 ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || alpha->r == 0.f && alpha->i == 0.f) {
+	return 0;
+    }
+
+/*     Set up the start points in X and Y if the increments are not both 
+  
+       unity. */
+
+    if (*incx != 1 || *incy != 1) {
+	if (*incx > 0) {
+	    kx = 1;
+	} else {
+	    kx = 1 - (*n - 1) * *incx;
+	}
+	if (*incy > 0) {
+	    ky = 1;
+	} else {
+	    ky = 1 - (*n - 1) * *incy;
+	}
+	jx = kx;
+	jy = ky;
+    }
+
+/*     Start the operations. In this version the elements of A are   
+       accessed sequentially with one pass through the triangular part   
+       of A. */
+
+    if (strncmp(uplo, "U", 1)==0) {
+
+/*        Form  A  when A is stored in the upper triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = j;
+		i__3 = j;
+		if (X(j).r != 0.f || X(j).i != 0.f || (Y(j).r != 0.f 
+			|| Y(j).i != 0.f)) {
+		    r_cnjg(&q__2, &Y(j));
+		    q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = 
+			    alpha->r * q__2.i + alpha->i * q__2.r;
+		    temp1.r = q__1.r, temp1.i = q__1.i;
+		    i__2 = j;
+		    q__2.r = alpha->r * X(j).r - alpha->i * X(j).i, 
+			    q__2.i = alpha->r * X(j).i + alpha->i * X(j)
+			    .r;
+		    r_cnjg(&q__1, &q__2);
+		    temp2.r = q__1.r, temp2.i = q__1.i;
+		    i__2 = j - 1;
+		    for (i = 1; i <= j-1; ++i) {
+			i__3 = i + j * a_dim1;
+			i__4 = i + j * a_dim1;
+			i__5 = i;
+			q__3.r = X(i).r * temp1.r - X(i).i * temp1.i, 
+				q__3.i = X(i).r * temp1.i + X(i).i * 
+				temp1.r;
+			q__2.r = A(i,j).r + q__3.r, q__2.i = A(i,j).i + 
+				q__3.i;
+			i__6 = i;
+			q__4.r = Y(i).r * temp2.r - Y(i).i * temp2.i, 
+				q__4.i = Y(i).r * temp2.i + Y(i).i * 
+				temp2.r;
+			q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+			A(i,j).r = q__1.r, A(i,j).i = q__1.i;
+/* L10: */
+		    }
+		    i__2 = j + j * a_dim1;
+		    i__3 = j + j * a_dim1;
+		    i__4 = j;
+		    q__2.r = X(j).r * temp1.r - X(j).i * temp1.i, 
+			    q__2.i = X(j).r * temp1.i + X(j).i * 
+			    temp1.r;
+		    i__5 = j;
+		    q__3.r = Y(j).r * temp2.r - Y(j).i * temp2.i, 
+			    q__3.i = Y(j).r * temp2.i + Y(j).i * 
+			    temp2.r;
+		    q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+		    d__1 = A(j,j).r + q__1.r;
+		    A(j,j).r = d__1, A(j,j).i = 0.f;
+		} else {
+		    i__2 = j + j * a_dim1;
+		    i__3 = j + j * a_dim1;
+		    d__1 = A(j,j).r;
+		    A(j,j).r = d__1, A(j,j).i = 0.f;
+		}
+/* L20: */
+	    }
+	} else {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = jx;
+		i__3 = jy;
+		if (X(jx).r != 0.f || X(jx).i != 0.f || (Y(jy).r != 0.f 
+			|| Y(jy).i != 0.f)) {
+		    r_cnjg(&q__2, &Y(jy));
+		    q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = 
+			    alpha->r * q__2.i + alpha->i * q__2.r;
+		    temp1.r = q__1.r, temp1.i = q__1.i;
+		    i__2 = jx;
+		    q__2.r = alpha->r * X(jx).r - alpha->i * X(jx).i, 
+			    q__2.i = alpha->r * X(jx).i + alpha->i * X(jx)
+			    .r;
+		    r_cnjg(&q__1, &q__2);
+		    temp2.r = q__1.r, temp2.i = q__1.i;
+		    ix = kx;
+		    iy = ky;
+		    i__2 = j - 1;
+		    for (i = 1; i <= j-1; ++i) {
+			i__3 = i + j * a_dim1;
+			i__4 = i + j * a_dim1;
+			i__5 = ix;
+			q__3.r = X(ix).r * temp1.r - X(ix).i * temp1.i, 
+				q__3.i = X(ix).r * temp1.i + X(ix).i * 
+				temp1.r;
+			q__2.r = A(i,j).r + q__3.r, q__2.i = A(i,j).i + 
+				q__3.i;
+			i__6 = iy;
+			q__4.r = Y(iy).r * temp2.r - Y(iy).i * temp2.i, 
+				q__4.i = Y(iy).r * temp2.i + Y(iy).i * 
+				temp2.r;
+			q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+			A(i,j).r = q__1.r, A(i,j).i = q__1.i;
+			ix += *incx;
+			iy += *incy;
+/* L30: */
+		    }
+		    i__2 = j + j * a_dim1;
+		    i__3 = j + j * a_dim1;
+		    i__4 = jx;
+		    q__2.r = X(jx).r * temp1.r - X(jx).i * temp1.i, 
+			    q__2.i = X(jx).r * temp1.i + X(jx).i * 
+			    temp1.r;
+		    i__5 = jy;
+		    q__3.r = Y(jy).r * temp2.r - Y(jy).i * temp2.i, 
+			    q__3.i = Y(jy).r * temp2.i + Y(jy).i * 
+			    temp2.r;
+		    q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+		    d__1 = A(j,j).r + q__1.r;
+		    A(j,j).r = d__1, A(j,j).i = 0.f;
+		} else {
+		    i__2 = j + j * a_dim1;
+		    i__3 = j + j * a_dim1;
+		    d__1 = A(j,j).r;
+		    A(j,j).r = d__1, A(j,j).i = 0.f;
+		}
+		jx += *incx;
+		jy += *incy;
+/* L40: */
+	    }
+	}
+    } else {
+
+/*        Form  A  when A is stored in the lower triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = j;
+		i__3 = j;
+		if (X(j).r != 0.f || X(j).i != 0.f || (Y(j).r != 0.f 
+			|| Y(j).i != 0.f)) {
+		    r_cnjg(&q__2, &Y(j));
+		    q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = 
+			    alpha->r * q__2.i + alpha->i * q__2.r;
+		    temp1.r = q__1.r, temp1.i = q__1.i;
+		    i__2 = j;
+		    q__2.r = alpha->r * X(j).r - alpha->i * X(j).i, 
+			    q__2.i = alpha->r * X(j).i + alpha->i * X(j)
+			    .r;
+		    r_cnjg(&q__1, &q__2);
+		    temp2.r = q__1.r, temp2.i = q__1.i;
+		    i__2 = j + j * a_dim1;
+		    i__3 = j + j * a_dim1;
+		    i__4 = j;
+		    q__2.r = X(j).r * temp1.r - X(j).i * temp1.i, 
+			    q__2.i = X(j).r * temp1.i + X(j).i * 
+			    temp1.r;
+		    i__5 = j;
+		    q__3.r = Y(j).r * temp2.r - Y(j).i * temp2.i, 
+			    q__3.i = Y(j).r * temp2.i + Y(j).i * 
+			    temp2.r;
+		    q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+		    d__1 = A(j,j).r + q__1.r;
+		    A(j,j).r = d__1, A(j,j).i = 0.f;
+		    i__2 = *n;
+		    for (i = j + 1; i <= *n; ++i) {
+			i__3 = i + j * a_dim1;
+			i__4 = i + j * a_dim1;
+			i__5 = i;
+			q__3.r = X(i).r * temp1.r - X(i).i * temp1.i, 
+				q__3.i = X(i).r * temp1.i + X(i).i * 
+				temp1.r;
+			q__2.r = A(i,j).r + q__3.r, q__2.i = A(i,j).i + 
+				q__3.i;
+			i__6 = i;
+			q__4.r = Y(i).r * temp2.r - Y(i).i * temp2.i, 
+				q__4.i = Y(i).r * temp2.i + Y(i).i * 
+				temp2.r;
+			q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+			A(i,j).r = q__1.r, A(i,j).i = q__1.i;
+/* L50: */
+		    }
+		} else {
+		    i__2 = j + j * a_dim1;
+		    i__3 = j + j * a_dim1;
+		    d__1 = A(j,j).r;
+		    A(j,j).r = d__1, A(j,j).i = 0.f;
+		}
+/* L60: */
+	    }
+	} else {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = jx;
+		i__3 = jy;
+		if (X(jx).r != 0.f || X(jx).i != 0.f || (Y(jy).r != 0.f 
+			|| Y(jy).i != 0.f)) {
+		    r_cnjg(&q__2, &Y(jy));
+		    q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = 
+			    alpha->r * q__2.i + alpha->i * q__2.r;
+		    temp1.r = q__1.r, temp1.i = q__1.i;
+		    i__2 = jx;
+		    q__2.r = alpha->r * X(jx).r - alpha->i * X(jx).i, 
+			    q__2.i = alpha->r * X(jx).i + alpha->i * X(jx)
+			    .r;
+		    r_cnjg(&q__1, &q__2);
+		    temp2.r = q__1.r, temp2.i = q__1.i;
+		    i__2 = j + j * a_dim1;
+		    i__3 = j + j * a_dim1;
+		    i__4 = jx;
+		    q__2.r = X(jx).r * temp1.r - X(jx).i * temp1.i, 
+			    q__2.i = X(jx).r * temp1.i + X(jx).i * 
+			    temp1.r;
+		    i__5 = jy;
+		    q__3.r = Y(jy).r * temp2.r - Y(jy).i * temp2.i, 
+			    q__3.i = Y(jy).r * temp2.i + Y(jy).i * 
+			    temp2.r;
+		    q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+		    d__1 = A(j,j).r + q__1.r;
+		    A(j,j).r = d__1, A(j,j).i = 0.f;
+		    ix = jx;
+		    iy = jy;
+		    i__2 = *n;
+		    for (i = j + 1; i <= *n; ++i) {
+			ix += *incx;
+			iy += *incy;
+			i__3 = i + j * a_dim1;
+			i__4 = i + j * a_dim1;
+			i__5 = ix;
+			q__3.r = X(ix).r * temp1.r - X(ix).i * temp1.i, 
+				q__3.i = X(ix).r * temp1.i + X(ix).i * 
+				temp1.r;
+			q__2.r = A(i,j).r + q__3.r, q__2.i = A(i,j).i + 
+				q__3.i;
+			i__6 = iy;
+			q__4.r = Y(iy).r * temp2.r - Y(iy).i * temp2.i, 
+				q__4.i = Y(iy).r * temp2.i + Y(iy).i * 
+				temp2.r;
+			q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+			A(i,j).r = q__1.r, A(i,j).i = q__1.i;
+/* L70: */
+		    }
+		} else {
+		    i__2 = j + j * a_dim1;
+		    i__3 = j + j * a_dim1;
+		    d__1 = A(j,j).r;
+		    A(j,j).r = d__1, A(j,j).i = 0.f;
+		}
+		jx += *incx;
+		jy += *incy;
+/* L80: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of CHER2 . */
+
+} /* cher2_ */
+
diff --git a/CBLAS/cscal.c b/CBLAS/cscal.c
new file mode 100644
index 0000000..8cc5173
--- /dev/null
+++ b/CBLAS/cscal.c
@@ -0,0 +1,70 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int cscal_(integer *n, complex *ca, complex *cx, integer *
+	incx)
+{
+
+/*     scales a vector by a constant, in place:
+
+          cx(k) := ca * cx(k)     for n elements taken incx apart.
+
+       n     - number of elements to scale.
+       ca    - complex scale factor; only read.
+       cx    - vector to scale, addressed 1-based through CX();
+               overwritten with the scaled values.
+       incx  - stride between successive elements of cx.
+
+       Returns 0 always. Nothing is touched when n <= 0 or incx <= 0.
+
+       jack dongarra, linpack,  3/11/78.
+       modified 3/93 to return if incx .le. 0.
+       modified 12/3/93, array(1) declarations changed to array(*) */
+
+    /* Local variables. Automatic, not static as f2c emitted them, so
+       that concurrent calls do not share loop state. */
+    integer i, nincx;
+    complex q__1;
+
+
+#define CX(I) cx[(I)-1]
+
+
+    if (*n <= 0 || *incx <= 0) {
+	return 0;
+    }
+    if (*incx == 1) {
+	goto L20;
+    }
+
+/*        code for increment not equal to 1 */
+
+    /* incx > 0 here (guard above), so the index only ever ascends. */
+    nincx = *n * *incx;
+    for (i = 1; i <= nincx; i += *incx) {
+	/* build the product in q__1: CX(i).r feeds both parts */
+	q__1.r = ca->r * CX(i).r - ca->i * CX(i).i, q__1.i = ca->r * CX(
+		i).i + ca->i * CX(i).r;
+	CX(i).r = q__1.r, CX(i).i = q__1.i;
+/* L10: */
+    }
+    return 0;
+
+/*        code for increment equal to 1 */
+
+L20:
+    /* unit stride: plain sweep over the first n elements */
+    for (i = 1; i <= *n; ++i) {
+	q__1.r = ca->r * CX(i).r - ca->i * CX(i).i, q__1.i = ca->r * CX(
+		i).i + ca->i * CX(i).r;
+	CX(i).r = q__1.r, CX(i).i = q__1.i;
+/* L30: */
+    }
+    return 0;
+} /* cscal_ */
+
diff --git a/CBLAS/ctrsv.c b/CBLAS/ctrsv.c
new file mode 100644
index 0000000..66db64c
--- /dev/null
+++ b/CBLAS/ctrsv.c
@@ -0,0 +1,508 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int ctrsv_(char *uplo, char *trans, char *diag, integer *n, 
+	complex *a, integer *lda, complex *x, integer *incx)
+{
+
+
+    /* System generated locals; a_dim1 is seeded so no index temp reads garbage */
+    integer a_dim1 = *lda, i__1, i__2, i__3, i__4, i__5;
+    complex q__1, q__2, q__3;
+
+    /* Builtin functions */
+    void c_div(complex *, complex *, complex *), r_cnjg(complex *, complex *);
+
+    /* Local variables (automatic, so concurrent calls share no work state) */
+    integer info;
+    complex temp;
+    integer i, j;
+    integer ix, jx, kx;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+    logical noconj, nounit;
+
+
+/*  Purpose
+    =======
+
+    CTRSV  solves one of the systems of equations
+
+       A*x = b,   or   A'*x = b,   or   conjg( A' )*x = b,
+
+    where b and x are n element vectors and A is an n by n unit, or
+    non-unit, upper or lower triangular matrix.
+
+    No test for singularity or near-singularity is included in this
+    routine. Such tests must be performed before calling this routine.
+
+    Parameters
+    ==========
+
+    UPLO   - CHARACTER*1.
+             On entry, UPLO specifies whether the matrix is an upper or
+             lower triangular matrix as follows:
+
+                UPLO = 'U' or 'u'   A is an upper triangular matrix.
+
+                UPLO = 'L' or 'l'   A is a lower triangular matrix.
+
+             Unchanged on exit.
+
+    TRANS  - CHARACTER*1.
+             On entry, TRANS specifies the equations to be solved as
+             follows:
+
+                TRANS = 'N' or 'n'   A*x = b.
+
+                TRANS = 'T' or 't'   A'*x = b.
+
+                TRANS = 'C' or 'c'   conjg( A' )*x = b.
+
+             Unchanged on exit.
+
+    DIAG   - CHARACTER*1.
+             On entry, DIAG specifies whether or not A is unit
+             triangular as follows:
+
+                DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+
+                DIAG = 'N' or 'n'   A is not assumed to be unit
+                                    triangular.
+
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the order of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    A      - COMPLEX          array of DIMENSION ( LDA, n ).
+             Before entry with  UPLO = 'U' or 'u', the leading n by n
+             upper triangular part of the array A must contain the upper
+
+             triangular matrix and the strictly lower triangular part of
+
+             A is not referenced.
+             Before entry with UPLO = 'L' or 'l', the leading n by n
+             lower triangular part of the array A must contain the lower
+
+             triangular matrix and the strictly upper triangular part of
+
+             A is not referenced.
+             Note that when  DIAG = 'U' or 'u', the diagonal elements of
+
+             A are not referenced either, but are assumed to be unity.
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+
+             in the calling (sub) program. LDA must be at least
+             max( 1, n ).
+             Unchanged on exit.
+
+    X      - COMPLEX          array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the n
+             element right-hand side vector b. On exit, X is overwritten
+
+             with the solution vector x.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+    Returns 0 in every case. An invalid argument is reported by calling
+    input_error_dist("CTRSV ", &info), info being the argument's position.
+
+    OPT_IS tests the first character of an option string against both
+    letter cases, as the UPLO, TRANS and DIAG descriptions above promise.
+
+       Test the input parameters. */
+#define X(I) x[(I)-1]
+#define OPT_IS(P,UC,LC) (*(P) == (UC) || *(P) == (LC))
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+    info = 0;
+    if (!OPT_IS(uplo, 'U', 'u') && !OPT_IS(uplo, 'L', 'l')) {
+	info = 1;
+    } else if (!OPT_IS(trans, 'N', 'n') && !OPT_IS(trans, 'T', 't') &&
+	       !OPT_IS(trans, 'C', 'c')) {
+	info = 2;
+    } else if (!OPT_IS(diag, 'U', 'u') && !OPT_IS(diag, 'N', 'n')) {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*lda < max(1,*n)) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    }
+    if (info != 0) {
+	input_error_dist("CTRSV ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+    noconj = OPT_IS(trans, 'T', 't');
+    nounit = OPT_IS(diag, 'N', 'n');
+
+/*     Set up the start point in X if the increment is not unity. This   
+       will be  ( N - 1 )*INCX  too small for descending loops. */
+
+    if (*incx <= 0) {
+	kx = 1 - (*n - 1) * *incx;
+    } else if (*incx != 1) {
+	kx = 1;
+    }
+
+/*     Start the operations. In this version the elements of A are   
+       accessed sequentially with one pass through A. */
+
+    if (OPT_IS(trans, 'N', 'n')) {
+
+/*        Form  x := inv( A )*x. */
+
+	if (OPT_IS(uplo, 'U', 'u')) {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    i__1 = j;
+		    if (X(j).r != 0.f || X(j).i != 0.f) {
+			if (nounit) {
+			    i__1 = j;
+			    c_div(&q__1, &X(j), &A(j,j));
+			    X(j).r = q__1.r, X(j).i = q__1.i;
+			}
+			i__1 = j;
+			temp.r = X(j).r, temp.i = X(j).i;
+			for (i = j - 1; i >= 1; --i) {
+			    i__1 = i;
+			    i__2 = i;
+			    i__3 = i + j * a_dim1;
+			    q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
+				    q__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
+			    q__1.r = X(i).r - q__2.r, q__1.i = X(i).i - 
+				    q__2.i;
+			    X(i).r = q__1.r, X(i).i = q__1.i;
+/* L10: */
+			}
+		    }
+/* L20: */
+		}
+	    } else {
+		jx = kx + (*n - 1) * *incx;
+		for (j = *n; j >= 1; --j) {
+		    i__1 = jx;
+		    if (X(jx).r != 0.f || X(jx).i != 0.f) {
+			if (nounit) {
+			    i__1 = jx;
+			    c_div(&q__1, &X(jx), &A(j,j));
+			    X(jx).r = q__1.r, X(jx).i = q__1.i;
+			}
+			i__1 = jx;
+			temp.r = X(jx).r, temp.i = X(jx).i;
+			ix = jx;
+			for (i = j - 1; i >= 1; --i) {
+			    ix -= *incx;
+			    i__1 = ix;
+			    i__2 = ix;
+			    i__3 = i + j * a_dim1;
+			    q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
+				    q__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
+			    q__1.r = X(ix).r - q__2.r, q__1.i = X(ix).i - 
+				    q__2.i;
+			    X(ix).r = q__1.r, X(ix).i = q__1.i;
+/* L30: */
+			}
+		    }
+		    jx -= *incx;
+/* L40: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		i__1 = *n;
+		for (j = 1; j <= *n; ++j) {
+		    i__2 = j;
+		    if (X(j).r != 0.f || X(j).i != 0.f) {
+			if (nounit) {
+			    i__2 = j;
+			    c_div(&q__1, &X(j), &A(j,j));
+			    X(j).r = q__1.r, X(j).i = q__1.i;
+			}
+			i__2 = j;
+			temp.r = X(j).r, temp.i = X(j).i;
+			i__2 = *n;
+			for (i = j + 1; i <= *n; ++i) {
+			    i__3 = i;
+			    i__4 = i;
+			    i__5 = i + j * a_dim1;
+			    q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
+				    q__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
+			    q__1.r = X(i).r - q__2.r, q__1.i = X(i).i - 
+				    q__2.i;
+			    X(i).r = q__1.r, X(i).i = q__1.i;
+/* L50: */
+			}
+		    }
+/* L60: */
+		}
+	    } else {
+		jx = kx;
+		i__1 = *n;
+		for (j = 1; j <= *n; ++j) {
+		    i__2 = jx;
+		    if (X(jx).r != 0.f || X(jx).i != 0.f) {
+			if (nounit) {
+			    i__2 = jx;
+			    c_div(&q__1, &X(jx), &A(j,j));
+			    X(jx).r = q__1.r, X(jx).i = q__1.i;
+			}
+			i__2 = jx;
+			temp.r = X(jx).r, temp.i = X(jx).i;
+			ix = jx;
+			i__2 = *n;
+			for (i = j + 1; i <= *n; ++i) {
+			    ix += *incx;
+			    i__3 = ix;
+			    i__4 = ix;
+			    i__5 = i + j * a_dim1;
+			    q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
+				    q__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
+			    q__1.r = X(ix).r - q__2.r, q__1.i = X(ix).i - 
+				    q__2.i;
+			    X(ix).r = q__1.r, X(ix).i = q__1.i;
+/* L70: */
+			}
+		    }
+		    jx += *incx;
+/* L80: */
+		}
+	    }
+	}
+    } else {
+
+/*        Form  x := inv( A' )*x  or  x := inv( conjg( A' ) )*x. */
+
+	if (OPT_IS(uplo, 'U', 'u')) {
+	    if (*incx == 1) {
+		i__1 = *n;
+		for (j = 1; j <= *n; ++j) {
+		    i__2 = j;
+		    temp.r = X(j).r, temp.i = X(j).i;
+		    if (noconj) {
+			i__2 = j - 1;
+			for (i = 1; i <= j-1; ++i) {
+			    i__3 = i + j * a_dim1;
+			    i__4 = i;
+			    q__2.r = A(i,j).r * X(i).r - A(i,j).i * X(
+				    i).i, q__2.i = A(i,j).r * X(i).i + 
+				    A(i,j).i * X(i).r;
+			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+/* L90: */
+			}
+			if (nounit) {
+			    c_div(&q__1, &temp, &A(j,j));
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+		    } else {
+			i__2 = j - 1;
+			for (i = 1; i <= j-1; ++i) {
+			    r_cnjg(&q__3, &A(i,j));
+			    i__3 = i;
+			    q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, 
+				    q__2.i = q__3.r * X(i).i + q__3.i * X(
+				    i).r;
+			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+/* L100: */
+			}
+			if (nounit) {
+			    r_cnjg(&q__2, &A(j,j));
+			    c_div(&q__1, &temp, &q__2);
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+		    }
+		    i__2 = j;
+		    X(j).r = temp.r, X(j).i = temp.i;
+/* L110: */
+		}
+	    } else {
+		jx = kx;
+		i__1 = *n;
+		for (j = 1; j <= *n; ++j) {
+		    ix = kx;
+		    i__2 = jx;
+		    temp.r = X(jx).r, temp.i = X(jx).i;
+		    if (noconj) {
+			i__2 = j - 1;
+			for (i = 1; i <= j-1; ++i) {
+			    i__3 = i + j * a_dim1;
+			    i__4 = ix;
+			    q__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(
+				    ix).i, q__2.i = A(i,j).r * X(ix).i + 
+				    A(i,j).i * X(ix).r;
+			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			    ix += *incx;
+/* L120: */
+			}
+			if (nounit) {
+			    c_div(&q__1, &temp, &A(j,j));
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+		    } else {
+			i__2 = j - 1;
+			for (i = 1; i <= j-1; ++i) {
+			    r_cnjg(&q__3, &A(i,j));
+			    i__3 = ix;
+			    q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, 
+				    q__2.i = q__3.r * X(ix).i + q__3.i * X(
+				    ix).r;
+			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			    ix += *incx;
+/* L130: */
+			}
+			if (nounit) {
+			    r_cnjg(&q__2, &A(j,j));
+			    c_div(&q__1, &temp, &q__2);
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+		    }
+		    i__2 = jx;
+		    X(jx).r = temp.r, X(jx).i = temp.i;
+		    jx += *incx;
+/* L140: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    i__1 = j;
+		    temp.r = X(j).r, temp.i = X(j).i;
+		    if (noconj) {
+			i__1 = j + 1;
+			for (i = *n; i >= j+1; --i) {
+			    i__2 = i + j * a_dim1;
+			    i__3 = i;
+			    q__2.r = A(i,j).r * X(i).r - A(i,j).i * X(
+				    i).i, q__2.i = A(i,j).r * X(i).i + 
+				    A(i,j).i * X(i).r;
+			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+/* L150: */
+			}
+			if (nounit) {
+			    c_div(&q__1, &temp, &A(j,j));
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+		    } else {
+			i__1 = j + 1;
+			for (i = *n; i >= j+1; --i) {
+			    r_cnjg(&q__3, &A(i,j));
+			    i__2 = i;
+			    q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, 
+				    q__2.i = q__3.r * X(i).i + q__3.i * X(
+				    i).r;
+			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+/* L160: */
+			}
+			if (nounit) {
+			    r_cnjg(&q__2, &A(j,j));
+			    c_div(&q__1, &temp, &q__2);
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+		    }
+		    i__1 = j;
+		    X(j).r = temp.r, X(j).i = temp.i;
+/* L170: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    ix = kx;
+		    i__1 = jx;
+		    temp.r = X(jx).r, temp.i = X(jx).i;
+		    if (noconj) {
+			i__1 = j + 1;
+			for (i = *n; i >= j+1; --i) {
+			    i__2 = i + j * a_dim1;
+			    i__3 = ix;
+			    q__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(
+				    ix).i, q__2.i = A(i,j).r * X(ix).i + 
+				    A(i,j).i * X(ix).r;
+			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			    ix -= *incx;
+/* L180: */
+			}
+			if (nounit) {
+			    c_div(&q__1, &temp, &A(j,j));
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+		    } else {
+			i__1 = j + 1;
+			for (i = *n; i >= j+1; --i) {
+			    r_cnjg(&q__3, &A(i,j));
+			    i__2 = ix;
+			    q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, 
+				    q__2.i = q__3.r * X(ix).i + q__3.i * X(
+				    ix).r;
+			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			    ix -= *incx;
+/* L190: */
+			}
+			if (nounit) {
+			    r_cnjg(&q__2, &A(j,j));
+			    c_div(&q__1, &temp, &q__2);
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+		    }
+		    i__1 = jx;
+		    X(jx).r = temp.r, X(jx).i = temp.i;
+		    jx -= *incx;
+/* L200: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of CTRSV . */
+
+} /* ctrsv_ */
+
diff --git a/CBLAS/dasum.c b/CBLAS/dasum.c
new file mode 100644
index 0000000..42d1c74
--- /dev/null
+++ b/CBLAS/dasum.c
@@ -0,0 +1,88 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+doublereal dasum_(integer *n, doublereal *dx, integer *incx)
+{
+
+/*     takes the sum of the absolute values:
+
+          dasum := |dx(1)| + |dx(1+incx)| + ...     (n terms)
+
+       n     - number of elements to sum.
+       dx    - input vector, addressed 1-based through DX(); only read.
+       incx  - stride between successive elements of dx.
+
+       Returns the sum, or 0 when n <= 0 or incx <= 0.
+
+       jack dongarra, linpack, 3/11/78.
+       modified 3/93 to return if incx .le. 0.
+       modified 12/3/93, array(1) declarations changed to array(*) */
+
+    /* System generated locals */
+    doublereal ret_val, d__1, d__2, d__3, d__4, d__5, d__6;
+
+    /* Local variables. Automatic, not static as f2c emitted them, so
+       that concurrent calls do not share the running sum. */
+    integer i, m;
+    doublereal dtemp;
+    integer nincx, mp1;
+
+#define DX(I) dx[(I)-1]
+
+/*     NOTE(review): abs is presumably the type-generic macro from f2c.h;
+       confirm, since the integer abs() of <stdlib.h> would truncate. */
+
+    ret_val = 0.;
+    dtemp = 0.;
+    if (*n <= 0 || *incx <= 0) {
+	return ret_val;
+    }
+    if (*incx == 1) {
+	goto L20;
+    }
+
+/*        code for increment not equal to 1 */
+
+    /* incx > 0 here (guard above), so the index only ever ascends. */
+    nincx = *n * *incx;
+    for (i = 1; i <= nincx; i += *incx) {
+	dtemp += (d__1 = DX(i), abs(d__1));
+/* L10: */
+    }
+    ret_val = dtemp;
+    return ret_val;
+
+/*        code for increment equal to 1:
+          clean-up loop for the first n mod 6 elements, then unrolled by 6 */
+
+L20:
+    m = *n % 6;
+    if (m == 0) {
+	goto L40;
+    }
+    for (i = 1; i <= m; ++i) {
+	dtemp += (d__1 = DX(i), abs(d__1));
+/* L30: */
+    }
+    if (*n < 6) {
+	goto L60;
+    }
+L40:
+    mp1 = m + 1;
+    for (i = mp1; i <= *n; i += 6) {
+	dtemp = dtemp + (d__1 = DX(i), abs(d__1)) + (d__2 = DX(i + 1), abs(
+		d__2)) + (d__3 = DX(i + 2), abs(d__3)) + (d__4 = DX(i + 3), 
+		abs(d__4)) + (d__5 = DX(i + 4), abs(d__5)) + (d__6 = DX(i + 5)
+		, abs(d__6));
+/* L50: */
+    }
+L60:
+    ret_val = dtemp;
+    return ret_val;
+} /* dasum_ */
+
diff --git a/CBLAS/daxpy.c b/CBLAS/daxpy.c
new file mode 100644
index 0000000..953ae62
--- /dev/null
+++ b/CBLAS/daxpy.c
@@ -0,0 +1,94 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int daxpy_(integer *n, doublereal *da, doublereal *dx, 
+	integer *incx, doublereal *dy, integer *incy)
+{
+
+/*     constant times a vector plus a vector:
+
+          dy(k) := dy(k) + da * dx(k)     for n element pairs.
+
+       uses unrolled loops for increments equal to one.
+
+       n     - number of element pairs; nothing is done when n <= 0.
+       da    - scalar multiplier; nothing is done when it is zero.
+       dx    - input vector, read with stride incx.
+       incx  - stride for dx; when negative the walk starts at
+               element (1-n)*incx + 1.
+       dy    - vector updated in place with stride incy.
+       incy  - stride for dy, same convention as incx.
+
+       Returns 0 always.
+
+       jack dongarra, linpack, 3/11/78.
+       modified 12/3/93, array(1) declarations changed to array(*) */
+
+    /* Local variables. Automatic, not static as f2c emitted them, so
+       that concurrent calls do not share loop state. */
+    integer i, m, ix, iy, mp1;
+
+#define DY(I) dy[(I)-1]
+#define DX(I) dx[(I)-1]
+
+    if (*n <= 0) {
+	return 0;
+    }
+    if (*da == 0.) {
+	return 0;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*        code for unequal increments or equal increments
+            not equal to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    for (i = 1; i <= *n; ++i) {
+	DY(iy) += *da * DX(ix);
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    return 0;
+
+/*        code for both increments equal to 1:
+          clean-up loop for the first n mod 4 elements, then unrolled by 4 */
+
+L20:
+    m = *n % 4;
+    if (m == 0) {
+	goto L40;
+    }
+    for (i = 1; i <= m; ++i) {
+	DY(i) += *da * DX(i);
+/* L30: */
+    }
+    if (*n < 4) {
+	return 0;
+    }
+L40:
+    mp1 = m + 1;
+    for (i = mp1; i <= *n; i += 4) {
+	DY(i) += *da * DX(i);
+	DY(i + 1) += *da * DX(i + 1);
+	DY(i + 2) += *da * DX(i + 2);
+	DY(i + 3) += *da * DX(i + 3);
+/* L50: */
+    }
+    return 0;
+} /* daxpy_ */
+
diff --git a/CBLAS/dcabs1.c b/CBLAS/dcabs1.c
new file mode 100644
index 0000000..f4f9f77
--- /dev/null
+++ b/CBLAS/dcabs1.c
@@ -0,0 +1,28 @@
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+doublereal dcabs1_(doublecomplex *z)
+{
+/*     returns |re(z)| + |im(z)| for the doublecomplex value z.
+
+       z  - value to measure; only its r and i members are read.
+
+       The members are read directly. The f2c output copied z into a
+       static doublecomplex and re-read it through a doublereal pointer,
+       which shared state between calls and depended on the struct
+       layout.
+
+       NOTE(review): abs is presumably the type-generic macro from f2c.h;
+       confirm, since the integer abs() of <stdlib.h> would truncate. */
+    doublereal ret_val;
+
+    ret_val = abs(z->r) + abs(z->i);
+    return ret_val;
+} /* dcabs1_ */
+
+
+
diff --git a/CBLAS/dcopy.c b/CBLAS/dcopy.c
new file mode 100644
index 0000000..a428b8f
--- /dev/null
+++ b/CBLAS/dcopy.c
@@ -0,0 +1,94 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int dcopy_(integer *n, doublereal *dx, integer *incx, 
+	doublereal *dy, integer *incy)
+{
+
+/*     copies a vector, x, to a vector, y:
+
+          dy(k) := dx(k)     for n element pairs.
+
+       uses unrolled loops for increments equal to one.
+
+       n     - number of elements to copy; nothing is done when n <= 0.
+       dx    - source vector, read with stride incx.
+       incx  - stride for dx; when negative the walk starts at
+               element (1-n)*incx + 1.
+       dy    - destination vector, written with stride incy; elements
+               between the strided positions are left untouched.
+       incy  - stride for dy, same convention as incx.
+
+       Returns 0 always.
+
+       jack dongarra, linpack, 3/11/78.
+       modified 12/3/93, array(1) declarations changed to array(*) */
+
+    /* Local variables. Automatic, not static as f2c emitted them, so
+       that concurrent calls do not share loop state. */
+    integer i, m, ix, iy, mp1;
+
+#define DY(I) dy[(I)-1]
+#define DX(I) dx[(I)-1]
+
+    if (*n <= 0) {
+	return 0;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*        code for unequal increments or equal increments
+            not equal to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    for (i = 1; i <= *n; ++i) {
+	DY(iy) = DX(ix);
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    return 0;
+
+/*        code for both increments equal to 1:
+          clean-up loop for the first n mod 7 elements, then unrolled by 7 */
+
+L20:
+    m = *n % 7;
+    if (m == 0) {
+	goto L40;
+    }
+    for (i = 1; i <= m; ++i) {
+	DY(i) = DX(i);
+/* L30: */
+    }
+    if (*n < 7) {
+	return 0;
+    }
+L40:
+    mp1 = m + 1;
+    for (i = mp1; i <= *n; i += 7) {
+	DY(i) = DX(i);
+	DY(i + 1) = DX(i + 1);
+	DY(i + 2) = DX(i + 2);
+	DY(i + 3) = DX(i + 3);
+	DY(i + 4) = DX(i + 4);
+	DY(i + 5) = DX(i + 5);
+	DY(i + 6) = DX(i + 6);
+/* L50: */
+    }
+    return 0;
+} /* dcopy_ */
+
diff --git a/CBLAS/ddot.c b/CBLAS/ddot.c
new file mode 100644
index 0000000..10ed2b6
--- /dev/null
+++ b/CBLAS/ddot.c
@@ -0,0 +1,97 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+doublereal ddot_(integer *n, doublereal *dx, integer *incx, doublereal *dy, 
+	integer *incy)
+{
+
+/*     forms the dot product of two vectors:
+
+          ddot := sum over k of dx(k) * dy(k)     (n terms)
+
+       uses unrolled loops for increments equal to one.
+
+       n     - number of element pairs to multiply.
+       dx    - first vector, read with stride incx.
+       incx  - stride for dx; when negative the walk starts at
+               element (1-n)*incx + 1.
+       dy    - second vector, read with stride incy.
+       incy  - stride for dy, same convention as incx.
+
+       Returns the dot product, or 0 when n <= 0.
+
+       jack dongarra, linpack, 3/11/78.
+       modified 12/3/93, array(1) declarations changed to array(*) */
+
+    /* System generated locals */
+    doublereal ret_val;
+    /* Local variables. Automatic, not static as f2c emitted them, so
+       that concurrent calls do not share the running sum. */
+    integer i, m;
+    doublereal dtemp;
+    integer ix, iy, mp1;
+
+#define DY(I) dy[(I)-1]
+#define DX(I) dx[(I)-1]
+
+    ret_val = 0.;
+    dtemp = 0.;
+    if (*n <= 0) {
+	return ret_val;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*        code for unequal increments or equal increments
+            not equal to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    for (i = 1; i <= *n; ++i) {
+	dtemp += DX(ix) * DY(iy);
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    ret_val = dtemp;
+    return ret_val;
+
+/*        code for both increments equal to 1:
+          clean-up loop for the first n mod 5 elements, then unrolled by 5 */
+
+L20:
+    m = *n % 5;
+    if (m == 0) {
+	goto L40;
+    }
+    for (i = 1; i <= m; ++i) {
+	dtemp += DX(i) * DY(i);
+/* L30: */
+    }
+    if (*n < 5) {
+	goto L60;
+    }
+L40:
+    mp1 = m + 1;
+    for (i = mp1; i <= *n; i += 5) {
+	dtemp = dtemp + DX(i) * DY(i) + DX(i + 1) * DY(i + 1) + DX(i + 2) * 
+		DY(i + 2) + DX(i + 3) * DY(i + 3) + DX(i + 4) * DY(i + 4);
+/* L50: */
+    }
+L60:
+    ret_val = dtemp;
+    return ret_val;
+} /* ddot_ */
+
diff --git a/CBLAS/dgemm.c b/CBLAS/dgemm.c
new file mode 100644
index 0000000..31c7e59
--- /dev/null
+++ b/CBLAS/dgemm.c
@@ -0,0 +1,393 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int dgemm_(char *transa, char *transb, integer *m, integer *
+	n, integer *k, doublereal *alpha, doublereal *a, integer *lda, 
+	doublereal *b, integer *ldb, doublereal *beta, doublereal *c, integer 
+	*ldc)
+{
+
+
+    /* Local variables */
+    integer info;
+    logical nota, notb, tra, trb;
+    doublereal temp;
+    integer i, j, l;
+    integer nrowa, nrowb;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+/*  Purpose
+    =======
+
+    DGEMM  performs one of the matrix-matrix operations
+
+       C := alpha*op( A )*op( B ) + beta*C,
+
+    where  op( X ) is one of
+
+       op( X ) = X   or   op( X ) = X',
+
+    alpha and beta are scalars, and A, B and C are matrices, with op( A )
+    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+
+    Parameters
+    ==========
+
+    TRANSA - CHARACTER*1.
+             On entry, TRANSA specifies the form of op( A ) to be used in
+             the matrix multiplication as follows:
+
+                TRANSA = 'N' or 'n',  op( A ) = A.
+
+                TRANSA = 'T' or 't',  op( A ) = A'.
+
+                TRANSA = 'C' or 'c',  op( A ) = A'.
+
+             Unchanged on exit.
+
+    TRANSB - CHARACTER*1.
+             On entry, TRANSB specifies the form of op( B ) to be used in
+             the matrix multiplication as follows:
+
+                TRANSB = 'N' or 'n',  op( B ) = B.
+
+                TRANSB = 'T' or 't',  op( B ) = B'.
+
+                TRANSB = 'C' or 'c',  op( B ) = B'.
+
+             Unchanged on exit.
+
+    M      - INTEGER.
+             On entry,  M  specifies  the number  of rows  of the  matrix
+             op( A )  and of the  matrix  C.  M  must  be at least  zero.
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry,  N  specifies the number  of columns of the matrix
+             op( B ) and the number of columns of the matrix C. N must be
+             at least zero.
+             Unchanged on exit.
+
+    K      - INTEGER.
+             On entry,  K  specifies  the number of columns of the matrix
+             op( A ) and the number of rows of the matrix op( B ). K must
+             be at least  zero.
+             Unchanged on exit.
+
+    ALPHA  - DOUBLE PRECISION.
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    A      - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
+             k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
+             Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
+             part of the array  A  must contain the matrix  A,  otherwise
+             the leading  k by m  part of the array  A  must contain  the
+             matrix A.
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+             in the calling (sub) program. When  TRANSA = 'N' or 'n' then
+             LDA must be at least  max( 1, m ), otherwise  LDA must be at
+             least  max( 1, k ).
+             Unchanged on exit.
+
+    B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is
+             n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
+             Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
+             part of the array  B  must contain the matrix  B,  otherwise
+             the leading  n by k  part of the array  B  must contain  the
+             matrix B.
+             Unchanged on exit.
+
+    LDB    - INTEGER.
+             On entry, LDB specifies the first dimension of B as declared
+             in the calling (sub) program. When  TRANSB = 'N' or 'n' then
+             LDB must be at least  max( 1, k ), otherwise  LDB must be at
+             least  max( 1, n ).
+             Unchanged on exit.
+
+    BETA   - DOUBLE PRECISION.
+             On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+             supplied as zero then C need not be set on input.
+             Unchanged on exit.
+
+    C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
+             Before entry, the leading  m by n  part of the array  C must
+             contain the matrix  C,  except when  beta  is zero, in which
+             case C need not be set on entry.
+             On exit, the array  C  is overwritten by the  m by n  matrix
+             ( alpha*op( A )*op( B ) + beta*C ).
+
+    LDC    - INTEGER.
+             On entry, LDC specifies the first dimension of C as declared
+             in  the  calling  (sub)  program.   LDC  must  be  at  least
+             max( 1, m ).
+             Unchanged on exit.
+
+    Storage
+    =======
+
+    A, B and C are addressed in column-major order through the macros
+    defined below: element ( i, j ) of A, with i and j counted from 1,
+    is a[ (i-1) + (j-1)*LDA ], and likewise for B with LDB and C with
+    LDC.
+
+    Return value
+    ============
+
+    Always 0. The first invalid argument found is reported through
+    input_error_dist("DGEMM ", &info) and C is left untouched; info is
+    the 1-based position of that argument:
+
+       info =  1   TRANSA is not one of N, n, T, t, C, c
+       info =  2   TRANSB is not one of N, n, T, t, C, c
+       info =  3   M < 0
+       info =  4   N < 0
+       info =  5   K < 0
+       info =  8   LDA < max( 1, nrowa )
+       info = 10   LDB < max( 1, nrowb )
+       info = 13   LDC < max( 1, m )
+
+    Quick returns
+    =============
+
+    Nothing is done when M = 0 or N = 0, or when BETA = 1 and either
+    ALPHA = 0 or K = 0. When ALPHA = 0 otherwise, C is set to zero
+    (BETA = 0) or scaled by BETA, and A and B are not referenced.
+
+    Method
+    ======
+
+    When op( A ) = A, column j of C is first scaled by BETA (or set to
+    zero when BETA = 0) and then, for l = 1..k, column l of A times
+    alpha*op( B )( l, j ) is added to it; terms whose B entry is exactly
+    zero are skipped.
+
+    When op( A ) = A', each C( i, j ) is formed from a single inner
+    product over l = 1..k of A( l, i ) with op( B )( l, j ), which is then
+    multiplied by ALPHA and combined with BETA*C( i, j ).
+
+    Numerical notes
+    ===============
+
+    The tests against zero are exact. Because products with a zero
+    entry of op( B ) are skipped when op( A ) = A, a NaN or Inf in the
+    matching column of A does not reach C in that case.
+
+    When BETA = 0 the values held by C on entry are never used: they
+    are either overwritten by zeros before anything is added or replaced
+    outright, so a NaN or Inf in C on entry does not reach the result.
+
+    Implementation notes
+    ====================
+
+    Only the first character of TRANSA and TRANSB is examined and
+    either case is accepted. The f2c output compared them with strncmp
+    against "N", "T" and "C" only, so the documented lower-case forms
+    were rejected with info = 1 or info = 2.
+
+    Locals are automatic. The f2c output declared them static, which
+    made concurrent calls from different threads share scratch state.
+    The unused f2c temporaries (a_dim1, a_offset, ..., i__1, i__2, i__3)
+    and the never-read local NCOLA are gone.
+
+    Level 3 Blas routine.
+
+    -- Written on 8-February-1989.
+       Jack Dongarra, Argonne National Laboratory.
+       Iain Duff, AERE Harwell.
+       Jeremy Du Croz, Numerical Algorithms Group Ltd.
+       Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
+#define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)]
+
+/*     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+       transposed,  TRA  and  TRB  as true if they are to be transposed,
+       and set  NROWA  and  NROWB  as the number of rows of  A  and of
+       B  respectively. Only the first character of each option is
+       examined and either case is accepted. */
+
+    nota = (*transa == 'N' || *transa == 'n');
+    tra = (*transa == 'T' || *transa == 't' ||
+           *transa == 'C' || *transa == 'c');
+
+    notb = (*transb == 'N' || *transb == 'n');
+    trb = (*transb == 'T' || *transb == 't' ||
+           *transb == 'C' || *transb == 'c');
+
+    nrowa = nota ? *m : *k;
+    nrowb = notb ? *k : *n;
+
+/*     Test the input parameters. */
+
+    info = 0;
+    if (! nota && ! tra) {
+        info = 1;
+    } else if (! notb && ! trb) {
+        info = 2;
+    } else if (*m < 0) {
+        info = 3;
+    } else if (*n < 0) {
+        info = 4;
+    } else if (*k < 0) {
+        info = 5;
+    } else if (*lda < max(1,nrowa)) {
+        info = 8;
+    } else if (*ldb < max(1,nrowb)) {
+        info = 10;
+    } else if (*ldc < max(1,*m)) {
+        info = 13;
+    }
+    if (info != 0) {
+        input_error_dist("DGEMM ", &info);
+        return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 ||
+        ((*alpha == 0. || *k == 0) && *beta == 1.)) {
+        return 0;
+    }
+
+/*     And if  alpha.eq.zero: C := beta*C, with exact zeros stored when
+       beta is zero so that C need not be set on entry. */
+
+    if (*alpha == 0.) {
+        if (*beta == 0.) {
+            for (j = 1; j <= *n; ++j) {
+                for (i = 1; i <= *m; ++i) {
+                    C(i,j) = 0.;
+                }
+            }
+        } else {
+            for (j = 1; j <= *n; ++j) {
+                for (i = 1; i <= *m; ++i) {
+                    C(i,j) = *beta * C(i,j);
+                }
+            }
+        }
+        return 0;
+    }
+
+/*     Start the operations. */
+
+    if (notb) {
+        if (nota) {
+
+/*           Form  C := alpha*A*B + beta*C. */
+
+            for (j = 1; j <= *n; ++j) {
+
+/*              scale column j of C by beta */
+
+                if (*beta == 0.) {
+                    for (i = 1; i <= *m; ++i) {
+                        C(i,j) = 0.;
+                    }
+                } else if (*beta != 1.) {
+                    for (i = 1; i <= *m; ++i) {
+                        C(i,j) = *beta * C(i,j);
+                    }
+                }
+
+/*              add alpha*B(l,j) times column l of A, skipping zeros */
+
+                for (l = 1; l <= *k; ++l) {
+                    if (B(l,j) != 0.) {
+                        temp = *alpha * B(l,j);
+                        for (i = 1; i <= *m; ++i) {
+                            C(i,j) += temp * A(i,l);
+                        }
+                    }
+                }
+            }
+        } else {
+
+/*           Form  C := alpha*A'*B + beta*C */
+
+            for (j = 1; j <= *n; ++j) {
+                for (i = 1; i <= *m; ++i) {
+
+/*                 inner product of column i of A with column j of B */
+
+                    temp = 0.;
+                    for (l = 1; l <= *k; ++l) {
+                        temp += A(l,i) * B(l,j);
+                    }
+                    if (*beta == 0.) {
+                        C(i,j) = *alpha * temp;
+                    } else {
+                        C(i,j) = *alpha * temp + *beta * C(i,j);
+                    }
+                }
+            }
+        }
+    } else {
+        if (nota) {
+
+/*           Form  C := alpha*A*B' + beta*C */
+
+            for (j = 1; j <= *n; ++j) {
+
+/*              scale column j of C by beta */
+
+                if (*beta == 0.) {
+                    for (i = 1; i <= *m; ++i) {
+                        C(i,j) = 0.;
+                    }
+                } else if (*beta != 1.) {
+                    for (i = 1; i <= *m; ++i) {
+                        C(i,j) = *beta * C(i,j);
+                    }
+                }
+
+/*              add alpha*B(j,l) times column l of A, skipping zeros */
+
+                for (l = 1; l <= *k; ++l) {
+                    if (B(j,l) != 0.) {
+                        temp = *alpha * B(j,l);
+                        for (i = 1; i <= *m; ++i) {
+                            C(i,j) += temp * A(i,l);
+                        }
+                    }
+                }
+            }
+        } else {
+
+/*           Form  C := alpha*A'*B' + beta*C */
+
+            for (j = 1; j <= *n; ++j) {
+                for (i = 1; i <= *m; ++i) {
+
+/*                 inner product of column i of A with row j of B */
+
+                    temp = 0.;
+                    for (l = 1; l <= *k; ++l) {
+                        temp += A(l,i) * B(j,l);
+                    }
+                    if (*beta == 0.) {
+                        C(i,j) = *alpha * temp;
+                    } else {
+                        C(i,j) = *alpha * temp + *beta * C(i,j);
+                    }
+                }
+            }
+        }
+    }
+
+    return 0;
+
+/*     End of DGEMM . */
+
+} /* dgemm_ */
+
diff --git a/CBLAS/dgemv.c b/CBLAS/dgemv.c
new file mode 100644
index 0000000..a418c07
--- /dev/null
+++ b/CBLAS/dgemv.c
@@ -0,0 +1,298 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int dgemv_(char *trans, integer *m, integer *n, doublereal *
+	alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, 
+	doublereal *beta, doublereal *y, integer *incy)
+{
+
+    /* Local variables */
+    integer info;
+    logical notran;
+    doublereal temp;
+    integer lenx, leny, i, j;
+    integer ix, iy, jx, jy, kx, ky;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+/*  Purpose
+    =======
+
+    DGEMV  performs one of the matrix-vector operations
+
+       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,
+
+    where alpha and beta are scalars, x and y are vectors and A is an
+    m by n matrix.
+
+    Parameters
+    ==========
+
+    TRANS  - CHARACTER*1.
+             On entry, TRANS specifies the operation to be performed as
+             follows:
+
+                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.
+
+                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.
+
+                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.
+
+             Unchanged on exit.
+
+    M      - INTEGER.
+             On entry, M specifies the number of rows of the matrix A.
+             M must be at least zero.
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the number of columns of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - DOUBLE PRECISION.
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+             Before entry, the leading m by n part of the array A must
+             contain the matrix of coefficients.
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+             in the calling (sub) program. LDA must be at least
+             max( 1, m ).
+             Unchanged on exit.
+
+    X      - DOUBLE PRECISION array of DIMENSION at least
+             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+             and at least
+             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+             Before entry, the incremented array X must contain the
+             vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    BETA   - DOUBLE PRECISION.
+             On entry, BETA specifies the scalar beta. When BETA is
+             supplied as zero then Y need not be set on input.
+             Unchanged on exit.
+
+    Y      - DOUBLE PRECISION array of DIMENSION at least
+             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+             and at least
+             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+             Before entry with BETA non-zero, the incremented array Y
+             must contain the vector y. On exit, Y is overwritten by the
+             updated vector y.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+    Return value
+    ============
+
+    Always 0. The first invalid argument found is reported through
+    input_error_dist("DGEMV ", &info) and Y is left untouched; info is
+    the 1-based position of that argument:
+
+       info =  1   TRANS is not one of N, n, T, t, C, c
+       info =  2   M < 0
+       info =  3   N < 0
+       info =  6   LDA < max( 1, m )
+       info =  8   INCX = 0
+       info = 11   INCY = 0
+
+    Y is also left untouched, without any error report, when M = 0 or
+    N = 0, or when ALPHA = 0 and BETA = 1.
+
+    Implementation notes
+    ====================
+
+    Only the first character of TRANS is examined and either case is
+    accepted. The f2c output compared it with strncmp against "N", "T"
+    and "C" only, so the documented lower-case forms were rejected
+    with info = 1.
+
+    Locals are automatic. The f2c output declared them static, which
+    made concurrent calls from different threads share scratch state.
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs. */
+
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+/*     Decode TRANS once. Only its first character is examined and,
+       as documented above, either case is accepted. */
+
+    notran = (*trans == 'N' || *trans == 'n');
+
+/*     Test the input parameters. */
+
+    info = 0;
+    if (! notran && *trans != 'T' && *trans != 't' &&
+        *trans != 'C' && *trans != 'c') {
+        info = 1;
+    } else if (*m < 0) {
+        info = 2;
+    } else if (*n < 0) {
+        info = 3;
+    } else if (*lda < max(1,*m)) {
+        info = 6;
+    } else if (*incx == 0) {
+        info = 8;
+    } else if (*incy == 0) {
+        info = 11;
+    }
+    if (info != 0) {
+        input_error_dist("DGEMV ", &info);
+        return 0;
+    }
+
+/*     Quick return if possible: an empty matrix, or alpha = 0 together
+       with beta = 1, leaves y as it is. */
+
+    if (*m == 0 || *n == 0 || (*alpha == 0. && *beta == 1.)) {
+        return 0;
+    }
+
+/*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set
+       up the start points in  X  and  Y. For a negative increment the
+       vector is walked from its far end, so the start point is
+       1 - (len - 1)*inc rather than 1. */
+
+    lenx = notran ? *n : *m;
+    leny = notran ? *m : *n;
+    kx = (*incx > 0) ? 1 : 1 - (lenx - 1) * *incx;
+    ky = (*incy > 0) ? 1 : 1 - (leny - 1) * *incy;
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through A.
+
+       First form  y := beta*y. */
+
+    if (*beta != 1.) {
+        if (*incy == 1) {
+/*           unit stride in y */
+
+            if (*beta == 0.) {
+                for (i = 1; i <= leny; ++i) {
+                    Y(i) = 0.;
+                }
+            } else {
+                for (i = 1; i <= leny; ++i) {
+                    Y(i) = *beta * Y(i);
+                }
+            }
+        } else {
+
+/*           general stride in y, starting from KY */
+
+            iy = ky;
+            if (*beta == 0.) {
+                for (i = 1; i <= leny; ++i) {
+                    Y(iy) = 0.;
+                    iy += *incy;
+                }
+            } else {
+                for (i = 1; i <= leny; ++i) {
+                    Y(iy) = *beta * Y(iy);
+                    iy += *incy;
+                }
+            }
+        }
+    }
+
+/*     With alpha = 0 the scaled y is already the complete result. */
+
+    if (*alpha == 0.) {
+        return 0;
+    }
+
+    if (notran) {
+
+/*        Form  y := alpha*A*x + y.
+
+          Column j of A, multiplied by alpha*x(j), is added to y;
+          columns whose x(j) is exactly zero are skipped. */
+
+        jx = kx;
+        if (*incy == 1) {
+            for (j = 1; j <= *n; ++j) {
+                if (X(jx) != 0.) {
+                    temp = *alpha * X(jx);
+                    for (i = 1; i <= *m; ++i) {
+                        Y(i) += temp * A(i,j);
+                    }
+                }
+                jx += *incx;
+            }
+        } else {
+            for (j = 1; j <= *n; ++j) {
+                if (X(jx) != 0.) {
+                    temp = *alpha * X(jx);
+                    iy = ky;
+                    for (i = 1; i <= *m; ++i) {
+                        Y(iy) += temp * A(i,j);
+                        iy += *incy;
+                    }
+                }
+                jx += *incx;
+            }
+        }
+    } else {
+
+/*        Form  y := alpha*A'*x + y.
+
+          Element j of y receives alpha times the inner product of
+          column j of A with x. */
+
+        jy = ky;
+        if (*incx == 1) {
+            for (j = 1; j <= *n; ++j) {
+                temp = 0.;
+                for (i = 1; i <= *m; ++i) {
+                    temp += A(i,j) * X(i);
+                }
+                Y(jy) += *alpha * temp;
+                jy += *incy;
+            }
+        } else {
+            for (j = 1; j <= *n; ++j) {
+                temp = 0.;
+                ix = kx;
+                for (i = 1; i <= *m; ++i) {
+                    temp += A(i,j) * X(ix);
+                    ix += *incx;
+                }
+                Y(jy) += *alpha * temp;
+                jy += *incy;
+            }
+        }
+    }
+
+    return 0;
+
+/*     End of DGEMV . */
+
+} /* dgemv_ */
+
diff --git a/CBLAS/dger.c b/CBLAS/dger.c
new file mode 100644
index 0000000..d80ad4e
--- /dev/null
+++ b/CBLAS/dger.c
@@ -0,0 +1,182 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int dger_(integer *m, integer *n, doublereal *alpha, 
+	doublereal *x, integer *incx, doublereal *y, integer *incy, 
+	doublereal *a, integer *lda)
+{
+
+    /* Local variables */
+    integer info;
+    doublereal temp;
+    integer i, j, ix, jy, kx;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+/*  Purpose
+    =======
+
+    DGER   performs the rank 1 operation
+
+       A := alpha*x*y' + A,
+
+    where alpha is a scalar, x is an m element vector, y is an n element
+    vector and A is an m by n matrix.
+
+    Parameters
+    ==========
+
+    M      - INTEGER.
+             On entry, M specifies the number of rows of the matrix A.
+             M must be at least zero.
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the number of columns of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - DOUBLE PRECISION.
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    X      - DOUBLE PRECISION array of dimension at least
+             ( 1 + ( m - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the m
+             element vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    Y      - DOUBLE PRECISION array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCY ) ).
+             Before entry, the incremented array Y must contain the n
+             element vector y.
+             Unchanged on exit.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+    A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+             Before entry, the leading m by n part of the array A must
+             contain the matrix of coefficients. On exit, A is
+             overwritten by the updated matrix.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+             in the calling (sub) program. LDA must be at least
+             max( 1, m ).
+             Unchanged on exit.
+
+    Return value
+    ============
+
+    Always 0. The first invalid argument found is reported through
+    input_error_dist("DGER  ", &info) and A is left untouched; info is
+    the 1-based position of that argument:
+
+       info = 1   M < 0
+       info = 2   N < 0
+       info = 5   INCX = 0
+       info = 7   INCY = 0
+       info = 9   LDA < max( 1, m )
+
+    A is also left untouched, without any error report, when M = 0,
+    N = 0 or ALPHA = 0.
+
+    Implementation notes
+    ====================
+
+    Locals are automatic. The f2c output declared them static, which
+    made concurrent calls from different threads share scratch state.
+    The unused f2c temporaries (a_dim1, a_offset, i__1, i__2) are gone.
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs. */
+
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+/*     Test the input parameters. */
+
+    info = 0;
+    if (*m < 0) {
+        info = 1;
+    } else if (*n < 0) {
+        info = 2;
+    } else if (*incx == 0) {
+        info = 5;
+    } else if (*incy == 0) {
+        info = 7;
+    } else if (*lda < max(1,*m)) {
+        info = 9;
+    }
+    if (info != 0) {
+        input_error_dist("DGER  ", &info);
+        return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 || *alpha == 0.) {
+        return 0;
+    }
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through A.
+
+       JY and KX are the 1-based start positions in Y and X; for a
+       negative increment the vector is walked from its far end. */
+
+    jy = (*incy > 0) ? 1 : 1 - (*n - 1) * *incy;
+
+    if (*incx == 1) {
+        for (j = 1; j <= *n; ++j) {
+            if (Y(jy) != 0.) {
+                temp = *alpha * Y(jy);
+                for (i = 1; i <= *m; ++i) {
+                    A(i,j) += X(i) * temp;
+                }
+            }
+            jy += *incy;
+        }
+        return 0;
+    }
+
+/*     General increment for X: restart from KX for every column. */
+
+    kx = (*incx > 0) ? 1 : 1 - (*m - 1) * *incx;
+    for (j = 1; j <= *n; ++j) {
+        if (Y(jy) != 0.) {
+            temp = *alpha * Y(jy);
+            ix = kx;
+            for (i = 1; i <= *m; ++i) {
+                A(i,j) += X(ix) * temp;
+                ix += *incx;
+            }
+        }
+        jy += *incy;
+    }
+
+    return 0;
+
+/*     End of DGER  . */
+
+} /* dger_ */
+
diff --git a/CBLAS/dnrm2.c b/CBLAS/dnrm2.c
new file mode 100644
index 0000000..602813b
--- /dev/null
+++ b/CBLAS/dnrm2.c
@@ -0,0 +1,83 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+doublereal dnrm2_(integer *n, doublereal *x, integer *incx)
+{
+
+/*  DNRM2 returns the euclidean norm of a vector via the function
+    name, so that
+
+       DNRM2 := sqrt( x'*x )
+
+    The sum of squares is carried as scale**2 * ssq, with scale the
+    largest absolute value seen so far, so that only ratios of
+    magnitude at most one are squared.
+
+    Parameters
+    ==========
+
+    N      - INTEGER. Number of elements of X that are used.
+    X      - DOUBLE PRECISION array, read only.
+    INCX   - INTEGER. Increment between the elements of X.
+
+    The result is zero when N < 1 or INCX < 1.
+
+    -- This version written on 25-October-1982.
+       Modified on 14-October-1993 to inline the call to DLASSQ.
+       Sven Hammarling, Nag Ltd.
+
+    Locals are automatic (the f2c output made them static, which is
+    not safe when the routine is entered from several threads). */
+
+    /* Builtin functions */
+    double sqrt(doublereal);
+
+    /* Local variables */
+    doublereal scale, ssq, absxi, ratio;
+    integer ix, last;
+
+#define X(I) x[(I)-1]
+
+    if (*n < 1 || *incx < 1) {
+        return 0.;
+    }
+
+/*     Absolute values are spelled out with a comparison instead of
+       abs(): if abs is not the f2c.h macro at this point it would name
+       the C library's integer abs() and truncate the operand. */
+
+    if (*n == 1) {
+        return X(1) >= 0. ? X(1) : -X(1);
+    }
+
+/*        The following loop is equivalent to this call to the LAPACK
+          auxiliary routine:
+          CALL DLASSQ( N, X, INCX, SCALE, SSQ ) */
+
+    scale = 0.;
+    ssq = 1.;
+    last = (*n - 1) * *incx + 1;
+    for (ix = 1; ix <= last; ix += *incx) {
+        if (X(ix) == 0.) {
+            continue;
+        }
+        absxi = X(ix) >= 0. ? X(ix) : -X(ix);
+        if (scale < absxi) {
+/* Computing 2nd power */
+            ratio = scale / absxi;
+            ssq = ssq * (ratio * ratio) + 1.;
+            scale = absxi;
+        } else {
+            ratio = absxi / scale;
+            ssq += ratio * ratio;
+        }
+    }
+    return scale * sqrt(ssq);
+/*     End of DNRM2. */
+} /* dnrm2_ */
+
diff --git a/CBLAS/drot.c b/CBLAS/drot.c
new file mode 100644
index 0000000..bc5264b
--- /dev/null
+++ b/CBLAS/drot.c
@@ -0,0 +1,76 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int drot_(integer *n, doublereal *dx, integer *incx, 
+	doublereal *dy, integer *incy, doublereal *c, doublereal *s)
+{
+/*  Purpose
+    =======
+
+    DROT applies a plane rotation. For each of the N element pairs
+
+       temp  := c*dx(i) + s*dy(i)
+       dy(i) := c*dy(i) - s*dx(i)
+       dx(i) := temp
+
+    jack dongarra, linpack, 3/11/78.
+    modified 12/3/93, array(1) declarations changed to array(*)
+
+    Parameters
+    ==========
+
+    N      - INTEGER. Number of element pairs to rotate; nothing is
+             done when N <= 0. The routine always returns 0.
+    DX     - DOUBLE PRECISION array, overwritten in place.
+    INCX   - INTEGER. Increment between the elements of DX. When
+             negative, traversal starts at 1-based index (1-N)*INCX + 1.
+    DY     - DOUBLE PRECISION array, overwritten in place.
+    INCY   - INTEGER. Increment between the elements of DY (as INCX).
+    C, S   - DOUBLE PRECISION. Coefficients of the rotation, read only.
+
+    Locals are automatic; the f2c output declared them static, which
+    made concurrent calls from different threads share scratch state. */
+
+    /* Local variables */
+    integer i, ix, iy;
+    doublereal dtemp;
+
+#define DY(I) dy[(I)-1]
+#define DX(I) dx[(I)-1]
+
+    if (*n <= 0) {
+        return 0;
+    }
+
+    if (*incx == 1 && *incy == 1) {
+
+/*       code for both increments equal to 1 */
+
+        for (i = 1; i <= *n; ++i) {
+            dtemp = *c * DX(i) + *s * DY(i);
+            DY(i) = *c * DY(i) - *s * DX(i);
+            DX(i) = dtemp;
+        }
+        return 0;
+    }
+
+/*       code for unequal increments or equal increments not equal
+           to 1 */
+
+    ix = (*incx < 0) ? (-(*n) + 1) * *incx + 1 : 1;
+    iy = (*incy < 0) ? (-(*n) + 1) * *incy + 1 : 1;
+    for (i = 1; i <= *n; ++i) {
+        dtemp = *c * DX(ix) + *s * DY(iy);
+        DY(iy) = *c * DY(iy) - *s * DX(ix);
+        DX(ix) = dtemp;
+        ix += *incx;
+        iy += *incy;
+    }
+    return 0;
+} /* drot_ */
+
diff --git a/CBLAS/dscal.c b/CBLAS/dscal.c
new file mode 100644
index 0000000..2444740
--- /dev/null
+++ b/CBLAS/dscal.c
@@ -0,0 +1,83 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int dscal_(integer *n, doublereal *da, doublereal *dx, 
+	integer *incx)
+{
+
+/*  Purpose
+    =======
+
+    DSCAL scales a vector by a constant:
+
+       dx(i) := da*dx(i)
+
+    uses unrolled loops for increment equal to one.
+    jack dongarra, linpack, 3/11/78.
+    modified 3/93 to return if incx .le. 0.
+    modified 12/3/93, array(1) declarations changed to array(*)
+
+    Parameters
+    ==========
+
+    N      - INTEGER. Number of elements to scale.
+    DA     - DOUBLE PRECISION. The scale factor, read only.
+    DX     - DOUBLE PRECISION array, overwritten in place.
+    INCX   - INTEGER. Increment between the elements of DX.
+
+    Nothing is done when N <= 0 or INCX <= 0. Always returns 0.
+
+    Locals are automatic; the f2c output declared them static, which
+    made concurrent calls from different threads share loop indices. */
+
+    /* Local variables */
+    integer i, m, nincx;
+
+#define DX(I) dx[(I)-1]
+
+    if (*n <= 0 || *incx <= 0) {
+        return 0;
+    }
+
+    if (*incx != 1) {
+
+/*        code for increment not equal to 1. INCX is known to be
+          positive here, so the descending-loop test that f2c emitted
+          for a negative step could never be taken and is dropped. */
+
+        nincx = *n * *incx;
+        for (i = 1; i <= nincx; i += *incx) {
+            DX(i) = *da * DX(i);
+        }
+        return 0;
+    }
+
+/*        code for increment equal to 1
+
+          clean-up loop: scale the first n mod 5 elements one at a time */
+
+    m = *n % 5;
+    for (i = 1; i <= m; ++i) {
+        DX(i) = *da * DX(i);
+    }
+
+/*        main loop, unrolled by 5; it does not execute when n < 5 and
+          m != 0 because m + 1 already exceeds n */
+
+    for (i = m + 1; i <= *n; i += 5) {
+        DX(i) = *da * DX(i);
+        DX(i + 1) = *da * DX(i + 1);
+        DX(i + 2) = *da * DX(i + 2);
+        DX(i + 3) = *da * DX(i + 3);
+        DX(i + 4) = *da * DX(i + 4);
+    }
+
+    return 0;
+
+} /* dscal_ */
+
diff --git a/CBLAS/dsymv.c b/CBLAS/dsymv.c
new file mode 100644
index 0000000..da8df4b
--- /dev/null
+++ b/CBLAS/dsymv.c
@@ -0,0 +1,299 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int dsymv_(char *uplo, integer *n, doublereal *alpha, 
+	doublereal *a, integer *lda, doublereal *x, integer *incx, doublereal 
+	*beta, doublereal *y, integer *incy)
+{
+    /* Local variables.  f2c emitted these as static; they are automatic
+       here so that concurrent calls do not share state. */
+    integer info;
+    doublereal temp1, temp2;
+    integer i, j;
+    integer ix, iy, jx, jy, kx, ky;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+    /* UPLO is matched without regard to case, as documented below. */
+    const int upper = (*uplo == 'U' || *uplo == 'u');
+    const int lower = (*uplo == 'L' || *uplo == 'l');
+
+
+/*  Purpose
+    =======
+
+    DSYMV  performs the matrix-vector  operation
+
+       y := alpha*A*x + beta*y,
+
+    where alpha and beta are scalars, x and y are n element vectors and
+    A is an n by n symmetric matrix.
+
+    Parameters
+    ==========
+
+    UPLO   - CHARACTER*1.
+             On entry, UPLO specifies whether the upper or lower
+             triangular part of the array A is to be referenced as
+             follows:
+
+                UPLO = 'U' or 'u'   Only the upper triangular part of A
+                                    is to be referenced.
+
+                UPLO = 'L' or 'l'   Only the lower triangular part of A
+                                    is to be referenced.
+
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the order of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - DOUBLE PRECISION.
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+             Before entry with  UPLO = 'U' or 'u', the leading n by n
+             upper triangular part of the array A must contain the upper
+
+             triangular part of the symmetric matrix and the strictly
+             lower triangular part of A is not referenced.
+             Before entry with UPLO = 'L' or 'l', the leading n by n
+             lower triangular part of the array A must contain the lower
+
+             triangular part of the symmetric matrix and the strictly
+             upper triangular part of A is not referenced.
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+
+             in the calling (sub) program. LDA must be at least
+             max( 1, n ).
+             Unchanged on exit.
+
+    X      - DOUBLE PRECISION array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the n
+             element vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    BETA   - DOUBLE PRECISION.
+             On entry, BETA specifies the scalar beta. When BETA is
+             supplied as zero then Y need not be set on input.
+             Unchanged on exit.
+
+    Y      - DOUBLE PRECISION array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCY ) ).
+             Before entry, the incremented array Y must contain the n
+             element vector y. On exit, Y is overwritten by the updated
+             vector y.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+
+
+       Test the input parameters.
+
+
+   Parameter adjustments
+       Function Body */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+    info = 0;
+    if (!upper && !lower) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*lda < max(1,*n)) {
+	info = 5;
+    } else if (*incx == 0) {
+	info = 7;
+    } else if (*incy == 0) {
+	info = 10;
+    }
+    if (info != 0) {
+	input_error_dist("DSYMV ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
+	return 0;
+    }
+
+/*     Set up the start points in  X  and  Y. */
+
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through the triangular part
+       of A.
+
+       First form  y := beta*y. */
+
+    if (*beta != 1.) {
+	if (*incy == 1) {
+	    if (*beta == 0.) {
+		/* store zeros rather than multiply, so Y need not be set */
+		for (i = 1; i <= *n; ++i) {
+		    Y(i) = 0.;
+/* L10: */
+		}
+	    } else {
+		/* general beta: scale Y in place */
+		for (i = 1; i <= *n; ++i) {
+		    Y(i) = *beta * Y(i);
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (*beta == 0.) {
+		/* store zeros rather than multiply, stepping by INCY */
+		for (i = 1; i <= *n; ++i) {
+		    Y(iy) = 0.;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		/* general beta: scale Y in place, stepping by INCY */
+		for (i = 1; i <= *n; ++i) {
+		    Y(iy) = *beta * Y(iy);
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (*alpha == 0.) {
+	return 0;
+    }
+    if (upper) {
+
+/*        Form  y  when A is stored in upper triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    /* one stored column of A per iteration */
+	    for (j = 1; j <= *n; ++j) {
+		temp1 = *alpha * X(j);
+		temp2 = 0.;
+		/* rows above the diagonal: A(i,j) also serves as A(j,i) */
+		for (i = 1; i <= j-1; ++i) {
+		    Y(i) += temp1 * A(i,j);
+		    temp2 += A(i,j) * X(i);
+/* L50: */
+		}
+		Y(j) = Y(j) + temp1 * A(j,j) + *alpha * temp2;
+/* L60: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    /* one stored column of A per iteration */
+	    for (j = 1; j <= *n; ++j) {
+		temp1 = *alpha * X(jx);
+		temp2 = 0.;
+		ix = kx;
+		iy = ky;
+		/* rows above the diagonal: A(i,j) also serves as A(j,i) */
+		for (i = 1; i <= j-1; ++i) {
+		    Y(iy) += temp1 * A(i,j);
+		    temp2 += A(i,j) * X(ix);
+		    ix += *incx;
+		    iy += *incy;
+/* L70: */
+		}
+		Y(jy) = Y(jy) + temp1 * A(j,j) + *alpha * temp2;
+		jx += *incx;
+		jy += *incy;
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y  when A is stored in lower triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    /* one stored column of A per iteration */
+	    for (j = 1; j <= *n; ++j) {
+		temp1 = *alpha * X(j);
+		temp2 = 0.;
+		Y(j) += temp1 * A(j,j);
+		/* rows below the diagonal: A(i,j) also serves as A(j,i) */
+		for (i = j + 1; i <= *n; ++i) {
+		    Y(i) += temp1 * A(i,j);
+		    temp2 += A(i,j) * X(i);
+/* L90: */
+		}
+		Y(j) += *alpha * temp2;
+/* L100: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    /* one stored column of A per iteration */
+	    for (j = 1; j <= *n; ++j) {
+		temp1 = *alpha * X(jx);
+		temp2 = 0.;
+		Y(jy) += temp1 * A(j,j);
+		ix = jx;
+		iy = jy;
+		/* rows below the diagonal: A(i,j) also serves as A(j,i) */
+		for (i = j + 1; i <= *n; ++i) {
+		    ix += *incx;
+		    iy += *incy;
+		    Y(iy) += temp1 * A(i,j);
+		    temp2 += A(i,j) * X(ix);
+/* L110: */
+		}
+		Y(jy) += *alpha * temp2;
+		jx += *incx;
+		jy += *incy;
+/* L120: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of DSYMV . */
+
+} /* dsymv_ */
+
diff --git a/CBLAS/dsyr2.c b/CBLAS/dsyr2.c
new file mode 100644
index 0000000..a2fd89c
--- /dev/null
+++ b/CBLAS/dsyr2.c
@@ -0,0 +1,263 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int dsyr2_(char *uplo, integer *n, doublereal *alpha, 
+	doublereal *x, integer *incx, doublereal *y, integer *incy, 
+	doublereal *a, integer *lda)
+{
+    /* Local variables.  f2c emitted these as static; they are automatic
+       here so that concurrent calls do not share state. */
+    integer info;
+    doublereal temp1, temp2;
+    integer i, j;
+    integer ix, iy, jx, jy, kx, ky;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+    /* UPLO is matched without regard to case, as documented below. */
+    const int upper = (*uplo == 'U' || *uplo == 'u');
+    const int lower = (*uplo == 'L' || *uplo == 'l');
+
+
+/*  Purpose
+    =======
+
+    DSYR2  performs the symmetric rank 2 operation
+
+       A := alpha*x*y' + alpha*y*x' + A,
+
+    where alpha is a scalar, x and y are n element vectors and A is an n
+
+    by n symmetric matrix.
+
+    Parameters
+    ==========
+
+    UPLO   - CHARACTER*1.
+             On entry, UPLO specifies whether the upper or lower
+             triangular part of the array A is to be referenced as
+             follows:
+
+                UPLO = 'U' or 'u'   Only the upper triangular part of A
+                                    is to be referenced.
+
+                UPLO = 'L' or 'l'   Only the lower triangular part of A
+                                    is to be referenced.
+
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the order of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - DOUBLE PRECISION.
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    X      - DOUBLE PRECISION array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the n
+             element vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    Y      - DOUBLE PRECISION array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCY ) ).
+             Before entry, the incremented array Y must contain the n
+             element vector y.
+             Unchanged on exit.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+    A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+             Before entry with  UPLO = 'U' or 'u', the leading n by n
+             upper triangular part of the array A must contain the upper
+
+             triangular part of the symmetric matrix and the strictly
+             lower triangular part of A is not referenced. On exit, the
+             upper triangular part of the array A is overwritten by the
+             upper triangular part of the updated matrix.
+             Before entry with UPLO = 'L' or 'l', the leading n by n
+             lower triangular part of the array A must contain the lower
+
+             triangular part of the symmetric matrix and the strictly
+             upper triangular part of A is not referenced. On exit, the
+             lower triangular part of the array A is overwritten by the
+             lower triangular part of the updated matrix.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+
+             in the calling (sub) program. LDA must be at least
+             max( 1, n ).
+             Unchanged on exit.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+
+
+       Test the input parameters.
+
+
+   Parameter adjustments
+       Function Body */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+    info = 0;
+    if (!upper && !lower) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*incx == 0) {
+	info = 5;
+    } else if (*incy == 0) {
+	info = 7;
+    } else if (*lda < max(1,*n)) {
+	info = 9;
+    }
+    if (info != 0) {
+	input_error_dist("DSYR2 ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || *alpha == 0.) {
+	return 0;
+    }
+
+/*     Set up the start points in X and Y.  They are only read when the
+       increments are not both unity, but setting them unconditionally
+       keeps the automatic variables initialised on every path. */
+
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     jx and jy track the elements of X and Y that belong to column j. */
+    jx = kx;
+    jy = ky;
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through the triangular part
+       of A. */
+
+    if (upper) {
+
+/*        Form  A  when A is stored in the upper triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    /* one stored column of A per iteration */
+	    for (j = 1; j <= *n; ++j) {
+		if (X(j) != 0. || Y(j) != 0.) {
+		    temp1 = *alpha * Y(j);
+		    temp2 = *alpha * X(j);
+		    /* rows 1..j of column j, diagonal included */
+		    for (i = 1; i <= j; ++i) {
+			A(i,j) = A(i,j) + X(i) * temp1 
+				+ Y(i) * temp2;
+/* L10: */
+		    }
+		}
+/* L20: */
+	    }
+	} else {
+	    /* one stored column of A per iteration */
+	    for (j = 1; j <= *n; ++j) {
+		if (X(jx) != 0. || Y(jy) != 0.) {
+		    temp1 = *alpha * Y(jy);
+		    temp2 = *alpha * X(jx);
+		    ix = kx;
+		    iy = ky;
+		    /* rows 1..j of column j, diagonal included */
+		    for (i = 1; i <= j; ++i) {
+			A(i,j) = A(i,j) + X(ix) * temp1 
+				+ Y(iy) * temp2;
+			ix += *incx;
+			iy += *incy;
+/* L30: */
+		    }
+		}
+		jx += *incx;
+		jy += *incy;
+/* L40: */
+	    }
+	}
+    } else {
+
+/*        Form  A  when A is stored in the lower triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    /* one stored column of A per iteration */
+	    for (j = 1; j <= *n; ++j) {
+		if (X(j) != 0. || Y(j) != 0.) {
+		    temp1 = *alpha * Y(j);
+		    temp2 = *alpha * X(j);
+		    /* rows j..n of column j, diagonal included */
+		    for (i = j; i <= *n; ++i) {
+			A(i,j) = A(i,j) + X(i) * temp1 
+				+ Y(i) * temp2;
+/* L50: */
+		    }
+		}
+/* L60: */
+	    }
+	} else {
+	    /* one stored column of A per iteration */
+	    for (j = 1; j <= *n; ++j) {
+		if (X(jx) != 0. || Y(jy) != 0.) {
+		    temp1 = *alpha * Y(jy);
+		    temp2 = *alpha * X(jx);
+		    ix = jx;
+		    iy = jy;
+		    /* rows j..n of column j, diagonal included */
+		    for (i = j; i <= *n; ++i) {
+			A(i,j) = A(i,j) + X(ix) * temp1 
+				+ Y(iy) * temp2;
+			ix += *incx;
+			iy += *incy;
+/* L70: */
+		    }
+		}
+		jx += *incx;
+		jy += *incy;
+/* L80: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of DSYR2 . */
+
+} /* dsyr2_ */
+
diff --git a/CBLAS/dtrsm.c b/CBLAS/dtrsm.c
new file mode 100644
index 0000000..2280d36
--- /dev/null
+++ b/CBLAS/dtrsm.c
@@ -0,0 +1,481 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int dtrsm_(char *side, char *uplo, char *transa, char *diag, 
+	integer *m, integer *n, doublereal *alpha, doublereal *a, integer *
+	lda, doublereal *b, integer *ldb)
+{
+    /* Local variables.  f2c emitted these as static; they are automatic
+       here so that concurrent calls do not share state. */
+    integer info;
+    doublereal temp;
+    integer i, j, k;
+    integer nrowa;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+    /* Decoded option characters.  Each option is matched on its first
+       character only and without regard to case, as documented below. */
+    logical lside;
+    logical upper;
+    logical notrans;
+    logical nounit;
+
+
+/*  Purpose
+    =======
+
+    DTRSM  solves one of the matrix equations
+
+       op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+
+    where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+
+    non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+
+
+       op( A ) = A   or   op( A ) = A'.
+
+    The matrix X is overwritten on B.
+
+    Parameters
+    ==========
+
+    SIDE   - CHARACTER*1.
+             On entry, SIDE specifies whether op( A ) appears on the left
+
+             or right of X as follows:
+
+                SIDE = 'L' or 'l'   op( A )*X = alpha*B.
+
+                SIDE = 'R' or 'r'   X*op( A ) = alpha*B.
+
+             Unchanged on exit.
+
+    UPLO   - CHARACTER*1.
+             On entry, UPLO specifies whether the matrix A is an upper or
+
+             lower triangular matrix as follows:
+
+                UPLO = 'U' or 'u'   A is an upper triangular matrix.
+
+                UPLO = 'L' or 'l'   A is a lower triangular matrix.
+
+             Unchanged on exit.
+
+    TRANSA - CHARACTER*1.
+             On entry, TRANSA specifies the form of op( A ) to be used in
+
+             the matrix multiplication as follows:
+
+                TRANSA = 'N' or 'n'   op( A ) = A.
+
+                TRANSA = 'T' or 't'   op( A ) = A'.
+
+                TRANSA = 'C' or 'c'   op( A ) = A'.
+
+             Unchanged on exit.
+
+    DIAG   - CHARACTER*1.
+             On entry, DIAG specifies whether or not A is unit triangular
+
+             as follows:
+
+                DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+
+                DIAG = 'N' or 'n'   A is not assumed to be unit
+                                    triangular.
+
+             Unchanged on exit.
+
+    M      - INTEGER.
+             On entry, M specifies the number of rows of B. M must be at
+
+             least zero.
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the number of columns of B.  N must be
+
+             at least zero.
+             Unchanged on exit.
+
+    ALPHA  - DOUBLE PRECISION.
+             On entry,  ALPHA specifies the scalar  alpha. When  alpha is
+
+             zero then  A is not referenced and  B need not be set before
+
+             entry.
+             Unchanged on exit.
+
+    A      - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m
+
+             when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'.
+
+             Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k
+
+             upper triangular part of the array  A must contain the upper
+
+             triangular matrix  and the strictly lower triangular part of
+
+             A is not referenced.
+             Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k
+
+             lower triangular part of the array  A must contain the lower
+
+             triangular matrix  and the strictly upper triangular part of
+
+             A is not referenced.
+             Note that when  DIAG = 'U' or 'u',  the diagonal elements of
+
+             A  are not referenced either,  but are assumed to be  unity.
+
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+
+             in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
+
+             LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r'
+
+             then LDA must be at least max( 1, n ).
+             Unchanged on exit.
+
+    B      - DOUBLE PRECISION array of DIMENSION ( LDB, n ).
+             Before entry,  the leading  m by n part of the array  B must
+
+             contain  the  right-hand  side  matrix  B,  and  on exit  is
+
+             overwritten by the solution matrix  X.
+
+    LDB    - INTEGER.
+             On entry, LDB specifies the first dimension of B as declared
+
+             in  the  calling  (sub)  program.   LDB  must  be  at  least
+
+             max( 1, m ).
+             Unchanged on exit.
+
+
+    Level 3 Blas routine.
+
+
+    -- Written on 8-February-1989.
+       Jack Dongarra, Argonne National Laboratory.
+       Iain Duff, AERE Harwell.
+       Jeremy Du Croz, Numerical Algorithms Group Ltd.
+       Sven Hammarling, Numerical Algorithms Group Ltd.
+
+       Test the input parameters.
+
+   Parameter adjustments
+       Function Body */
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
+
+    lside = (*side == 'L' || *side == 'l');
+    upper = (*uplo == 'U' || *uplo == 'u');
+    notrans = (*transa == 'N' || *transa == 'n');
+    nounit = (*diag == 'N' || *diag == 'n');
+
+/*     A is m by m when it is applied from the left, n by n otherwise. */
+
+    nrowa = lside ? *m : *n;
+
+    info = 0;
+    if (! lside && *side != 'R' && *side != 'r') {
+	info = 1;
+    } else if (! upper && *uplo != 'L' && *uplo != 'l') {
+	info = 2;
+    } else if (! notrans && *transa != 'T' && *transa != 't'
+	       && *transa != 'C' && *transa != 'c') {
+	info = 3;
+    } else if (! nounit && *diag != 'U' && *diag != 'u') {
+	info = 4;
+    } else if (*m < 0) {
+	info = 5;
+    } else if (*n < 0) {
+	info = 6;
+    } else if (*lda < max(1,nrowa)) {
+	info = 9;
+    } else if (*ldb < max(1,*m)) {
+	info = 11;
+    }
+    if (info != 0) {
+	input_error_dist("DTRSM ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+/*     And when  alpha.eq.zero. */
+
+    if (*alpha == 0.) {
+	/* B := 0, one column at a time; A is never touched */
+	for (j = 1; j <= *n; ++j) {
+	    /* every row of column j */
+	    for (i = 1; i <= *m; ++i) {
+		B(i,j) = 0.;
+/* L10: */
+	    }
+/* L20: */
+	}
+	return 0;
+    }
+
+/*     Start the operations. */
+
+    if (lside) {
+	if (notrans) {
+
+/*           Form  B := alpha*inv( A )*B. */
+
+	    if (upper) {
+		/* solve for one column of B at a time */
+		for (j = 1; j <= *n; ++j) {
+		    if (*alpha != 1.) {
+			/* apply alpha to column j first */
+			for (i = 1; i <= *m; ++i) {
+			    B(i,j) = *alpha * B(i,j);
+/* L30: */
+			}
+		    }
+		    for (k = *m; k >= 1; --k) {
+			if (B(k,j) != 0.) {
+			    if (nounit) {
+				B(k,j) /= A(k,k);
+			    }
+			    /* eliminate B(k,j) from the rows above k */
+			    for (i = 1; i <= k-1; ++i) {
+				B(i,j) -= B(k,j) * A(i,k);
+/* L40: */
+			    }
+			}
+/* L50: */
+		    }
+/* L60: */
+		}
+	    } else {
+		/* solve for one column of B at a time */
+		for (j = 1; j <= *n; ++j) {
+		    if (*alpha != 1.) {
+			/* apply alpha to column j first */
+			for (i = 1; i <= *m; ++i) {
+			    B(i,j) = *alpha * B(i,j);
+/* L70: */
+			}
+		    }
+		    /* forward substitution: first row first */
+		    for (k = 1; k <= *m; ++k) {
+			if (B(k,j) != 0.) {
+			    if (nounit) {
+				B(k,j) /= A(k,k);
+			    }
+			    /* eliminate B(k,j) from the rows below k */
+			    for (i = k + 1; i <= *m; ++i) {
+				B(i,j) -= B(k,j) * A(i,k);
+/* L80: */
+			    }
+			}
+/* L90: */
+		    }
+/* L100: */
+		}
+	    }
+	} else {
+
+/*           Form  B := alpha*inv( A' )*B. */
+
+	    if (upper) {
+		/* solve for one column of B at a time */
+		for (j = 1; j <= *n; ++j) {
+		    /* A' is lower triangular: first row first */
+		    for (i = 1; i <= *m; ++i) {
+			temp = *alpha * B(i,j);
+			/* subtract the rows already solved (k < i) */
+			for (k = 1; k <= i-1; ++k) {
+			    temp -= A(k,i) * B(k,j);
+/* L110: */
+			}
+			if (nounit) {
+			    temp /= A(i,i);
+			}
+			B(i,j) = temp;
+/* L120: */
+		    }
+/* L130: */
+		}
+	    } else {
+		/* solve for one column of B at a time */
+		for (j = 1; j <= *n; ++j) {
+		    for (i = *m; i >= 1; --i) {
+			temp = *alpha * B(i,j);
+			/* subtract the rows already solved (k > i) */
+			for (k = i + 1; k <= *m; ++k) {
+			    temp -= A(k,i) * B(k,j);
+/* L140: */
+			}
+			if (nounit) {
+			    temp /= A(i,i);
+			}
+			B(i,j) = temp;
+/* L150: */
+		    }
+/* L160: */
+		}
+	    }
+	}
+    } else {
+	if (notrans) {
+
+/*           Form  B := alpha*B*inv( A ). */
+
+	    if (upper) {
+		/* columns of B in ascending order */
+		for (j = 1; j <= *n; ++j) {
+		    if (*alpha != 1.) {
+			/* apply alpha to column j first */
+			for (i = 1; i <= *m; ++i) {
+			    B(i,j) = *alpha * B(i,j);
+/* L170: */
+			}
+		    }
+		    /* subtract the columns already solved (k < j) */
+		    for (k = 1; k <= j-1; ++k) {
+			if (A(k,j) != 0.) {
+			    /* update every row of column j */
+			    for (i = 1; i <= *m; ++i) {
+				B(i,j) -= A(k,j) * B(i,k);
+/* L180: */
+			    }
+			}
+/* L190: */
+		    }
+		    if (nounit) {
+			temp = 1. / A(j,j);
+			/* divide column j by the diagonal element */
+			for (i = 1; i <= *m; ++i) {
+			    B(i,j) = temp * B(i,j);
+/* L200: */
+			}
+		    }
+/* L210: */
+		}
+	    } else {
+		for (j = *n; j >= 1; --j) {
+		    if (*alpha != 1.) {
+			/* apply alpha to column j first */
+			for (i = 1; i <= *m; ++i) {
+			    B(i,j) = *alpha * B(i,j);
+/* L220: */
+			}
+		    }
+		    /* subtract the columns already solved (k > j) */
+		    for (k = j + 1; k <= *n; ++k) {
+			if (A(k,j) != 0.) {
+			    /* update every row of column j */
+			    for (i = 1; i <= *m; ++i) {
+				B(i,j) -= A(k,j) * B(i,k);
+/* L230: */
+			    }
+			}
+/* L240: */
+		    }
+		    if (nounit) {
+			temp = 1. / A(j,j);
+			/* divide column j by the diagonal element */
+			for (i = 1; i <= *m; ++i) {
+			    B(i,j) = temp * B(i,j);
+/* L250: */
+			}
+		    }
+/* L260: */
+		}
+	    }
+	} else {
+
+/*           Form  B := alpha*B*inv( A' ). */
+
+	    if (upper) {
+		for (k = *n; k >= 1; --k) {
+		    if (nounit) {
+			temp = 1. / A(k,k);
+			/* divide column k by the diagonal element */
+			for (i = 1; i <= *m; ++i) {
+			    B(i,k) = temp * B(i,k);
+/* L270: */
+			}
+		    }
+		    /* eliminate column k from the columns before it */
+		    for (j = 1; j <= k-1; ++j) {
+			if (A(j,k) != 0.) {
+			    temp = A(j,k);
+			    /* update every row of column j */
+			    for (i = 1; i <= *m; ++i) {
+				B(i,j) -= temp * B(i,k);
+/* L280: */
+			    }
+			}
+/* L290: */
+		    }
+		    if (*alpha != 1.) {
+			/* apply alpha to the finished column k */
+			for (i = 1; i <= *m; ++i) {
+			    B(i,k) = *alpha * B(i,k);
+/* L300: */
+			}
+		    }
+/* L310: */
+		}
+	    } else {
+		/* columns of B in ascending order */
+		for (k = 1; k <= *n; ++k) {
+		    if (nounit) {
+			temp = 1. / A(k,k);
+			/* divide column k by the diagonal element */
+			for (i = 1; i <= *m; ++i) {
+			    B(i,k) = temp * B(i,k);
+/* L320: */
+			}
+		    }
+		    /* eliminate column k from the columns after it */
+		    for (j = k + 1; j <= *n; ++j) {
+			if (A(j,k) != 0.) {
+			    temp = A(j,k);
+			    /* update every row of column j */
+			    for (i = 1; i <= *m; ++i) {
+				B(i,j) -= temp * B(i,k);
+/* L330: */
+			    }
+			}
+/* L340: */
+		    }
+		    if (*alpha != 1.) {
+			/* apply alpha to the finished column k */
+			for (i = 1; i <= *m; ++i) {
+			    B(i,k) = *alpha * B(i,k);
+/* L350: */
+			}
+		    }
+/* L360: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of DTRSM . */
+
+} /* dtrsm_ */
+
diff --git a/CBLAS/dtrsv.c b/CBLAS/dtrsv.c
new file mode 100644
index 0000000..7ca21a1
--- /dev/null
+++ b/CBLAS/dtrsv.c
@@ -0,0 +1,337 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int dtrsv_(char *uplo, char *trans, char *diag, integer *n, 
+	doublereal *a, integer *lda, doublereal *x, integer *incx)
+{
+    /* Local variables.  f2c emitted these as static; they are automatic
+       here so that concurrent calls do not share state. */
+    integer info;
+    doublereal temp;
+    integer i, j;
+    integer ix, jx, kx;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+    /* Decoded option characters.  Each option is matched on its first
+       character only and without regard to case, as documented below;
+       unrecognised characters are rejected by the checks further down. */
+    const logical upper = (*uplo == 'U' || *uplo == 'u');
+    const logical notrans = (*trans == 'N' || *trans == 'n');
+    const logical nounit = (*diag == 'N' || *diag == 'n');
+
+
+/*  Purpose
+    =======
+
+    DTRSV  solves one of the systems of equations
+
+       A*x = b,   or   A'*x = b,
+
+    where b and x are n element vectors and A is an n by n unit, or
+    non-unit, upper or lower triangular matrix.
+
+    No test for singularity or near-singularity is included in this
+    routine. Such tests must be performed before calling this routine.
+
+    Parameters
+    ==========
+
+    UPLO   - CHARACTER*1.
+             On entry, UPLO specifies whether the matrix is an upper or
+             lower triangular matrix as follows:
+
+                UPLO = 'U' or 'u'   A is an upper triangular matrix.
+
+                UPLO = 'L' or 'l'   A is a lower triangular matrix.
+
+             Unchanged on exit.
+
+    TRANS  - CHARACTER*1.
+             On entry, TRANS specifies the equations to be solved as
+             follows:
+
+                TRANS = 'N' or 'n'   A*x = b.
+
+                TRANS = 'T' or 't'   A'*x = b.
+
+                TRANS = 'C' or 'c'   A'*x = b.
+
+             Unchanged on exit.
+
+    DIAG   - CHARACTER*1.
+             On entry, DIAG specifies whether or not A is unit
+             triangular as follows:
+
+                DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+
+                DIAG = 'N' or 'n'   A is not assumed to be unit
+                                    triangular.
+
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the order of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+             Before entry with  UPLO = 'U' or 'u', the leading n by n
+             upper triangular part of the array A must contain the upper
+
+             triangular matrix and the strictly lower triangular part of
+
+             A is not referenced.
+             Before entry with UPLO = 'L' or 'l', the leading n by n
+             lower triangular part of the array A must contain the lower
+
+             triangular matrix and the strictly upper triangular part of
+
+             A is not referenced.
+             Note that when  DIAG = 'U' or 'u', the diagonal elements of
+
+             A are not referenced either, but are assumed to be unity.
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+
+             in the calling (sub) program. LDA must be at least
+             max( 1, n ).
+             Unchanged on exit.
+
+    X      - DOUBLE PRECISION array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the n
+             element right-hand side vector b. On exit, X is overwritten
+
+             with the solution vector x.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+
+
+       Test the input parameters.
+
+
+   Parameter adjustments
+       Function Body */
+#define X(I) x[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+    info = 0;
+    if (! upper && *uplo != 'L' && *uplo != 'l') {
+	info = 1;
+    } else if (! notrans && *trans != 'T' && *trans != 't' &&
+	       *trans != 'C' && *trans != 'c') {
+	info = 2;
+    } else if (! nounit && *diag != 'U' && *diag != 'u') {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*lda < max(1,*n)) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    }
+    if (info != 0) {
+	input_error_dist("DTRSV ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+/*     Set up the start point in X.  It is only read when the increment
+       is not unity, and is ( N - 1 )*INCX too small for descending loops. */
+
+    if (*incx <= 0) {
+	kx = 1 - (*n - 1) * *incx;
+    } else {
+	kx = 1;
+    }
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through A. */
+
+    if (notrans) {
+
+/*        Form  x := inv( A )*x. */
+
+	if (upper) {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    if (X(j) != 0.) {
+			if (nounit) {
+			    X(j) /= A(j,j);
+			}
+			temp = X(j);
+			for (i = j - 1; i >= 1; --i) {
+			    X(i) -= temp * A(i,j);
+/* L10: */
+			}
+		    }
+/* L20: */
+		}
+	    } else {
+		jx = kx + (*n - 1) * *incx;
+		for (j = *n; j >= 1; --j) {
+		    if (X(jx) != 0.) {
+			if (nounit) {
+			    X(jx) /= A(j,j);
+			}
+			temp = X(jx);
+			ix = jx;
+			for (i = j - 1; i >= 1; --i) {
+			    ix -= *incx;
+			    X(ix) -= temp * A(i,j);
+/* L30: */
+			}
+		    }
+		    jx -= *incx;
+/* L40: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		/* forward substitution: first element first */
+		for (j = 1; j <= *n; ++j) {
+		    if (X(j) != 0.) {
+			if (nounit) {
+			    X(j) /= A(j,j);
+			}
+			temp = X(j);
+			/* eliminate X(j) from the elements after it */
+			for (i = j + 1; i <= *n; ++i) {
+			    X(i) -= temp * A(i,j);
+/* L50: */
+			}
+		    }
+/* L60: */
+		}
+	    } else {
+		jx = kx;
+		/* forward substitution: first element first */
+		for (j = 1; j <= *n; ++j) {
+		    if (X(jx) != 0.) {
+			if (nounit) {
+			    X(jx) /= A(j,j);
+			}
+			temp = X(jx);
+			ix = jx;
+			/* eliminate X(jx) from the elements after it */
+			for (i = j + 1; i <= *n; ++i) {
+			    ix += *incx;
+			    X(ix) -= temp * A(i,j);
+/* L70: */
+			}
+		    }
+		    jx += *incx;
+/* L80: */
+		}
+	    }
+	}
+    } else {
+
+/*        Form  x := inv( A' )*x. */
+
+	if (upper) {
+	    if (*incx == 1) {
+		/* A' is lower triangular: first element first */
+		for (j = 1; j <= *n; ++j) {
+		    temp = X(j);
+		    /* subtract the elements already solved (i < j) */
+		    for (i = 1; i <= j-1; ++i) {
+			temp -= A(i,j) * X(i);
+/* L90: */
+		    }
+		    if (nounit) {
+			temp /= A(j,j);
+		    }
+		    X(j) = temp;
+/* L100: */
+		}
+	    } else {
+		jx = kx;
+		/* A' is lower triangular: first element first */
+		for (j = 1; j <= *n; ++j) {
+		    temp = X(jx);
+		    ix = kx;
+		    /* subtract the elements already solved (i < j) */
+		    for (i = 1; i <= j-1; ++i) {
+			temp -= A(i,j) * X(ix);
+			ix += *incx;
+/* L110: */
+		    }
+		    if (nounit) {
+			temp /= A(j,j);
+		    }
+		    X(jx) = temp;
+		    jx += *incx;
+/* L120: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    temp = X(j);
+		    /* subtract the elements already solved (i > j) */
+		    for (i = *n; i >= j+1; --i) {
+			temp -= A(i,j) * X(i);
+/* L130: */
+		    }
+		    if (nounit) {
+			temp /= A(j,j);
+		    }
+		    X(j) = temp;
+/* L140: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    temp = X(jx);
+		    ix = kx;
+		    /* subtract the elements already solved (i > j) */
+		    for (i = *n; i >= j+1; --i) {
+			temp -= A(i,j) * X(ix);
+			ix -= *incx;
+/* L150: */
+		    }
+		    if (nounit) {
+			temp /= A(j,j);
+		    }
+		    X(jx) = temp;
+		    jx -= *incx;
+/* L160: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of DTRSV . */
+
+} /* dtrsv_ */
+
diff --git a/CBLAS/dzasum.c b/CBLAS/dzasum.c
new file mode 100644
index 0000000..5605d1e
--- /dev/null
+++ b/CBLAS/dzasum.c
@@ -0,0 +1,68 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+doublereal dzasum_(integer *n, doublecomplex *zx, integer *incx)
+{
+
+
+    /* System generated locals */
+    integer i__1;
+    doublereal ret_val;
+
+    /* Local variables (static: storage is shared across calls) */
+    static integer i;
+    static doublereal stemp;
+    extern doublereal dcabs1_(doublecomplex *);
+    static integer ix;
+
+
+/*     takes the sum of the absolute values: returns the sum of
+       dcabs1_() over the n elements of zx visited with stride incx.
+       jack dongarra, 3/11/78.
+       modified 3/93 to return if incx .le. 0.
+       modified 12/3/93, array(1) declarations changed to array(*)
+       Returns 0 when n <= 0 or incx <= 0.  dcabs1_() is defined
+       elsewhere (presumably |re| + |im| -- confirm against dcabs1.c).
+   Parameter adjustments: ZX(I) gives 1-based access to zx.
+       Function Body */
+#define ZX(I) zx[(I)-1]
+
+
+    ret_val = 0.;
+    stemp = 0.;
+    if (*n <= 0 || *incx <= 0) {
+	return ret_val;
+    }
+    if (*incx == 1) {
+	goto L20;
+    }
+
+/*        code for increment not equal to 1 (ix walks zx by incx) */
+
+    ix = 1;
+    i__1 = *n;	/* f2c temporary; the loop tests *n directly */
+    for (i = 1; i <= *n; ++i) {
+	stemp += dcabs1_(&ZX(ix));
+	ix += *incx;
+/* L10: */
+    }
+    ret_val = stemp;
+    return ret_val;
+
+/*        code for increment equal to 1 */
+
+L20:
+    i__1 = *n;
+    for (i = 1; i <= *n; ++i) {
+	stemp += dcabs1_(&ZX(i));
+/* L30: */
+    }
+    ret_val = stemp;
+    return ret_val;
+} /* dzasum_ */
+
diff --git a/CBLAS/dznrm2.c b/CBLAS/dznrm2.c
new file mode 100644
index 0000000..d0318b7
--- /dev/null
+++ b/CBLAS/dznrm2.c
@@ -0,0 +1,96 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+doublereal dznrm2_(integer *n, doublecomplex *x, integer *incx)
+{
+
+
+    /* System generated locals */
+    integer i__1, i__2, i__3;
+    doublereal ret_val, d__1;
+
+    /* Builtin functions */
+    double d_imag(doublecomplex *), sqrt(doublereal);
+
+    /* Local variables (static: storage is shared across calls) */
+    static doublereal temp, norm, scale;
+    static integer ix;
+    static doublereal ssq;
+
+
+/*  DZNRM2 returns the euclidean norm of a vector via the function
+    name, so that
+
+       DZNRM2 := sqrt( conjg( x' )*x )
+
+    The result is accumulated as scale*sqrt(ssq); 0 is returned when
+    n < 1 or incx < 1.
+    -- This version written on 25-October-1982.
+       Modified on 14-October-1993 to inline the call to ZLASSQ.
+       Sven Hammarling, Nag Ltd.
+
+    d_imag() is declared here and defined elsewhere (presumably it
+    returns the imaginary part); abs() is the macro from f2c.h.
+   Parameter adjustments: X(I) gives 1-based access to x.
+       Function Body */
+#define X(I) x[(I)-1]
+
+
+    if (*n < 1 || *incx < 1) {
+	norm = 0.;
+    } else {
+	scale = 0.;
+	ssq = 1.;
+/*        The following loop is equivalent to this call to the LAPACK
+          auxiliary routine:
+          CALL ZLASSQ( N, X, INCX, SCALE, SSQ )
+          (incx >= 1 here, so the "*incx < 0" arm below is never taken). */
+
+	i__1 = (*n - 1) * *incx + 1;
+	i__2 = *incx;
+	for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) {
+	    i__3 = ix;
+	    if (X(ix).r != 0.) {
+		i__3 = ix;
+		temp = (d__1 = X(ix).r, abs(d__1));
+		if (scale < temp) {
+/* New largest magnitude: rescale ssq to the new scale, add 1 for this term */
+		    d__1 = scale / temp;
+		    ssq = ssq * (d__1 * d__1) + 1.;
+		    scale = temp;
+		} else {
+/* Computing 2nd power */
+		    d__1 = temp / scale;
+		    ssq += d__1 * d__1;
+		}
+	    }
+	    if (d_imag(&X(ix)) != 0.) {
+		temp = (d__1 = d_imag(&X(ix)), abs(d__1));
+		if (scale < temp) {
+/* Computing 2nd power */
+		    d__1 = scale / temp;
+		    ssq = ssq * (d__1 * d__1) + 1.;
+		    scale = temp;
+		} else {
+/* Computing 2nd power */
+		    d__1 = temp / scale;
+		    ssq += d__1 * d__1;
+		}
+	    }
+/* L10: */
+	}
+	norm = scale * sqrt(ssq);
+    }
+
+    ret_val = norm;
+    return ret_val;
+
+/*     End of DZNRM2. */
+
+} /* dznrm2_ */
+
diff --git a/CBLAS/f2c.h b/CBLAS/f2c.h
new file mode 100644
index 0000000..b3106e4
--- /dev/null
+++ b/CBLAS/f2c.h
@@ -0,0 +1,41 @@
+/* f2c.h  --  Standard Fortran to C header file */
+
+/**  barf  [ba:rf]  2.  "He suggested using FORTRAN, and everybody barfed."
+
+	- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
+
+#ifndef F2C_INCLUDE
+#define F2C_INCLUDE
+/* C typedefs for the Fortran types used by the f2c-translated sources. */
+typedef int integer;
+typedef int logical;
+
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+/* typedef long long longint; */ /* system-dependent */
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+/* NOTE: abs/min/max are macros and evaluate their arguments more than once. */
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (doublereal)abs(x)
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (doublereal)min(a,b)
+#define dmax(a,b) (doublereal)max(a,b)
+
+#define VOID void
+
+#endif
diff --git a/CBLAS/icamax.c b/CBLAS/icamax.c
new file mode 100644
index 0000000..9ff4786
--- /dev/null
+++ b/CBLAS/icamax.c
@@ -0,0 +1,72 @@
+#include "f2c.h"
+
+integer icamax_(integer *n, complex *cx, integer *incx)
+{
+    /* System generated locals */
+    integer ret_val, i__1, i__2;
+    real r__1, r__2;
+    /* Builtin functions */
+    double r_imag(complex *);
+    /* Local variables (static: storage is shared across calls) */
+    static real smax;
+    static integer i, ix;
+/*     finds the index of element having max. absolute value.
+       jack dongarra, linpack, 3/11/78.
+       modified 3/93 to return if incx .le. 0.
+       modified 12/3/93, array(1) declarations changed to array(*)
+       Returns a 1-based index; 0 if n < 1 or incx <= 0; ties keep the first.
+   Parameter adjustments: CX(I) gives 1-based access to cx.
+       Function Body */
+#define CX(I) cx[(I)-1]
+    ret_val = 0;
+    if (*n < 1 || *incx <= 0) {
+	return ret_val;
+    }
+    ret_val = 1;
+    if (*n == 1) {
+	return ret_val;
+    }
+    if (*incx == 1) {
+	goto L20;
+    }
+/*        code for increment not equal to 1 (magnitude is |re| + |r_imag()|) */
+    ix = 1;
+    smax = (r__1 = CX(1).r, dabs(r__1)) + (r__2 = r_imag(&CX(1)), dabs(r__2));
+    ix += *incx;
+    i__1 = *n;
+    for (i = 2; i <= *n; ++i) {
+	i__2 = ix;
+	if ((r__1 = CX(ix).r, dabs(r__1)) + (r__2 = r_imag(&CX(ix)), dabs(
+		r__2)) <= smax) {
+	    goto L5;
+	}
+	ret_val = i;
+	i__2 = ix;
+	smax = (r__1 = CX(ix).r, dabs(r__1)) + (r__2 = r_imag(&CX(ix)), 
+		dabs(r__2));
+L5:
+	ix += *incx;
+/* L10: */
+    }
+    return ret_val;
+/*        code for increment equal to 1 */
+L20:
+    smax = (r__1 = CX(1).r, dabs(r__1)) + (r__2 = r_imag(&CX(1)), dabs(r__2));
+    i__1 = *n;
+    for (i = 2; i <= *n; ++i) {
+	i__2 = i;
+	if ((r__1 = CX(i).r, dabs(r__1)) + (r__2 = r_imag(&CX(i)), dabs(
+		r__2)) <= smax) {
+	    goto L30;
+	}
+	ret_val = i;
+	i__2 = i;
+	smax = (r__1 = CX(i).r, dabs(r__1)) + (r__2 = r_imag(&CX(i)), dabs(
+		r__2));
+L30:
+	;
+    }
+    return ret_val;
+} /* icamax_ */
+
+
diff --git a/CBLAS/idamax.c b/CBLAS/idamax.c
new file mode 100644
index 0000000..00ebc23
--- /dev/null
+++ b/CBLAS/idamax.c
@@ -0,0 +1,80 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+integer idamax_(integer *n, doublereal *dx, integer *incx)
+{
+
+
+    /* System generated locals */
+    integer ret_val, i__1;
+    doublereal d__1;
+
+    /* Local variables (static: storage is shared across calls) */
+    static doublereal dmax__;
+    static integer i, ix;
+
+
+/*     finds the index of element having max. absolute value.
+       jack dongarra, linpack, 3/11/78.
+       modified 3/93 to return if incx .le. 0.
+       modified 12/3/93, array(1) declarations changed to array(*)
+
+       Returns a 1-based position (the i-th visited element, not ix);
+       0 if n < 1 or incx <= 0.  On ties the first maximum is kept.
+   Parameter adjustments: DX(I) gives 1-based access to dx.
+       Function Body */
+#define DX(I) dx[(I)-1]
+
+
+    ret_val = 0;
+    if (*n < 1 || *incx <= 0) {
+	return ret_val;
+    }
+    ret_val = 1;
+    if (*n == 1) {
+	return ret_val;
+    }
+    if (*incx == 1) {
+	goto L20;
+    }
+
+/*        code for increment not equal to 1 (ix walks dx by incx) */
+
+    ix = 1;
+    dmax__ = abs(DX(1));
+    ix += *incx;
+    i__1 = *n;
+    for (i = 2; i <= *n; ++i) {
+	if ((d__1 = DX(ix), abs(d__1)) <= dmax__) {
+	    goto L5;
+	}
+	ret_val = i;
+	dmax__ = (d__1 = DX(ix), abs(d__1));
+L5:
+	ix += *incx;
+/* L10: */
+    }
+    return ret_val;
+
+/*        code for increment equal to 1 */
+
+L20:
+    dmax__ = abs(DX(1));
+    i__1 = *n;
+    for (i = 2; i <= *n; ++i) {
+	if ((d__1 = DX(i), abs(d__1)) <= dmax__) {
+	    goto L30;
+	}
+	ret_val = i;
+	dmax__ = (d__1 = DX(i), abs(d__1));
+L30:
+	;
+    }
+    return ret_val;
+} /* idamax_ */
+
diff --git a/CBLAS/input_error_dist.c b/CBLAS/input_error_dist.c
new file mode 100644
index 0000000..6825fa3
--- /dev/null
+++ b/CBLAS/input_error_dist.c
@@ -0,0 +1,39 @@
+#include <stdio.h>
+
+/*! @file input_error_dist.c
+ * \brief Error handler for input parameters.
+ *
+ * <pre>
+ * -- SuperLU routine (version 4.4) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 20, 2012
+ * </pre>
+ */
+
+/*! \brief Report an invalid input parameter of a calling routine.
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * INPUT_ERROR is called if an input parameter has an invalid value.
+ * A message is printed to stdout and 0 is returned to the caller.
+ *
+ * Arguments
+ * =========
+ *
+ * srname  (input) char *
+ *         The name of the routine which called INPUT_ERROR (printed with %6s).
+ *
+ * info    (input) int *
+ *         Points to the position of the invalid parameter in the
+ *         parameter list of the calling routine.
+ *
+ * </pre>
+ */
+int input_error_dist(char *srname, int *info)
+{
+    printf("** On entry to %6s, parameter number %2d had an illegal value\n",
+		srname, *info);
+    return 0;
+}
diff --git a/CBLAS/isamax.c b/CBLAS/isamax.c
new file mode 100644
index 0000000..b1ad475
--- /dev/null
+++ b/CBLAS/isamax.c
@@ -0,0 +1,80 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+integer isamax_(integer *n, real *sx, integer *incx)
+{
+
+
+    /* System generated locals */
+    integer ret_val, i__1;
+    real r__1;
+
+    /* Local variables (static: storage is shared across calls) */
+    static real smax;
+    static integer i, ix;
+
+
+/*     finds the index of element having max. absolute value.
+       jack dongarra, linpack, 3/11/78.
+       modified 3/93 to return if incx .le. 0.
+       modified 12/3/93, array(1) declarations changed to array(*)
+
+       Returns a 1-based position (the i-th visited element, not ix);
+       0 if n < 1 or incx <= 0.  On ties the first maximum is kept.
+   Parameter adjustments: SX(I) gives 1-based access to sx.
+       Function Body */
+#define SX(I) sx[(I)-1]
+
+
+    ret_val = 0;
+    if (*n < 1 || *incx <= 0) {
+	return ret_val;
+    }
+    ret_val = 1;
+    if (*n == 1) {
+	return ret_val;
+    }
+    if (*incx == 1) {
+	goto L20;
+    }
+
+/*        code for increment not equal to 1 (ix walks sx by incx) */
+
+    ix = 1;
+    smax = dabs(SX(1));
+    ix += *incx;
+    i__1 = *n;
+    for (i = 2; i <= *n; ++i) {
+	if ((r__1 = SX(ix), dabs(r__1)) <= smax) {
+	    goto L5;
+	}
+	ret_val = i;
+	smax = (r__1 = SX(ix), dabs(r__1));
+L5:
+	ix += *incx;
+/* L10: */
+    }
+    return ret_val;
+
+/*        code for increment equal to 1 */
+
+L20:
+    smax = dabs(SX(1));
+    i__1 = *n;
+    for (i = 2; i <= *n; ++i) {
+	if ((r__1 = SX(i), dabs(r__1)) <= smax) {
+	    goto L30;
+	}
+	ret_val = i;
+	smax = (r__1 = SX(i), dabs(r__1));
+L30:
+	;
+    }
+    return ret_val;
+} /* isamax_ */
+
diff --git a/CBLAS/izamax.c b/CBLAS/izamax.c
new file mode 100644
index 0000000..44959d0
--- /dev/null
+++ b/CBLAS/izamax.c
@@ -0,0 +1,81 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+integer izamax_(integer *n, doublecomplex *zx, integer *incx)
+{
+
+
+    /* System generated locals */
+    integer ret_val, i__1;
+
+    /* Local variables (static: storage is shared across calls) */
+    static doublereal smax;
+    static integer i;
+    extern doublereal dcabs1_(doublecomplex *);
+    static integer ix;
+
+
+/*     finds the index of element having max. absolute value.
+       jack dongarra, 1/15/85.
+       modified 3/93 to return if incx .le. 0.
+       modified 12/3/93, array(1) declarations changed to array(*)
+
+       Magnitudes come from dcabs1_() (defined elsewhere).  Returns a
+       1-based position; 0 if n < 1 or incx <= 0; ties keep the first.
+   Parameter adjustments: ZX(I) gives 1-based access to zx.
+       Function Body */
+#define ZX(I) zx[(I)-1]
+
+
+    ret_val = 0;
+    if (*n < 1 || *incx <= 0) {
+	return ret_val;
+    }
+    ret_val = 1;
+    if (*n == 1) {
+	return ret_val;
+    }
+    if (*incx == 1) {
+	goto L20;
+    }
+
+/*        code for increment not equal to 1 (ix walks zx by incx) */
+
+    ix = 1;
+    smax = dcabs1_(&ZX(1));
+    ix += *incx;
+    i__1 = *n;
+    for (i = 2; i <= *n; ++i) {
+	if (dcabs1_(&ZX(ix)) <= smax) {
+	    goto L5;
+	}
+	ret_val = i;
+	smax = dcabs1_(&ZX(ix));
+L5:
+	ix += *incx;
+/* L10: */
+    }
+    return ret_val;
+
+/*        code for increment equal to 1 */
+
+L20:
+    smax = dcabs1_(&ZX(1));
+    i__1 = *n;
+    for (i = 2; i <= *n; ++i) {
+	if (dcabs1_(&ZX(i)) <= smax) {
+	    goto L30;
+	}
+	ret_val = i;
+	smax = dcabs1_(&ZX(i));
+L30:
+	;
+    }
+    return ret_val;
+} /* izamax_ */
+
diff --git a/CBLAS/sasum.c b/CBLAS/sasum.c
new file mode 100644
index 0000000..29eafe4
--- /dev/null
+++ b/CBLAS/sasum.c
@@ -0,0 +1,89 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+real sasum_(integer *n, real *sx, integer *incx)
+{
+
+
+    /* System generated locals */
+    integer i__1, i__2;
+    real ret_val, r__1, r__2, r__3, r__4, r__5, r__6;
+
+    /* Local variables (static: storage is shared across calls) */
+    static integer i, m, nincx;
+    static real stemp;
+    static integer mp1;
+
+
+/*     takes the sum of the absolute values.
+       uses unrolled loops for increment equal to one.
+       jack dongarra, linpack, 3/11/78.
+       modified 3/93 to return if incx .le. 0.
+       modified 12/3/93, array(1) declarations changed to array(*)
+
+       Returns 0 when n <= 0 or incx <= 0.  dabs() is the f2c.h macro.
+       incx > 0 past the guard, so the "*incx < 0" loop arm is never taken.
+   Parameter adjustments: SX(I) gives 1-based access to sx.
+       Function Body */
+#define SX(I) sx[(I)-1]
+
+
+    ret_val = 0.f;
+    stemp = 0.f;
+    if (*n <= 0 || *incx <= 0) {
+	return ret_val;
+    }
+    if (*incx == 1) {
+	goto L20;
+    }
+
+/*        code for increment not equal to 1 (i itself is the index into sx) */
+
+    nincx = *n * *incx;
+    i__1 = nincx;
+    i__2 = *incx;
+    for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) {
+	stemp += (r__1 = SX(i), dabs(r__1));
+/* L10: */
+    }
+    ret_val = stemp;
+    return ret_val;
+
+/*        code for increment equal to 1
+
+          clean-up loop: sum the first n % 6 elements one at a time so
+          the unrolled loop below can step by 6 */
+
+L20:
+    m = *n % 6;
+    if (m == 0) {
+	goto L40;
+    }
+    i__2 = m;
+    for (i = 1; i <= m; ++i) {
+	stemp += (r__1 = SX(i), dabs(r__1));
+/* L30: */
+    }
+    if (*n < 6) {
+	goto L60;
+    }
+L40:
+    mp1 = m + 1;
+    i__2 = *n;
+    for (i = mp1; i <= *n; i += 6) {
+	stemp = stemp + (r__1 = SX(i), dabs(r__1)) + (r__2 = SX(i + 1), dabs(
+		r__2)) + (r__3 = SX(i + 2), dabs(r__3)) + (r__4 = SX(i + 3), 
+		dabs(r__4)) + (r__5 = SX(i + 4), dabs(r__5)) + (r__6 = SX(i + 
+		5), dabs(r__6));
+/* L50: */
+    }
+L60:
+    ret_val = stemp;
+    return ret_val;
+} /* sasum_ */
+
diff --git a/CBLAS/saxpy.c b/CBLAS/saxpy.c
new file mode 100644
index 0000000..06c26f3
--- /dev/null
+++ b/CBLAS/saxpy.c
@@ -0,0 +1,94 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int saxpy_(integer *n, real *sa, real *sx, integer *incx, 
+	real *sy, integer *incy)
+{
+
+
+    /* System generated locals */
+    integer i__1;
+
+    /* Local variables (static: storage is shared across calls) */
+    static integer i, m, ix, iy, mp1;
+
+
+/*     constant times a vector plus a vector:  sy := sa*sx + sy.
+       uses unrolled loop for increments equal to one.
+       jack dongarra, linpack, 3/11/78.
+       modified 12/3/93, array(1) declarations changed to array(*)
+
+       Always returns 0.  Nothing is written when n <= 0 or sa == 0.
+       A negative increment starts at index (1-n)*inc + 1 and walks down.
+   Parameter adjustments: SX(I)/SY(I) give 1-based access to sx/sy.
+       Function Body */
+#define SY(I) sy[(I)-1]
+#define SX(I) sx[(I)-1]
+
+
+    if (*n <= 0) {
+	return 0;
+    }
+    if (*sa == 0.f) {
+	return 0;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*        code for unequal increments or equal increments
+            not equal to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    i__1 = *n;
+    for (i = 1; i <= *n; ++i) {
+	SY(iy) += *sa * SX(ix);
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    return 0;
+
+/*        code for both increments equal to 1
+
+          clean-up loop: handle the first n % 4 elements one at a time
+          so the unrolled loop below can step by 4 */
+
+L20:
+    m = *n % 4;
+    if (m == 0) {
+	goto L40;
+    }
+    i__1 = m;
+    for (i = 1; i <= m; ++i) {
+	SY(i) += *sa * SX(i);
+/* L30: */
+    }
+    if (*n < 4) {
+	return 0;
+    }
+L40:
+    mp1 = m + 1;
+    i__1 = *n;
+    for (i = mp1; i <= *n; i += 4) {
+	SY(i) += *sa * SX(i);
+	SY(i + 1) += *sa * SX(i + 1);
+	SY(i + 2) += *sa * SX(i + 2);
+	SY(i + 3) += *sa * SX(i + 3);
+/* L50: */
+    }
+    return 0;
+} /* saxpy_ */
+
diff --git a/CBLAS/scasum.c b/CBLAS/scasum.c
new file mode 100644
index 0000000..5c3d351
--- /dev/null
+++ b/CBLAS/scasum.c
@@ -0,0 +1,74 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+real scasum_(integer *n, complex *cx, integer *incx)
+{
+
+
+    /* System generated locals */
+    integer i__1, i__2, i__3;
+    real ret_val, r__1, r__2;
+
+    /* Builtin functions */
+    double r_imag(complex *);
+
+    /* Local variables (static: storage is shared across calls) */
+    static integer i, nincx;
+    static real stemp;
+
+
+/*     takes the sum of the absolute values of a complex vector and
+       returns a single precision result.
+       jack dongarra, linpack, 3/11/78.
+       modified 3/93 to return if incx .le. 0.
+       modified 12/3/93, array(1) declarations changed to array(*)
+
+       Each element contributes |re| + |r_imag()|; r_imag() is defined
+       elsewhere.  Returns 0 when n <= 0 or incx <= 0.
+   Parameter adjustments: CX(I) gives 1-based access to cx.
+       Function Body */
+#define CX(I) cx[(I)-1]
+
+
+    ret_val = 0.f;
+    stemp = 0.f;
+    if (*n <= 0 || *incx <= 0) {
+	return ret_val;
+    }
+    if (*incx == 1) {
+	goto L20;
+    }
+
+/*        code for increment not equal to 1 (incx > 0 here, so the
+          "*incx < 0" arm of the loop test is never taken) */
+    nincx = *n * *incx;
+    i__1 = nincx;
+    i__2 = *incx;
+    for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) {
+	i__3 = i;
+	stemp = stemp + (r__1 = CX(i).r, dabs(r__1)) + (r__2 = r_imag(&CX(
+		i)), dabs(r__2));
+/* L10: */
+    }
+    ret_val = stemp;
+    return ret_val;
+
+/*        code for increment equal to 1 */
+
+L20:
+    i__2 = *n;
+    for (i = 1; i <= *n; ++i) {
+	i__1 = i;
+	stemp = stemp + (r__1 = CX(i).r, dabs(r__1)) + (r__2 = r_imag(&CX(
+		i)), dabs(r__2));
+/* L30: */
+    }
+    ret_val = stemp;
+    return ret_val;
+} /* scasum_ */
+
diff --git a/CBLAS/scnrm2.c b/CBLAS/scnrm2.c
new file mode 100644
index 0000000..8325b3d
--- /dev/null
+++ b/CBLAS/scnrm2.c
@@ -0,0 +1,96 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+real scnrm2_(integer *n, complex *x, integer *incx)
+{
+
+
+    /* System generated locals */
+    integer i__1, i__2, i__3;
+    real ret_val, r__1;
+
+    /* Builtin functions */
+    double r_imag(complex *), sqrt(doublereal);
+
+    /* Local variables (static: storage is shared across calls) */
+    static real temp, norm, scale;
+    static integer ix;
+    static real ssq;
+
+
+/*  SCNRM2 returns the euclidean norm of a vector via the function
+    name, so that
+
+       SCNRM2 := sqrt( conjg( x' )*x )
+
+    The result is accumulated as scale*sqrt(ssq); 0 is returned when
+    n < 1 or incx < 1.
+    -- This version written on 25-October-1982.
+       Modified on 14-October-1993 to inline the call to CLASSQ.
+       Sven Hammarling, Nag Ltd.
+
+    r_imag() is declared here and defined elsewhere (presumably it
+    returns the imaginary part); dabs() is the macro from f2c.h.
+   Parameter adjustments: X(I) gives 1-based access to x.
+       Function Body */
+#define X(I) x[(I)-1]
+
+
+    if (*n < 1 || *incx < 1) {
+	norm = 0.f;
+    } else {
+	scale = 0.f;
+	ssq = 1.f;
+/*        The following loop is equivalent to this call to the LAPACK
+          auxiliary routine:
+          CALL CLASSQ( N, X, INCX, SCALE, SSQ )
+          (incx >= 1 here, so the "*incx < 0" arm below is never taken). */
+
+	i__1 = (*n - 1) * *incx + 1;
+	i__2 = *incx;
+	for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) {
+	    i__3 = ix;
+	    if (X(ix).r != 0.f) {
+		i__3 = ix;
+		temp = (r__1 = X(ix).r, dabs(r__1));
+		if (scale < temp) {
+/* New largest magnitude: rescale ssq to the new scale, add 1 for this term */
+		    r__1 = scale / temp;
+		    ssq = ssq * (r__1 * r__1) + 1.f;
+		    scale = temp;
+		} else {
+/* Computing 2nd power */
+		    r__1 = temp / scale;
+		    ssq += r__1 * r__1;
+		}
+	    }
+	    if (r_imag(&X(ix)) != 0.f) {
+		temp = (r__1 = r_imag(&X(ix)), dabs(r__1));
+		if (scale < temp) {
+/* Computing 2nd power */
+		    r__1 = scale / temp;
+		    ssq = ssq * (r__1 * r__1) + 1.f;
+		    scale = temp;
+		} else {
+/* Computing 2nd power */
+		    r__1 = temp / scale;
+		    ssq += r__1 * r__1;
+		}
+	    }
+/* L10: */
+	}
+	norm = scale * sqrt(ssq);
+    }
+
+    ret_val = norm;
+    return ret_val;
+
+/*     End of SCNRM2. */
+
+} /* scnrm2_ */
+
diff --git a/CBLAS/scopy.c b/CBLAS/scopy.c
new file mode 100644
index 0000000..4e7a238
--- /dev/null
+++ b/CBLAS/scopy.c
@@ -0,0 +1,94 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int scopy_(integer *n, real *sx, integer *incx, real *sy, 
+	integer *incy)
+{
+
+
+    /* System generated locals */
+    integer i__1;
+
+    /* Local variables (static: storage is shared across calls) */
+    static integer i, m, ix, iy, mp1;
+
+
+/*     copies a vector, x, to a vector, y.
+       uses unrolled loops for increments equal to 1.
+       jack dongarra, linpack, 3/11/78.
+       modified 12/3/93, array(1) declarations changed to array(*)
+
+       Always returns 0.  Nothing is copied when n <= 0.
+       A negative increment starts at index (1-n)*inc + 1 and walks down.
+   Parameter adjustments: SX(I)/SY(I) give 1-based access to sx/sy.
+       Function Body */
+#define SY(I) sy[(I)-1]
+#define SX(I) sx[(I)-1]
+
+
+    if (*n <= 0) {
+	return 0;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*        code for unequal increments or equal increments
+            not equal to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    i__1 = *n;
+    for (i = 1; i <= *n; ++i) {
+	SY(iy) = SX(ix);
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    return 0;
+
+/*        code for both increments equal to 1
+
+          clean-up loop: copy the first n % 7 elements one at a time
+          so the unrolled loop below can step by 7 */
+
+L20:
+    m = *n % 7;
+    if (m == 0) {
+	goto L40;
+    }
+    i__1 = m;
+    for (i = 1; i <= m; ++i) {
+	SY(i) = SX(i);
+/* L30: */
+    }
+    if (*n < 7) {
+	return 0;
+    }
+L40:
+    mp1 = m + 1;
+    i__1 = *n;
+    for (i = mp1; i <= *n; i += 7) {
+	SY(i) = SX(i);
+	SY(i + 1) = SX(i + 1);
+	SY(i + 2) = SX(i + 2);
+	SY(i + 3) = SX(i + 3);
+	SY(i + 4) = SX(i + 4);
+	SY(i + 5) = SX(i + 5);
+	SY(i + 6) = SX(i + 6);
+/* L50: */
+    }
+    return 0;
+} /* scopy_ */
+
diff --git a/CBLAS/sdot.c b/CBLAS/sdot.c
new file mode 100644
index 0000000..b553937
--- /dev/null
+++ b/CBLAS/sdot.c
@@ -0,0 +1,96 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+real sdot_(integer *n, real *sx, integer *incx, real *sy, integer *incy)
+{
+
+
+    /* System generated locals */
+    integer i__1;
+    real ret_val;
+
+    /* Local variables (static: storage is shared across calls) */
+    static integer i, m;
+    static real stemp;
+    static integer ix, iy, mp1;
+
+
+/*     forms the dot product of two vectors.
+       uses unrolled loops for increments equal to one.
+       jack dongarra, linpack, 3/11/78.
+       modified 12/3/93, array(1) declarations changed to array(*)
+
+       Returns 0 when n <= 0.  The sum is accumulated in a real (stemp).
+       A negative increment starts at index (1-n)*inc + 1 and walks down.
+   Parameter adjustments: SX(I)/SY(I) give 1-based access to sx/sy.
+       Function Body */
+#define SY(I) sy[(I)-1]
+#define SX(I) sx[(I)-1]
+
+
+    stemp = 0.f;
+    ret_val = 0.f;
+    if (*n <= 0) {
+	return ret_val;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*        code for unequal increments or equal increments
+            not equal to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    i__1 = *n;
+    for (i = 1; i <= *n; ++i) {
+	stemp += SX(ix) * SY(iy);
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    ret_val = stemp;
+    return ret_val;
+
+/*        code for both increments equal to 1
+
+          clean-up loop: accumulate the first n % 5 products one at a
+          time so the unrolled loop below can step by 5 */
+
+L20:
+    m = *n % 5;
+    if (m == 0) {
+	goto L40;
+    }
+    i__1 = m;
+    for (i = 1; i <= m; ++i) {
+	stemp += SX(i) * SY(i);
+/* L30: */
+    }
+    if (*n < 5) {
+	goto L60;
+    }
+L40:
+    mp1 = m + 1;
+    i__1 = *n;
+    for (i = mp1; i <= *n; i += 5) {
+	stemp = stemp + SX(i) * SY(i) + SX(i + 1) * SY(i + 1) + SX(i + 2) * 
+		SY(i + 2) + SX(i + 3) * SY(i + 3) + SX(i + 4) * SY(i + 4);
+/* L50: */
+    }
+L60:
+    ret_val = stemp;
+    return ret_val;
+} /* sdot_ */
+
diff --git a/CBLAS/sgemv.c b/CBLAS/sgemv.c
new file mode 100644
index 0000000..27238a6
--- /dev/null
+++ b/CBLAS/sgemv.c
@@ -0,0 +1,298 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int sgemv_(char *trans, integer *m, integer *n, real *alpha, 
+	real *a, integer *lda, real *x, integer *incx, real *beta, real *y, 
+	integer *incy)
+{
+
+
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+
+    /* Local variables (static: storage is shared across calls) */
+    static integer info;
+    static real temp;
+    static integer lenx, leny, i, j;
+    static integer ix, iy, jx, jy, kx, ky;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+
+/*  Purpose
+    =======
+
+    SGEMV  performs one of the matrix-vector operations
+
+       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,
+
+    where alpha and beta are scalars, x and y are vectors and A is an
+    m by n matrix.
+
+    Parameters
+    ==========
+
+    TRANS  - CHARACTER*1.
+             On entry, TRANS specifies the operation to be performed as
+             follows:
+
+                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.
+
+                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.
+
+                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.
+
+             Unchanged on exit.
+
+    M      - INTEGER.
+             On entry, M specifies the number of rows of the matrix A.
+             M must be at least zero.
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the number of columns of the matrix A.
+
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - REAL            .
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    A      - REAL             array of DIMENSION ( LDA, n ).
+             Before entry, the leading m by n part of the array A must
+             contain the matrix of coefficients.
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+
+             in the calling (sub) program. LDA must be at least
+             max( 1, m ).
+             Unchanged on exit.
+
+    X      - REAL             array of DIMENSION at least
+             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+             and at least
+             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+             Before entry, the incremented array X must contain the
+             vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    BETA   - REAL            .
+             On entry, BETA specifies the scalar beta. When BETA is
+             supplied as zero then Y need not be set on input.
+             Unchanged on exit.
+
+    Y      - REAL             array of DIMENSION at least
+             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+             and at least
+             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+             Before entry with BETA non-zero, the incremented array Y
+             must contain the vector y. On exit, Y is overwritten by the
+
+             updated vector y.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+    Returns 0 in all cases.  An invalid argument is reported through
+    input_error_dist() and the routine returns without modifying y.
+       Test the input parameters.  Only the first character of TRANS is
+       examined, and it is accepted in either upper or lower case.
+
+   Parameter adjustments: X(I), Y(I) are 1-based; A(I,J) is column-major.
+       Function Body */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+    info = 0;
+    if (*trans != 'N' && *trans != 'n' && *trans != 'T' && *trans != 't' &&
+	*trans != 'C' && *trans != 'c') {
+	info = 1;
+    } else if (*m < 0) {
+	info = 2;
+    } else if (*n < 0) {
+	info = 3;
+    } else if (*lda < max(1,*m)) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    } else if (*incy == 0) {
+	info = 11;
+    }
+    if (info != 0) {
+	input_error_dist("SGEMV ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 || (*alpha == 0.f && *beta == 1.f)) {
+	return 0;
+    }
+
+/*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set
+
+       up the start points in  X  and  Y. */
+
+    if (*trans == 'N' || *trans == 'n') {
+	lenx = *n;
+	leny = *m;
+    } else {
+	lenx = *m;
+	leny = *n;
+    }
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (lenx - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (leny - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through A.
+
+       First form  y := beta*y. */
+
+    if (*beta != 1.f) {
+	if (*incy == 1) {
+	    if (*beta == 0.f) {
+		i__1 = leny;
+		for (i = 1; i <= leny; ++i) {
+		    Y(i) = 0.f;
+/* L10: */
+		}
+	    } else {
+		i__1 = leny;
+		for (i = 1; i <= leny; ++i) {
+		    Y(i) = *beta * Y(i);
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (*beta == 0.f) {
+		i__1 = leny;
+		for (i = 1; i <= leny; ++i) {
+		    Y(iy) = 0.f;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		i__1 = leny;
+		for (i = 1; i <= leny; ++i) {
+		    Y(iy) = *beta * Y(iy);
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (*alpha == 0.f) {
+	return 0;
+    }
+    if (*trans == 'N' || *trans == 'n') {
+
+/*        Form  y := alpha*A*x + y. */
+
+	jx = kx;
+	if (*incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		if (X(jx) != 0.f) {
+		    temp = *alpha * X(jx);
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			Y(i) += temp * A(i,j);
+/* L50: */
+		    }
+		}
+		jx += *incx;
+/* L60: */
+	    }
+	} else {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		if (X(jx) != 0.f) {
+		    temp = *alpha * X(jx);
+		    iy = ky;
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			Y(iy) += temp * A(i,j);
+			iy += *incy;
+/* L70: */
+		    }
+		}
+		jx += *incx;
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y := alpha*A'*x + y. */
+
+	jy = ky;
+	if (*incx == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		temp = 0.f;
+		i__2 = *m;
+		for (i = 1; i <= *m; ++i) {
+		    temp += A(i,j) * X(i);
+/* L90: */
+		}
+		Y(jy) += *alpha * temp;
+		jy += *incy;
+/* L100: */
+	    }
+	} else {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		temp = 0.f;
+		ix = kx;
+		i__2 = *m;
+		for (i = 1; i <= *m; ++i) {
+		    temp += A(i,j) * X(ix);
+		    ix += *incx;
+/* L110: */
+		}
+		Y(jy) += *alpha * temp;
+		jy += *incy;
+/* L120: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of SGEMV . */
+
+} /* sgemv_ */
+
diff --git a/CBLAS/sger.c b/CBLAS/sger.c
new file mode 100644
index 0000000..30c1a4e
--- /dev/null
+++ b/CBLAS/sger.c
@@ -0,0 +1,181 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int sger_(integer *m, integer *n, real *alpha, real *x, 
+	integer *incx, real *y, integer *incy, real *a, integer *lda)
+{
+    /* Locals are automatic (not static) so concurrent calls share no state. */
+
+    /* System generated locals */
+    integer i__1, i__2;
+
+    /* Local variables */
+    integer info;
+    real temp;
+    integer i, j, ix, jy, kx;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+
+/*  Purpose
+    =======
+
+    SGER   performs the rank 1 operation
+
+       A := alpha*x*y' + A,
+
+    where alpha is a scalar, x is an m element vector, y is an n element
+
+    vector and A is an m by n matrix.
+
+    Parameters
+    ==========
+
+    M      - INTEGER.
+             On entry, M specifies the number of rows of the matrix A.
+             M must be at least zero.
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the number of columns of the matrix A.
+
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - REAL            .
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    X      - REAL             array of dimension at least
+             ( 1 + ( m - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the m
+             element vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    Y      - REAL             array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCY ) ).
+             Before entry, the incremented array Y must contain the n
+             element vector y.
+             Unchanged on exit.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+    A      - REAL             array of DIMENSION ( LDA, n ).
+             Before entry, the leading m by n part of the array A must
+             contain the matrix of coefficients. On exit, A is
+             overwritten by the updated matrix.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+
+             in the calling (sub) program. LDA must be at least
+             max( 1, m ).
+             Unchanged on exit.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+
+
+       Test the input parameters.
+
+
+   Parameter adjustments
+       Function Body */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+    info = 0;
+    if (*m < 0) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*incx == 0) {
+	info = 5;
+    } else if (*incy == 0) {
+	info = 7;
+    } else if (*lda < max(1,*m)) {
+	info = 9;
+    }
+    if (info != 0) {
+	input_error_dist("SGER  ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 || *alpha == 0.f) {
+	return 0;
+    }
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through A. */
+
+    if (*incy > 0) {
+	jy = 1;
+    } else {
+	jy = 1 - (*n - 1) * *incy;
+    }
+    if (*incx == 1) {
+	i__1 = *n;
+	for (j = 1; j <= *n; ++j) {
+	    if (Y(jy) != 0.f) {
+		temp = *alpha * Y(jy);
+		i__2 = *m;
+		for (i = 1; i <= *m; ++i) {
+		    A(i,j) += X(i) * temp;
+/* L10: */
+		}
+	    }
+	    jy += *incy;
+/* L20: */
+	}
+    } else {
+	if (*incx > 0) {
+	    kx = 1;
+	} else {
+	    kx = 1 - (*m - 1) * *incx;
+	}
+	i__1 = *n;
+	for (j = 1; j <= *n; ++j) {
+	    if (Y(jy) != 0.f) {
+		temp = *alpha * Y(jy);
+		ix = kx;
+		i__2 = *m;
+		for (i = 1; i <= *m; ++i) {
+		    A(i,j) += X(ix) * temp;
+		    ix += *incx;
+/* L30: */
+		}
+	    }
+	    jy += *incy;
+/* L40: */
+	}
+    }
+
+    return 0;
+
+/*     End of SGER  . */
+
+} /* sger_ */
+
diff --git a/CBLAS/snrm2.c b/CBLAS/snrm2.c
new file mode 100644
index 0000000..99b4003
--- /dev/null
+++ b/CBLAS/snrm2.c
@@ -0,0 +1,83 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+real snrm2_(integer *n, real *x, integer *incx)
+{
+    /* Locals are automatic (not static) so concurrent calls share no state. */
+
+    /* System generated locals */
+    integer i__1, i__2;
+    real ret_val, r__1;
+
+    /* Builtin functions */
+    double sqrt(doublereal);
+
+    /* Local variables */
+    real norm, scale, absxi;
+    integer ix;
+    real ssq;
+
+
+/*  SNRM2 returns the euclidean norm of a vector via the function
+    name, so that
+
+       SNRM2 := sqrt( x'*x )
+
+    Returns 0 when N < 1 or INCX < 1.
+
+    -- This version written on 25-October-1982.
+       Modified on 14-October-1993 to inline the call to SLASSQ.
+       Sven Hammarling, Nag Ltd.
+
+
+
+   Parameter adjustments
+       Function Body */
+#define X(I) x[(I)-1]
+
+
+    if (*n < 1 || *incx < 1) {
+	norm = 0.f;
+    } else if (*n == 1) {
+	norm = dabs(X(1));
+    } else {
+	scale = 0.f;
+	ssq = 1.f;
+/*        The following loop is equivalent to this call to the LAPACK
+
+          auxiliary routine:
+          CALL SLASSQ( N, X, INCX, SCALE, SSQ ) */
+
+	i__1 = (*n - 1) * *incx + 1;
+	i__2 = *incx;
+	for (ix = 1; ix <= i__1; ix += i__2) { /* incx >= 1 is guaranteed here */
+	    if (X(ix) != 0.f) {
+		absxi = (r__1 = X(ix), dabs(r__1));
+		if (scale < absxi) {
+/* Computing 2nd power */
+		    r__1 = scale / absxi;
+		    ssq = ssq * (r__1 * r__1) + 1.f;
+		    scale = absxi;
+		} else {
+/* Computing 2nd power */
+		    r__1 = absxi / scale;
+		    ssq += r__1 * r__1;
+		}
+	    }
+/* L10: */
+	}
+	norm = scale * sqrt(ssq);
+    }
+
+    ret_val = norm;
+    return ret_val;
+
+/*     End of SNRM2. */
+
+} /* snrm2_ */
+
diff --git a/CBLAS/srot.c b/CBLAS/srot.c
new file mode 100644
index 0000000..bdbb234
--- /dev/null
+++ b/CBLAS/srot.c
@@ -0,0 +1,76 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int srot_(integer *n, real *sx, integer *incx, real *sy, 
+	integer *incy, real *c, real *s)
+{
+    /* Locals are automatic (not static) so concurrent calls share no state. */
+
+    /* System generated locals */
+    integer i__1;
+
+    /* Local variables */
+    integer i;
+    real stemp;
+    integer ix, iy;
+
+
+/*     applies a plane rotation:
+         sx(i) := c*sx(i) + s*sy(i),  sy(i) := c*sy(i) - s*sx(i).
+       jack dongarra, linpack, 3/11/78.
+       modified 12/3/93, array(1) declarations changed to array(*)
+
+
+   Parameter adjustments
+       Function Body */
+#define SY(I) sy[(I)-1]
+#define SX(I) sx[(I)-1]
+
+
+    if (*n <= 0) {
+	return 0;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*       code for unequal increments or equal increments not equal
+           to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    i__1 = *n;
+    for (i = 1; i <= *n; ++i) {
+	stemp = *c * SX(ix) + *s * SY(iy);
+	SY(iy) = *c * SY(iy) - *s * SX(ix);
+	SX(ix) = stemp;
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    return 0;
+
+/*       code for both increments equal to 1 */
+
+L20:
+    i__1 = *n;
+    for (i = 1; i <= *n; ++i) {
+	stemp = *c * SX(i) + *s * SY(i);
+	SY(i) = *c * SY(i) - *s * SX(i);
+	SX(i) = stemp;
+/* L30: */
+    }
+    return 0;
+} /* srot_ */
+
diff --git a/CBLAS/sscal.c b/CBLAS/sscal.c
new file mode 100644
index 0000000..a21ad4e
--- /dev/null
+++ b/CBLAS/sscal.c
@@ -0,0 +1,82 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int sscal_(integer *n, real *sa, real *sx, integer *incx)
+{
+    /* Locals are automatic (not static) so concurrent calls share no state. */
+
+    /* System generated locals */
+    integer i__1, i__2;
+
+    /* Local variables */
+    integer i, m, nincx, mp1;
+
+
+/*     scales a vector by a constant.
+       uses unrolled loops for increment equal to 1.
+       jack dongarra, linpack, 3/11/78.
+       modified 3/93 to return if incx .le. 0.
+       modified 12/3/93, array(1) declarations changed to array(*)
+
+
+
+   Parameter adjustments
+       Function Body */
+#define SX(I) sx[(I)-1]
+
+
+    if (*n <= 0 || *incx <= 0) {
+	return 0;
+    }
+    if (*incx == 1) {
+	goto L20;
+    }
+
+/*        code for increment not equal to 1 (incx > 1 is guaranteed here) */
+
+    nincx = *n * *incx;
+    i__1 = nincx;
+    i__2 = *incx;
+    for (i = 1; i <= nincx; i += *incx) {
+	SX(i) = *sa * SX(i);
+/* L10: */
+    }
+    return 0;
+
+/*        code for increment equal to 1
+
+
+          clean-up loop */
+
+L20:
+    m = *n % 5;
+    if (m == 0) {
+	goto L40;
+    }
+    i__2 = m;
+    for (i = 1; i <= m; ++i) {
+	SX(i) = *sa * SX(i);
+/* L30: */
+    }
+    if (*n < 5) {
+	return 0;
+    }
+L40:
+    mp1 = m + 1;
+    i__2 = *n;
+    for (i = mp1; i <= *n; i += 5) {
+	SX(i) = *sa * SX(i);
+	SX(i + 1) = *sa * SX(i + 1);
+	SX(i + 2) = *sa * SX(i + 2);
+	SX(i + 3) = *sa * SX(i + 3);
+	SX(i + 4) = *sa * SX(i + 4);
+/* L50: */
+    }
+    return 0;
+} /* sscal_ */
+
diff --git a/CBLAS/ssymv.c b/CBLAS/ssymv.c
new file mode 100644
index 0000000..f58b37e
--- /dev/null
+++ b/CBLAS/ssymv.c
@@ -0,0 +1,299 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int ssymv_(char *uplo, integer *n, real *alpha, real *a, 
+	integer *lda, real *x, integer *incx, real *beta, real *y, integer *
+	incy)
+{
+    /* Locals are automatic (not static) so concurrent calls share no state. */
+
+    /* System generated locals */
+    integer i__1, i__2;
+
+    /* Local variables */
+    integer info;
+    real temp1, temp2;
+    integer i, j;
+    integer ix, iy, jx, jy, kx, ky;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+
+/*  Purpose
+    =======
+
+    SSYMV  performs the matrix-vector  operation
+
+       y := alpha*A*x + beta*y,
+
+    where alpha and beta are scalars, x and y are n element vectors and
+    A is an n by n symmetric matrix.
+
+    Parameters
+    ==========
+
+    UPLO   - CHARACTER*1.
+             On entry, UPLO specifies whether the upper or lower
+             triangular part of the array A is to be referenced as
+             follows:
+
+                UPLO = 'U' or 'u'   Only the upper triangular part of A
+                                    is to be referenced.
+
+                UPLO = 'L' or 'l'   Only the lower triangular part of A
+                                    is to be referenced.
+
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the order of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - REAL            .
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    A      - REAL             array of DIMENSION ( LDA, n ).
+             Before entry with  UPLO = 'U' or 'u', the leading n by n
+             upper triangular part of the array A must contain the upper
+
+             triangular part of the symmetric matrix and the strictly
+             lower triangular part of A is not referenced.
+             Before entry with UPLO = 'L' or 'l', the leading n by n
+             lower triangular part of the array A must contain the lower
+
+             triangular part of the symmetric matrix and the strictly
+             upper triangular part of A is not referenced.
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+
+             in the calling (sub) program. LDA must be at least
+             max( 1, n ).
+             Unchanged on exit.
+
+    X      - REAL             array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the n
+             element vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    BETA   - REAL            .
+             On entry, BETA specifies the scalar beta. When BETA is
+             supplied as zero then Y need not be set on input.
+             Unchanged on exit.
+
+    Y      - REAL             array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCY ) ).
+             Before entry, the incremented array Y must contain the n
+             element vector y. On exit, Y is overwritten by the updated
+             vector y.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+
+
+       Test the input parameters.
+
+
+   Parameter adjustments
+       Function Body */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+    /* UPLO is matched case-insensitively, as the parameter notes promise. */
+    info = 0;
+    if (*uplo != 'U' && *uplo != 'u' && *uplo != 'L' && *uplo != 'l') {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*lda < max(1,*n)) {
+	info = 5;
+    } else if (*incx == 0) {
+	info = 7;
+    } else if (*incy == 0) {
+	info = 10;
+    }
+    if (info != 0) {
+	input_error_dist("SSYMV ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
+	return 0;
+    }
+
+/*     Set up the start points in  X  and  Y. */
+
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through the triangular part
+       of A.
+
+       First form  y := beta*y. */
+
+    if (*beta != 1.f) {
+	if (*incy == 1) {
+	    if (*beta == 0.f) {
+		i__1 = *n;
+		for (i = 1; i <= *n; ++i) {
+		    Y(i) = 0.f;
+/* L10: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i = 1; i <= *n; ++i) {
+		    Y(i) = *beta * Y(i);
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (*beta == 0.f) {
+		i__1 = *n;
+		for (i = 1; i <= *n; ++i) {
+		    Y(iy) = 0.f;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i = 1; i <= *n; ++i) {
+		    Y(iy) = *beta * Y(iy);
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (*alpha == 0.f) {
+	return 0;
+    }
+    if (*uplo == 'U' || *uplo == 'u') {
+
+/*        Form  y  when A is stored in upper triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		temp1 = *alpha * X(j);
+		temp2 = 0.f;
+		i__2 = j - 1;
+		for (i = 1; i <= j-1; ++i) {
+		    Y(i) += temp1 * A(i,j);
+		    temp2 += A(i,j) * X(i);
+/* L50: */
+		}
+		Y(j) = Y(j) + temp1 * A(j,j) + *alpha * temp2;
+/* L60: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		temp1 = *alpha * X(jx);
+		temp2 = 0.f;
+		ix = kx;
+		iy = ky;
+		i__2 = j - 1;
+		for (i = 1; i <= j-1; ++i) {
+		    Y(iy) += temp1 * A(i,j);
+		    temp2 += A(i,j) * X(ix);
+		    ix += *incx;
+		    iy += *incy;
+/* L70: */
+		}
+		Y(jy) = Y(jy) + temp1 * A(j,j) + *alpha * temp2;
+		jx += *incx;
+		jy += *incy;
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y  when A is stored in lower triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		temp1 = *alpha * X(j);
+		temp2 = 0.f;
+		Y(j) += temp1 * A(j,j);
+		i__2 = *n;
+		for (i = j + 1; i <= *n; ++i) {
+		    Y(i) += temp1 * A(i,j);
+		    temp2 += A(i,j) * X(i);
+/* L90: */
+		}
+		Y(j) += *alpha * temp2;
+/* L100: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		temp1 = *alpha * X(jx);
+		temp2 = 0.f;
+		Y(jy) += temp1 * A(j,j);
+		ix = jx;
+		iy = jy;
+		i__2 = *n;
+		for (i = j + 1; i <= *n; ++i) {
+		    ix += *incx;
+		    iy += *incy;
+		    Y(iy) += temp1 * A(i,j);
+		    temp2 += A(i,j) * X(ix);
+/* L110: */
+		}
+		Y(jy) += *alpha * temp2;
+		jx += *incx;
+		jy += *incy;
+/* L120: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of SSYMV . */
+
+} /* ssymv_ */
+
diff --git a/CBLAS/ssyr2.c b/CBLAS/ssyr2.c
new file mode 100644
index 0000000..0929361
--- /dev/null
+++ b/CBLAS/ssyr2.c
@@ -0,0 +1,262 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int ssyr2_(char *uplo, integer *n, real *alpha, real *x, 
+	integer *incx, real *y, integer *incy, real *a, integer *lda)
+{
+    /* Locals are automatic (not static) so concurrent calls share no state. */
+
+    /* System generated locals */
+    integer i__1, i__2;
+
+    /* Local variables */
+    integer info;
+    real temp1, temp2;
+    integer i, j;
+    integer ix, iy, jx, jy, kx, ky;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+
+/*  Purpose
+    =======
+
+    SSYR2  performs the symmetric rank 2 operation
+
+       A := alpha*x*y' + alpha*y*x' + A,
+
+    where alpha is a scalar, x and y are n element vectors and A is an n
+
+    by n symmetric matrix.
+
+    Parameters
+    ==========
+
+    UPLO   - CHARACTER*1.
+             On entry, UPLO specifies whether the upper or lower
+             triangular part of the array A is to be referenced as
+             follows:
+
+                UPLO = 'U' or 'u'   Only the upper triangular part of A
+                                    is to be referenced.
+
+                UPLO = 'L' or 'l'   Only the lower triangular part of A
+                                    is to be referenced.
+
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the order of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - REAL            .
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    X      - REAL             array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the n
+             element vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    Y      - REAL             array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCY ) ).
+             Before entry, the incremented array Y must contain the n
+             element vector y.
+             Unchanged on exit.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+    A      - REAL             array of DIMENSION ( LDA, n ).
+             Before entry with  UPLO = 'U' or 'u', the leading n by n
+             upper triangular part of the array A must contain the upper
+
+             triangular part of the symmetric matrix and the strictly
+             lower triangular part of A is not referenced. On exit, the
+             upper triangular part of the array A is overwritten by the
+             upper triangular part of the updated matrix.
+             Before entry with UPLO = 'L' or 'l', the leading n by n
+             lower triangular part of the array A must contain the lower
+
+             triangular part of the symmetric matrix and the strictly
+             upper triangular part of A is not referenced. On exit, the
+             lower triangular part of the array A is overwritten by the
+             lower triangular part of the updated matrix.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+
+             in the calling (sub) program. LDA must be at least
+             max( 1, n ).
+             Unchanged on exit.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+
+
+       Test the input parameters.
+
+
+   Parameter adjustments
+       Function Body */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+    /* UPLO is matched case-insensitively, as the parameter notes promise. */
+    info = 0;
+    if (*uplo != 'U' && *uplo != 'u' && *uplo != 'L' && *uplo != 'l') {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*incx == 0) {
+	info = 5;
+    } else if (*incy == 0) {
+	info = 7;
+    } else if (*lda < max(1,*n)) {
+	info = 9;
+    }
+    if (info != 0) {
+	input_error_dist("SSYR2 ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || *alpha == 0.f) {
+	return 0;
+    }
+
+/*     Set up the start points in X and Y if the increments are not both
+
+       unity. */
+
+    if (*incx != 1 || *incy != 1) {
+	if (*incx > 0) {
+	    kx = 1;
+	} else {
+	    kx = 1 - (*n - 1) * *incx;
+	}
+	if (*incy > 0) {
+	    ky = 1;
+	} else {
+	    ky = 1 - (*n - 1) * *incy;
+	}
+	jx = kx;
+	jy = ky;
+    }
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through the triangular part
+       of A. */
+
+    if (*uplo == 'U' || *uplo == 'u') {
+
+/*        Form  A  when A is stored in the upper triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		if (X(j) != 0.f || Y(j) != 0.f) {
+		    temp1 = *alpha * Y(j);
+		    temp2 = *alpha * X(j);
+		    i__2 = j;
+		    for (i = 1; i <= j; ++i) {
+			A(i,j) = A(i,j) + X(i) * temp1 
+				+ Y(i) * temp2;
+/* L10: */
+		    }
+		}
+/* L20: */
+	    }
+	} else {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		if (X(jx) != 0.f || Y(jy) != 0.f) {
+		    temp1 = *alpha * Y(jy);
+		    temp2 = *alpha * X(jx);
+		    ix = kx;
+		    iy = ky;
+		    i__2 = j;
+		    for (i = 1; i <= j; ++i) {
+			A(i,j) = A(i,j) + X(ix) * temp1 
+				+ Y(iy) * temp2;
+			ix += *incx;
+			iy += *incy;
+/* L30: */
+		    }
+		}
+		jx += *incx;
+		jy += *incy;
+/* L40: */
+	    }
+	}
+    } else {
+
+/*        Form  A  when A is stored in the lower triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		if (X(j) != 0.f || Y(j) != 0.f) {
+		    temp1 = *alpha * Y(j);
+		    temp2 = *alpha * X(j);
+		    i__2 = *n;
+		    for (i = j; i <= *n; ++i) {
+			A(i,j) = A(i,j) + X(i) * temp1 
+				+ Y(i) * temp2;
+/* L50: */
+		    }
+		}
+/* L60: */
+	    }
+	} else {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		if (X(jx) != 0.f || Y(jy) != 0.f) {
+		    temp1 = *alpha * Y(jy);
+		    temp2 = *alpha * X(jx);
+		    ix = jx;
+		    iy = jy;
+		    i__2 = *n;
+		    for (i = j; i <= *n; ++i) {
+			A(i,j) = A(i,j) + X(ix) * temp1 
+				+ Y(iy) * temp2;
+			ix += *incx;
+			iy += *incy;
+/* L70: */
+		    }
+		}
+		jx += *incx;
+		jy += *incy;
+/* L80: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of SSYR2 . */
+
+} /* ssyr2_ */
+
diff --git a/CBLAS/strsv.c b/CBLAS/strsv.c
new file mode 100644
index 0000000..b80470d
--- /dev/null
+++ b/CBLAS/strsv.c
@@ -0,0 +1,337 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int strsv_(char *uplo, char *trans, char *diag, integer *n, 
+	real *a, integer *lda, real *x, integer *incx)
+{
+    /* Locals are automatic (not static) so concurrent calls share no state. */
+
+    /* System generated locals */
+    integer i__1, i__2;
+
+    /* Local variables */
+    integer info;
+    real temp;
+    integer i, j;
+    integer ix, jx, kx;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+    logical nounit;
+
+
+/*  Purpose
+    =======
+
+    STRSV  solves one of the systems of equations
+
+       A*x = b,   or   A'*x = b,
+
+    where b and x are n element vectors and A is an n by n unit, or
+    non-unit, upper or lower triangular matrix.
+
+    No test for singularity or near-singularity is included in this
+    routine. Such tests must be performed before calling this routine.
+
+    Parameters
+    ==========
+
+    UPLO   - CHARACTER*1.
+             On entry, UPLO specifies whether the matrix is an upper or
+             lower triangular matrix as follows:
+
+                UPLO = 'U' or 'u'   A is an upper triangular matrix.
+
+                UPLO = 'L' or 'l'   A is a lower triangular matrix.
+
+             Unchanged on exit.
+
+    TRANS  - CHARACTER*1.
+             On entry, TRANS specifies the equations to be solved as
+             follows:
+
+                TRANS = 'N' or 'n'   A*x = b.
+
+                TRANS = 'T' or 't'   A'*x = b.
+
+                TRANS = 'C' or 'c'   A'*x = b.
+
+             Unchanged on exit.
+
+    DIAG   - CHARACTER*1.
+             On entry, DIAG specifies whether or not A is unit
+             triangular as follows:
+
+                DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+
+                DIAG = 'N' or 'n'   A is not assumed to be unit
+                                    triangular.
+
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the order of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    A      - REAL             array of DIMENSION ( LDA, n ).
+             Before entry with  UPLO = 'U' or 'u', the leading n by n
+             upper triangular part of the array A must contain the upper
+
+             triangular matrix and the strictly lower triangular part of
+
+             A is not referenced.
+             Before entry with UPLO = 'L' or 'l', the leading n by n
+             lower triangular part of the array A must contain the lower
+
+             triangular matrix and the strictly upper triangular part of
+
+             A is not referenced.
+             Note that when  DIAG = 'U' or 'u', the diagonal elements of
+
+             A are not referenced either, but are assumed to be unity.
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+
+             in the calling (sub) program. LDA must be at least
+             max( 1, n ).
+             Unchanged on exit.
+
+    X      - REAL             array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the n
+             element right-hand side vector b. On exit, X is overwritten
+
+             with the solution vector x.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+
+
+       Test the input parameters.
+
+
+   Parameter adjustments
+       Function Body */
+#define X(I) x[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+    /* UPLO, TRANS and DIAG are matched case-insensitively, as documented. */
+    info = 0;
+    if (*uplo != 'U' && *uplo != 'u' && *uplo != 'L' && *uplo != 'l') {
+	info = 1;
+    } else if (*trans != 'N' && *trans != 'n' && *trans != 'T' &&
+	       *trans != 't' && *trans != 'C' && *trans != 'c') {
+	info = 2;
+    } else if (*diag != 'U' && *diag != 'u' && *diag != 'N' && *diag != 'n') {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*lda < max(1,*n)) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    }
+    if (info != 0) {
+	input_error_dist("STRSV ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+    nounit = (*diag == 'N' || *diag == 'n');
+
+/*     Set up the start point in X if the increment is not unity. This
+       will be  ( N - 1 )*INCX  too small for descending loops. */
+
+    if (*incx <= 0) {
+	kx = 1 - (*n - 1) * *incx;
+    } else if (*incx != 1) {
+	kx = 1;
+    }
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through A. */
+
+    if (*trans == 'N' || *trans == 'n') {
+
+/*        Form  x := inv( A )*x. */
+
+	if (*uplo == 'U' || *uplo == 'u') {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    if (X(j) != 0.f) {
+			if (nounit) {
+			    X(j) /= A(j,j);
+			}
+			temp = X(j);
+			for (i = j - 1; i >= 1; --i) {
+			    X(i) -= temp * A(i,j);
+/* L10: */
+			}
+		    }
+/* L20: */
+		}
+	    } else {
+		jx = kx + (*n - 1) * *incx;
+		for (j = *n; j >= 1; --j) {
+		    if (X(jx) != 0.f) {
+			if (nounit) {
+			    X(jx) /= A(j,j);
+			}
+			temp = X(jx);
+			ix = jx;
+			for (i = j - 1; i >= 1; --i) {
+			    ix -= *incx;
+			    X(ix) -= temp * A(i,j);
+/* L30: */
+			}
+		    }
+		    jx -= *incx;
+/* L40: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		i__1 = *n;
+		for (j = 1; j <= *n; ++j) {
+		    if (X(j) != 0.f) {
+			if (nounit) {
+			    X(j) /= A(j,j);
+			}
+			temp = X(j);
+			i__2 = *n;
+			for (i = j + 1; i <= *n; ++i) {
+			    X(i) -= temp * A(i,j);
+/* L50: */
+			}
+		    }
+/* L60: */
+		}
+	    } else {
+		jx = kx;
+		i__1 = *n;
+		for (j = 1; j <= *n; ++j) {
+		    if (X(jx) != 0.f) {
+			if (nounit) {
+			    X(jx) /= A(j,j);
+			}
+			temp = X(jx);
+			ix = jx;
+			i__2 = *n;
+			for (i = j + 1; i <= *n; ++i) {
+			    ix += *incx;
+			    X(ix) -= temp * A(i,j);
+/* L70: */
+			}
+		    }
+		    jx += *incx;
+/* L80: */
+		}
+	    }
+	}
+    } else {
+
+/*        Form  x := inv( A' )*x. */
+
+	if (*uplo == 'U' || *uplo == 'u') {
+	    if (*incx == 1) {
+		i__1 = *n;
+		for (j = 1; j <= *n; ++j) {
+		    temp = X(j);
+		    i__2 = j - 1;
+		    for (i = 1; i <= j-1; ++i) {
+			temp -= A(i,j) * X(i);
+/* L90: */
+		    }
+		    if (nounit) {
+			temp /= A(j,j);
+		    }
+		    X(j) = temp;
+/* L100: */
+		}
+	    } else {
+		jx = kx;
+		i__1 = *n;
+		for (j = 1; j <= *n; ++j) {
+		    temp = X(jx);
+		    ix = kx;
+		    i__2 = j - 1;
+		    for (i = 1; i <= j-1; ++i) {
+			temp -= A(i,j) * X(ix);
+			ix += *incx;
+/* L110: */
+		    }
+		    if (nounit) {
+			temp /= A(j,j);
+		    }
+		    X(jx) = temp;
+		    jx += *incx;
+/* L120: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    temp = X(j);
+		    i__1 = j + 1;
+		    for (i = *n; i >= j+1; --i) {
+			temp -= A(i,j) * X(i);
+/* L130: */
+		    }
+		    if (nounit) {
+			temp /= A(j,j);
+		    }
+		    X(j) = temp;
+/* L140: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    temp = X(jx);
+		    ix = kx;
+		    i__1 = j + 1;
+		    for (i = *n; i >= j+1; --i) {
+			temp -= A(i,j) * X(ix);
+			ix -= *incx;
+/* L150: */
+		    }
+		    if (nounit) {
+			temp /= A(j,j);
+		    }
+		    X(jx) = temp;
+		    jx -= *incx;
+/* L160: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of STRSV . */
+
+} /* strsv_ */
+
diff --git a/CBLAS/superlu_f2c.h b/CBLAS/superlu_f2c.h
new file mode 100644
index 0000000..226252b
--- /dev/null
+++ b/CBLAS/superlu_f2c.h
@@ -0,0 +1,43 @@
+/* f2c.h  --  Standard Fortran to C header file */
+
+/**  barf  [ba:rf]  2.  "He suggested using FORTRAN, and everybody barfed."
+
+	- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
+
+#include "Cnames.h" /* NOTE: pulled in before (outside) the F2C_INCLUDE guard */
+
+#ifndef F2C_INCLUDE
+#define F2C_INCLUDE
+
+typedef int integer; /* C type used for Fortran INTEGER arguments */
+typedef int logical; /* truth values; see TRUE_ / FALSE_ below */
+
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex; /* r = real part, i = imaginary part */
+typedef struct { doublereal r, i; } doublecomplex; /* as complex, with doublereal fields */
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+/* typedef long long longint; */ /* system-dependent */
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+/* The macros below may evaluate an argument twice: avoid side effects */
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (doublereal)abs(x)
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (doublereal)min(a,b)
+#define dmax(a,b) (doublereal)max(a,b)
+
+#define VOID void
+
+#endif
diff --git a/CBLAS/z_internal.c b/CBLAS/z_internal.c
new file mode 100644
index 0000000..393a4da
--- /dev/null
+++ b/CBLAS/z_internal.c
@@ -0,0 +1,45 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "f2c.h"
+
+/* Complex division: *c = (*a) / (*b) */
+void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b)
+{
+    /* Magnitudes of the divisor's real and imaginary parts. */
+    double mag_re = (b->r < 0.) ? -(b->r) : b->r;
+    double mag_im = (b->i < 0.) ? -(b->i) : b->i;
+    double q, scale, out_re, out_im;
+
+    if (mag_re <= mag_im) {
+	if (mag_im == 0) {
+	    fprintf(stderr, "z_div.c: division by zero");
+	    exit(-1);
+	}
+	/* |Im b| >= |Re b|: scale by the imaginary part of b. */
+	q = b->r / b->i;
+	scale = b->i * (1 + q * q);
+	out_re = (q * a->r + a->i) / scale;
+	out_im = (q * a->i - a->r) / scale;
+    } else {
+	/* Otherwise scale by the real part of b. */
+	q = b->i / b->r;
+	scale = b->r * (1 + q * q);
+	out_re = (a->r + q * a->i) / scale;
+	out_im = (a->i - q * a->r) / scale;
+    }
+    /* Both parts are computed before storing, so c may alias a or b. */
+    c->r = out_re, c->i = out_im;
+}
+
+/* Store the complex conjugate of *z in *r */
+void d_cnjg(doublecomplex *r, doublecomplex *z)
+{
+    r->i = -(z->i);
+    r->r = z->r;
+}
+
+/* Return the imaginary part of *z */
+double d_imag(doublecomplex *z)
+{
+    return z->i;
+}
diff --git a/CBLAS/zaxpy.c b/CBLAS/zaxpy.c
new file mode 100644
index 0000000..37d4f32
--- /dev/null
+++ b/CBLAS/zaxpy.c
@@ -0,0 +1,87 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int zaxpy_(integer *n, doublecomplex *za, doublecomplex *zx, 
+	integer *incx, doublecomplex *zy, integer *incy)
+{
+
+/*  Purpose
+    =======
+
+    ZAXPY computes a constant times a vector plus a vector:
+
+       zy := za*zx + zy
+
+    (jack dongarra, 3/11/78; modified 12/3/93, array(1) declarations
+    changed to array(*))
+
+    Parameters
+    ==========
+
+    n     - number of elements to update; nothing is done if n <= 0.
+    za    - complex scalar multiplier; nothing is done if
+            dcabs1_(za) == 0.
+    zx    - source vector, read with stride *incx.
+    incx  - stride between consecutive elements of zx.
+    zy    - vector updated in place, accessed with stride *incy.
+    incy  - stride between consecutive elements of zy.
+
+    With a negative stride the walk starts at element (1-n)*inc + 1
+    (1-based) and steps by inc from there.
+
+    Always returns 0.
+
+    All locals are automatic (the f2c output declared them static), so
+    concurrent calls from several threads do not share loop state. */
+
+    /* Local variables */
+    integer i, ix, iy;
+    doublecomplex prod;
+    extern doublereal dcabs1_(doublecomplex *);
+
+/* 1-based element accessors, as in the Fortran original */
+#define ZY(I) zy[(I)-1]
+#define ZX(I) zx[(I)-1]
+
+    /* Quick returns */
+    if (*n <= 0) {
+	return 0;
+    }
+    if (dcabs1_(za) == 0.) {
+	return 0;
+    }
+
+    if (*incx == 1 && *incy == 1) {
+
+/*        code for both increments equal to 1 */
+
+	for (i = 1; i <= *n; ++i) {
+	    prod.r = za->r * ZX(i).r - za->i * ZX(i).i;
+	    prod.i = za->r * ZX(i).i + za->i * ZX(i).r;
+	    ZY(i).r += prod.r;
+	    ZY(i).i += prod.i;
+	}
+	return 0;
+    }
+
+/*        code for unequal increments or equal increments
+            not equal to 1 */
+
+    ix = (*incx < 0) ? (-(*n) + 1) * *incx + 1 : 1;
+    iy = (*incy < 0) ? (-(*n) + 1) * *incy + 1 : 1;
+    for (i = 1; i <= *n; ++i) {
+	prod.r = za->r * ZX(ix).r - za->i * ZX(ix).i;
+	prod.i = za->r * ZX(ix).i + za->i * ZX(ix).r;
+	ZY(iy).r += prod.r;
+	ZY(iy).i += prod.i;
+	ix += *incx;
+	iy += *incy;
+    }
+    return 0;
+} /* zaxpy_ */
+
diff --git a/CBLAS/zcopy.c b/CBLAS/zcopy.c
new file mode 100644
index 0000000..4cec89c
--- /dev/null
+++ b/CBLAS/zcopy.c
@@ -0,0 +1,74 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int zcopy_(integer *n, doublecomplex *zx, integer *incx, 
+	doublecomplex *zy, integer *incy)
+{
+
+/*  Purpose
+    =======
+
+    ZCOPY copies a vector, zx, to a vector, zy.
+
+    (jack dongarra, linpack, 4/11/78; modified 12/3/93, array(1)
+    declarations changed to array(*))
+
+    Parameters
+    ==========
+
+    n     - number of elements to copy; nothing is done if n <= 0.
+    zx    - source vector, read with stride *incx.
+    incx  - stride between consecutive elements of zx.
+    zy    - destination vector, written with stride *incy.
+    incy  - stride between consecutive elements of zy.
+
+    With a negative stride the walk starts at element (1-n)*inc + 1
+    (1-based) and steps by inc from there.
+
+    Always returns 0.
+
+    All locals are automatic (the f2c output declared them static), so
+    concurrent calls from several threads do not share loop state. */
+
+    /* Local variables */
+    integer i, ix, iy;
+
+/* 1-based element accessors, as in the Fortran original */
+#define ZY(I) zy[(I)-1]
+#define ZX(I) zx[(I)-1]
+
+    /* Quick return */
+    if (*n <= 0) {
+	return 0;
+    }
+
+    if (*incx == 1 && *incy == 1) {
+
+/*        code for both increments equal to 1 */
+
+	for (i = 1; i <= *n; ++i) {
+	    ZY(i).r = ZX(i).r;
+	    ZY(i).i = ZX(i).i;
+	}
+	return 0;
+    }
+
+/*        code for unequal increments or equal increments
+            not equal to 1 */
+
+    ix = (*incx < 0) ? (-(*n) + 1) * *incx + 1 : 1;
+    iy = (*incy < 0) ? (-(*n) + 1) * *incy + 1 : 1;
+    for (i = 1; i <= *n; ++i) {
+	ZY(iy).r = ZX(ix).r;
+	ZY(iy).i = ZX(ix).i;
+	ix += *incx;
+	iy += *incy;
+    }
+    return 0;
+} /* zcopy_ */
+
diff --git a/CBLAS/zdotc.c b/CBLAS/zdotc.c
new file mode 100644
index 0000000..63ba4fb
--- /dev/null
+++ b/CBLAS/zdotc.c
@@ -0,0 +1,85 @@
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Double Complex */ VOID zdotc_(doublecomplex * ret_val, integer *n, 
+	doublecomplex *zx, integer *incx, doublecomplex *zy, integer *incy)
+{
+
+/*  Purpose
+    =======
+
+    ZDOTC forms the dot product of two vectors, conjugating the first:
+
+       ret_val := sum( conjg( zx(i) )*zy(i) ),  i = 1..n
+
+    (jack dongarra, 3/11/78; modified 12/3/93, array(1) declarations
+    changed to array(*))
+
+    Parameters
+    ==========
+
+    ret_val - receives the result; set to zero if n <= 0.
+    n       - number of terms in the sum.
+    zx      - first vector (conjugated), read with stride *incx.
+    incx    - stride between consecutive elements of zx.
+    zy      - second vector, read with stride *incy.
+    incy    - stride between consecutive elements of zy.
+
+    With a negative stride the walk starts at element (1-n)*inc + 1
+    (1-based) and steps by inc from there.
+
+    All locals are automatic (the f2c output declared them static), so
+    concurrent calls from several threads do not share the accumulator
+    or loop state.  Elements are addressed as v[k-1] instead of
+    decrementing the argument pointers before the start of the array. */
+
+    /* Builtin functions */
+    void d_cnjg(doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    integer i, ix, iy;
+    doublecomplex ztemp, xc, prod;
+
+    /* Function Body */
+    ztemp.r = 0., ztemp.i = 0.;
+    ret_val->r = 0., ret_val->i = 0.;
+    if (*n <= 0) {
+	return ;
+    }
+
+    if (*incx == 1 && *incy == 1) {
+
+/*        code for both increments equal to 1 */
+
+	for (i = 1; i <= *n; ++i) {
+	    d_cnjg(&xc, &zx[i - 1]);
+	    prod.r = xc.r * zy[i - 1].r - xc.i * zy[i - 1].i;
+	    prod.i = xc.r * zy[i - 1].i + xc.i * zy[i - 1].r;
+	    ztemp.r += prod.r;
+	    ztemp.i += prod.i;
+	}
+    } else {
+
+/*        code for unequal increments or equal increments
+            not equal to 1 */
+
+	ix = (*incx < 0) ? (-(*n) + 1) * *incx + 1 : 1;
+	iy = (*incy < 0) ? (-(*n) + 1) * *incy + 1 : 1;
+	for (i = 1; i <= *n; ++i) {
+	    d_cnjg(&xc, &zx[ix - 1]);
+	    prod.r = xc.r * zy[iy - 1].r - xc.i * zy[iy - 1].i;
+	    prod.i = xc.r * zy[iy - 1].i + xc.i * zy[iy - 1].r;
+	    ztemp.r += prod.r;
+	    ztemp.i += prod.i;
+	    ix += *incx;
+	    iy += *incy;
+	}
+    }
+    ret_val->r = ztemp.r, ret_val->i = ztemp.i;
+    return ;
+} /* zdotc_ */
+
diff --git a/CBLAS/zgemm.c b/CBLAS/zgemm.c
new file mode 100644
index 0000000..f27de60
--- /dev/null
+++ b/CBLAS/zgemm.c
@@ -0,0 +1,689 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int zgemm_(char *transa, char *transb, integer *m, integer *
+	n, integer *k, doublecomplex *alpha, doublecomplex *a, integer *lda, 
+	doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex *c,
+	 integer *ldc)
+{
+/* NOTE(review): a_dim1, b_dim1 and c_dim1 are never assigned here; the
+   i__N = ... * x_dim1 statements read them and their results are unused. */
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6;
+    doublecomplex z__1, z__2, z__3, z__4;
+
+    /* Builtin functions */
+    void d_cnjg(doublecomplex *, doublecomplex *);
+
+    /* Local variables (static storage: shared by every call) */
+    static integer info;
+    static logical nota, notb;
+    static doublecomplex temp;
+    static integer i, j, l;
+    static logical conja, conjb;
+    static integer ncola;
+    static integer nrowa, nrowb;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+
+/*  Purpose   
+    =======   
+
+    ZGEMM  performs one of the matrix-matrix operations   
+
+       C := alpha*op( A )*op( B ) + beta*C,   
+
+    where  op( X ) is one of   
+
+       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),   
+
+    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
+  
+    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
+  
+
+    Parameters   
+    ==========   
+
+    TRANSA - CHARACTER*1.   
+             On entry, TRANSA specifies the form of op( A ) to be used in 
+  
+             the matrix multiplication as follows:   
+
+                TRANSA = 'N' or 'n',  op( A ) = A.   
+
+                TRANSA = 'T' or 't',  op( A ) = A'.   
+
+                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
+
+             Unchanged on exit.   
+
+    TRANSB - CHARACTER*1.   
+             On entry, TRANSB specifies the form of op( B ) to be used in 
+  
+             the matrix multiplication as follows:   
+
+                TRANSB = 'N' or 'n',  op( B ) = B.   
+
+                TRANSB = 'T' or 't',  op( B ) = B'.   
+
+                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
+
+             Unchanged on exit.   
+
+    M      - INTEGER.   
+             On entry,  M  specifies  the number  of rows  of the  matrix 
+  
+             op( A )  and of the  matrix  C.  M  must  be at least  zero. 
+  
+             Unchanged on exit.   
+
+    N      - INTEGER.   
+             On entry,  N  specifies the number  of columns of the matrix 
+  
+             op( B ) and the number of columns of the matrix C. N must be 
+  
+             at least zero.   
+             Unchanged on exit.   
+
+    K      - INTEGER.   
+             On entry,  K  specifies  the number of columns of the matrix 
+  
+             op( A ) and the number of rows of the matrix op( B ). K must 
+  
+             be at least  zero.   
+             Unchanged on exit.   
+
+    ALPHA  - COMPLEX*16      .   
+             On entry, ALPHA specifies the scalar alpha.   
+             Unchanged on exit.   
+
+    A      - COMPLEX*16       array of DIMENSION ( LDA, ka ), where ka is 
+  
+             k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.   
+             Before entry with  TRANSA = 'N' or 'n',  the leading  m by k 
+  
+             part of the array  A  must contain the matrix  A,  otherwise 
+  
+             the leading  k by m  part of the array  A  must contain  the 
+  
+             matrix A.   
+             Unchanged on exit.   
+
+    LDA    - INTEGER.   
+             On entry, LDA specifies the first dimension of A as declared 
+  
+             in the calling (sub) program. When  TRANSA = 'N' or 'n' then 
+  
+             LDA must be at least  max( 1, m ), otherwise  LDA must be at 
+  
+             least  max( 1, k ).   
+             Unchanged on exit.   
+
+    B      - COMPLEX*16       array of DIMENSION ( LDB, kb ), where kb is 
+  
+             n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.   
+             Before entry with  TRANSB = 'N' or 'n',  the leading  k by n 
+  
+             part of the array  B  must contain the matrix  B,  otherwise 
+  
+             the leading  n by k  part of the array  B  must contain  the 
+  
+             matrix B.   
+             Unchanged on exit.   
+
+    LDB    - INTEGER.   
+             On entry, LDB specifies the first dimension of B as declared 
+  
+             in the calling (sub) program. When  TRANSB = 'N' or 'n' then 
+  
+             LDB must be at least  max( 1, k ), otherwise  LDB must be at 
+  
+             least  max( 1, n ).   
+             Unchanged on exit.   
+
+    BETA   - COMPLEX*16      .   
+             On entry,  BETA  specifies the scalar  beta.  When  BETA  is 
+  
+             supplied as zero then C need not be set on input.   
+             Unchanged on exit.   
+
+    C      - COMPLEX*16       array of DIMENSION ( LDC, n ).   
+             Before entry, the leading  m by n  part of the array  C must 
+  
+             contain the matrix  C,  except when  beta  is zero, in which 
+  
+             case C need not be set on entry.   
+             On exit, the array  C  is overwritten by the  m by n  matrix 
+  
+             ( alpha*op( A )*op( B ) + beta*C ).   
+
+    LDC    - INTEGER.   
+             On entry, LDC specifies the first dimension of C as declared 
+  
+             in  the  calling  (sub)  program.   LDC  must  be  at  least 
+  
+             max( 1, m ).   
+             Unchanged on exit.   
+
+
+    Level 3 Blas routine.   
+
+    -- Written on 8-February-1989.   
+       Jack Dongarra, Argonne National Laboratory.   
+       Iain Duff, AERE Harwell.   
+       Jeremy Du Croz, Numerical Algorithms Group Ltd.   
+       Sven Hammarling, Numerical Algorithms Group Ltd.   
+
+       Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not 
+       conjugated or transposed, set  CONJA and CONJB  as true if  A  and 
+       B  respectively are to be  conjugate-transposed  ('C')  and  set 
+       NROWA, NCOLA and  NROWB  as the number of rows and  columns  of  A 
+       and the number of rows of  B  respectively.   
+    
+   Parameter adjustments   
+       Function Body */
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
+#define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)]
+/* NOTE(review): case-sensitive; lowercase 'n'/'t'/'c' yields info = 1 or 2 */
+    nota = (strncmp(transa, "N", 1)==0);
+    notb = (strncmp(transb, "N", 1)==0);
+    conja = (strncmp(transa, "C", 1)==0);
+    conjb = (strncmp(transb, "C", 1)==0);
+    if (nota) {
+	nrowa = *m;
+	ncola = *k;
+    } else {
+	nrowa = *k;
+	ncola = *m;
+    }
+    if (notb) {
+	nrowb = *k;
+    } else {
+	nrowb = *n;
+    }
+
+/*     Test the input parameters. */
+
+    info = 0;
+    if (! nota && ! conja && strncmp(transa, "T", 1)!=0) {
+	info = 1;
+    } else if (! notb && ! conjb && strncmp(transb, "T", 1)!=0) {
+	info = 2;
+    } else if (*m < 0) {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*k < 0) {
+	info = 5;
+    } else if (*lda < max(1,nrowa)) {
+	info = 8;
+    } else if (*ldb < max(1,nrowb)) {
+	info = 10;
+    } else if (*ldc < max(1,*m)) {
+	info = 13;
+    }
+    if (info != 0) {
+	input_error_dist("ZGEMM ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 || (alpha->r == 0. && alpha->i == 0. || *k == 0) &&
+	     (beta->r == 1. && beta->i == 0.)) {
+	return 0;
+    }
+
+/*     And when  alpha.eq.zero. */
+
+    if (alpha->r == 0. && alpha->i == 0.) {
+	if (beta->r == 0. && beta->i == 0.) {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = *m;
+		for (i = 1; i <= *m; ++i) {
+		    i__3 = i + j * c_dim1;
+		    C(i,j).r = 0., C(i,j).i = 0.;
+/* L10: */
+		}
+/* L20: */
+	    }
+	} else {
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = *m;
+		for (i = 1; i <= *m; ++i) {
+		    i__3 = i + j * c_dim1;
+		    i__4 = i + j * c_dim1;
+		    z__1.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
+			    z__1.i = beta->r * C(i,j).i + beta->i * C(i,j)
+			    .r;
+		    C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+/* L30: */
+		}
+/* L40: */
+	    }
+	}
+	return 0;
+    }
+
+/*     Start the operations. */
+
+    if (notb) {
+	if (nota) {
+
+/*           Form  C := alpha*A*B + beta*C. */
+
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		if (beta->r == 0. && beta->i == 0.) {
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			i__3 = i + j * c_dim1;
+			C(i,j).r = 0., C(i,j).i = 0.;
+/* L50: */
+		    }
+		} else if (beta->r != 1. || beta->i != 0.) {
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			i__3 = i + j * c_dim1;
+			i__4 = i + j * c_dim1;
+			z__1.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
+				z__1.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+/* L60: */
+		    }
+		}
+		i__2 = *k;
+		for (l = 1; l <= *k; ++l) {
+		    i__3 = l + j * b_dim1;
+		    if (B(l,j).r != 0. || B(l,j).i != 0.) {
+			i__3 = l + j * b_dim1;
+			z__1.r = alpha->r * B(l,j).r - alpha->i * B(l,j).i, 
+				z__1.i = alpha->r * B(l,j).i + alpha->i * B(l,j).r;
+			temp.r = z__1.r, temp.i = z__1.i;
+			i__3 = *m;
+			for (i = 1; i <= *m; ++i) {
+			    i__4 = i + j * c_dim1;
+			    i__5 = i + j * c_dim1;
+			    i__6 = i + l * a_dim1;
+			    z__2.r = temp.r * A(i,l).r - temp.i * A(i,l).i, 
+				    z__2.i = temp.r * A(i,l).i + temp.i * A(i,l).r;
+			    z__1.r = C(i,j).r + z__2.r, z__1.i = C(i,j).i + 
+				    z__2.i;
+			    C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+/* L70: */
+			}
+		    }
+/* L80: */
+		}
+/* L90: */
+	    }
+	} else if (conja) {
+
+/*           Form  C := alpha*conjg( A' )*B + beta*C. */
+
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = *m;
+		for (i = 1; i <= *m; ++i) {
+		    temp.r = 0., temp.i = 0.;
+		    i__3 = *k;
+		    for (l = 1; l <= *k; ++l) {
+			d_cnjg(&z__3, &A(l,i));
+			i__4 = l + j * b_dim1;
+			z__2.r = z__3.r * B(l,j).r - z__3.i * B(l,j).i, 
+				z__2.i = z__3.r * B(l,j).i + z__3.i * B(l,j)
+				.r;
+			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+			temp.r = z__1.r, temp.i = z__1.i;
+/* L100: */
+		    }
+		    if (beta->r == 0. && beta->i == 0.) {
+			i__3 = i + j * c_dim1;
+			z__1.r = alpha->r * temp.r - alpha->i * temp.i, 
+				z__1.i = alpha->r * temp.i + alpha->i * 
+				temp.r;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+		    } else {
+			i__3 = i + j * c_dim1;
+			z__2.r = alpha->r * temp.r - alpha->i * temp.i, 
+				z__2.i = alpha->r * temp.i + alpha->i * 
+				temp.r;
+			i__4 = i + j * c_dim1;
+			z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
+				z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
+			z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+		    }
+/* L110: */
+		}
+/* L120: */
+	    }
+	} else {
+
+/*           Form  C := alpha*A'*B + beta*C */
+
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = *m;
+		for (i = 1; i <= *m; ++i) {
+		    temp.r = 0., temp.i = 0.;
+		    i__3 = *k;
+		    for (l = 1; l <= *k; ++l) {
+			i__4 = l + i * a_dim1;
+			i__5 = l + j * b_dim1;
+			z__2.r = A(l,i).r * B(l,j).r - A(l,i).i * B(l,j)
+				.i, z__2.i = A(l,i).r * B(l,j).i + A(l,i)
+				.i * B(l,j).r;
+			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+			temp.r = z__1.r, temp.i = z__1.i;
+/* L130: */
+		    }
+		    if (beta->r == 0. && beta->i == 0.) {
+			i__3 = i + j * c_dim1;
+			z__1.r = alpha->r * temp.r - alpha->i * temp.i, 
+				z__1.i = alpha->r * temp.i + alpha->i * 
+				temp.r;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+		    } else {
+			i__3 = i + j * c_dim1;
+			z__2.r = alpha->r * temp.r - alpha->i * temp.i, 
+				z__2.i = alpha->r * temp.i + alpha->i * 
+				temp.r;
+			i__4 = i + j * c_dim1;
+			z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
+				z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
+			z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+		    }
+/* L140: */
+		}
+/* L150: */
+	    }
+	}
+    } else if (nota) {
+	if (conjb) {
+
+/*           Form  C := alpha*A*conjg( B' ) + beta*C. */
+
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		if (beta->r == 0. && beta->i == 0.) {
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			i__3 = i + j * c_dim1;
+			C(i,j).r = 0., C(i,j).i = 0.;
+/* L160: */
+		    }
+		} else if (beta->r != 1. || beta->i != 0.) {
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			i__3 = i + j * c_dim1;
+			i__4 = i + j * c_dim1;
+			z__1.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
+				z__1.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+/* L170: */
+		    }
+		}
+		i__2 = *k;
+		for (l = 1; l <= *k; ++l) {
+		    i__3 = j + l * b_dim1;
+		    if (B(j,l).r != 0. || B(j,l).i != 0.) {
+			d_cnjg(&z__2, &B(j,l));
+			z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, 
+				z__1.i = alpha->r * z__2.i + alpha->i * 
+				z__2.r;
+			temp.r = z__1.r, temp.i = z__1.i;
+			i__3 = *m;
+			for (i = 1; i <= *m; ++i) {
+			    i__4 = i + j * c_dim1;
+			    i__5 = i + j * c_dim1;
+			    i__6 = i + l * a_dim1;
+			    z__2.r = temp.r * A(i,l).r - temp.i * A(i,l).i, 
+				    z__2.i = temp.r * A(i,l).i + temp.i * A(i,l).r;
+			    z__1.r = C(i,j).r + z__2.r, z__1.i = C(i,j).i + 
+				    z__2.i;
+			    C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+/* L180: */
+			}
+		    }
+/* L190: */
+		}
+/* L200: */
+	    }
+	} else {
+
+/*           Form  C := alpha*A*B'          + beta*C */
+
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		if (beta->r == 0. && beta->i == 0.) {
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			i__3 = i + j * c_dim1;
+			C(i,j).r = 0., C(i,j).i = 0.;
+/* L210: */
+		    }
+		} else if (beta->r != 1. || beta->i != 0.) {
+		    i__2 = *m;
+		    for (i = 1; i <= *m; ++i) {
+			i__3 = i + j * c_dim1;
+			i__4 = i + j * c_dim1;
+			z__1.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
+				z__1.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+/* L220: */
+		    }
+		}
+		i__2 = *k;
+		for (l = 1; l <= *k; ++l) {
+		    i__3 = j + l * b_dim1;
+		    if (B(j,l).r != 0. || B(j,l).i != 0.) {
+			i__3 = j + l * b_dim1;
+			z__1.r = alpha->r * B(j,l).r - alpha->i * B(j,l).i, 
+				z__1.i = alpha->r * B(j,l).i + alpha->i * B(j,l).r;
+			temp.r = z__1.r, temp.i = z__1.i;
+			i__3 = *m;
+			for (i = 1; i <= *m; ++i) {
+			    i__4 = i + j * c_dim1;
+			    i__5 = i + j * c_dim1;
+			    i__6 = i + l * a_dim1;
+			    z__2.r = temp.r * A(i,l).r - temp.i * A(i,l).i, 
+				    z__2.i = temp.r * A(i,l).i + temp.i * A(i,l).r;
+			    z__1.r = C(i,j).r + z__2.r, z__1.i = C(i,j).i + 
+				    z__2.i;
+			    C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+/* L230: */
+			}
+		    }
+/* L240: */
+		}
+/* L250: */
+	    }
+	}
+    } else if (conja) {
+	if (conjb) {
+
+/*           Form  C := alpha*conjg( A' )*conjg( B' ) + beta*C. */
+
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = *m;
+		for (i = 1; i <= *m; ++i) {
+		    temp.r = 0., temp.i = 0.;
+		    i__3 = *k;
+		    for (l = 1; l <= *k; ++l) {
+			d_cnjg(&z__3, &A(l,i));
+			d_cnjg(&z__4, &B(j,l));
+			z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, z__2.i = 
+				z__3.r * z__4.i + z__3.i * z__4.r;
+			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+			temp.r = z__1.r, temp.i = z__1.i;
+/* L260: */
+		    }
+		    if (beta->r == 0. && beta->i == 0.) {
+			i__3 = i + j * c_dim1;
+			z__1.r = alpha->r * temp.r - alpha->i * temp.i, 
+				z__1.i = alpha->r * temp.i + alpha->i * 
+				temp.r;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+		    } else {
+			i__3 = i + j * c_dim1;
+			z__2.r = alpha->r * temp.r - alpha->i * temp.i, 
+				z__2.i = alpha->r * temp.i + alpha->i * 
+				temp.r;
+			i__4 = i + j * c_dim1;
+			z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
+				z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
+			z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+		    }
+/* L270: */
+		}
+/* L280: */
+	    }
+	} else {
+
+/*           Form  C := alpha*conjg( A' )*B' + beta*C */
+
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = *m;
+		for (i = 1; i <= *m; ++i) {
+		    temp.r = 0., temp.i = 0.;
+		    i__3 = *k;
+		    for (l = 1; l <= *k; ++l) {
+			d_cnjg(&z__3, &A(l,i));
+			i__4 = j + l * b_dim1;
+			z__2.r = z__3.r * B(j,l).r - z__3.i * B(j,l).i, 
+				z__2.i = z__3.r * B(j,l).i + z__3.i * B(j,l)
+				.r;
+			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+			temp.r = z__1.r, temp.i = z__1.i;
+/* L290: */
+		    }
+		    if (beta->r == 0. && beta->i == 0.) {
+			i__3 = i + j * c_dim1;
+			z__1.r = alpha->r * temp.r - alpha->i * temp.i, 
+				z__1.i = alpha->r * temp.i + alpha->i * 
+				temp.r;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+		    } else {
+			i__3 = i + j * c_dim1;
+			z__2.r = alpha->r * temp.r - alpha->i * temp.i, 
+				z__2.i = alpha->r * temp.i + alpha->i * 
+				temp.r;
+			i__4 = i + j * c_dim1;
+			z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
+				z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
+			z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+		    }
+/* L300: */
+		}
+/* L310: */
+	    }
+	}
+    } else {
+	if (conjb) {
+
+/*           Form  C := alpha*A'*conjg( B' ) + beta*C */
+
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = *m;
+		for (i = 1; i <= *m; ++i) {
+		    temp.r = 0., temp.i = 0.;
+		    i__3 = *k;
+		    for (l = 1; l <= *k; ++l) {
+			i__4 = l + i * a_dim1;
+			d_cnjg(&z__3, &B(j,l));
+			z__2.r = A(l,i).r * z__3.r - A(l,i).i * z__3.i, 
+				z__2.i = A(l,i).r * z__3.i + A(l,i).i * 
+				z__3.r;
+			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+			temp.r = z__1.r, temp.i = z__1.i;
+/* L320: */
+		    }
+		    if (beta->r == 0. && beta->i == 0.) {
+			i__3 = i + j * c_dim1;
+			z__1.r = alpha->r * temp.r - alpha->i * temp.i, 
+				z__1.i = alpha->r * temp.i + alpha->i * 
+				temp.r;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+		    } else {
+			i__3 = i + j * c_dim1;
+			z__2.r = alpha->r * temp.r - alpha->i * temp.i, 
+				z__2.i = alpha->r * temp.i + alpha->i * 
+				temp.r;
+			i__4 = i + j * c_dim1;
+			z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
+				z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
+			z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+		    }
+/* L330: */
+		}
+/* L340: */
+	    }
+	} else {
+
+/*           Form  C := alpha*A'*B' + beta*C */
+
+	    i__1 = *n;
+	    for (j = 1; j <= *n; ++j) {
+		i__2 = *m;
+		for (i = 1; i <= *m; ++i) {
+		    temp.r = 0., temp.i = 0.;
+		    i__3 = *k;
+		    for (l = 1; l <= *k; ++l) {
+			i__4 = l + i * a_dim1;
+			i__5 = j + l * b_dim1;
+			z__2.r = A(l,i).r * B(j,l).r - A(l,i).i * B(j,l)
+				.i, z__2.i = A(l,i).r * B(j,l).i + A(l,i)
+				.i * B(j,l).r;
+			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+			temp.r = z__1.r, temp.i = z__1.i;
+/* L350: */
+		    }
+		    if (beta->r == 0. && beta->i == 0.) {
+			i__3 = i + j * c_dim1;
+			z__1.r = alpha->r * temp.r - alpha->i * temp.i, 
+				z__1.i = alpha->r * temp.i + alpha->i * 
+				temp.r;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+		    } else {
+			i__3 = i + j * c_dim1;
+			z__2.r = alpha->r * temp.r - alpha->i * temp.i, 
+				z__2.i = alpha->r * temp.i + alpha->i * 
+				temp.r;
+			i__4 = i + j * c_dim1;
+			z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
+				z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
+			z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
+		    }
+/* L360: */
+		}
+/* L370: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of ZGEMM . */
+
+} /* zgemm_ */
+
diff --git a/CBLAS/zgemv.c b/CBLAS/zgemv.c
new file mode 100644
index 0000000..a34a309
--- /dev/null
+++ b/CBLAS/zgemv.c
@@ -0,0 +1,399 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int zgemv_(char *trans, integer *m, integer *n, 
+	doublecomplex *alpha, doublecomplex *a, integer *lda, doublecomplex *
+	x, integer *incx, doublecomplex *beta, doublecomplex *y, integer *
+	incy)
+{
+    /* Builtin functions */
+    void d_cnjg(doublecomplex *, doublecomplex *);
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+    /* Local variables.  All scratch storage is automatic, so the routine
+       keeps no state between calls. */
+    integer info;
+    doublecomplex temp, prod, cj;
+    integer lenx, leny, i, j;
+    integer ix, iy, jx, jy, kx, ky;
+    logical notran, noconj;
+
+
+/*  Purpose   
+    =======   
+
+    ZGEMV  performs one of the matrix-vector operations   
+
+       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   or   
+
+       y := alpha*conjg( A' )*x + beta*y,   
+
+    where alpha and beta are scalars, x and y are vectors and A is an   
+    m by n matrix.   
+
+    Parameters   
+    ==========   
+
+    TRANS  - CHARACTER*1.   
+             On entry, TRANS specifies the operation to be performed as   
+             follows:   
+
+                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
+
+                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
+
+                TRANS = 'C' or 'c'   y := alpha*conjg( A' )*x + beta*y.   
+
+             Only the first character is examined.   
+             Unchanged on exit.   
+
+    M      - INTEGER.   
+             On entry, M specifies the number of rows of the matrix A.   
+             M must be at least zero.   
+             Unchanged on exit.   
+
+    N      - INTEGER.   
+             On entry, N specifies the number of columns of the matrix A.   
+             N must be at least zero.   
+             Unchanged on exit.   
+
+    ALPHA  - COMPLEX*16      .   
+             On entry, ALPHA specifies the scalar alpha.   
+             Unchanged on exit.   
+
+    A      - COMPLEX*16       array of DIMENSION ( LDA, n ).   
+             Before entry, the leading m by n part of the array A must   
+             contain the matrix of coefficients.   
+             Unchanged on exit.   
+
+    LDA    - INTEGER.   
+             On entry, LDA specifies the first dimension of A as declared   
+             in the calling (sub) program. LDA must be at least   
+             max( 1, m ).   
+             Unchanged on exit.   
+
+    X      - COMPLEX*16       array of DIMENSION at least   
+             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
+             Before entry, the incremented array X must contain the   
+             vector x.   
+             Unchanged on exit.   
+
+    INCX   - INTEGER.   
+             On entry, INCX specifies the increment for the elements of   
+             X. INCX must not be zero.   
+             Unchanged on exit.   
+
+    BETA   - COMPLEX*16      .   
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then Y need not be set on input.   
+             Unchanged on exit.   
+
+    Y      - COMPLEX*16       array of DIMENSION at least   
+             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
+             Before entry with BETA non-zero, the incremented array Y   
+             must contain the vector y. On exit, Y is overwritten by the   
+             updated vector y.   
+
+    INCY   - INTEGER.   
+             On entry, INCY specifies the increment for the elements of   
+             Y. INCY must not be zero.   
+             Unchanged on exit.   
+
+    Return value: always 0.  An invalid argument is reported through   
+    input_error_dist() with the 1-based position of the offending   
+    argument before returning.   
+
+
+    Level 2 Blas routine.   
+
+    -- Written on 22-October-1986.   
+       Jack Dongarra, Argonne National Lab.   
+       Jeremy Du Croz, Nag Central Office.   
+       Sven Hammarling, Nag Central Office.   
+       Richard Hanson, Sandia National Labs.   
+*/
+
+/*     1-based (Fortran style) accessors; A is stored column-major with   
+       leading dimension *lda. */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+/*     Decode TRANS once, accepting either letter case as documented. */
+
+    notran = (*trans == 'N' || *trans == 'n');
+    noconj = (*trans == 'T' || *trans == 't');
+
+/*     Test the input parameters. */
+
+    info = 0;
+    if (!notran && !noconj && *trans != 'C' && *trans != 'c') {
+        info = 1;
+    } else if (*m < 0) {
+        info = 2;
+    } else if (*n < 0) {
+        info = 3;
+    } else if (*lda < max(1,*m)) {
+        info = 6;
+    } else if (*incx == 0) {
+        info = 8;
+    } else if (*incy == 0) {
+        info = 11;
+    }
+    if (info != 0) {
+        input_error_dist("ZGEMV ", &info);
+        return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 ||
+        (alpha->r == 0. && alpha->i == 0. &&
+         beta->r == 1. && beta->i == 0.)) {
+        return 0;
+    }
+
+/*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set   
+       up the start points in  X  and  Y. */
+
+    if (notran) {
+        lenx = *n;
+        leny = *m;
+    } else {
+        lenx = *m;
+        leny = *n;
+    }
+    if (*incx > 0) {
+        kx = 1;
+    } else {
+        kx = 1 - (lenx - 1) * *incx;
+    }
+    if (*incy > 0) {
+        ky = 1;
+    } else {
+        ky = 1 - (leny - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of A are   
+       accessed sequentially with one pass through A.   
+
+       First form  y := beta*y. */
+
+    if (beta->r != 1. || beta->i != 0.) {
+        if (*incy == 1) {
+            if (beta->r == 0. && beta->i == 0.) {
+                for (i = 1; i <= leny; ++i) {
+                    Y(i).r = 0., Y(i).i = 0.;
+                }
+            } else {
+                for (i = 1; i <= leny; ++i) {
+                    /* Both parts are computed before Y(i) is overwritten. */
+                    prod.r = beta->r * Y(i).r - beta->i * Y(i).i;
+                    prod.i = beta->r * Y(i).i + beta->i * Y(i).r;
+                    Y(i).r = prod.r, Y(i).i = prod.i;
+                }
+            }
+        } else {
+            iy = ky;
+            if (beta->r == 0. && beta->i == 0.) {
+                for (i = 1; i <= leny; ++i) {
+                    Y(iy).r = 0., Y(iy).i = 0.;
+                    iy += *incy;
+                }
+            } else {
+                for (i = 1; i <= leny; ++i) {
+                    prod.r = beta->r * Y(iy).r - beta->i * Y(iy).i;
+                    prod.i = beta->r * Y(iy).i + beta->i * Y(iy).r;
+                    Y(iy).r = prod.r, Y(iy).i = prod.i;
+                    iy += *incy;
+                }
+            }
+        }
+    }
+    if (alpha->r == 0. && alpha->i == 0.) {
+        return 0;
+    }
+    if (notran) {
+
+/*        Form  y := alpha*A*x + y. */
+
+        jx = kx;
+        if (*incy == 1) {
+            for (j = 1; j <= *n; ++j) {
+                /* A zero x(j) contributes nothing; skip column j. */
+                if (X(jx).r != 0. || X(jx).i != 0.) {
+                    temp.r = alpha->r * X(jx).r - alpha->i * X(jx).i;
+                    temp.i = alpha->r * X(jx).i + alpha->i * X(jx).r;
+                    for (i = 1; i <= *m; ++i) {
+                        prod.r = temp.r * A(i,j).r - temp.i * A(i,j).i;
+                        prod.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
+                        Y(i).r += prod.r;
+                        Y(i).i += prod.i;
+                    }
+                }
+                jx += *incx;
+            }
+        } else {
+            for (j = 1; j <= *n; ++j) {
+                if (X(jx).r != 0. || X(jx).i != 0.) {
+                    temp.r = alpha->r * X(jx).r - alpha->i * X(jx).i;
+                    temp.i = alpha->r * X(jx).i + alpha->i * X(jx).r;
+                    iy = ky;
+                    for (i = 1; i <= *m; ++i) {
+                        prod.r = temp.r * A(i,j).r - temp.i * A(i,j).i;
+                        prod.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
+                        Y(iy).r += prod.r;
+                        Y(iy).i += prod.i;
+                        iy += *incy;
+                    }
+                }
+                jx += *incx;
+            }
+        }
+    } else {
+
+/*        Form  y := alpha*A'*x + y  or  y := alpha*conjg( A' )*x + y. */
+
+        jy = ky;
+        if (*incx == 1) {
+            for (j = 1; j <= *n; ++j) {
+                /* temp accumulates the dot product of column j with x. */
+                temp.r = 0., temp.i = 0.;
+                if (noconj) {
+                    for (i = 1; i <= *m; ++i) {
+                        prod.r = A(i,j).r * X(i).r - A(i,j).i * X(i).i;
+                        prod.i = A(i,j).r * X(i).i + A(i,j).i * X(i).r;
+                        temp.r += prod.r;
+                        temp.i += prod.i;
+                    }
+                } else {
+                    for (i = 1; i <= *m; ++i) {
+                        d_cnjg(&cj, &A(i,j));
+                        prod.r = cj.r * X(i).r - cj.i * X(i).i;
+                        prod.i = cj.r * X(i).i + cj.i * X(i).r;
+                        temp.r += prod.r;
+                        temp.i += prod.i;
+                    }
+                }
+                prod.r = alpha->r * temp.r - alpha->i * temp.i;
+                prod.i = alpha->r * temp.i + alpha->i * temp.r;
+                Y(jy).r += prod.r;
+                Y(jy).i += prod.i;
+                jy += *incy;
+            }
+        } else {
+            for (j = 1; j <= *n; ++j) {
+                temp.r = 0., temp.i = 0.;
+                ix = kx;
+                if (noconj) {
+                    for (i = 1; i <= *m; ++i) {
+                        prod.r = A(i,j).r * X(ix).r - A(i,j).i * X(ix).i;
+                        prod.i = A(i,j).r * X(ix).i + A(i,j).i * X(ix).r;
+                        temp.r += prod.r;
+                        temp.i += prod.i;
+                        ix += *incx;
+                    }
+                } else {
+                    for (i = 1; i <= *m; ++i) {
+                        d_cnjg(&cj, &A(i,j));
+                        prod.r = cj.r * X(ix).r - cj.i * X(ix).i;
+                        prod.i = cj.r * X(ix).i + cj.i * X(ix).r;
+                        temp.r += prod.r;
+                        temp.i += prod.i;
+                        ix += *incx;
+                    }
+                }
+                prod.r = alpha->r * temp.r - alpha->i * temp.i;
+                prod.i = alpha->r * temp.i + alpha->i * temp.r;
+                Y(jy).r += prod.r;
+                Y(jy).i += prod.i;
+                jy += *incy;
+            }
+        }
+    }
+
+    return 0;
+
+/*     End of ZGEMV . */
+
+} /* zgemv_ */
+
diff --git a/CBLAS/zgerc.c b/CBLAS/zgerc.c
new file mode 100644
index 0000000..954b2c6
--- /dev/null
+++ b/CBLAS/zgerc.c
@@ -0,0 +1,206 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int zgerc_(integer *m, integer *n, doublecomplex *alpha, 
+	doublecomplex *x, integer *incx, doublecomplex *y, integer *incy, 
+	doublecomplex *a, integer *lda)
+{
+    /* Builtin functions */
+    void d_cnjg(doublecomplex *, doublecomplex *);
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+    /* Local variables.  All scratch storage is automatic, so the routine
+       keeps no state between calls. */
+    integer info;
+    doublecomplex temp, prod, cj;
+    integer i, j, ix, jy, kx;
+
+
+/*  Purpose   
+    =======   
+
+    ZGERC  performs the rank 1 operation   
+
+       A := alpha*x*conjg( y' ) + A,   
+
+    where alpha is a scalar, x is an m element vector, y is an n element   
+    vector and A is an m by n matrix.   
+
+    Parameters   
+    ==========   
+
+    M      - INTEGER.   
+             On entry, M specifies the number of rows of the matrix A.   
+             M must be at least zero.   
+             Unchanged on exit.   
+
+    N      - INTEGER.   
+             On entry, N specifies the number of columns of the matrix A.   
+             N must be at least zero.   
+             Unchanged on exit.   
+
+    ALPHA  - COMPLEX*16      .   
+             On entry, ALPHA specifies the scalar alpha.   
+             Unchanged on exit.   
+
+    X      - COMPLEX*16       array of dimension at least   
+             ( 1 + ( m - 1 )*abs( INCX ) ).   
+             Before entry, the incremented array X must contain the m   
+             element vector x.   
+             Unchanged on exit.   
+
+    INCX   - INTEGER.   
+             On entry, INCX specifies the increment for the elements of   
+             X. INCX must not be zero.   
+             Unchanged on exit.   
+
+    Y      - COMPLEX*16       array of dimension at least   
+             ( 1 + ( n - 1 )*abs( INCY ) ).   
+             Before entry, the incremented array Y must contain the n   
+             element vector y.   
+             Unchanged on exit.   
+
+    INCY   - INTEGER.   
+             On entry, INCY specifies the increment for the elements of   
+             Y. INCY must not be zero.   
+             Unchanged on exit.   
+
+    A      - COMPLEX*16       array of DIMENSION ( LDA, n ).   
+             Before entry, the leading m by n part of the array A must   
+             contain the matrix of coefficients. On exit, A is   
+             overwritten by the updated matrix.   
+
+    LDA    - INTEGER.   
+             On entry, LDA specifies the first dimension of A as declared   
+             in the calling (sub) program. LDA must be at least   
+             max( 1, m ).   
+             Unchanged on exit.   
+
+    Return value: always 0.  An invalid argument is reported through   
+    input_error_dist() with the 1-based position of the offending   
+    argument before returning.   
+
+
+    Level 2 Blas routine.   
+
+    -- Written on 22-October-1986.   
+       Jack Dongarra, Argonne National Lab.   
+       Jeremy Du Croz, Nag Central Office.   
+       Sven Hammarling, Nag Central Office.   
+       Richard Hanson, Sandia National Labs.   
+*/
+
+/*     1-based (Fortran style) accessors; A is stored column-major with   
+       leading dimension *lda. */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+/*     Test the input parameters. */
+
+    info = 0;
+    if (*m < 0) {
+        info = 1;
+    } else if (*n < 0) {
+        info = 2;
+    } else if (*incx == 0) {
+        info = 5;
+    } else if (*incy == 0) {
+        info = 7;
+    } else if (*lda < max(1,*m)) {
+        info = 9;
+    }
+    if (info != 0) {
+        input_error_dist("ZGERC ", &info);
+        return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 || (alpha->r == 0. && alpha->i == 0.)) {
+        return 0;
+    }
+
+/*     Start the operations. In this version the elements of A are   
+       accessed sequentially with one pass through A. */
+
+    if (*incy > 0) {
+        jy = 1;
+    } else {
+        jy = 1 - (*n - 1) * *incy;
+    }
+    if (*incx == 1) {
+        for (j = 1; j <= *n; ++j) {
+            /* A zero y(j) leaves column j unchanged; skip it. */
+            if (Y(jy).r != 0. || Y(jy).i != 0.) {
+                /* temp = alpha * conjg( y(j) ) */
+                d_cnjg(&cj, &Y(jy));
+                temp.r = alpha->r * cj.r - alpha->i * cj.i;
+                temp.i = alpha->r * cj.i + alpha->i * cj.r;
+                for (i = 1; i <= *m; ++i) {
+                    prod.r = X(i).r * temp.r - X(i).i * temp.i;
+                    prod.i = X(i).r * temp.i + X(i).i * temp.r;
+                    A(i,j).r += prod.r;
+                    A(i,j).i += prod.i;
+                }
+            }
+            jy += *incy;
+        }
+    } else {
+        if (*incx > 0) {
+            kx = 1;
+        } else {
+            kx = 1 - (*m - 1) * *incx;
+        }
+        for (j = 1; j <= *n; ++j) {
+            if (Y(jy).r != 0. || Y(jy).i != 0.) {
+                d_cnjg(&cj, &Y(jy));
+                temp.r = alpha->r * cj.r - alpha->i * cj.i;
+                temp.i = alpha->r * cj.i + alpha->i * cj.r;
+                ix = kx;
+                for (i = 1; i <= *m; ++i) {
+                    prod.r = X(ix).r * temp.r - X(ix).i * temp.i;
+                    prod.i = X(ix).r * temp.i + X(ix).i * temp.r;
+                    A(i,j).r += prod.r;
+                    A(i,j).i += prod.i;
+                    ix += *incx;
+                }
+            }
+            jy += *incy;
+        }
+    }
+
+    return 0;
+
+/*     End of ZGERC . */
+
+} /* zgerc_ */
+
diff --git a/CBLAS/zgeru.c b/CBLAS/zgeru.c
new file mode 100644
index 0000000..1242fa9
--- /dev/null
+++ b/CBLAS/zgeru.c
@@ -0,0 +1,203 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/* Subroutine */ int zgeru_(integer *m, integer *n, doublecomplex *alpha, 
+	doublecomplex *x, integer *incx, doublecomplex *y, integer *incy, 
+	doublecomplex *a, integer *lda)
+{
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+    /* Local variables.  All scratch storage is automatic, so the routine
+       keeps no state between calls. */
+    integer info;
+    doublecomplex temp, prod;
+    integer i, j, ix, jy, kx;
+
+
+/*  Purpose   
+    =======   
+
+    ZGERU  performs the rank 1 operation   
+
+       A := alpha*x*y' + A,   
+
+    where alpha is a scalar, x is an m element vector, y is an n element   
+    vector and A is an m by n matrix.   
+
+    Parameters   
+    ==========   
+
+    M      - INTEGER.   
+             On entry, M specifies the number of rows of the matrix A.   
+             M must be at least zero.   
+             Unchanged on exit.   
+
+    N      - INTEGER.   
+             On entry, N specifies the number of columns of the matrix A.   
+             N must be at least zero.   
+             Unchanged on exit.   
+
+    ALPHA  - COMPLEX*16      .   
+             On entry, ALPHA specifies the scalar alpha.   
+             Unchanged on exit.   
+
+    X      - COMPLEX*16       array of dimension at least   
+             ( 1 + ( m - 1 )*abs( INCX ) ).   
+             Before entry, the incremented array X must contain the m   
+             element vector x.   
+             Unchanged on exit.   
+
+    INCX   - INTEGER.   
+             On entry, INCX specifies the increment for the elements of   
+             X. INCX must not be zero.   
+             Unchanged on exit.   
+
+    Y      - COMPLEX*16       array of dimension at least   
+             ( 1 + ( n - 1 )*abs( INCY ) ).   
+             Before entry, the incremented array Y must contain the n   
+             element vector y.   
+             Unchanged on exit.   
+
+    INCY   - INTEGER.   
+             On entry, INCY specifies the increment for the elements of   
+             Y. INCY must not be zero.   
+             Unchanged on exit.   
+
+    A      - COMPLEX*16       array of DIMENSION ( LDA, n ).   
+             Before entry, the leading m by n part of the array A must   
+             contain the matrix of coefficients. On exit, A is   
+             overwritten by the updated matrix.   
+
+    LDA    - INTEGER.   
+             On entry, LDA specifies the first dimension of A as declared   
+             in the calling (sub) program. LDA must be at least   
+             max( 1, m ).   
+             Unchanged on exit.   
+
+    Return value: always 0.  An invalid argument is reported through   
+    input_error_dist() with the 1-based position of the offending   
+    argument before returning.   
+
+
+    Level 2 Blas routine.   
+
+    -- Written on 22-October-1986.   
+       Jack Dongarra, Argonne National Lab.   
+       Jeremy Du Croz, Nag Central Office.   
+       Sven Hammarling, Nag Central Office.   
+       Richard Hanson, Sandia National Labs.   
+*/
+
+/*     1-based (Fortran style) accessors; A is stored column-major with   
+       leading dimension *lda. */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+/*     Test the input parameters. */
+
+    info = 0;
+    if (*m < 0) {
+        info = 1;
+    } else if (*n < 0) {
+        info = 2;
+    } else if (*incx == 0) {
+        info = 5;
+    } else if (*incy == 0) {
+        info = 7;
+    } else if (*lda < max(1,*m)) {
+        info = 9;
+    }
+    if (info != 0) {
+        input_error_dist("ZGERU ", &info);
+        return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 || (alpha->r == 0. && alpha->i == 0.)) {
+        return 0;
+    }
+
+/*     Start the operations. In this version the elements of A are   
+       accessed sequentially with one pass through A. */
+
+    if (*incy > 0) {
+        jy = 1;
+    } else {
+        jy = 1 - (*n - 1) * *incy;
+    }
+    if (*incx == 1) {
+        for (j = 1; j <= *n; ++j) {
+            /* A zero y(j) leaves column j unchanged; skip it. */
+            if (Y(jy).r != 0. || Y(jy).i != 0.) {
+                /* temp = alpha * y(j) */
+                temp.r = alpha->r * Y(jy).r - alpha->i * Y(jy).i;
+                temp.i = alpha->r * Y(jy).i + alpha->i * Y(jy).r;
+                for (i = 1; i <= *m; ++i) {
+                    prod.r = X(i).r * temp.r - X(i).i * temp.i;
+                    prod.i = X(i).r * temp.i + X(i).i * temp.r;
+                    A(i,j).r += prod.r;
+                    A(i,j).i += prod.i;
+                }
+            }
+            jy += *incy;
+        }
+    } else {
+        if (*incx > 0) {
+            kx = 1;
+        } else {
+            kx = 1 - (*m - 1) * *incx;
+        }
+        for (j = 1; j <= *n; ++j) {
+            if (Y(jy).r != 0. || Y(jy).i != 0.) {
+                temp.r = alpha->r * Y(jy).r - alpha->i * Y(jy).i;
+                temp.i = alpha->r * Y(jy).i + alpha->i * Y(jy).r;
+                ix = kx;
+                for (i = 1; i <= *m; ++i) {
+                    prod.r = X(ix).r * temp.r - X(ix).i * temp.i;
+                    prod.i = X(ix).r * temp.i + X(ix).i * temp.r;
+                    A(i,j).r += prod.r;
+                    A(i,j).i += prod.i;
+                    ix += *incx;
+                }
+            }
+            jy += *incy;
+        }
+    }
+
+    return 0;
+
+/*     End of ZGERU . */
+
+} /* zgeru_ */
+
diff --git a/CBLAS/zhemv.c b/CBLAS/zhemv.c
new file mode 100644
index 0000000..ccd5921
--- /dev/null
+++ b/CBLAS/zhemv.c
@@ -0,0 +1,420 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int zhemv_(char *uplo, integer *n, doublecomplex *alpha, 
+	doublecomplex *a, integer *lda, doublecomplex *x, integer *incx, 
+	doublecomplex *beta, doublecomplex *y, integer *incy)
+{
+    /* Builtin functions */
+    void d_cnjg(doublecomplex *, doublecomplex *);
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+    /* Local variables.  All scratch storage is automatic, so the routine
+       keeps no state between calls. */
+    integer info;
+    doublereal diag;
+    doublecomplex temp1, temp2, prod, sum, cj;
+    integer i, j;
+    integer ix, iy, jx, jy, kx, ky;
+    logical upper;
+
+
+/*  Purpose   
+    =======   
+
+    ZHEMV  performs the matrix-vector  operation   
+
+       y := alpha*A*x + beta*y,   
+
+    where alpha and beta are scalars, x and y are n element vectors and   
+    A is an n by n hermitian matrix.   
+
+    Parameters   
+    ==========   
+
+    UPLO   - CHARACTER*1.   
+             On entry, UPLO specifies whether the upper or lower   
+             triangular part of the array A is to be referenced as   
+             follows:   
+
+                UPLO = 'U' or 'u'   Only the upper triangular part of A   
+                                    is to be referenced.   
+
+                UPLO = 'L' or 'l'   Only the lower triangular part of A   
+                                    is to be referenced.   
+
+             Only the first character is examined.   
+             Unchanged on exit.   
+
+    N      - INTEGER.   
+             On entry, N specifies the order of the matrix A.   
+             N must be at least zero.   
+             Unchanged on exit.   
+
+    ALPHA  - COMPLEX*16      .   
+             On entry, ALPHA specifies the scalar alpha.   
+             Unchanged on exit.   
+
+    A      - COMPLEX*16       array of DIMENSION ( LDA, n ).   
+             Before entry with  UPLO = 'U' or 'u', the leading n by n   
+             upper triangular part of the array A must contain the upper   
+             triangular part of the hermitian matrix and the strictly   
+             lower triangular part of A is not referenced.   
+             Before entry with UPLO = 'L' or 'l', the leading n by n   
+             lower triangular part of the array A must contain the lower   
+             triangular part of the hermitian matrix and the strictly   
+             upper triangular part of A is not referenced.   
+             Note that the imaginary parts of the diagonal elements need   
+             not be set and are assumed to be zero.   
+             Unchanged on exit.   
+
+    LDA    - INTEGER.   
+             On entry, LDA specifies the first dimension of A as declared   
+             in the calling (sub) program. LDA must be at least   
+             max( 1, n ).   
+             Unchanged on exit.   
+
+    X      - COMPLEX*16       array of dimension at least   
+             ( 1 + ( n - 1 )*abs( INCX ) ).   
+             Before entry, the incremented array X must contain the n   
+             element vector x.   
+             Unchanged on exit.   
+
+    INCX   - INTEGER.   
+             On entry, INCX specifies the increment for the elements of   
+             X. INCX must not be zero.   
+             Unchanged on exit.   
+
+    BETA   - COMPLEX*16      .   
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then Y need not be set on input.   
+             Unchanged on exit.   
+
+    Y      - COMPLEX*16       array of dimension at least   
+             ( 1 + ( n - 1 )*abs( INCY ) ).   
+             Before entry, the incremented array Y must contain the n   
+             element vector y. On exit, Y is overwritten by the updated   
+             vector y.   
+
+    INCY   - INTEGER.   
+             On entry, INCY specifies the increment for the elements of   
+             Y. INCY must not be zero.   
+             Unchanged on exit.   
+
+    Return value: always 0.  An invalid argument is reported through   
+    input_error_dist() with the 1-based position of the offending   
+    argument before returning.   
+
+
+    Level 2 Blas routine.   
+
+    -- Written on 22-October-1986.   
+       Jack Dongarra, Argonne National Lab.   
+       Jeremy Du Croz, Nag Central Office.   
+       Sven Hammarling, Nag Central Office.   
+       Richard Hanson, Sandia National Labs.   
+*/
+
+/*     1-based (Fortran style) accessors; A is stored column-major with   
+       leading dimension *lda. */
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+/*     Decode UPLO once, accepting either letter case as documented. */
+
+    upper = (*uplo == 'U' || *uplo == 'u');
+
+/*     Test the input parameters. */
+
+    info = 0;
+    if (!upper && *uplo != 'L' && *uplo != 'l') {
+        info = 1;
+    } else if (*n < 0) {
+        info = 2;
+    } else if (*lda < max(1,*n)) {
+        info = 5;
+    } else if (*incx == 0) {
+        info = 7;
+    } else if (*incy == 0) {
+        info = 10;
+    }
+    if (info != 0) {
+        input_error_dist("ZHEMV ", &info);
+        return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 ||
+        (alpha->r == 0. && alpha->i == 0. &&
+         beta->r == 1. && beta->i == 0.)) {
+        return 0;
+    }
+
+/*     Set up the start points in  X  and  Y. */
+
+    if (*incx > 0) {
+        kx = 1;
+    } else {
+        kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+        ky = 1;
+    } else {
+        ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of A are   
+       accessed sequentially with one pass through the triangular part   
+       of A.   
+
+       First form  y := beta*y. */
+
+    if (beta->r != 1. || beta->i != 0.) {
+        if (*incy == 1) {
+            if (beta->r == 0. && beta->i == 0.) {
+                for (i = 1; i <= *n; ++i) {
+                    Y(i).r = 0., Y(i).i = 0.;
+                }
+            } else {
+                for (i = 1; i <= *n; ++i) {
+                    /* Both parts are computed before Y(i) is overwritten. */
+                    prod.r = beta->r * Y(i).r - beta->i * Y(i).i;
+                    prod.i = beta->r * Y(i).i + beta->i * Y(i).r;
+                    Y(i).r = prod.r, Y(i).i = prod.i;
+                }
+            }
+        } else {
+            iy = ky;
+            if (beta->r == 0. && beta->i == 0.) {
+                for (i = 1; i <= *n; ++i) {
+                    Y(iy).r = 0., Y(iy).i = 0.;
+                    iy += *incy;
+                }
+            } else {
+                for (i = 1; i <= *n; ++i) {
+                    prod.r = beta->r * Y(iy).r - beta->i * Y(iy).i;
+                    prod.i = beta->r * Y(iy).i + beta->i * Y(iy).r;
+                    Y(iy).r = prod.r, Y(iy).i = prod.i;
+                    iy += *incy;
+                }
+            }
+        }
+    }
+    if (alpha->r == 0. && alpha->i == 0.) {
+        return 0;
+    }
+    if (upper) {
+
+/*        Form  y  when A is stored in upper triangle.   
+          For column j: temp1 = alpha*x(j) is scattered into y(1:j-1),   
+          while temp2 gathers conjg( A(1:j-1,j) )' * x(1:j-1), the   
+          contribution of the unstored lower triangle to y(j). */
+
+        if (*incx == 1 && *incy == 1) {
+            for (j = 1; j <= *n; ++j) {
+                temp1.r = alpha->r * X(j).r - alpha->i * X(j).i;
+                temp1.i = alpha->r * X(j).i + alpha->i * X(j).r;
+                temp2.r = 0., temp2.i = 0.;
+                for (i = 1; i <= j-1; ++i) {
+                    prod.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i;
+                    prod.i = temp1.r * A(i,j).i + temp1.i * A(i,j).r;
+                    Y(i).r += prod.r;
+                    Y(i).i += prod.i;
+                    d_cnjg(&cj, &A(i,j));
+                    prod.r = cj.r * X(i).r - cj.i * X(i).i;
+                    prod.i = cj.r * X(i).i + cj.i * X(i).r;
+                    temp2.r += prod.r;
+                    temp2.i += prod.i;
+                }
+                /* Only the real part of the diagonal is used. */
+                diag = A(j,j).r;
+                prod.r = diag * temp1.r, prod.i = diag * temp1.i;
+                sum.r = Y(j).r + prod.r, sum.i = Y(j).i + prod.i;
+                prod.r = alpha->r * temp2.r - alpha->i * temp2.i;
+                prod.i = alpha->r * temp2.i + alpha->i * temp2.r;
+                Y(j).r = sum.r + prod.r, Y(j).i = sum.i + prod.i;
+            }
+        } else {
+            jx = kx;
+            jy = ky;
+            for (j = 1; j <= *n; ++j) {
+                temp1.r = alpha->r * X(jx).r - alpha->i * X(jx).i;
+                temp1.i = alpha->r * X(jx).i + alpha->i * X(jx).r;
+                temp2.r = 0., temp2.i = 0.;
+                ix = kx;
+                iy = ky;
+                for (i = 1; i <= j-1; ++i) {
+                    prod.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i;
+                    prod.i = temp1.r * A(i,j).i + temp1.i * A(i,j).r;
+                    Y(iy).r += prod.r;
+                    Y(iy).i += prod.i;
+                    d_cnjg(&cj, &A(i,j));
+                    prod.r = cj.r * X(ix).r - cj.i * X(ix).i;
+                    prod.i = cj.r * X(ix).i + cj.i * X(ix).r;
+                    temp2.r += prod.r;
+                    temp2.i += prod.i;
+                    ix += *incx;
+                    iy += *incy;
+                }
+                diag = A(j,j).r;
+                prod.r = diag * temp1.r, prod.i = diag * temp1.i;
+                sum.r = Y(jy).r + prod.r, sum.i = Y(jy).i + prod.i;
+                prod.r = alpha->r * temp2.r - alpha->i * temp2.i;
+                prod.i = alpha->r * temp2.i + alpha->i * temp2.r;
+                Y(jy).r = sum.r + prod.r, Y(jy).i = sum.i + prod.i;
+                jx += *incx;
+                jy += *incy;
+            }
+        }
+    } else {
+
+/*        Form  y  when A is stored in lower triangle.   
+          The diagonal term is added first, then rows j+1..n of column j   
+          are processed, and finally alpha*temp2 is added to y(j). */
+
+        if (*incx == 1 && *incy == 1) {
+            for (j = 1; j <= *n; ++j) {
+                temp1.r = alpha->r * X(j).r - alpha->i * X(j).i;
+                temp1.i = alpha->r * X(j).i + alpha->i * X(j).r;
+                temp2.r = 0., temp2.i = 0.;
+                /* Only the real part of the diagonal is used. */
+                diag = A(j,j).r;
+                prod.r = diag * temp1.r, prod.i = diag * temp1.i;
+                Y(j).r += prod.r;
+                Y(j).i += prod.i;
+                for (i = j + 1; i <= *n; ++i) {
+                    prod.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i;
+                    prod.i = temp1.r * A(i,j).i + temp1.i * A(i,j).r;
+                    Y(i).r += prod.r;
+                    Y(i).i += prod.i;
+                    d_cnjg(&cj, &A(i,j));
+                    prod.r = cj.r * X(i).r - cj.i * X(i).i;
+                    prod.i = cj.r * X(i).i + cj.i * X(i).r;
+                    temp2.r += prod.r;
+                    temp2.i += prod.i;
+                }
+                prod.r = alpha->r * temp2.r - alpha->i * temp2.i;
+                prod.i = alpha->r * temp2.i + alpha->i * temp2.r;
+                Y(j).r += prod.r;
+                Y(j).i += prod.i;
+            }
+        } else {
+            jx = kx;
+            jy = ky;
+            for (j = 1; j <= *n; ++j) {
+                temp1.r = alpha->r * X(jx).r - alpha->i * X(jx).i;
+                temp1.i = alpha->r * X(jx).i + alpha->i * X(jx).r;
+                temp2.r = 0., temp2.i = 0.;
+                diag = A(j,j).r;
+                prod.r = diag * temp1.r, prod.i = diag * temp1.i;
+                Y(jy).r += prod.r;
+                Y(jy).i += prod.i;
+                ix = jx;
+                iy = jy;
+                for (i = j + 1; i <= *n; ++i) {
+                    /* Advance first: ix/iy start at the diagonal entry. */
+                    ix += *incx;
+                    iy += *incy;
+                    prod.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i;
+                    prod.i = temp1.r * A(i,j).i + temp1.i * A(i,j).r;
+                    Y(iy).r += prod.r;
+                    Y(iy).i += prod.i;
+                    d_cnjg(&cj, &A(i,j));
+                    prod.r = cj.r * X(ix).r - cj.i * X(ix).i;
+                    prod.i = cj.r * X(ix).i + cj.i * X(ix).r;
+                    temp2.r += prod.r;
+                    temp2.i += prod.i;
+                }
+                prod.r = alpha->r * temp2.r - alpha->i * temp2.i;
+                prod.i = alpha->r * temp2.i + alpha->i * temp2.r;
+                Y(jy).r += prod.r;
+                Y(jy).i += prod.i;
+                jx += *incx;
+                jy += *incy;
+            }
+        }
+    }
+
+    return 0;
+
+/*     End of ZHEMV . */
+
+} /* zhemv_ */
+
diff --git a/CBLAS/zher2.c b/CBLAS/zher2.c
new file mode 100644
index 0000000..30621ed
--- /dev/null
+++ b/CBLAS/zher2.c
@@ -0,0 +1,309 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/*  Purpose
+    =======
+
+    ZHER2  performs the hermitian rank 2 operation
+
+       A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A,
+
+    where alpha is a scalar, x and y are n element vectors and A is an n
+    by n hermitian matrix.
+
+    Parameters
+    ==========
+
+    UPLO   - CHARACTER*1.
+             On entry, UPLO specifies whether the upper or lower
+             triangular part of the array A is to be referenced as
+             follows:
+
+                UPLO = 'U' or 'u'   Only the upper triangular part of A
+                                    is to be referenced.
+
+                UPLO = 'L' or 'l'   Only the lower triangular part of A
+                                    is to be referenced.
+
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the order of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    ALPHA  - COMPLEX*16      .
+             On entry, ALPHA specifies the scalar alpha.
+             Unchanged on exit.
+
+    X      - COMPLEX*16       array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the n
+             element vector x.
+             Unchanged on exit.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    Y      - COMPLEX*16       array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCY ) ).
+             Before entry, the incremented array Y must contain the n
+             element vector y.
+             Unchanged on exit.
+
+    INCY   - INTEGER.
+             On entry, INCY specifies the increment for the elements of
+             Y. INCY must not be zero.
+             Unchanged on exit.
+
+    A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+             Before entry with  UPLO = 'U' or 'u', the leading n by n
+             upper triangular part of the array A must contain the upper
+             triangular part of the hermitian matrix and the strictly
+             lower triangular part of A is not referenced. On exit, the
+             upper triangular part of the array A is overwritten by the
+             upper triangular part of the updated matrix.
+             Before entry with UPLO = 'L' or 'l', the leading n by n
+             lower triangular part of the array A must contain the lower
+             triangular part of the hermitian matrix and the strictly
+             upper triangular part of A is not referenced. On exit, the
+             lower triangular part of the array A is overwritten by the
+             lower triangular part of the updated matrix.
+             Note that the imaginary parts of the diagonal elements need
+             not be set, they are assumed to be zero, and on exit they
+             are set to zero.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+             in the calling (sub) program. LDA must be at least
+             max( 1, n ).
+             Unchanged on exit.
+
+    Return value
+    ============
+
+    Always 0.  If an argument is invalid, input_error_dist("ZHER2 ", &info)
+    is called with info set to the position of the first offending
+    argument (1 = UPLO, 2 = N, 5 = INCX, 7 = INCY, 9 = LDA) and A is left
+    untouched.
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+*/
+
+/* Subroutine */ int zher2_(char *uplo, integer *n, doublecomplex *alpha,
+	doublecomplex *x, integer *incx, doublecomplex *y, integer *incy,
+	doublecomplex *a, integer *lda)
+{
+    /* Builtin functions */
+    void d_cnjg(doublecomplex *, doublecomplex *);
+
+    /* Local variables.  They are automatic rather than static so that
+       concurrent calls from different threads do not share state. */
+    integer info;
+    doublecomplex temp1, temp2, z__2, z__3, z__4;
+    doublereal d__1;
+    integer i, j;
+    integer ix, iy, jx, jy, kx, ky;
+    int upper;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+#define X(I) x[(I)-1]
+#define Y(I) y[(I)-1]
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+/*     Test the input parameters.  UPLO is matched case-insensitively, as
+       documented above. */
+
+    upper = (*uplo == 'U' || *uplo == 'u');
+
+    info = 0;
+    if (! upper && *uplo != 'L' && *uplo != 'l') {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*incx == 0) {
+	info = 5;
+    } else if (*incy == 0) {
+	info = 7;
+    } else if (*lda < max(1,*n)) {
+	info = 9;
+    }
+    if (info != 0) {
+	input_error_dist("ZHER2 ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || (alpha->r == 0. && alpha->i == 0.)) {
+	return 0;
+    }
+
+/*     Set up the start points in X and Y.  For a negative increment the
+       vector is traversed backwards, so element 1 sits at the far end.
+       (With unit increments kx = ky = 1 and the values are not used.) */
+
+    kx = (*incx > 0) ? 1 : 1 - (*n - 1) * *incx;
+    ky = (*incy > 0) ? 1 : 1 - (*n - 1) * *incy;
+    jx = kx;
+    jy = ky;
+
+/*     Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through the triangular part
+       of A.  For each column j,
+          temp1 = alpha*conjg( y(j) ),   temp2 = conjg( alpha*x(j) ),
+       and every referenced A(i,j) receives x(i)*temp1 + y(i)*temp2.
+       Only the real part of that sum is added to the diagonal, whose
+       imaginary part is always reset to zero. */
+
+    if (upper) {
+
+/*        Form  A  when A is stored in the upper triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    for (j = 1; j <= *n; ++j) {
+		if (X(j).r != 0. || X(j).i != 0. || Y(j).r != 0. ||
+			Y(j).i != 0.) {
+		    d_cnjg(&z__2, &Y(j));
+		    temp1.r = alpha->r * z__2.r - alpha->i * z__2.i;
+		    temp1.i = alpha->r * z__2.i + alpha->i * z__2.r;
+		    z__2.r = alpha->r * X(j).r - alpha->i * X(j).i;
+		    z__2.i = alpha->r * X(j).i + alpha->i * X(j).r;
+		    d_cnjg(&temp2, &z__2);
+		    for (i = 1; i <= j-1; ++i) {
+			z__3.r = X(i).r * temp1.r - X(i).i * temp1.i;
+			z__3.i = X(i).r * temp1.i + X(i).i * temp1.r;
+			z__4.r = Y(i).r * temp2.r - Y(i).i * temp2.i;
+			z__4.i = Y(i).r * temp2.i + Y(i).i * temp2.r;
+			A(i,j).r = (A(i,j).r + z__3.r) + z__4.r;
+			A(i,j).i = (A(i,j).i + z__3.i) + z__4.i;
+		    }
+		    z__2.r = X(j).r * temp1.r - X(j).i * temp1.i;
+		    z__3.r = Y(j).r * temp2.r - Y(j).i * temp2.i;
+		    d__1 = A(j,j).r + (z__2.r + z__3.r);
+		    A(j,j).r = d__1;
+		    A(j,j).i = 0.;
+		} else {
+		    A(j,j).i = 0.;
+		}
+	    }
+	} else {
+	    for (j = 1; j <= *n; ++j) {
+		if (X(jx).r != 0. || X(jx).i != 0. || Y(jy).r != 0. ||
+			Y(jy).i != 0.) {
+		    d_cnjg(&z__2, &Y(jy));
+		    temp1.r = alpha->r * z__2.r - alpha->i * z__2.i;
+		    temp1.i = alpha->r * z__2.i + alpha->i * z__2.r;
+		    z__2.r = alpha->r * X(jx).r - alpha->i * X(jx).i;
+		    z__2.i = alpha->r * X(jx).i + alpha->i * X(jx).r;
+		    d_cnjg(&temp2, &z__2);
+		    ix = kx;
+		    iy = ky;
+		    for (i = 1; i <= j-1; ++i) {
+			z__3.r = X(ix).r * temp1.r - X(ix).i * temp1.i;
+			z__3.i = X(ix).r * temp1.i + X(ix).i * temp1.r;
+			z__4.r = Y(iy).r * temp2.r - Y(iy).i * temp2.i;
+			z__4.i = Y(iy).r * temp2.i + Y(iy).i * temp2.r;
+			A(i,j).r = (A(i,j).r + z__3.r) + z__4.r;
+			A(i,j).i = (A(i,j).i + z__3.i) + z__4.i;
+			ix += *incx;
+			iy += *incy;
+		    }
+		    z__2.r = X(jx).r * temp1.r - X(jx).i * temp1.i;
+		    z__3.r = Y(jy).r * temp2.r - Y(jy).i * temp2.i;
+		    d__1 = A(j,j).r + (z__2.r + z__3.r);
+		    A(j,j).r = d__1;
+		    A(j,j).i = 0.;
+		} else {
+		    A(j,j).i = 0.;
+		}
+		jx += *incx;
+		jy += *incy;
+	    }
+	}
+    } else {
+
+/*        Form  A  when A is stored in the lower triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    for (j = 1; j <= *n; ++j) {
+		if (X(j).r != 0. || X(j).i != 0. || Y(j).r != 0. ||
+			Y(j).i != 0.) {
+		    d_cnjg(&z__2, &Y(j));
+		    temp1.r = alpha->r * z__2.r - alpha->i * z__2.i;
+		    temp1.i = alpha->r * z__2.i + alpha->i * z__2.r;
+		    z__2.r = alpha->r * X(j).r - alpha->i * X(j).i;
+		    z__2.i = alpha->r * X(j).i + alpha->i * X(j).r;
+		    d_cnjg(&temp2, &z__2);
+		    z__2.r = X(j).r * temp1.r - X(j).i * temp1.i;
+		    z__3.r = Y(j).r * temp2.r - Y(j).i * temp2.i;
+		    d__1 = A(j,j).r + (z__2.r + z__3.r);
+		    A(j,j).r = d__1;
+		    A(j,j).i = 0.;
+		    for (i = j + 1; i <= *n; ++i) {
+			z__3.r = X(i).r * temp1.r - X(i).i * temp1.i;
+			z__3.i = X(i).r * temp1.i + X(i).i * temp1.r;
+			z__4.r = Y(i).r * temp2.r - Y(i).i * temp2.i;
+			z__4.i = Y(i).r * temp2.i + Y(i).i * temp2.r;
+			A(i,j).r = (A(i,j).r + z__3.r) + z__4.r;
+			A(i,j).i = (A(i,j).i + z__3.i) + z__4.i;
+		    }
+		} else {
+		    A(j,j).i = 0.;
+		}
+	    }
+	} else {
+	    for (j = 1; j <= *n; ++j) {
+		if (X(jx).r != 0. || X(jx).i != 0. || Y(jy).r != 0. ||
+			Y(jy).i != 0.) {
+		    d_cnjg(&z__2, &Y(jy));
+		    temp1.r = alpha->r * z__2.r - alpha->i * z__2.i;
+		    temp1.i = alpha->r * z__2.i + alpha->i * z__2.r;
+		    z__2.r = alpha->r * X(jx).r - alpha->i * X(jx).i;
+		    z__2.i = alpha->r * X(jx).i + alpha->i * X(jx).r;
+		    d_cnjg(&temp2, &z__2);
+		    z__2.r = X(jx).r * temp1.r - X(jx).i * temp1.i;
+		    z__3.r = Y(jy).r * temp2.r - Y(jy).i * temp2.i;
+		    d__1 = A(j,j).r + (z__2.r + z__3.r);
+		    A(j,j).r = d__1;
+		    A(j,j).i = 0.;
+		    ix = jx;
+		    iy = jy;
+		    for (i = j + 1; i <= *n; ++i) {
+			ix += *incx;
+			iy += *incy;
+			z__3.r = X(ix).r * temp1.r - X(ix).i * temp1.i;
+			z__3.i = X(ix).r * temp1.i + X(ix).i * temp1.r;
+			z__4.r = Y(iy).r * temp2.r - Y(iy).i * temp2.i;
+			z__4.i = Y(iy).r * temp2.i + Y(iy).i * temp2.r;
+			A(i,j).r = (A(i,j).r + z__3.r) + z__4.r;
+			A(i,j).i = (A(i,j).i + z__3.i) + z__4.i;
+		    }
+		} else {
+		    A(j,j).i = 0.;
+		}
+		jx += *incx;
+		jy += *incy;
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of ZHER2 . */
+
+} /* zher2_ */
+
diff --git a/CBLAS/zscal.c b/CBLAS/zscal.c
new file mode 100644
index 0000000..b3d88bb
--- /dev/null
+++ b/CBLAS/zscal.c
@@ -0,0 +1,66 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+#include "f2c.h"
+
+/*  Purpose
+    =======
+
+    ZSCAL scales a vector by a constant:
+
+       zx( i ) := za*zx( i ),   i = 1, ..., n,
+
+    where consecutive elements of the vector are INCX entries apart in
+    the array ZX.  Nothing is done if n <= 0 or incx <= 0.
+    The function always returns 0.
+
+       jack dongarra, 3/11/78.
+       modified 3/93 to return if incx .le. 0.
+       modified 12/3/93, array(1) declarations changed to array(*)
+*/
+
+/* Subroutine */ int zscal_(integer *n, doublecomplex *za, doublecomplex *zx,
+	integer *incx)
+{
+    /* Local variables.  They are automatic rather than static so that
+       concurrent calls from different threads do not share state. */
+    integer i, ix;
+    doublecomplex z__1;
+
+#define ZX(I) zx[(I)-1]
+
+    if (*n <= 0 || *incx <= 0) {
+	return 0;
+    }
+
+    /* The product is formed in z__1 first because both of its parts
+       depend on both parts of the old vector element. */
+    if (*incx == 1) {
+
+/*        code for increment equal to 1 */
+
+	for (i = 1; i <= *n; ++i) {
+	    z__1.r = za->r * ZX(i).r - za->i * ZX(i).i;
+	    z__1.i = za->r * ZX(i).i + za->i * ZX(i).r;
+	    ZX(i).r = z__1.r;
+	    ZX(i).i = z__1.i;
+	}
+    } else {
+
+/*        code for increment not equal to 1 */
+
+	ix = 1;
+	for (i = 1; i <= *n; ++i) {
+	    z__1.r = za->r * ZX(ix).r - za->i * ZX(ix).i;
+	    z__1.i = za->r * ZX(ix).i + za->i * ZX(ix).r;
+	    ZX(ix).r = z__1.r;
+	    ZX(ix).i = z__1.i;
+	    ix += *incx;
+	}
+    }
+    return 0;
+} /* zscal_ */
+
diff --git a/CBLAS/ztrsm.c b/CBLAS/ztrsm.c
new file mode 100644
index 0000000..2ff59e2
--- /dev/null
+++ b/CBLAS/ztrsm.c
@@ -0,0 +1,387 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+
+/*  Purpose
+    =======
+
+    ZTRSM  solves one of the matrix equations
+
+       op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+
+    where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+    non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+
+       op( A ) = A   or   op( A ) = A'   or   op( A ) = conjg( A' ).
+
+    The matrix X is overwritten on B.
+
+    Parameters
+    ==========
+
+    SIDE   - CHARACTER*1.
+             On entry, SIDE specifies whether op( A ) appears on the left
+             or right of X as follows:
+
+                SIDE = 'L' or 'l'   op( A )*X = alpha*B.
+
+                SIDE = 'R' or 'r'   X*op( A ) = alpha*B.
+
+             Unchanged on exit.
+
+    UPLO   - CHARACTER*1.
+             On entry, UPLO specifies whether the matrix A is an upper or
+             lower triangular matrix as follows:
+
+                UPLO = 'U' or 'u'   A is an upper triangular matrix.
+
+                UPLO = 'L' or 'l'   A is a lower triangular matrix.
+
+             Unchanged on exit.
+
+    TRANSA - CHARACTER*1.
+             On entry, TRANSA specifies the form of op( A ) to be used in
+             the matrix multiplication as follows:
+
+                TRANSA = 'N' or 'n'   op( A ) = A.
+
+                TRANSA = 'T' or 't'   op( A ) = A'.
+
+                TRANSA = 'C' or 'c'   op( A ) = conjg( A' ).
+
+             Unchanged on exit.
+
+    DIAG   - CHARACTER*1.
+             On entry, DIAG specifies whether or not A is unit triangular
+             as follows:
+
+                DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+
+                DIAG = 'N' or 'n'   A is not assumed to be unit
+                                    triangular.
+
+             Unchanged on exit.
+
+    M      - INTEGER.
+             On entry, M specifies the number of rows of B. M must be at
+             least zero.
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the number of columns of B.  N must be
+             at least zero.
+             Unchanged on exit.
+
+    ALPHA  - COMPLEX*16      .
+             On entry,  ALPHA specifies the scalar  alpha. When  alpha is
+             zero then  A is not referenced and  B need not be set before
+             entry.
+             Unchanged on exit.
+
+    A      - COMPLEX*16       array of DIMENSION ( LDA, k ), where k is m
+             when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'.
+             Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k
+             upper triangular part of the array  A must contain the upper
+             triangular matrix  and the strictly lower triangular part of
+             A is not referenced.
+             Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k
+             lower triangular part of the array  A must contain the lower
+             triangular matrix  and the strictly upper triangular part of
+             A is not referenced.
+             Note that when  DIAG = 'U' or 'u',  the diagonal elements of
+             A  are not referenced either,  but are assumed to be  unity.
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+             in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
+             LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r'
+             then LDA must be at least max( 1, n ).
+             Unchanged on exit.
+
+    B      - COMPLEX*16       array of DIMENSION ( LDB, n ).
+             Before entry,  the leading  m by n part of the array  B must
+             contain  the  right-hand  side  matrix  B,  and  on exit  is
+             overwritten by the solution matrix  X.
+
+    LDB    - INTEGER.
+             On entry, LDB specifies the first dimension of B as declared
+             in  the  calling  (sub)  program.   LDB  must  be  at  least
+             max( 1, m ).
+             Unchanged on exit.
+
+    Return value
+    ============
+
+    Always 0.  If an argument is invalid, input_error_dist("ZTRSM ", &info)
+    is called with info set to the position of the first offending
+    argument and B is left untouched.
+
+    Level 3 Blas routine.
+
+    -- Written on 8-February-1989.
+       Jack Dongarra, Argonne National Laboratory.
+       Iain Duff, AERE Harwell.
+       Jeremy Du Croz, Numerical Algorithms Group Ltd.
+       Sven Hammarling, Numerical Algorithms Group Ltd.
+*/
+
+/* col[0..cnt-1] := (*s) * col[0..cnt-1]. */
+static void ztrsm_scale(integer cnt, const doublecomplex *s,
+	doublecomplex *col)
+{
+    integer i;
+    doublereal re;
+
+    for (i = 0; i < cnt; ++i) {
+	re = s->r * col[i].r - s->i * col[i].i;
+	col[i].i = s->r * col[i].i + s->i * col[i].r;
+	col[i].r = re;
+    }
+}
+
+/* dst[0..cnt-1] := dst[0..cnt-1] - (*s) * v[0..cnt-1].  Neither *s nor v
+   may overlap dst. */
+static void ztrsm_sub_scaled(integer cnt, const doublecomplex *s,
+	const doublecomplex *v, doublecomplex *dst)
+{
+    integer i;
+
+    for (i = 0; i < cnt; ++i) {
+	dst[i].r -= s->r * v[i].r - s->i * v[i].i;
+	dst[i].i -= s->r * v[i].i + s->i * v[i].r;
+    }
+}
+
+/* *acc := *acc - sum of op( a[k] ) * b[k] for k = 0..cnt-1, accumulated in
+   ascending k, where op is d_cnjg if conj_a is true and the identity
+   otherwise. */
+static void ztrsm_sub_dot(integer cnt, doublecomplex *a,
+	const doublecomplex *b, logical conj_a, doublecomplex *acc)
+{
+    void d_cnjg(doublecomplex *, doublecomplex *);
+    integer k;
+    doublecomplex f;
+
+    for (k = 0; k < cnt; ++k) {
+	if (conj_a) {
+	    d_cnjg(&f, &a[k]);
+	} else {
+	    f = a[k];
+	}
+	acc->r -= f.r * b[k].r - f.i * b[k].i;
+	acc->i -= f.r * b[k].i + f.i * b[k].r;
+    }
+}
+
+/* Subroutine */ int ztrsm_(char *side, char *uplo, char *transa, char *diag,
+	integer *m, integer *n, doublecomplex *alpha, doublecomplex *a,
+	integer *lda, doublecomplex *b, integer *ldb)
+{
+    /* Builtin functions */
+    void z_div(doublecomplex *, doublecomplex *, doublecomplex *), d_cnjg(
+	    doublecomplex *, doublecomplex *);
+
+    /* Local variables.  They are automatic rather than static so that
+       concurrent calls from different threads do not share state. */
+    integer info;
+    doublecomplex temp, z__1, z__2;
+    integer i, j, k, step, first, last, cnt;
+    logical lside, upper, notran, noconj, nounit;
+    integer nrowa;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
+
+/*     Decode the option characters.  Each is matched case-insensitively,
+       as documented above. */
+
+    lside = (*side == 'L' || *side == 'l');
+    upper = (*uplo == 'U' || *uplo == 'u');
+    notran = (*transa == 'N' || *transa == 'n');
+    noconj = (*transa == 'T' || *transa == 't');
+    nounit = (*diag == 'N' || *diag == 'n');
+    nrowa = lside ? *m : *n;
+
+/*     Test the input parameters. */
+
+    info = 0;
+    if (! lside && *side != 'R' && *side != 'r') {
+	info = 1;
+    } else if (! upper && *uplo != 'L' && *uplo != 'l') {
+	info = 2;
+    } else if (! notran && ! noconj && *transa != 'C' && *transa != 'c') {
+	info = 3;
+    } else if (! nounit && *diag != 'U' && *diag != 'u') {
+	info = 4;
+    } else if (*m < 0) {
+	info = 5;
+    } else if (*n < 0) {
+	info = 6;
+    } else if (*lda < max(1,nrowa)) {
+	info = 9;
+    } else if (*ldb < max(1,*m)) {
+	info = 11;
+    }
+    if (info != 0) {
+	input_error_dist("ZTRSM ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+/*     And when  alpha.eq.zero. */
+
+    if (alpha->r == 0. && alpha->i == 0.) {
+	for (j = 1; j <= *n; ++j) {
+	    for (i = 1; i <= *m; ++i) {
+		B(i,j).r = 0.;
+		B(i,j).i = 0.;
+	    }
+	}
+	return 0;
+    }
+
+/*     Start the operations.  In every case below, step counts the sweep
+       through the triangle; the row or column it denotes runs upwards
+       from 1 or downwards from the last one, depending on UPLO. */
+
+    if (lside) {
+	if (notran) {
+
+/*           Form  B := alpha*inv( A )*B.
+             Substitution runs over k = m,...,1 when A is upper triangular
+             and over k = 1,...,m when it is lower triangular. */
+
+	    for (j = 1; j <= *n; ++j) {
+		if (alpha->r != 1. || alpha->i != 0.) {
+		    ztrsm_scale(*m, alpha, &B(1,j));
+		}
+		for (step = 1; step <= *m; ++step) {
+		    k = upper ? *m + 1 - step : step;
+		    if (B(k,j).r != 0. || B(k,j).i != 0.) {
+			if (nounit) {
+			    z_div(&z__1, &B(k,j), &A(k,k));
+			    B(k,j).r = z__1.r;
+			    B(k,j).i = z__1.i;
+			}
+/*                    Eliminate x(k) from the rows not yet solved: rows
+                      1..k-1 (upper) or k+1..m (lower). */
+			first = upper ? 1 : k + 1;
+			cnt = upper ? k - 1 : *m - k;
+			ztrsm_sub_scaled(cnt, &B(k,j), &A(first,k),
+				&B(first,j));
+		    }
+		}
+	    }
+	} else {
+
+/*           Form  B := alpha*inv( A' )*B
+             or    B := alpha*inv( conjg( A' ) )*B.
+             Row i of the solution needs the rows already solved: rows
+             1..i-1 (upper, i ascending) or i+1..m (lower, i descending). */
+
+	    for (j = 1; j <= *n; ++j) {
+		for (step = 1; step <= *m; ++step) {
+		    i = upper ? step : *m + 1 - step;
+		    first = upper ? 1 : i + 1;
+		    cnt = upper ? i - 1 : *m - i;
+		    temp.r = alpha->r * B(i,j).r - alpha->i * B(i,j).i;
+		    temp.i = alpha->r * B(i,j).i + alpha->i * B(i,j).r;
+		    ztrsm_sub_dot(cnt, &A(first,i), &B(first,j),
+			    ! noconj, &temp);
+		    if (nounit) {
+			if (noconj) {
+			    z__2 = A(i,i);
+			} else {
+			    d_cnjg(&z__2, &A(i,i));
+			}
+			z_div(&z__1, &temp, &z__2);
+			temp = z__1;
+		    }
+		    B(i,j) = temp;
+		}
+	    }
+	}
+    } else {
+	if (notran) {
+
+/*           Form  B := alpha*B*inv( A ).
+             Columns are finished in the order j = 1,...,n (upper) or
+             j = n,...,1 (lower); column j needs columns 1..j-1 (upper) or
+             j+1..n (lower), which are already final. */
+
+	    for (step = 1; step <= *n; ++step) {
+		j = upper ? step : *n + 1 - step;
+		first = upper ? 1 : j + 1;
+		last = upper ? j - 1 : *n;
+		if (alpha->r != 1. || alpha->i != 0.) {
+		    ztrsm_scale(*m, alpha, &B(1,j));
+		}
+		for (k = first; k <= last; ++k) {
+		    if (A(k,j).r != 0. || A(k,j).i != 0.) {
+			ztrsm_sub_scaled(*m, &A(k,j), &B(1,k), &B(1,j));
+		    }
+		}
+		if (nounit) {
+		    z_div(&temp, &c_b1, &A(j,j));
+		    ztrsm_scale(*m, &temp, &B(1,j));
+		}
+	    }
+	} else {
+
+/*           Form  B := alpha*B*inv( A' )
+             or    B := alpha*B*inv( conjg( A' ) ).
+             Column k (k = n,...,1 for upper, 1,...,n for lower) is divided
+             by the diagonal entry, eliminated from the pending columns
+             1..k-1 (upper) or k+1..n (lower), and finally scaled by alpha. */
+
+	    for (step = 1; step <= *n; ++step) {
+		k = upper ? *n + 1 - step : step;
+		first = upper ? 1 : k + 1;
+		last = upper ? k - 1 : *n;
+		if (nounit) {
+		    if (noconj) {
+			z__2 = A(k,k);
+		    } else {
+			d_cnjg(&z__2, &A(k,k));
+		    }
+		    z_div(&temp, &c_b1, &z__2);
+		    ztrsm_scale(*m, &temp, &B(1,k));
+		}
+		for (j = first; j <= last; ++j) {
+		    if (A(j,k).r != 0. || A(j,k).i != 0.) {
+			if (noconj) {
+			    temp = A(j,k);
+			} else {
+			    d_cnjg(&temp, &A(j,k));
+			}
+			ztrsm_sub_scaled(*m, &temp, &B(1,k), &B(1,j));
+		    }
+		}
+		if (alpha->r != 1. || alpha->i != 0.) {
+		    ztrsm_scale(*m, alpha, &B(1,k));
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of ZTRSM . */
+
+} /* ztrsm_ */
+
diff --git a/CBLAS/ztrsv.c b/CBLAS/ztrsv.c
new file mode 100644
index 0000000..21cbc5e
--- /dev/null
+++ b/CBLAS/ztrsv.c
@@ -0,0 +1,509 @@
+
+/*  -- translated by f2c (version 19940927).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+#include <string.h>
+#include "f2c.h"
+
+/* Subroutine */ int ztrsv_(char *uplo, char *trans, char *diag, integer *n, 
+	doublecomplex *a, integer *lda, doublecomplex *x, integer *incx)
+{
+    /* System generated locals */
+    doublecomplex z__1, z__2, z__3;
+
+    /* Builtin functions */
+    void z_div(doublecomplex *, doublecomplex *, doublecomplex *), d_cnjg(
+	    doublecomplex *, doublecomplex *);
+
+    /* Local variables.  They are automatic rather than static: nothing has
+       to survive between calls, and automatic storage keeps the routine
+       reentrant. */
+    integer info;
+    doublecomplex temp;
+    integer i, j;
+    integer ix, jx, kx;
+    extern /* Subroutine */ int input_error_dist(char *, integer *);
+    logical upper, lower, notran, tran, ctran, unit, noconj, nounit;
+
+
+/*  Purpose
+    =======
+
+    ZTRSV  solves one of the systems of equations
+
+       A*x = b,   or   A'*x = b,   or   conjg( A' )*x = b,
+
+    where b and x are n element vectors and A is an n by n unit, or
+    non-unit, upper or lower triangular matrix.
+
+    No test for singularity or near-singularity is included in this
+    routine. Such tests must be performed before calling this routine.
+
+    Parameters
+    ==========
+
+    UPLO   - CHARACTER*1.
+             On entry, UPLO specifies whether the matrix is an upper or
+             lower triangular matrix as follows:
+
+                UPLO = 'U' or 'u'   A is an upper triangular matrix.
+
+                UPLO = 'L' or 'l'   A is a lower triangular matrix.
+
+             Only the first character is examined.
+             Unchanged on exit.
+
+    TRANS  - CHARACTER*1.
+             On entry, TRANS specifies the equations to be solved as
+             follows:
+
+                TRANS = 'N' or 'n'   A*x = b.
+
+                TRANS = 'T' or 't'   A'*x = b.
+
+                TRANS = 'C' or 'c'   conjg( A' )*x = b.
+
+             Only the first character is examined.
+             Unchanged on exit.
+
+    DIAG   - CHARACTER*1.
+             On entry, DIAG specifies whether or not A is unit
+             triangular as follows:
+
+                DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+
+                DIAG = 'N' or 'n'   A is not assumed to be unit
+                                    triangular.
+
+             Only the first character is examined.
+             Unchanged on exit.
+
+    N      - INTEGER.
+             On entry, N specifies the order of the matrix A.
+             N must be at least zero.
+             Unchanged on exit.
+
+    A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+             Before entry with  UPLO = 'U' or 'u', the leading n by n
+             upper triangular part of the array A must contain the upper
+             triangular matrix and the strictly lower triangular part of
+             A is not referenced.
+             Before entry with UPLO = 'L' or 'l', the leading n by n
+             lower triangular part of the array A must contain the lower
+             triangular matrix and the strictly upper triangular part of
+             A is not referenced.
+             Note that when  DIAG = 'U' or 'u', the diagonal elements of
+             A are not referenced either, but are assumed to be unity.
+             Unchanged on exit.
+
+    LDA    - INTEGER.
+             On entry, LDA specifies the first dimension of A as declared
+             in the calling (sub) program. LDA must be at least
+             max( 1, n ).
+             Unchanged on exit.
+
+    X      - COMPLEX*16       array of dimension at least
+             ( 1 + ( n - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the n
+             element right-hand side vector b. On exit, X is overwritten
+             with the solution vector x.
+
+    INCX   - INTEGER.
+             On entry, INCX specifies the increment for the elements of
+             X. INCX must not be zero.
+             Unchanged on exit.
+
+    Return value
+    ============
+
+    Always 0.  If an argument is invalid, input_error_dist is called with
+    the routine name and the position of the first offending argument
+    (1 = UPLO, 2 = TRANS, 3 = DIAG, 4 = N, 6 = LDA, 8 = INCX) and the
+    routine returns without modifying X.
+
+
+    Level 2 Blas routine.
+
+    -- Written on 22-October-1986.
+       Jack Dongarra, Argonne National Lab.
+       Jeremy Du Croz, Nag Central Office.
+       Sven Hammarling, Nag Central Office.
+       Richard Hanson, Sandia National Labs.
+
+
+   Function Body */
+
+/* 1-based element accessors; A is addressed column by column with leading
+   dimension *lda. */
+#define X(I) x[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+/*     Decode the option characters once, accepting either case as the
+       parameter description above promises. */
+
+    upper  = (*uplo  == 'U' || *uplo  == 'u');
+    lower  = (*uplo  == 'L' || *uplo  == 'l');
+    notran = (*trans == 'N' || *trans == 'n');
+    tran   = (*trans == 'T' || *trans == 't');
+    ctran  = (*trans == 'C' || *trans == 'c');
+    unit   = (*diag  == 'U' || *diag  == 'u');
+    nounit = (*diag  == 'N' || *diag  == 'n');
+
+/*     Test the input parameters. */
+
+    info = 0;
+    if (!upper && !lower) {
+	info = 1;
+    } else if (!notran && !tran && !ctran) {
+	info = 2;
+    } else if (!unit && !nounit) {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*lda < max(1,*n)) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    }
+    if (info != 0) {
+	input_error_dist("ZTRSV ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+/*     noconj selects the plain transpose A' over conjg( A' ); it is only
+       consulted in the transposed branches below. */
+
+    noconj = tran;
+
+/*     Set up the start point in X if the increment is not unity. This   
+       will be  ( N - 1 )*INCX  too small for descending loops.
+       kx is left unset when INCX = 1; the unit-stride branches never
+       read it. */
+
+    if (*incx <= 0) {
+	kx = 1 - (*n - 1) * *incx;
+    } else if (*incx != 1) {
+	kx = 1;
+    }
+
+/*     Start the operations. In this version the elements of A are   
+       accessed sequentially with one pass through A. */
+
+    if (notran) {
+
+/*        Form  x := inv( A )*x. */
+
+	if (upper) {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    if (X(j).r != 0. || X(j).i != 0.) {
+			if (nounit) {
+			    z_div(&z__1, &X(j), &A(j,j));
+			    X(j).r = z__1.r, X(j).i = z__1.i;
+			}
+			temp.r = X(j).r, temp.i = X(j).i;
+			for (i = j - 1; i >= 1; --i) {
+			    z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
+				    z__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
+			    z__1.r = X(i).r - z__2.r, z__1.i = X(i).i - 
+				    z__2.i;
+			    X(i).r = z__1.r, X(i).i = z__1.i;
+/* L10: */
+			}
+		    }
+/* L20: */
+		}
+	    } else {
+		jx = kx + (*n - 1) * *incx;
+		for (j = *n; j >= 1; --j) {
+		    if (X(jx).r != 0. || X(jx).i != 0.) {
+			if (nounit) {
+			    z_div(&z__1, &X(jx), &A(j,j));
+			    X(jx).r = z__1.r, X(jx).i = z__1.i;
+			}
+			temp.r = X(jx).r, temp.i = X(jx).i;
+			ix = jx;
+			for (i = j - 1; i >= 1; --i) {
+			    ix -= *incx;
+			    z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
+				    z__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
+			    z__1.r = X(ix).r - z__2.r, z__1.i = X(ix).i - 
+				    z__2.i;
+			    X(ix).r = z__1.r, X(ix).i = z__1.i;
+/* L30: */
+			}
+		    }
+		    jx -= *incx;
+/* L40: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		for (j = 1; j <= *n; ++j) {
+		    if (X(j).r != 0. || X(j).i != 0.) {
+			if (nounit) {
+			    z_div(&z__1, &X(j), &A(j,j));
+			    X(j).r = z__1.r, X(j).i = z__1.i;
+			}
+			temp.r = X(j).r, temp.i = X(j).i;
+			for (i = j + 1; i <= *n; ++i) {
+			    z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
+				    z__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
+			    z__1.r = X(i).r - z__2.r, z__1.i = X(i).i - 
+				    z__2.i;
+			    X(i).r = z__1.r, X(i).i = z__1.i;
+/* L50: */
+			}
+		    }
+/* L60: */
+		}
+	    } else {
+		jx = kx;
+		for (j = 1; j <= *n; ++j) {
+		    if (X(jx).r != 0. || X(jx).i != 0.) {
+			if (nounit) {
+			    z_div(&z__1, &X(jx), &A(j,j));
+			    X(jx).r = z__1.r, X(jx).i = z__1.i;
+			}
+			temp.r = X(jx).r, temp.i = X(jx).i;
+			ix = jx;
+			for (i = j + 1; i <= *n; ++i) {
+			    ix += *incx;
+			    z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
+				    z__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
+			    z__1.r = X(ix).r - z__2.r, z__1.i = X(ix).i - 
+				    z__2.i;
+			    X(ix).r = z__1.r, X(ix).i = z__1.i;
+/* L70: */
+			}
+		    }
+		    jx += *incx;
+/* L80: */
+		}
+	    }
+	}
+    } else {
+
+/*        Form  x := inv( A' )*x  or  x := inv( conjg( A' ) )*x. */
+
+	if (upper) {
+	    if (*incx == 1) {
+		for (j = 1; j <= *n; ++j) {
+		    temp.r = X(j).r, temp.i = X(j).i;
+		    if (noconj) {
+			for (i = 1; i <= j-1; ++i) {
+			    z__2.r = A(i,j).r * X(i).r - A(i,j).i * X(
+				    i).i, z__2.i = A(i,j).r * X(i).i + 
+				    A(i,j).i * X(i).r;
+			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+/* L90: */
+			}
+			if (nounit) {
+			    z_div(&z__1, &temp, &A(j,j));
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+		    } else {
+			for (i = 1; i <= j-1; ++i) {
+			    d_cnjg(&z__3, &A(i,j));
+			    z__2.r = z__3.r * X(i).r - z__3.i * X(i).i, 
+				    z__2.i = z__3.r * X(i).i + z__3.i * X(
+				    i).r;
+			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+/* L100: */
+			}
+			if (nounit) {
+			    d_cnjg(&z__2, &A(j,j));
+			    z_div(&z__1, &temp, &z__2);
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+		    }
+		    X(j).r = temp.r, X(j).i = temp.i;
+/* L110: */
+		}
+	    } else {
+		jx = kx;
+		for (j = 1; j <= *n; ++j) {
+		    ix = kx;
+		    temp.r = X(jx).r, temp.i = X(jx).i;
+		    if (noconj) {
+			for (i = 1; i <= j-1; ++i) {
+			    z__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(
+				    ix).i, z__2.i = A(i,j).r * X(ix).i + 
+				    A(i,j).i * X(ix).r;
+			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			    ix += *incx;
+/* L120: */
+			}
+			if (nounit) {
+			    z_div(&z__1, &temp, &A(j,j));
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+		    } else {
+			for (i = 1; i <= j-1; ++i) {
+			    d_cnjg(&z__3, &A(i,j));
+			    z__2.r = z__3.r * X(ix).r - z__3.i * X(ix).i, 
+				    z__2.i = z__3.r * X(ix).i + z__3.i * X(
+				    ix).r;
+			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			    ix += *incx;
+/* L130: */
+			}
+			if (nounit) {
+			    d_cnjg(&z__2, &A(j,j));
+			    z_div(&z__1, &temp, &z__2);
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+		    }
+		    X(jx).r = temp.r, X(jx).i = temp.i;
+		    jx += *incx;
+/* L140: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    temp.r = X(j).r, temp.i = X(j).i;
+		    if (noconj) {
+			for (i = *n; i >= j+1; --i) {
+			    z__2.r = A(i,j).r * X(i).r - A(i,j).i * X(
+				    i).i, z__2.i = A(i,j).r * X(i).i + 
+				    A(i,j).i * X(i).r;
+			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+/* L150: */
+			}
+			if (nounit) {
+			    z_div(&z__1, &temp, &A(j,j));
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+		    } else {
+			for (i = *n; i >= j+1; --i) {
+			    d_cnjg(&z__3, &A(i,j));
+			    z__2.r = z__3.r * X(i).r - z__3.i * X(i).i, 
+				    z__2.i = z__3.r * X(i).i + z__3.i * X(
+				    i).r;
+			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+/* L160: */
+			}
+			if (nounit) {
+			    d_cnjg(&z__2, &A(j,j));
+			    z_div(&z__1, &temp, &z__2);
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+		    }
+		    X(j).r = temp.r, X(j).i = temp.i;
+/* L170: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    ix = kx;
+		    temp.r = X(jx).r, temp.i = X(jx).i;
+		    if (noconj) {
+			for (i = *n; i >= j+1; --i) {
+			    z__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(
+				    ix).i, z__2.i = A(i,j).r * X(ix).i + 
+				    A(i,j).i * X(ix).r;
+			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			    ix -= *incx;
+/* L180: */
+			}
+			if (nounit) {
+			    z_div(&z__1, &temp, &A(j,j));
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+		    } else {
+			for (i = *n; i >= j+1; --i) {
+			    d_cnjg(&z__3, &A(i,j));
+			    z__2.r = z__3.r * X(ix).r - z__3.i * X(ix).i, 
+				    z__2.i = z__3.r * X(ix).i + z__3.i * X(
+				    ix).r;
+			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			    ix -= *incx;
+/* L190: */
+			}
+			if (nounit) {
+			    d_cnjg(&z__2, &A(j,j));
+			    z_div(&z__1, &temp, &z__2);
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+		    }
+		    X(jx).r = temp.r, X(jx).i = temp.i;
+		    jx -= *incx;
+/* L200: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of ZTRSV . */
+
+} /* ztrsv_ */
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..d082edc
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,217 @@
+######################################################################
+#
+# CMakeLists.txt for SUPERLU_DIST
+#
+# Top-level build script: declares the user-facing options, locates
+# BLAS / MPI / OpenMP / ParMETIS, then descends into SRC (always),
+# TESTING and EXAMPLE (when enabled) and generates make.inc.
+#
+######################################################################
+
+# Required version
+cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
+
+# Project version numbers
+project(SuperLU_DIST NONE)
+set(VERSION_MAJOR "5")
+set(VERSION_MINOR "1")
+set(VERSION_BugFix "3")
+set(PROJECT_VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_BugFix})
+
+######################################################################
+#
+# IDEAS: xSDK standards module
+MESSAGE("\nProcess XSDK defaults ...")
+# SET(USE_XSDK_DEFAULTS_DEFAULT TRUE) # Set to false if desired
+INCLUDE("cmake/XSDKDefaults.cmake")
+######################################################################
+
+######################################################################
+#
+# Usual initialization stuff
+#
+######################################################################
+
+# The XSDK standard does not allow using internally built BLAS.
+# This default has to be computed BEFORE the option() call that consumes
+# it; otherwise ${enable_blaslib_DEFAULT} expands to nothing there and
+# the option silently defaults to OFF.
+# (USE_XSDK_DEFAULTS is presumably provided by cmake/XSDKDefaults.cmake
+# or on the command line.)
+if (USE_XSDK_DEFAULTS)
+  set(enable_blaslib_DEFAULT OFF)
+else()
+  set(enable_blaslib_DEFAULT ON)
+endif()
+
+# setup options
+option(enable_blaslib   "Build the CBLAS library" ${enable_blaslib_DEFAULT})
+option(enable_parmetislib   "Build the ParMETIS library" ON)
+option(enable_doc       "Build doxygen documentation" OFF)
+option(enable_double    "Enable double precision library" ON)
+option(enable_complex16 "Enable complex16 precision library" ON)
+option(enable_examples  "Build examples" ON)
+# enable_tests is consulted further down; declare it so that it shows up
+# in the cache.  OFF matches the behaviour of the previously undeclared
+# variable.
+option(enable_tests     "Build tests" OFF)
+# These hold lists of paths, not booleans, so they are cache strings
+# rather than option()s.  Values given with -D on the command line are
+# kept as they are.
+set(TPL_PARMETIS_LIBRARIES "" CACHE STRING
+  "List of absolute paths to ParMETIS link libraries [].")
+set(TPL_PARMETIS_INCLUDE_DIRS "" CACHE STRING
+  "List of absolute paths to ParMETIS include directories [].")
+
+if (NOT CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX /usr/local)
+endif()
+
+
+#---- For shared library
+
+# use, i.e. don't skip the full RPATH for the build tree
+SET(CMAKE_SKIP_BUILD_RPATH  FALSE)
+
+# when building, don't use the install RPATH already
+# (but later on when installing)
+SET(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) 
+
+# the RPATH to be used when installing
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+
+# add the automatically determined parts of the RPATH
+# which point to directories outside the build tree to the install RPATH
+SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+#----
+
+if (BUILD_SHARED_LIBS)
+  message("-- SuperLU_DIST will be built as a shared library.")
+  set(PROJECT_NAME_LIB_EXPORT libsuperlu_dist.so)
+else()
+  message("-- SuperLU_DIST will be built as a static library.")
+  set(PROJECT_NAME_LIB_EXPORT libsuperlu_dist.a)
+endif()
+
+enable_language (C)
+if (XSDK_ENABLE_Fortran)
+  enable_language (Fortran)
+  set(NOFORTRAN FALSE)
+endif()
+set(SUPERLU_VERSION "${PROJECT_VERSION}")
+set(SUPERLU_REV "${PROJECT_REV}")
+
+
+# setup required compiler defines and options.
+## get_directory_property( DirDefs COMPILE_DEFINITIONS )
+set(CMAKE_C_FLAGS "-DDEBUGlevel=0 -DPRNTlevel=0 ${CMAKE_C_FLAGS}")
+if(XSDK_INDEX_SIZE EQUAL 64)
+    message("-- Using 64 bit integer for index size")
+    set(CMAKE_C_FLAGS "-D_LONGINT ${CMAKE_C_FLAGS}")
+endif()
+set(CMAKE_C_FLAGS_RELEASE "-O3" CACHE STRING "")
+
+######################################################################
+#
+# Find packages
+#
+######################################################################
+#
+#--------------------- BLAS ---------------------
+# When the internal CBLAS is not requested, use TPL_BLAS_LIBRARIES if the
+# user supplied it, otherwise ask FindBLAS.
+if(NOT enable_blaslib)
+#  set(TPL_BLAS_LIBRARIES "" CACHE FILEPATH
+#    "Override of list of absolute path to libs for BLAS.")
+  if(TPL_BLAS_LIBRARIES)
+    set(BLAS_FOUND TRUE)
+  else()
+    find_package(BLAS)
+    if(BLAS_FOUND)
+      set(TPL_BLAS_LIBRARIES "${BLAS_LIBRARIES}" CACHE FILEPATH
+        "Set from FindBLAS.cmake BLAS_LIBRARIES." FORCE)
+    endif()
+  endif()
+endif()
+
+if(BLAS_FOUND)
+    message("-- Using TPL_BLAS_LIBRARIES='${TPL_BLAS_LIBRARIES}'")
+    set(CMAKE_C_FLAGS "-DUSE_VENDOR_BLAS ${CMAKE_C_FLAGS}")
+    set(BLAS_LIB ${TPL_BLAS_LIBRARIES})
+    # fix up BLAS library name
+    string (REPLACE ";" " " BLAS_LIB_STR "${BLAS_LIB}")
+    set(BLAS_LIB_EXPORT ${BLAS_LIB_STR})
+else()
+    message("-- Did not find or specify BLAS, so configure to build internal CBLAS ...")
+    add_subdirectory(CBLAS)
+    set(BLAS_LIB blas)
+    # Export the location inside the actual build tree instead of assuming
+    # the build directory is <source>/build.
+    if (BUILD_SHARED_LIBS)  # export to be referenced by downstream makefile
+        set(BLAS_LIB_EXPORT ${CMAKE_BINARY_DIR}/CBLAS/libblas.so)
+    else()
+        set(BLAS_LIB_EXPORT ${CMAKE_BINARY_DIR}/CBLAS/libblas.a)
+    endif()
+endif()
+
+#--------------------- MPI ---------------------
+find_package(MPI)
+if(MPI_C_FOUND)
+    set(CMAKE_C_FLAGS "${MPI_C_COMPILE_FLAGS} ${CMAKE_C_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_C_LINK_FLAGS}" )
+endif()
+#--------------------- OpenMP ---------------------
+find_package(OpenMP)
+if(OPENMP_FOUND)
+  set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}")
+# On edison, OpenMP_EXE_LINKER_FLAGS is empty
+# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+# message("-- CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'")
+endif()
+#--------------------- ParMETIS ---------------------
+if (enable_parmetislib)   ## want to use parmetis
+  if (NOT TPL_PARMETIS_LIBRARIES)
+    message(FATAL_ERROR "TPL_PARMETIS_LIBRARIES option should be set for PARMETIS support to be enabled.")
+  endif()
+
+  if (NOT TPL_PARMETIS_INCLUDE_DIRS)
+    message(FATAL_ERROR "TPL_PARMETIS_INCLUDE_DIRS option should be set for PARMETIS support to be enabled.")
+  endif()
+  foreach(dir ${TPL_PARMETIS_INCLUDE_DIRS})
+    if (NOT EXISTS "${dir}")
+      message(FATAL_ERROR "PARMETIS include directory not found: ${dir}")
+    endif()
+    set(CMAKE_C_FLAGS "-I${dir} ${CMAKE_C_FLAGS}")
+  endforeach()
+
+  message("-- Enabled support for PARMETIS")
+  set(PARMETIS_FOUND TRUE)
+
+  set(PARMETIS_LIB ${TPL_PARMETIS_LIBRARIES})
+  # fix up PARMETIS library names
+  string (REPLACE ";" " " PARMETIS_LIB_STR "${PARMETIS_LIB}")
+  set(PARMETIS_LIB_EXPORT ${PARMETIS_LIB_STR})
+
+else()
+  message("-- Will not link with ParMETIS.")
+endif()
+
+# if(NOT enable_parmetislib)
+#  find_package(PARMETIS)    ## does not have this Module yet.
+# endif()
+
+
+######################################################################
+#
+# Include directories
+#
+######################################################################
+
+include_directories(${CMAKE_SOURCE_DIR}/SRC)
+if (enable_parmetislib)
+  include_directories(${TPL_PARMETIS_INCLUDE_DIRS})  ## parmetis
+endif()
+include_directories(${MPI_C_INCLUDE_PATH})
+
+######################################################################
+#
+# Add subdirectories
+#
+######################################################################
+
+add_subdirectory(SRC)
+
+if(enable_tests)
+  enable_testing()
+  add_subdirectory(TESTING)
+endif()
+
+if(enable_doc)
+  message(FATAL_ERROR "Documentation build requested but not implemented.")
+  #implement doxygen
+endif()
+
+if(enable_examples)
+  enable_testing()
+  add_subdirectory(EXAMPLE)
+endif()
+
+# file(WRITE "make.defs" "# can be exposed to users" ${CMAKE_C_COMPILER})
+# configure_file(${CMAKE_SOURCE_DIR}/make.inc.in ${CMAKE_BINARY_DIR}/make.inc)
+# NOTE: make.inc is generated into the source tree, not the build tree.
+configure_file(${SuperLU_DIST_SOURCE_DIR}/make.inc.in ${SuperLU_DIST_SOURCE_DIR}/make.inc)
diff --git a/DOC/ug.pdf b/DOC/ug.pdf
new file mode 100644
index 0000000..f854405
Binary files /dev/null and b/DOC/ug.pdf differ
diff --git a/DoxyConfig b/DoxyConfig
new file mode 100644
index 0000000..5bbc5a0
--- /dev/null
+++ b/DoxyConfig
@@ -0,0 +1,1356 @@
+# Doxyfile 1.5.5
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file 
+# that follow. The default is UTF-8 which is also the encoding used for all 
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the 
+# iconv built into libc) for the transcoding. See 
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded 
+# by quotes) that should identify the project.
+
+PROJECT_NAME           = "SuperLU Distributed"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
+# This could be handy for archiving the generated documentation or 
+# if some version control system is used.
+
+PROJECT_NUMBER         = 5.1.3
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
+# base path where the generated documentation will be put. 
+# If a relative path is entered, it will be relative to the location 
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = DOC
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 
+# 4096 sub-directories (in 2 levels) under the output directory of each output 
+# format and will distribute the generated files over these directories. 
+# Enabling this option can be useful when feeding doxygen a huge amount of 
+# source files, where putting all generated files in the same directory would 
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all 
+# documentation generated by doxygen is written. Doxygen will use this 
+# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: 
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, 
+# Croatian, Czech, Danish, Dutch, Farsi, Finnish, French, German, Greek, 
+# Hungarian, Italian, Japanese, Japanese-en (Japanese with English messages), 
+# Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, Polish, 
+# Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish, 
+# and Ukrainian.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will 
+# include brief member descriptions after the members that are listed in 
+# the file and class documentation (similar to JavaDoc). 
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend 
+# the brief description of a member or function before the detailed description. 
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the 
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = NO
+
+# This tag implements a quasi-intelligent brief description abbreviator 
+# that is used to form the text in various listings. Each string 
+# in this list, if found as the leading text of the brief description, will be 
+# stripped from the text and the result after processing the whole list, is 
+# used as the annotated text. Otherwise, the brief description is used as-is. 
+# If left blank, the following values are used ("$name" is automatically 
+# replaced with the name of the entity): "The $name class" "The $name widget" 
+# "The $name file" "is" "provides" "specifies" "contains" 
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       = 
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then 
+# Doxygen will generate a detailed section even if there is only a brief 
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all 
+# inherited members of a class in the documentation of that class as if those 
+# members were ordinary class members. Constructors, destructors and assignment 
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full 
+# path before files name in the file list and in the header files. If set 
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag 
+# can be used to strip a user-defined part of the path. Stripping is 
+# only done if one of the specified strings matches the left-hand part of 
+# the path. The tag can be used to show relative paths in the file list. 
+# If left blank the directory from which doxygen is run is used as the 
+# path to strip.
+
+STRIP_FROM_PATH        = 
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of 
+# the path mentioned in the documentation of a class, which tells 
+# the reader which header file to include in order to use a class. 
+# If left blank only the name of the header file containing the class 
+# definition is used. Otherwise one should specify the include paths that 
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    = 
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter 
+# (but less readable) file names. This can be useful if your file system 
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen 
+# will interpret the first line (until the first dot) of a JavaDoc-style 
+# comment as the brief description. If set to NO, the JavaDoc 
+# comments will behave just like regular Qt-style comments 
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will 
+# interpret the first line (until the first dot) of a Qt-style 
+# comment as the brief description. If set to NO, the comments 
+# will behave just like regular Qt-style comments (thus requiring 
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen 
+# treat a multi-line C++ special comment block (i.e. a block of //! or /// 
+# comments) as a brief description. This used to be the default behaviour. 
+# The new default is to treat a multi-line C++ comment block as a detailed 
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the DETAILS_AT_TOP tag is set to YES then Doxygen 
+# will output the detailed description near the top, like JavaDoc.
+# If set to NO, the detailed description appears after the member 
+# documentation.
+
+DETAILS_AT_TOP         = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented 
+# member inherits the documentation from any documented member that it 
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce 
+# a new page for each member. If set to NO, the documentation of a member will 
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that acts 
+# as commands in the documentation. An alias has the form "name=value". 
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to 
+# put the command \sideeffect (or @sideeffect) in the documentation, which 
+# will result in a user-defined paragraph with heading "Side Effects:". 
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                = 
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C 
+# sources only. Doxygen will then generate output that is more tailored for C. 
+# For instance, some of the names that are used will be different. The list 
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java 
+# sources only. Doxygen will then generate output that is more tailored for 
+# Java. For instance, namespaces will be presented as packages, qualified 
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran 
+# sources only. Doxygen will then generate output that is more tailored for 
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = YES
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL 
+# sources. Doxygen will then generate output that is tailored for 
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want 
+# to include (a tag file for) the STL sources as input, then you should 
+# set this tag to YES in order to let doxygen match functions declarations and 
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. 
+# func(std::string) {}). This also makes the inheritance and collaboration 
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. 
+# Doxygen will parse them like normal C++ but will assume all classes use public 
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC 
+# tag is set to YES, then doxygen will reuse the documentation of the first 
+# member in the group (if any) for the other members of the group. By default 
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of 
+# the same type (for instance a group of public functions) to be put as a 
+# subgroup of that type (e.g. under the Public Functions section). Set it to 
+# NO to prevent subgrouping. Alternatively, this can be done per class using 
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum 
+# is documented as struct, union, or enum with the name of the typedef. So 
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct 
+# with name TypeT. When disabled the typedef will appear as a member of a file, 
+# namespace, or class. And the struct will be named TypeS. This can typically 
+# be useful for C code in case the coding convention dictates that all compound 
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in 
+# documentation are documented, even if no documentation was available. 
+# Private class members and static file members will be hidden unless 
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class 
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = YES
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file 
+# will be included in the documentation.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) 
+# defined locally in source files will be included in the documentation. 
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local 
+# methods, which are defined in the implementation section but not in 
+# the interface are included in the documentation. 
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be 
+# extracted and appear in the documentation as a namespace called 
+# 'anonymous_namespace{file}', where file will be replaced with the base 
+# name of the file that contains the anonymous namespace. By default 
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all 
+# undocumented members of documented classes, files or namespaces. 
+# If set to NO (the default) these members will be included in the 
+# various overviews, but no documentation section is generated. 
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all 
+# undocumented classes that are normally visible in the class hierarchy. 
+# If set to NO (the default) these classes will be included in the various 
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all 
+# friend (class|struct|union) declarations. 
+# If set to NO (the default) these declarations will be included in the 
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any 
+# documentation blocks found inside the body of a function. 
+# If set to NO (the default) these blocks will be appended to the 
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation 
+# that is typed after a \internal command is included. If the tag is set 
+# to NO (the default) then the documentation will be excluded. 
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate 
+# file names in lower-case letters. If set to YES upper-case letters are also 
+# allowed. This is useful if you have classes or files whose names only differ 
+# in case and if your file system supports case sensitive file names. Windows 
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen 
+# will show members with their full class and namespace scopes in the 
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen 
+# will put a list of the files that are included by a file in the documentation 
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] 
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen 
+# will sort the (detailed) documentation of file and class members 
+# alphabetically by member name. If set to NO the members will appear in 
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the 
+# brief documentation of file, namespace and class members alphabetically 
+# by member name. If set to NO (the default) the members will appear in 
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the 
+# hierarchy of group names into alphabetical order. If set to NO (the default) 
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be 
+# sorted by fully-qualified names, including namespaces. If set to 
+# NO (the default), the class list will be sorted only by class name, 
+# not including the namespace part. 
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the 
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or 
+# disable (NO) the todo list. This list is created by putting \todo 
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or 
+# disable (NO) the test list. This list is created by putting \test 
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or 
+# disable (NO) the bug list. This list is created by putting \bug 
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or 
+# disable (NO) the deprecated list. This list is created by putting 
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional 
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       = 
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines 
+# the initial value of a variable or define consists of for it to appear in 
+# the documentation. If the initializer consists of more lines than specified 
+# here it will be hidden. Use a value of 0 to hide initializers completely. 
+# The appearance of the initializer of individual variables and defines in the 
+# documentation can be controlled using \showinitializer or \hideinitializer 
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated 
+# at the bottom of the documentation of classes and structs. If set to YES the 
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories 
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy 
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = NO
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that 
+# doxygen should invoke to get the current version for each file (typically from 
+# the version control system). Doxygen will invoke the program by executing (via 
+# popen()) the command <command> <input-file>, where <command> is the value of 
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file 
+# provided by doxygen. Whatever the program writes to standard output 
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    = 
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated 
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are 
+# generated by doxygen. Possible values are YES and NO. If left blank 
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings 
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will 
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for 
+# potential errors in the documentation, such as not documenting some 
+# parameters in a documented function, or documenting parameters that 
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for 
+# functions that are documented, but have no documentation for their parameters 
+# or return value. If set to NO (the default) doxygen will only warn about 
+# wrong or incomplete parameter documentation, but not about the absence of 
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that 
+# doxygen can produce. The string should contain the $file, $line, and $text 
+# tags, which will be replaced by the file and line number from which the 
+# warning originated and the warning text. Optionally the format may contain 
+# $version, which will be replaced by the version of the file (if it could 
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning 
+# and error messages should be written. If left blank the output is written 
+# to stderr.
+
+WARN_LOGFILE           = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain 
+# documented source files. You may enter file names like "myfile.cpp" or 
+# directories like "/usr/src/myproject". Separate the files or directories 
+# with spaces.
+
+INPUT                  = SRC/ EXAMPLE/ FORTRAN/
+
+# This tag can be used to specify the character encoding of the source files 
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is 
+# also the default input encoding. Doxygen uses libiconv (or the iconv built 
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for 
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the 
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp 
+# and *.h) to filter out the source-files in the directories. If left 
+# blank the following patterns are tested: 
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx 
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS          = 
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories 
+# should be searched for input files as well. Possible values are YES and NO. 
+# If left blank NO is used.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should 
+# be excluded from the INPUT source files. This way you can easily exclude a 
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE                = 
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or 
+# directories that are symbolic links (a Unix filesystem feature) are excluded 
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the 
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude 
+# certain files from those directories. Note that the wildcards are matched 
+# against the file with absolute path, so to exclude all test directories 
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = 
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names 
+# (namespaces, classes, functions, etc.) that should be excluded from the 
+# output. The symbol name can be a fully qualified name, a word, or if the 
+# wildcard * is used, a substring. Examples: ANamespace, AClass, 
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        = 
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or 
+# directories that contain example code fragments that are included (see 
+# the \include command).
+
+EXAMPLE_PATH           = 
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the 
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp 
+# and *.h) to filter out the source-files in the directories. If left 
+# blank all files are included.
+
+EXAMPLE_PATTERNS       = 
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be 
+# searched for input files to be used with the \include or \dontinclude 
+# commands irrespective of the value of the RECURSIVE tag. 
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or 
+# directories that contain images that are included in the documentation (see 
+# the \image command).
+
+IMAGE_PATH             = 
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should 
+# invoke to filter for each input file. Doxygen will invoke the filter program 
+# by executing (via popen()) the command <filter> <input-file>, where <filter> 
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an 
+# input file. Doxygen will then use the output that the filter program writes 
+# to standard output.  If FILTER_PATTERNS is specified, this tag will be 
+# ignored.
+
+INPUT_FILTER           = 
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern 
+# basis.  Doxygen will compare the file name with each pattern and apply the 
+# filter if there is a match.  The filters are a list of the form: 
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further 
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER 
+# is applied to all files.
+
+FILTER_PATTERNS        = 
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using 
+# INPUT_FILTER) will be used to filter the input files when producing source 
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will 
+# be generated. Documented entities will be cross-referenced with these sources. 
+# Note: To get rid of all source code in the generated output, make sure also 
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body 
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct 
+# doxygen to hide any special comment blocks from generated source code 
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES (the default) 
+# then for each documented function all documented 
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES (the default) 
+# then for each documented function all documented entities 
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.  Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code 
+# will point to the HTML generated by the htags(1) tool instead of doxygen 
+# built-in source browser. The htags tool is part of GNU's global source 
+# tagging system (see http://www.gnu.org/software/global/global.html). You 
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen 
+# will generate a verbatim copy of the header file for each class for 
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index 
+# of all compounds will be generated. Enable this if the project 
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then 
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns 
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all 
+# classes will be put under the same header in the alphabetical index. 
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that 
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will 
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for 
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank 
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for 
+# each generated HTML page. If it is left blank doxygen will generate a 
+# standard header.
+
+HTML_HEADER            = 
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for 
+# each generated HTML page. If it is left blank doxygen will generate a 
+# standard footer.
+
+HTML_FOOTER            = 
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading 
+# style sheet that is used by each HTML page. It can be used to 
+# fine-tune the look of the HTML output. If the tag is left blank doxygen 
+# will generate a default style sheet. Note that doxygen will try to copy 
+# the style sheet file to the HTML output directory, so don't put your own 
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        = 
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, 
+# files or namespaces will be aligned in HTML using tables. If set to 
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files 
+# will be generated that can be used as input for tools like the 
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) 
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files 
+# will be generated that can be used as input for Apple's Xcode 3 
+# integrated development environment, introduced with OSX 10.5 (Leopard). 
+# To create a documentation set, doxygen will generate a Makefile in the 
+# HTML output directory. Running make will produce the docset in that 
+# directory and running "make install" will install the docset in 
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find 
+# it at startup.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the 
+# feed. A documentation feed provides an umbrella under which multiple 
+# documentation sets from a single provider (such as a company or product suite) 
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that 
+# should uniquely identify the documentation set bundle. This should be a 
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen 
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML 
+# documentation will contain sections that can be hidden and shown after the 
+# page has loaded. For this to work a browser that supports 
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox, 
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can 
+# be used to specify the file name of the resulting .chm file. You 
+# can add a path in front of the file if the result should not be 
+# written to the html output directory.
+
+CHM_FILE               = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can 
+# be used to specify the location (absolute path including file name) of 
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run 
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag 
+# controls if a separate .chi index file is generated (YES) or that 
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag 
+# controls whether a binary table of contents is generated (YES) or a 
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members 
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at 
+# top of each HTML page. The value NO (the default) enables the index and 
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# This tag can be used to set the number of enum values (range [1..20]) 
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be
+# generated containing a tree-like index structure (just like the one that 
+# is generated for HTML Help). For this to work a browser that supports 
+# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+, 
+# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are 
+# probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be 
+# used to set the initial width (in pixels) of the frame in which the tree 
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will 
+# generate Latex output.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be 
+# invoked. If left blank `latex' will be used as the default command name.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to 
+# generate index for LaTeX. If left blank `makeindex' will be used as the 
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact 
+# LaTeX documents. This may be useful for small projects and may help to 
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used 
+# by the printer. Possible values are: a4, a4wide, letter, legal and 
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX 
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         = 
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for 
+# the generated latex document. The header should contain everything until 
+# the first chapter. If it is left blank doxygen will generate a 
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           = 
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated 
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will 
+# contain links (just like the HTML output) instead of page references. 
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of 
+# plain latex in the generated Makefile. Set this option to YES to get a 
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode 
+# command to the generated LaTeX files. This will instruct LaTeX to keep 
+# running if errors occur, instead of asking the user for help. 
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not 
+# include the index chapters (such as File Index, Compound Index, etc.) 
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output. 
+# The RTF output is optimized for Word 97 and may not look very pretty with 
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact 
+# RTF documents. This may be useful for small projects and may help to 
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated 
+# will contain hyperlink fields. The RTF file will 
+# contain links (just like the HTML output) instead of page references. 
+# This makes the output suitable for online browsing using WORD or other 
+# programs which support those fields. 
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's 
+# config file, i.e. a series of assignments. You only have to provide 
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    = 
+
+# Set optional variables used in the generation of an rtf document. 
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will 
+# generate man pages
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to 
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output, 
+# then it will generate one additional man file for each entity 
+# documented in the real man page(s). These additional files 
+# only source the real man page, but without them the man command 
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will 
+# generate an XML file that captures the structure of 
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema, 
+# which can be used by a validating XML parser to check the 
+# syntax of the XML files.
+
+XML_SCHEMA             = 
+
+# The XML_DTD tag can be used to specify an XML DTD, 
+# which can be used by a validating XML parser to check the 
+# syntax of the XML files.
+
+XML_DTD                = 
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will 
+# dump the program listings (including syntax highlighting 
+# and cross-referencing information) to the XML output. Note that 
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will 
+# generate an AutoGen Definitions (see autogen.sf.net) file 
+# that captures the structure of the code including all 
+# documentation. Note that this feature is still experimental 
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will 
+# generate a Perl module file that captures the structure of 
+# the code including all documentation. Note that this 
+# feature is still experimental and incomplete at the 
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate 
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able 
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be 
+# nicely formatted so it can be parsed by a human reader.  This is useful 
+# if you want to understand what is going on.  On the other hand, if this 
+# tag is set to NO the size of the Perl module output will be much smaller 
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file 
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
+# This is useful so different doxyrules.make files included by the same 
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor   
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will 
+# evaluate all C-preprocessor directives found in the sources and include 
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro 
+# names in the source code. If set to NO (the default) only conditional 
+# compilation will be performed. Macro expansion can be done in a controlled 
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES 
+# then the macro expansion is limited to the macros specified with the 
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files 
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that 
+# contain include files that are not input files but should be processed by 
+# the preprocessor.
+
+INCLUDE_PATH           = 
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard 
+# patterns (like *.h and *.hpp) to filter out the header-files in the 
+# directories. If left blank, the patterns specified with FILE_PATTERNS will 
+# be used.
+
+INCLUDE_FILE_PATTERNS  = 
+
+# The PREDEFINED tag can be used to specify one or more macro names that 
+# are defined before the preprocessor is started (similar to the -D option of 
+# gcc). The argument of the tag is a list of macros of the form: name 
+# or name=definition (no spaces). If the definition and the = are 
+# omitted =1 is assumed. To prevent a macro definition from being 
+# undefined via #undef or recursively expanded use the := operator 
+# instead of the = operator.
+
+PREDEFINED             = 
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then 
+# this tag can be used to specify a list of macro names that should be expanded. 
+# The macro definition that is found in the sources will be used. 
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED      = 
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then 
+# doxygen's preprocessor will remove all function-like macros that are alone 
+# on a line, have an all uppercase name, and do not end with a semicolon. Such 
+# function macros are typically used for boiler-plate code, and will confuse 
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references   
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. 
+# Optionally an initial location of the external documentation 
+# can be added for each tagfile. The format of a tag file without 
+# this location is as follows: 
+#   TAGFILES = file1 file2 ... 
+# Adding location for the tag files is done as follows: 
+#   TAGFILES = file1=loc1 "file2 = loc2" ... 
+# where "loc1" and "loc2" can be relative or absolute paths or 
+# URLs. If a location is present for each tag, the installdox tool 
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen 
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               = 
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create 
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       = 
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed 
+# in the class index. If set to NO only the inherited external classes 
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed 
+# in the modules index. If set to NO, only the current project's groups will 
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script 
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool   
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will 
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base 
+# or super classes. Setting the tag to NO turns the diagrams off. Note that 
+# this option is superseded by the HAVE_DOT option below. This is only a 
+# fallback. It is recommended to install and use dot, since it yields more 
+# powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc 
+# command. Doxygen will then run the mscgen tool (see 
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the 
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where 
+# the mscgen tool resides. If left empty the tool is assumed to be found in the 
+# default search path.
+
+MSCGEN_PATH            = 
+
+# If set to YES, the inheritance and collaboration graphs will hide 
+# inheritance and usage relations if the target is undocumented 
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is 
+# available from the path. This tool is part of Graphviz, a graph visualization 
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section 
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = YES
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for each documented class showing the direct and 
+# indirect inheritance relations. Setting this tag to YES will force the 
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for each documented class showing the direct and 
+# indirect implementation dependencies (inheritance, containment, and 
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and 
+# collaboration diagrams in a style similar to the OMG's Unified Modeling 
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the 
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT 
+# tags are set to YES then doxygen will generate a graph for each documented 
+# file showing the direct and indirect include dependencies of the file with 
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and 
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each 
+# documented header file showing the documented files that directly or 
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then 
+# doxygen will generate a call dependency graph for every global function 
+# or class method. Note that enabling this option will significantly increase 
+# the time of a run. So in most cases it will be better to enable call graphs 
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = YES
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then 
+# doxygen will generate a caller dependency graph for every global function 
+# or class method. Note that enabling this option will significantly increase 
+# the time of a run. So in most cases it will be better to enable caller 
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = YES
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES 
+# then doxygen will show the dependencies a directory has on other directories 
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images 
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT       = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be 
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               = 
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that 
+# contain dot files that are included in the documentation (see the 
+# \dotfile command).
+
+DOTFILE_DIRS           = 
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of 
+# nodes that will be shown in the graph. If the number of nodes in a graph 
+# becomes larger than this value, doxygen will truncate the graph, which is 
+# visualized by representing a node as a red box. Note that if the 
+# number of direct children of the root node in a graph is already larger than 
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note 
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the 
+# graphs generated by dot. A depth value of 3 means that only nodes reachable 
+# from the root by following a path via at most 3 edges will be shown. Nodes 
+# that lay further from the root node will be omitted. Note that setting this 
+# option to 1 or 2 may greatly reduce the computation time needed for large 
+# code bases. Also note that the size of a graph can be further restricted by 
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent 
+# background. This is enabled by default, which results in a transparent 
+# background. Warning: Depending on the platform used, enabling this option 
+# may lead to badly anti-aliased labels on the edges of a graph (i.e. they 
+# become hard to read).
+
+DOT_TRANSPARENT        = YES
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output 
+# files in one run (i.e. multiple -o and -T options on the command line). This 
+# makes dot run faster, but since only newer versions of dot (>1.8.10) 
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will 
+# generate a legend page explaining the meaning of the various boxes and 
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will 
+# remove the intermediate dot files that are used to generate 
+# the various graphs.
+
+DOT_CLEANUP            = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine   
+#---------------------------------------------------------------------------
+
+# The SEARCHENGINE tag specifies whether or not a search engine should be 
+# used. If set to NO the values of all tags below this one will be ignored.
+
+SEARCHENGINE           = NO
diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt
new file mode 100644
index 0000000..5eb7473
--- /dev/null
+++ b/EXAMPLE/CMakeLists.txt
@@ -0,0 +1,120 @@
+# Add SRC/ to the header search path of the targets defined in this directory.
+include_directories(${SuperLU_DIST_SOURCE_DIR}/SRC)
+# Libs linked to all of the examples: superlu_dist, ${BLAS_LIB} and m (libm).
+set(all_link_libs superlu_dist ${BLAS_LIB} m)
+
+# Register a CTest test named <target> that runs the example executable
+# <target> on an <nprow> x <npcol> process grid with input EXAMPLE/<input>:
+#   ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} <nprow*npcol>
+#       ${MPIEXEC_PREFLAGS} <executable> ${MPIEXEC_POSTFLAGS} -r <nprow> -c <npcol> <input>
+function(add_superlu_dist_test target input nprow npcol)
+    set(TEST_INPUT "${SuperLU_DIST_SOURCE_DIR}/EXAMPLE/${input}")
+    # NOTE: TEST_OUTPUT is not used by the active add_test() call below.
+    set(TEST_OUTPUT "${SuperLU_DIST_BINARY_DIR}/EXAMPLE/${target}.out")
+    # Assumes the executable is written to the current binary dir -- confirm for multi-config generators.
+##  get_target_property(TEST_LOC ${target} LOCATION)
+    set(TEST_LOC ${CMAKE_CURRENT_BINARY_DIR})
+    # One MPI process per cell of the process grid.
+    MATH( EXPR procs "${nprow}*${npcol}" )
+
+    # Pass the executable by full path: with the bare target name the launcher
+    # has to locate the program itself (e.g. via PATH), which can fail.
+    add_test(${target} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${procs}
+             ${MPIEXEC_PREFLAGS} "${TEST_LOC}/${target}" ${MPIEXEC_POSTFLAGS} -r "${nprow}" -c "${npcol}" "${TEST_INPUT}")
+#     add_test(NAME ${target} COMMAND "${CMAKE_COMMAND}"
+#              -DTEST=${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${procs}
+#             ${MPIEXEC_PREFLAGS} ${target} ${MPIEXEC_POSTFLAGS} -r "${nprow}" -c "${npcol}" "${TEST_INPUT}"
+#	     -DOUTPUT=${target}.out
+#	    -P "${SuperLU_DIST_SOURCE_DIR}/EXAMPLE/runexample.cmake" )
+
+endfunction(add_superlu_dist_test)
+
+
+if(enable_double)
+  # Double-precision real examples; only pddrive and pddrive1 are registered as tests.
+  set(DEXM pddrive.c dcreate_matrix.c)
+  add_executable(pddrive ${DEXM})
+  target_link_libraries(pddrive ${all_link_libs})
+  add_superlu_dist_test(pddrive big.rua 2 2)
+  set(DEXM1 pddrive1.c dcreate_matrix.c)
+  add_executable(pddrive1 ${DEXM1})
+  target_link_libraries(pddrive1 ${all_link_libs})
+  add_superlu_dist_test(pddrive1 big.rua 2 2)
+
+  set(DEXM2 pddrive2.c dcreate_matrix.c dcreate_matrix_perturbed.c)
+  add_executable(pddrive2 ${DEXM2})
+  target_link_libraries(pddrive2 ${all_link_libs})
+
+  set(DEXM3 pddrive3.c dcreate_matrix.c)
+  add_executable(pddrive3 ${DEXM3})
+  target_link_libraries(pddrive3 ${all_link_libs})
+
+  set(DEXM4 pddrive4.c dcreate_matrix.c)
+  add_executable(pddrive4 ${DEXM4})
+  target_link_libraries(pddrive4 ${all_link_libs})
+
+  set(DEXMG pddrive_ABglobal.c)
+  add_executable(pddrive_ABglobal ${DEXMG})
+  target_link_libraries(pddrive_ABglobal ${all_link_libs})
+
+  set(DEXMG1 pddrive1_ABglobal.c)
+  add_executable(pddrive1_ABglobal ${DEXMG1})
+  target_link_libraries(pddrive1_ABglobal ${all_link_libs})
+
+  set(DEXMG2 pddrive2_ABglobal.c)
+  add_executable(pddrive2_ABglobal ${DEXMG2})
+  target_link_libraries(pddrive2_ABglobal ${all_link_libs})
+
+  set(DEXMG3 pddrive3_ABglobal.c)
+  add_executable(pddrive3_ABglobal ${DEXMG3})
+  target_link_libraries(pddrive3_ABglobal ${all_link_libs})
+
+  set(DEXMG4 pddrive4_ABglobal.c)
+  add_executable(pddrive4_ABglobal ${DEXMG4})
+  target_link_libraries(pddrive4_ABglobal ${all_link_libs})
+endif()
+
+
+if(enable_complex16)
+  # Double-precision complex examples; they are built only, no test is registered in this block.
+  set(ZEXM pzdrive.c zcreate_matrix.c)
+  add_executable(pzdrive ${ZEXM})
+  target_link_libraries(pzdrive ${all_link_libs})
+
+  set(ZEXM1 pzdrive1.c zcreate_matrix.c)
+  add_executable(pzdrive1 ${ZEXM1})
+  target_link_libraries(pzdrive1 ${all_link_libs})
+
+  set(ZEXM2 pzdrive2.c zcreate_matrix.c zcreate_matrix_perturbed.c)
+  add_executable(pzdrive2 ${ZEXM2})
+  target_link_libraries(pzdrive2 ${all_link_libs})
+
+  set(ZEXM3 pzdrive3.c zcreate_matrix.c)
+  add_executable(pzdrive3 ${ZEXM3})
+  target_link_libraries(pzdrive3 ${all_link_libs})
+
+  set(ZEXM4 pzdrive4.c zcreate_matrix.c)
+  add_executable(pzdrive4 ${ZEXM4})
+  target_link_libraries(pzdrive4 ${all_link_libs})
+
+  set(ZEXMG pzdrive_ABglobal.c)
+  add_executable(pzdrive_ABglobal ${ZEXMG})
+  target_link_libraries(pzdrive_ABglobal ${all_link_libs})
+
+  set(ZEXMG1 pzdrive1_ABglobal.c)
+  add_executable(pzdrive1_ABglobal ${ZEXMG1})
+  target_link_libraries(pzdrive1_ABglobal ${all_link_libs})
+
+  set(ZEXMG2 pzdrive2_ABglobal.c)
+  add_executable(pzdrive2_ABglobal ${ZEXMG2})
+  target_link_libraries(pzdrive2_ABglobal ${all_link_libs})
+
+  set(ZEXMG3 pzdrive3_ABglobal.c)
+  add_executable(pzdrive3_ABglobal ${ZEXMG3})
+  target_link_libraries(pzdrive3_ABglobal ${all_link_libs})
+
+  set(ZEXMG4 pzdrive4_ABglobal.c)
+  add_executable(pzdrive4_ABglobal ${ZEXMG4})
+  target_link_libraries(pzdrive4_ABglobal ${all_link_libs})
+
+endif()
diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile
new file mode 100644
index 0000000..907c4ec
--- /dev/null
+++ b/EXAMPLE/Makefile
@@ -0,0 +1,144 @@
+#######################################################################
+#
+#  This makefile creates the example programs for the linear equation
+#  routines in SuperLU_DIST.
+#
+#  The command
+#       make
+#  without any arguments creates all the example programs.
+#  The command
+# 	make double
+#  creates double precision real example programs.
+#  The command
+#       make complex16
+#  creates double precision complex example programs.
+#
+#  The executable files are called
+#       double real:	pddrive pddrive_ABglobal pddrive1
+#                       pddrive1_ABglobal pddrive2 pddrive3 pddrive4
+#	double complex: pzdrive pzdrive_ABglobal pzdrive1
+#                       pzdrive1_ABglobal pzdrive2 pzdrive3 pzdrive4 
+#
+#  Alternatively, you can create example programs individually by
+#  typing the command (for example)
+#	make pddrive
+#
+#  To remove the object files after the executable files have been
+#  created, enter
+#       make clean
+#
+#######################################################################
+# ../make.inc presumably supplies the tool/library variables (CC, LOADER, LIBS, ...) not set in this file -- confirm.
+include ../make.inc
+INCLUDEDIR = -I../SRC
+# Object lists per example: DEXM* for the double real programs, ZEXM* for the double complex ones.
+DEXM	= pddrive.o dcreate_matrix.o #pdgstrf2.o 
+# Commented-out extra objects for DEXM: pdgssvx.o pdgstrs_lsum_X1.o pdgstrf_X1.o
+DEXM1	= pddrive1.o dcreate_matrix.o
+DEXM2	= pddrive2.o dcreate_matrix.o dcreate_matrix_perturbed.o
+DEXM3	= pddrive3.o dcreate_matrix.o
+DEXM4	= pddrive4.o dcreate_matrix.o
+DEXMG	= pddrive_ABglobal.o
+DEXMG1	= pddrive1_ABglobal.o
+DEXMG2	= pddrive2_ABglobal.o
+DEXMG3	= pddrive3_ABglobal.o
+DEXMG4	= pddrive4_ABglobal.o
+ZEXM	= pzdrive.o zcreate_matrix.o
+# Commented-out extra objects for ZEXM: pzgstrf2.o pzgstrf_v3.3.o pzgstrf.o
+ZEXM1	= pzdrive1.o zcreate_matrix.o
+ZEXM2	= pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o
+ZEXM3	= pzdrive3.o zcreate_matrix.o
+ZEXM4	= pzdrive4.o zcreate_matrix.o
+ZEXMG	= pzdrive_ABglobal.o
+ZEXMG1	= pzdrive1_ABglobal.o
+ZEXMG2	= pzdrive2_ABglobal.o
+ZEXMG3	= pzdrive3_ABglobal.o
+ZEXMG4	= pzdrive4_ABglobal.o
+
+
+# Aggregate targets: "all" builds both groups; "double" and "complex16" build one group each.
+all: double complex16
+double:    pddrive pddrive1 pddrive2 pddrive3 pddrive4 \
+	   pddrive_ABglobal pddrive1_ABglobal pddrive2_ABglobal \
+	   pddrive3_ABglobal pddrive4_ABglobal
+
+complex16: pzdrive pzdrive1 pzdrive2 pzdrive3 pzdrive4 \
+	   pzdrive_ABglobal pzdrive1_ABglobal pzdrive2_ABglobal \
+	   pzdrive3_ABglobal pzdrive4_ABglobal
+
+# Link rules: each example is linked from its object list plus $(LIBS) and -lm, and depends on $(DSUPERLULIB).
+pddrive: $(DEXM) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(DEXM) $(LIBS) -lm -o $@
+pddrive1: $(DEXM1) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(DEXM1) $(LIBS) -lm -o $@
+
+pddrive2: $(DEXM2) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(DEXM2) $(LIBS) -lm -o $@
+
+pddrive3: $(DEXM3) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(DEXM3) $(LIBS) -lm -o $@
+
+pddrive4: $(DEXM4) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(DEXM4) $(LIBS) -lm -o $@
+
+pddrive_ABglobal: $(DEXMG) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(DEXMG) $(LIBS) -lm -o $@
+
+pddrive1_ABglobal: $(DEXMG1) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(DEXMG1) $(LIBS) -lm -o $@
+
+pddrive2_ABglobal: $(DEXMG2) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(DEXMG2) $(LIBS) -lm -o $@
+
+pddrive3_ABglobal: $(DEXMG3) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(DEXMG3) $(LIBS) -lm -o $@
+
+pddrive4_ABglobal: $(DEXMG4) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(DEXMG4) $(LIBS) -lm -o $@
+# Double complex examples.
+pzdrive: $(ZEXM) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXM) $(LIBS) -lm -o $@
+# pzdrive_triple links the same object list as pzdrive under a different output name.
+pzdrive_triple: $(ZEXM) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXM) $(LIBS) -lm -o $@
+
+pzdrive1: $(ZEXM1) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXM1) $(LIBS) -lm -o $@
+
+pzdrive2: $(ZEXM2) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXM2) $(LIBS) -lm -o $@
+
+pzdrive3: $(ZEXM3) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXM3) $(LIBS) -lm -o $@
+
+pzdrive4: $(ZEXM4) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXM4) $(LIBS) -lm -o $@
+
+pzdrive_ABglobal: $(ZEXMG) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXMG) $(LIBS) -lm -o $@
+
+pzdrive1_ABglobal: $(ZEXMG1) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXMG1) $(LIBS) -lm -o $@
+
+pzdrive2_ABglobal: $(ZEXMG2) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXMG2) $(LIBS) -lm -o $@
+
+pzdrive3_ABglobal: $(ZEXMG3) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXMG3) $(LIBS) -lm -o $@
+
+pzdrive4_ABglobal: $(ZEXMG4) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXMG4) $(LIBS) -lm -o $@
+
+# Suffix rules compiling .c and .f sources to objects; the explicit pdgstrf.o rule below is commented out.
+#pdgstrf.o: dscatter.c dSchCompUdt-cuda.c pdgstrf.c
+#	$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c pdgstrf.c $(VERBOSE)
+.c.o:
+	$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c $< $(VERBOSE)
+.f.o:
+	$(FORTRAN) $(FFLAGS) -c $< $(VERBOSE)
+
+# Remove the object files and every example executable, including pzdrive_triple.
+.PHONY: clean
+clean:
+	rm -f *.o p[dz]drive p[dz]drive[1-9] pzdrive_triple \
+	p[dz]drive_ABglobal p[dz]drive[1-9]_ABglobal
diff --git a/EXAMPLE/README b/EXAMPLE/README
new file mode 100644
index 0000000..f773812
--- /dev/null
+++ b/EXAMPLE/README
@@ -0,0 +1,52 @@
+		SuperLU_DIST  EXAMPLES
+		======================
+
+This directory contains sample programs to illustrate how to use
+various functions provided in SuperLU_DIST. You can modify these
+examples to suit your applications.
+
+The examples illustrate the following functionalities:
+  1. pddrive.c, pddrive_ABglobal.c
+     Use PDGSSVX with the full (default) options to solve a linear system.
+  2. pddrive1.c, pddrive1_ABglobal.c
+     Solve the systems with the same A but different right-hand sides.
+     (Reuse the factored form of A)
+  3. pddrive2.c, pddrive2_ABglobal.c
+     Solve the systems with the same sparsity pattern of A.
+     (Reuse the sparsity ordering)
+  4. pddrive3.c, pddrive3_ABglobal.c
+     Solve the systems with the same sparsity pattern and similar values.
+  5. pddrive4.c, pddrive4_ABglobal.c
+     Divide the processors into two subgroups (two grids) such that each
+     subgroup solves a linear system independently from the other.
+
+
+The command line options "-r <process rows>" and "-c <process columns>"
+define the 2-D process grid. The total number of processes <procs> is:
+	<procs> = <process rows> * <process columns>
+If these options are not provided at the command line, the programs
+will use 1 processor by default in each case.
+
+Three input matrices (Harwell-Boeing format) are provided in this directory:
+	g20.rua  -- a real matrix of dimension 400x400
+	big.rua  -- a real matrix of dimension 4960x4960
+        cg20.cua -- a complex matrix of dimension 400x400
+
+The command lines given below show how to run the parallel programs
+using "mpiexec". You may need to replace mpiexec with a platform-specific
+command.
+
+1. To run the real examples (pddrive, pddrive1, etc.)
+   you may type:
+   % mpiexec -n <np> pddrive -r <process row> -c <process columns> g20.rua 
+     (e.g., mpiexec -n 4 pddrive -r 2 -c 2 g20.rua)
+
+2. To run the real examples pddrive4 and pddrive4_ABglobal, you may type:
+   % mpiexec -n 10 pddrive4 g20.rua
+   
+3. To run the complex examples (pzdrive, pzdrive1, etc.),
+   you may type:
+   % mpiexec -n <np> pzdrive -r <process row> -c <process columns> cg20.cua
+
+4. To run the complex examples pzdrive4 and pzdrive4_ABglobal, you may type:
+   % mpiexec -n 10 pzdrive4 cg20.cua
diff --git a/EXAMPLE/big.rua b/EXAMPLE/big.rua
new file mode 100644
index 0000000..3a5c16f
--- /dev/null
+++ b/EXAMPLE/big.rua
@@ -0,0 +1,11496 @@
+32-bit adder, from Steve Hamm (Motorola) hamm at austoto.sps.mot.com       add32   
+         11491           382          1493          7962          1654
+RUA                     4960          4960         23884             0
+(13i6)          (16i5)          (3e26.18)           (3e26.18)           
+F                          1             0
+     1    30    42    45    57    65    73    76    87   111   114   117   127
+   130   144   157   165   178   181   191   203   206   214   226   236   239
+   251   254   263   274   278   288   310   321   324   340   350   359   362
+   373   397   400   404   417   420   430   440   450   468   471   484   496
+   499   509   520   530   534   546   549   558   570   573   581   609   622
+   625   635   644   653   656   671   695   698   702   714   717   728   739
+   752   765   768   778   793   796   804   816   826   829   841   844   853
+   865   868   876   899   912   915   925   934   943   946   961   988   991
+   995  1009  1012  1022  1031  1043  1058  1061  1071  1087  1090  1098  1110
+  1121  1124  1136  1140  1149  1160  1164  1174  1202  1214  1218  1230  1239
+  1248  1251  1263  1288  1291  1294  1304  1307  1319  1330  1343  1356  1359
+  1371  1385  1388  1398  1409  1418  1422  1436  1440  1449  1461  1464  1472
+  1501  1512  1516  1532  1542  1551  1554  1565  1590  1593  1596  1607  1611
+  1623  1634  1646  1660  1663  1675  1689  1692  1700  1712  1722  1725  1737
+  1740  1749  1761  1764  1772  1801  1812  1816  1832  1842  1851  1854  1865
+  1891  1894  1897  1910  1913  1923  1932  1945  1959  1962  1974  1990  1993
+  2003  2014  2024  2028  2039  2043  2052  2064  2067  2075  2101  2113  2117
+  2131  2140  2149  2152  2163  2190  2193  2197  2209  2212  2224  2234  2244
+  2258  2261  2271  2286  2289  2297  2309  2319  2322  2334  2337  2346  2358
+  2361  2369  2399  2411  2414  2425  2434  2443  2446  2459  2483  2486  2490
+  2504  2507  2517  2526  2536  2552  2555  2565  2581  2584  2592  2604  2614
+  2617  2629  2632  2641  2653  2656  2664  2696  2708  2711  2722  2731  2740
+  2743  2755  2780  2783  2787  2799  2802  2813  2827  2839  2852  2855  2868
+  2879  2882  2890  2902  2912  2915  2927  2930  2939  2951  2954  2962  2992
+  3004  3007  3018  3027  3036  3039  3051  3078  3081  3085  3096  3099  3114
+  3126  3139  3150  3153  3165  3177  3180  3188  3200  3210  3213  3225  3228
+  3237  3248  3252  3262  3292  3304  3307  3318  3327  3336  3339  3351  3377
+  3380  3384  3395  3398  3412  3425  3437  3449  3452  3465  3476  3479  3487
+  3499  3509  3512  3524  3527  3536  3548  3551  3559  3586  3598  3601  3614
+  3623  3632  3635  3647  3673  3676  3680  3691  3694  3709  3722  3735  3746
+  3749  3761  3773  3776  3784  3796  3806  3809  3821  3824  3833  3845  3848
+  3856  3886  3898  3901  3914  3923  3932  3935  3946  3973  3976  3980  3991
+  3994  4006  4019  4031  4044  4047  4057  4070  4073  4081  4093  4103  4106
+  4118  4121  4130  4142  4145  4153  4180  4193  4196  4206  4215  4224  4227
+  4241  4267  4270  4274  4287  4290  4300  4312  4322  4338  4341  4353  4365
+  4368  4376  4388  4398  4401  4413  4416  4425  4436  4440  4450  4476  4489
+  4492  4502  4511  4520  4523  4537  4566  4569  4573  4584  4587  4602  4614
+  4626  4638  4641  4651  4664  4667  4677  4688  4697  4701  4712  4715  4724
+  4736  4739  4747  4779  4792  4795  4805  4814  4823  4826  4841  4864  4867
+  4871  4882  4885  4897  4908  4920  4934  4937  4948  4962  4965  4973  4985
+  4995  4998  5010  5013  5022  5034  5037  5045  5070  5082  5085  5096  5105
+  5114  5117  5137  5164  5167  5171  5183  5186  5200  5213  5225  5237  5240
+  5252  5264  5267  5275  5287  5297  5300  5312  5315  5324  5336  5339  5347
+  5376  5389  5392  5402  5411  5420  5423  5437  5463  5466  5470  5481  5484
+  5498  5512  5526  5537  5540  5553  5564  5567  5575  5587  5597  5600  5612
+  5615  5624  5636  5639  5647  5674  5687  5690  5700  5709  5718  5721  5735
+  5759  5762  5766  5777  5780  5792  5805  5815  5829  5832  5843  5856  5859
+  5867  5879  5889  5894  5906  5909  5918  5930  5935  5943  5970  5982  5985
+  5997  6007  6016  6019  6033  6056  6059  6062  6072  6075  6088  6098  6111
+  6125  6129  6142  6156  6160  6169  6179  6189  6193  6205  6208  6217  6228
+  6233  6240  6264  6275  6278  6294  6304  6313  6316  6327  6351  6354  6357
+  6367  6370  6383  6393  6406  6420  6424  6437  6451  6455  6463  6475  6485
+  6490  6502  6505  6514  6526  6531  6539  6563  6574  6577  6596  6605  6613
+  6616  6627  6652  6655  6658  6671  6674  6684  6693  6706  6721  6725  6737
+  6754  6758  6765  6776  6786  6791  6803  6806  6815  6825  6829  6838  6867
+  6879  6882  6897  6907  6916  6919  6931  6955  6958  6961  6972  6975  6987
+  6997  7010  7023  7027  7039  7054  7058  7066  7078  7088  7093  7105  7108
+  7117  7129  7134  7142  7168  7179  7182  7197  7207  7216  7219  7230  7255
+  7258  7261  7271  7274  7287  7297  7309  7324  7328  7341  7354  7357  7364
+  7374  7383  7388  7400  7403  7412  7423  7428  7435  7465  7478  7481  7491
+  7500  7509  7512  7527  7551  7554  7557  7567  7570  7583  7593  7604  7618
+  7622  7633  7648  7652  7663  7673  7683  7687  7699  7702  7711  7722  7728
+  7737  7765  7778  7781  7791  7800  7809  7812  7827  7851  7854  7857  7867
+  7870  7883  7893  7903  7917  7920  7930  7945  7948  7957  7968  7978  7984
+  7996  7999  8008  8019  8025  8034  8061  8074  8077  8087  8096  8105  8108
+  8123  8150  8154  8158  8171  8174  8184  8196  8206  8222  8225  8236  8249
+  8253  8262  8273  8283  8289  8301  8304  8313  8323  8327  8338  8369  8381
+  8384  8395  8404  8413  8416  8429  8451  8454  8458  8469  8472  8484  8495
+  8505  8520  8523  8534  8548  8551  8560  8571  8581  8587  8599  8602  8611
+  8622  8628  8637  8663  8676  8679  8689  8698  8707  8710  8724  8750  8754
+  8758  8769  8772  8784  8795  8805  8820  8823  8834  8848  8852  8861  8872
+  8882  8888  8900  8903  8912  8922  8926  8937  8959  8971  8974  8986  8995
+  9004  9007  9018  9044  9048  9052  9063  9066  9079  9089  9099  9113  9116
+  9127  9142  9146  9155  9166  9176  9182  9194  9197  9206  9216  9220  9231
+  9234  9244  9247  9259  9269  9278  9281  9292  9317  9320  9323  9333  9336
+  9352  9362  9376  9387  9391  9404  9417  9421  9432  9442  9452  9456  9468
+  9471  9480  9491  9497  9506  9510  9513  9518  9521  9525  9532  9537  9540
+  9542  9545  9548  9550  9554  9557  9559  9562  9566  9569  9574  9577  9582
+  9585  9590  9596  9599  9601  9605  9608  9611  9615  9619  9622  9626  9629
+  9633  9636  9638  9641  9645  9648  9651  9653  9655  9658  9662  9665  9669
+  9672  9677  9680  9684  9691  9695  9698  9700  9703  9707  9710  9713  9715
+  9719  9722  9726  9729  9734  9737  9742  9748  9752  9755  9757  9760  9763
+  9765  9769  9772  9775  9779  9783  9786  9790  9793  9798  9801  9805  9813
+  9817  9820  9824  9827  9830  9834  9838  9841  9845  9848  9852  9855  9858
+  9862  9866  9869  9872  9874  9876  9879  9883  9886  9890  9893  9898  9901
+  9905  9912  9916  9919  9921  9924  9928  9931  9934  9936  9940  9943  9947
+  9950  9955  9958  9962  9970  9974  9977  9979  9982  9985  9987  9991  9994
+  9997 10001 10005 10008 10012 10015 10020 10023 10027 10034 10038 10041 10045
+ 10048 10051 10055 10059 10062 10066 10069 10073 10076 10079 10083 10087 10090
+ 10093 10095 10097 10100 10104 10107 10111 10114 10119 10122 10126 10133 10137
+ 10140 10142 10145 10149 10152 10155 10157 10161 10164 10168 10171 10176 10179
+ 10183 10190 10194 10197 10199 10202 10205 10207 10211 10214 10217 10221 10225
+ 10228 10232 10235 10240 10243 10247 10256 10260 10263 10267 10270 10273 10277
+ 10281 10284 10288 10291 10295 10298 10301 10305 10309 10312 10315 10317 10319
+ 10322 10326 10329 10333 10336 10340 10343 10347 10353 10357 10360 10362 10365
+ 10369 10372 10375 10377 10381 10384 10388 10391 10395 10398 10402 10408 10412
+ 10415 10417 10420 10423 10425 10429 10432 10435 10439 10443 10446 10451 10454
+ 10459 10462 10467 10473 10477 10480 10484 10487 10490 10494 10498 10501 10505
+ 10508 10512 10515 10518 10522 10526 10529 10532 10534 10536 10539 10543 10546
+ 10550 10553 10558 10561 10565 10570 10574 10577 10579 10582 10586 10589 10592
+ 10594 10598 10601 10605 10608 10612 10615 10620 10625 10629 10632 10634 10637
+ 10640 10642 10646 10649 10652 10656 10660 10663 10668 10671 10675 10678 10682
+ 10688 10692 10695 10699 10702 10705 10709 10713 10716 10720 10723 10727 10730
+ 10733 10737 10741 10744 10747 10749 10751 10754 10758 10761 10765 10768 10773
+ 10776 10780 10787 10791 10794 10796 10799 10803 10806 10809 10811 10815 10818
+ 10822 10825 10829 10832 10837 10842 10846 10849 10851 10854 10857 10859 10863
+ 10866 10869 10873 10877 10880 10885 10888 10893 10896 10900 10909 10913 10916
+ 10920 10923 10926 10930 10934 10937 10941 10944 10948 10951 10954 10958 10962
+ 10965 10968 10970 10972 10975 10979 10982 10986 10989 10993 10996 11000 11007
+ 11011 11014 11016 11019 11023 11026 11029 11031 11035 11038 11042 11045 11049
+ 11052 11056 11062 11066 11069 11071 11074 11077 11079 11083 11086 11089 11093
+ 11097 11100 11104 11107 11112 11115 11119 11126 11130 11133 11137 11140 11143
+ 11147 11151 11154 11158 11161 11165 11168 11171 11175 11179 11182 11185 11187
+ 11189 11192 11196 11199 11203 11206 11211 11214 11218 11225 11229 11232 11234
+ 11237 11241 11244 11247 11249 11253 11256 11260 11263 11268 11271 11275 11282
+ 11286 11289 11291 11294 11297 11299 11303 11306 11309 11313 11317 11320 11324
+ 11327 11332 11335 11339 11348 11352 11355 11359 11362 11365 11369 11373 11376
+ 11380 11383 11387 11390 11393 11397 11401 11404 11407 11409 11411 11414 11418
+ 11421 11425 11428 11433 11436 11440 11447 11451 11454 11456 11459 11463 11466
+ 11469 11471 11475 11478 11482 11485 11490 11493 11497 11504 11508 11511 11513
+ 11516 11519 11521 11525 11528 11531 11535 11539 11542 11546 11549 11554 11557
+ 11561 11568 11572 11575 11579 11582 11585 11589 11593 11596 11600 11603 11607
+ 11610 11613 11617 11621 11624 11627 11629 11631 11634 11638 11641 11645 11648
+ 11653 11656 11660 11667 11671 11674 11676 11679 11683 11686 11689 11691 11695
+ 11698 11702 11705 11710 11713 11717 11724 11728 11731 11733 11736 11739 11741
+ 11745 11748 11751 11755 11759 11762 11766 11769 11774 11777 11782 11788 11792
+ 11795 11799 11802 11805 11809 11813 11816 11820 11823 11827 11830 11833 11837
+ 11841 11844 11847 11849 11851 11854 11858 11861 11865 11868 11873 11876 11880
+ 11887 11891 11894 11896 11899 11903 11906 11909 11911 11915 11918 11922 11925
+ 11930 11933 11937 11944 11948 11951 11953 11956 11959 11961 11965 11968 11971
+ 11975 11979 11982 11986 11989 11994 11997 12002 12008 12012 12015 12019 12022
+ 12025 12029 12033 12036 12040 12043 12047 12050 12053 12057 12061 12064 12067
+ 12069 12071 12074 12078 12081 12085 12088 12093 12096 12100 12107 12111 12114
+ 12116 12119 12123 12126 12129 12131 12135 12138 12142 12145 12150 12153 12157
+ 12164 12168 12171 12173 12176 12179 12181 12185 12188 12191 12195 12199 12202
+ 12206 12209 12214 12217 12222 12228 12232 12235 12239 12242 12245 12249 12253
+ 12256 12260 12263 12267 12270 12273 12277 12281 12284 12287 12289 12291 12294
+ 12298 12301 12305 12308 12313 12316 12320 12327 12331 12334 12336 12339 12343
+ 12346 12349 12351 12355 12358 12362 12365 12370 12373 12377 12384 12388 12391
+ 12393 12396 12399 12401 12405 12408 12411 12415 12419 12422 12426 12429 12434
+ 12437 12442 12448 12452 12455 12459 12462 12465 12469 12473 12476 12480 12483
+ 12487 12490 12493 12497 12501 12504 12507 12509 12511 12514 12518 12521 12525
+ 12528 12533 12536 12540 12547 12551 12554 12556 12559 12563 12566 12569 12571
+ 12575 12578 12582 12585 12590 12593 12597 12605 12609 12612 12614 12617 12620
+ 12622 12626 12629 12632 12636 12640 12643 12647 12650 12655 12658 12662 12670
+ 12674 12677 12681 12684 12687 12691 12695 12698 12702 12705 12709 12712 12715
+ 12719 12723 12726 12729 12731 12733 12736 12740 12743 12747 12750 12755 12758
+ 12762 12769 12773 12776 12778 12781 12785 12788 12791 12793 12797 12800 12804
+ 12807 12812 12815 12819 12827 12831 12834 12836 12839 12842 12844 12848 12851
+ 12854 12858 12862 12865 12869 12872 12877 12880 12885 12891 12895 12898 12902
+ 12905 12908 12912 12916 12919 12923 12926 12930 12933 12936 12940 12944 12947
+ 12950 12952 12954 12957 12961 12964 12968 12971 12976 12979 12983 12991 12995
+ 12998 13000 13003 13007 13010 13013 13015 13019 13022 13026 13029 13034 13037
+ 13041 13049 13053 13056 13058 13061 13064 13066 13070 13073 13076 13080 13084
+ 13087 13091 13094 13099 13102 13107 13113 13117 13120 13124 13127 13130 13134
+ 13138 13141 13145 13148 13152 13155 13158 13162 13166 13169 13172 13174 13176
+ 13179 13183 13186 13190 13193 13198 13201 13205 13212 13216 13219 13221 13224
+ 13228 13231 13234 13236 13240 13243 13247 13250 13255 13258 13262 13269 13273
+ 13276 13278 13281 13284 13286 13290 13293 13296 13300 13304 13307 13311 13314
+ 13319 13322 13326 13333 13337 13340 13344 13347 13350 13354 13358 13361 13365
+ 13368 13372 13375 13378 13382 13386 13389 13392 13394 13396 13399 13403 13406
+ 13410 13413 13418 13421 13425 13432 13436 13439 13441 13444 13448 13451 13454
+ 13456 13460 13463 13467 13470 13475 13478 13482 13490 13494 13497 13499 13502
+ 13505 13507 13511 13514 13517 13521 13525 13528 13532 13535 13540 13543 13548
+ 13554 13558 13561 13565 13568 13571 13575 13579 13582 13586 13589 13593 13596
+ 13599 13603 13607 13610 13613 13615 13617 13620 13624 13627 13631 13634 13639
+ 13642 13646 13653 13657 13660 13662 13665 13669 13672 13675 13677 13681 13684
+ 13688 13691 13696 13699 13703 13710 13714 13717 13719 13722 13725 13727 13731
+ 13734 13737 13741 13745 13748 13752 13755 13760 13763 13768 13774 13778 13781
+ 13785 13788 13791 13795 13799 13802 13806 13809 13813 13816 13819 13823 13827
+ 13830 13833 13835 13837 13840 13844 13847 13851 13854 13859 13862 13866 13873
+ 13877 13880 13882 13885 13889 13892 13895 13897 13901 13904 13908 13911 13916
+ 13919 13924 13930 13934 13937 13939 13942 13945 13947 13951 13954 13957 13961
+ 13965 13968 13973 13976 13981 13984 13989 13995 13999 14002 14006 14009 14012
+ 14016 14020 14023 14027 14030 14034 14037 14040 14044 14048 14051 14054 14056
+ 14058 14061 14066 14069 14073 14076 14081 14084 14088 14095 14099 14102 14104
+ 14107 14112 14115 14118 14120 14124 14127 14131 14134 14139 14142 14147 14153
+ 14157 14160 14162 14165 14168 14170 14174 14177 14180 14184 14188 14191 14196
+ 14199 14204 14207 14212 14218 14222 14225 14229 14232 14235 14239 14243 14246
+ 14250 14253 14257 14260 14263 14267 14271 14274 14277 14279 14281 14284 14288
+ 14291 14295 14298 14303 14306 14310 14317 14321 14324 14326 14329 14333 14336
+ 14339 14341 14345 14348 14352 14355 14360 14363 14368 14374 14379 14382 14384
+ 14387 14390 14392 14396 14399 14402 14406 14410 14413 14418 14421 14426 14429
+ 14433 14442 14446 14449 14453 14456 14459 14463 14467 14470 14474 14477 14481
+ 14484 14487 14491 14495 14498 14501 14503 14505 14508 14513 14516 14520 14523
+ 14528 14531 14535 14542 14546 14549 14551 14554 14559 14562 14565 14567 14571
+ 14574 14578 14581 14586 14589 14594 14600 14604 14607 14609 14612 14615 14617
+ 14621 14624 14627 14631 14635 14638 14643 14646 14651 14654 14658 14665 14669
+ 14672 14676 14679 14682 14686 14690 14693 14697 14700 14704 14707 14710 14714
+ 14718 14721 14724 14726 14728 14731 14735 14738 14742 14745 14750 14753 14757
+ 14764 14768 14771 14773 14776 14780 14783 14786 14788 14792 14795 14799 14802
+ 14807 14810 14815 14821 14825 14828 14830 14833 14836 14838 14842 14845 14848
+ 14852 14856 14859 14864 14867 14872 14875 14880 14886 14890 14893 14897 14900
+ 14903 14907 14911 14914 14918 14921 14925 14928 14931 14935 14939 14942 14945
+ 14947 14949 14952 14957 14960 14964 14967 14972 14975 14979 14986 14990 14993
+ 14995 14998 15003 15006 15009 15011 15016 15019 15023 15026 15031 15034 15038
+ 15046 15050 15053 15055 15058 15061 15063 15067 15070 15073 15077 15081 15084
+ 15089 15092 15097 15100 15105 15111 15115 15118 15122 15125 15128 15132 15136
+ 15139 15143 15146 15150 15153 15156 15160 15164 15167 15170 15172 15174 15177
+ 15181 15184 15188 15191 15196 15199 15203 15210 15214 15217 15219 15222 15226
+ 15229 15232 15234 15238 15241 15245 15248 15253 15256 15260 15268 15272 15275
+ 15277 15280 15283 15285 15289 15292 15295 15299 15303 15306 15311 15314 15319
+ 15322 15327 15333 15337 15340 15344 15347 15350 15354 15358 15361 15365 15368
+ 15372 15375 15378 15382 15386 15389 15392 15394 15396 15399 15403 15406 15410
+ 15413 15418 15421 15425 15432 15436 15439 15441 15444 15448 15451 15454 15456
+ 15460 15463 15467 15470 15475 15478 15482 15490 15494 15497 15499 15502 15505
+ 15507 15511 15514 15517 15521 15525 15528 15532 15535 15540 15543 15547 15555
+ 15559 15562 15566 15569 15572 15576 15580 15583 15587 15590 15594 15597 15600
+ 15604 15608 15611 15614 15616 15618 15621 15625 15628 15632 15635 15640 15643
+ 15647 15654 15658 15661 15663 15666 15670 15673 15676 15678 15682 15685 15689
+ 15692 15697 15700 15704 15711 15715 15718 15720 15723 15726 15728 15732 15735
+ 15738 15742 15746 15749 15753 15756 15761 15764 15769 15775 15779 15782 15786
+ 15789 15792 15796 15800 15803 15807 15810 15814 15817 15820 15824 15828 15831
+ 15834 15836 15838 15841 15845 15848 15852 15855 15860 15863 15867 15874 15878
+ 15881 15883 15886 15890 15893 15896 15898 15902 15905 15909 15912 15917 15920
+ 15924 15932 15936 15939 15941 15944 15947 15949 15953 15956 15959 15963 15967
+ 15970 15974 15977 15982 15985 15990 15996 16000 16003 16007 16010 16013 16017
+ 16021 16024 16028 16031 16035 16038 16041 16045 16049 16052 16055 16057 16059
+ 16062 16066 16069 16073 16076 16081 16084 16088 16095 16099 16102 16104 16107
+ 16111 16114 16117 16119 16123 16126 16130 16133 16138 16141 16145 16152 16156
+ 16159 16161 16164 16167 16169 16173 16176 16179 16183 16187 16190 16194 16197
+ 16202 16205 16210 16216 16220 16223 16227 16230 16233 16237 16241 16244 16248
+ 16251 16255 16258 16261 16265 16269 16272 16275 16277 16279 16282 16286 16289
+ 16293 16296 16301 16304 16308 16315 16319 16322 16324 16327 16331 16334 16337
+ 16339 16343 16346 16351 16354 16359 16362 16367 16373 16377 16380 16382 16385
+ 16388 16390 16394 16397 16400 16404 16408 16411 16416 16419 16424 16427 16432
+ 16438 16442 16445 16449 16452 16455 16459 16463 16466 16470 16473 16477 16480
+ 16483 16487 16491 16494 16497 16499 16501 16504 16508 16511 16515 16518 16523
+ 16526 16530 16537 16541 16544 16546 16549 16553 16556 16559 16561 16565 16568
+ 16572 16575 16579 16582 16585 16589 16594 16597 16600 16603 16606 16608 16612
+ 16615 16620 16623 16628 16631 16635 16638 16641 16645 16649 16652 16656 16662
+ 16668 16671 16674 16676 16680 16683 16686 16691 16697 16700 16704 16707 16711
+ 16714 16717 16722 16725 16727 16731 16734 16738 16741 16744 16748 16752 16755
+ 16758 16763 16767 16770 16773 16775 16781 16784 16787 16793 16797 16800 16804
+ 16807 16810 16814 16818 16821 16824 16827 16830 16832 16836 16839 16844 16847
+ 16851 16854 16858 16861 16864 16868 16872 16875 16879 16887 16892 16895 16899
+ 16902 16906 16909 16914 16920 16925 16928 16932 16935 16939 16942 16945 16950
+ 16953 16955 16959 16962 16966 16969 16972 16976 16980 16983 16986 16991 16997
+ 17000 17003 17005 17009 17012 17017 17023 17027 17030 17034 17037 17040 17044
+ 17048 17051 17054 17057 17060 17062 17066 17069 17074 17077 17081 17084 17088
+ 17091 17094 17098 17102 17105 17110 17116 17121 17124 17128 17131 17135 17138
+ 17142 17150 17155 17158 17162 17165 17169 17172 17175 17180 17183 17185 17189
+ 17192 17196 17199 17202 17206 17210 17213 17216 17221 17227 17230 17233 17235
+ 17241 17244 17248 17255 17259 17262 17266 17269 17272 17276 17280 17283 17286
+ 17289 17292 17294 17298 17301 17306 17309 17313 17316 17320 17323 17326 17330
+ 17334 17337 17342 17348 17353 17356 17360 17363 17367 17370 17374 17382 17387
+ 17390 17394 17397 17401 17404 17407 17412 17415 17417 17421 17424 17428 17431
+ 17434 17438 17442 17445 17448 17453 17457 17460 17463 17465 17471 17474 17479
+ 17485 17489 17492 17496 17499 17502 17506 17510 17513 17516 17519 17522 17524
+ 17528 17531 17536 17539 17544 17547 17551 17554 17557 17561 17565 17568 17573
+ 17579 17584 17587 17591 17594 17598 17601 17606 17612 17617 17620 17624 17627
+ 17631 17634 17637 17642 17645 17647 17651 17654 17658 17661 17664 17668 17672
+ 17675 17678 17683 17689 17692 17695 17697 17701 17704 17709 17715 17719 17722
+ 17726 17729 17732 17736 17740 17743 17746 17749 17752 17754 17758 17761 17766
+ 17769 17774 17777 17781 17784 17787 17791 17795 17798 17802 17809 17814 17817
+ 17821 17824 17828 17831 17836 17842 17847 17850 17854 17857 17861 17864 17867
+ 17872 17875 17877 17881 17884 17888 17891 17894 17898 17902 17905 17908 17913
+ 17919 17922 17925 17927 17933 17936 17941 17947 17951 17954 17958 17961 17964
+ 17968 17972 17975 17978 17981 17984 17986 17990 17993 17998 18001 18006 18009
+ 18013 18016 18019 18023 18027 18030 18035 18041 18046 18049 18053 18056 18060
+ 18063 18068 18074 18079 18082 18086 18089 18093 18096 18099 18104 18107 18109
+ 18113 18116 18121 18124 18127 18131 18135 18138 18141 18146 18152 18155 18158
+ 18160 18164 18167 18172 18178 18182 18185 18189 18192 18195 18199 18203 18206
+ 18209 18212 18215 18217 18221 18224 18229 18232 18236 18239 18243 18246 18249
+ 18253 18257 18260 18264 18272 18277 18280 18284 18287 18291 18294 18298 18306
+ 18311 18314 18318 18321 18325 18328 18331 18336 18339 18341 18345 18348 18352
+ 18355 18358 18362 18366 18369 18372 18377 18383 18386 18389 18391 18397 18400
+ 18405 18411 18415 18418 18422 18425 18428 18432 18436 18439 18442 18445 18448
+ 18450 18454 18457 18462 18465 18469 18472 18476 18479 18482 18486 18490 18493
+ 18497 18505 18510 18513 18517 18520 18524 18527 18531 18539 18544 18547 18551
+ 18554 18558 18561 18564 18569 18572 18574 18578 18581 18585 18588 18591 18595
+ 18599 18602 18605 18610 18616 18619 18622 18624 18630 18633 18637 18644 18648
+ 18651 18655 18658 18661 18665 18669 18672 18675 18678 18681 18683 18687 18690
+ 18695 18698 18702 18705 18709 18712 18715 18719 18723 18726 18731 18737 18742
+ 18745 18749 18752 18756 18759 18764 18770 18775 18778 18782 18785 18789 18792
+ 18795 18800 18803 18805 18809 18812 18816 18819 18822 18826 18830 18833 18836
+ 18841 18847 18850 18853 18855 18861 18864 18869 18875 18879 18882 18886 18889
+ 18892 18896 18900 18903 18906 18909 18912 18914 18918 18921 18926 18929 18933
+ 18936 18940 18943 18946 18950 18954 18957 18962 18968 18973 18976 18980 18983
+ 18987 18990 18995 19001 19006 19009 19013 19016 19020 19023 19026 19031 19034
+ 19036 19040 19043 19047 19050 19053 19057 19061 19064 19067 19072 19076 19079
+ 19082 19084 19090 19093 19098 19104 19108 19111 19115 19118 19121 19125 19129
+ 19132 19135 19138 19141 19143 19147 19150 19155 19158 19162 19165 19169 19172
+ 19175 19179 19183 19186 19191 19197 19202 19205 19209 19212 19216 19219 19224
+ 19230 19235 19238 19242 19245 19249 19252 19255 19260 19263 19265 19269 19272
+ 19276 19279 19282 19286 19290 19293 19296 19301 19307 19310 19313 19315 19321
+ 19324 19329 19335 19339 19342 19346 19349 19352 19356 19360 19363 19366 19369
+ 19372 19374 19378 19381 19386 19389 19393 19396 19400 19403 19406 19410 19414
+ 19417 19422 19428 19433 19436 19440 19443 19447 19450 19454 19461 19466 19469
+ 19473 19476 19480 19483 19486 19491 19494 19496 19500 19503 19507 19510 19513
+ 19517 19521 19524 19527 19532 19538 19541 19544 19546 19552 19555 19559 19566
+ 19570 19573 19577 19580 19583 19587 19591 19594 19597 19600 19603 19605 19609
+ 19612 19617 19620 19624 19627 19631 19634 19637 19641 19645 19648 19653 19659
+ 19664 19667 19671 19674 19678 19681 19685 19693 19698 19701 19705 19708 19712
+ 19715 19718 19723 19726 19728 19732 19735 19739 19742 19745 19749 19753 19756
+ 19759 19764 19770 19773 19776 19778 19784 19787 19792 19798 19802 19805 19809
+ 19812 19815 19819 19823 19826 19829 19832 19835 19837 19841 19844 19849 19852
+ 19856 19859 19863 19866 19869 19873 19877 19880 19884 19892 19897 19900 19904
+ 19907 19911 19914 19919 19925 19930 19933 19937 19940 19944 19947 19950 19955
+ 19958 19960 19964 19967 19971 19974 19977 19981 19985 19988 19991 19996 20000
+ 20003 20006 20008 20014 20017 20022 20028 20032 20035 20039 20042 20045 20049
+ 20053 20056 20059 20062 20065 20067 20071 20074 20079 20082 20086 20089 20093
+ 20096 20099 20103 20107 20110 20114 20121 20126 20129 20133 20136 20140 20143
+ 20147 20155 20160 20163 20167 20170 20174 20177 20180 20185 20188 20190 20194
+ 20197 20202 20205 20208 20212 20216 20219 20222 20227 20233 20236 20239 20241
+ 20245 20248 20253 20259 20263 20266 20270 20273 20276 20280 20284 20287 20290
+ 20293 20296 20298 20302 20305 20310 20313 20317 20320 20324 20327 20330 20334
+ 20338 20341 20346 20352 20357 20360 20364 20367 20371 20374 20378 20385 20390
+ 20393 20397 20400 20404 20407 20410 20415 20418 20420 20424 20427 20431 20434
+ 20437 20441 20445 20448 20451 20456 20462 20465 20468 20470 20476 20479 20483
+ 20490 20494 20497 20501 20504 20507 20511 20515 20518 20521 20524 20527 20529
+ 20533 20536 20541 20544 20548 20551 20555 20558 20561 20565 20569 20572 20576
+ 20583 20588 20591 20595 20598 20602 20605 20609 20616 20621 20624 20628 20631
+ 20635 20638 20641 20646 20649 20651 20655 20658 20662 20665 20668 20672 20676
+ 20679 20682 20687 20693 20696 20699 20701 20707 20710 20714 20721 20725 20728
+ 20732 20735 20738 20742 20746 20749 20752 20755 20758 20760 20764 20767 20772
+ 20775 20779 20782 20786 20789 20792 20796 20800 20803 20808 20814 20819 20822
+ 20826 20829 20833 20836 20841 20847 20852 20855 20859 20862 20866 20869 20872
+ 20877 20880 20882 20886 20889 20893 20896 20899 20903 20907 20910 20913 20918
+ 20924 20927 20930 20932 20938 20941 20946 20952 20956 20959 20963 20966 20969
+ 20973 20977 20980 20983 20986 20989 20991 20995 20998 21003 21006 21010 21013
+ 21017 21020 21023 21027 21031 21034 21038 21046 21051 21054 21058 21061 21065
+ 21068 21072 21079 21084 21087 21091 21094 21098 21101 21105 21108 21111 21113
+ 21117 21120 21124 21127 21130 21134 21138 21141 21145 21148 21153 21156 21159
+ 21161 21166 21169 21173 21180 21184 21187 21191 21194 21197 21201 21205 21208
+ 21211 21214 21217 21219 21223 21226 21231 21234 21239 21242 21246 21249 21252
+ 21256 21260 21263 21268 21273 21277 21280 21284 21287 21291 21294 21298 21304
+ 21308 21311 21315 21318 21322 21325 21328 21334 21337 21339 21343 21346 21350
+ 21353 21356 21360 21364 21367 21371 21374 21380 21383 21386 21388 21392 21395
+ 21399 21406 21410 21413 21417 21420 21423 21427 21431 21434 21437 21440 21443
+ 21445 21449 21452 21457 21460 21465 21468 21472 21475 21478 21482 21486 21489
+ 21494 21499 21503 21506 21510 21513 21517 21520 21524 21530 21534 21537 21541
+ 21544 21548 21551 21555 21558 21561 21563 21567 21570 21574 21577 21580 21584
+ 21588 21591 21595 21598 21603 21606 21609 21611 21616 21619 21624 21630 21634
+ 21637 21641 21644 21647 21651 21656 21659 21662 21665 21668 21670 21674 21677
+ 21682 21685 21690 21693 21697 21700 21703 21707 21711 21714 21718 21724 21728
+ 21731 21735 21738 21742 21745 21749 21755 21759 21762 21766 21769 21773 21776
+ 21780 21783 21786 21788 21792 21795 21799 21802 21805 21809 21813 21816 21819
+ 21825 21829 21832 21835 21837 21843 21846 21851 21857 21861 21864 21868 21871
+ 21874 21878 21882 21885 21888 21891 21894 21896 21900 21903 21908 21911 21916
+ 21919 21923 21926 21929 21933 21937 21940 21945 21950 21954 21957 21961 21964
+ 21968 21971 21975 21981 21985 21988 21992 21995 21999 22002 22006 22009 22012
+ 22014 22018 22021 22025 22028 22031 22035 22039 22042 22046 22049 22054 22057
+ 22060 22062 22067 22070 22074 22081 22085 22088 22092 22095 22098 22102 22106
+ 22109 22112 22115 22118 22120 22124 22127 22132 22135 22140 22143 22147 22150
+ 22153 22157 22161 22164 22168 22174 22178 22181 22185 22188 22192 22195 22200
+ 22206 22211 22214 22218 22221 22226 22229 22233 22236 22239 22241 22245 22248
+ 22252 22255 22258 22262 22266 22269 22273 22276 22282 22285 22288 22290 22296
+ 22299 22304 22310 22314 22317 22321 22324 22327 22331 22335 22338 22341 22344
+ 22347 22349 22353 22356 22361 22364 22369 22372 22376 22379 22382 22386 22390
+ 22393 22397 22404 22408 22411 22415 22418 22422 22425 22429 22436 22440 22443
+ 22447 22450 22454 22457 22461 22465 22468 22470 22474 22477 22481 22484 22487
+ 22491 22495 22498 22502 22505 22509 22512 22515 22517 22521 22524 22528 22535
+ 22539 22542 22546 22549 22552 22556 22560 22563 22566 22569 22572 22574 22578
+ 22581 22586 22589 22594 22597 22601 22604 22607 22611 22615 22618 22622 22630
+ 22635 22638 22642 22645 22649 22652 22656 22664 22669 22672 22676 22679 22683
+ 22686 22690 22693 22696 22698 22702 22705 22709 22712 22715 22719 22723 22726
+ 22730 22733 22737 22740 22743 22745 22749 22752 22756 22763 22767 22770 22774
+ 22777 22780 22784 22788 22791 22794 22797 22800 22802 22806 22809 22813 22816
+ 22820 22823 22827 22830 22833 22837 22841 22844 22848 22856 22861 22864 22868
+ 22871 22875 22878 22882 22889 22893 22896 22900 22903 22907 22910 22914 22917
+ 22920 22922 22926 22929 22933 22936 22939 22943 22947 22950 22954 22958 22962
+ 22965 22968 22970 22974 22977 22981 22987 22991 22994 22998 23001 23004 23008
+ 23012 23015 23018 23021 23024 23026 23030 23033 23038 23041 23045 23048 23052
+ 23055 23058 23062 23066 23069 23073 23081 23086 23089 23093 23096 23100 23103
+ 23107 23114 23119 23122 23126 23129 23133 23136 23140 23143 23146 23148 23152
+ 23155 23159 23162 23165 23169 23173 23176 23180 23183 23187 23190 23193 23195
+ 23199 23202 23206 23213 23217 23220 23224 23227 23230 23234 23238 23241 23244
+ 23247 23250 23252 23256 23259 23263 23266 23270 23273 23277 23280 23283 23287
+ 23291 23294 23298 23306 23311 23314 23318 23321 23325 23328 23332 23339 23343
+ 23346 23350 23353 23357 23360 23364 23367 23370 23372 23376 23379 23383 23386
+ 23389 23393 23397 23400 23404 23408 23412 23415 23418 23420 23424 23427 23432
+ 23437 23441 23444 23448 23451 23454 23458 23462 23465 23468 23471 23474 23476
+ 23480 23483 23487 23490 23494 23497 23501 23504 23507 23511 23515 23518 23522
+ 23530 23535 23538 23542 23545 23549 23552 23556 23563 23567 23570 23574 23577
+ 23581 23584 23588 23591 23594 23596 23600 23603 23607 23610 23613 23617 23621
+ 23624 23628 23632 23636 23639 23642 23644 23648 23651 23656 23661 23666 23669
+ 23673 23676 23679 23683 23687 23690 23693 23696 23699 23701 23705 23708 23713
+ 23716 23721 23724 23728 23731 23734 23738 23742 23745 23750 23755 23759 23762
+ 23766 23769 23773 23776 23781 23786 23790 23793 23797 23800 23804 23807 23811
+ 23815 23818 23820 23824 23827 23831 23834 23837 23841 23845 23848 23852 23855
+ 23859 23862 23865 23867 23871 23874 23879 23885
+    1    2    4    9   14   15   35   39   40   47   48   50   51   63  993 1069
+ 1070 1079 1080 1091 1092 2977 3064 3067 3068 3071 3072 3099 3100    1    2    4
+    5  993  994  997  998 2977 2978 2979 2982    3  995  998    1    2    4    8
+    9  995  996  998 1005 2979 2980 2989    2    5  997  998  999 2981 2982 2983
+    6  999 1000 1002 1003 2983 2984 2985    7 2986 2987    4    8    9 1005 1006
+ 1008 1009 2989 2990 3037 3038    1    4    8    9   14   15   17   19   26 1009
+ 1010 1021 1022 1025 1026 1039 2991 2992 3001 3002 3013 3014 3021 3038   10 2991
+ 3038   11 1011 2993   12   14   15 1011 1012 1015 2993 2994 2995 2998   13 1013
+ 1016    1    9   12   14   15   17 1013 1014 1015 1016 1019 2995 2996 2999    1
+    9   12   14   15   19   20 1015 1016 1027 2997 2998 3007   16   17 1017 1023
+ 1024 3003 3004 3005    9   14   16   17 1019 1020 1022 1023 2999 3000 3001 3002
+ 3003   18 3002 3003    9   15   19   20 1025 1031 1032 3011 3012 3013   15   19
+   20 1027 1028 1030 1031 3007 3008 3009 3010 3011   21 3010 3011   22   23 1033
+ 1037 1038 3019 3035 3036   22   23   24 1036 1037 1053 1054 3015 3016 3017 3018
+ 3035   23   24   26 1041 1042 1044 1053 3015 3023 3024   25 3018 3035    9   24
+   26   28 1039 1040 1043 1044 3021 3022 3023 3026   27 1041 1044   26   28   29
+ 1043 1044 1045 3025 3026 3027   28   29   31 1045 1046 1048 1049 3027 3028 3029
+ 3030   30   31 3030 3031   29   30   31 1049 1050 1051 3030 3031 3032 3033   32
+   33   35   63   70   71   78   82 1055 1131 1132 1141 1142 1153 1154 3039 3129
+ 3130 3133 3134 3161 3162   32   33   35   36 1055 1056 1059 3039 3040 3041 3044
+   34 1057 1060    1   32   33   35   36   39   40   63 1057 1058 1059 1060 1067
+ 3041 3042 3051   33   35   36   37 1059 1060 1061 3043 3044 3045   36   37 1061
+ 1062 1064 1065 3045 3046 3047   38 3048 3049    1   35   39   40 1067 1068 1070
+ 1071 3051 3052 3099    1   35   39   40   48   50   57   63 1071 1072 1083 1084
+ 1087 1088 1101 3053 3054 3063 3064 3075 3076 3083 3099 3100   41 3053 3100   42
+   43 1073 3055   42   43   45   46   48 1073 1074 1077 1078 3055 3056 3057 3060
+   44 1075 1078   43   45   48 1075 1076 1078 1081 3057 3058 3061   43   46   48
+   51 1077 1078 1089 3059 3060 3069    1   47   48 1079 1085 1086 3064 3065 3066
+ 3067    1   40   43   45   46   47   48   50   51 1078 1081 1082 1084 1085 3061
+ 3062 3063 3064   49 3064 3065    1   40   48   50   51 1087 1093 1094 3071 3072
+ 3073 3074 3075    1   46   48   50   51 1089 1090 1092 1093 3069 3070 3071   52
+ 3072 3073   53   54   56 1095 1099 1100 3080 3081 3097 3098   53   54   55 1098
+ 1099 1115 1116 3077 3078 3079 3080   54   55   57 1103 1104 1106 1115 3077 3085
+ 3086   53   56 3080 3097   40   55   57   59 1101 1102 1105 1106 3083 3084 3085
+ 3088   58 1103 1106   57   59   60 1105 1106 1107 3087 3088 3089   59   60   62
+ 1107 1108 1110 1111 3089 3090 3091 3092 3093   61 3092 3093   60   62 1111 1112
+ 1113 3093 3094 3095    1   32   35   40   63   64   70  101  102  109  110  113
+  156  280 1117 1193 1194 1203 1204 1215 1216 3101 3191 3192 3195 3196 3223 3224
+   63   64   66   67   70 1117 1118 1121 1122 3101 3102 3103 3106   65 1119 1122
+   64   66   70 1119 1120 1122 1129 3103 3104 3113   64   67   68 1121 1122 1123
+ 3105 3106 3107   67   68 1123 1124 1126 1127 3107 3108 3109   69 3110 3111   32
+   63   64   66   70   71 1122 1129 1130 1132 1133 3113 3114 3161 3162   32   70
+   71   78   79   81   82   88 1133 1134 1145 1146 1149 1150 1163 3115 3116 3125
+ 3126 3134 3137 3138 3145 3162   72 3115 3162   73   74 1135 3117   73   74   76
+   77 1135 1136 1139 1140 3117 3118 3119 3122   75 1137 1140   74   76   77   79
+ 1137 1138 1140 1143 3119 3120 3123   74   76   77   79   82 1139 1140 1151 3121
+ 3122 3131   32   71   78   79   82 1141 1147 1148 3125 3126 3127 3128 3129   71
+   76   77   78   79   82 1143 1144 1146 1147 3123 3124 3125   80 3126 3127   71
+   81   82 1149 1155 1156 3134 3135 3136 3137   32   71   77   78   79   81   82
+ 1151 1152 1154 1155 3131 3132 3133 3134   83 3134 3135   84   85 1157 1161 1162
+ 3143 3159 3160   84   85   86 1160 1161 1177 1178 3139 3140 3141 3142 3159   85
+   86   88 1165 1166 1168 1177 3139 3147 3148   87 3142 3159   71   86   88   90
+ 1163 1164 1167 1168 3145 3146 3147 3150   89 1165 1168   88   90   91 1167 1168
+ 1169 3149 3150 3151   90   91   93 1169 1170 1172 1173 3151 3152 3153 3154 3155
+   92 3154 3155   91   93 1173 1174 1175 3155 3156 3157   94   95  101  125  132
+  133  140  143  144 1179 1255 1256 1265 1266 1277 1278 3163 3253 3254 3257 3258
+ 3285 3286   94   95   97   98  101 1179 1180 1183 1184 3163 3164 3165 3168   96
+ 1181 1184   95   97  101 1181 1182 1184 1191 3165 3166 3175   95   98   99 1183
+ 1184 1185 3167 3168 3169   98   99 1185 1186 1188 1189 3169 3170 3171  100 3172
+ 3173   63   94   95   97  101  102  125  156 1191 1192 1194 1195 3175 3176 3223
+   63  101  102  109  110  112  113  119  156  280 1195 1196 1207 1208 1211 1212
+ 1225 3177 3178 3187 3188 3196 3199 3200 3207 3223 3224  103 3177 3224  104  105
+ 1197 3179  104  105  107  108  110  113 1197 1198 1201 1202 3179 3180 3181 3184
+  106 1199 1202  105  107  110 1199 1200 1202 1205 3181 3182 3185  105  108  113
+ 1201 1202 1213 3183 3184 3193   63  102  109  110 1203 1209 1210 3187 3188 3189
+ 3190 3191   63  102  105  107  109  110  113 1202 1205 1206 1208 1209 3185 3186
+ 3187  111 3188 3189  102  112  113 1211 1217 1218 3196 3197 3198 3199   63  102
+  105  108  110  112  113 1202 1213 1214 1216 1217 3193 3194 3195 3196  114 3196
+ 3197  115  116 1219 1223 1224 3205 3221 3222  115  116  117 1222 1223 1239 1240
+ 3201 3202 3203 3204 3221  116  117  119  120 1227 1228 1230 1239 3201 3209 3210
+  118 3204 3221  102  117  119  121 1225 1226 1229 1230 3207 3208 3209 3212  117
+  120 1227 1230  119  121  122 1229 1230 1231 3211 3212 3213  121  122  124 1231
+ 1232 1234 1235 3213 3214 3215 3216  123  124 3216 3217  122  123  124 1235 1236
+ 1237 3216 3217 3218 3219   94  101  125  126  128  132  133  156  159  163  164
+  171  174  175 1241 1317 1318 1327 1328 1339 1340 3225 3315 3316 3319 3320 3347
+ 3348  125  126  128  129 1241 1242 1245 1246 3225 3226 3227 3230  127  128 1243
+ 1246  125  126  127  128  132 1243 1244 1246 1253 3227 3228 3237  126  129  130
+ 1245 1246 1247 3229 3230 3231  129  130 1247 1248 1250 1251 3231 3232 3233  131
+ 3234 3235   94  125  128  132  133 1253 1254 1256 1257 3237 3238 3285   94  125
+  132  133  140  141  143  144  150 1257 1258 1269 1270 1273 1274 1287 3239 3240
+ 3249 3250 3261 3262 3269 3285 3286  134 3239 3286  135 1259 3241  136  138  139
+ 1259 1260 1263 3241 3242 3243 3246  137 1261 1264  136  138  139  141 1261 1262
+ 1263 1264 1267 3243 3244 3247  136  138  139  141  144 1263 1264 1275 3245 3246
+ 3255   94  133  140  141  144 1265 1271 1272 3249 3250 3251 3252 3253  133  138
+  139  140  141  144 1267 1268 1270 1271 3247 3248 3249  142 3250 3251   94  133
+  143  144 1273 1279 1280 3257 3258 3259 3260 3261   94  133  139  140  141  143
+  144 1275 1276 1278 1279 3255 3256 3257  145 3258 3259  146  147  149 1281 1285
+ 1286 3266 3267 3283 3284  146  147  148 1284 1285 1301 1302 3263 3264 3265 3266
+  147  148  150 1289 1290 1301 3263 3271 3272  146  149 3266 3283  133  148  150
+  151  152 1287 1288 1289 1291 1292 3269 3270 3271 3274  150  151 1289 1292  150
+  152  153 1291 1292 1293 3273 3274 3275  152  153  155 1293 1294 1296 1297 3275
+ 3276 3277 3278 3279  154 3278 3279  153  155 1297 1298 1299 3279 3280 3281   63
+  101  102  125  156  157  159  187  190  194  195  202  205  206  280 1303 1379
+ 1380 1389 1390 1401 1402 3287 3377 3378 3381 3382 3409 3410  156  157  159  160
+ 1303 1304 1307 3287 3288 3289 3292  158  159 1305 1308  125  156  157  158  159
+  160  163  164 1305 1306 1307 1308 1315 3289 3290 3299  157  159  160  161 1307
+ 1308 1309 3291 3292 3293  160  161 1309 1310 1312 1313 3293 3294 3295  162 3296
+ 3297  125  159  163  164 1315 1316 1318 1319 3299 3300 3347  125  159  163  164
+  171  172  174  175  181 1319 1320 1331 1332 1335 1336 1349 3301 3302 3311 3312
+ 3323 3324 3331 3347 3348  165 3301 3348  166 1321 3303  167  169  170 1321 1322
+ 1325 1326 3303 3304 3305 3308  168  169 1323 1326  167  168  169  170  172 1323
+ 1324 1326 1329 3305 3306 3309  167  169  170  172  175 1325 1326 1337 3307 3308
+ 3317  125  164  171  172  175 1327 1333 1334 3312 3313 3314 3315  164  169  170
+  171  172  175 1329 1330 1332 1333 3309 3310 3311 3312  173 3312 3313  125  164
+  174  175 1335 1341 1342 3319 3320 3321 3322 3323  125  164  170  171  172  174
+  175 1337 1338 1340 1341 3317 3318 3319  176 3320 3321  177  178 1343 1347 1348
+ 3329 3345 3346  177  178  179 1346 1347 1363 1364 3325 3326 3327 3328 3345  178
+  179  181 1351 1352 1354 1363 3325 3333 3334  180 3328 3345  164  179  181  183
+ 1349 1350 1353 1354 3331 3332 3333 3336  182 1351 1354  181  183  184 1353 1354
+ 1355 3335 3336 3337  183  184  186 1355 1356 1358 1359 3337 3338 3339 3340 3341
+  185 3340 3341  184  186 1359 1360 1361 3341 3342 3343  156  187  188  190  195
+  218  221  225  226  233  234  237  249  280 1365 1441 1442 1451 1452 1463 1464
+ 3349 3436 3439 3440 3443 3444 3471 3472  187  188  190  191 1365 1366 1369 3349
+ 3350 3351 3354  189  190 1367 1370  156  187  188  189  190  191  194  195 1367
+ 1368 1369 1370 1377 3351 3352 3361  188  190  191  192 1369 1370 1371 3353 3354
+ 3355  191  192 1371 1372 1374 1375 3355 3356 3357  193 3358 3359  156  190  194
+  195 1377 1378 1380 1381 3361 3362 3409  156  187  190  194  195  202  203  205
+  206  212 1381 1382 1393 1394 1397 1398 1411 3363 3364 3373 3374 3385 3386 3393
+ 3409 3410  196 3363 3410  197 1383 3365  198  200  201  203  206 1383 1384 1387
+ 1388 3365 3366 3367 3370  199 1385 1388  198  200  203 1385 1386 1388 1391 3367
+ 3368 3371  198  201  206 1387 1388 1399 3369 3370 3379  156  195  202  203  206
+ 1389 1395 1396 3373 3374 3375 3376 3377  195  198  200  202  203  206 1388 1391
+ 1392 1394 1395 3371 3372 3373  204 3374 3375  156  195  205  206 1397 1403 1404
+ 3381 3382 3383 3384 3385  156  195  198  201  202  203  205  206 1388 1399 1400
+ 1402 1403 3379 3380 3381  207 3382 3383  208  209  211 1405 1409 1410 3390 3391
+ 3407 3408  208  209  210 1408 1409 1425 1426 3387 3388 3389 3390  209  210  213
+ 1413 1414 1416 1425 3387 3395 3396  208  211 3390 3407  195  212  214 1411 1412
+ 1415 1416 3393 3394 3395 3398  210  213 1413 1416  212  214  215 1415 1416 1417
+ 3397 3398 3399  214  215  217 1417 1418 1420 1421 3399 3400 3401 3402 3403  216
+ 3402 3403  215  217 1421 1422 1423 3403 3404 3405  187  218  219  221  226  249
+  256  257  264  265  268 1427 1503 1504 1513 1514 1525 1526 3411 3498 3501 3502
+ 3505 3506 3533 3534  218  219  221  222 1427 1428 1431 1432 3411 3412 3413 3416
+  220  221 1429 1432  187  218  219  220  221  225  226 1429 1430 1432 1439 3413
+ 3414 3423  219  222  223 1431 1432 1433 3415 3416 3417  222  223 1433 1434 1436
+ 1437 3417 3418 3419  224 3420 3421  187  221  225  226 1439 1440 1442 1443 3423
+ 3424 3471  187  218  221  225  226  234  236  237  243  249 1443 1444 1455 1456
+ 1459 1460 1473 3425 3426 3435 3436 3444 3447 3448 3455 3471 3472  227 3425 3472
+  228  229 1445 3427  228  229  231  232 1445 1446 1449 1450 3427 3428 3429 3432
+  230 1447 1450  229  231  232  234  237 1447 1448 1450 1453 3429 3430 3433  229
+  231  232  237 1449 1450 1461 3431 3432 3441  187  233  234 1451 1457 1458 3436
+ 3437 3438 3439  187  226  231  233  234  237 1453 1454 1456 1457 3433 3434 3435
+ 3436  235 3436 3437  226  236  237 1459 1465 1466 3444 3445 3446 3447  187  226
+  231  232  234  236  237 1461 1462 1464 1465 3441 3442 3443 3444  238 3444 3445
+  239  240 1467 1471 1472 3453 3469 3470  239  240  241 1470 1471 1487 1488 3449
+ 3450 3451 3452 3469  240  241  243 1475 1476 1478 1487 3449 3457 3458  242 3452
+ 3469  226  241  243  245 1473 1474 1477 1478 3455 3456 3457 3460  244 1475 1478
+  243  245  246 1477 1478 1479 3459 3460 3461  245  246  248 1479 1480 1482 1483
+ 3461 3462 3463 3464 3465  247 3464 3465  246  248 1483 1484 1485 3465 3466 3467
+  187  218  226  249  250  252  256  257  280  287  288  294  295  296  298  299
+ 1489 1565 1566 1575 1576 1587 1588 3473 3563 3564 3567 3568 3595 3596  249  250
+  252  253 1489 1490 1493 1494 3473 3474 3475 3478  251 1491 1494  249  250  252
+  256 1491 1492 1494 1501 3475 3476 3485  250  253  254 1493 1494 1495 3477 3478
+ 3479  253  254 1495 1496 1498 1499 3479 3480 3481  255 3482 3483  218  249  252
+  256  257 1501 1502 1504 1505 3485 3486 3533 3534  218  249  256  257  265  267
+  268  274 1505 1506 1517 1518 1521 1522 1535 3487 3488 3497 3498 3506 3509 3510
+ 3517 3534  258 3487 3534  259  260 1507 3489  259  260  262  263  265  268 1507
+ 1508 1511 1512 3489 3490 3491 3494  261 1509 1512  260  262  265 1509 1510 1512
+ 1515 3491 3492 3495  260  263  268 1511 1512 1523 3493 3494 3503  218  264  265
+ 1513 1519 1520 3498 3499 3500 3501  218  257  260  262  264  265  268 1512 1515
+ 1516 1518 1519 3495 3496 3497 3498  266 3498 3499  257  267  268 1521 1527 1528
+ 3506 3507 3508 3509  218  257  260  263  265  267  268 1512 1523 1524 1526 1527
+ 3503 3504 3505 3506  269 3506 3507  270  271 1529 1533 1534 3515 3531 3532  270
+  271  272 1532 1533 1549 1550 3511 3512 3513 3514 3531  271  272  274 1537 1538
+ 1540 1549 3511 3519 3520  273 3514 3531  257  272  274  276 1535 1536 1539 1540
+ 3517 3518 3519 3522  275 1537 1540  274  276  277 1539 1540 1541 3521 3522 3523
+  276  277  279 1541 1542 1544 1545 3523 3524 3525 3526 3527  278 3526 3527  277
+  279 1545 1546 1547 3527 3528 3529   63  102  156  187  249  280  281  283  287
+  288  311  318  319  324  325  326  329  330 1551 1627 1628 1637 1638 1649 1650
+ 3535 3625 3626 3629 3630 3657 3658  280  281  283  284 1551 1552 1555 1556 3535
+ 3536 3537 3540  282 1553 1556  280  281  283  287 1553 1554 1556 1563 3537 3538
+ 3547  281  284  285 1555 1556 1557 3539 3540 3541  284  285 1557 1558 1560 1561
+ 3541 3542 3543  286 3544 3545  249  280  283  287  288 1563 1564 1566 1567 3547
+ 3548 3595  249  280  287  288  294  295  296  298  305 1567 1568 1579 1580 1583
+ 1584 1597 3549 3550 3559 3560 3571 3572 3579 3595 3596  289 3549 3596  290  291
+ 1569 3551  290  291  293  294 1569 1570 1573 1574 3551 3552 3553 3556  292 1571
+ 1574  291  293  294  296 1571 1572 1574 1577 3553 3554 3557  249  288  291  293
+  294  296  298  299 1573 1574 1585 3555 3556 3565  249  288  295  296 1575 1581
+ 1582 3559 3560 3561 3562 3563  249  288  293  294  295  296 1577 1578 1580 1581
+ 3557 3558 3559  297 3560 3561  249  288  294  298  299 1583 1589 1590 3567 3568
+ 3569 3570 3571  249  294  298  299 1585 1586 1588 1589 3565 3566 3567  300 3568
+ 3569  301  302 1591 1595 1596 3577 3593 3594  301  302  303 1594 1595 1611 1612
+ 3573 3574 3575 3576 3593  302  303  305 1599 1600 1602 1611 3573 3581 3582  304
+ 3576 3593  288  303  305  307 1597 1598 1601 1602 3579 3580 3581 3584  306 1599
+ 1602  305  307  308 1601 1602 1603 3583 3584 3585  307  308  310 1603 1604 1606
+ 1607 3585 3586 3587 3588 3589  309 3588 3589  308  310 1607 1608 1609 3589 3590
+ 3591  280  311  312  314  318  319  342  349  350  355  356  357  358  360  361
+  404 1613 1689 1690 1699 1700 1711 1712 3597 3687 3688 3691 3692 3719 3720  311
+  312  314  315 1613 1614 1617 1618 3597 3598 3599 3602  313 1615 1618  311  312
+  314  318 1615 1616 1618 1625 3599 3600 3609  312  315  316 1617 1618 1619 3601
+ 3602 3603  315  316 1619 1620 1622 1623 3603 3604 3605  317 3606 3607  280  311
+  314  318  319 1625 1626 1628 1629 3609 3610 3657  280  311  318  319  324  325
+  326  327  329  330  336 1629 1630 1641 1642 1645 1646 1659 3611 3612 3621 3622
+ 3633 3634 3641 3657 3658  320 3611 3658  321  322 1631 3613  321  322  324  325
+ 1631 1632 1635 3613 3614 3615 3618  323 1633 1636  280  319  322  324  325  326
+  327 1633 1634 1635 1636 1639 3615 3616 3619  280  319  322  324  325  330 1635
+ 1636 1647 3617 3618 3627  280  319  324  326  327 1637 1643 1644 3621 3622 3623
+ 3624 3625  319  324  326  327 1639 1640 1642 1643 3619 3620 3621  328 3622 3623
+  280  319  329  330 1645 1651 1652 3629 3630 3631 3632 3633  280  319  325  329
+  330 1647 1648 1650 1651 3627 3628 3629  331 3630 3631  332  333 1653 1657 1658
+ 3639 3655 3656  332  333  334 1656 1657 1673 1674 3635 3636 3637 3638 3655  333
+  334  336 1661 1662 1664 1673 3635 3643 3644  335 3638 3655  319  334  336  338
+ 1659 1660 1663 1664 3641 3642 3643 3646  337 1661 1664  336  338  339 1663 1664
+ 1665 3645 3646 3647  338  339  341 1665 1666 1668 1669 3647 3648 3649 3650  340
+  341 3650 3651  339  340  341 1669 1670 1671 3650 3651 3652 3653  311  342  343
+  345  349  350  373  376  380  381  386  387  388  391  392  404 1675 1751 1752
+ 1761 1762 1773 1774 3659 3749 3750 3753 3754 3781 3782  342  343  345  346 1675
+ 1676 1679 1680 3659 3660 3661 3664  344 1677 1680  342  343  345  349 1677 1678
+ 1680 1687 3661 3662 3671  343  346  347 1679 1680 1681 3663 3664 3665  346  347
+ 1681 1682 1684 1685 3665 3666 3667  348 3668 3669  311  342  345  349  350 1687
+ 1688 1690 1691 3671 3672 3719  311  342  349  350  355  356  357  358  360  367
+ 1691 1692 1703 1704 1707 1708 1721 3673 3674 3683 3684 3695 3696 3703 3719 3720
+  351 3673 3720  352  353 1693 3675  352  353  355  356 1693 1694 1697 3675 3676
+ 3677 3680  354 1695 1698  311  350  353  355  356  358 1695 1696 1697 1698 1701
+ 3677 3678 3681  311  350  353  355  356  360  361 1697 1698 1709 3679 3680 3689
+  311  350  357  358 1699 1705 1706 3683 3684 3685 3686 3687  311  350  355  357
+  358 1701 1702 1704 1705 3681 3682 3683  359 3684 3685  311  350  356  360  361
+ 1707 1713 1714 3691 3692 3693 3694 3695  311  356  360  361 1709 1710 1712 1713
+ 3689 3690 3691  362 3692 3693  363  364 1715 1719 1720 3701 3717 3718  363  364
+  365 1718 1719 1735 1736 3697 3698 3699 3700 3717  364  365  367 1723 1724 1726
+ 1735 3697 3705 3706  366 3700 3717  350  365  367  369 1721 1722 1725 1726 3703
+ 3704 3705 3708  368 1723 1726  367  369  370 1725 1726 1727 3707 3708 3709  369
+  370  372 1727 1728 1730 1731 3709 3710 3711 3712 3713  371 3712 3713  370  372
+ 1731 1732 1733 3713 3714 3715  342  373  374  376  381  404  407  411  412  418
+  419  420  423 1737 1813 1814 1823 1824 1835 1836 3721 3811 3812 3815 3816 3843
+ 3844  373  374  376  377 1737 1738 1741 1742 3721 3722 3723 3726  375 1739 1742
+  342  373  374  376  380  381 1739 1740 1742 1749 3723 3724 3733  374  377  378
+ 1741 1742 1743 3725 3726 3727  377  378 1743 1744 1746 1747 3727 3728 3729  379
+ 3730 3731  342  376  380  381 1749 1750 1752 1753 3733 3734 3781 3782  342  373
+  376  380  381  386  387  388  389  391  398 1753 1754 1765 1766 1769 1770 1783
+ 3735 3736 3745 3746 3757 3758 3765 3782  382 3735 3782  383  384 1755 3737  383
+  384  386  387 1755 1756 1759 3737 3738 3739 3742  385 1757 1760  342  381  384
+  386  387  388  389 1757 1758 1759 1760 1763 3739 3740 3743  342  381  384  386
+  387  391  392 1759 1760 1771 3741 3742 3751  342  381  386  388  389 1761 1767
+ 1768 3745 3746 3747 3748 3749  381  386  388  389 1763 1764 1766 1767 3743 3744
+ 3745  390 3746 3747  342  381  387  391  392 1769 1775 1776 3754 3755 3756 3757
+  342  387  391  392 1771 1772 1774 1775 3751 3752 3753 3754  393 3754 3755  394
+  395 1777 1781 1782 3763 3779 3780  394  395  396 1780 1781 1797 1798 3759 3760
+ 3761 3762 3779  395  396  398 1785 1786 1788 1797 3759 3767 3768  397 3762 3779
+  381  396  398  400 1783 1784 1787 1788 3765 3766 3767 3770  399 1785 1788  398
+  400  401 1787 1788 1789 3769 3770 3771  400  401  403 1789 1790 1792 1793 3771
+ 3772 3773 3774 3775  402 3774 3775  401  403 1793 1794 1795 3775 3776 3777  311
+  342  373  404  405  407  435  442  443  449  450  451  453  454  497 1799 1875
+ 1876 1885 1886 1897 1898 3783 3870 3873 3874 3877 3878 3905 3906  404  405  407
+  408 1799 1800 1803 1804 3783 3784 3785 3788  406 1801 1804  373  404  405  407
+  411  412 1801 1802 1804 1811 3785 3786 3795  405  408  409 1803 1804 1805 3787
+ 3788 3789  408  409 1805 1806 1808 1809 3789 3790 3791  410 3792 3793  373  407
+  411  412 1811 1812 1814 1815 3795 3796 3843  373  407  411  412  418  419  420
+  422  423  429 1815 1816 1827 1828 1831 1832 1845 3797 3798 3807 3808 3816 3819
+ 3820 3827 3843 3844  413 3797 3844  414  415 1817 3799  414  415  417  418 1817
+ 1818 1821 3799 3800 3801 3804  416 1819 1822  415  417  418  420 1819 1820 1821
+ 1822 1825 3801 3802 3805  373  412  415  417  418  420  423 1821 1822 1833 3803
+ 3804 3813  373  412  419  420 1823 1829 1830 3807 3808 3809 3810 3811  373  412
+  417  418  419  420 1825 1826 1828 1829 3805 3806 3807  421 3808 3809  412  422
+  423 1831 1837 1838 3816 3817 3818 3819  373  412  418  422  423 1833 1834 1836
+ 1837 3813 3814 3815 3816  424 3816 3817  425  426 1839 1843 1844 3825 3841 3842
+  425  426  427 1842 1843 1859 1860 3821 3822 3823 3824 3841  426  427  429 1847
+ 1848 1850 1859 3821 3829 3830  428 3824 3841  412  427  429  431 1845 1846 1849
+ 1850 3827 3828 3829 3832  430 1847 1850  429  431  432 1849 1850 1851 3831 3832
+ 3833  431  432  434 1851 1852 1854 1855 3833 3834 3835 3836 3837  433 3836 3837
+  432  434 1855 1856 1857 3837 3838 3839  404  435  436  442  443  466  473  474
+  479  480  481  485  497 1861 1937 1938 1947 1948 1959 1960 3845 3935 3936 3939
+ 3940 3967 3968  435  436  438  439  442 1861 1862 1865 1866 3845 3846 3847 3850
+  437 1863 1866  436  438  442 1863 1864 1866 1873 3847 3848 3857  436  439  440
+ 1865 1866 1867 3849 3850 3851  439  440 1867 1868 1870 1871 3851 3852 3853  441
+ 3854 3855  404  435  436  438  442  443 1866 1873 1874 1876 1877 3857 3858 3905
+  404  435  442  443  449  451  453  454  460  497 1877 1878 1889 1890 1893 1894
+ 1907 3859 3860 3869 3870 3881 3882 3889 3905 3906  444 3859 3906  445  446 1879
+ 3861  445  446  448  449  451 1879 1880 1883 1884 3861 3862 3863 3866  447 1881
+ 1884  446  448  451 1881 1882 1884 1887 3863 3864 3867  404  443  446  449  451
+  454 1883 1884 1895 3865 3866 3875  404  450  451 1885 1891 1892 3870 3871 3872
+ 3873  404  443  446  448  449  450  451 1884 1887 1888 1890 1891 3867 3868 3869
+ 3870  452 3870 3871  404  443  453  454 1893 1899 1900 3877 3878 3879 3880 3881
+  404  443  449  453  454 1895 1896 1898 1899 3875 3876 3877  455 3878 3879  456
+  457 1901 1905 1906 3887 3903 3904  456  457  458 1904 1905 1921 1922 3883 3884
+ 3885 3886 3903  457  458  460 1909 1910 1912 1921 3883 3891 3892  459 3886 3903
+  443  458  460  462 1907 1908 1911 1912 3889 3890 3891 3894  461 1909 1912  460
+  462  463 1911 1912 1913 3893 3894 3895  462  463  465 1913 1914 1916 1917 3895
+ 3896 3897 3898  464  465 3898 3899  463  464  465 1917 1918 1919 3898 3899 3900
+ 3901  435  466  467  473  474  497  504  505  512  513  515  516 1923 1999 2000
+ 2009 2010 2021 2022 3907 3997 3998 4001 4002 4029 4030  466  467  469  470  473
+ 1923 1924 1927 1928 3907 3908 3909 3912  468 1925 1928  467  469  473 1925 1926
+ 1928 1935 3909 3910 3919  467  470  471 1927 1928 1929 3911 3912 3913  470  471
+ 1929 1930 1932 1933 3913 3914 3915  472 3916 3917  435  466  467  469  473  474
+ 1928 1935 1936 1938 1939 3919 3920 3967  435  466  473  474  479  480  481  482
+  484  485  491  497 1939 1940 1951 1952 1955 1956 1969 3921 3922 3931 3932 3940
+ 3943 3944 3951 3967 3968  475 3921 3968  476  477 1941 3923  476  477  479  480
+ 1941 1942 1945 3923 3924 3925 3928  478 1943 1946  435  474  477  479  480  481
+  482 1943 1944 1945 1946 1949 3925 3926 3929  435  474  477  479  480  485 1945
+ 1946 1957 3927 3928 3937  435  474  479  481  482 1947 1953 1954 3932 3933 3934
+ 3935  474  479  481  482 1949 1950 1952 1953 3929 3930 3931 3932  483 3932 3933
+  474  484  485 1955 1961 1962 3940 3941 3942 3943  435  474  480  484  485 1957
+ 1958 1960 1961 3937 3938 3939 3940  486 3940 3941  487  488  490 1963 1967 1968
+ 3948 3949 3965 3966  487  488  489 1966 1967 1983 1984 3945 3946 3947 3948  488
+  489 1971 1972 1974 1983 3945 3953 3954  487  490 3948 3965  474  491  493 1969
+ 1970 1973 1974 3951 3952 3953 3956  492 1971 1974  491  493  494 1973 1974 1975
+ 3955 3956 3957  493  494  496 1975 1976 1978 1979 3957 3958 3959 3960 3961  495
+ 3960 3961  494  496 1979 1980 1981 3961 3962 3963  404  435  443  466  474  497
+  498  504  535  536  541  542  543  546  547  714  776  869 1985 2061 2062 2071
+ 2072 2083 2084 3969 4059 4060 4063 4064 4091 4092  497  498  500  501  504 1985
+ 1986 1989 1990 3969 3970 3971 3974  499 1987 1990  498  500  504 1987 1988 1990
+ 1997 3971 3972 3981  498  501  502 1989 1990 1991 3973 3974 3975  501  502 1991
+ 1992 1994 1995 3975 3976 3977  503 3978 3979  466  497  498  500  504  505 1990
+ 1997 1998 2000 2001 3981 3982 4029 4030  466  504  505  512  513  515  516  522
+ 2001 2002 2013 2014 2017 2018 2031 3983 3984 3993 3994 4005 4006 4013 4030  506
+ 3983 4030  507  508 2003 3985  507  508  510  511 2003 2004 2007 3985 3986 3987
+ 3990  509 2005 2008  508  510  511  513 2005 2006 2007 2008 2011 3987 3988 3991
+  508  510  511  513  516 2007 2008 2019 3989 3990 3999  466  505  512  513 2009
+ 2015 2016 3993 3994 3995 3996 3997  466  505  510  511  512  513  516 2011 2012
+ 2014 2015 3991 3992 3993  514 3994 3995  466  505  515  516 2017 2023 2024 4002
+ 4003 4004 4005  466  505  511  513  515  516 2019 2020 2022 2023 3999 4000 4001
+ 4002  517 4002 4003  518  519 2025 2029 2030 4011 4027 4028  518  519  520 2028
+ 2029 2045 2046 4007 4008 4009 4010 4027  519  520  522 2033 2034 2036 2045 4007
+ 4015 4016  521 4010 4027  505  520  522  524 2031 2032 2035 2036 4013 4014 4015
+ 4018  523 2033 2036  522  524  525 2035 2036 2037 4017 4018 4019  524  525  527
+ 2037 2038 2040 2041 4019 4020 4021 4022 4023  526 4022 4023  525  527 2041 2042
+ 2043 4023 4024 4025  528  529  531  535  559  566  567  573  574  577  578 2047
+ 2123 2124 2133 2134 2145 2146 4031 4121 4122 4125 4126 4153 4154  528  529  531
+  532 2047 2048 2051 2052 4031 4032 4033 4036  530 2049 2052  528  529  531  535
+ 2049 2050 2052 2059 4033 4034 4043  529  532  533 2051 2052 2053 4035 4036 4037
+  532  533 2053 2054 2056 2057 4037 4038 4039  534 4040 4041  497  528  531  535
+  536  559  590  621  652  655  686  714 2059 2060 2062 2063 4043 4044 4091 4092
+  497  535  536  541  542  543  544  546  553  714  776  869 2063 2064 2075 2076
+ 2079 2080 2093 4045 4046 4055 4056 4067 4068 4075 4092  537 4045 4092  538  539
+ 2065 4047  538  539  541  542 2065 2066 2069 2070 4047 4048 4049 4052  540 2067
+ 2070  497  536  539  541  542  543  544 2067 2068 2070 2073 4049 4050 4053  497
+  536  539  541  542  546  547 2069 2070 2081 4051 4052 4061  497  536  541  543
+  544 2071 2077 2078 4056 4057 4058 4059  536  541  543  544 2073 2074 2076 2077
+ 4053 4054 4055 4056  545 4056 4057  497  536  542  546  547 2079 2085 2086 4064
+ 4065 4066 4067  497  542  546  547 2081 2082 2084 2085 4061 4062 4063 4064  548
+ 4064 4065  549  550 2087 2091 2092 4073 4089 4090  549  550  551 2090 2091 2107
+ 2108 4069 4070 4071 4072 4089  550  551  553 2095 2096 2098 2107 4069 4077 4078
+  552 4072 4089  536  551  553  555 2093 2094 2097 2098 4075 4076 4077 4080  554
+ 2095 2098  553  555  556 2097 2098 2099 4079 4080 4081  555  556  558 2099 2100
+ 2102 2103 4081 4082 4083 4084 4085  557 4084 4085  556  558 2103 2104 2105 4085
+ 4086 4087  528  535  559  560  566  567  590  597  598  604  605  606  608  609
+ 2109 2185 2186 2195 2196 2207 2208 4093 4180 4183 4184 4187 4188 4215 4216  559
+  560  562  563  566 2109 2110 2113 2114 4093 4094 4095 4098  561 2111 2114  560
+  562  566 2111 2112 2114 2121 4095 4096 4105  560  563  564 2113 2114 2115 4097
+ 4098 4099  563  564 2115 2116 2118 2119 4099 4100 4101  565 4102 4103  528  559
+  560  562  566  567 2114 2121 2122 2124 2125 4105 4106 4153  528  559  566  567
+  572  573  574  575  577  584 2125 2126 2137 2138 2141 2142 2155 4107 4108 4117
+ 4118 4129 4130 4137 4153 4154  568 4107 4154  569  570 2127 4109  569  570  572
+  573 2127 2128 2131 4109 4110 4111 4114  571 2129 2132  567  570  572  573  574
+  575 2129 2130 2131 2132 2135 4111 4112 4115  528  567  570  572  573  574  577
+  578 2131 2132 2143 4113 4114 4123  528  567  572  573  574  575 2133 2139 2140
+ 4117 4118 4119 4120 4121  567  572  574  575 2135 2136 2138 2139 4115 4116 4117
+  576 4118 4119  528  567  573  577  578 2141 2147 2148 4125 4126 4127 4128 4129
+  528  573  577  578 2143 2144 2146 2147 4123 4124 4125  579 4126 4127  580  581
+ 2149 2153 2154 4135 4151 4152  580  581  582 2152 2153 2169 2170 4131 4132 4133
+ 4134 4151  581  582  584 2157 2158 2160 2169 4131 4139 4140  583 4134 4151  567
+  582  584  586 2155 2156 2159 2160 4137 4138 4139 4142  585 2157 2160  584  586
+  587 2159 2160 2161 4141 4142 4143  586  587  589 2161 2162 2164 2165 4143 4144
+ 4145 4146 4147  588 4146 4147  587  589 2165 2166 2167 4147 4148 4149  535  559
+  590  591  597  598  621  628  629  636  637  639  640 2171 2247 2248 2257 2258
+ 2269 2270 4155 4245 4246 4249 4250 4277 4278  590  591  593  594  597 2171 2172
+ 2175 2176 4155 4156 4157 4160  592 2173 2176  591  593  597 2173 2174 2176 2183
+ 4157 4158 4167  591  594  595 2175 2176 2177 4159 4160 4161  594  595 2177 2178
+ 2180 2181 4161 4162 4163  596 4164 4165  559  590  591  593  597  598 2183 2184
+ 2186 2187 4167 4168 4215 4216  559  590  597  598  604  606  608  609  615 2187
+ 2188 2199 2200 2203 2204 2217 4169 4170 4179 4180 4191 4192 4199 4216  599 4169
+ 4216  600  601 2189 4171  600  601  603  604 2189 2190 2193 4171 4172 4173 4176
+  602 2191 2194  601  603  604  606 2191 2192 2193 2194 2197 4173 4174 4177  559
+  598  601  603  604  606  609 2193 2194 2205 4175 4176 4185  559  605  606 2195
+ 2201 2202 4180 4181 4182 4183  559  598  603  604  605  606 2197 2198 2200 2201
+ 4177 4178 4179 4180  607 4180 4181  559  598  608  609 2203 2209 2210 4188 4189
+ 4190 4191  559  598  604  608  609 2205 2206 2208 2209 4185 4186 4187 4188  610
+ 4188 4189  611  612 2211 2215 2216 4197 4213 4214  611  612  613  614 2214 2215
+ 2231 2232 4193 4194 4195 4213  612  613  615 2219 2220 2222 2231 4193 4201 4202
+  612  614 4195 4196 4213  598  613  615  617 2217 2218 2221 2222 4199 4200 4201
+ 4204  616 2219 2222  615  617  618 2221 2222 2223 4203 4204 4205  617  618  619
+  620 2223 2224 2226 2227 4205 4206 4207 4209  618  619 4207 4208 4209  618  620
+ 2227 2228 2229 4209 4210 4211  535  590  621  622  628  629  655  659  660  667
+  668  670  671 2233 2309 2310 2319 2320 2331 2332 4217 4307 4308 4311 4312 4339
+ 4340  621  622  624  625  628 2233 2234 2237 4217 4218 4219 4222  623 2235 2238
+  622  624  625  628 2235 2236 2237 2238 2245 4219 4220 4229  622  624  625  626
+ 2237 2238 2239 4221 4222 4223  625  626 2239 2240 2242 2243 4223 4224 4225  627
+ 4226 4227  590  621  622  624  628  629 2245 2246 2248 2249 4229 4230 4277 4278
+  590  621  628  629  636  637  639  646 2249 2250 2261 2262 2265 2266 2279 4231
+ 4232 4241 4242 4253 4254 4261 4278  630 4231 4278  631 2251 4233  632  634  635
+ 2251 2252 2255 4233 4234 4235 4238  633 2253 2256  632  634  635  637  640 2253
+ 2254 2255 2256 2259 4235 4236 4239  632  634  635  640 2255 2256 2267 4237 4238
+ 4247  590  629  636  637  638 2257 2263 2264 4241 4242 4243 4244 4245  590  629
+  634  636  637  639  640 2259 2260 2262 2263 4239 4240 4241  636  638 4242 4243
+  590  629  637  639  640  641 2265 2271 2272 4250 4251 4252 4253  590  634  635
+  637  639  640 2267 2268 2270 2271 4247 4248 4249 4250  639  641 4250 4251  642
+  645 2273 2277 2278 4258 4259 4275 4276  643  644 2276 2277 2293 2294 4255 4256
+ 4257 4258  643  644  646 2281 2282 2284 2293 4255 4263 4264  642  645 4258 4275
+  629  644  646  648 2279 2280 2283 2284 4261 4262 4263 4266  647 2281 2284  646
+  648  649 2283 2284 2285 4265 4266 4267  648  649  650 2285 2286 2288 2289 4267
+ 4268 4269 4271  649  650 4269 4270 4271  651 2289 2290 2291 4271 4272 4273  535
+  652  653  655  686  690  691  698  701  702 2295 2371 2372 2381 2382 2393 2394
+ 4279 4369 4370 4373 4374 4401 4402  652  653  655  656 2295 2296 2299 4279 4280
+ 4281 4284  654 2297 2300  535  621  652  653  655  656  659  660 2297 2298 2299
+ 2300 2307 4281 4282 4291  653  655  656  657 2299 2300 2301 4283 4284 4285  656
+  657 2301 2302 2304 2305 4285 4286 4287  658 4288 4289  621  655  659  660 2307
+ 2308 2310 2311 4291 4292 4339  621  655  659  660  667  668  670  677 2311 2312
+ 2323 2324 2327 2328 2341 4293 4294 4303 4304 4315 4316 4323 4339 4340  661 4293
+ 4340  662 2313 4295  663  665  666 2313 2314 2317 4295 4296 4297 4300  664 2315
+ 2318  663  665  666  668  671 2315 2316 2317 2318 2321 4297 4298 4301  663  665
+  666  671 2317 2318 2329 4299 4300 4309  621  660  667  668  669 2319 2325 2326
+ 4303 4304 4305 4306 4307  621  660  665  667  668  670  671 2321 2322 2324 2325
+ 4301 4302 4303  667  669 4304 4305  621  660  668  670  671  672 2327 2333 2334
+ 4312 4313 4314 4315  621  665  666  668  670  671 2329 2330 2332 2333 4309 4310
+ 4311 4312  670  672 4312 4313  673  674 2335 2339 2340 4321 4337 4338  673  674
+  675  676 2338 2339 2355 2356 4317 4318 4319 4337  674  675  677 2343 2344 2346
+ 2355 4317 4325 4326  674  676 4319 4320 4337  660  675  677  679 2341 2342 2345
+ 2346 4323 4324 4325 4328  678 2343 2346  677  679  680 2345 2346 2347 4327 4328
+ 4329  679  680  681  682 2347 2348 2350 2351 4329 4330 4331 4333  680  681 4331
+ 4332 4333  680  682 2351 2352 2353 4333 4334 4335  683  684  686  717  721  722
+  729  730  732  733 2357 2433 2434 2443 2444 2455 2456 4341 4431 4432 4435 4436
+ 4463 4464  683  684  686  687 2357 2358 2361 4341 4342 4343 4346  685 2359 2362
+  535  652  683  684  686  687  690  691  714  715  717 2359 2360 2361 2362 2369
+ 4343 4344 4353  684  686  687 2361 2362 2363 4345 4346 4347  688 2363 2364 2366
+ 2367 4347 4348 4349  689 4350 4351  652  686  690  691 2369 2370 2372 2373 4353
+ 4354 4401  652  686  690  691  698  699  701  702  708 2373 2374 2385 2386 2389
+ 2390 2403 4355 4356 4365 4366 4377 4378 4385 4401 4402  692 4355 4402  693 2375
+ 4357  694  696  697  699  702 2375 2376 2379 2380 4357 4358 4359 4362  695 2377
+ 2380  694  696  699 2377 2378 2380 2383 4359 4360 4363  694  697  702 2379 2380
+ 2391 4361 4362 4371  652  691  698  699  700  702 2381 2387 2388 4366 4367 4368
+ 4369  691  694  696  698  699  702 2380 2383 2384 2386 2387 4363 4364 4365 4366
+  698  700 4366 4367  652  691  701  702  703 2389 2395 2396 4374 4375 4376 4377
+  652  691  694  697  698  699  701  702 2380 2391 2392 2394 2395 4371 4372 4373
+ 4374  701  703 4374 4375  704 2397 2401 2402 4383 4399 4400  705  706  707 2400
+ 2401 2417 2418 4379 4380 4381 4399  705  706  708 2405 2406 2408 2417 4379 4387
+ 4388  705  707 4381 4382 4399  691  706  708  710 2403 2404 2407 2408 4385 4386
+ 4387 4390  709 2405 2408  708  710  711 2407 2408 2409 4389 4390 4391  710  711
+ 2409 2410 2412 2413 4391 4392 4393 4394  712  713 4394 4395  712  713 2413 2414
+ 2415 4394 4395 4396 4397  497  535  536  686  714  715  745  748  752  753  760
+  761  763  764  776 2419 2495 2496 2505 2506 2517 2518 4403 4493 4494 4497 4498
+ 4525 4526  686  714  715  717  718 2419 2420 2423 4403 4404 4405 4408  716 2421
+ 2424  683  686  715  717  718  721  722 2421 2422 2423 2424 2431 4405 4406 4415
+  715  717  718  719 2423 2424 2425 4407 4408 4409  718  719 2425 2426 2428 2429
+ 4409 4410 4411  720 4412 4413  683  717  721  722 2431 2432 2434 2435 4415 4416
+ 4463 4464  683  717  721  722  729  730  732  733  739 2435 2436 2447 2448 2451
+ 2452 2465 4417 4418 4427 4428 4439 4440 4447 4464  723 4417 4464  724 2437 4419
+  725  727  728 2437 2438 2441 2442 4419 4420 4421 4424  726 2439 2442  725  727
+  728  730  733 2439 2440 2442 2445 4421 4422 4425  725  727  728  733 2441 2442
+ 2453 4423 4424 4433  683  722  729  730  731 2443 2449 2450 4427 4428 4429 4430
+ 4431  683  722  727  729  730  733 2445 2446 2448 2449 4425 4426 4427  729  731
+ 4428 4429  683  722  732  733  734 2451 2457 2458 4436 4437 4438 4439  683  722
+  727  728  730  732  733 2453 2454 2456 2457 4433 4434 4435 4436  732  734 4436
+ 4437  735  736 2459 2463 2464 4445 4461 4462  735  736  737  738 2462 2463 2479
+ 2480 4441 4442 4443 4461  736  737  739 2467 2468 2470 2479 4441 4449 4450  736
+  738 4443 4444 4461  722  737  739  741 2465 2466 2469 2470 4447 4448 4449 4452
+  740 2467 2470  739  741  742 2469 2470 2471 4451 4452 4453  741  742  743  744
+ 2471 2472 2474 2475 4453 4454 4455 4457  742  743 4455 4456 4457  742  744 2475
+ 2476 2477 4457 4458 4459  714  745  746  748  753  776  783  784  791  792  795
+ 2481 2557 2558 2567 2568 2579 2580 4465 4552 4555 4556 4559 4560 4587 4588  745
+  746  748  749 2481 2482 2485 4465 4466 4467 4470  747 2483 2486  714  745  746
+  748  749  752  753 2483 2484 2485 2486 2493 4467 4468 4477  746  748  749  750
+ 2485 2486 2487 4469 4470 4471  749  750 2487 2488 2490 2491 4471 4472 4473  751
+ 4474 4475  714  748  752  753 2493 2494 2496 2497 4477 4478 4525  714  745  748
+  752  753  760  761  763  770 2497 2498 2509 2510 2513 2514 2527 4479 4480 4489
+ 4490 4501 4502 4509 4525 4526  754 4479 4526  755 2499 4481  756  758  759 2499
+ 2500 2503 4481 4482 4483 4486  757 2501 2504  756  758  759  761  764 2501 2502
+ 2503 2504 2507 4483 4484 4487  756  758  759  764 2503 2504 2515 4485 4486 4495
+  714  753  760  761  762 2505 2511 2512 4490 4491 4492 4493  714  753  758  760
+  761  763  764 2507 2508 2510 2511 4487 4488 4489 4490  760  762 4490 4491  714
+  753  761  763  764 2513 2519 2520 4497 4498 4499 4500 4501  714  758  759  761
+  763  764 2515 2516 2518 2519 4495 4496 4497  765 4498 4499  766 2521 2525 2526
+ 4507 4523 4524  767  769 2524 2525 2541 2542 4503 4504 4505 4523  768  770 2529
+ 2530 2532 2541 4503 4511 4512  767  769 4505 4506 4523  753  768  770  772 2527
+ 2528 2531 2532 4509 4510 4511 4514  771 2529 2532  770  772  773 2531 2532 2533
+ 4513 4514 4515  772  773  774 2533 2534 2536 2537 4515 4516 4517 4519  773  774
+ 4517 4518 4519  775 2537 2538 2539 4519 4520 4521  497  536  714  745  776  777
+  783  784  807  814  815  822  823  826  869 2543 2619 2620 2629 2630 2641 2642
+ 4527 4614 4617 4618 4621 4622 4649 4650  776  777  779  780  783 2543 2544 2547
+ 2548 4527 4528 4529 4532  778 2545 2548  777  779  783 2545 2546 2548 2555 4529
+ 4530 4539  777  780  781 2547 2548 2549 4531 4532 4533  780  781 2549 2550 2552
+ 2553 4533 4534 4535  782 4536 4537  745  776  777  779  783  784 2548 2555 2556
+ 2558 2559 4539 4540 4587 4588  745  776  783  784  792  794  795  801 2559 2560
+ 2571 2572 2575 2576 2589 4541 4542 4551 4552 4560 4563 4564 4571 4588  785 4541
+ 4588  786 2561 4543  787  789  790 2561 2562 2565 4543 4544 4545 4548  788 2563
+ 2566  787  789  790  792  795 2563 2564 2565 2566 2569 4545 4546 4549  787  789
+  790  795 2565 2566 2577 4547 4548 4557  745  791  792  793 2567 2573 2574 4552
+ 4553 4554 4555  745  784  789  791  792  795 2569 2570 2572 2573 4549 4550 4551
+ 4552  791  793 4552 4553  784  794  795  796 2575 2581 2582 4560 4561 4562 4563
+  745  784  789  790  792  794  795 2577 2578 2580 2581 4557 4558 4559 4560  794
+  796 4560 4561  797  798  800 2583 2587 2588 4567 4568 4569 4585 4586  797  798
+  799 2586 2587 2603 2604 4565 4566 4567  798  799  801 2591 2592 2594 2603 4565
+ 4573 4574  797  800 4568 4585  784  799  801  803 2589 2590 2593 2594 4571 4572
+ 4573 4576  802 2591 2594  801  803  804 2593 2594 2595 4575 4576 4577  803  804
+  805  806 2595 2596 2598 2599 4577 4578 4579  804  805  806 4579 4580 4581  804
+  805  806 2599 2600 2601 4581 4582 4583  776  807  808  814  815  838  845  846
+  852  853  854  857  869 2605 2681 2682 2691 2692 2703 2704 4589 4676 4679 4680
+ 4683 4684 4711 4712  807  808  810  811  814 2605 2606 2609 2610 4589 4590 4591
+ 4594  809 2607 2610  808  810  814 2607 2608 2610 2617 4591 4592 4601  808  811
+  812 2609 2610 2611 4593 4594 4595  811  812 2611 2612 2614 2615 4595 4596 4597
+  813 4598 4599  776  807  808  810  814  815 2610 2617 2618 2620 2621 4601 4602
+ 4649 4650  776  807  814  815  823  825  826  832 2621 2622 2633 2634 2637 2638
+ 2651 4603 4604 4613 4614 4622 4625 4626 4633 4650  816 4603 4650  817 2623 4605
+  818  820  821 2623 2624 2627 4605 4606 4607 4610  819 2625 2628  818  820  821
+  823  826 2625 2626 2627 2628 2631 4607 4608 4611  818  820  821  826 2627 2628
+ 2639 4609 4610 4619  776  822  823 2629 2635 2636 4614 4615 4616 4617  776  815
+  820  822  823  826 2631 2632 2634 2635 4611 4612 4613 4614  824 4614 4615  815
+  825  826 2637 2643 2644 4622 4623 4624 4625  776  815  820  821  823  825  826
+ 2639 2640 2642 2643 4619 4620 4621 4622  827 4622 4623  828  829  831 2645 2649
+ 2650 4631 4647 4648  828  829  830  831 2648 2649 2665 2666 4627 4628 4629  829
+  830  832 2653 2654 2656 2665 4627 4635 4636  828  829  831 4629 4630 4647  815
+  830  832  834 2651 2652 2655 2656 4633 4634 4635 4638  833 2653 2656  832  834
+  835 2655 2656 2657 4637 4638 4639  834  835  836  837 2657 2658 2660 2661 4639
+ 4640 4641  835  836  837 4641 4642 4643  835  836  837 2661 2662 2663 4643 4644
+ 4645  807  838  839  845  846  869  876  877  884  885  887  888 2667 2743 2744
+ 2753 2754 2765 2766 4651 4738 4741 4742 4745 4746 4773 4774  838  839  841  842
+  845 2667 2668 2671 2672 4651 4652 4653 4656  840 2669 2672  839  841  845 2669
+ 2670 2672 2679 4653 4654 4663  839  842  843 2671 2672 2673 4655 4656 4657  842
+  843 2673 2674 2676 2677 4657 4658 4659  844 4660 4661  807  838  839  841  845
+  846 2672 2679 2680 2682 2683 4663 4664 4711 4712  807  838  845  846  847  852
+  854  856  857  863  869 2683 2684 2695 2696 2699 2700 2713 4665 4666 4675 4676
+ 4684 4687 4688 4695 4712  846  847 4665 4712  848  849 2685 4667  848  849  851
+  852  854 2685 2686 2689 2690 4667 4668 4669 4672  850 2687 2690  849  851  854
+ 2687 2688 2690 2693 4669 4670 4673  807  846  849  852  854  857 2689 2690 2701
+ 4671 4672 4681  807  853  854 2691 2697 2698 4676 4677 4678 4679  807  846  849
+  851  852  853  854 2690 2693 2694 2696 2697 4673 4674 4675 4676  855 4676 4677
+  846  856  857  858 2699 2705 2706 4684 4685 4686 4687  807  846  852  856  857
+ 2701 2702 2704 2705 4681 4682 4683 4684  856  858 4684 4685  859  860  862 2707
+ 2711 2712 4693 4709 4710  859  860  861  862 2710 2711 2727 2728 4689 4690 4691
+  860  861  863 2715 2716 2718 2727 4689 4697 4698  859  860  862 4691 4692 4709
+  846  861  863  865 2713 2714 2717 2718 4695 4696 4697 4700  864 2715 2718  863
+  865  866 2717 2718 2719 4699 4700 4701  865  866  868 2719 2720 2722 2723 4701
+ 4702 4703  867  868 4704 4705  866  867  868 2723 2724 2725 4703 4704 4705 4706
+ 4707  497  536  776  807  838  846  869  870  872  876  900  907  908  915  916
+  919 2729 2805 2806 2815 2816 2827 2828 4713 4800 4803 4804 4807 4808 4835 4836
+  869  870  872  873 2729 2730 2733 2734 4713 4714 4715 4718  871 2731 2734  869
+  870  872  876 2731 2732 2734 2741 4715 4716 4725  870  873  874 2733 2734 2735
+ 4717 4718 4719  873  874 2735 2736 2738 2739 4719 4720 4721  875 4722 4723  838
+  869  872  876  877 2741 2742 2744 2745 4725 4726 4773 4774  838  876  877  885
+  887  888  894 2745 2746 2757 2758 2761 2762 2775 4727 4728 4737 4738 4749 4750
+ 4757 4774  878 4727 4774  879  880 2747 4729  879  880  882  883 2747 2748 2751
+ 4729 4730 4731 4734  881 2749 2752  880  882  883  885 2749 2750 2751 2752 2755
+ 4731 4732 4735  880  882  883  885  888 2751 2752 2763 4733 4734 4743  838  884
+  885 2753 2759 2760 4738 4739 4740 4741  838  877  882  883  884  885  888 2755
+ 2756 2758 2759 4735 4736 4737 4738  886 4738 4739  838  877  887  888 2761 2767
+ 2768 4746 4747 4748 4749  838  877  883  885  887  888 2763 2764 2766 2767 4743
+ 4744 4745 4746  889 4746 4747  890  891  893 2769 2773 2774 4755 4771 4772  890
+  891  892  893 2772 2773 2789 2790 4751 4752 4753  891  892  894 2777 2778 2780
+ 2789 4751 4759 4760  890  891  893 4753 4754 4771  877  892  894  896 2775 2776
+ 2779 2780 4757 4758 4759 4762  895 2777 2780  894  896  897 2779 2780 2781 4761
+ 4762 4763  896  897  898  899 2781 2782 2784 2785 4763 4764 4765  897  898  899
+ 4765 4766 4767  897  898  899 2785 2786 2787 4767 4768 4769  869  900  901  907
+  908  934  938  939  946  947  950 2791 2867 2868 2877 2878 2889 2890 4775 4862
+ 4865 4866 4869 4870 4897 4898  900  901  903  904  907 2791 2792 2795 2796 4775
+ 4776 4777 4780  902 2793 2796  901  903  907 2793 2794 2796 2803 4777 4778 4787
+  901  904  905 2795 2796 2797 4779 4780 4781  904  905 2797 2798 2800 2801 4781
+ 4782 4783  906 4784 4785  869  900  901  903  907  908 2796 2803 2804 2806 2807
+ 4787 4788 4835  869  900  907  908  909  916  918  919  925 2807 2808 2819 2820
+ 2823 2824 2837 4789 4790 4799 4800 4808 4811 4812 4819 4835 4836  908  909 4789
+ 4836  910  911 2809 4791  910  911  913  914 2809 2810 2813 4791 4792 4793 4796
+  912 2811 2814  911  913  914  916 2811 2812 2813 2814 2817 4793 4794 4797  911
+  913  914  916  919 2813 2814 2825 4795 4796 4805  869  915  916 2815 2821 2822
+ 4800 4801 4802 4803  869  908  913  914  915  916  919 2817 2818 2820 2821 4797
+ 4798 4799 4800  917 4800 4801  908  918  919  920 2823 2829 2830 4808 4809 4810
+ 4811  869  908  914  916  918  919 2825 2826 2828 2829 4805 4806 4807 4808  918
+  920 4808 4809  921  922  924 2831 2835 2836 4817 4833 4834  921  922  923  924
+ 2834 2835 2851 2852 4813 4814 4815  922  923  925 2839 2840 2842 2851 4813 4821
+ 4822  921  922  924 4815 4816 4833  908  923  925  927 2837 2838 2841 2842 4819
+ 4820 4821 4824  926 2839 2842  925  927  928 2841 2842 2843 4823 4824 4825  927
+  928  930 2843 2844 2846 2847 4825 4826 4827  929  930 4828 4829  928  929  930
+ 2847 2848 2849 4827 4828 4829 4830 4831  931  932  969  970  975  977  980  981
+ 2853 2929 2930 2939 2940 2951 2952 4837 4927 4928 4931 4932 4959 4960  931  932
+  934  935 2853 2854 2857 2858 4837 4838 4839 4842  933 2855 2858  900  932  934
+  938  939 2855 2856 2858 2865 4839 4840 4849  932  935  936 2857 2858 2859 4841
+ 4842 4843  935  936 2859 2860 2862 2863 4843 4844 4845  937 4846 4847  900  934
+  938  939 2865 2866 2868 2869 4849 4850 4897  900  934  938  939  940  947  949
+  950  956 2869 2870 2881 2882 2885 2886 2899 4851 4852 4861 4862 4870 4873 4874
+ 4881 4897 4898  939  940 4851 4898  941  942 2871 4853  941  942  944  945 2871
+ 2872 2875 4853 4854 4855 4858  943 2873 2876  942  944  945  947  950 2873 2874
+ 2875 2876 2879 4855 4856 4859  942  944  945  950 2875 2876 2887 4857 4858 4867
+  900  946  947 2877 2883 2884 4862 4863 4864 4865  900  939  944  946  947  950
+ 2879 2880 2882 2883 4859 4860 4861 4862  948 4862 4863  939  949  950  951 2885
+ 2891 2892 4870 4871 4872 4873  900  939  944  945  947  949  950 2887 2888 2890
+ 2891 4867 4868 4869 4870  949  951 4870 4871  952  953  955 2893 2897 2898 4879
+ 4895 4896  952  953  954  955 2896 2897 2913 2914 4875 4876 4877  953  954  956
+ 2901 2902 2904 2913 4875 4883 4884  952  953  955 4877 4878 4895  939  954  956
+  958 2899 2900 2903 2904 4881 4882 4883 4886  957 2901 2904  956  958  959 2903
+ 2904 2905 4885 4886 4887  958  959  961 2905 2906 2908 2909 4887 4888 4889  960
+  961 4890 4891  959  960  961 2909 2910 2911 4889 4890 4891 4892 4893  962 2915
+ 4899  963  965  966 2915 2916 2919 4899 4900 4901 4904  964 2917 2920  963  965
+  966  969 2917 2918 2919 2920 2927 4901 4902 4911  963  965  966  967 2919 2920
+ 2921 4903 4904 4905  966  967 2921 2922 2924 2925 4905 4906 4907  968 4908 4909
+  931  965  969  970 2927 2928 2930 2931 4911 4912 4959  931  969  970  975  977
+  978  980  981  987 2931 2932 2943 2944 2947 2948 2961 4913 4914 4923 4924 4935
+ 4936 4943 4959 4960  971 4913 4960  972 2933 4915  973  975  976 2933 2934 2937
+ 4915 4916 4917 4920  974 2935 2938  931  970  973  975  976  977  978  981 2935
+ 2936 2937 2938 2941 4917 4918 4921  973  975  976  981 2937 2938 2949 4919 4920
+ 4929  931  970  975  977  978  979 2939 2945 2946 4923 4924 4925 4926 4927  970
+  975  977  978 2941 2942 2944 2945 4921 4922 4923  977  979 4924 4925  931  970
+  980  981  982 2947 2953 2954 4931 4932 4933 4934 4935  931  970  975  976  980
+  981 2949 2950 2952 2953 4929 4930 4931  980  982 4932 4933  983  984  986 2955
+ 2959 2960 4939 4940 4941 4957 4958  983  984  985 2958 2959 2975 2976 4937 4938
+ 4939  984  985  987 2963 2964 2966 2975 4937 4945 4946  983  986 4940 4957  970
+  985  987  989 2961 2962 2965 2966 4943 4944 4945 4948  988 2963 2966  987  989
+  990 2965 2966 2967 4947 4948 4949  989  990  991  992 2967 2968 2970 2971 4949
+ 4950 4951  990  991  992 4951 4952 4953  990  991  992 2971 2972 2973 4953 4954
+ 4955    1    2  993  994    2  993  994    3    4  995  996  998    4  995  996
+    2    5  997  998    2    3    4    5  995  997  998    5    6  999 1000 2983
+    6  999 1000 1001 1002    6 1001 1002    6 1003 1004 1003 1004    4    8 1005
+ 1006    8 1005 1006 1007 1008    8 1007 1008    8    9 1009 1010    9 1009 1010
+   11   12 1011 1012 2993   12 1011 1012   13   14 1013 1014 1016   14 1013 1014
+   12   14   15 1015 1016   13   14   15 1013 1015 1016   16 1017 1018 1017 1018
+   14   17 1019 1020   17 1019 1020    9 1021 1022    9   17 1021 1022   16   17
+ 1023 1024   16 1023 1024    9   19 1025 1026    9 1025 1026   15   20 1027 1028
+   20 1027 1028 1029 1030   20 1029 1030   19   20 1031 1032   19 1031 1032   22
+ 1033 1034 1033 1034 1035 1036   23 1035 1036   22   23 1037 1038   22 1037 1038
+    9   26 1039 1040   26 1039 1040   24   27 1041 1042 1044   24 1041 1042   26
+   28 1043 1044   24   26   27   28 1041 1043 1044   28   29 1045 1046   29 1045
+ 1046 1047 1048   29 1047 1048   29   31 1049 1050   31 1049 1050   31 1051 1052
+ 1051 1052   23   24 1053 1054   23 1053 1054   32   33 1055 1056   33 1055 1056
+   34   35 1057 1058 1060   35 1057 1058   33   35   36 1059 1060   34   35   36
+ 1057 1059 1060   36   37 1061 1062   37 1061 1062 1063 1064   37 1063 1064   37
+ 1065 1066 1065 1066   35   39 1067 1068   39 1067 1068    1 1069 1070    1   39
+ 1069 1070   39   40 1071 1072   40 1071 1072   42   43 1073 1074   43 1073 1074
+   44   45 1075 1076 1078   45 1075 1076   43   46 1077 1078   43   44   45   46
+   48 1075 1077 1078    1   47 1079 1080    1 1079 1080   45   48 1081 1082   48
+ 1081 1082   40 1083 1084   40   48 1083 1084   47   48 1085 1086   47 1085 1086
+   40   50 1087 1088   40 1087 1088   46   51 1089 1090   51 1089 1090    1 1091
+ 1092    1   51 1091 1092   50   51 1093 1094   50 1093 1094   53 1095 1096 1095
+ 1096 1097 1098   54 1097 1098   53   54 1099 1100   53 1099 1100   40   57 1101
+ 1102   57 1101 1102   55   58 1103 1104 1106   55 1103 1104   57   59 1105 1106
+   55   57   58   59 1103 1105 1106   59   60 1107 1108   60 1107 1108 1109 1110
+   60 1109 1110   60   62 1111 1112   62 1111 1112   62 1113 1114 1113 1114   54
+   55 1115 1116   54 1115 1116   63   64 1117 1118   64 1117 1118   65   66 1119
+ 1120 1122   66 1119 1120   64   67 1121 1122   64   65   66   67   70 1119 1121
+ 1122   67   68 1123 1124   68 1123 1124 1125 1126   68 1125 1126   68 1127 1128
+ 1127 1128   66   70 1129 1130   70 1129 1130   32 1131 1132   32   70 1131 1132
+   70   71 1133 1134   71 1133 1134   73   74 1135 1136   74 1135 1136   75   76
+ 1137 1138 1140   76 1137 1138   74   77 1139 1140   74   75   76   77 1137 1139
+ 1140   32   78 1141 1142   32 1141 1142   76   79 1143 1144   79 1143 1144   71
+ 1145 1146   71   79 1145 1146   78   79 1147 1148   78 1147 1148   71   81 1149
+ 1150   71 1149 1150   77   82 1151 1152   82 1151 1152   32 1153 1154   32   82
+ 1153 1154   81   82 1155 1156   81 1155 1156   84 1157 1158 1157 1158 1159 1160
+   85 1159 1160   84   85 1161 1162   84 1161 1162   71   88 1163 1164   88 1163
+ 1164   86   89 1165 1166 1168   86 1165 1166   88   90 1167 1168   86   88   89
+   90 1165 1167 1168   90   91 1169 1170   91 1169 1170 1171 1172   91 1171 1172
+   91   93 1173 1174   93 1173 1174   93 1175 1176 1175 1176   85   86 1177 1178
+   85 1177 1178   94   95 1179 1180   95 1179 1180   96   97 1181 1182 1184   97
+ 1181 1182   95   98 1183 1184   95   96   97   98 1181 1183 1184   98   99 1185
+ 1186   99 1185 1186 1187 1188   99 1187 1188   99 1189 1190 1189 1190   97  101
+ 1191 1192  101 1191 1192   63 1193 1194   63  101 1193 1194  101  102 1195 1196
+  102 1195 1196  104  105 1197 1198  105 1197 1198  106  107 1199 1200 1202  107
+ 1199 1200  105  108 1201 1202  105  106  107  108  110  113 1199 1201 1202   63
+  109 1203 1204   63 1203 1204  107  110 1205 1206  110 1205 1206  102 1207 1208
+  102  110 1207 1208  109  110 1209 1210  109 1209 1210  102  112 1211 1212  102
+ 1211 1212  108  113 1213 1214  113 1213 1214   63 1215 1216   63  113 1215 1216
+  112  113 1217 1218  112 1217 1218  115 1219 1220 1219 1220 1221 1222  116 1221
+ 1222  115  116 1223 1224  115 1223 1224  102  119 1225 1226  119 1225 1226  117
+  120 1227 1228  117 1227 1228  119  121 1229 1230  117  119  120  121 1229 1230
+  121  122 1231 1232  122 1231 1232 1233 1234  122 1233 1234  122  124 1235 1236
+  124 1235 1236  124 1237 1238 1237 1238  116  117 1239 1240  116 1239 1240  125
+  126 1241 1242  126 1241 1242  127  128 1243 1244  128 1243 1244  126  129 1245
+ 1246  126  127  128  129 1245 1246  129  130 1247 1248  130 1247 1248 1249 1250
+  130 1249 1250  130 1251 1252 1251 1252  128  132 1253 1254  132 1253 1254   94
+ 1255 1256   94  132 1255 1256  132  133 1257 1258  133 1257 1258  135  136 1259
+ 1260 3241  136 1259 1260  137  138 1261 1262 1264  138 1261 1262  136  138  139
+ 1263 1264  137  138  139 1261 1263 1264   94  140 1265 1266   94 1265 1266  138
+  141 1267 1268  141 1267 1268  133 1269 1270  133  141 1269 1270  140  141 1271
+ 1272  140 1271 1272  133  143 1273 1274  133 1273 1274  139  144 1275 1276  144
+ 1275 1276   94 1277 1278   94  144 1277 1278  143  144 1279 1280  143 1279 1280
+  146 1281 1282 1281 1282 1283 1284  147 1283 1284  146  147 1285 1286  146 1285
+ 1286  133  150 1287 1288  150 1287 1288  148  150  151 1289 1290  148 1289 1290
+  150  152 1291 1292  150  151  152 1291 1292  152  153 1293 1294  153 1293 1294
+ 1295 1296  153 1295 1296  153  155 1297 1298  155 1297 1298  155 1299 1300 1299
+ 1300  147  148 1301 1302  147 1301 1302  156  157 1303 1304  157 1303 1304  158
+  159 1305 1306  159 1305 1306  157  159  160 1307 1308  158  159  160 1307 1308
+  160  161 1309 1310  161 1309 1310 1311 1312  161 1311 1312  161 1313 1314 1313
+ 1314  159  163 1315 1316  163 1315 1316  125 1317 1318  125  163 1317 1318  163
+  164 1319 1320  164 1319 1320  166  167 1321 1322 3303  167 1321 1322  168  169
+ 1323 1324  169 1323 1324  167  170 1325 1326  167  168  169  170 1325 1326  125
+  171 1327 1328  125 1327 1328  169  172 1329 1330  172 1329 1330  164 1331 1332
+  164  172 1331 1332  171  172 1333 1334  171 1333 1334  164  174 1335 1336  164
+ 1335 1336  170  175 1337 1338  175 1337 1338  125 1339 1340  125  175 1339 1340
+  174  175 1341 1342  174 1341 1342  177 1343 1344 1343 1344 1345 1346  178 1345
+ 1346  177  178 1347 1348  177 1347 1348  164  181 1349 1350  181 1349 1350  179
+  182 1351 1352 1354  179 1351 1352  181  183 1353 1354  179  181  182  183 1351
+ 1353 1354  183  184 1355 1356  184 1355 1356 1357 1358  184 1357 1358  184  186
+ 1359 1360  186 1359 1360  186 1361 1362 1361 1362  178  179 1363 1364  178 1363
+ 1364  187  188 1365 1366  188 1365 1366  189  190 1367 1368  190 1367 1368  188
+  190  191 1369 1370  189  190  191 1369 1370  191  192 1371 1372  192 1371 1372
+ 1373 1374  192 1373 1374  192 1375 1376 1375 1376  190  194 1377 1378  194 1377
+ 1378  156 1379 1380  156  194 1379 1380  194  195 1381 1382  195 1381 1382  197
+  198 1383 1384 3365  198 1383 1384  199  200 1385 1386 1388  200 1385 1386  198
+  201 1387 1388  198  199  200  201  203  206 1385 1387 1388  156  202 1389 1390
+  156 1389 1390  200  203 1391 1392  203 1391 1392  195 1393 1394  195  203 1393
+ 1394  202  203 1395 1396  202 1395 1396  195  205 1397 1398  195 1397 1398  201
+  206 1399 1400  206 1399 1400  156 1401 1402  156  206 1401 1402  205  206 1403
+ 1404  205 1403 1404  208 1405 1406 1405 1406 1407 1408  209 1407 1408  208  209
+ 1409 1410  208 1409 1410  195  212 1411 1412  212 1411 1412  210  213 1413 1414
+  210 1413 1414  212  214 1415 1416  210  212  213  214 1415 1416 3395  214  215
+ 1417 1418  215 1417 1418 1419 1420  215 1419 1420  215  217 1421 1422  217 1421
+ 1422  217 1423 1424 1423 1424  209  210 1425 1426  209 1425 1426  218  219 1427
+ 1428  219 1427 1428  220  221 1429 1430  221 1429 1430  219  222 1431 1432  219
+  220  221  222 1431 1432  222  223 1433 1434  223 1433 1434 1435 1436  223 1435
+ 1436  223 1437 1438 1437 1438  221  225 1439 1440  225 1439 1440  187 1441 1442
+  187  225 1441 1442  225  226 1443 1444  226 1443 1444  228  229 1445 1446  229
+ 1445 1446  230  231 1447 1448 1450  231 1447 1448  229  232 1449 1450  229  230
+  231  232 1447 1449 1450  187  233 1451 1452  187 1451 1452  231  234 1453 1454
+  234 1453 1454  226 1455 1456  226  234 1455 1456  233  234 1457 1458  233 1457
+ 1458  226  236 1459 1460  226 1459 1460  232  237 1461 1462  237 1461 1462  187
+ 1463 1464  187  237 1463 1464  236  237 1465 1466  236 1465 1466  239 1467 1468
+ 1467 1468 1469 1470  240 1469 1470  239  240 1471 1472  239 1471 1472  226  243
+ 1473 1474  243 1473 1474  241  244 1475 1476 1478  241 1475 1476  243  245 1477
+ 1478  241  243  244  245 1475 1477 1478  245  246 1479 1480  246 1479 1480 1481
+ 1482  246 1481 1482  246  248 1483 1484  248 1483 1484  248 1485 1486 1485 1486
+  240  241 1487 1488  240 1487 1488  249  250 1489 1490  250 1489 1490  251  252
+ 1491 1492 1494  252 1491 1492  250  253 1493 1494  250  251  252  253 1491 1493
+ 1494  253  254 1495 1496  254 1495 1496 1497 1498  254 1497 1498  254 1499 1500
+ 1499 1500  252  256 1501 1502  256 1501 1502  218 1503 1504  218  256 1503 1504
+  256  257 1505 1506  257 1505 1506  259  260 1507 1508  260 1507 1508  261  262
+ 1509 1510 1512  262 1509 1510  260  263 1511 1512  260  261  262  263  265  268
+ 1509 1511 1512  218  264 1513 1514  218 1513 1514  262  265 1515 1516  265 1515
+ 1516  257 1517 1518  257  265 1517 1518  264  265 1519 1520  264 1519 1520  257
+  267 1521 1522  257 1521 1522  263  268 1523 1524  268 1523 1524  218 1525 1526
+  218  268 1525 1526  267  268 1527 1528  267 1527 1528  270 1529 1530 1529 1530
+ 1531 1532  271 1531 1532  270  271 1533 1534  270 1533 1534  257  274 1535 1536
+  274 1535 1536  272  275 1537 1538 1540  272 1537 1538  274  276 1539 1540  272
+  274  275  276 1537 1539 1540  276  277 1541 1542  277 1541 1542 1543 1544  277
+ 1543 1544  277  279 1545 1546  279 1545 1546  279 1547 1548 1547 1548  271  272
+ 1549 1550  271 1549 1550  280  281 1551 1552  281 1551 1552  282  283 1553 1554
+ 1556  283 1553 1554  281  284 1555 1556  281  282  283  284 1553 1555 1556  284
+  285 1557 1558  285 1557 1558 1559 1560  285 1559 1560  285 1561 1562 1561 1562
+  283  287 1563 1564  287 1563 1564  249 1565 1566  249  287 1565 1566  287  288
+ 1567 1568  288 1567 1568  290  291 1569 1570  291 1569 1570  292  293 1571 1572
+ 1574  293 1571 1572  291  294 1573 1574  291  292  293  294 1571 1573 1574  249
+  295 1575 1576  249 1575 1576  293  296 1577 1578  296 1577 1578  288 1579 1580
+  288  296 1579 1580  295  296 1581 1582  295 1581 1582  288  298 1583 1584  288
+ 1583 1584  294  299 1585 1586  299 1585 1586  249 1587 1588  249  299 1587 1588
+  298  299 1589 1590  298 1589 1590  301 1591 1592 1591 1592 1593 1594  302 1593
+ 1594  301  302 1595 1596  301 1595 1596  288  305 1597 1598  305 1597 1598  303
+  306 1599 1600 1602  303 1599 1600  305  307 1601 1602  303  305  306  307 1599
+ 1601 1602  307  308 1603 1604  308 1603 1604 1605 1606  308 1605 1606  308  310
+ 1607 1608  310 1607 1608  310 1609 1610 1609 1610  302  303 1611 1612  302 1611
+ 1612  311  312 1613 1614  312 1613 1614  313  314 1615 1616 1618  314 1615 1616
+  312  315 1617 1618  312  313  314  315 1615 1617 1618  315  316 1619 1620  316
+ 1619 1620 1621 1622  316 1621 1622  316 1623 1624 1623 1624  314  318 1625 1626
+  318 1625 1626  280 1627 1628  280  318 1627 1628  318  319 1629 1630  319 1629
+ 1630  321  322 1631 1632  322 1631 1632  323  324 1633 1634 1636  324 1633 1634
+  322  324  325 1635 1636  323  324  325 1633 1635 1636  280  326 1637 1638  280
+ 1637 1638  324  327 1639 1640  327 1639 1640  319 1641 1642  319  327 1641 1642
+  326  327 1643 1644  326 1643 1644  319  329 1645 1646  319 1645 1646  325  330
+ 1647 1648  330 1647 1648  280 1649 1650  280  330 1649 1650  329  330 1651 1652
+  329 1651 1652  332 1653 1654 1653 1654 1655 1656  333 1655 1656  332  333 1657
+ 1658  332 1657 1658  319  336 1659 1660  336 1659 1660  334  337 1661 1662 1664
+  334 1661 1662  336  338 1663 1664  334  336  337  338 1661 1663 1664  338  339
+ 1665 1666  339 1665 1666 1667 1668  339 1667 1668  339  341 1669 1670  341 1669
+ 1670  341 1671 1672 1671 1672  333  334 1673 1674  333 1673 1674  342  343 1675
+ 1676  343 1675 1676  344  345 1677 1678 1680  345 1677 1678  343  346 1679 1680
+  343  344  345  346 1677 1679 1680  346  347 1681 1682  347 1681 1682 1683 1684
+  347 1683 1684  347 1685 1686 1685 1686  345  349 1687 1688  349 1687 1688  311
+ 1689 1690  311  349 1689 1690  349  350 1691 1692  350 1691 1692  352  353 1693
+ 1694  353 1693 1694  354  355 1695 1696 1698  355 1695 1696  353  355  356 1697
+ 1698  354  355  356 1695 1697 1698  311  357 1699 1700  311 1699 1700  355  358
+ 1701 1702  358 1701 1702  350 1703 1704  350  358 1703 1704  357  358 1705 1706
+  357 1705 1706  350  360 1707 1708  350 1707 1708  356  361 1709 1710  361 1709
+ 1710  311 1711 1712  311  361 1711 1712  360  361 1713 1714  360 1713 1714  363
+ 1715 1716 1715 1716 1717 1718  364 1717 1718  363  364 1719 1720  363 1719 1720
+  350  367 1721 1722  367 1721 1722  365  368 1723 1724 1726  365 1723 1724  367
+  369 1725 1726  365  367  368  369 1723 1725 1726  369  370 1727 1728  370 1727
+ 1728 1729 1730  370 1729 1730  370  372 1731 1732  372 1731 1732  372 1733 1734
+ 1733 1734  364  365 1735 1736  364 1735 1736  373  374 1737 1738  374 1737 1738
+  375  376 1739 1740 1742  376 1739 1740  374  377 1741 1742  374  375  376  377
+ 1739 1741 1742  377  378 1743 1744  378 1743 1744 1745 1746  378 1745 1746  378
+ 1747 1748 1747 1748  376  380 1749 1750  380 1749 1750  342 1751 1752  342  380
+ 1751 1752  380  381 1753 1754  381 1753 1754  383  384 1755 1756  384 1755 1756
+  385  386 1757 1758 1760  386 1757 1758  384  386  387 1759 1760  385  386  387
+ 1757 1759 1760  342  388 1761 1762  342 1761 1762  386  389 1763 1764  389 1763
+ 1764  381 1765 1766  381  389 1765 1766  388  389 1767 1768  388 1767 1768  381
+  391 1769 1770  381 1769 1770  387  392 1771 1772  392 1771 1772  342 1773 1774
+  342  392 1773 1774  391  392 1775 1776  391 1775 1776  394 1777 1778 1777 1778
+ 1779 1780  395 1779 1780  394  395 1781 1782  394 1781 1782  381  398 1783 1784
+  398 1783 1784  396  399 1785 1786 1788  396 1785 1786  398  400 1787 1788  396
+  398  399  400 1785 1787 1788  400  401 1789 1790  401 1789 1790 1791 1792  401
+ 1791 1792  401  403 1793 1794  403 1793 1794  403 1795 1796 1795 1796  395  396
+ 1797 1798  395 1797 1798  404  405 1799 1800  405 1799 1800  406  407 1801 1802
+ 1804  407 1801 1802  405  408 1803 1804  405  406  407  408 1801 1803 1804  408
+  409 1805 1806  409 1805 1806 1807 1808  409 1807 1808  409 1809 1810 1809 1810
+  407  411 1811 1812  411 1811 1812  373 1813 1814  373  411 1813 1814  411  412
+ 1815 1816  412 1815 1816  414  415 1817 1818  415 1817 1818  416  417 1819 1820
+ 1822  417 1819 1820  415  417  418 1821 1822  416  417  418 1819 1821 1822  373
+  419 1823 1824  373 1823 1824  417  420 1825 1826  420 1825 1826  412 1827 1828
+  412  420 1827 1828  419  420 1829 1830  419 1829 1830  412  422 1831 1832  412
+ 1831 1832  418  423 1833 1834  423 1833 1834  373 1835 1836  373  423 1835 1836
+  422  423 1837 1838  422 1837 1838  425 1839 1840 1839 1840 1841 1842  426 1841
+ 1842  425  426 1843 1844  425 1843 1844  412  429 1845 1846  429 1845 1846  427
+  430 1847 1848 1850  427 1847 1848  429  431 1849 1850  427  429  430  431 1847
+ 1849 1850  431  432 1851 1852  432 1851 1852 1853 1854  432 1853 1854  432  434
+ 1855 1856  434 1855 1856  434 1857 1858 1857 1858  426  427 1859 1860  426 1859
+ 1860  435  436 1861 1862  436 1861 1862  437  438 1863 1864 1866  438 1863 1864
+  436  439 1865 1866  436  437  438  439  442 1863 1865 1866  439  440 1867 1868
+  440 1867 1868 1869 1870  440 1869 1870  440 1871 1872 1871 1872  438  442 1873
+ 1874  442 1873 1874  404 1875 1876  404  442 1875 1876  442  443 1877 1878  443
+ 1877 1878  445  446 1879 1880  446 1879 1880  447  448 1881 1882 1884  448 1881
+ 1882  446  449 1883 1884  446  447  448  449  451 1881 1883 1884  404  450 1885
+ 1886  404 1885 1886  448  451 1887 1888  451 1887 1888  443 1889 1890  443  451
+ 1889 1890  450  451 1891 1892  450 1891 1892  443  453 1893 1894  443 1893 1894
+  449  454 1895 1896  454 1895 1896  404 1897 1898  404  454 1897 1898  453  454
+ 1899 1900  453 1899 1900  456 1901 1902 1901 1902 1903 1904  457 1903 1904  456
+  457 1905 1906  456 1905 1906  443  460 1907 1908  460 1907 1908  458  461 1909
+ 1910 1912  458 1909 1910  460  462 1911 1912  458  460  461  462 1909 1911 1912
+  462  463 1913 1914  463 1913 1914 1915 1916  463 1915 1916  463  465 1917 1918
+  465 1917 1918  465 1919 1920 1919 1920  457  458 1921 1922  457 1921 1922  466
+  467 1923 1924  467 1923 1924  468  469 1925 1926 1928  469 1925 1926  467  470
+ 1927 1928  467  468  469  470  473 1925 1927 1928  470  471 1929 1930  471 1929
+ 1930 1931 1932  471 1931 1932  471 1933 1934 1933 1934  469  473 1935 1936  473
+ 1935 1936  435 1937 1938  435  473 1937 1938  473  474 1939 1940  474 1939 1940
+  476  477 1941 1942  477 1941 1942  478  479 1943 1944 1946  479 1943 1944  477
+  479  480 1945 1946  478  479  480 1943 1945 1946  435  481 1947 1948  435 1947
+ 1948  479  482 1949 1950  482 1949 1950  474 1951 1952  474  482 1951 1952  481
+  482 1953 1954  481 1953 1954  474  484 1955 1956  474 1955 1956  480  485 1957
+ 1958  485 1957 1958  435 1959 1960  435  485 1959 1960  484  485 1961 1962  484
+ 1961 1962  487 1963 1964 1963 1964 1965 1966  488 1965 1966  487  488 1967 1968
+  487 1967 1968  474  491 1969 1970  491 1969 1970  489  492 1971 1972 1974  489
+ 1971 1972  491  493 1973 1974  489  491  492  493 1971 1973 1974 3953  493  494
+ 1975 1976  494 1975 1976 1977 1978  494 1977 1978  494  496 1979 1980  496 1979
+ 1980  496 1981 1982 1981 1982  488  489 1983 1984  488 1983 1984  497  498 1985
+ 1986  498 1985 1986  499  500 1987 1988 1990  500 1987 1988  498  501 1989 1990
+  498  499  500  501  504 1987 1989 1990  501  502 1991 1992  502 1991 1992 1993
+ 1994  502 1993 1994  502 1995 1996 1995 1996  500  504 1997 1998  504 1997 1998
+  466 1999 2000  466  504 1999 2000  504  505 2001 2002  505 2001 2002  507  508
+ 2003 2004  508 2003 2004  509  510 2005 2006 2008  510 2005 2006  508  510  511
+ 2007 2008  509  510  511 2005 2007 2008  466  512 2009 2010  466 2009 2010  510
+  513 2011 2012  513 2011 2012  505 2013 2014  505  513 2013 2014  512  513 2015
+ 2016  512 2015 2016  505  515 2017 2018  505 2017 2018  511  516 2019 2020  516
+ 2019 2020  466 2021 2022  466  516 2021 2022  515  516 2023 2024  515 2023 2024
+  518 2025 2026 2025 2026 2027 2028  519 2027 2028  518  519 2029 2030  518 2029
+ 2030  505  522 2031 2032  522 2031 2032  520  523 2033 2034 2036  520 2033 2034
+  522  524 2035 2036  520  522  523  524 2033 2035 2036  524  525 2037 2038  525
+ 2037 2038 2039 2040  525 2039 2040  525  527 2041 2042  527 2041 2042  527 2043
+ 2044 2043 2044  519  520 2045 2046  519 2045 2046  528  529 2047 2048  529 2047
+ 2048  530  531 2049 2050 2052  531 2049 2050  529  532 2051 2052  529  530  531
+  532 2049 2051 2052  532  533 2053 2054  533 2053 2054 2055 2056  533 2055 2056
+  533 2057 2058 2057 2058  531  535 2059 2060  535 2059 2060  497 2061 2062  497
+  535 2061 2062  535  536 2063 2064  536 2063 2064  538  539 2065 2066  539 2065
+ 2066  540  541 2067 2068 2070  541 2067 2068  539  542 2069 2070  539  540  541
+  542 2067 2069 2070  497  543 2071 2072  497 2071 2072  541  544 2073 2074  544
+ 2073 2074  536 2075 2076  536  544 2075 2076  543  544 2077 2078  543 2077 2078
+  536  546 2079 2080  536 2079 2080  542  547 2081 2082  547 2081 2082  497 2083
+ 2084  497  547 2083 2084  546  547 2085 2086  546 2085 2086  549 2087 2088 2087
+ 2088 2089 2090  550 2089 2090  549  550 2091 2092  549 2091 2092  536  553 2093
+ 2094  553 2093 2094  551  554 2095 2096 2098  551 2095 2096  553  555 2097 2098
+  551  553  554  555 2095 2097 2098  555  556 2099 2100  556 2099 2100 2101 2102
+  556 2101 2102  556  558 2103 2104  558 2103 2104  558 2105 2106 2105 2106  550
+  551 2107 2108  550 2107 2108  559  560 2109 2110  560 2109 2110  561  562 2111
+ 2112 2114  562 2111 2112  560  563 2113 2114  560  561  562  563  566 2111 2113
+ 2114  563  564 2115 2116  564 2115 2116 2117 2118  564 2117 2118  564 2119 2120
+ 2119 2120  562  566 2121 2122  566 2121 2122  528 2123 2124  528  566 2123 2124
+  566  567 2125 2126  567 2125 2126  569  570 2127 2128  570 2127 2128  571  572
+ 2129 2130 2132  572 2129 2130  570  572  573 2131 2132  571  572  573 2129 2131
+ 2132  528  574 2133 2134  528 2133 2134  572  575 2135 2136  575 2135 2136  567
+ 2137 2138  567  575 2137 2138  574  575 2139 2140  574 2139 2140  567  577 2141
+ 2142  567 2141 2142  573  578 2143 2144  578 2143 2144  528 2145 2146  528  578
+ 2145 2146  577  578 2147 2148  577 2147 2148  580 2149 2150 2149 2150 2151 2152
+  581 2151 2152  580  581 2153 2154  580 2153 2154  567  584 2155 2156  584 2155
+ 2156  582  585 2157 2158 2160  582 2157 2158  584  586 2159 2160  582  584  585
+  586 2157 2159 2160  586  587 2161 2162  587 2161 2162 2163 2164  587 2163 2164
+  587  589 2165 2166  589 2165 2166  589 2167 2168 2167 2168  581  582 2169 2170
+  581 2169 2170  590  591 2171 2172  591 2171 2172  592  593 2173 2174 2176  593
+ 2173 2174  591  594 2175 2176  591  592  593  594 2173 2175 2176  594  595 2177
+ 2178  595 2177 2178 2179 2180  595 2179 2180  595 2181 2182 2181 2182  593  597
+ 2183 2184  597 2183 2184  559 2185 2186  559  597 2185 2186  597  598 2187 2188
+  598 2187 2188  600  601 2189 2190  601 2189 2190  602  603 2191 2192 2194  603
+ 2191 2192  601  603  604 2193 2194  602  603  604 2191 2193 2194  559  605 2195
+ 2196  559 2195 2196  603  606 2197 2198  606 2197 2198  598 2199 2200  598  606
+ 2199 2200  605  606 2201 2202  605 2201 2202  598  608 2203 2204  598 2203 2204
+  604  609 2205 2206  609 2205 2206  559 2207 2208  559  609 2207 2208  608  609
+ 2209 2210  608 2209 2210  611 2211 2212 2211 2212 2213 2214  612 2213 2214  611
+  612 2215 2216  611 2215 2216  598  615 2217 2218  615 2217 2218  613  616 2219
+ 2220 2222  613 2219 2220  615  617 2221 2222  613  615  616  617 2219 2221 2222
+  617  618 2223 2224  618 2223 2224 2225 2226  618 2225 2226  618  620 2227 2228
+  620 2227 2228  620 2229 2230 2229 2230  612  613 2231 2232  612 2231 2232  621
+  622 2233 2234  622 2233 2234  623  624 2235 2236 2238  624 2235 2236  622  624
+  625 2237 2238  623  624  625 2235 2237 2238  625  626 2239 2240  626 2239 2240
+ 2241 2242  626 2241 2242  626 2243 2244 2243 2244  624  628 2245 2246  628 2245
+ 2246  590 2247 2248  590  628 2247 2248  628  629 2249 2250  629 2249 2250  631
+  632 2251 2252 4233  632 2251 2252  633  634 2253 2254 2256  634 2253 2254  632
+  634  635 2255 2256  633  634  635 2253 2255 2256  590  636 2257 2258  590 2257
+ 2258  634  637 2259 2260  637 2259 2260  629 2261 2262  629  637 2261 2262  636
+  637 2263 2264  636 2263 2264  629  639 2265 2266  629 2265 2266  635  640 2267
+ 2268  640 2267 2268  590 2269 2270  590  640 2269 2270  639  640 2271 2272  639
+ 2271 2272  642 2273 2274 2273 2274 2275 2276  643 2275 2276  642  643 2277 2278
+ 4258  642 2277 2278  629  646 2279 2280  646 2279 2280  644  647 2281 2282 2284
+  644 2281 2282  646  648 2283 2284  644  646  647  648 2281 2283 2284  648  649
+ 2285 2286  649 2285 2286 2287 2288  649 2287 2288  649  651 2289 2290 4271  651
+ 2289 2290  651 2291 2292 2291 2292  643  644 2293 2294  643 2293 2294  652  653
+ 2295 2296  653 2295 2296  654  655 2297 2298 2300  655 2297 2298  653  655  656
+ 2299 2300  654  655  656 2297 2299 2300  656  657 2301 2302  657 2301 2302 2303
+ 2304  657 2303 2304  657 2305 2306 2305 2306  655  659 2307 2308  659 2307 2308
+  621 2309 2310  621  659 2309 2310  659  660 2311 2312  660 2311 2312  662  663
+ 2313 2314 4295  663 2313 2314  664  665 2315 2316 2318  665 2315 2316  663  665
+  666 2317 2318  664  665  666 2315 2317 2318  621  667 2319 2320  621 2319 2320
+  665  668 2321 2322  668 2321 2322  660 2323 2324  660  668 2323 2324  667  668
+ 2325 2326  667 2325 2326  660  670 2327 2328  660 2327 2328  666  671 2329 2330
+  671 2329 2330  621 2331 2332  621  671 2331 2332  670  671 2333 2334  670 2333
+ 2334  673 2335 2336 2335 2336 2337 2338  674 2337 2338  673  674 2339 2340  673
+ 2339 2340  660  677 2341 2342  677 2341 2342  675  678 2343 2344 2346  675 2343
+ 2344  677  679 2345 2346  675  677  678  679 2343 2345 2346  679  680 2347 2348
+  680 2347 2348 2349 2350  680 2349 2350  680  682 2351 2352  682 2351 2352  682
+ 2353 2354 2353 2354  674  675 2355 2356  674 2355 2356  683  684 2357 2358  684
+ 2357 2358  685  686 2359 2360 2362  686 2359 2360  684  686  687 2361 2362  685
+  686  687 2359 2361 2362  687  688 2363 2364 4347  688 2363 2364 2365 2366  688
+ 2365 2366  688 2367 2368 2367 2368  686  690 2369 2370  690 2369 2370  652 2371
+ 2372  652  690 2371 2372  690  691 2373 2374  691 2373 2374  693  694 2375 2376
+ 4357  694 2375 2376  695  696 2377 2378 2380  696 2377 2378  694  697 2379 2380
+  694  695  696  697  699  702 2377 2379 2380  652  698 2381 2382  652 2381 2382
+  696  699 2383 2384  699 2383 2384  691 2385 2386  691  699 2385 2386  698  699
+ 2387 2388  698 2387 2388  691  701 2389 2390  691 2389 2390  697  702 2391 2392
+  702 2391 2392  652 2393 2394  652  702 2393 2394  701  702 2395 2396  701 2395
+ 2396  704 2397 2398 2397 2398 2399 2400  705 2399 2400  704  705 2401 2402 4399
+  704 2401 2402  691  708 2403 2404  708 2403 2404  706  709 2405 2406 2408  706
+ 2405 2406  708  710 2407 2408  706  708  709  710 2405 2407 2408  710  711 2409
+ 2410  711 2409 2410 2411 2412  711 2411 2412  711  713 2413 2414 4394  713 2413
+ 2414  713 2415 2416 2415 2416  705  706 2417 2418  705 2417 2418  714  715 2419
+ 2420  715 2419 2420  716  717 2421 2422 2424  717 2421 2422  715  717  718 2423
+ 2424  716  717  718 2421 2423 2424  718  719 2425 2426  719 2425 2426 2427 2428
+  719 2427 2428  719 2429 2430 2429 2430  717  721 2431 2432  721 2431 2432  683
+ 2433 2434  683  721 2433 2434  721  722 2435 2436  722 2435 2436  724  725 2437
+ 2438 4419  725 2437 2438  726  727 2439 2440 2442  727 2439 2440  725  728 2441
+ 2442  725  726  727  728 2439 2441 2442  683  729 2443 2444  683 2443 2444  727
+  730 2445 2446  730 2445 2446  722 2447 2448  722  730 2447 2448  729  730 2449
+ 2450  729 2449 2450  722  732 2451 2452  722 2451 2452  728  733 2453 2454  733
+ 2453 2454  683 2455 2456  683  733 2455 2456  732  733 2457 2458  732 2457 2458
+  735 2459 2460 2459 2460 2461 2462  736 2461 2462  735  736 2463 2464  735 2463
+ 2464  722  739 2465 2466  739 2465 2466  737  740 2467 2468 2470  737 2467 2468
+  739  741 2469 2470  737  739  740  741 2467 2469 2470  741  742 2471 2472  742
+ 2471 2472 2473 2474  742 2473 2474  742  744 2475 2476  744 2475 2476  744 2477
+ 2478 2477 2478  736  737 2479 2480  736 2479 2480  745  746 2481 2482  746 2481
+ 2482  747  748 2483 2484 2486  748 2483 2484  746  748  749 2485 2486  747  748
+  749 2483 2485 2486  749  750 2487 2488  750 2487 2488 2489 2490  750 2489 2490
+  750 2491 2492 2491 2492  748  752 2493 2494  752 2493 2494  714 2495 2496  714
+  752 2495 2496  752  753 2497 2498  753 2497 2498  755  756 2499 2500 4481  756
+ 2499 2500  757  758 2501 2502 2504  758 2501 2502  756  758  759 2503 2504  757
+  758  759 2501 2503 2504  714  760 2505 2506  714 2505 2506  758  761 2507 2508
+  761 2507 2508  753 2509 2510  753  761 2509 2510  760  761 2511 2512  760 2511
+ 2512  753  763 2513 2514  753 2513 2514  759  764 2515 2516  764 2515 2516  714
+ 2517 2518  714  764 2517 2518  763  764 2519 2520  763 2519 2520  766 2521 2522
+ 2521 2522 2523 2524  767 2523 2524  766  767 2525 2526 4523  766 2525 2526  753
+  770 2527 2528  770 2527 2528  768  771 2529 2530 2532  768 2529 2530  770  772
+ 2531 2532  768  770  771  772 2529 2531 2532  772  773 2533 2534  773 2533 2534
+ 2535 2536  773 2535 2536  773  775 2537 2538 4519  775 2537 2538  775 2539 2540
+ 2539 2540  767  768 2541 2542 4503  767 2541 2542  776  777 2543 2544  777 2543
+ 2544  778  779 2545 2546 2548  779 2545 2546  777  780 2547 2548  777  778  779
+  780  783 2545 2547 2548  780  781 2549 2550  781 2549 2550 2551 2552  781 2551
+ 2552  781 2553 2554 2553 2554  779  783 2555 2556  783 2555 2556  745 2557 2558
+  745  783 2557 2558  783  784 2559 2560  784 2559 2560  786  787 2561 2562 4543
+  787 2561 2562  788  789 2563 2564 2566  789 2563 2564  787  789  790 2565 2566
+  788  789  790 2563 2565 2566  745  791 2567 2568  745 2567 2568  789  792 2569
+ 2570  792 2569 2570  784 2571 2572  784  792 2571 2572  791  792 2573 2574  791
+ 2573 2574  784  794 2575 2576  784 2575 2576  790  795 2577 2578  795 2577 2578
+  745 2579 2580  745  795 2579 2580  794  795 2581 2582  794 2581 2582  797 2583
+ 2584 2583 2584 2585 2586  798 2585 2586  797  798 2587 2588  797 2587 2588  784
+  801 2589 2590  801 2589 2590  799  802 2591 2592 2594  799 2591 2592  801  803
+ 2593 2594  799  801  802  803 2591 2593 2594  803  804 2595 2596  804 2595 2596
+ 2597 2598  804 2597 2598  804  806 2599 2600  806 2599 2600  806 2601 2602 2601
+ 2602  798  799 2603 2604  798 2603 2604  807  808 2605 2606  808 2605 2606  809
+  810 2607 2608 2610  810 2607 2608  808  811 2609 2610  808  809  810  811  814
+ 2607 2609 2610  811  812 2611 2612  812 2611 2612 2613 2614  812 2613 2614  812
+ 2615 2616 2615 2616  810  814 2617 2618  814 2617 2618  776 2619 2620  776  814
+ 2619 2620  814  815 2621 2622  815 2621 2622  817  818 2623 2624 4605  818 2623
+ 2624  819  820 2625 2626 2628  820 2625 2626  818  820  821 2627 2628  819  820
+  821 2625 2627 2628  776  822 2629 2630  776 2629 2630  820  823 2631 2632  823
+ 2631 2632  815 2633 2634  815  823 2633 2634  822  823 2635 2636  822 2635 2636
+  815  825 2637 2638  815 2637 2638  821  826 2639 2640  826 2639 2640  776 2641
+ 2642  776  826 2641 2642  825  826 2643 2644  825 2643 2644  828 2645 2646 2645
+ 2646 2647 2648  829 2647 2648  828  829 2649 2650  828 2649 2650  815  832 2651
+ 2652  832 2651 2652  830  833 2653 2654 2656  830 2653 2654  832  834 2655 2656
+  830  832  833  834 2653 2655 2656  834  835 2657 2658  835 2657 2658 2659 2660
+  835 2659 2660  835  837 2661 2662  837 2661 2662  837 2663 2664 2663 2664  829
+  830 2665 2666  829 2665 2666  838  839 2667 2668  839 2667 2668  840  841 2669
+ 2670 2672  841 2669 2670  839  842 2671 2672  839  840  841  842  845 2669 2671
+ 2672  842  843 2673 2674  843 2673 2674 2675 2676  843 2675 2676  843 2677 2678
+ 2677 2678  841  845 2679 2680  845 2679 2680  807 2681 2682  807  845 2681 2682
+  845  846 2683 2684  846 2683 2684  848  849 2685 2686  849 2685 2686  850  851
+ 2687 2688 2690  851 2687 2688  849  852 2689 2690  849  850  851  852  854 2687
+ 2689 2690  807  853 2691 2692  807 2691 2692  851  854 2693 2694  854 2693 2694
+  846 2695 2696  846  854 2695 2696  853  854 2697 2698  853 2697 2698  846  856
+ 2699 2700  846 2699 2700  852  857 2701 2702  857 2701 2702  807 2703 2704  807
+  857 2703 2704  856  857 2705 2706  856 2705 2706  859 2707 2708 2707 2708 2709
+ 2710  860 2709 2710  859  860 2711 2712  859 2711 2712  846  863 2713 2714  863
+ 2713 2714  861  864 2715 2716 2718  861 2715 2716  863  865 2717 2718  861  863
+  864  865 2715 2717 2718  865  866 2719 2720  866 2719 2720 2721 2722  866 2721
+ 2722  866  868 2723 2724  868 2723 2724  868 2725 2726 2725 2726  860  861 2727
+ 2728  860 2727 2728  869  870 2729 2730  870 2729 2730  871  872 2731 2732 2734
+  872 2731 2732  870  873 2733 2734  870  871  872  873 2731 2733 2734  873  874
+ 2735 2736  874 2735 2736 2737 2738  874 2737 2738  874 2739 2740 2739 2740  872
+  876 2741 2742  876 2741 2742  838 2743 2744  838  876 2743 2744  876  877 2745
+ 2746  877 2745 2746  879  880 2747 2748  880 2747 2748  881  882 2749 2750 2752
+  882 2749 2750  880  882  883 2751 2752  881  882  883 2749 2751 2752  838  884
+ 2753 2754  838 2753 2754  882  885 2755 2756  885 2755 2756  877 2757 2758  877
+  885 2757 2758  884  885 2759 2760  884 2759 2760  877  887 2761 2762  877 2761
+ 2762  883  888 2763 2764  888 2763 2764  838 2765 2766  838  888 2765 2766  887
+  888 2767 2768  887 2767 2768  890 2769 2770 2769 2770 2771 2772  891 2771 2772
+  890  891 2773 2774  890 2773 2774  877  894 2775 2776  894 2775 2776  892  895
+ 2777 2778 2780  892 2777 2778  894  896 2779 2780  892  894  895  896 2777 2779
+ 2780  896  897 2781 2782  897 2781 2782 2783 2784  897 2783 2784  897  899 2785
+ 2786  899 2785 2786  899 2787 2788 2787 2788  891  892 2789 2790  891 2789 2790
+  900  901 2791 2792  901 2791 2792  902  903 2793 2794 2796  903 2793 2794  901
+  904 2795 2796  901  902  903  904  907 2793 2795 2796  904  905 2797 2798  905
+ 2797 2798 2799 2800  905 2799 2800  905 2801 2802 2801 2802  903  907 2803 2804
+  907 2803 2804  869 2805 2806  869  907 2805 2806  907  908 2807 2808  908 2807
+ 2808  910  911 2809 2810  911 2809 2810  912  913 2811 2812 2814  913 2811 2812
+  911  913  914 2813 2814  912  913  914 2811 2813 2814  869  915 2815 2816  869
+ 2815 2816  913  916 2817 2818  916 2817 2818  908 2819 2820  908  916 2819 2820
+  915  916 2821 2822  915 2821 2822  908  918 2823 2824  908 2823 2824  914  919
+ 2825 2826  919 2825 2826  869 2827 2828  869  919 2827 2828  918  919 2829 2830
+  918 2829 2830  921 2831 2832 2831 2832 2833 2834  922 2833 2834  921  922 2835
+ 2836  921 2835 2836  908  925 2837 2838  925 2837 2838  923  926 2839 2840 2842
+  923 2839 2840  925  927 2841 2842  923  925  926  927 2839 2841 2842  927  928
+ 2843 2844  928 2843 2844 2845 2846  928 2845 2846  928  930 2847 2848  930 2847
+ 2848  930 2849 2850 2849 2850  922  923 2851 2852  922 2851 2852  931  932 2853
+ 2854  932 2853 2854  933  934 2855 2856 2858  934 2855 2856  932  935 2857 2858
+  932  933  934  935 2855 2857 2858  935  936 2859 2860  936 2859 2860 2861 2862
+  936 2861 2862  936 2863 2864 2863 2864  934  938 2865 2866  938 2865 2866  900
+ 2867 2868  900  938 2867 2868  938  939 2869 2870  939 2869 2870  941  942 2871
+ 2872  942 2871 2872  943  944 2873 2874 2876  944 2873 2874  942  944  945 2875
+ 2876  943  944  945 2873 2875 2876  900  946 2877 2878  900 2877 2878  944  947
+ 2879 2880  947 2879 2880  939 2881 2882  939  947 2881 2882  946  947 2883 2884
+  946 2883 2884  939  949 2885 2886  939 2885 2886  945  950 2887 2888  950 2887
+ 2888  900 2889 2890  900  950 2889 2890  949  950 2891 2892  949 2891 2892  952
+ 2893 2894 2893 2894 2895 2896  953 2895 2896  952  953 2897 2898  952 2897 2898
+  939  956 2899 2900  956 2899 2900  954  957 2901 2902 2904  954 2901 2902  956
+  958 2903 2904  954  956  957  958 2901 2903 2904  958  959 2905 2906  959 2905
+ 2906 2907 2908  959 2907 2908  959  961 2909 2910  961 2909 2910  961 2911 2912
+ 2911 2912  953  954 2913 2914  953 2913 2914  962  963 2915 2916 4899  963 2915
+ 2916  964  965 2917 2918 2920  965 2917 2918  963  965  966 2919 2920  964  965
+  966 2917 2919 2920  966  967 2921 2922  967 2921 2922 2923 2924  967 2923 2924
+  967 2925 2926 2925 2926  965  969 2927 2928  969 2927 2928  931 2929 2930  931
+  969 2929 2930  969  970 2931 2932  970 2931 2932  972  973 2933 2934 4915  973
+ 2933 2934  974  975 2935 2936 2938  975 2935 2936  973  975  976 2937 2938  974
+  975  976 2935 2937 2938  931  977 2939 2940  931 2939 2940  975  978 2941 2942
+  978 2941 2942  970 2943 2944  970  978 2943 2944  977  978 2945 2946  977 2945
+ 2946  970  980 2947 2948  970 2947 2948  976  981 2949 2950  981 2949 2950  931
+ 2951 2952  931  981 2951 2952  980  981 2953 2954  980 2953 2954  983 2955 2956
+ 2955 2956 2957 2958  984 2957 2958  983  984 2959 2960  983 2959 2960  970  987
+ 2961 2962  987 2961 2962  985  988 2963 2964 2966  985 2963 2964  987  989 2965
+ 2966  985  987  988  989 2963 2965 2966  989  990 2967 2968  990 2967 2968 2969
+ 2970  990 2969 2970  990  992 2971 2972  992 2971 2972  992 2973 2974 2973 2974
+  984  985 2975 2976  984 2975 2976    1    2 2977 2978    2 2977 2978    2    4
+ 2979 2980    4 2979 2980    5 2981 2982    2    5 2981 2982    5    6  999 2983
+ 2984    6 2983 2984    6 2985 2986    7 2985 2986    7 2987 2988 2987 2988    4
+    8 2989 2990    8 2989 2990    9   10 2991 2992 3038    9 2991 2992   11   12
+ 1011 2993 2994   12 2993 2994   12   14 2995 2996   14 2995 2996   15 2997 2998
+   12   15 2997 2998   14   17 2999 3000   17 2999 3000    9   17 3001 3002    9
+   17   18 3001 3002 3003   16   17   18 3002 3003 3004   16 3003 3004   16 3005
+ 3006 3005 3006   15   20 3007 3008   20 3007 3008   20 3009 3010   20   21 3009
+ 3010 3011   19   20   21 3010 3011 3012   19 3011 3012    9   19 3013 3014    9
+ 3013 3014   23   24 3015 3016   23 3015 3016   23 3017 3018   23   25 3017 3018
+ 3035   22 3019 3020 3019 3020    9   26 3021 3022   26 3021 3022   24   26 3023
+ 3024   24 3023 3024   28 3025 3026   26   28 3025 3026   28   29 3027 3028   29
+ 3027 3028   29 3029 3030   29   30   31 3029 3030   30   31 3031 3032   31 3031
+ 3032   31 3033 3034 3033 3034   22   23   25 3018 3035 3036   22 3035 3036    8
+ 3037 3038    8    9   10 2991 3037 3038   32   33 3039 3040   33 3039 3040   33
+   35 3041 3042   35 3041 3042   36 3043 3044   33   36 3043 3044   36   37 3045
+ 3046   37 3045 3046   37 3047 3048   38 3047 3048   38 3049 3050 3049 3050   35
+   39 3051 3052   39 3051 3052   40   41 3053 3054 3100   40 3053 3054   42   43
+ 3055 3056   43 3055 3056   43   45 3057 3058   45 3057 3058   46 3059 3060   43
+   46 3059 3060   45   48 3061 3062   48 3061 3062   40   48 3063 3064    1   40
+   47   48   49 3063 3064 3065   47   49 3064 3065 3066   47 3065 3066    1   47
+ 3067 3068    1 3067 3068   46   51 3069 3070   51 3069 3070    1   50   51 3071
+ 3072    1   50   52 3071 3072 3073   50   52 3072 3073 3074   50 3073 3074   40
+   50 3075 3076   40 3075 3076   54   55 3077 3078   54 3077 3078   54 3079 3080
+   53   54   56 3079 3080   53 3081 3082 3081 3082   40   57 3083 3084   57 3083
+ 3084   55   57 3085 3086   55 3085 3086   59 3087 3088   57   59 3087 3088   59
+   60 3089 3090   60 3089 3090   60 3091 3092   60   61 3091 3092 3093   60   61
+   62 3092 3093 3094   62 3093 3094   62 3095 3096 3095 3096   53   56 3097 3098
+   53 3097 3098    1   39   40 3099 3100    1   40   41 3053 3099 3100   63   64
+ 3101 3102   64 3101 3102   64   66 3103 3104   66 3103 3104   67 3105 3106   64
+   67 3105 3106   67   68 3107 3108   68 3107 3108   68 3109 3110   69 3109 3110
+   69 3111 3112 3111 3112   66   70 3113 3114   70 3113 3114   71   72 3115 3116
+ 3162   71 3115 3116   73   74 3117 3118   74 3117 3118   74   76 3119 3120   76
+ 3119 3120   77 3121 3122   74   77 3121 3122   76   79 3123 3124   79 3123 3124
+   71   78   79 3125 3126   71   78   80 3125 3126 3127   78   80 3126 3127 3128
+   78 3127 3128   32   78 3129 3130   32 3129 3130   77   82 3131 3132   82 3131
+ 3132   32   82 3133 3134   32   71   81   82   83 3133 3134 3135   81   83 3134
+ 3135 3136   81 3135 3136   71   81 3137 3138   71 3137 3138   85   86 3139 3140
+   85 3139 3140   85 3141 3142   85   87 3141 3142 3159   84 3143 3144 3143 3144
+   71   88 3145 3146   88 3145 3146   86   88 3147 3148   86 3147 3148   90 3149
+ 3150   88   90 3149 3150   90   91 3151 3152   91 3151 3152   91 3153 3154   91
+   92 3153 3154 3155   91   92   93 3154 3155 3156   93 3155 3156   93 3157 3158
+ 3157 3158   84   85   87 3142 3159 3160   84 3159 3160   32   70 3161 3162   32
+   70   71   72 3115 3161 3162   94   95 3163 3164   95 3163 3164   95   97 3165
+ 3166   97 3165 3166   98 3167 3168   95   98 3167 3168   98   99 3169 3170   99
+ 3169 3170   99 3171 3172  100 3171 3172  100 3173 3174 3173 3174   97  101 3175
+ 3176  101 3175 3176  102  103 3177 3178 3224  102 3177 3178  104  105 3179 3180
+  105 3179 3180  105  107 3181 3182  107 3181 3182  108 3183 3184  105  108 3183
+ 3184  107  110 3185 3186  110 3185 3186  102  109  110 3187 3188  102  109  111
+ 3187 3188 3189  109  111 3188 3189 3190  109 3189 3190   63  109 3191 3192   63
+ 3191 3192  108  113 3193 3194  113 3193 3194   63  113 3195 3196   63  102  112
+  113  114 3195 3196 3197  112  114 3196 3197 3198  112 3197 3198  102  112 3199
+ 3200  102 3199 3200  116  117 3201 3202  116 3201 3202  116 3203 3204  116  118
+ 3203 3204 3221  115 3205 3206 3205 3206  102  119 3207 3208  119 3207 3208  117
+  119 3209 3210  117 3209 3210  121 3211 3212  119  121 3211 3212  121  122 3213
+ 3214  122 3213 3214  122 3215 3216  122  123  124 3215 3216  123  124 3217 3218
+  124 3217 3218  124 3219 3220 3219 3220  115  116  118 3204 3221 3222  115 3221
+ 3222   63  101  102 3223 3224   63  102  103 3177 3223 3224  125  126 3225 3226
+  126 3225 3226  126  128 3227 3228  128 3227 3228  129 3229 3230  126  129 3229
+ 3230  129  130 3231 3232  130 3231 3232  130 3233 3234  131 3233 3234  131 3235
+ 3236 3235 3236  128  132 3237 3238  132 3237 3238  133  134 3239 3240 3286  133
+ 3239 3240  135  136 1259 3241 3242  136 3241 3242  136  138 3243 3244  138 3243
+ 3244  139 3245 3246  136  139 3245 3246  138  141 3247 3248  141 3247 3248  133
+  140  141 3249 3250  133  140  142 3249 3250 3251  140  142 3250 3251 3252  140
+ 3251 3252   94  140 3253 3254   94 3253 3254  139  144 3255 3256  144 3255 3256
+   94  143  144 3257 3258   94  143  145 3257 3258 3259  143  145 3258 3259 3260
+  143 3259 3260  133  143 3261 3262  133 3261 3262  147  148 3263 3264  147 3263
+ 3264  147 3265 3266  146  147  149 3265 3266  146 3267 3268 3267 3268  133  150
+ 3269 3270  150 3269 3270  148  150 3271 3272  148 3271 3272  152 3273 3274  150
+  152 3273 3274  152  153 3275 3276  153 3275 3276  153 3277 3278  153  154 3277
+ 3278 3279  153  154  155 3278 3279 3280  155 3279 3280  155 3281 3282 3281 3282
+  146  149 3283 3284  146 3283 3284   94  132  133 3285 3286   94  133  134 3239
+ 3285 3286  156  157 3287 3288  157 3287 3288  157  159 3289 3290  159 3289 3290
+  160 3291 3292  157  160 3291 3292  160  161 3293 3294  161 3293 3294  161 3295
+ 3296  162 3295 3296  162 3297 3298 3297 3298  159  163 3299 3300  163 3299 3300
+  164  165 3301 3302 3348  164 3301 3302  166  167 1321 3303 3304  167 3303 3304
+  167  169 3305 3306  169 3305 3306  170 3307 3308  167  170 3307 3308  169  172
+ 3309 3310  172 3309 3310  164  172 3311 3312  164  171  172  173 3311 3312 3313
+  171  173 3312 3313 3314  171 3313 3314  125  171 3315 3316  125 3315 3316  170
+  175 3317 3318  175 3317 3318  125  174  175 3319 3320  125  174  176 3319 3320
+ 3321  174  176 3320 3321 3322  174 3321 3322  164  174 3323 3324  164 3323 3324
+  178  179 3325 3326  178 3325 3326  178 3327 3328  178  180 3327 3328 3345  177
+ 3329 3330 3329 3330  164  181 3331 3332  181 3331 3332  179  181 3333 3334  179
+ 3333 3334  183 3335 3336  181  183 3335 3336  183  184 3337 3338  184 3337 3338
+  184 3339 3340  184  185 3339 3340 3341  184  185  186 3340 3341 3342  186 3341
+ 3342  186 3343 3344 3343 3344  177  178  180 3328 3345 3346  177 3345 3346  125
+  163  164 3347 3348  125  164  165 3301 3347 3348  187  188 3349 3350  188 3349
+ 3350  188  190 3351 3352  190 3351 3352  191 3353 3354  188  191 3353 3354  191
+  192 3355 3356  192 3355 3356  192 3357 3358  193 3357 3358  193 3359 3360 3359
+ 3360  190  194 3361 3362  194 3361 3362  195  196 3363 3364 3410  195 3363 3364
+  197  198 1383 3365 3366  198 3365 3366  198  200 3367 3368  200 3367 3368  201
+ 3369 3370  198  201 3369 3370  200  203 3371 3372  203 3371 3372  195  202  203
+ 3373 3374  195  202  204 3373 3374 3375  202  204 3374 3375 3376  202 3375 3376
+  156  202 3377 3378  156 3377 3378  201  206 3379 3380  206 3379 3380  156  205
+  206 3381 3382  156  205  207 3381 3382 3383  205  207 3382 3383 3384  205 3383
+ 3384  195  205 3385 3386  195 3385 3386  209  210 3387 3388  209 3387 3388  209
+ 3389 3390  208  209  211 3389 3390  208 3391 3392 3391 3392  195  212 3393 3394
+  212 3393 3394  210  212 1416 3395 3396  210 3395 3396  214 3397 3398  212  214
+ 3397 3398  214  215 3399 3400  215 3399 3400  215 3401 3402  215  216 3401 3402
+ 3403  215  216  217 3402 3403 3404  217 3403 3404  217 3405 3406 3405 3406  208
+  211 3407 3408  208 3407 3408  156  194  195 3409 3410  156  195  196 3363 3409
+ 3410  218  219 3411 3412  219 3411 3412  219  221 3413 3414  221 3413 3414  222
+ 3415 3416  219  222 3415 3416  222  223 3417 3418  223 3417 3418  223 3419 3420
+  224 3419 3420  224 3421 3422 3421 3422  221  225 3423 3424  225 3423 3424  226
+  227 3425 3426 3472  226 3425 3426  228  229 3427 3428  229 3427 3428  229  231
+ 3429 3430  231 3429 3430  232 3431 3432  229  232 3431 3432  231  234 3433 3434
+  234 3433 3434  226  234 3435 3436  187  226  233  234  235 3435 3436 3437  233
+  235 3436 3437 3438  233 3437 3438  187  233 3439 3440  187 3439 3440  232  237
+ 3441 3442  237 3441 3442  187  237 3443 3444  187  226  236  237  238 3443 3444
+ 3445  236  238 3444 3445 3446  236 3445 3446  226  236 3447 3448  226 3447 3448
+  240  241 3449 3450  240 3449 3450  240 3451 3452  240  242 3451 3452 3469  239
+ 3453 3454 3453 3454  226  243 3455 3456  243 3455 3456  241  243 3457 3458  241
+ 3457 3458  245 3459 3460  243  245 3459 3460  245  246 3461 3462  246 3461 3462
+  246 3463 3464  246  247 3463 3464 3465  246  247  248 3464 3465 3466  248 3465
+ 3466  248 3467 3468 3467 3468  239  240  242 3452 3469 3470  239 3469 3470  187
+  225  226 3471 3472  187  226  227 3425 3471 3472  249  250 3473 3474  250 3473
+ 3474  250  252 3475 3476  252 3475 3476  253 3477 3478  250  253 3477 3478  253
+  254 3479 3480  254 3479 3480  254 3481 3482  255 3481 3482  255 3483 3484 3483
+ 3484  252  256 3485 3486  256 3485 3486  257  258 3487 3488 3534  257 3487 3488
+  259  260 3489 3490  260 3489 3490  260  262 3491 3492  262 3491 3492  263 3493
+ 3494  260  263 3493 3494  262  265 3495 3496  265 3495 3496  257  265 3497 3498
+  218  257  264  265  266 3497 3498 3499  264  266 3498 3499 3500  264 3499 3500
+  218  264 3501 3502  218 3501 3502  263  268 3503 3504  268 3503 3504  218  268
+ 3505 3506  218  257  267  268  269 3505 3506 3507  267  269 3506 3507 3508  267
+ 3507 3508  257  267 3509 3510  257 3509 3510  271  272 3511 3512  271 3511 3512
+  271 3513 3514  271  273 3513 3514 3531  270 3515 3516 3515 3516  257  274 3517
+ 3518  274 3517 3518  272  274 3519 3520  272 3519 3520  276 3521 3522  274  276
+ 3521 3522  276  277 3523 3524  277 3523 3524  277 3525 3526  277  278 3525 3526
+ 3527  277  278  279 3526 3527 3528  279 3527 3528  279 3529 3530 3529 3530  270
+  271  273 3514 3531 3532  270 3531 3532  218  256 3533 3534  218  256  257  258
+ 3487 3533 3534  280  281 3535 3536  281 3535 3536  281  283 3537 3538  283 3537
+ 3538  284 3539 3540  281  284 3539 3540  284  285 3541 3542  285 3541 3542  285
+ 3543 3544  286 3543 3544  286 3545 3546 3545 3546  283  287 3547 3548  287 3547
+ 3548  288  289 3549 3550 3596  288 3549 3550  290  291 3551 3552  291 3551 3552
+  291  293 3553 3554  293 3553 3554  294 3555 3556  291  294 3555 3556  293  296
+ 3557 3558  296 3557 3558  288  295  296 3559 3560  288  295  297 3559 3560 3561
+  295  297 3560 3561 3562  295 3561 3562  249  295 3563 3564  249 3563 3564  294
+  299 3565 3566  299 3565 3566  249  298  299 3567 3568  249  298  300 3567 3568
+ 3569  298  300 3568 3569 3570  298 3569 3570  288  298 3571 3572  288 3571 3572
+  302  303 3573 3574  302 3573 3574  302 3575 3576  302  304 3575 3576 3593  301
+ 3577 3578 3577 3578  288  305 3579 3580  305 3579 3580  303  305 3581 3582  303
+ 3581 3582  307 3583 3584  305  307 3583 3584  307  308 3585 3586  308 3585 3586
+  308 3587 3588  308  309 3587 3588 3589  308  309  310 3588 3589 3590  310 3589
+ 3590  310 3591 3592 3591 3592  301  302  304 3576 3593 3594  301 3593 3594  249
+  287  288 3595 3596  249  288  289 3549 3595 3596  311  312 3597 3598  312 3597
+ 3598  312  314 3599 3600  314 3599 3600  315 3601 3602  312  315 3601 3602  315
+  316 3603 3604  316 3603 3604  316 3605 3606  317 3605 3606  317 3607 3608 3607
+ 3608  314  318 3609 3610  318 3609 3610  319  320 3611 3612 3658  319 3611 3612
+  321  322 3613 3614  322 3613 3614  322  324 3615 3616  324 3615 3616  325 3617
+ 3618  322  325 3617 3618  324  327 3619 3620  327 3619 3620  319  326  327 3621
+ 3622  319  326  328 3621 3622 3623  326  328 3622 3623 3624  326 3623 3624  280
+  326 3625 3626  280 3625 3626  325  330 3627 3628  330 3627 3628  280  329  330
+ 3629 3630  280  329  331 3629 3630 3631  329  331 3630 3631 3632  329 3631 3632
+  319  329 3633 3634  319 3633 3634  333  334 3635 3636  333 3635 3636  333 3637
+ 3638  333  335 3637 3638 3655  332 3639 3640 3639 3640  319  336 3641 3642  336
+ 3641 3642  334  336 3643 3644  334 3643 3644  338 3645 3646  336  338 3645 3646
+  338  339 3647 3648  339 3647 3648  339 3649 3650  339  340  341 3649 3650  340
+  341 3651 3652  341 3651 3652  341 3653 3654 3653 3654  332  333  335 3638 3655
+ 3656  332 3655 3656  280  318  319 3657 3658  280  319  320 3611 3657 3658  342
+  343 3659 3660  343 3659 3660  343  345 3661 3662  345 3661 3662  346 3663 3664
+  343  346 3663 3664  346  347 3665 3666  347 3665 3666  347 3667 3668  348 3667
+ 3668  348 3669 3670 3669 3670  345  349 3671 3672  349 3671 3672  350  351 3673
+ 3674 3720  350 3673 3674  352  353 3675 3676  353 3675 3676  353  355 3677 3678
+  355 3677 3678  356 3679 3680  353  356 3679 3680  355  358 3681 3682  358 3681
+ 3682  350  357  358 3683 3684  350  357  359 3683 3684 3685  357  359 3684 3685
+ 3686  357 3685 3686  311  357 3687 3688  311 3687 3688  356  361 3689 3690  361
+ 3689 3690  311  360  361 3691 3692  311  360  362 3691 3692 3693  360  362 3692
+ 3693 3694  360 3693 3694  350  360 3695 3696  350 3695 3696  364  365 3697 3698
+  364 3697 3698  364 3699 3700  364  366 3699 3700 3717  363 3701 3702 3701 3702
+  350  367 3703 3704  367 3703 3704  365  367 3705 3706  365 3705 3706  369 3707
+ 3708  367  369 3707 3708  369  370 3709 3710  370 3709 3710  370 3711 3712  370
+  371 3711 3712 3713  370  371  372 3712 3713 3714  372 3713 3714  372 3715 3716
+ 3715 3716  363  364  366 3700 3717 3718  363 3717 3718  311  349  350 3719 3720
+  311  350  351 3673 3719 3720  373  374 3721 3722  374 3721 3722  374  376 3723
+ 3724  376 3723 3724  377 3725 3726  374  377 3725 3726  377  378 3727 3728  378
+ 3727 3728  378 3729 3730  379 3729 3730  379 3731 3732 3731 3732  376  380 3733
+ 3734  380 3733 3734  381  382 3735 3736 3782  381 3735 3736  383  384 3737 3738
+  384 3737 3738  384  386 3739 3740  386 3739 3740  387 3741 3742  384  387 3741
+ 3742  386  389 3743 3744  389 3743 3744  381  388  389 3745 3746  381  388  390
+ 3745 3746 3747  388  390 3746 3747 3748  388 3747 3748  342  388 3749 3750  342
+ 3749 3750  387  392 3751 3752  392 3751 3752  342  392 3753 3754  342  391  392
+  393 3753 3754 3755  391  393 3754 3755 3756  391 3755 3756  381  391 3757 3758
+  381 3757 3758  395  396 3759 3760  395 3759 3760  395 3761 3762  395  397 3761
+ 3762 3779  394 3763 3764 3763 3764  381  398 3765 3766  398 3765 3766  396  398
+ 3767 3768  396 3767 3768  400 3769 3770  398  400 3769 3770  400  401 3771 3772
+  401 3771 3772  401 3773 3774  401  402 3773 3774 3775  401  402  403 3774 3775
+ 3776  403 3775 3776  403 3777 3778 3777 3778  394  395  397 3762 3779 3780  394
+ 3779 3780  342  380 3781 3782  342  380  381  382 3735 3781 3782  404  405 3783
+ 3784  405 3783 3784  405  407 3785 3786  407 3785 3786  408 3787 3788  405  408
+ 3787 3788  408  409 3789 3790  409 3789 3790  409 3791 3792  410 3791 3792  410
+ 3793 3794 3793 3794  407  411 3795 3796  411 3795 3796  412  413 3797 3798 3844
+  412 3797 3798  414  415 3799 3800  415 3799 3800  415  417 3801 3802  417 3801
+ 3802  418 3803 3804  415  418 3803 3804  417  420 3805 3806  420 3805 3806  412
+  419  420 3807 3808  412  419  421 3807 3808 3809  419  421 3808 3809 3810  419
+ 3809 3810  373  419 3811 3812  373 3811 3812  418  423 3813 3814  423 3813 3814
+  373  423 3815 3816  373  412  422  423  424 3815 3816 3817  422  424 3816 3817
+ 3818  422 3817 3818  412  422 3819 3820  412 3819 3820  426  427 3821 3822  426
+ 3821 3822  426 3823 3824  426  428 3823 3824 3841  425 3825 3826 3825 3826  412
+  429 3827 3828  429 3827 3828  427  429 3829 3830  427 3829 3830  431 3831 3832
+  429  431 3831 3832  431  432 3833 3834  432 3833 3834  432 3835 3836  432  433
+ 3835 3836 3837  432  433  434 3836 3837 3838  434 3837 3838  434 3839 3840 3839
+ 3840  425  426  428 3824 3841 3842  425 3841 3842  373  411  412 3843 3844  373
+  412  413 3797 3843 3844  435  436 3845 3846  436 3845 3846  436  438 3847 3848
+  438 3847 3848  439 3849 3850  436  439 3849 3850  439  440 3851 3852  440 3851
+ 3852  440 3853 3854  441 3853 3854  441 3855 3856 3855 3856  438  442 3857 3858
+  442 3857 3858  443  444 3859 3860 3906  443 3859 3860  445  446 3861 3862  446
+ 3861 3862  446  448 3863 3864  448 3863 3864  449 3865 3866  446  449 3865 3866
+  448  451 3867 3868  451 3867 3868  443  451 3869 3870  404  443  450  451  452
+ 3869 3870 3871  450  452 3870 3871 3872  450 3871 3872  404  450 3873 3874  404
+ 3873 3874  449  454 3875 3876  454 3875 3876  404  453  454 3877 3878  404  453
+  455 3877 3878 3879  453  455 3878 3879 3880  453 3879 3880  443  453 3881 3882
+  443 3881 3882  457  458 3883 3884  457 3883 3884  457 3885 3886  457  459 3885
+ 3886 3903  456 3887 3888 3887 3888  443  460 3889 3890  460 3889 3890  458  460
+ 3891 3892  458 3891 3892  462 3893 3894  460  462 3893 3894  462  463 3895 3896
+  463 3895 3896  463 3897 3898  463  464  465 3897 3898  464  465 3899 3900  465
+ 3899 3900  465 3901 3902 3901 3902  456  457  459 3886 3903 3904  456 3903 3904
+  404  442  443 3905 3906  404  443  444 3859 3905 3906  466  467 3907 3908  467
+ 3907 3908  467  469 3909 3910  469 3909 3910  470 3911 3912  467  470 3911 3912
+  470  471 3913 3914  471 3913 3914  471 3915 3916  472 3915 3916  472 3917 3918
+ 3917 3918  469  473 3919 3920  473 3919 3920  474  475 3921 3922 3968  474 3921
+ 3922  476  477 3923 3924  477 3923 3924  477  479 3925 3926  479 3925 3926  480
+ 3927 3928  477  480 3927 3928  479  482 3929 3930  482 3929 3930  474  482 3931
+ 3932  474  481  482  483 3931 3932 3933  481  483 3932 3933 3934  481 3933 3934
+  435  481 3935 3936  435 3935 3936  480  485 3937 3938  485 3937 3938  435  485
+ 3939 3940  435  474  484  485  486 3939 3940 3941  484  486 3940 3941 3942  484
+ 3941 3942  474  484 3943 3944  474 3943 3944  488  489 3945 3946  488 3945 3946
+  488 3947 3948  487  488  490 3947 3948  487 3949 3950 3949 3950  474  491 3951
+ 3952  491 3951 3952  489  491 1974 3953 3954  489 3953 3954  493 3955 3956  491
+  493 3955 3956  493  494 3957 3958  494 3957 3958  494 3959 3960  494  495 3959
+ 3960 3961  494  495  496 3960 3961 3962  496 3961 3962  496 3963 3964 3963 3964
+  487  490 3965 3966  487 3965 3966  435  473  474 3967 3968  435  474  475 3921
+ 3967 3968  497  498 3969 3970  498 3969 3970  498  500 3971 3972  500 3971 3972
+  501 3973 3974  498  501 3973 3974  501  502 3975 3976  502 3975 3976  502 3977
+ 3978  503 3977 3978  503 3979 3980 3979 3980  500  504 3981 3982  504 3981 3982
+  505  506 3983 3984 4030  505 3983 3984  507  508 3985 3986  508 3985 3986  508
+  510 3987 3988  510 3987 3988  511 3989 3990  508  511 3989 3990  510  513 3991
+ 3992  513 3991 3992  505  512  513 3993 3994  505  512  514 3993 3994 3995  512
+  514 3994 3995 3996  512 3995 3996  466  512 3997 3998  466 3997 3998  511  516
+ 3999 4000  516 3999 4000  466  516 4001 4002  466  515  516  517 4001 4002 4003
+  515  517 4002 4003 4004  515 4003 4004  505  515 4005 4006  505 4005 4006  519
+  520 4007 4008  519 4007 4008  519 4009 4010  519  521 4009 4010 4027  518 4011
+ 4012 4011 4012  505  522 4013 4014  522 4013 4014  520  522 4015 4016  520 4015
+ 4016  524 4017 4018  522  524 4017 4018  524  525 4019 4020  525 4019 4020  525
+ 4021 4022  525  526 4021 4022 4023  525  526  527 4022 4023 4024  527 4023 4024
+  527 4025 4026 4025 4026  518  519  521 4010 4027 4028  518 4027 4028  466  504
+ 4029 4030  466  504  505  506 3983 4029 4030  528  529 4031 4032  529 4031 4032
+  529  531 4033 4034  531 4033 4034  532 4035 4036  529  532 4035 4036  532  533
+ 4037 4038  533 4037 4038  533 4039 4040  534 4039 4040  534 4041 4042 4041 4042
+  531  535 4043 4044  535 4043 4044  536  537 4045 4046 4092  536 4045 4046  538
+  539 4047 4048  539 4047 4048  539  541 4049 4050  541 4049 4050  542 4051 4052
+  539  542 4051 4052  541  544 4053 4054  544 4053 4054  536  544 4055 4056  536
+  543  544  545 4055 4056 4057  543  545 4056 4057 4058  543 4057 4058  497  543
+ 4059 4060  497 4059 4060  542  547 4061 4062  547 4061 4062  497  547 4063 4064
+  497  546  547  548 4063 4064 4065  546  548 4064 4065 4066  546 4065 4066  536
+  546 4067 4068  536 4067 4068  550  551 4069 4070  550 4069 4070  550 4071 4072
+  550  552 4071 4072 4089  549 4073 4074 4073 4074  536  553 4075 4076  553 4075
+ 4076  551  553 4077 4078  551 4077 4078  555 4079 4080  553  555 4079 4080  555
+  556 4081 4082  556 4081 4082  556 4083 4084  556  557 4083 4084 4085  556  557
+  558 4084 4085 4086  558 4085 4086  558 4087 4088 4087 4088  549  550  552 4072
+ 4089 4090  549 4089 4090  497  535 4091 4092  497  535  536  537 4045 4091 4092
+  559  560 4093 4094  560 4093 4094  560  562 4095 4096  562 4095 4096  563 4097
+ 4098  560  563 4097 4098  563  564 4099 4100  564 4099 4100  564 4101 4102  565
+ 4101 4102  565 4103 4104 4103 4104  562  566 4105 4106  566 4105 4106  567  568
+ 4107 4108 4154  567 4107 4108  569  570 4109 4110  570 4109 4110  570  572 4111
+ 4112  572 4111 4112  573 4113 4114  570  573 4113 4114  572  575 4115 4116  575
+ 4115 4116  567  574  575 4117 4118  567  574  576 4117 4118 4119  574  576 4118
+ 4119 4120  574 4119 4120  528  574 4121 4122  528 4121 4122  573  578 4123 4124
+  578 4123 4124  528  577  578 4125 4126  528  577  579 4125 4126 4127  577  579
+ 4126 4127 4128  577 4127 4128  567  577 4129 4130  567 4129 4130  581  582 4131
+ 4132  581 4131 4132  581 4133 4134  581  583 4133 4134 4151  580 4135 4136 4135
+ 4136  567  584 4137 4138  584 4137 4138  582  584 4139 4140  582 4139 4140  586
+ 4141 4142  584  586 4141 4142  586  587 4143 4144  587 4143 4144  587 4145 4146
+  587  588 4145 4146 4147  587  588  589 4146 4147 4148  589 4147 4148  589 4149
+ 4150 4149 4150  580  581  583 4134 4151 4152  580 4151 4152  528  566  567 4153
+ 4154  528  567  568 4107 4153 4154  590  591 4155 4156  591 4155 4156  591  593
+ 4157 4158  593 4157 4158  594 4159 4160  591  594 4159 4160  594  595 4161 4162
+  595 4161 4162  595 4163 4164  596 4163 4164  596 4165 4166 4165 4166  593  597
+ 4167 4168  597 4167 4168  598  599 4169 4170 4216  598 4169 4170  600  601 4171
+ 4172  601 4171 4172  601  603 4173 4174  603 4173 4174  604 4175 4176  601  604
+ 4175 4176  603  606 4177 4178  606 4177 4178  598  606 4179 4180  559  598  605
+  606  607 4179 4180 4181  605  607 4180 4181 4182  605 4181 4182  559  605 4183
+ 4184  559 4183 4184  604  609 4185 4186  609 4185 4186  559  609 4187 4188  559
+  608  609  610 4187 4188 4189  608  610 4188 4189 4190  608 4189 4190  598  608
+ 4191 4192  598 4191 4192  612  613 4193 4194  612 4193 4194  612  614 4195 4196
+  614 4195 4196  611 4197 4198 4197 4198  598  615 4199 4200  615 4199 4200  613
+  615 4201 4202  613 4201 4202  617 4203 4204  615  617 4203 4204  617  618 4205
+ 4206  618 4205 4206  618  619 4207 4208  619 4207 4208  618  619  620 4209 4210
+  620 4209 4210  620 4211 4212 4211 4212  611  612  614 4213 4214  611 4213 4214
+  559  597 4215 4216  559  597  598  599 4169 4215 4216  621  622 4217 4218  622
+ 4217 4218  622  624 4219 4220  624 4219 4220  625 4221 4222  622  625 4221 4222
+  625  626 4223 4224  626 4223 4224  626 4225 4226  627 4225 4226  627 4227 4228
+ 4227 4228  624  628 4229 4230  628 4229 4230  629  630 4231 4232 4278  629 4231
+ 4232  631  632 2251 4233 4234  632 4233 4234  632  634 4235 4236  634 4235 4236
+  635 4237 4238  632  635 4237 4238  634  637 4239 4240  637 4239 4240  629  636
+  637 4241 4242  629  636  638 4241 4242  636  638 4243 4244  636 4243 4244  590
+  636 4245 4246  590 4245 4246  635  640 4247 4248  640 4247 4248  590  640 4249
+ 4250  590  639  640  641 4249 4250  639  641 4251 4252  639 4251 4252  629  639
+ 4253 4254  629 4253 4254  643  644 4255 4256  643 4255 4256  643 4257 4258  642
+  643  645 2277 4257 4258  642 4259 4260 4259 4260  629  646 4261 4262  646 4261
+ 4262  644  646 4263 4264  644 4263 4264  648 4265 4266  646  648 4265 4266  648
+  649 4267 4268  649 4267 4268  649  650 4269 4270  650 4269 4270  649  650  651
+ 2289 4271 4272  651 4271 4272  651 4273 4274 4273 4274  642  645 4275 4276  642
+ 4275 4276  590  628 4277 4278  590  628  629  630 4231 4277 4278  652  653 4279
+ 4280  653 4279 4280  653  655 4281 4282  655 4281 4282  656 4283 4284  653  656
+ 4283 4284  656  657 4285 4286  657 4285 4286  657 4287 4288  658 4287 4288  658
+ 4289 4290 4289 4290  655  659 4291 4292  659 4291 4292  660  661 4293 4294 4340
+  660 4293 4294  662  663 2313 4295 4296  663 4295 4296  663  665 4297 4298  665
+ 4297 4298  666 4299 4300  663  666 4299 4300  665  668 4301 4302  668 4301 4302
+  660  667  668 4303 4304  660  667  669 4303 4304  667  669 4305 4306  667 4305
+ 4306  621  667 4307 4308  621 4307 4308  666  671 4309 4310  671 4309 4310  621
+  671 4311 4312  621  670  671  672 4311 4312  670  672 4313 4314  670 4313 4314
+  660  670 4315 4316  660 4315 4316  674  675 4317 4318  674 4317 4318  674  676
+ 4319 4320  676 4319 4320  673 4321 4322 4321 4322  660  677 4323 4324  677 4323
+ 4324  675  677 4325 4326  675 4325 4326  679 4327 4328  677  679 4327 4328  679
+  680 4329 4330  680 4329 4330  680  681 4331 4332  681 4331 4332  680  681  682
+ 4333 4334  682 4333 4334  682 4335 4336 4335 4336  673  674  676 4337 4338  673
+ 4337 4338  621  659  660 4339 4340  621  660  661 4293 4339 4340  683  684 4341
+ 4342  684 4341 4342  684  686 4343 4344  686 4343 4344  687 4345 4346  684  687
+ 4345 4346  687  688 2363 4347 4348  688 4347 4348  688 4349 4350  689 4349 4350
+  689 4351 4352 4351 4352  686  690 4353 4354  690 4353 4354  691  692 4355 4356
+ 4402  691 4355 4356  693  694 2375 4357 4358  694 4357 4358  694  696 4359 4360
+  696 4359 4360  697 4361 4362  694  697 4361 4362  696  699 4363 4364  699 4363
+ 4364  691  699 4365 4366  691  698  699  700 4365 4366  698  700 4367 4368  698
+ 4367 4368  652  698 4369 4370  652 4369 4370  697  702 4371 4372  702 4371 4372
+  652  702 4373 4374  652  701  702  703 4373 4374  701  703 4375 4376  701 4375
+ 4376  691  701 4377 4378  691 4377 4378  705  706 4379 4380  705 4379 4380  705
+  707 4381 4382  707 4381 4382  704 4383 4384 4383 4384  691  708 4385 4386  708
+ 4385 4386  706  708 4387 4388  706 4387 4388  710 4389 4390  708  710 4389 4390
+  710  711 4391 4392  711 4391 4392  711 4393 4394  711  712  713 2413 4393 4394
+  712  713 4395 4396  713 4395 4396  713 4397 4398 4397 4398  704  705  707 2401
+ 4399 4400  704 4399 4400  652  690  691 4401 4402  652  691  692 4355 4401 4402
+  714  715 4403 4404  715 4403 4404  715  717 4405 4406  717 4405 4406  718 4407
+ 4408  715  718 4407 4408  718  719 4409 4410  719 4409 4410  719 4411 4412  720
+ 4411 4412  720 4413 4414 4413 4414  717  721 4415 4416  721 4415 4416  722  723
+ 4417 4418 4464  722 4417 4418  724  725 2437 4419 4420  725 4419 4420  725  727
+ 4421 4422  727 4421 4422  728 4423 4424  725  728 4423 4424  727  730 4425 4426
+  730 4425 4426  722  729  730 4427 4428  722  729  731 4427 4428  729  731 4429
+ 4430  729 4429 4430  683  729 4431 4432  683 4431 4432  728  733 4433 4434  733
+ 4433 4434  683  733 4435 4436  683  732  733  734 4435 4436  732  734 4437 4438
+  732 4437 4438  722  732 4439 4440  722 4439 4440  736  737 4441 4442  736 4441
+ 4442  736  738 4443 4444  738 4443 4444  735 4445 4446 4445 4446  722  739 4447
+ 4448  739 4447 4448  737  739 4449 4450  737 4449 4450  741 4451 4452  739  741
+ 4451 4452  741  742 4453 4454  742 4453 4454  742  743 4455 4456  743 4455 4456
+  742  743  744 4457 4458  744 4457 4458  744 4459 4460 4459 4460  735  736  738
+ 4461 4462  735 4461 4462  683  721 4463 4464  683  721  722  723 4417 4463 4464
+  745  746 4465 4466  746 4465 4466  746  748 4467 4468  748 4467 4468  749 4469
+ 4470  746  749 4469 4470  749  750 4471 4472  750 4471 4472  750 4473 4474  751
+ 4473 4474  751 4475 4476 4475 4476  748  752 4477 4478  752 4477 4478  753  754
+ 4479 4480 4526  753 4479 4480  755  756 2499 4481 4482  756 4481 4482  756  758
+ 4483 4484  758 4483 4484  759 4485 4486  756  759 4485 4486  758  761 4487 4488
+  761 4487 4488  753  761 4489 4490  753  760  761  762 4489 4490  760  762 4491
+ 4492  760 4491 4492  714  760 4493 4494  714 4493 4494  759  764 4495 4496  764
+ 4495 4496  714  763  764 4497 4498  714  763  765 4497 4498 4499  763  765 4498
+ 4499 4500  763 4499 4500  753  763 4501 4502  753 4501 4502  767  768 2541 4503
+ 4504  767 4503 4504  767  769 4505 4506  769 4505 4506  766 4507 4508 4507 4508
+  753  770 4509 4510  770 4509 4510  768  770 4511 4512  768 4511 4512  772 4513
+ 4514  770  772 4513 4514  772  773 4515 4516  773 4515 4516  773  774 4517 4518
+  774 4517 4518  773  774  775 2537 4519 4520  775 4519 4520  775 4521 4522 4521
+ 4522  766  767  769 2525 4523 4524  766 4523 4524  714  752  753 4525 4526  714
+  753  754 4479 4525 4526  776  777 4527 4528  777 4527 4528  777  779 4529 4530
+  779 4529 4530  780 4531 4532  777  780 4531 4532  780  781 4533 4534  781 4533
+ 4534  781 4535 4536  782 4535 4536  782 4537 4538 4537 4538  779  783 4539 4540
+  783 4539 4540  784  785 4541 4542 4588  784 4541 4542  786  787 2561 4543 4544
+  787 4543 4544  787  789 4545 4546  789 4545 4546  790 4547 4548  787  790 4547
+ 4548  789  792 4549 4550  792 4549 4550  784  792 4551 4552  745  784  791  792
+  793 4551 4552  791  793 4553 4554  791 4553 4554  745  791 4555 4556  745 4555
+ 4556  790  795 4557 4558  795 4557 4558  745  795 4559 4560  745  784  794  795
+  796 4559 4560  794  796 4561 4562  794 4561 4562  784  794 4563 4564  784 4563
+ 4564  798  799 4565 4566  798 4565 4566  797  798 4567 4568  797  800 4567 4568
+  797 4569 4570 4569 4570  784  801 4571 4572  801 4571 4572  799  801 4573 4574
+  799 4573 4574  803 4575 4576  801  803 4575 4576  803  804 4577 4578  804 4577
+ 4578  804  805 4579 4580  805 4579 4580  805  806 4581 4582  806 4581 4582  806
+ 4583 4584 4583 4584  797  800 4585 4586  797 4585 4586  745  783 4587 4588  745
+  783  784  785 4541 4587 4588  807  808 4589 4590  808 4589 4590  808  810 4591
+ 4592  810 4591 4592  811 4593 4594  808  811 4593 4594  811  812 4595 4596  812
+ 4595 4596  812 4597 4598  813 4597 4598  813 4599 4600 4599 4600  810  814 4601
+ 4602  814 4601 4602  815  816 4603 4604 4650  815 4603 4604  817  818 2623 4605
+ 4606  818 4605 4606  818  820 4607 4608  820 4607 4608  821 4609 4610  818  821
+ 4609 4610  820  823 4611 4612  823 4611 4612  815  823 4613 4614  776  815  822
+  823  824 4613 4614 4615  822  824 4614 4615 4616  822 4615 4616  776  822 4617
+ 4618  776 4617 4618  821  826 4619 4620  826 4619 4620  776  826 4621 4622  776
+  815  825  826  827 4621 4622 4623  825  827 4622 4623 4624  825 4623 4624  815
+  825 4625 4626  815 4625 4626  829  830 4627 4628  829 4627 4628  829  831 4629
+ 4630  831 4629 4630  828 4631 4632 4631 4632  815  832 4633 4634  832 4633 4634
+  830  832 4635 4636  830 4635 4636  834 4637 4638  832  834 4637 4638  834  835
+ 4639 4640  835 4639 4640  835  836 4641 4642  836 4641 4642  836  837 4643 4644
+  837 4643 4644  837 4645 4646 4645 4646  828  831 4647 4648  828 4647 4648  776
+  814 4649 4650  776  814  815  816 4603 4649 4650  838  839 4651 4652  839 4651
+ 4652  839  841 4653 4654  841 4653 4654  842 4655 4656  839  842 4655 4656  842
+  843 4657 4658  843 4657 4658  843 4659 4660  844 4659 4660  844 4661 4662 4661
+ 4662  841  845 4663 4664  845 4663 4664  846  847 4665 4666  846 4665 4666  848
+  849 4667 4668  849 4667 4668  849  851 4669 4670  851 4669 4670  852 4671 4672
+  849  852 4671 4672  851  854 4673 4674  854 4673 4674  846  854 4675 4676  807
+  846  853  854  855 4675 4676 4677  853  855 4676 4677 4678  853 4677 4678  807
+  853 4679 4680  807 4679 4680  852  857 4681 4682  857 4681 4682  807  857 4683
+ 4684  807  846  856  857  858 4683 4684  856  858 4685 4686  856 4685 4686  846
+  856 4687 4688  846 4687 4688  860  861 4689 4690  860 4689 4690  860  862 4691
+ 4692  862 4691 4692  859 4693 4694 4693 4694  846  863 4695 4696  863 4695 4696
+  861  863 4697 4698  861 4697 4698  865 4699 4700  863  865 4699 4700  865  866
+ 4701 4702  866 4701 4702  866  868 4703 4704  867  868 4703 4704  867  868 4705
+ 4706  868 4705 4706  868 4707 4708 4707 4708  859  862 4709 4710  859 4709 4710
+  807  845 4711 4712  807  845  846  847 4711 4712  869  870 4713 4714  870 4713
+ 4714  870  872 4715 4716  872 4715 4716  873 4717 4718  870  873 4717 4718  873
+  874 4719 4720  874 4719 4720  874 4721 4722  875 4721 4722  875 4723 4724 4723
+ 4724  872  876 4725 4726  876 4725 4726  877  878 4727 4728 4774  877 4727 4728
+  879  880 4729 4730  880 4729 4730  880  882 4731 4732  882 4731 4732  883 4733
+ 4734  880  883 4733 4734  882  885 4735 4736  885 4735 4736  877  885 4737 4738
+  838  877  884  885  886 4737 4738 4739  884  886 4738 4739 4740  884 4739 4740
+  838  884 4741 4742  838 4741 4742  883  888 4743 4744  888 4743 4744  838  888
+ 4745 4746  838  887  888  889 4745 4746 4747  887  889 4746 4747 4748  887 4747
+ 4748  877  887 4749 4750  877 4749 4750  891  892 4751 4752  891 4751 4752  891
+  893 4753 4754  893 4753 4754  890 4755 4756 4755 4756  877  894 4757 4758  894
+ 4757 4758  892  894 4759 4760  892 4759 4760  896 4761 4762  894  896 4761 4762
+  896  897 4763 4764  897 4763 4764  897  898 4765 4766  898 4765 4766  898  899
+ 4767 4768  899 4767 4768  899 4769 4770 4769 4770  890  893 4771 4772  890 4771
+ 4772  838  876 4773 4774  838  876  877  878 4727 4773 4774  900  901 4775 4776
+  901 4775 4776  901  903 4777 4778  903 4777 4778  904 4779 4780  901  904 4779
+ 4780  904  905 4781 4782  905 4781 4782  905 4783 4784  906 4783 4784  906 4785
+ 4786 4785 4786  903  907 4787 4788  907 4787 4788  908  909 4789 4790  908 4789
+ 4790  910  911 4791 4792  911 4791 4792  911  913 4793 4794  913 4793 4794  914
+ 4795 4796  911  914 4795 4796  913  916 4797 4798  916 4797 4798  908  916 4799
+ 4800  869  908  915  916  917 4799 4800 4801  915  917 4800 4801 4802  915 4801
+ 4802  869  915 4803 4804  869 4803 4804  914  919 4805 4806  919 4805 4806  869
+  919 4807 4808  869  908  918  919  920 4807 4808  918  920 4809 4810  918 4809
+ 4810  908  918 4811 4812  908 4811 4812  922  923 4813 4814  922 4813 4814  922
+  924 4815 4816  924 4815 4816  921 4817 4818 4817 4818  908  925 4819 4820  925
+ 4819 4820  923  925 4821 4822  923 4821 4822  927 4823 4824  925  927 4823 4824
+  927  928 4825 4826  928 4825 4826  928  930 4827 4828  929  930 4827 4828  929
+  930 4829 4830  930 4829 4830  930 4831 4832 4831 4832  921  924 4833 4834  921
+ 4833 4834  869  907  908 4835 4836  869  908  909 4835 4836  931  932 4837 4838
+  932 4837 4838  932  934 4839 4840  934 4839 4840  935 4841 4842  932  935 4841
+ 4842  935  936 4843 4844  936 4843 4844  936 4845 4846  937 4845 4846  937 4847
+ 4848 4847 4848  934  938 4849 4850  938 4849 4850  939  940 4851 4852  939 4851
+ 4852  941  942 4853 4854  942 4853 4854  942  944 4855 4856  944 4855 4856  945
+ 4857 4858  942  945 4857 4858  944  947 4859 4860  947 4859 4860  939  947 4861
+ 4862  900  939  946  947  948 4861 4862 4863  946  948 4862 4863 4864  946 4863
+ 4864  900  946 4865 4866  900 4865 4866  945  950 4867 4868  950 4867 4868  900
+  950 4869 4870  900  939  949  950  951 4869 4870  949  951 4871 4872  949 4871
+ 4872  939  949 4873 4874  939 4873 4874  953  954 4875 4876  953 4875 4876  953
+  955 4877 4878  955 4877 4878  952 4879 4880 4879 4880  939  956 4881 4882  956
+ 4881 4882  954  956 4883 4884  954 4883 4884  958 4885 4886  956  958 4885 4886
+  958  959 4887 4888  959 4887 4888  959  961 4889 4890  960  961 4889 4890  960
+  961 4891 4892  961 4891 4892  961 4893 4894 4893 4894  952  955 4895 4896  952
+ 4895 4896  900  938  939 4897 4898  900  939  940 4897 4898  962  963 2915 4899
+ 4900  963 4899 4900  963  965 4901 4902  965 4901 4902  966 4903 4904  963  966
+ 4903 4904  966  967 4905 4906  967 4905 4906  967 4907 4908  968 4907 4908  968
+ 4909 4910 4909 4910  965  969 4911 4912  969 4911 4912  970  971 4913 4914 4960
+  970 4913 4914  972  973 2933 4915 4916  973 4915 4916  973  975 4917 4918  975
+ 4917 4918  976 4919 4920  973  976 4919 4920  975  978 4921 4922  978 4921 4922
+  970  977  978 4923 4924  970  977  979 4923 4924  977  979 4925 4926  977 4925
+ 4926  931  977 4927 4928  931 4927 4928  976  981 4929 4930  981 4929 4930  931
+  980  981 4931 4932  931  980  982 4931 4932  980  982 4933 4934  980 4933 4934
+  970  980 4935 4936  970 4935 4936  984  985 4937 4938  984 4937 4938  983  984
+ 4939 4940  983  986 4939 4940  983 4941 4942 4941 4942  970  987 4943 4944  987
+ 4943 4944  985  987 4945 4946  985 4945 4946  989 4947 4948  987  989 4947 4948
+  989  990 4949 4950  990 4949 4950  990  991 4951 4952  991 4951 4952  991  992
+ 4953 4954  992 4953 4954  992 4955 4956 4955 4956  983  986 4957 4958  983 4957
+ 4958  931  969  970 4959 4960  931  970  971 4913 4959 4960
+  0.320886418015886993E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435047786080177985E-03 -0.435050661596638019E-03 -0.435015088300818990E-03
+ -0.435084759117159022E-03 -0.435058330687056996E-03 -0.435040431898216988E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.138286756523262001E-03
+ -0.140229438392076996E-03 -0.138900356244381991E-03 -0.139615838670957006E-03
+ -0.138288251646271991E-03 -0.140227943269067006E-03  0.000000000000000000E+00
+  0.419834067892811968E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423305217256013E-03 -0.161424187158655996E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435071410144034006E-03 -0.435067418468867008E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255994304135462002E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434980740464878987E-03 -0.435120576273734998E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286719937541003E-03
+ -0.140229474977798997E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253690283189994988E-01 -0.434980401777350985E-03 -0.435120927891995981E-03
+ -0.666666666666666970E-02 -0.140229499401465997E-03 -0.138286695513873000E-03
+ -0.166666666666667011E-01  0.319795589003930014E-01 -0.161423181596791996E-03
+ -0.161424310779120013E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435063425289485017E-03 -0.435075493914430017E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.320997920666244024E-01  0.000000000000000000E+00
+ -0.161423317051876997E-03 -0.161424175324035012E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435064354014662996E-03 -0.435074543616996011E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.292675967555632993E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423746720073988E-03
+ -0.161423745655837994E-03 -0.161424013317901000E-03 -0.161423479058011009E-03
+ -0.161423479016896003E-03 -0.161424013359016006E-03 -0.666666666666666970E-02
+ -0.434770925530010013E-03 -0.435374516178396991E-03 -0.434923678631289005E-03
+ -0.435218350961854000E-03 -0.434767192010144024E-03 -0.435378332843076009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.420929664672890988E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014954239530986E-03
+ -0.435084898922045006E-03 -0.666666666666666970E-02 -0.138286666946142001E-03
+ -0.140229527969196996E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.255503077544010984E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423360613601998E-03 -0.161424131762310011E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435068344066269984E-03
+ -0.435070463673477021E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252823531939237005E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423118685809013E-03 -0.161424373690102996E-03 -0.666666666666666970E-02
+ -0.139258124485676013E-03 -0.139258070429663011E-03 -0.166666666666667011E-01
+  0.255489774258116989E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423314542717992E-03 -0.161424177833194993E-03 -0.434708406678657993E-03
+ -0.435438495815256975E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322022752306447974E-01
+ -0.435029928828667000E-03 -0.435069283946130990E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138272576530863001E-03 -0.140243618384475996E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.258515188587684006E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435046874781375994E-03
+ -0.435051611957959993E-03 -0.138907408932722987E-03 -0.139608785982616010E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320997478880679990E-01 -0.161423315884339007E-03 -0.161424176491573002E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435068970788089987E-03
+ -0.435069822829565989E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255013626659516997E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161778512187199996E-03 -0.161068980188712013E-03
+ -0.166666666666667011E-01 -0.430721544738902977E-03 -0.439524019069752994E-03
+  0.000000000000000000E+00  0.319967450902398018E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161423540386620003E-03
+ -0.161423951989293009E-03 -0.436779536615916999E-03 -0.433396525451669977E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257464062516243007E-01  0.000000000000000000E+00
+ -0.357123043190452993E-03 -0.513335165827105049E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138849239149727987E-03
+ -0.139666955765612013E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.421600788316065025E-01  0.000000000000000000E+00 -0.438009740604286991E-03
+ -0.432217430813423007E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.110342284848130999E-03 -0.562588969540806947E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.250247725514408000E-01
+  0.000000000000000000E+00  0.312521876302675970E-04 -0.718933610019259007E-03
+ -0.666666666666666970E-02 -0.143947962448449994E-03 -0.134568232466890006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.321784029946046990E-01
+  0.000000000000000000E+00 -0.131393250908101988E-03 -0.556288171480889002E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.344954211373242007E-03
+ -0.526364720145119946E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252170876990303987E-01 -0.161423598610935996E-03 -0.161423893764976013E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435940683599852022E-03
+ -0.434217221119004976E-03 -0.166666666666667011E-01  0.320886759388818998E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435055736276135005E-03
+ -0.435042919782540022E-03 -0.435014872722787977E-03 -0.435084983920160974E-03
+ -0.435084944266096979E-03 -0.435014910749576022E-03 -0.166666666666667011E-01
+ -0.138286772670705003E-03 -0.140229422244634997E-03 -0.138288005601589994E-03
+ -0.140228189313749003E-03 -0.138900291039652002E-03 -0.139615903875686995E-03
+  0.000000000000000000E+00  0.419834016282712025E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305078950007E-03 -0.161424187296962002E-03
+ -0.666666666666666970E-02 -0.435072265974590982E-03 -0.435066581504923002E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255995473964411996E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434977413290588025E-03 -0.435124045681951985E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286961247241009E-03 -0.140229233668097989E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.253627459848576008E-01
+  0.000000000000000000E+00 -0.434982364929778985E-03 -0.435118882445508014E-03
+ -0.666666666666666970E-02 -0.140242815655179008E-03 -0.138273379260159989E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319681677869403033E-01
+ -0.161422464571142990E-03 -0.161425027804768992E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.434576582527966993E-03 -0.435573489949336987E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320997527624522980E-01  0.000000000000000000E+00 -0.161423316033646006E-03
+ -0.161424176342266003E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435072976730357019E-03 -0.435065886383583010E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.292687587508236001E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423746234547004E-03
+ -0.161423746141366008E-03 -0.161424006944943987E-03 -0.161423485430967995E-03
+ -0.161423485376057003E-03 -0.161424006999856008E-03 -0.666666666666666970E-02
+ -0.435021380261554999E-03 -0.435118476044887013E-03 -0.434889017803224000E-03
+ -0.435253786460061004E-03 -0.434968358559784986E-03 -0.435172681171430026E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.420929989771356011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435014809641986002E-03 -0.435085049702760990E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.138286733578440991E-03 -0.140229461336898007E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.255502685417309007E-01  0.000000000000000000E+00 -0.161423359690042998E-03
+ -0.161424132685869011E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435070746654247023E-03 -0.435068067322774026E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.252823730509023001E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423118749988009E-03 -0.161424373625924000E-03
+ -0.666666666666666970E-02 -0.139258062928016002E-03 -0.139258131987322995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.255489363703280993E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313440402000E-03
+ -0.161424178935510009E-03  0.000000000000000000E+00 -0.434870514603341021E-03
+ -0.435272746258683974E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322040616077563027E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435018118913463021E-03 -0.435081599403137019E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138276442783731991E-03 -0.140239752131608009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.258538650317957017E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435040431700919026E-03
+ -0.435058330932033995E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.138909454179009012E-03 -0.139606740736330012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320997274074339978E-01 -0.161423315341556987E-03
+ -0.161424177034354995E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435070727065607990E-03 -0.435068086477601978E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.256925653634595998E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.430785237193071976E-03 -0.439488203700820024E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.138857649137926994E-03
+ -0.139658545777412004E-03  0.000000000000000000E+00  0.319726623957349021E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161422835104515990E-03 -0.161424657271395992E-03 -0.427144778551284004E-03
+ -0.443167299433834994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256448637616590001E-01  0.000000000000000000E+00
+ -0.324936573418277007E-03 -0.544311326562666028E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138671855069575001E-03
+ -0.139844339845764999E-03  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.421522789439464016E-01  0.000000000000000000E+00
+ -0.438342696270233999E-03 -0.431915478441594015E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.403316551935295006E-04 -0.713262909582468036E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.250271632275507992E-01  0.000000000000000000E+00  0.157694230048913999E-04
+ -0.703450845393883040E-03 -0.666666666666666970E-02 -0.143088258477573990E-03
+ -0.135427936437765008E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.321536300563562971E-01  0.000000000000000000E+00  0.469561289675323007E-04
+ -0.734637551356524029E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.337097960055970989E-03 -0.529554555471797037E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.251453403201698997E-01 -0.161349265278626000E-03 -0.161498227097286009E-03
+ -0.666666666666666970E-02 -0.239135715409318991E-03 -0.630652995284446973E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.312158150516623004E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161424178244695010E-03 -0.161423314131216999E-03
+ -0.161423314059395002E-03 -0.161424178316517007E-03 -0.161423748120379993E-03
+ -0.161423744255532992E-03 -0.166666666666667011E-01 -0.435052937193477996E-03
+ -0.435086217679552987E-03 -0.138285143313543007E-03 -0.140231051601796993E-03
+ -0.435084982345309993E-03 -0.435054144804906983E-03  0.000000000000000000E+00
+  0.420929687910095995E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435023352535621991E-03 -0.435076141203039014E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.138286660715920991E-03
+ -0.140229534199418006E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256871015736835004E-01  0.000000000000000000E+00
+ -0.435049188847810018E-03 -0.435049198755422021E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.139258077328959006E-03 -0.139258117586380995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252598187185257006E-01
+  0.000000000000000000E+00 -0.161423313380501998E-03 -0.161424178995410011E-03
+ -0.666666666666666970E-02 -0.435070190010078980E-03 -0.435068611700095025E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.320929801235649015E-01
+ -0.435014945836726004E-03 -0.435084907682568021E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138286694923928999E-03 -0.140229499991409998E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320997361127142999E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423315570077993E-03
+ -0.161424176805833989E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069372832834977E-03 -0.435069411726189985E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.296373133570986998E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435042919779554996E-03 -0.435055736280175014E-03 -0.435084883176088981E-03
+ -0.435014969337498025E-03 -0.435014925977329975E-03 -0.435084928391770982E-03
+ -0.666666666666666970E-02 -0.138910376259480993E-03 -0.139605818655859007E-03
+ -0.138287928869166004E-03 -0.140228266046172993E-03  0.000000000000000000E+00
+ -0.138286695535614994E-03 -0.140229499379725006E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.419834073423381998E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423305232433000E-03 -0.161424187143480012E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435069384652433987E-03 -0.435069399640343013E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.256582249639075999E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434980660603881994E-03 -0.435120659531148981E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286858193205001E-03 -0.140229336722134999E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256868354321434997E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434980641366321990E-03 -0.435120678040121982E-03 -0.666666666666666970E-02
+ -0.140229336893623989E-03 -0.138286858021716011E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255642318718434999E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423743718790006E-03 -0.161423748657122003E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435032845221742983E-03 -0.435106762281529014E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996545778364992E-01
+  0.000000000000000000E+00 -0.161423313381549988E-03 -0.161424178994361994E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384536214995E-03
+ -0.435069399759127010E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256786742296024988E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423743714627998E-03 -0.161423748661284011E-03  0.000000000000000000E+00
+ -0.435032854625325984E-03 -0.435106752665999992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320996544124479005E-01 -0.161423313377080012E-03 -0.161424178998831997E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069378322735991E-03
+ -0.435069406112619992E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.256614590292301992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435016112049723975E-03 -0.435083691563065015E-03 -0.166666666666667011E-01
+ -0.138320998722293008E-03 -0.140195196193045990E-03  0.000000000000000000E+00
+  0.319837098823798002E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.161423313471231993E-03 -0.161424178904679989E-03
+ -0.435069384789494977E-03 -0.435069399500138975E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256582249673321001E-01  0.000000000000000000E+00 -0.434980660337917985E-03
+ -0.435120659808483994E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01 -0.138286858200591995E-03 -0.140229336714748005E-03
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.419834073415166001E-01
+  0.000000000000000000E+00 -0.161423305232409988E-03 -0.161424187143501994E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435069382174439019E-03
+ -0.435069402174196023E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.253691078098052004E-01  0.000000000000000000E+00
+ -0.434980641344910027E-03 -0.435120678062418979E-03 -0.666666666666666970E-02
+ -0.140229336715104002E-03 -0.138286858200234995E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319837098823800986E-01  0.000000000000000000E+00
+ -0.161423313471231993E-03 -0.161424178904679989E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069384776443026E-03 -0.435069399513485016E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.253136149492917996E-01 -0.435016112049779974E-03
+ -0.435083691563007010E-03 -0.666666666666666970E-02 -0.138320998722294011E-03
+ -0.140195196193044987E-03 -0.166666666666667011E-01  0.312157552277101992E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161424178824619004E-03 -0.161423313551293005E-03 -0.161423313542176003E-03
+ -0.161424178833736006E-03 -0.161423748436264003E-03 -0.161423743939648006E-03
+ -0.166666666666667011E-01 -0.435065044970892999E-03 -0.435073837108244986E-03
+ -0.138284790665693988E-03 -0.140231404249645009E-03 -0.435089215417341023E-03
+ -0.435050005423793986E-03  0.000000000000000000E+00  0.420929694350214018E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435016885622778980E-03 -0.435082884886724001E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.138286670421676992E-03 -0.140229524493662005E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.254915570467122987E-01  0.000000000000000000E+00 -0.161423360047314990E-03
+ -0.161424132328596992E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435068591366232014E-03 -0.435070210801716977E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.249645634044841011E-01  0.000000000000000000E+00
+ -0.161423119061156986E-03 -0.161424373314754995E-03 -0.666666666666666970E-02
+ -0.139258118141762012E-03 -0.139258076773577012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319795589138204966E-01 -0.161423181597302004E-03
+ -0.161424310778610005E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435063329379546009E-03 -0.435075591995543994E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.322057822807787983E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435022004975030990E-03
+ -0.435077546786745027E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138280140954257988E-03 -0.140236053961082012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.286987467725142997E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423485461702009E-03 -0.161424006914210000E-03
+ -0.161423743701962999E-03 -0.161423748673950013E-03 -0.161423485478031991E-03
+ -0.161424006897879991E-03 -0.666666666666666970E-02 -0.434818199339674985E-03
+ -0.435326186862402978E-03 -0.138107939681827004E-03 -0.140408255233511993E-03
+  0.000000000000000000E+00 -0.434750487935355013E-03 -0.435395405802110002E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.419834078568206009E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305246576987E-03 -0.161424187129334995E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435069368742756007E-03
+ -0.435069415908644010E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256580957796501002E-01  0.000000000000000000E+00
+ -0.434980863637934997E-03 -0.435120447850457018E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286593339164988E-03 -0.140229601576174009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.256868273049125004E-01
+  0.000000000000000000E+00 -0.434980612246946025E-03 -0.435120708410804025E-03
+ -0.666666666666666970E-02 -0.140229353764145001E-03 -0.138286841151193996E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256611955308473985E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435106719725676021E-03 -0.434994029732162983E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.138320118013872997E-03 -0.140196076901467003E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320999233193210026E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423320520808004E-03 -0.161424171855104005E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435059778147493012E-03 -0.435079222540740013E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.257716863609747000E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435022363974353014E-03
+ -0.435077172536910996E-03  0.000000000000000000E+00 -0.138312498857105987E-03
+ -0.140203696058234013E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996765524547006E-01
+  0.000000000000000000E+00 -0.161423313971483987E-03 -0.161424178404427995E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435068773858159996E-03
+ -0.435070024197671990E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.255013626644550011E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161778513994175003E-03 -0.161068978381738009E-03 -0.166666666666667011E-01
+ -0.430721544928323017E-03 -0.439524018875910982E-03  0.000000000000000000E+00
+  0.319967451293014030E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.161423540387061002E-03 -0.161423951988851007E-03
+ -0.436779134539225992E-03 -0.433396918749564988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.257463974004799000E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.357123790381716987E-03 -0.513334443915504997E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138849213053176987E-03
+ -0.139666981862162010E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.421600707339388966E-01  0.000000000000000000E+00 -0.438162222473089020E-03
+ -0.432071491834290977E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.110310967384949997E-03 -0.562620287003987962E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.250247683829405015E-01  0.000000000000000000E+00  0.312489112000452975E-04
+ -0.718930333589037039E-03 -0.666666666666666970E-02 -0.143948366628467004E-03
+ -0.134567828286871993E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.321784029807069966E-01  0.000000000000000000E+00 -0.131393333165154990E-03
+ -0.556288089223836053E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.344952960576817022E-03 -0.526365968885628946E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252170877020880015E-01 -0.161423598610962993E-03
+ -0.161423893764948989E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435940683620558982E-03 -0.434217221098728986E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.312157364394618012E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423748574843987E-03 -0.161423743801067995E-03 -0.161423313378308007E-03
+ -0.161424178997604002E-03 -0.161424178997597009E-03 -0.161423313378315000E-03
+ -0.166666666666667011E-01 -0.435069384129798016E-03 -0.435069400174701974E-03
+ -0.435069377183192995E-03 -0.435069407277841015E-03 -0.138284694479560007E-03
+ -0.140231500435778991E-03  0.000000000000000000E+00  0.420929781177558013E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014927902171013E-03
+ -0.435084926384880994E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138286690846829989E-03 -0.140229504068509008E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.254915410590286010E-01
+  0.000000000000000000E+00 -0.161423359665572013E-03 -0.161424132710339996E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435069213980331018E-03
+ -0.435069574158249974E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.249645180764926991E-01  0.000000000000000000E+00 -0.161423119329893002E-03
+ -0.161424373046020010E-03 -0.666666666666666970E-02 -0.139258102070886994E-03
+ -0.139258092844452003E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319784847376137008E-01 -0.161423139773135993E-03 -0.161424352602775989E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435059186696719997E-03
+ -0.435079828747282984E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322084822393016029E-01
+  0.000000000000000000E+00 -0.435016615284991993E-03 -0.435083166852704025E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138285779894368007E-03
+ -0.140230415020971993E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.286738419021206996E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423325593204012E-03
+ -0.161424166782707997E-03 -0.161423747542489989E-03 -0.161423744833421993E-03
+ -0.161423325604668989E-03 -0.161424166771242993E-03 -0.666666666666666970E-02
+ -0.435017582368835013E-03 -0.435122368257235984E-03 -0.138274982048601996E-03
+ -0.140241212866737001E-03 -0.435000864585250997E-03 -0.435139462305625011E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.419834075447891031E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423305237978992E-03 -0.161424187137933992E-03 -0.666666666666666970E-02
+ -0.435069372007444006E-03 -0.435069412570366005E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256581825663056012E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434980761705325996E-03
+ -0.435120554118331997E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286771391029988E-03 -0.140229423524309009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256868309172346015E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434980606570965994E-03 -0.435120714326099026E-03 -0.666666666666666970E-02
+ -0.140229346218551998E-03 -0.138286848696788002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256614027694726000E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435029702345694975E-03 -0.435069519599524978E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.138320910572908000E-03 -0.140195284342430998E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320997009695584007E-01
+  0.000000000000000000E+00 -0.161423314617753998E-03 -0.161424177758158011E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435066266755664011E-03
+ -0.435072587790152974E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257749746704468014E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435018010012214000E-03 -0.435081712497056015E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138319195561076991E-03
+ -0.140196999354262006E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996637608070984E-01
+ -0.161423313627734002E-03 -0.161424178748178007E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069045666087995E-03 -0.435069746265178998E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255649837723719991E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423599069663996E-03
+ -0.161423893306248013E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.435941020866372994E-03 -0.434216890957876992E-03  0.000000000000000000E+00
+  0.321778215180335975E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.133203886290680012E-03 -0.554477536098311953E-03
+ -0.310287644885220981E-03 -0.560775130597174036E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.258308431809457992E-01
+  0.000000000000000000E+00 -0.200815543944558010E-03 -0.626132470433443992E-03
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.446184483319574977E-03
+ -0.411111793506487000E-03  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.421914017926215967E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435150874851761001E-03 -0.434951664732459008E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138419555564682994E-03 -0.140096639350656004E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.251983574296692006E-01  0.000000000000000000E+00 -0.874932041930874000E-05
+ -0.678932101969682953E-03 -0.666666666666666970E-02 -0.139012530212417987E-03
+ -0.139503664702921010E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319960855039493966E-01  0.000000000000000000E+00 -0.161423534312431000E-03
+ -0.161423958063482012E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.422822405468364991E-03 -0.447571581638852981E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.251635090367436989E-01 -0.161499426491679002E-03 -0.161348065884233007E-03
+ -0.666666666666666970E-02 -0.431483288576659993E-03 -0.438741647262608995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.312157284528267985E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161424179076904013E-03
+ -0.161423313299007996E-03 -0.161423313309436991E-03 -0.161424179066474991E-03
+ -0.161423748561558987E-03 -0.161423743814352995E-03 -0.166666666666667011E-01
+ -0.435069198160920999E-03 -0.435069590334165020E-03 -0.138284743432991999E-03
+ -0.140231451482346998E-03 -0.435089075923097014E-03 -0.435050142023235000E-03
+  0.000000000000000000E+00  0.420929803448919018E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014018174735974E-03 -0.435085875041650019E-03
+ -0.666666666666666970E-02 -0.138286696532327987E-03 -0.140229498383011010E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256871018283196008E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435049188665910004E-03
+ -0.435049198945112027E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.139258096875932990E-03 -0.139258098039406008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252598186653136998E-01  0.000000000000000000E+00 -0.161423313379014988E-03
+ -0.161424178996896994E-03 -0.666666666666666970E-02 -0.435069414335974010E-03
+ -0.435069370280600999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.320929801358365019E-01 -0.435014929695106008E-03 -0.435084924514935985E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286694969620992E-03
+ -0.140229499945718005E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996579150382988E-01  0.000000000000000000E+00
+ -0.161423313471394000E-03 -0.161424178904518009E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069382051928022E-03 -0.435069402299393022E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.295485029250618994E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435016114097582006E-03 -0.435083689427570000E-03
+ -0.435055716891851981E-03 -0.435042938371768973E-03 -0.435014924444715012E-03
+ -0.435084929989965974E-03 -0.666666666666666970E-02 -0.138321048329605000E-03
+ -0.140195146585733998E-03 -0.138900264540687006E-03 -0.139615930374651991E-03
+ -0.138286695606836994E-03 -0.140229499308502004E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.420929804454360026E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014924421165002E-03
+ -0.435084930014524021E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138286695611057007E-03 -0.140229499304281991E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257458287849329989E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435049188237201015E-03 -0.435049199392181004E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258097270754996E-03
+ -0.139258097644584002E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255775463582341989E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423313378621992E-03 -0.161424178997289990E-03
+ -0.666666666666666970E-02 -0.435069399485626008E-03 -0.435069384803689026E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.257404831787951989E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435042938371768973E-03 -0.435055716891853011E-03
+  0.000000000000000000E+00 -0.138910376033074995E-03 -0.139605818882264002E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320997361628787004E-01
+  0.000000000000000000E+00 -0.161423315571431998E-03 -0.161424176804480011E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384827771000E-03
+ -0.435069399460985995E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256633780151170991E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313378029990E-03
+ -0.161424178997881992E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435069376688627994E-03 -0.435069407783549012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322089281648140976E-01 -0.435014924425583993E-03 -0.435084930009956982E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286694981505013E-03
+ -0.140229499933834011E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.256614590292298002E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435016112049637998E-03
+ -0.435083691563155003E-03 -0.166666666666667011E-01 -0.138320998722293008E-03
+ -0.140195196193045990E-03  0.000000000000000000E+00  0.319837098823795019E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161423313471231993E-03 -0.161424178904679989E-03 -0.435069384800439022E-03
+ -0.435069399488948003E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256582249675047017E-01
+  0.000000000000000000E+00 -0.434980660275586010E-03 -0.435120659873482023E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138286858201177004E-03 -0.140229336714161993E-03  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.419834073409681013E-01  0.000000000000000000E+00
+ -0.161423305232394999E-03 -0.161424187143517010E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435069384650815002E-03 -0.435069399641999023E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.253691078101502994E-01  0.000000000000000000E+00 -0.434980641228884999E-03
+ -0.435120678183400979E-03 -0.666666666666666970E-02 -0.140229336714165002E-03
+ -0.138286858201173995E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319837098823795019E-01  0.000000000000000000E+00 -0.161423313471231993E-03
+ -0.161424178904679989E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069384800325018E-03 -0.435069399489064988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253136149492911994E-01 -0.435016112049640979E-03 -0.435083691563150992E-03
+ -0.666666666666666970E-02 -0.138320998722293008E-03 -0.140195196193045990E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.312154838512909011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423744465432012E-03 -0.161423747910479997E-03
+ -0.161423311121030010E-03 -0.161424181254881999E-03 -0.161424181258488001E-03
+ -0.161423311117424008E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.435082183658245017E-03 -0.435056882388395007E-03 -0.435077576153311009E-03
+ -0.435061388362830001E-03 -0.138286177937990999E-03 -0.140230016977347998E-03
+  0.000000000000000000E+00  0.420359769758308008E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434988625092396007E-03 -0.435112371917261017E-03
+ -0.666666666666666970E-02 -0.138123103593930992E-03 -0.140393091321408005E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.254927196052617995E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423391697327001E-03
+ -0.161424100678585008E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.429493592762821984E-03 -0.440767063383173984E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.251692612175913009E-01  0.000000000000000000E+00 -0.161423746187955991E-03
+ -0.161423746187955991E-03 -0.666666666666666970E-02 -0.440899213583507998E-03
+ -0.429390539341747025E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.318962392046409984E-01 -0.363404809254292002E-03 -0.507297969422810992E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.135240574414478994E-03
+ -0.143275620500860003E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.322093262366704020E-01  0.000000000000000000E+00
+ -0.435187611870158021E-03 -0.434916457449104998E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138287300795975000E-03 -0.140228894119363997E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.286700106542116008E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423291382795011E-03
+ -0.161424200993116998E-03 -0.161423748962132008E-03 -0.161423743413780001E-03
+ -0.161423291392504990E-03 -0.161424200983406992E-03 -0.666666666666666970E-02
+ -0.435081950459870019E-03 -0.435057110654062009E-03 -0.138300165059673992E-03
+ -0.140216029855665006E-03 -0.435072720707763974E-03 -0.435066136833905980E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.419834073815130021E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305233497009E-03
+ -0.161424187142415000E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435069380325788988E-03 -0.435069404064516009E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256582188293747009E-01
+  0.000000000000000000E+00 -0.434980686274821977E-03 -0.435120632764312025E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286845679548006E-03
+ -0.140229349235790991E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256868342262811002E-01  0.000000000000000000E+00 -0.434980627630137003E-03
+ -0.435120692364288997E-03 -0.666666666666666970E-02 -0.140229339372730996E-03
+ -0.138286855542608001E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256614658944893000E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435012511657230014E-03
+ -0.435087446025792977E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.138321094746602009E-03 -0.140195100168737991E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996483486796005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423313210778008E-03 -0.161424179165134001E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435068950004979999E-03
+ -0.435069844082024008E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257759423107052008E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435016296291419009E-03 -0.435083499433781991E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138321121808393998E-03
+ -0.140195073106944999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320996585742336984E-01  0.000000000000000000E+00 -0.161423313488731992E-03
+ -0.161424178887179990E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069291046625011E-03 -0.435069495355377978E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.255844281626146988E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.327328574610849998E-03 -0.542038187704011951E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.138213770503833004E-03
+ -0.140302424411505993E-03  0.000000000000000000E+00  0.321601835856102983E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.946023952620347994E-04 -0.593079027126956976E-03 -0.133620312350716999E-03
+ -0.725519910394541970E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257239980440517012E-01  0.000000000000000000E+00
+  0.196968326301319991E-04 -0.707378255019124025E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.458132642260517973E-03
+ -0.413595834475461983E-03  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.423184714573687989E-01  0.000000000000000000E+00 -0.434738589247653001E-03
+ -0.435372752845944011E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138688174968105987E-03 -0.139828019947233010E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.251897402149190985E-01  0.000000000000000000E+00 -0.646688794373514043E-04
+ -0.623012542951639965E-03 -0.666666666666666970E-02 -0.137976511090806987E-03
+ -0.140539683824532011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319702861190497012E-01  0.000000000000000000E+00 -0.161422661148781000E-03
+ -0.161424831227131009E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.425027871300158022E-03 -0.445328608699814012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253132418654096990E-01 -0.429884568296530988E-03 -0.440422631514232989E-03
+ -0.666666666666666970E-02 -0.138411741521856003E-03 -0.140104453393482994E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.310595796671125010E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161430908664596994E-03 -0.161416583711314988E-03
+ -0.161367227947133000E-03 -0.161480264428779009E-03 -0.161480354961919997E-03
+ -0.161367137413992012E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.438392776826464984E-03 -0.431826121382375026E-03 -0.436721410041992001E-03
+ -0.433458024629497019E-03 -0.139062002018278005E-03 -0.139454192897061995E-03
+  0.000000000000000000E+00  0.421404524821471990E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.402803578099727999E-03 -0.467868848878249025E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.227910657339421994E-03
+ -0.900841911728360008E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256049262714956004E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.340826630558288979E-03
+ -0.529260041764853008E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138562349500047995E-03 -0.139953845415291002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.253040215793172013E-01  0.000000000000000000E+00
+ -0.246116548137240024E-03 -0.598443155917590948E-03 -0.666666666666666970E-02
+ -0.140073789192540987E-03 -0.138442405722798010E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319703071004562994E-01 -0.161422660702825987E-03
+ -0.161424831673085995E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.426922177757430997E-03 -0.443395080957603976E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320998407804300029E-01
+  0.000000000000000000E+00 -0.161423318608057989E-03 -0.161424173767853993E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.432673706188636982E-03
+ -0.437518364259369976E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.295486137458738987E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435073936139989976E-03 -0.435025467270096005E-03 -0.435055941094787983E-03
+ -0.435042723373920998E-03 -0.435014132966001985E-03 -0.435085755332848012E-03
+ -0.666666666666666970E-02 -0.138321121970870991E-03 -0.140195072944468007E-03
+ -0.138900280797982998E-03 -0.139615914117355999E-03  0.000000000000000000E+00
+ -0.138286771859713999E-03 -0.140229423055624998E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420929803055049012E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014930179625984E-03 -0.435084924009661019E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.138286695318050990E-03
+ -0.140229499597288007E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.257458286284305005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435049187797943010E-03
+ -0.435049199850251001E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.139258097008661995E-03 -0.139258097906677002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255775441079788002E-01
+  0.000000000000000000E+00 -0.161423313317997011E-03 -0.161424179057914998E-03
+ -0.666666666666666970E-02 -0.435069409759835009E-03 -0.435069374755894995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.257406249620084002E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435042723384703009E-03
+ -0.435055941080233978E-03  0.000000000000000000E+00 -0.138910497854425013E-03
+ -0.139605697060914011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320997355336692033E-01  0.000000000000000000E+00 -0.161423315554604991E-03
+ -0.161424176821307994E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069370488565992E-03 -0.435069414123279022E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.256633754959206990E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313307973996E-03
+ -0.161424179067938013E-03  0.000000000000000000E+00 -0.435085186467014016E-03
+ -0.435053945729502005E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322092437021506001E-01
+ -0.435014144601562013E-03 -0.435085743160859985E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138287341133016000E-03 -0.140228853782322998E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.254980067883627988E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161506286311511997E-03
+ -0.161341206064400012E-03 -0.166666666666667011E-01 -0.431364756974581974E-03
+ -0.438866309542754976E-03  0.000000000000000000E+00  0.319866973588324027E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161423386153582989E-03 -0.161424106222328993E-03 -0.435249329951151978E-03
+ -0.434893408363733994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256606789218085003E-01
+  0.000000000000000000E+00 -0.434963065954236988E-03 -0.435139005528701976E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138291831741704991E-03 -0.140224363173634006E-03  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.419833492947222983E-01  0.000000000000000000E+00
+ -0.161423303638426010E-03 -0.161424188737485999E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435072120349480976E-03 -0.435066723923213975E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.253728413941171002E-01  0.000000000000000000E+00 -0.434979152196512009E-03
+ -0.435122229395894016E-03 -0.666666666666666970E-02 -0.140221824647693988E-03
+ -0.138294370267645010E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319851023354644978E-01  0.000000000000000000E+00 -0.161423349940120991E-03
+ -0.161424142435790991E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435344407954329007E-03 -0.434800430411269015E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.251600082854132011E-01 -0.161430149009246000E-03 -0.161417343366666009E-03
+ -0.666666666666666970E-02 -0.432078300694966016E-03 -0.438133229980573974E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320887275245489018E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435111004034048989E-03 -0.434989920105145983E-03 -0.435014834120551011E-03
+ -0.435085024172310027E-03 -0.435084968865059006E-03 -0.435014887158174024E-03
+ -0.166666666666667011E-01 -0.138286802031117000E-03 -0.140229392884223000E-03
+ -0.138288034558563993E-03 -0.140228160356775005E-03 -0.138900326717328012E-03
+ -0.139615868198011988E-03  0.000000000000000000E+00  0.419834068202609018E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305219030012E-03
+ -0.161424187156881997E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435073349340887991E-03 -0.435065522020417015E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256004806855422984E-01  0.000000000000000000E+00 -0.434981132067385973E-03
+ -0.435120167645717019E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138288861702206010E-03 -0.140227333213132987E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.253696619275626985E-01  0.000000000000000000E+00
+ -0.434980119724182024E-03 -0.435121221827560020E-03 -0.666666666666666970E-02
+ -0.140228210971901000E-03 -0.138287983943439000E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319694542067313001E-01 -0.161422588303278990E-03
+ -0.161424904072633994E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435109649626736015E-03 -0.435030036925322975E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320971379137156981E-01  0.000000000000000000E+00 -0.161423238729728995E-03
+ -0.161424253646182987E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435144484966556002E-03 -0.434995958682276988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.296734541379750985E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434149030098187011E-03 -0.435987412204495999E-03 -0.435031452984650986E-03
+ -0.435067691864950016E-03 -0.434950548419927988E-03 -0.435152046333653016E-03
+ -0.666666666666666970E-02 -0.138392945863299996E-03 -0.140123249052040005E-03
+ -0.138921815209089002E-03 -0.139594379706249995E-03  0.000000000000000000E+00
+ -0.138356769775533993E-03 -0.140159425139805004E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.420931571567067994E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014671747375992E-03
+ -0.435085193476965012E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138287057706326004E-03 -0.140229137209012993E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.257455695212117017E-01
+  0.000000000000000000E+00 -0.435047022783618996E-03 -0.435051457582689993E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258438339454998E-03
+ -0.139257756575883999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.255770773094191987E-01  0.000000000000000000E+00 -0.161423300794449003E-03
+ -0.161424191581463006E-03 -0.666666666666666970E-02 -0.435056164326672024E-03
+ -0.435082918027994002E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.257823402544687004E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435067703423035017E-03 -0.435031441124867973E-03  0.000000000000000000E+00
+ -0.138940670377310000E-03 -0.139575524538028997E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320998367437754992E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423318157696995E-03
+ -0.161424174218214987E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435046243704917976E-03 -0.435093061865261994E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.256631852001646017E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423307528138997E-03
+ -0.161424184847773012E-03  0.000000000000000000E+00 -0.438796404237050007E-03
+ -0.431424781290872027E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.323103002924175972E-01
+  0.000000000000000000E+00 -0.434856382728898011E-03 -0.435250190151687997E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138421511330616989E-03
+ -0.140094683584722008E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.255452807930411988E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423207454343989E-03 -0.161424284921568996E-03 -0.166666666666667011E-01
+ -0.404324480174985020E-03 -0.466398849792140982E-03  0.000000000000000000E+00
+  0.318949373068843009E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.323838290351469984E-03 -0.544927583290811006E-03
+ -0.135274543056306012E-03 -0.143241651859033012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.255622073519536988E-01  0.000000000000000000E+00 -0.114452729937741999E-03
+ -0.573228692451250047E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.218484783290129011E-03 -0.891416037679068000E-03
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.421663414565641992E-01
+  0.000000000000000000E+00  0.687960644515914959E-05 -0.694561028834151002E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.438096223299876001E-03
+ -0.432167718169455990E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.255893861003781004E-01  0.000000000000000000E+00
+ -0.330656818749315009E-03 -0.535638411162091960E-03 -0.666666666666666970E-02
+ -0.139701772394438010E-03 -0.138814422520901990E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319824507944573982E-01  0.000000000000000000E+00
+ -0.161423276715586998E-03 -0.161424215660325011E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.453058027683075014E-03 -0.417478867706813021E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.252771264600550986E-01 -0.434642415203527976E-03
+ -0.435473406290417023E-03 -0.666666666666666970E-02 -0.142683907585935987E-03
+ -0.135832287329404013E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.323033163369174023E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435248914259523019E-03 -0.434857618105281016E-03 -0.434883815526439026E-03
+ -0.435221605059342973E-03 -0.435144012463656023E-03 -0.434958245914109976E-03
+ -0.166666666666667011E-01 -0.138392691801889008E-03 -0.140123503113449990E-03
+ -0.138936896109547012E-03 -0.139579298805792012E-03 -0.138396653973258001E-03
+ -0.140119540942080996E-03  0.000000000000000000E+00  0.419134531555254014E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.160003730040957000E-03
+ -0.162843762334955009E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.439266035090063022E-03 -0.430980363867674026E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256392673957523999E-01  0.000000000000000000E+00 -0.434777562142257996E-03
+ -0.435332391625641989E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138409864927514006E-03 -0.140106329987824992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.254101279172769014E-01  0.000000000000000000E+00
+ -0.373644623806981997E-03 -0.492026351519330037E-03 -0.666666666666666970E-02
+ -0.908412435934918957E-03  0.235481181545980998E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.321268372004474001E-01 -0.200001267706564000E-03
+ -0.627372315602944992E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.127171097759715991E-03 -0.545760156629222998E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320995602660988991E-01  0.000000000000000000E+00 -0.161423310639775001E-03
+ -0.161424181736137008E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.437288207604906009E-03 -0.432899576991382993E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.296377168165362984E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434989920176705983E-03 -0.435111003938864983E-03 -0.435084786849818019E-03
+ -0.435015061692732000E-03 -0.435015088357717974E-03 -0.435084759043768022E-03
+ -0.666666666666666970E-02 -0.138910479291025010E-03 -0.139605715624313987E-03
+ -0.138288220944727007E-03 -0.140227973970611991E-03 -0.138286985838064012E-03
+ -0.140229209077275013E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.419834073428042021E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423305232445008E-03 -0.161424187143467001E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435069384775871976E-03 -0.435069399514123015E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.256582244991315003E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434980660285921980E-03 -0.435120659862830007E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286857240759006E-03 -0.140229337674580994E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256868355803533989E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434980641844436011E-03 -0.435120677541527977E-03 -0.666666666666666970E-02
+ -0.140229336589824008E-03 -0.138286858325514989E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255642317604596984E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423743725482000E-03
+ -0.161423748650431012E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435032785443368025E-03 -0.435106823406514024E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996548652853000E-01
+ -0.161423313389232997E-03 -0.161424178986679012E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069349700356026E-03 -0.435069435379967977E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256786747395171995E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423743712989009E-03 -0.161423748662923000E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435032864937970982E-03
+ -0.435106742120615006E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996543404327983E-01
+ -0.161423313375148994E-03 -0.161424179000762988E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069389550788983E-03 -0.435069394631554978E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.256124534411805993E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.422563580132133995E-03 -0.447998624216019981E-03
+ -0.166666666666667011E-01 -0.138286150728846990E-03 -0.140230044186492007E-03
+  0.000000000000000000E+00  0.319552376669786989E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161419817025352993E-03
+ -0.161427675350558989E-03 -0.434132277678501012E-03 -0.436028351482909977E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256464487088938001E-01  0.000000000000000000E+00
+ -0.434899896421036018E-03 -0.435204881377557976E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138261308742496992E-03
+ -0.140254886172842006E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.419830852158453033E-01  0.000000000000000000E+00 -0.161423296248439004E-03
+ -0.161424196127473005E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435080307367357005E-03 -0.435058717463834973E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.253594622355668986E-01
+  0.000000000000000000E+00 -0.434950331860284995E-03 -0.435152283722859020E-03
+ -0.666666666666666970E-02 -0.140249996876446007E-03 -0.138266198038893993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319571786575690020E-01
+  0.000000000000000000E+00 -0.161420475113086009E-03 -0.161427017262827003E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.434308830151222019E-03
+ -0.435847642640102974E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252660555352819012E-01
+ -0.423620518084152984E-03 -0.446908164705096009E-03 -0.666666666666666970E-02
+ -0.138290733115794992E-03 -0.140225461799544005E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.320831749900574031E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.434897829232994994E-03
+ -0.435207041701621983E-03 -0.435182072257829001E-03 -0.434921773469817985E-03
+ -0.434970483470178027E-03 -0.435131276171263001E-03 -0.166666666666667011E-01
+ -0.138283460799556990E-03 -0.140232734115782007E-03 -0.138290545167624998E-03
+ -0.140225649747715002E-03 -0.138898900526048002E-03 -0.139617294389290995E-03
+  0.000000000000000000E+00  0.421628681453787013E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.397188434436698007E-04 -0.727400265832661014E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435230978617207986E-03
+ -0.434916855000890001E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256973883757283007E-01
+  0.000000000000000000E+00 -0.435443117881719019E-03 -0.434671353469976026E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138672805722981989E-03
+ -0.139843389192358011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253811407347032010E-01  0.000000000000000000E+00 -0.111660189923879004E-03
+ -0.576021232465113043E-03 -0.666666666666666970E-02 -0.807402873679339953E-03
+  0.113200250901337992E-04 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.318951479564182974E-01 -0.334801618998244014E-03 -0.534674353093398985E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.135236534922172994E-03
+ -0.143279659993166003E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320991543625473996E-01
+  0.000000000000000000E+00 -0.161423299320882996E-03 -0.161424193055029988E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.439089523234416986E-03
+ -0.431138190963970024E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.292550790147567998E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423735549563995E-03 -0.161423756826347987E-03
+ -0.161424083758227992E-03 -0.161423408617683990E-03 -0.161423408591030993E-03
+ -0.161424083784880988E-03 -0.666666666666666970E-02 -0.434939106055095008E-03
+ -0.435202598868165004E-03 -0.434989974486818000E-03 -0.435150588304691020E-03
+ -0.434898044278314978E-03 -0.435244580342165975E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420929736779580982E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014942481458009E-03 -0.435084911182310021E-03
+ -0.666666666666666970E-02 -0.138286681738491990E-03 -0.140229513176847007E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255502900122650013E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423360190818011E-03 -0.161424132185093998E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435068884552577983E-03
+ -0.435069911008902977E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252824734873780994E-01  0.000000000000000000E+00 -0.161423118220565989E-03
+ -0.161424374155345993E-03 -0.666666666666666970E-02 -0.139258110569624000E-03
+ -0.139258084345714997E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255485447377079983E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423302756024003E-03
+ -0.161424189619888006E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434872688531394978E-03 -0.435270525342987020E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322052497845927016E-01 -0.435023979094238992E-03 -0.435075488205511978E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138279010834734011E-03
+ -0.140237184080605013E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.258530259445254006E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.434958189051660006E-03 -0.435144092590821993E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138909064929736007E-03
+ -0.139607129985603993E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320994819723091004E-01 -0.161423308725767013E-03 -0.161424183650144996E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435080068461045984E-03
+ -0.435058950989111021E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.256614731138163008E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.429527300766954008E-03
+ -0.440793399874086025E-03 -0.166666666666667011E-01 -0.138412525549365991E-03
+ -0.140103669365973006E-03  0.000000000000000000E+00  0.319693244854589026E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161422576318977012E-03 -0.161424916056934997E-03 -0.437741651795111979E-03
+ -0.432457178334932999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257682962742948989E-01
+  0.000000000000000000E+00 -0.372135275695876014E-03 -0.498733752802202964E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138908713201793988E-03 -0.139607481713545009E-03  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.418896828694112003E-01  0.000000000000000000E+00
+ -0.436533664253622989E-03 -0.433624016341012007E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.134626819469842998E-03 -0.143889375445495999E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.252833740154079986E-01  0.000000000000000000E+00  0.378159457418904982E-04
+ -0.725497368130881976E-03 -0.666666666666666970E-02 -0.597713125202172023E-03
+ -0.752181291867666958E-04 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.321707087590768978E-01  0.000000000000000000E+00 -0.981956570690248044E-04
+ -0.589485765319966958E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.242422521139187011E-03 -0.626419769058526952E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252392538739079006E-01 -0.333702074783931012E-03
+ -0.536013663819399954E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138219319560300990E-03 -0.140296875355038007E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.310595586219111000E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161431110154876994E-03
+ -0.161416382221035991E-03 -0.161367182761019008E-03 -0.161480309614893001E-03
+ -0.161480399925954988E-03 -0.161367092449956993E-03 -0.166666666666667011E-01
+ -0.438417298132810025E-03 -0.431802193377014986E-03 -0.436709554426625997E-03
+ -0.433469595476341000E-03 -0.139063297967854010E-03 -0.139452896947484987E-03
+  0.000000000000000000E+00  0.421404520292116974E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.402798477196058982E-03 -0.467873676471954026E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.227909367425477999E-03
+ -0.900840621814416988E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256044922875989997E-01
+  0.000000000000000000E+00 -0.340827301030340014E-03 -0.529259221222554946E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138561533653603990E-03
+ -0.139954661261736010E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253040211801738996E-01  0.000000000000000000E+00 -0.246116211180549989E-03
+ -0.598443499755276051E-03 -0.666666666666666970E-02 -0.140073790975437003E-03
+ -0.138442403939901995E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319703070963996000E-01 -0.161422660702506988E-03 -0.161424831673404994E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.426922132602362023E-03
+ -0.443395127058409007E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.321002728756756978E-01
+  0.000000000000000000E+00 -0.161423329712289998E-03 -0.161424162663622011E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.432632949803903998E-03
+ -0.437559988660470999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.296364413425855011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435207087979181024E-03 -0.434897780856480991E-03 -0.435078144333426976E-03
+ -0.435021431652866995E-03 -0.435027766934605010E-03 -0.435071537880043005E-03
+ -0.666666666666666970E-02 -0.138910179724992006E-03 -0.139606015190346991E-03
+ -0.138289402276009992E-03 -0.140226792639329005E-03 -0.138286064088546994E-03
+ -0.140230130826792004E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.419834145316453980E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423305428603012E-03 -0.161424186947310000E-03 -0.666666666666666970E-02
+ -0.435068797728623978E-03 -0.435069999793927990E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256581977277514015E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434982640822920976E-03 -0.435118594642042004E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286799402304007E-03 -0.140229395513034991E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256861835048418988E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434980941091875014E-03 -0.435120365734433022E-03
+ -0.666666666666666970E-02 -0.140230675700951006E-03 -0.138285519214387991E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.255648151652009999E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423744258310013E-03 -0.161423748117601996E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434971272637178004E-03 -0.435169719904374022E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996755017593990E-01
+ -0.161423313937937010E-03 -0.161424178437974999E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435067422348721020E-03 -0.435071406160561979E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256787278728396011E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423752315616002E-03 -0.161423740060296007E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434863912658681987E-03
+ -0.435279496392151021E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.321000539265607968E-01
+ -0.161423323758083008E-03 -0.161424168617829001E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435021975200768020E-03 -0.435117876636681000E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.254980067884808988E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161506286650737010E-03 -0.161341205725174999E-03
+ -0.166666666666667011E-01 -0.431364756994681022E-03 -0.438866309522160990E-03
+  0.000000000000000000E+00  0.319866973712260028E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161423386153855991E-03
+ -0.161424106222056993E-03 -0.435249227300768993E-03 -0.434893508757877025E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256606773361423013E-01  0.000000000000000000E+00
+ -0.434963273622003977E-03 -0.435138788986911998E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138291826977172987E-03
+ -0.140224367938166010E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.419833531788693995E-01  0.000000000000000000E+00 -0.161423303747212010E-03
+ -0.161424188628699999E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435047841600877025E-03 -0.435091428270091996E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.253728372267130013E-01
+  0.000000000000000000E+00 -0.434980109299934025E-03 -0.435121231414349987E-03
+ -0.666666666666666970E-02 -0.140221834450205004E-03 -0.138294360465133993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319851023684395025E-01
+  0.000000000000000000E+00 -0.161423349940937992E-03 -0.161424142434973990E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435344118977629016E-03
+ -0.434800713024421025E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.251600082863014003E-01
+ -0.161430149161919989E-03 -0.161417343213992996E-03 -0.666666666666666970E-02
+ -0.432078300658010010E-03 -0.438133230018339012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.320880852120032020E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435006978817776007E-03
+ -0.435093216238015019E-03 -0.435023227600191012E-03 -0.435076271502717992E-03
+ -0.435080703092234026E-03 -0.435018977896097989E-03 -0.166666666666667011E-01
+ -0.138286342284362987E-03 -0.140229852630976986E-03 -0.138288877468917000E-03
+ -0.140227317446421998E-03 -0.138898309276879996E-03 -0.139617885638460004E-03
+  0.000000000000000000E+00  0.419834046918393031E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305160284008E-03 -0.161424187215628001E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435057325047151988E-03
+ -0.435081731065996996E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256008015072015996E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434979053168488001E-03 -0.435122335339392023E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138289510792666997E-03 -0.140226684122673003E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253691080193319991E-01
+  0.000000000000000000E+00 -0.434981271968578004E-03 -0.435120020450890975E-03
+ -0.666666666666666970E-02 -0.140229335054007991E-03 -0.138286859861331006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319837881512948022E-01
+ -0.161423315571830009E-03 -0.161424176804082000E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069394631856982E-03 -0.435069389550488985E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320970354177166975E-01  0.000000000000000000E+00 -0.161423235472326997E-03
+ -0.161424256903585988E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435168272526173982E-03 -0.434972697148703995E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.295545368095916985E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434122085949861976E-03 -0.436015593157589986E-03 -0.435101402844330994E-03
+ -0.434999127125352007E-03 -0.434882942033044983E-03 -0.435222550914836999E-03
+ -0.666666666666666970E-02 -0.138329236201211012E-03 -0.140186958714128013E-03
+ -0.138899088561680992E-03 -0.139617106353658006E-03 -0.138290618264299002E-03
+ -0.140225576651039995E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.420931657846550997E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014640889280020E-03
+ -0.435085225654339980E-03 -0.666666666666666970E-02 -0.138287075366651002E-03
+ -0.140229119548687995E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.257458666890181993E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435045894309902018E-03 -0.435052634377398005E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.139258454546935994E-03 -0.139257740368403004E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255770740455284994E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423300836045991E-03 -0.161424191539865991E-03
+ -0.666666666666666970E-02 -0.435055528504503020E-03 -0.435083568186561014E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257826760506618992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.434999152127107014E-03 -0.435101368436342979E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138940653027276996E-03
+ -0.139575541888062001E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996072561336027E-01
+ -0.161423312108405990E-03 -0.161424180267505992E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435072662950887002E-03 -0.435066193259916003E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256629422996789008E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423300977684998E-03 -0.161424191398227011E-03
+  0.000000000000000000E+00 -0.438825800967892977E-03 -0.431396059158853015E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.323107970182680976E-01 -0.434855364031651011E-03
+ -0.435251251813800982E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138421931556495010E-03 -0.140094263358843987E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255642320215948987E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423743718458999E-03 -0.161423748657454013E-03
+ -0.166666666666667011E-01 -0.435032852641439025E-03 -0.435106754694577981E-03
+  0.000000000000000000E+00  0.319837065352637992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161423313381099990E-03
+ -0.161424178994811992E-03 -0.435069595976944991E-03 -0.435069192642484977E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256582283981556004E-01  0.000000000000000000E+00
+ -0.434978918685956012E-03 -0.435122475932554027E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138286871097143010E-03
+ -0.140229323818195987E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.419833928488543007E-01  0.000000000000000000E+00 -0.161423304820364997E-03
+ -0.161424187555547012E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435207587129756010E-03 -0.434934243879776989E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.253691146856406986E-01
+  0.000000000000000000E+00 -0.434977844183998008E-03 -0.435123594722203980E-03
+ -0.666666666666666970E-02 -0.140229316757458007E-03 -0.138286878157881994E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319837065219248998E-01
+  0.000000000000000000E+00 -0.161423313380713987E-03 -0.161424178995198998E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069861087792979E-03
+ -0.435068933373936994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252163879413985006E-01
+ -0.161423743718170005E-03 -0.161423748657742004E-03 -0.666666666666666970E-02
+ -0.435032852613000023E-03 -0.435106754723656988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320883554506867022E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435127306548502995E-03 -0.434974286158534991E-03
+ -0.435023129650981017E-03 -0.435076373631457973E-03 -0.435052776118324998E-03
+ -0.435045758390568017E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.138286581602824004E-03 -0.140229613312514993E-03 -0.138900368233470993E-03
+ -0.139615826681868004E-03 -0.138293378267740995E-03 -0.140222816647599005E-03
+  0.000000000000000000E+00  0.419834123874497020E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305373013988E-03 -0.161424187002898997E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435065741772807020E-03
+ -0.435073124632554995E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.255992190832709000E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434982794949810992E-03 -0.435118433992394990E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286284757403011E-03 -0.140229910157936013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253696592428569005E-01
+  0.000000000000000000E+00 -0.434980341928489984E-03 -0.435120990165656005E-03
+ -0.666666666666666970E-02 -0.140228218876105011E-03 -0.138287976039234013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319694542100162002E-01
+ -0.161422588303611000E-03 -0.161424904072301009E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435109462002123012E-03 -0.435030220343913024E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320999013551653001E-01  0.000000000000000000E+00 -0.161423319891360013E-03
+ -0.161424172484551996E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435048884195216989E-03 -0.435090361870336983E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.297460083034186001E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435093184138760016E-03 -0.435007004727823009E-03
+ -0.435044582892759007E-03 -0.435054001407353006E-03 -0.435082620665603974E-03
+ -0.435017135654766987E-03 -0.666666666666666970E-02 -0.138933893426106987E-03
+ -0.139582301489232010E-03 -0.138357128758214001E-03 -0.140159066157124996E-03
+  0.000000000000000000E+00 -0.138352953476254988E-03 -0.140163241439084010E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.419834091612526009E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305282162996E-03
+ -0.161424187093749013E-03 -0.666666666666666970E-02 -0.435069239563229014E-03
+ -0.435069548000012994E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256578127707293990E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434981761685145974E-03 -0.435119511480767007E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286013333851002E-03 -0.140230181581488998E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256868183233514985E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434980125799632985E-03 -0.435121215670877001E-03
+ -0.666666666666666970E-02 -0.140229372680005009E-03 -0.138286822235333988E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.255642631986222994E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423746696742987E-03 -0.161423745679168995E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434938825463917979E-03 -0.435202898680430025E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320998066618511033E-01 -0.161423317331866001E-03 -0.161424175044046008E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435039227329621022E-03
+ -0.435100236286997026E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256789254737601995E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423744043055008E-03 -0.161423748332858004E-03  0.000000000000000000E+00
+ -0.434994690186026999E-03 -0.435145776107332002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996673803692970E-01 -0.161423313722635004E-03
+ -0.161424178653277005E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435068125149740983E-03 -0.435070687523708016E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255452807933189002E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423207454355997E-03 -0.161424284921556012E-03
+ -0.166666666666667011E-01 -0.404324471515191976E-03 -0.466398858585693999E-03
+  0.000000000000000000E+00  0.318949372864687011E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.323838542624367001E-03
+ -0.544927348767803980E-03 -0.135274539864154011E-03 -0.143241655051185013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255622073357968001E-01  0.000000000000000000E+00
+ -0.114453213670021004E-03 -0.573228208718970988E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.218486799119613006E-03
+ -0.891418053508551047E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.421663421972207977E-01  0.000000000000000000E+00  0.686273215429911012E-05
+ -0.694544154543290989E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.437941742226114026E-03 -0.432315834894540991E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.255893865594537008E-01
+  0.000000000000000000E+00 -0.330660242956903980E-03 -0.535635480866768947E-03
+ -0.666666666666666970E-02 -0.139701781018268989E-03 -0.138814413897070008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319824507952650994E-01
+  0.000000000000000000E+00 -0.161423276715632995E-03 -0.161424215660278987E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.453057648293402017E-03
+ -0.417479238786919984E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252771264643981003E-01
+ -0.434642414870016003E-03 -0.435473406638131990E-03 -0.666666666666666970E-02
+ -0.142683907492472013E-03 -0.135832287422867011E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.320831993534842971E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435039282000798980E-03
+ -0.435059530140830008E-03 -0.435183177514903016E-03 -0.434920713595511989E-03
+ -0.434923578188304021E-03 -0.435180190251448023E-03 -0.166666666666667011E-01
+ -0.138283467016899007E-03 -0.140232727898440993E-03 -0.138898928565553995E-03
+ -0.139617266349785003E-03 -0.138290715294724011E-03 -0.140225479620615013E-03
+  0.000000000000000000E+00  0.421628681647200979E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.397187181120990971E-04
+ -0.727400140501091022E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435231404167815000E-03 -0.434916453249015015E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256970881787325009E-01
+  0.000000000000000000E+00 -0.435443459120569022E-03 -0.434671026438330013E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138672556881986003E-03
+ -0.139843638033352994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253811403923139001E-01  0.000000000000000000E+00 -0.111659736914792001E-03
+ -0.576021685474199015E-03 -0.666666666666666970E-02 -0.807403253803167008E-03
+  0.113208142554363006E-04 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.318951479398702983E-01 -0.334801708387352976E-03 -0.534674268837695009E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.135236533028903003E-03
+ -0.143279661886435995E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320995045206775992E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423308955499994E-03 -0.161424183420412991E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.439073955171491003E-03 -0.431153380462344989E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.292551256260864985E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423741996618989E-03
+ -0.161423750379292993E-03 -0.161424083526681006E-03 -0.161423408849231003E-03
+ -0.161423408822156008E-03 -0.161424083553757003E-03 -0.666666666666666970E-02
+ -0.434870224107475974E-03 -0.435273023345194990E-03 -0.434990359763353014E-03
+ -0.435150194228865013E-03 -0.434886571394757027E-03 -0.435256310073468009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.420929704392936990E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435014948460666999E-03 -0.435084904947665005E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.138286675090450010E-03 -0.140229519824888987E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.255502912848136997E-01  0.000000000000000000E+00 -0.161423360221235005E-03
+ -0.161424132154678007E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435068640548550009E-03 -0.435070160511034009E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252823540608277005E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423118749632011E-03 -0.161424373626279998E-03 -0.666666666666666970E-02
+ -0.139258116844783998E-03 -0.139258078070556002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.255489628695773005E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423314131588989E-03 -0.161424178244322993E-03
+  0.000000000000000000E+00 -0.434859092529006977E-03 -0.435284424993224019E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.322050771166806013E-01  0.000000000000000000E+00
+ -0.435024403864370983E-03 -0.435075045265772024E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138278642730750004E-03 -0.140237552184589996E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.258528526278573000E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435045758287675984E-03 -0.435052776259060999E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138908581714380013E-03
+ -0.139607613200959987E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320997445770579981E-01 -0.161423315795968994E-03 -0.161424176579942987E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069043735089990E-03
+ -0.435069748238998001E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.256614731136071000E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.429527301021726991E-03
+ -0.440793399609727991E-03 -0.166666666666667011E-01 -0.138412525548941011E-03
+ -0.140103669366398013E-03  0.000000000000000000E+00  0.319693244861690984E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161422576319042010E-03 -0.161424916056869999E-03 -0.437741641876694004E-03
+ -0.432457188030005977E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257682960389588010E-01
+  0.000000000000000000E+00 -0.372135294409653991E-03 -0.498733734440172968E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138908712464486990E-03 -0.139607482450852008E-03  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.418896816202265973E-01  0.000000000000000000E+00
+ -0.436537889268050986E-03 -0.433619959227105999E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.134626693225825994E-03 -0.143889501689513003E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.252833739842331998E-01  0.000000000000000000E+00  0.378158663248480029E-04
+ -0.725497288713840022E-03 -0.666666666666666970E-02 -0.597713400143132971E-03
+ -0.752178542458052050E-04 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.321707087558135013E-01  0.000000000000000000E+00 -0.981956616589569036E-04
+ -0.589485760730035007E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.242422452004102012E-03 -0.626419835077194048E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252392538736065999E-01 -0.333702076879594987E-03
+ -0.536013661829747045E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138219319557457995E-03 -0.140296875357881002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.321074799924774973E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.434939601065898027E-03 -0.435163477654634996E-03
+ -0.435157861406833973E-03 -0.434944986847371983E-03 -0.434989471025225001E-03
+ -0.435111473318402012E-03 -0.166666666666667011E-01 -0.138299620238519004E-03
+ -0.140216574676820996E-03 -0.138306111153669995E-03 -0.140210083761669002E-03
+ -0.138904474579713009E-03 -0.139611720335626992E-03  0.000000000000000000E+00
+  0.421630568906255984E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.382244495288109972E-04 -0.725905871917801953E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435871010405398983E-03
+ -0.434311639967858000E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256952418884623009E-01  0.000000000000000000E+00
+ -0.435307159704155999E-03 -0.434801765636921985E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138667044406462996E-03 -0.139849150508876001E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253756621166979016E-01
+  0.000000000000000000E+00 -0.110334788320004006E-03 -0.577346634068988027E-03
+ -0.666666666666666970E-02 -0.811801782227151991E-03  0.217125293756245991E-04
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.318985675848939970E-01
+ -0.333010067147420014E-03 -0.536378952719955017E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.135533186849416004E-03 -0.142983008065922994E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320998187031011004E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423317205601008E-03
+ -0.161424175170311001E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.439006972327073976E-03 -0.431218847172978989E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.292402861508459996E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423747268413004E-03 -0.161423745107499005E-03 -0.161424199494236009E-03
+ -0.161423292881676000E-03 -0.161423292890866999E-03 -0.161424199485045010E-03
+ -0.666666666666666970E-02 -0.434862443268883999E-03 -0.435281002767785006E-03
+ -0.435100436115737983E-03 -0.435039032534641993E-03  0.000000000000000000E+00
+ -0.435036509827943995E-03 -0.435103015721981983E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420929537555077005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435015025801769976E-03 -0.435084824299273990E-03
+ -0.666666666666666970E-02 -0.138286640917195994E-03 -0.140229553998143003E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255502682711579998E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423359671854992E-03 -0.161424132704057993E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435067434800663979E-03
+ -0.435071393428817989E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252818652853033995E-01  0.000000000000000000E+00 -0.161423121023111999E-03
+ -0.161424371352800010E-03 -0.666666666666666970E-02 -0.139258148276853998E-03
+ -0.139258046638484999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255494975093474991E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423327915938004E-03
+ -0.161424164459974005E-03  0.000000000000000000E+00 -0.435032834826549976E-03
+ -0.435106772375089984E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322088227833026985E-01
+ -0.435016604539718995E-03 -0.435083178015839003E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138286481481243002E-03 -0.140229713434095995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.258545382043442011E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435180234489963001E-03 -0.434923532533521007E-03  0.000000000000000000E+00
+ -0.138910062415604000E-03 -0.139606132499734998E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.321001796355565028E-01 -0.161423327064930995E-03
+ -0.161424165310980987E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435025887435454999E-03 -0.435113876200826026E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255844281592568015E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.327328582483525979E-03
+ -0.542038180279041048E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.138213770482627013E-03 -0.140302424432712987E-03  0.000000000000000000E+00
+  0.321601835592413010E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.946024118762530064E-04 -0.593079010512739054E-03
+ -0.133620199164305990E-03 -0.725519994603599015E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257239982411411987E-01
+  0.196968234580236002E-04 -0.707378245847014990E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.458129631085333017E-03
+ -0.413598754070089024E-03  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.423184714605200005E-01  0.000000000000000000E+00 -0.434756036956663000E-03
+ -0.435354579774434021E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138688118698557992E-03 -0.139828076216782008E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.251897443601684992E-01
+  0.000000000000000000E+00 -0.646690982568611981E-04 -0.623012324132130035E-03
+ -0.666666666666666970E-02 -0.137976768222350989E-03 -0.140539426692988008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319702861271339983E-01
+  0.000000000000000000E+00 -0.161422661149470011E-03 -0.161424831226441998E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.425027766008080989E-03
+ -0.445328716144572018E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253132418625061015E-01
+ -0.429884571295451002E-03 -0.440422628401466976E-03 -0.666666666666666970E-02
+ -0.138411741515959000E-03 -0.140104453399379997E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.312157365405992017E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423748574759013E-03 -0.161423743801152996E-03
+ -0.161423313379198001E-03 -0.161424178996714008E-03 -0.161424178996706988E-03
+ -0.161423313379204994E-03 -0.166666666666667011E-01 -0.435069382470465026E-03
+ -0.435069401871426007E-03 -0.435069376779887026E-03 -0.435069407690234008E-03
+ -0.138284693812444989E-03 -0.140231501102894008E-03  0.000000000000000000E+00
+  0.420929708077939987E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014938579204977E-03 -0.435084915251974006E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.138286675893133993E-03
+ -0.140229519022206007E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256866648616451004E-01  0.000000000000000000E+00
+ -0.435050846412010986E-03 -0.435047608857992002E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.139258078983040012E-03 -0.139258115932299013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252598190481393009E-01
+  0.000000000000000000E+00 -0.161423313207681012E-03 -0.161424179168232000E-03
+ -0.666666666666666970E-02 -0.435070119779517976E-03 -0.435068680382981010E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.320929799686702974E-01
+ -0.435014999741732006E-03 -0.435084851470821988E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138286694533200995E-03 -0.140229500382138002E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.321001296867379007E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423325778209994E-03
+ -0.161424166597702991E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435030925452593018E-03 -0.435108724823487006E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.296363951915997008E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435163526863417986E-03 -0.434939554418941980E-03 -0.435078552482020999E-03
+ -0.435021040258217015E-03 -0.435027006817730023E-03 -0.435072330536212006E-03
+ -0.666666666666666970E-02 -0.138910218392951005E-03 -0.139605976522387993E-03
+ -0.138289242827388992E-03 -0.140226952087950006E-03 -0.138286026412109991E-03
+ -0.140230168503229007E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.419834140961602986E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305416680989E-03
+ -0.161424186959231996E-03 -0.666666666666666970E-02 -0.435068823290631025E-03
+ -0.435069973655738992E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256581988626478005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434982526778275992E-03 -0.435118713563232987E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286801913291003E-03 -0.140229393002047994E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256862275407307994E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434980912593391981E-03
+ -0.435120395436144002E-03 -0.666666666666666970E-02 -0.140230584882228997E-03
+ -0.138285610033110000E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255647395300480987E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423744222123004E-03 -0.161423748153789005E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434974854599588009E-03
+ -0.435166057532792997E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996742791537990E-01  0.000000000000000000E+00
+ -0.161423313905322989E-03 -0.161424178470588993E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435067503898425021E-03 -0.435071322773375989E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256787237707600986E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423750872133001E-03 -0.161423741503779008E-03  0.000000000000000000E+00
+ -0.434878052016917011E-03 -0.435265039122159018E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320999946846379988E-01
+ -0.161423322217426002E-03 -0.161424170158486007E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435025227850866988E-03 -0.435114550827602994E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.256614590240232011E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435016115295252983E-03
+ -0.435083688178646997E-03 -0.166666666666667011E-01 -0.138320998711983004E-03
+ -0.140195196203355993E-03  0.000000000000000000E+00  0.319837098932621994E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161423313471531992E-03 -0.161424178904379990E-03 -0.435069292513270988E-03
+ -0.435069493855684000E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256582235323178989E-01
+  0.000000000000000000E+00 -0.434980856857106006E-03 -0.435120454886334974E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138286853775869010E-03 -0.140229341139469987E-03  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.419834109605791983E-01  0.000000000000000000E+00
+ -0.161423305333356989E-03 -0.161424187042554993E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435046434997871025E-03 -0.435092866540485025E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.253691040607966004E-01  0.000000000000000000E+00 -0.434981527116148989E-03
+ -0.435119754440041018E-03 -0.666666666666666970E-02 -0.140229345868751000E-03
+ -0.138286849046589000E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319837099126562968E-01  0.000000000000000000E+00 -0.161423313472065989E-03
+ -0.161424178903845993E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069123307102977E-03 -0.435069666874682976E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253136149345605001E-01 -0.435016121073636018E-03 -0.435083682152984010E-03
+ -0.666666666666666970E-02 -0.138320998693124988E-03 -0.140195196222214010E-03
+ -0.166666666666667011E-01  0.312157824691738014E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423746615296988E-03 -0.161423745760614994E-03
+ -0.161423313748345991E-03 -0.161424178627565991E-03 -0.161424178629453993E-03
+ -0.161423313746457989E-03 -0.166666666666667011E-01 -0.435045651504759002E-03
+ -0.435093667508772001E-03 -0.435080766409200008E-03 -0.435058268328494021E-03
+ -0.138278294452393003E-03 -0.140237900462945994E-03  0.000000000000000000E+00
+  0.420929613697930971E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435018968638475976E-03 -0.435080712727501978E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.138286650710325003E-03 -0.140229544205013995E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256871012844454988E-01  0.000000000000000000E+00
+ -0.435049191356535016E-03 -0.435049196139260014E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.139258062358015006E-03 -0.139258132557323991E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252598188674287988E-01
+  0.000000000000000000E+00 -0.161423313384250004E-03 -0.161424178991662005E-03
+ -0.666666666666666970E-02 -0.435070776497347995E-03 -0.435068038137203020E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.320929799834977977E-01
+ -0.435014991950716994E-03 -0.435084859595236974E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138286694574832000E-03 -0.140229500340508000E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996579186692971E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423313471493991E-03 -0.161424178904418994E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069357599841020E-03
+ -0.435069427302476014E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.295485029193944988E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435016115212415004E-03 -0.435083688265029017E-03 -0.435055716856611019E-03
+ -0.435042938405563989E-03 -0.435014924529298014E-03 -0.435084929901764013E-03
+ -0.666666666666666970E-02 -0.138321048324070988E-03 -0.140195146591268009E-03
+ -0.138900264540679010E-03 -0.139615930374659987E-03 -0.138286695603002008E-03
+ -0.140229499312336989E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.420929804452524967E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014924421641997E-03
+ -0.435084930014025993E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138286695610681005E-03 -0.140229499304657992E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257458287848717007E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435049188240248003E-03
+ -0.435049199389004021E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.139258097270406994E-03 -0.139258097644932003E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255775463585184992E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423313378629012E-03 -0.161424178997282997E-03
+ -0.666666666666666970E-02 -0.435069399499345990E-03 -0.435069384790270994E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257404831595632010E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435042938405561007E-03 -0.435055716856614001E-03
+  0.000000000000000000E+00 -0.138910376016605991E-03 -0.139605818898733006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320997361629841993E-01 -0.161423315571435007E-03
+ -0.161424176804478005E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069384822633020E-03 -0.435069399466241015E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256633780153988009E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423313378038013E-03 -0.161424178997873996E-03
+  0.000000000000000000E+00 -0.435069374982946988E-03 -0.435069409527666015E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.322089281358309970E-01 -0.435014924516908999E-03
+ -0.435084929914729011E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138286694922169990E-03 -0.140229499993169008E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.256614590292298002E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435016112049646021E-03 -0.435083691563146980E-03
+ -0.166666666666667011E-01 -0.138320998722293008E-03 -0.140195196193045990E-03
+  0.000000000000000000E+00  0.319837098823795019E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161423313471231993E-03
+ -0.161424178904679989E-03 -0.435069384800128994E-03 -0.435069399489265024E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256582249674996987E-01  0.000000000000000000E+00
+ -0.434980660277106983E-03 -0.435120659871895998E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138286858201160009E-03
+ -0.140229336714178988E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.419834073409849004E-01  0.000000000000000000E+00 -0.161423305232394999E-03
+ -0.161424187143517010E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435069384518094995E-03 -0.435069399777710994E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.253691078101383992E-01
+  0.000000000000000000E+00 -0.434980641232482002E-03 -0.435120678179650019E-03
+ -0.666666666666666970E-02 -0.140229336714196010E-03 -0.138286858201142987E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319837098823795990E-01
+  0.000000000000000000E+00 -0.161423313471231993E-03 -0.161424178904679989E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384799506987E-03
+ -0.435069399489901993E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253136149492911994E-01
+ -0.435016112049662988E-03 -0.435083691563128007E-03 -0.666666666666666970E-02
+ -0.138320998722293008E-03 -0.140195196193046992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320885977646230974E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435028541278858008E-03 -0.435070730506436016E-03 -0.435014782666575002E-03
+ -0.435085077833358993E-03 -0.435084998915685002E-03 -0.435014858345719973E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.138286722125585996E-03
+ -0.140229472789753001E-03 -0.138287952903743989E-03 -0.140228242011595009E-03
+ -0.138900254709783987E-03 -0.139615940205555010E-03  0.000000000000000000E+00
+  0.419834115616333006E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305349811004E-03 -0.161424187026102008E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435070335762563986E-03
+ -0.435068469166236015E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.255991790107528011E-01  0.000000000000000000E+00
+ -0.434982884474892026E-03 -0.435118340650319981E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286203203484011E-03 -0.140229991711855013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253696597865739996E-01
+  0.000000000000000000E+00 -0.434979979753249990E-03 -0.435121367820745985E-03
+ -0.666666666666666970E-02 -0.140228217416730994E-03 -0.138287977498608003E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319694542079494021E-01
+ -0.161422588303417009E-03 -0.161424904072495000E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435109502026177015E-03 -0.435030181217087981E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320997612467455035E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423316150031992E-03
+ -0.161424176225879990E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435046027403482012E-03 -0.435093283082004022E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.296643788713649992E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435041466166206979E-03 -0.435057251216224993E-03
+ -0.434987713518249003E-03 -0.435113298977872021E-03 -0.435084291440460005E-03
+ -0.435015533322462983E-03 -0.666666666666666970E-02 -0.138384828446226003E-03
+ -0.140131366469113997E-03 -0.138922740970408006E-03 -0.139593453944930992E-03
+ -0.138352603070340010E-03 -0.140163591844998987E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420929717450221966E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014955543717996E-03 -0.435084897561344988E-03
+ -0.666666666666666970E-02 -0.138286677768216005E-03 -0.140229517147122992E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257455320601093010E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435050355484345984E-03
+ -0.435048079623292984E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.139258080653318999E-03 -0.139258114262021002E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255775497212881002E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423313341254991E-03 -0.161424179034656991E-03 -0.666666666666666970E-02
+ -0.435070056149005984E-03 -0.435068742611218005E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257402390203077015E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435113345596286011E-03 -0.434987675900872992E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138910391927832998E-03
+ -0.139605802987505999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320999657878337022E-01
+ -0.161423321565590998E-03 -0.161424170810321011E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435043064748192026E-03 -0.435096312338415002E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256636236446474014E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423319781588012E-03 -0.161424172594323997E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435040266525950976E-03
+ -0.435099173616122011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322086798404708011E-01
+ -0.435015968986542014E-03 -0.435083840783048986E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138286185919272000E-03 -0.140230008996066998E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255452807944169004E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423207454397007E-03 -0.161424284921515002E-03
+ -0.166666666666667011E-01 -0.404324467010457988E-03 -0.466398863159433010E-03
+  0.000000000000000000E+00  0.318949372706568007E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.323838600968767004E-03
+ -0.544927294428741000E-03 -0.135274538150394994E-03 -0.143241656764945006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255622073245997013E-01  0.000000000000000000E+00
+ -0.114453247275369001E-03 -0.573228175113622992E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.218487273219374002E-03
+ -0.891418527608312016E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.421663423170699977E-01  0.000000000000000000E+00  0.686068254293191984E-05
+ -0.694542104931923968E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.437928115910693992E-03 -0.432328899183500985E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.255893867272030014E-01
+  0.000000000000000000E+00 -0.330661927924627975E-03 -0.535633960374462992E-03
+ -0.666666666666666970E-02 -0.139701786050046989E-03 -0.138814408865292008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319824508284424977E-01
+  0.000000000000000000E+00 -0.161423276716622004E-03 -0.161424215659290005E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.453057324015285993E-03
+ -0.417479555947782987E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252771264598301987E-01
+ -0.434642424564362999E-03 -0.435473396530201005E-03 -0.666666666666666970E-02
+ -0.142683907588298003E-03 -0.135832287327041998E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.310595659754696989E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161480379356609011E-03
+ -0.161367113019302998E-03 -0.161367201132704989E-03 -0.161480291243207996E-03
+ -0.161431097435676003E-03 -0.161416394940237009E-03 -0.166666666666667011E-01
+ -0.438433354062566980E-03 -0.431786524598485992E-03 -0.139063303639515990E-03
+ -0.139452891275823008E-03 -0.436729888333051012E-03 -0.433449749920077998E-03
+  0.000000000000000000E+00  0.421404522783956001E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.402800893973209023E-03
+ -0.467871394310760984E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.227909597161434006E-03 -0.900840851550373050E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256049312238144991E-01
+  0.000000000000000000E+00 -0.340826929960757021E-03 -0.529259757575146952E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138562359099061988E-03
+ -0.139953835816277009E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253040222377561991E-01  0.000000000000000000E+00 -0.246116167738164992E-03
+ -0.598443392983163958E-03 -0.666666666666666970E-02 -0.140073786324521011E-03
+ -0.138442408590818013E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319703070887365007E-01 -0.161422660701845001E-03 -0.161424831674067008E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.426922283598156992E-03
+ -0.443394972905718991E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320998325318406982E-01  0.000000000000000000E+00 -0.161423318386659000E-03
+ -0.161424173989254012E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.432674210328745981E-03 -0.437517849506738994E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.296374388638415004E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435070730494953989E-03 -0.435028541287397997E-03
+ -0.435085047523665978E-03 -0.435014811728346980E-03 -0.435014733314390012E-03
+ -0.435085129293168984E-03 -0.666666666666666970E-02 -0.138910408020893991E-03
+ -0.139605786894445006E-03 -0.138288015962704006E-03 -0.140228178952634991E-03
+ -0.138286786003025006E-03 -0.140229408912313991E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.419834073379218020E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423305232311000E-03 -0.161424187143602012E-03 -0.666666666666666970E-02
+ -0.435069384517251974E-03 -0.435069399778572989E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256582252366890015E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434980659785321009E-03
+ -0.435120660384639003E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286858754340991E-03 -0.140229336160998006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256868357246730003E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434980640625951985E-03
+ -0.435120678812035981E-03 -0.666666666666666970E-02 -0.140229336292177010E-03
+ -0.138286858623161988E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.255642315719874988E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423743707386991E-03 -0.161423748668524991E-03  0.000000000000000000E+00
+ -0.435032940295220990E-03 -0.435106665066769985E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996541101558966E-01
+ -0.161423313368980995E-03 -0.161424179006930987E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069405363546012E-03 -0.435069379055302017E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256786736305051992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423743710730996E-03 -0.161423748665181989E-03
+  0.000000000000000000E+00 -0.435032918551319986E-03 -0.435106687300586010E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996542492566969E-01
+ -0.161423313372716993E-03 -0.161424179003194989E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069400469630010E-03 -0.435069383841369989E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.254980067883639992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161506286329310991E-03
+ -0.161341206046600991E-03 -0.166666666666667011E-01 -0.431364756977960023E-03
+ -0.438866309539295992E-03  0.000000000000000000E+00  0.319866973594409992E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.161423386153596000E-03 -0.161424106222316009E-03
+ -0.435249325540191991E-03 -0.434893412677740991E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256606788604577009E-01
+  0.000000000000000000E+00 -0.434963057292216018E-03 -0.435139014560906973E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138291831602560002E-03 -0.140224363312778995E-03  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.419833493286519976E-01  0.000000000000000000E+00 -0.161423303639283993E-03
+ -0.161424188736628991E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435072704181455982E-03 -0.435066152962260003E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.253728412360505994E-01
+  0.000000000000000000E+00 -0.434979171689485976E-03 -0.435122209070102999E-03
+ -0.666666666666666970E-02 -0.140221824975804004E-03 -0.138294369939534993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319851023368492998E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423349940155008E-03
+ -0.161424142435757001E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435344396466284009E-03 -0.434800441646335982E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.251600082854456994E-01 -0.161430149015849008E-03
+ -0.161417343360063001E-03 -0.666666666666666970E-02 -0.432078300695953020E-03
+ -0.438133229979561004E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.312157350718397994E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423748575108993E-03 -0.161423743800802989E-03
+ -0.161423313366150007E-03 -0.161424179009763005E-03 -0.161424179009759996E-03
+ -0.161423313366152013E-03 -0.166666666666667011E-01 -0.435069386750616997E-03
+ -0.435069397494826013E-03 -0.435069386095123000E-03 -0.435069398165091018E-03
+ -0.138284703375701995E-03 -0.140231491539637002E-03  0.000000000000000000E+00
+  0.420931147064136021E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014771024603002E-03 -0.435085089957181989E-03
+ -0.666666666666666970E-02 -0.138286970731685001E-03 -0.140229224183653997E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.254910621183637998E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423348017174001E-03 -0.161424144358738008E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435079665188463989E-03
+ -0.435059345445455996E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.249645250281322999E-01  0.000000000000000000E+00
+ -0.161423121269891999E-03 -0.161424371106020010E-03 -0.666666666666666970E-02
+ -0.139257838659735000E-03 -0.139258356255603997E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319784847308889966E-01 -0.161423139772803008E-03
+ -0.161424352603109001E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435059648379540009E-03 -0.435079356600350973E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.323098477771483969E-01  0.000000000000000000E+00
+ -0.434855411650391993E-03 -0.435251202539120021E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138421129190984010E-03 -0.140095065724355991E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.286728645834729984E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423315838776007E-03 -0.161424176537136002E-03
+ -0.161423752187323010E-03 -0.161423740188588999E-03 -0.161423317510291011E-03
+ -0.161424174865620998E-03 -0.666666666666666970E-02 -0.438771655527875001E-03
+ -0.431448953116295001E-03 -0.138271241563716002E-03 -0.140244953351622995E-03
+ -0.435165323893648012E-03 -0.434975574060252989E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.419833961506188985E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423304924141007E-03 -0.161424187451771002E-03
+ -0.666666666666666970E-02 -0.435069598057073975E-03 -0.435069190609702994E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.256582546102008015E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434978063949837974E-03 -0.435123367209186982E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286924064156001E-03 -0.140229270851182996E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256881329485436002E-01
+  0.000000000000000000E+00 -0.434979402202776018E-03 -0.435121969745410990E-03
+ -0.666666666666666970E-02 -0.140226692592859009E-03 -0.138289502322480991E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.257149793095595991E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.434964716442319023E-03 -0.435137270613733019E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138403971130206005E-03
+ -0.140112223785132992E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320995089735353969E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423309436031000E-03 -0.161424182939881009E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435071687413104014E-03 -0.435067147303640016E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257775991854455010E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.434123548609677019E-03 -0.436014071331951010E-03  0.000000000000000000E+00
+ -0.138328115136844999E-03 -0.140188079778493999E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320970398516952013E-01
+ -0.161423235617312989E-03 -0.161424256758598993E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435167788232856023E-03 -0.434973170725627982E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.255649837703663986E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423599069647001E-03 -0.161423893306266011E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.435941020818056012E-03 -0.434216891005145009E-03
+  0.321778215370647006E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.133203828204913997E-03 -0.554477594184077968E-03
+ -0.310288607973751023E-03 -0.560774180157768039E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.258307931039124007E-01
+  0.000000000000000000E+00 -0.200815667582686987E-03 -0.626132462157779008E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.446235732743376025E-03 -0.410993106848831988E-03  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.421918986255057984E-01
+  0.000000000000000000E+00 -0.435044026349501998E-03 -0.435054581088656020E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.138420116213701009E-03
+ -0.140096078701637989E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.251983478038565016E-01  0.000000000000000000E+00
+ -0.874760871910627032E-05 -0.678933813669884995E-03 -0.666666666666666970E-02
+ -0.139011566068860000E-03 -0.139504628846480000E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319960854906845005E-01  0.000000000000000000E+00
+ -0.161423534312251998E-03 -0.161423958063660011E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.422822744160627974E-03 -0.447571236367950997E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.251635090375166015E-01 -0.161499426349241996E-03
+ -0.161348066026670989E-03 -0.666666666666666970E-02 -0.431483288490702005E-03
+ -0.438741647350535027E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.312158150513081011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161424178244698994E-03 -0.161423314131212988E-03
+ -0.161423314059391994E-03 -0.161424178316519988E-03 -0.161423748120379993E-03
+ -0.161423744255532992E-03 -0.166666666666667011E-01 -0.435052937198406996E-03
+ -0.435086217674513994E-03 -0.138285143315771992E-03 -0.140231051599568009E-03
+ -0.435084983208553009E-03 -0.435054143960659026E-03  0.000000000000000000E+00
+  0.420929687916430026E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435023352498181996E-03 -0.435076141242082002E-03 -0.666666666666666970E-02
+ -0.138286660717270009E-03 -0.140229534198068988E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256871016294627011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435049188710877018E-03 -0.435049198898220012E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258077330179004E-03
+ -0.139258117585160996E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252598187185143000E-01  0.000000000000000000E+00
+ -0.161423313380516987E-03 -0.161424178995394995E-03 -0.666666666666666970E-02
+ -0.435070189961645989E-03 -0.435068611747460024E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.320929801235530984E-01 -0.435014945837523977E-03
+ -0.435084907681736004E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138286694923904008E-03 -0.140229499991435992E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996579074415977E-01
+  0.000000000000000000E+00 -0.161423313471188002E-03 -0.161424178904724007E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069377867833978E-03
+ -0.435069406577771021E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.295485029270328992E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435016111300222006E-03 -0.435083692344642020E-03
+ -0.435055716669636018E-03 -0.435042938584860996E-03 -0.435014924590734005E-03
+ -0.435084929837698995E-03 -0.666666666666666970E-02 -0.138321048357603003E-03
+ -0.140195146557735994E-03 -0.138900264538850991E-03 -0.139615930376488006E-03
+ -0.138286695606074989E-03 -0.140229499309265011E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.420929804437093005E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014924430680990E-03
+ -0.435084930004601023E-03 -0.666666666666666970E-02 -0.138286695607513997E-03
+ -0.140229499307826003E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.257458287840863984E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435049188263473999E-03
+ -0.435049199364782998E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.139258097267473007E-03 -0.139258097647866993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.255775463585166986E-01  0.000000000000000000E+00 -0.161423313378626003E-03
+ -0.161424178997286006E-03 -0.666666666666666970E-02 -0.435069399615192016E-03
+ -0.435069384676977993E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257404832051559007E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435042938584863977E-03
+ -0.435055716669632006E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.138910376055990990E-03 -0.139605818859349010E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320997361635547013E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423315571449996E-03 -0.161424176804462013E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384753956998E-03
+ -0.435069399536463976E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256633780155203009E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423313378038989E-03 -0.161424178997872993E-03
+  0.000000000000000000E+00 -0.435069385238039011E-03 -0.435069399041488984E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322089283932210022E-01 -0.435014924466399001E-03 -0.435084929967365997E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286695450416002E-03
+ -0.140229499464922995E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.256614590292296996E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435016112049732974E-03 -0.435083691563056016E-03
+ -0.166666666666667011E-01 -0.138320998722292005E-03 -0.140195196193046992E-03
+  0.000000000000000000E+00  0.319837098823798002E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161423313471231993E-03 -0.161424178904679989E-03 -0.435069384798286989E-03
+ -0.435069399491148988E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256582249674734003E-01  0.000000000000000000E+00
+ -0.434980660273001976E-03 -0.435120659876175994E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138286858201095011E-03
+ -0.140229336714244013E-03  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.419834073410084996E-01
+  0.000000000000000000E+00 -0.161423305232396002E-03 -0.161424187143516007E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435069384645917010E-03
+ -0.435069399647007983E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.253691078100609993E-01  0.000000000000000000E+00
+ -0.434980641243667987E-03 -0.435120678167986010E-03 -0.666666666666666970E-02
+ -0.140229336714366013E-03 -0.138286858200973987E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319837098823803970E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423313471231993E-03 -0.161424178904679989E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384793870004E-03
+ -0.435069399495665991E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253136149492908004E-01 -0.435016112049908018E-03 -0.435083691562873978E-03
+ -0.666666666666666970E-02 -0.138320998722292005E-03 -0.140195196193046992E-03
+ -0.166666666666667011E-01  0.312157364805190005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423748574844990E-03 -0.161423743801066992E-03 -0.161423313378673003E-03
+ -0.161424178997239006E-03 -0.161424178997232013E-03 -0.161423313378679996E-03
+ -0.166666666666667011E-01 -0.435069383970378013E-03 -0.435069400337715023E-03
+ -0.435069376957132018E-03 -0.435069407508996008E-03 -0.138284694213328010E-03
+ -0.140231500702010987E-03  0.000000000000000000E+00  0.420929739826170005E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014932601145024E-03
+ -0.435084921485382001E-03 -0.666666666666666970E-02 -0.138286682366655988E-03
+ -0.140229512548683009E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.254915570191668016E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423360046659997E-03
+ -0.161424132329252012E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435068904052976024E-03 -0.435069891069177000E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.249645635933331000E-01 -0.161423119060364001E-03 -0.161424373315548008E-03
+ -0.666666666666666970E-02 -0.139258110049798010E-03 -0.139258084865540987E-03
+ -0.166666666666667011E-01  0.319795589129939009E-01 -0.161423181597269993E-03
+ -0.161424310778641989E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435063340741964014E-03 -0.435075580375908022E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322057823008502012E-01
+  0.000000000000000000E+00 -0.435021995940365993E-03 -0.435077556208205004E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138280141008673011E-03
+ -0.140236053906665986E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.286987467722891014E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423485461700003E-03
+ -0.161424006914212006E-03 -0.161423743701962999E-03 -0.161423748673950013E-03
+ -0.161423485478030988E-03 -0.161424006897880994E-03 -0.666666666666666970E-02
+ -0.434818199915888975E-03 -0.435326186273355992E-03 -0.138107939683270998E-03
+ -0.140408255232067999E-03 -0.434750487938305018E-03 -0.435395405799094998E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.419834078568204969E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305246576987E-03
+ -0.161424187129334995E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435069368742756982E-03 -0.435069415908642004E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256580957796514984E-01
+  0.000000000000000000E+00 -0.434980863637928004E-03 -0.435120447850464987E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286593339167997E-03
+ -0.140229601576171000E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256868273049145994E-01  0.000000000000000000E+00 -0.434980612246944019E-03
+ -0.435120708410805001E-03 -0.666666666666666970E-02 -0.140229353764139986E-03
+ -0.138286841151199011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256611955309135990E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435106719724830994E-03 -0.434994029732972990E-03  0.000000000000000000E+00
+ -0.138320118014007004E-03 -0.140196076901331993E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320999233193186018E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423320520808004E-03 -0.161424171855104005E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435059778147594005E-03
+ -0.435079222540635984E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.257716863610135995E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435022363973044002E-03 -0.435077172538276982E-03
+  0.000000000000000000E+00 -0.138312498857191992E-03 -0.140203696058148009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996765524508010E-01
+  0.000000000000000000E+00 -0.161423313971483987E-03 -0.161424178404427995E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435068773858323006E-03
+ -0.435070024197505999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255013626644550011E-01 -0.666666666666666970E-02
+ -0.161778513994174000E-03 -0.161068978381738009E-03 -0.166666666666667011E-01
+ -0.430721544928321987E-03 -0.439524018875910982E-03  0.319967451293014030E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.161423540387061002E-03 -0.161423951988851007E-03
+ -0.436779134539303025E-03 -0.433396918749490016E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257463974004814994E-01
+  0.000000000000000000E+00 -0.357123790381600977E-03 -0.513334443915617971E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138849213053182001E-03 -0.139666981862157999E-03  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.421600707339402012E-01  0.000000000000000000E+00 -0.438162222449141976E-03
+ -0.432071491857211011E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.110310967390135005E-03 -0.562620286998803958E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.250247683829411989E-01
+  0.000000000000000000E+00  0.312489112005970006E-04 -0.718930333589588031E-03
+ -0.666666666666666970E-02 -0.143948366628396992E-03 -0.134567828286942005E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.321784029807069966E-01
+ -0.131393333165138998E-03 -0.556288089223852967E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.344952960577046006E-03 -0.526365968885401047E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.252170877020880015E-01 -0.161423598610962993E-03
+ -0.161423893764948989E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435940683620558982E-03 -0.434217221098728986E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.312157364068764015E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423748574870008E-03 -0.161423743801043004E-03
+ -0.161423313378020991E-03 -0.161424178997891994E-03 -0.161424178997885001E-03
+ -0.161423313378028011E-03 -0.166666666666667011E-01 -0.435069384799240003E-03
+ -0.435069399490174995E-03 -0.435069377535609984E-03 -0.435069406917481974E-03
+ -0.138284694694889009E-03 -0.140231500220449988E-03  0.000000000000000000E+00
+  0.000000000000000000E+00  0.420929804454727996E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014924420943987E-03 -0.435084930014753980E-03
+ -0.666666666666666970E-02 -0.138286695611132006E-03 -0.140229499304206991E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256871018618805014E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435049188236810973E-03 -0.435049199392586983E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258097270825008E-03
+ -0.139258097644513989E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252598186487732017E-01  0.000000000000000000E+00
+ -0.161423313378620013E-03 -0.161424178997291996E-03 -0.666666666666666970E-02
+ -0.435069399482861021E-03 -0.435069384806393027E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.320929801562968992E-01 -0.435014924399239995E-03
+ -0.435084930037426003E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138286695018654008E-03 -0.140229499896684989E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996579148839015E-01
+  0.000000000000000000E+00 -0.161423313471389989E-03 -0.161424178904521993E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384785220998E-03
+ -0.435069399504509015E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.295485029252238011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435016114058515979E-03 -0.435083689468307975E-03 -0.435055716894270023E-03
+ -0.435042938369449973E-03 -0.435014924446868020E-03 -0.435084929987721025E-03
+ -0.666666666666666970E-02 -0.138321048328908996E-03 -0.140195146586430001E-03
+ -0.138900264540856006E-03 -0.139615930374483994E-03 -0.138286695607003012E-03
+ -0.140229499308336012E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.420929804454603998E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435014924420999986E-03 -0.435084930014695975E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.138286695611106988E-03 -0.140229499304232009E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.257458287849481986E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435049188236881013E-03 -0.435049199392514017E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258097270799990E-03
+ -0.139258097644539007E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255775463582476013E-01  0.000000000000000000E+00
+ -0.161423313378621992E-03 -0.161424178997289990E-03 -0.666666666666666970E-02
+ -0.435069399483869980E-03 -0.435069384805406023E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257404831771072991E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435042938369448998E-03 -0.435055716894270999E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.138910376031616011E-03 -0.139605818883723013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320997361628699990E-01
+  0.000000000000000000E+00 -0.161423315571431998E-03 -0.161424176804481013E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384829067001E-03
+ -0.435069399459660992E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256633780151233996E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423313378029990E-03 -0.161424178997881992E-03  0.000000000000000000E+00
+ -0.435069376411787974E-03 -0.435069408066628019E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322089281582298992E-01 -0.435014924431676993E-03 -0.435084930003603991E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286694967978995E-03
+ -0.140229499947360002E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.256614590292298002E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435016112049635016E-03 -0.435083691563157009E-03
+ -0.166666666666667011E-01 -0.138320998722293008E-03 -0.140195196193045990E-03
+  0.000000000000000000E+00  0.319837098823795019E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161423313471231993E-03 -0.161424178904679989E-03 -0.435069384800495997E-03
+ -0.435069399488889998E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256582249675054996E-01  0.000000000000000000E+00
+ -0.434980660275541991E-03 -0.435120659873527993E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138286858201179010E-03
+ -0.140229336714159988E-03  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.419834073409666025E-01
+  0.000000000000000000E+00 -0.161423305232394999E-03 -0.161424187143517010E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435069384656581982E-03
+ -0.435069399636101993E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.253691078101523984E-01  0.000000000000000000E+00
+ -0.434980641228473002E-03 -0.435120678183829998E-03 -0.666666666666666970E-02
+ -0.140229336714159988E-03 -0.138286858201179010E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319837098823795019E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423313471231993E-03 -0.161424178904679989E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384800471982E-03
+ -0.435069399488914989E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253136149492911994E-01 -0.435016112049635992E-03 -0.435083691563157009E-03
+ -0.666666666666666970E-02 -0.138320998722293008E-03 -0.140195196193045990E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.320885629537309008E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435084884244977990E-03 -0.435014968312467975E-03
+ -0.435014924420903005E-03 -0.435084930014797023E-03 -0.435055716900959984E-03
+ -0.435042938363035996E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.138286695608023002E-03 -0.140229499307315995E-03 -0.138900264540568990E-03
+ -0.139615930374770007E-03 -0.138287928689845009E-03 -0.140228266225493988E-03
+  0.000000000000000000E+00  0.419834073409644029E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305232394999E-03 -0.161424187143517010E-03
+ -0.666666666666666970E-02 -0.435069384693222974E-03 -0.435069399598635002E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255994980449185013E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434980660275051986E-03 -0.435120659874038977E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286858201180012E-03
+ -0.140229336714159012E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.253691078105353005E-01  0.000000000000000000E+00
+ -0.434980641228200975E-03 -0.435120678184114005E-03 -0.666666666666666970E-02
+ -0.140229336713374998E-03 -0.138286858201963999E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319837881362097023E-01 -0.161423315571430996E-03
+ -0.161424176804481013E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069384829208001E-03 -0.435069399459517986E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996579140455027E-01
+  0.000000000000000000E+00 -0.161423313471367004E-03 -0.161424178904545006E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384800495021E-03
+ -0.435069399488890974E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.295485029266231991E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435016113771493024E-03
+ -0.435083689767614020E-03 -0.435055716901091010E-03 -0.435042938362910012E-03
+ -0.435014924420824020E-03 -0.435084930014878989E-03 -0.666666666666666970E-02
+ -0.138321048331202002E-03 -0.140195146584136995E-03 -0.138900264540522993E-03
+ -0.139615930374817007E-03 -0.138286695607886989E-03 -0.140229499307452008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.420929804454732021E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435014924420943012E-03 -0.435084930014755010E-03 -0.666666666666666970E-02
+ -0.138286695611133009E-03 -0.140229499304205988E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.257458287849504017E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435049188236650999E-03 -0.435049199392754005E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258097270826011E-03
+ -0.139258097644513013E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255775463581538985E-01  0.000000000000000000E+00
+ -0.161423313378620013E-03 -0.161424178997291996E-03 -0.666666666666666970E-02
+ -0.435069399482816027E-03 -0.435069384806436991E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257404831864265007E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435042938362910012E-03 -0.435055716901091010E-03  0.000000000000000000E+00
+ -0.138910376039622004E-03 -0.139605818875716993E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320997361628520966E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423315571430996E-03 -0.161424176804481013E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384829304983E-03
+ -0.435069399459418022E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256633780150350016E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423313378028011E-03 -0.161424178997883998E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435069377535611014E-03
+ -0.435069406917480998E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.322089281809833025E-01 -0.435014924399087014E-03
+ -0.435084930037585977E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138286695014638991E-03 -0.140229499900700006E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.255642320220383010E-01 -0.666666666666666970E-02 -0.161423743718748996E-03
+ -0.161423748657163013E-03 -0.166666666666667011E-01 -0.435032852654396000E-03
+ -0.435106754681328976E-03  0.319837065485489985E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161423313381479000E-03
+ -0.161424178994433009E-03 -0.435069384799287003E-03 -0.435069399490127019E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.256582249674894985E-01
+  0.000000000000000000E+00 -0.434980660275194016E-03 -0.435120659873890984E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138286858201148001E-03 -0.140229336714191999E-03  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.419834073409640976E-01  0.000000000000000000E+00 -0.161423305232394999E-03
+ -0.161424187143517010E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435069384687570974E-03 -0.435069399604414992E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.253691078101366992E-01
+  0.000000000000000000E+00 -0.434980641228032002E-03 -0.435120678184291001E-03
+ -0.666666666666666970E-02 -0.140229336714191999E-03 -0.138286858201148001E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319837065485489985E-01
+  0.000000000000000000E+00 -0.161423313381479000E-03 -0.161424178994433009E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384799287003E-03
+ -0.435069399490127019E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.252163879420997002E-01
+ -0.161423743718748996E-03 -0.161423748657163013E-03 -0.666666666666666970E-02
+ -0.435032852654396000E-03 -0.435106754681328976E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320887366348247965E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435111019942127013E-03 -0.434989904848891011E-03
+ -0.435014811994715992E-03 -0.435085047244578027E-03 -0.435084986246467005E-03
+ -0.435014870489667008E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.138286808290661993E-03 -0.140229386624677005E-03 -0.138288040456547998E-03
+ -0.140228154458792002E-03 -0.138900328862363002E-03 -0.139615866052975995E-03
+  0.000000000000000000E+00  0.419834068919483008E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305219605994E-03
+ -0.161424187156306991E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435073590544243023E-03 -0.435065286134219016E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.255994981604824000E-01
+  0.000000000000000000E+00 -0.434980605991956010E-03 -0.435120716478292976E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286858621097992E-03
+ -0.140229336294241005E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253691080400011010E-01  0.000000000000000000E+00 -0.434980551188209996E-03
+ -0.435120772070955015E-03 -0.666666666666666970E-02 -0.140229336059168004E-03
+ -0.138286858856170994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319837098814821016E-01 -0.161423313471205999E-03 -0.161424178904706010E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069401084582977E-03
+ -0.435069383239967001E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320996545746903006E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423313381464986E-03 -0.161424178994447998E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069392005489001E-03 -0.435069392121542001E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.292425277050150992E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423743718737991E-03 -0.161423748657174993E-03
+ -0.161424178997885001E-03 -0.161423313378027008E-03 -0.161423313378019988E-03
+ -0.161424178997891994E-03 -0.666666666666666970E-02 -0.435032852646152974E-03
+ -0.435106754689757998E-03 -0.435069377535869000E-03 -0.435069406917216995E-03
+  0.000000000000000000E+00 -0.435069384798973994E-03 -0.435069399490447021E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.420929804212083966E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014924419121986E-03
+ -0.435084930016657026E-03 -0.666666666666666970E-02 -0.138286695561419006E-03
+ -0.140229499353920994E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.255502626954251001E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423359539924989E-03
+ -0.161424132835986993E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435069382989675027E-03 -0.435069401340516977E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252823579642372995E-01  0.000000000000000000E+00 -0.161423118848985994E-03
+ -0.161424373526926991E-03 -0.666666666666666970E-02 -0.139258097691293003E-03
+ -0.139258097224046997E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.255489358069172015E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423313378028011E-03 -0.161424178997883998E-03
+  0.000000000000000000E+00 -0.435069377535374983E-03 -0.435069406917722016E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322089281809831013E-01
+  0.000000000000000000E+00 -0.435014924399098994E-03 -0.435084930037573996E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286695014637988E-03
+ -0.140229499900701009E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.258549253945415010E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435042938363035996E-03 -0.435055716900959008E-03  0.000000000000000000E+00
+ -0.138910376039619998E-03 -0.139605818875718999E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320997361628526032E-01 -0.161423315571430996E-03 -0.161424176804481013E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384829208001E-03
+ -0.435069399459517010E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255489358069375012E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313378028011E-03
+ -0.161424178997883998E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.435069377571737986E-03 -0.435069406880540007E-03
+  0.000000000000000000E+00  0.320929801553170024E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435014924399170009E-03
+ -0.435084930037499024E-03 -0.138286695016646012E-03 -0.140229499898693013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.255502626954253013E-01
+  0.000000000000000000E+00 -0.161423359539924989E-03 -0.161424132835986993E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.435069382989559993E-03 -0.435069401340634017E-03  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.420929804212068007E-01
+  0.000000000000000000E+00 -0.435014924417415994E-03 -0.435084930018437015E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.138286695561417000E-03
+ -0.140229499353921997E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.249646302548570016E-01  0.000000000000000000E+00
+ -0.161423118848985994E-03 -0.161424373526926991E-03 -0.666666666666666970E-02
+ -0.139258097691296012E-03 -0.139258097224043988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319837881362063023E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423315571430996E-03 -0.161424176804481013E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384829284004E-03
+ -0.435069399459439977E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.253926391078276997E-01 -0.435042938363074973E-03
+ -0.435055716900919001E-03 -0.666666666666666970E-02 -0.138910376047635993E-03
+ -0.139605818867703004E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.323033163368178014E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435248914087700014E-03 -0.434857618270111026E-03
+ -0.434883815525587006E-03 -0.435221605060230013E-03 -0.435144012464388997E-03
+ -0.434958245913406004E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.138392691801862987E-03 -0.140123503113476010E-03 -0.138936896109538013E-03
+ -0.139579298805801011E-03 -0.138396653972959005E-03 -0.140119540942379992E-03
+  0.000000000000000000E+00  0.419134531554943013E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.160003730031608006E-03
+ -0.162843762344304003E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.439266035089253990E-03 -0.430980363868474004E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256392674343047001E-01
+  0.000000000000000000E+00 -0.434777562089547990E-03 -0.435332391680561979E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138409864981691995E-03
+ -0.140106329933648005E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.254101279172756003E-01  0.000000000000000000E+00 -0.373644623812598002E-03
+ -0.492026351508191052E-03 -0.666666666666666970E-02 -0.908412435916435045E-03
+  0.235481181527497004E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.321268372004408984E-01 -0.200001267691483995E-03 -0.627372315610022989E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.127171097760206999E-03
+ -0.545760156628731963E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320995602249590020E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423310638646997E-03 -0.161424181737265012E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.437288210106853009E-03 -0.432899574546833006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.296377168966710999E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434989904918347983E-03 -0.435111019849799988E-03
+ -0.435084787576991007E-03 -0.435015060995395985E-03 -0.435015087313978986E-03
+ -0.435084760132168008E-03 -0.666666666666666970E-02 -0.138910479330903999E-03
+ -0.139605715584434998E-03 -0.138288220902347007E-03 -0.140227974012992993E-03
+  0.000000000000000000E+00 -0.138286985894706013E-03 -0.140229209020633011E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.419834073423189028E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305232431997E-03
+ -0.161424187143480012E-03 -0.666666666666666970E-02 -0.435069384782949973E-03
+ -0.435069399506886020E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256582245002681016E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434980660161395994E-03
+ -0.435120659992680999E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286857243334990E-03 -0.140229337672004008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256868356243260994E-01  0.000000000000000000E+00 -0.434980641780946975E-03
+ -0.435120677607716026E-03 -0.666666666666666970E-02 -0.140229336498973991E-03
+ -0.138286858416365006E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.255642316967265998E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423743725422992E-03 -0.161423748650489993E-03  0.000000000000000000E+00
+ -0.435032788457063980E-03 -0.435106820324987996E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996548633918979E-01  0.000000000000000000E+00
+ -0.161423313389182012E-03 -0.161424178986729997E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069349793661977E-03 -0.435069435284559975E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256786747351973009E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423743711796008E-03 -0.161423748664116001E-03  0.000000000000000000E+00
+ -0.435032875429173019E-03 -0.435106731393101022E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320996542943430968E-01 -0.161423313373898990E-03 -0.161424179002013995E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069393047989017E-03
+ -0.435069391099458976E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.256124534411805993E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.422563580131968979E-03 -0.447998624216189984E-03
+ -0.166666666666667011E-01 -0.138286150728846990E-03 -0.140230044186493010E-03
+  0.000000000000000000E+00  0.319552376669784977E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161419817025352993E-03 -0.161427675350558989E-03 -0.434132277684113981E-03
+ -0.436028351477167012E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256464487089681989E-01  0.000000000000000000E+00 -0.434899896387440008E-03
+ -0.435204881412590012E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01 -0.138261308742774005E-03 -0.140254886172565995E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.419830852156009016E-01
+  0.000000000000000000E+00 -0.161423296248432011E-03 -0.161424196127479998E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435080309509953001E-03
+ -0.435058715368486990E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.253594622357068006E-01  0.000000000000000000E+00
+ -0.434950331803641015E-03 -0.435152283781920987E-03 -0.666666666666666970E-02
+ -0.140249996876020999E-03 -0.138266198039317998E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319571786575686967E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161420475113085006E-03 -0.161427017262827003E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.434308830163140978E-03
+ -0.435847642627907976E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252660555352814988E-01 -0.423620518083848974E-03
+ -0.446908164705409994E-03 -0.666666666666666970E-02 -0.138290733115793013E-03
+ -0.140225461799546011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.320833316936339971E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.434976564103407978E-03 -0.435124935063587002E-03 -0.435182102606195996E-03
+ -0.434921744344358996E-03 -0.434970473518194996E-03 -0.435131286534540024E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.138283560982178005E-03
+ -0.140232633933160992E-03 -0.138290646200932993E-03 -0.140225548714406005E-03
+ -0.138898978959791009E-03 -0.139617215955547988E-03  0.000000000000000000E+00
+  0.421628681367203009E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.397192681545031025E-04 -0.727400690543495007E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435235074593796000E-03
+ -0.434912988301708998E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256973882949261996E-01  0.000000000000000000E+00
+ -0.435443110746612982E-03 -0.434671360314161997E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138672805730211991E-03 -0.139843389185127006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253811408048402014E-01
+  0.000000000000000000E+00 -0.111660192932873994E-03 -0.576021229456117972E-03
+ -0.666666666666666970E-02 -0.807402803567670987E-03  0.113198743547475004E-04
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.318951479546440986E-01
+ -0.334801614937765011E-03 -0.534674356904564995E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.135236534786964987E-03 -0.143279660128374010E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320991543630683995E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423299320898012E-03
+ -0.161424193055014999E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.439089518603328992E-03 -0.431138195492317014E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.292550790147875009E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423735549572994E-03 -0.161423756826338988E-03 -0.161424083758227992E-03
+ -0.161423408617684993E-03 -0.161423408591030993E-03 -0.161424083784880988E-03
+ -0.666666666666666970E-02 -0.434939106056469018E-03 -0.435202598866759986E-03
+ -0.434989974486759020E-03 -0.435150588304750977E-03  0.000000000000000000E+00
+ -0.434898044279196977E-03 -0.435244580341264027E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420929736779582994E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014942481456979E-03
+ -0.435084911182310997E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138286681738492993E-03 -0.140229513176846004E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.255502900122648001E-01
+  0.000000000000000000E+00 -0.161423360190818011E-03 -0.161424132185093998E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435068884552595005E-03
+ -0.435069911008886009E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.252824734873801985E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423118220565989E-03
+ -0.161424374155345993E-03 -0.666666666666666970E-02 -0.139258110569624000E-03
+ -0.139258084345714997E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.255485447377057015E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423302756024003E-03 -0.161424189619888006E-03  0.000000000000000000E+00
+ -0.434872688531902005E-03 -0.435270525342468988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322052497845985025E-01  0.000000000000000000E+00 -0.435023979094175024E-03
+ -0.435075488205578982E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138279010834745992E-03 -0.140237184080593006E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.258530259445350005E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.434958189050955980E-03 -0.435144092591555997E-03  0.000000000000000000E+00
+ -0.138909064929744003E-03 -0.139607129985594994E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320994819723071992E-01 -0.161423308725767013E-03
+ -0.161424183650144996E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435080068461255018E-03 -0.435058950988906974E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.256614731138163008E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.429527300766952978E-03 -0.440793399874087001E-03 -0.166666666666667011E-01
+ -0.138412525549365991E-03 -0.140103669365973006E-03  0.000000000000000000E+00
+  0.319693244854589026E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161422576318977012E-03
+ -0.161424916056934997E-03 -0.437741651795140981E-03 -0.432457178334903997E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.257682962742953013E-01
+  0.000000000000000000E+00 -0.372135275695885989E-03 -0.498733752802193965E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138908713201794991E-03 -0.139607481713544007E-03  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.418896828694114015E-01  0.000000000000000000E+00
+ -0.436533664255522999E-03 -0.433624016339187999E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.134626819469867013E-03 -0.143889375445472011E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.252833740154079986E-01  0.000000000000000000E+00  0.378159457419296988E-04
+ -0.725497368130921007E-03 -0.666666666666666970E-02 -0.597713125202046038E-03
+ -0.752181291868920024E-04 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.321707087590768978E-01  0.000000000000000000E+00 -0.981956570690172963E-04
+ -0.589485765319974005E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.242422521139259002E-03 -0.626419769058458972E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252392538739079006E-01 -0.333702074783927001E-03 -0.536013663819403965E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.138219319560300990E-03 -0.140296875355038007E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.323033117395759989E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435248913863458978E-03
+ -0.434857618485890026E-03 -0.434883818075460025E-03 -0.435221602402711982E-03
+ -0.435144010721807983E-03 -0.434958247585231009E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.138392690175634999E-03 -0.140123504739703998E-03
+ -0.138936895546960007E-03 -0.139579299368378990E-03 -0.138396652402613996E-03
+ -0.140119542512725002E-03  0.000000000000000000E+00  0.419134536981460026E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.160003888239780996E-03
+ -0.162843604136131013E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.439265966752186974E-03 -0.430980430216653992E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256385430489332010E-01  0.000000000000000000E+00 -0.434778992070864020E-03
+ -0.435330901615854019E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138408856053690005E-03 -0.140107338861648992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.254101277388419991E-01  0.000000000000000000E+00
+ -0.373643453136775986E-03 -0.492027450061672958E-03 -0.666666666666666970E-02
+ -0.908412791146940971E-03  0.235481536758003012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.321268373345425032E-01 -0.200001595110305000E-03
+ -0.627372161158813030E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.127171079349847012E-03 -0.545760175039092005E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.321000029443719026E-01  0.000000000000000000E+00 -0.161423322269462990E-03
+ -0.161424170106448992E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.437246082492378997E-03 -0.432940746439987984E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.296367144183955003E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435124963867026995E-03
+ -0.434976534547966984E-03 -0.435077880745983003E-03 -0.435021684411966020E-03
+ -0.435028125198068990E-03 -0.435071164272797983E-03 -0.666666666666666970E-02
+ -0.138910250043690000E-03 -0.139605944871648997E-03 -0.138289604026781002E-03
+ -0.140226590888557995E-03 -0.138286261066744988E-03 -0.140229933848595012E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.419834145363481015E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305428732005E-03 -0.161424186947180004E-03
+ -0.666666666666666970E-02 -0.435068797988044014E-03 -0.435069999528659985E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.256581969780162004E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434982641293220015E-03 -0.435118594151826009E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286797863401998E-03
+ -0.140229397051936999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256861833871666993E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434980942275246015E-03 -0.435120364500525027E-03
+ -0.666666666666666970E-02 -0.140230675945465999E-03 -0.138285518969872998E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.255648152871906995E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423744276026012E-03
+ -0.161423748099886999E-03  0.000000000000000000E+00 -0.434971116348022017E-03
+ -0.435169879709118980E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996762669861976E-01  0.000000000000000000E+00
+ -0.161423313958423010E-03 -0.161424178417488999E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435067366048129010E-03 -0.435071463729721000E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256787289926059988E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423752315998997E-03 -0.161423740059913012E-03
+  0.000000000000000000E+00 -0.434863867040502017E-03 -0.435279543030525011E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.321000539630773019E-01 -0.161423323759014989E-03 -0.161424168616896993E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435021966445662005E-03
+ -0.435117885588833984E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.256124534378156000E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.422563583786935012E-03 -0.447998620447711019E-03
+ -0.166666666666667011E-01 -0.138286150716901006E-03 -0.140230044198437991E-03
+  0.000000000000000000E+00  0.319552376748891975E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161419817028325008E-03 -0.161427675347588004E-03 -0.434132178542880997E-03
+ -0.436028452917342011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256464472998883010E-01  0.000000000000000000E+00 -0.434900113529982019E-03
+ -0.435204654991785977E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01 -0.138261303887879009E-03 -0.140254891027459988E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.419830890665787021E-01
+  0.000000000000000000E+00 -0.161423296358576002E-03 -0.161424196017336007E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435055172719591018E-03
+ -0.435083932013876024E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.253594585320825010E-01  0.000000000000000000E+00
+ -0.434951272976298023E-03 -0.435151302408671016E-03 -0.666666666666666970E-02
+ -0.140250006846153006E-03 -0.138266188069185992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319571786801332997E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161420475119938004E-03 -0.161427017255975008E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.434308550272304017E-03
+ -0.435847929000599975E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252660555256693994E-01 -0.423620528167419021E-03
+ -0.446908154298355977E-03 -0.666666666666666970E-02 -0.138290733082334995E-03
+ -0.140225461833005005E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.320861713072602972E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435354479890220003E-03
+ -0.434756412513020009E-03 -0.435057879269062974E-03 -0.435040865004169999E-03
+ -0.434966118570618990E-03 -0.435135827690980980E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.138285724464454012E-03 -0.140230470450885988E-03
+ -0.138897585402013996E-03 -0.139618609513325002E-03 -0.138201697331153989E-03
+ -0.140314497584185008E-03  0.000000000000000000E+00  0.421628678547669028E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.397284147640295009E-04 -0.727409837153020992E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435314260356761012E-03 -0.434838234146065004E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.256973904451089016E-01  0.000000000000000000E+00 -0.435443064032224982E-03
+ -0.434671405121811992E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138672811291356012E-03 -0.139843383623983013E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.253811442749607014E-01  0.000000000000000000E+00
+ -0.111659649866176005E-03 -0.576021772522815052E-03 -0.666666666666666970E-02
+ -0.807399256528189952E-03  0.113123198788523993E-04 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.318951480124047976E-01 -0.334801256611660003E-03
+ -0.534674694551013989E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.135236541601312007E-03 -0.143279653314027993E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320991543634817009E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423299320899991E-03 -0.161424193055011991E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.439089603173896989E-03
+ -0.431138112796988022E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.292550790154386987E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423735549587007E-03
+ -0.161423756826326004E-03 -0.161424083758224008E-03 -0.161423408617689004E-03
+ -0.161423408591035005E-03 -0.161424083784877004E-03 -0.666666666666666970E-02
+ -0.434939104555981024E-03 -0.435202600400863017E-03 -0.434989974551208984E-03
+ -0.435150588238853982E-03  0.000000000000000000E+00 -0.434898044011626994E-03
+ -0.435244580614824986E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420929736778404007E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014942481818994E-03 -0.435084911181932989E-03
+ -0.666666666666666970E-02 -0.138286681738250999E-03 -0.140229513177087998E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.255502900122961014E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423360190818987E-03 -0.161424132185092995E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435068884543961991E-03
+ -0.435069911017712987E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252824734843049986E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423118220580002E-03 -0.161424374155332007E-03
+ -0.666666666666666970E-02 -0.139258110569847997E-03 -0.139258084345492004E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.255485447455677007E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423302756240003E-03
+ -0.161424189619672006E-03  0.000000000000000000E+00 -0.434872688199729998E-03
+ -0.435270525682074014E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.322052497810565996E-01  0.000000000000000000E+00
+ -0.435023979104199015E-03 -0.435075488195124996E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138279010827208998E-03 -0.140237184088130000E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.258530259407728988E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.434958190724600005E-03 -0.435144090846366989E-03
+  0.000000000000000000E+00 -0.138909064921109010E-03 -0.139607129994229987E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320994819774753012E-01 -0.161423308725907011E-03 -0.161424183650004998E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435080068181975977E-03
+ -0.435058951262028994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.256614731138162001E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.429527300766965989E-03
+ -0.440793399874073991E-03 -0.166666666666667011E-01 -0.138412525549365991E-03
+ -0.140103669365973006E-03  0.000000000000000000E+00  0.319693244854589997E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.161422576318977012E-03 -0.161424916056934997E-03
+ -0.437741651794766986E-03 -0.432457178335270023E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.257682962742878004E-01  0.000000000000000000E+00
+ -0.372135275696198998E-03 -0.498733752801886051E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138908713201775990E-03
+ -0.139607481713563007E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.418896828693844023E-01  0.000000000000000000E+00 -0.436533664324920991E-03
+ -0.433624016272548000E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.134626819467095006E-03 -0.143889375448244994E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.252833740154073013E-01
+  0.000000000000000000E+00  0.378159457400615982E-04 -0.725497368129053035E-03
+ -0.666666666666666970E-02 -0.597713125208466033E-03 -0.752181291804723058E-04
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.321707087590768007E-01
+  0.000000000000000000E+00 -0.981956570691642057E-04 -0.589485765319826987E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.242422521137329990E-03
+ -0.626419769060301032E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.252392538739078000E-01
+ -0.333702074783996986E-03 -0.536013663819338046E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138219319560300990E-03
+ -0.140296875355038007E-03 -0.166666666666667011E-01  0.320887149633356031E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435084983875554023E-03
+ -0.435014872763852026E-03 -0.435014828662647025E-03 -0.435085029863970982E-03
+ -0.435055763684865022E-03 -0.435042893498926986E-03 -0.166666666666667011E-01
+ -0.138286799284913995E-03 -0.140229395630425002E-03 -0.138900300230182012E-03
+ -0.139615894685157012E-03 -0.138288031619809013E-03 -0.140228163295530011E-03
+  0.000000000000000000E+00  0.419834394758250989E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423306124254990E-03 -0.161424186251656992E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435073250189719975E-03
+ -0.435065618982948010E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255995701724177002E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434985228537088976E-03
+ -0.435115896247923015E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286994987186010E-03 -0.140229199928153990E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.253544285340416009E-01  0.000000000000000000E+00
+ -0.434987985656363021E-03 -0.435113023678724013E-03 -0.666666666666666970E-02
+ -0.140261879841764011E-03 -0.138254315073575989E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319435795329029021E-01 -0.161410251962658001E-03
+ -0.161437240413254008E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.433888695127137008E-03 -0.436278193993204976E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320989341466072983E-01
+  0.000000000000000000E+00 -0.161423293334831010E-03 -0.161424199041080999E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435076232841126974E-03
+ -0.435062702151373017E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.291210504532323985E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423172358814994E-03
+ -0.161424320017096988E-03 -0.161436491869935994E-03 -0.161411000505975988E-03
+ -0.161411014301727991E-03 -0.161436478074183991E-03 -0.666666666666666970E-02
+ -0.437347517893396990E-03 -0.432844697077642020E-03 -0.436241571863408976E-03
+ -0.433925164846824982E-03  0.000000000000000000E+00 -0.437803693779869989E-03
+ -0.432399238650052009E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420931105150842977E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014654010180979E-03 -0.435085211979509994E-03
+ -0.666666666666666970E-02 -0.138286962315859997E-03 -0.140229232599479000E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.255498409722413992E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423349311523990E-03 -0.161424143064387991E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435079351261626014E-03 -0.435059652438656008E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.252819204017892991E-01
+  0.000000000000000000E+00 -0.161423122522292011E-03 -0.161424369853619998E-03
+ -0.666666666666666970E-02 -0.139257846415800999E-03 -0.139258348499537998E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.255491014693066017E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423317190226995E-03
+ -0.161424175185684987E-03  0.000000000000000000E+00 -0.438043455875436027E-03
+ -0.432161023931338002E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322801706716391024E-01  0.000000000000000000E+00 -0.434874122345711006E-03
+ -0.435231709745513020E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138392757283109009E-03 -0.140123437632229988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.258886981903099014E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435135825668647020E-03 -0.434966107095303013E-03  0.000000000000000000E+00
+ -0.138935446888224990E-03 -0.139580748027114007E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.321000528704651969E-01 -0.161423323773416988E-03 -0.161424168602494994E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435030730220607980E-03
+ -0.435108924522503010E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.254844804391417996E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161142124363106994E-03
+ -0.161705368012804988E-03 -0.166666666666667011E-01 -0.225281625809598993E-03
+ -0.643927738894806969E-03  0.000000000000000000E+00  0.321599842976618006E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.464257778793151032E-04 -0.734107200268307048E-03
+ -0.388025614613707001E-03 -0.483301313684787997E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.255020432374560985E-01  0.000000000000000000E+00
+ -0.337126126111301995E-03 -0.532578756276378025E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.137258701832956002E-03
+ -0.141257493082382995E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.421427039886230967E-01  0.000000000000000000E+00 -0.413943151144718978E-03
+ -0.457074858319932024E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.211434099434618995E-03 -0.884365353823558038E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.252964489671319986E-01
+  0.000000000000000000E+00 -0.213567925484438011E-03 -0.614115947829842971E-03
+ -0.666666666666666970E-02 -0.140017356100088014E-03 -0.138498838815251987E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319739847535212979E-01
+  0.000000000000000000E+00 -0.161422918075368997E-03 -0.161424574300543012E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.427230082581125004E-03
+ -0.443079920816016025E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.253449783929956991E-01
+ -0.431322912949462978E-03 -0.438929377851577025E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138850109293811998E-03
+ -0.139666085621527000E-03 -0.166666666666667011E-01  0.240847367787417992E-01
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.419834073413790990E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305232406004E-03
+ -0.161424187143506005E-03 -0.666666666666666970E-02 -0.435069384667189979E-03
+ -0.435069399625256014E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.255994979990794985E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434980660475906998E-03 -0.435120659664607981E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286858107371994E-03 -0.140229336807968006E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.253691078096668007E-01
+  0.000000000000000000E+00 -0.434980641165363003E-03 -0.435120678249643021E-03
+ -0.666666666666666970E-02 -0.140229336715335995E-03 -0.138286858200003003E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319837098824014010E-01
+ -0.161423313471232996E-03 -0.161424178904679013E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069384765853027E-03 -0.435069399524313973E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320996543007720028E-01  0.000000000000000000E+00 -0.161423313374075010E-03
+ -0.161424179001836999E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069381467433980E-03 -0.435069402897058982E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.292425276973896017E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423743711891987E-03 -0.161423748664019995E-03 -0.161424178997872993E-03
+ -0.161423313378038989E-03 -0.161423313378032999E-03 -0.161424178997880013E-03
+ -0.666666666666666970E-02 -0.435032871054405002E-03 -0.435106735866860005E-03
+ -0.435069379977790014E-03 -0.435069404420270985E-03 -0.435069383385271993E-03
+ -0.435069400936004996E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.420929804189209972E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014924423208018E-03 -0.435084930012396979E-03
+ -0.666666666666666970E-02 -0.138286695556728991E-03 -0.140229499358610006E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255502626963007989E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423359539945995E-03 -0.161424132835965987E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435069382819562025E-03 -0.435069401514463013E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.252823579206017009E-01
+  0.000000000000000000E+00 -0.161423118849177003E-03 -0.161424373526735006E-03
+ -0.666666666666666970E-02 -0.139258097695681013E-03 -0.139258097219658011E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255489355337556005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313370652997E-03
+ -0.161424179005259012E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435069374869375021E-03 -0.435069409643798000E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322089281766608018E-01 -0.435014924689618014E-03 -0.435084929734623027E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286695005679009E-03
+ -0.140229499909660992E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.258549254171622014E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435042893494948018E-03 -0.435055763690273998E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.138910376350831992E-03 -0.139605818564507005E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320997360358935974E-01 -0.161423315568021993E-03 -0.161424176807890992E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069380937729011E-03
+ -0.435069403438677978E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.255489358069375012E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423313378028011E-03 -0.161424178997883998E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.435069377571565001E-03 -0.435069406880717003E-03  0.000000000000000000E+00
+  0.320929801553103966E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435014924404768016E-03 -0.435084930031662005E-03
+ -0.138286695016626008E-03 -0.140229499898713992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.255502626954418992E-01  0.000000000000000000E+00
+ -0.161423359539924989E-03 -0.161424132835986993E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.435069382752795012E-03
+ -0.435069401582733977E-03  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.420929804179970002E-01  0.000000000000000000E+00
+ -0.435014924505136027E-03 -0.435084929926964016E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.138286695554652988E-03 -0.140229499360686009E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.249646302547502988E-01  0.000000000000000000E+00 -0.161423118848985994E-03
+ -0.161424373526925988E-03 -0.666666666666666970E-02 -0.139258097697407009E-03
+ -0.139258097217931988E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319837881362066007E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423315571430996E-03 -0.161424176804481013E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069384823893025E-03 -0.435069399464952007E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.253926391078274985E-01 -0.435042938363147018E-03 -0.435055716900843974E-03
+ -0.666666666666666970E-02 -0.138910376047635993E-03 -0.139605818867703004E-03
+ -0.166666666666667011E-01 -0.666666666666666970E-02 -0.162064605978737010E-03
+  0.755721724764775969E-02 -0.421583401256607965E-11 -0.164615581083446994E-03
+ -0.622132993821362964E-09  0.822717713000382066E-02 -0.666666666666666970E-02
+ -0.435031787535767993E-03  0.921658221859024927E-02 -0.671472362675333990E-03
+  0.000000000000000000E+00 -0.435066598341386974E-03 -0.622856281165874953E-03
+  0.926582425818990064E-02 -0.666666666666666970E-02 -0.435031698814498024E-03
+  0.921598015968093029E-02 -0.671350185242260036E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066687046999979E-03
+  0.000000000000000000E+00 -0.622803871309166967E-03  0.926515487581706064E-02
+ -0.666666666666666970E-02 -0.162091523387943991E-03  0.755721896647259996E-02
+ -0.540935128117279988E-11  0.000000000000000000E+00 -0.164750534006632987E-03
+ -0.795366002646085000E-09  0.822717800583096931E-02  0.926452029956657992E-02
+ -0.622281045706827999E-03 -0.666666666666666970E-02 -0.668721208315983004E-03
+  0.920448404025630065E-02 -0.666666666666666970E-02  0.920291930713102065E-02
+ -0.668399798797767049E-03 -0.622140084769083981E-03  0.926273190908652976E-02
+ -0.666666666666666970E-02 -0.162061623289820004E-03  0.755721877319791985E-02
+ -0.410171929572062999E-11 -0.164601757794207001E-03 -0.605571814693378962E-09
+  0.822717790541416943E-02  0.921717278330758984E-02 -0.671606351121000053E-03
+ -0.666666666666666970E-02 -0.622958188273658011E-03  0.926596681290394082E-02
+ -0.666666666666666970E-02 -0.164300067974464987E-03  0.822730729925378983E-02
+ -0.253113151843113024E-09 -0.164103118034321003E-03 -0.251546774270035010E-09
+  0.822722715999001965E-02 -0.666666666666666970E-02 -0.435040487769458974E-03
+  0.921687604564088055E-02 -0.671540363061452956E-03  0.000000000000000000E+00
+ -0.435057899246222006E-03 -0.622872037532872002E-03  0.926585755069535971E-02
+ -0.666666666666666970E-02 -0.162127209687436009E-03  0.759121330666192966E-02
+ -0.381352367097723024E-11  0.000000000000000000E+00 -0.164582493974596989E-03
+ -0.545377294793035977E-09  0.822717728074968957E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.166319287245753013E-03  0.755721788134880966E-02
+ -0.125543672150161994E-10 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166854288422184003E-03  0.000000000000000000E+00 -0.695251112369320969E-09
+  0.759121397932040025E-02 -0.666666666666666970E-02  0.921698284063145017E-02
+ -0.671564905209266956E-03 -0.622877569603531964E-03  0.926586949940542970E-02
+ -0.666666666666666970E-02 -0.435044292403726006E-03  0.921676136558105021E-02
+ -0.671305578260984965E-03 -0.435054094871408024E-03 -0.622625142915718996E-03
+  0.926561885584748987E-02 -0.164408450418608012E-03  0.822722728633254984E-02
+ -0.379408966293651983E-09 -0.162009019530446013E-03 -0.666666666666666970E-02
+ -0.254284697644517994E-11  0.755732846813902005E-02 -0.162062503890727003E-03
+ -0.666666666666666970E-02  0.755732483871392972E-02 -0.412630295389444991E-11
+ -0.164604272365608011E-03 -0.609067369792561007E-09  0.822717730203972007E-02
+ -0.162008946283214003E-03 -0.666666666666666970E-02  0.755728526608244033E-02
+ -0.254314505624484981E-11 -0.164408477352361010E-03 -0.379466146432811988E-09
+  0.822722728638950081E-02 -0.666666666666666970E-02 -0.162061922095729002E-03
+  0.755721774336119961E-02 -0.411297420986845978E-11 -0.164602266437615996E-03
+ -0.607177113853009997E-09  0.822717717450133050E-02  0.926588728390210017E-02
+ -0.622885394141277989E-03 -0.666666666666666970E-02 -0.671602423890051950E-03
+  0.921714903947036014E-02 -0.435048603885989987E-03 -0.666666666666666970E-02
+  0.921700338564301024E-02 -0.671444778088914037E-03 -0.435049783507797990E-03
+ -0.622737470874805995E-03  0.926573446347610068E-02 -0.666666666666666970E-02
+  0.897119442365579972E-02 -0.606814776285979950E-03 -0.607757345476577036E-03
+  0.923427126759777082E-02  0.921865858272334984E-02 -0.671454242057297965E-03
+ -0.666666666666666970E-02 -0.646212455869751953E-03  0.929466342228658926E-02
+ -0.192798491760539997E-03 -0.666666666666666970E-02  0.825745086852652972E-02
+ -0.932628952743546979E-06 -0.167240534534016013E-03 -0.225014212014958993E-07
+  0.822685297950877942E-02 -0.666666666666666970E-02 -0.435814163537570013E-03
+  0.921887387061852068E-02 -0.575307467536130958E-03 -0.434281293408232021E-03
+ -0.604501016235682955E-03  0.915698677472209961E-02 -0.413213566066566995E-03
+ -0.666666666666666970E-02  0.877723361706403069E-02 -0.541754211244169052E-03
+  0.000000000000000000E+00 -0.454683602340957979E-03 -0.590990555173378953E-03
+  0.920776881757097952E-02 -0.666666666666666970E-02 -0.163098401897450001E-03
+  0.763192105293324010E-02 -0.471912653584295006E-04  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.530165759527253025E-03
+  0.000000000000000000E+00 -0.266005332113615023E-03  0.861802401723344921E-02
+ -0.666666666666666970E-02 -0.162004211770428006E-03  0.757730699043844026E-02
+ -0.176948923798146998E-05 -0.529438965832964951E-03 -0.405267405870187984E-04
+  0.875391888040484047E-02  0.921979764348120068E-02 -0.602143221168287973E-03
+ -0.666666666666666970E-02 -0.555336710991042974E-03  0.881585220388428001E-02
+ -0.666666666666666970E-02 -0.163010868628506010E-03  0.795402855233807993E-02
+ -0.273140602330698999E-11 -0.164270868470497012E-03 -0.213417929117084997E-09
+  0.822716826170043040E-02 -0.666666666666666970E-02  0.927693305369737056E-02
+ -0.632604553292874993E-03 -0.671545581861832000E-03  0.921776054175638994E-02
+ -0.161978982193291990E-03 -0.666666666666666970E-02  0.755685732173456970E-02
+ -0.195386460922786995E-11 -0.164191969605661007E-03 -0.291699488577104992E-09
+  0.822713227252660009E-02 -0.666666666666666970E-02 -0.162064640844275011E-03
+  0.755721703602540030E-02 -0.421716748215653003E-11 -0.164615755229752009E-03
+ -0.622326809900769984E-09  0.822717689294136939E-02 -0.666666666666666970E-02
+ -0.435030941620315019E-03  0.921655770481275005E-02 -0.671470405959126958E-03
+  0.000000000000000000E+00 -0.435067444105677011E-03 -0.622859268229516961E-03
+  0.926582520767522955E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435032188115217974E-03  0.921571954136004949E-02 -0.671094905936150000E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066197830839016E-03
+  0.000000000000000000E+00 -0.622560079517476997E-03  0.926486481761856079E-02
+ -0.666666666666666970E-02 -0.162178699975160996E-03  0.755732514584614001E-02
+ -0.123678884662636997E-10 -0.165233346068301005E-03 -0.180156095420544003E-08
+  0.822721902239778938E-02  0.925859295317845929E-02 -0.620261598377330964E-03
+ -0.666666666666666970E-02 -0.659675714565077006E-03  0.916581375466305956E-02
+ -0.666666666666666970E-02  0.818856060767081048E-02 -0.736710799666523000E-11
+ -0.221521744718127990E-09  0.822698054922356946E-02 -0.666666666666666970E-02
+ -0.162061886876159011E-03  0.755721683506239972E-02 -0.411153398251473010E-11
+ -0.164602034973986998E-03 -0.606965384602901960E-09  0.822717599389061020E-02
+ -0.435048835805744005E-03  0.921717024009354943E-02 -0.671607913788692021E-03
+ -0.435049551589144980E-03 -0.666666666666666970E-02 -0.622896101942852008E-03
+  0.926589988923041952E-02 -0.666666666666666970E-02 -0.164092201396150991E-03
+  0.822715599945273920E-02 -0.245104005328256975E-09 -0.164092160478588006E-03
+ -0.245036767609787999E-09  0.822714877827463061E-02 -0.666666666666666970E-02
+ -0.435040451026806000E-03  0.921687587838842040E-02 -0.671541321966330983E-03
+ -0.435057935985662001E-03 -0.622873182117880046E-03  0.926585851990125060E-02
+ -0.666666666666666970E-02 -0.162127502447745005E-03  0.759121254289110009E-02
+ -0.382274259639387968E-11  0.000000000000000000E+00 -0.164583524873151011E-03
+ -0.546666909348633031E-09  0.822717637948596967E-02 -0.666666666666666970E-02
+ -0.166319284889783996E-03  0.755721738061322038E-02 -0.125530862526258001E-10
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166854283136274001E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.695179911733615969E-09  0.759121329234466961E-02 -0.435040521890532996E-03
+ -0.666666666666666970E-02  0.921687815467167083E-02 -0.671542181838102967E-03
+ -0.435057865128120000E-03 -0.622873658606718991E-03  0.926585863114167026E-02
+ -0.666666666666666970E-02 -0.435041288920689020E-03  0.921669954253361012E-02
+ -0.671349556748095989E-03 -0.435057098161677983E-03 -0.622686448593169997E-03
+  0.926564688510513046E-02 -0.164325652029393012E-03  0.822714890084452928E-02
+ -0.369068713238530013E-09 -0.162006572449318004E-03 -0.666666666666666970E-02
+ -0.248169403822924009E-11  0.755729660596458994E-02 -0.162062728183659998E-03
+ -0.666666666666666970E-02  0.755729867674227035E-02 -0.413683822802849013E-11
+ -0.164605417097992009E-03 -0.610604701856152959E-09  0.822717650154041955E-02
+ -0.162006472670696007E-03 -0.666666666666666970E-02  0.755723750063842965E-02
+ -0.248209213801367997E-11 -0.164325689607119994E-03 -0.369144828512267008E-09
+  0.822714890092034017E-02 -0.666666666666666970E-02 -0.162062060860401010E-03
+  0.755721735591260990E-02 -0.411820697467498986E-11 -0.164602902125366996E-03
+ -0.607936291979959974E-09  0.822717685023799036E-02 -0.435051421420119008E-03
+  0.926588186737323932E-02 -0.622883850263154012E-03 -0.435046965950540009E-03
+ -0.666666666666666970E-02 -0.671591030490054954E-03  0.921709452945720931E-02
+ -0.435046965633857023E-03 -0.666666666666666970E-02  0.921703017052688013E-02
+ -0.671537494669817957E-03 -0.435051421736794026E-03 -0.622834247686287991E-03
+  0.926581357500960018E-02 -0.666666666666666970E-02  0.755832397448806970E-02
+ -0.411138722941705998E-11 -0.609957135264493987E-09  0.822725821738184981E-02
+  0.926024553873549920E-02 -0.621022213891748000E-03 -0.666666666666666970E-02
+ -0.663176099841847051E-03  0.918049665304914927E-02 -0.433952552704485026E-03
+ -0.666666666666666970E-02  0.917940522365195966E-02 -0.660850447221769969E-03
+ -0.436139822938484006E-03 -0.618641376898239033E-03  0.925939891361087994E-02
+ -0.666666666666666970E-02 -0.436004520088601995E-03  0.917698869910051063E-02
+ -0.534972138649924970E-03 -0.434089279817291974E-03 -0.560727539174508006E-03
+  0.911474373371438072E-02 -0.405342384042105025E-03 -0.666666666666666970E-02
+  0.868224382189996917E-02 -0.501678846320934967E-03  0.000000000000000000E+00
+ -0.460809060945838991E-03 -0.586502661037227048E-03  0.920011158722021036E-02
+ -0.666666666666666970E-02 -0.163211684871365992E-03  0.764753570908803006E-02
+ -0.433463713660776003E-04  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.529849870720847950E-03  0.000000000000000000E+00
+ -0.242696464819676003E-03  0.855013113167163961E-02 -0.666666666666666970E-02
+ -0.162715770416370988E-03  0.761282345029158040E-02 -0.385507239315511005E-04
+ -0.531931290617636970E-03 -0.289327756212754013E-03  0.896268459992746042E-02
+  0.822708378408086929E-02 -0.453015989481950975E-09 -0.666666666666666970E-02
+ -0.344818166136396003E-11  0.765885647535816984E-02 -0.666666666666666970E-02
+ -0.162981032279518011E-03  0.765876435591864024E-02 -0.850978247264382040E-09
+ -0.167181094192769006E-03 -0.102217899652555996E-06  0.822724871883005922E-02
+ -0.666666666666666970E-02  0.899524486155203924E-02 -0.614036210565926987E-03
+ -0.609553123871605014E-03  0.923729250745260015E-02 -0.162146754878560000E-03
+ -0.666666666666666970E-02  0.755888395212829002E-02 -0.877868214680283015E-11
+ -0.165048028543372008E-03 -0.128278104037022006E-08  0.822721063460546062E-02
+ -0.666666666666666970E-02 -0.435042623242022023E-03  0.921694787133059065E-02
+ -0.671556689557351017E-03 -0.435055763937149977E-03 -0.622875549102152042E-03
+  0.926586538966900006E-02 -0.666666666666666970E-02 -0.435049192464642004E-03
+  0.921716877978374927E-02 -0.671606871920753961E-03  0.000000000000000000E+00
+ -0.435049194930888991E-03 -0.622886313955485970E-03  0.926588938212210070E-02
+ -0.666666666666666970E-02 -0.162062558982797991E-03  0.755721781611762029E-02
+ -0.413711697147502992E-11  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164605380154927001E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.610686417053202014E-09  0.822717710018053966E-02
+ -0.666666666666666970E-02 -0.435040485643464010E-03  0.921687647065221917E-02
+ -0.671540876850887998E-03 -0.435057901372032005E-03 -0.622872543909229947E-03
+  0.926585805993978932E-02  0.822717712936564018E-02 -0.606929319768994989E-09
+ -0.666666666666666970E-02 -0.411128314368316016E-11  0.755721765744643992E-02
+ -0.666666666666666970E-02  0.755721765744643992E-02 -0.411128317078820962E-11
+ -0.606929319768994989E-09  0.822717712936564018E-02 -0.666666666666666970E-02
+ -0.162062002168695000E-03  0.755721765497851042E-02 -0.411600421982957979E-11
+ -0.164602608742393997E-03 -0.607615952871632013E-09  0.822717713082168033E-02
+ -0.435050788864285991E-03  0.926588513557140975E-02 -0.622885206966810982E-03
+ -0.435047598518492997E-03 -0.666666666666666970E-02 -0.671596264858268996E-03
+  0.921711667933667982E-02 -0.666666666666666970E-02 -0.435047598512343999E-03
+  0.921711525903850040E-02 -0.671594794936314007E-03 -0.435050788870434014E-03
+ -0.622883798838939949E-03  0.926588365351022937E-02 -0.666666666666666970E-02
+ -0.162064603060552002E-03  0.755721765252307005E-02 -0.421568902488327991E-11
+ -0.164615562620811005E-03 -0.622111732773898991E-09  0.822717714449052936E-02
+ -0.666666666666666970E-02 -0.435031767332582026E-03  0.921658397831942068E-02
+ -0.671474831069555976E-03  0.000000000000000000E+00 -0.435066618541047982E-03
+ -0.622858773502414035E-03  0.926582672366347941E-02 -0.666666666666666970E-02
+ -0.435031759872446977E-03  0.921598447326531928E-02 -0.671353605549905000E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435066625999740012E-03  0.000000000000000000E+00 -0.622806880419023005E-03
+  0.926515741228120006E-02 -0.435040467082004024E-03 -0.666666666666666970E-02
+  0.921687727787984939E-02 -0.671542217009017000E-03 -0.435057919931870025E-03
+ -0.622873933298062990E-03  0.926585948649214994E-02 -0.666666666666666970E-02
+ -0.162062558212559002E-03  0.755721765246759013E-02 -0.413710611008680003E-11
+ -0.164605378163800012E-03 -0.610684955957268983E-09  0.822717713313388978E-02
+ -0.435057895396635016E-03  0.926585809258491030E-02 -0.622872563956516044E-03
+ -0.435040491619382013E-03 -0.666666666666666970E-02 -0.671540933200040989E-03
+  0.921687668189401059E-02 -0.164358965526126012E-03 -0.666666666666666970E-02
+  0.822694399085341019E-02 -0.400851057692599981E-09 -0.164360419602091007E-03
+ -0.404402306244331983E-09  0.822717693055982982E-02 -0.435040480593904986E-03
+ -0.666666666666666970E-02  0.921687631186799035E-02 -0.671540849649739004E-03
+ -0.435057906421149976E-03 -0.622872546527165980E-03  0.926585805293679045E-02
+ -0.666666666666666970E-02 -0.162062559343999999E-03  0.755721765385561003E-02
+ -0.413714921327754003E-11 -0.164605383792317005E-03 -0.610691224557087966E-09
+  0.822717713418924003E-02 -0.435057910263422993E-03  0.926585952122008921E-02
+ -0.622873948580168974E-03 -0.435040476751295975E-03 -0.666666666666666970E-02
+ -0.671542290274572949E-03  0.921687760235279964E-02 -0.164358958283310994E-03
+ -0.666666666666666970E-02  0.822694359327636945E-02 -0.400841676414764018E-09
+ -0.164360412981448008E-03 -0.404398890304639026E-09  0.822717692927778937E-02
+ -0.666666666666666970E-02  0.755721765237515972E-02 -0.411128322752053976E-11
+ -0.606929330558722046E-09  0.822717712936565058E-02  0.822717692676106990E-02
+ -0.401916224886799020E-09 -0.666666666666666970E-02 -0.398503993611501997E-09
+  0.822695176179768020E-02 -0.435040782183989977E-03 -0.666666666666666970E-02
+  0.921688643492079955E-02 -0.671543136482274949E-03 -0.435057604856947981E-03
+ -0.622873024631925972E-03  0.926585913880650924E-02 -0.666666666666666970E-02
+ -0.162064603067295007E-03  0.755721765304565030E-02 -0.421568935096809032E-11
+ -0.164615562649187992E-03 -0.622111764150747044E-09  0.822717714448892995E-02
+ -0.435031767264953018E-03 -0.666666666666666970E-02  0.921658397617671973E-02
+ -0.671474830688970005E-03  0.000000000000000000E+00 -0.435066618608665010E-03
+ -0.622858773521975970E-03  0.926582672355137985E-02 -0.666666666666666970E-02
+ -0.435031759867127014E-03  0.921598447384829045E-02 -0.671353608399908978E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435066626005059000E-03  0.000000000000000000E+00 -0.622806883450133957E-03
+  0.926515741287571928E-02 -0.666666666666666970E-02 -0.162062535484642990E-03
+  0.755721765241379028E-02 -0.413624149795890984E-11 -0.164605264966733003E-03
+ -0.610559194893120966E-09  0.822717713298093054E-02  0.822717692676106990E-02
+ -0.401916224778066025E-09 -0.666666666666666970E-02 -0.398503993617073007E-09
+  0.822695176179807051E-02 -0.666666666666666970E-02 -0.435040782184004994E-03
+  0.921688643492129048E-02 -0.671543136482392043E-03 -0.435057604856933019E-03
+ -0.622873024631955029E-03  0.926585913880656996E-02 -0.666666666666666970E-02
+  0.755721765237515018E-02 -0.411128322752061973E-11 -0.606929330558734970E-09
+  0.822717712936565058E-02 -0.162062535484639006E-03 -0.666666666666666970E-02
+  0.755721765241101993E-02 -0.413624149803294008E-11 -0.164605264966739996E-03
+ -0.610559194905281038E-09  0.822717713298093054E-02 -0.666666666666666970E-02
+ -0.435040978876939991E-03  0.921689266842156026E-02 -0.671544215875962010E-03
+ -0.435057408180386984E-03 -0.622872937769437958E-03  0.926585946462473060E-02
+ -0.666666666666666970E-02 -0.162127387652292998E-03  0.759121301093484014E-02
+ -0.381917589201341967E-11  0.000000000000000000E+00 -0.164583145431396004E-03
+ -0.546168929922635014E-09  0.822717717091626943E-02 -0.666666666666666970E-02
+ -0.166319270517048003E-03  0.755721782869738998E-02 -0.125468645423563995E-10
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166854266113373008E-03  0.000000000000000000E+00 -0.694835327004200008E-09
+  0.759121369277350033E-02 -0.666666666666666970E-02 -0.162091523336998009E-03
+  0.755721898664039968E-02 -0.540934653228686967E-11 -0.164750533613684997E-03
+ -0.795365292654328000E-09  0.822717800588296938E-02  0.926452030366763969E-02
+ -0.622281047524610035E-03 -0.666666666666666970E-02 -0.668721216930222967E-03
+  0.920448407782870071E-02 -0.666666666666666970E-02  0.920291934460982022E-02
+ -0.668399807408691956E-03 -0.622140086592986989E-03  0.926273191317288080E-02
+ -0.666666666666666970E-02 -0.435042278634080015E-03  0.921682356015443051E-02
+ -0.671431390315878037E-03 -0.435056108521798986E-03 -0.622756696995610017E-03
+  0.926574695980873948E-02 -0.164604656267662011E-03  0.822717725537036940E-02
+ -0.609640569558637034E-09 -0.162062479874256008E-03 -0.666666666666666970E-02
+ -0.413006082275806018E-11  0.755726759085299012E-02 -0.666666666666666970E-02
+ -0.162006162226947987E-03  0.755727139217683957E-02 -0.248123277233144982E-11
+ -0.164398193103688998E-03 -0.370429925449825014E-09  0.822722962912562916E-02
+ -0.666666666666666970E-02 -0.162064599546255997E-03  0.755721765602699984E-02
+ -0.421555271070431014E-11 -0.164615545095071005E-03 -0.622091901724106049E-09
+  0.822717714627387020E-02 -0.666666666666666970E-02 -0.435031818763031985E-03
+  0.921658104872763971E-02 -0.671470214801736043E-03  0.000000000000000000E+00
+ -0.435066567119566977E-03 -0.622854034507517988E-03  0.926582206155602932E-02
+ -0.666666666666666970E-02 -0.435031752461292008E-03  0.921598570305218016E-02
+ -0.671353593598150960E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066633409600010E-03  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.622806719992169021E-03
+  0.926515905834468002E-02 -0.162062393641141989E-03 -0.666666666666666970E-02
+  0.755722114123831036E-02 -0.413058273432428028E-11 -0.164604687174786012E-03
+ -0.609739837378995011E-09  0.822717725546923996E-02 -0.666666666666666970E-02
+ -0.162060731985959994E-03  0.755721979203459980E-02 -0.406827649196786028E-11
+ -0.164598252833557000E-03 -0.600736958190943012E-09  0.822717859850461038E-02
+ -0.164089877343614012E-03  0.822722950591434077E-02 -0.245723982778697014E-09
+ -0.164740360816590010E-03 -0.666666666666666970E-02 -0.253032241614393982E-09
+  0.822760867484073968E-02 -0.435063218164074010E-03 -0.666666666666666970E-02
+  0.926625279535532066E-02 -0.623222898205732980E-03 -0.435035168245856982E-03
+ -0.671596115310440014E-03  0.921718087666401065E-02 -0.162006191893170997E-03
+ -0.666666666666666970E-02  0.755728897650832006E-02 -0.248111426872892018E-11
+ -0.164398182178321996E-03 -0.370407207277356981E-09  0.822722962910301010E-02
+ -0.666666666666666970E-02 -0.162062408593516993E-03  0.755721778752637980E-02
+ -0.413141814596678035E-11 -0.164604734980543005E-03 -0.609861061511916968E-09
+  0.822717721123181027E-02 -0.164359758083610009E-03  0.822717705188663086E-02
+ -0.403734886259138006E-09 -0.164358443024191010E-03 -0.666666666666666970E-02
+ -0.400953600054809976E-09  0.822699461577980004E-02 -0.435042369262490986E-03
+ -0.666666666666666970E-02  0.921678605590701971E-02 -0.671388823849048961E-03
+ -0.435056017899627977E-03 -0.622715238127973967E-03  0.926570506586106915E-02
+ -0.666666666666666970E-02  0.897119442339463016E-02 -0.606814776206859971E-03
+ -0.607757345455318975E-03  0.923427126754335081E-02  0.921865858631707932E-02
+ -0.671454241685535990E-03 -0.666666666666666970E-02 -0.646212508082324977E-03
+  0.929466349621914031E-02 -0.192798550395760997E-03 -0.666666666666666970E-02
+  0.825745095197305025E-02 -0.932632147975482013E-06 -0.167240534661016013E-03
+ -0.225014932097670997E-07  0.822685297960725967E-02 -0.666666666666666970E-02
+ -0.435853333051191975E-03  0.921982672935077983E-02 -0.576029289720921011E-03
+ -0.434241817036211018E-03 -0.604491260294178975E-03  0.915701687929672964E-02
+ -0.413213785008022022E-03 -0.666666666666666970E-02  0.877723644056978974E-02
+ -0.541755128362865035E-03 -0.454683425554907011E-03 -0.590990666065798007E-03
+  0.920776909864313986E-02 -0.666666666666666970E-02 -0.163098438357505001E-03
+  0.763192860819322978E-02 -0.471912218750692030E-04  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.530165770547247946E-03
+ -0.266001580374522018E-03  0.861802239077459967E-02 -0.666666666666666970E-02
+ -0.162004212070115012E-03  0.757730725874164977E-02 -0.176948864124370000E-05
+ -0.529438965863178955E-03 -0.405266342026540003E-04  0.875391878258710045E-02
+  0.921979766533967057E-02 -0.602143233936809023E-03 -0.666666666666666970E-02
+ -0.555336769390324045E-03  0.881585236637343062E-02 -0.666666666666666970E-02
+ -0.163010868918803012E-03  0.795402864518520922E-02 -0.273140627481144992E-11
+ -0.164270868404786012E-03 -0.213417891562764991E-09  0.822716826171294990E-02
+ -0.666666666666666970E-02  0.927693305629184034E-02 -0.632604555455441994E-03
+ -0.671545581847466972E-03  0.921776054189130979E-02 -0.161978982117798992E-03
+ -0.666666666666666970E-02  0.755685740523829967E-02 -0.195386057816982017E-11
+ -0.164191968508161013E-03 -0.291698868024367013E-09  0.822713227244221967E-02
+ -0.666666666666666970E-02 -0.435040481081572998E-03  0.921687625100787999E-02
+ -0.671540765653113052E-03 -0.435057905933523977E-03 -0.622872462287793992E-03
+  0.926585797472436953E-02 -0.666666666666666970E-02 -0.162127507324803994E-03
+  0.759121282163092038E-02 -0.382298621466253031E-11 -0.164583632044819009E-03
+ -0.546703997412867024E-09  0.822717713283360047E-02 -0.666666666666666970E-02
+ -0.166319256859904011E-03  0.755721769623335978E-02 -0.125414820597383008E-10
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166854250172961013E-03 -0.694537633219151044E-09  0.759121350560284982E-02
+ -0.666666666666666970E-02 -0.162099247952961000E-03  0.755721989572244977E-02
+ -0.581368808752400979E-11 -0.164789516502383002E-03 -0.853932691235368040E-09
+  0.822717846311052034E-02  0.926237298878906934E-02 -0.621979743485002014E-03
+ -0.666666666666666970E-02 -0.667643958464268013E-03  0.919963618647812002E-02
+ -0.666666666666666970E-02  0.920119115376864978E-02 -0.667964676147511953E-03
+ -0.622121227338204037E-03  0.926416149864764914E-02 -0.666666666666666970E-02
+ -0.435040909817440986E-03  0.921687655493176952E-02 -0.671527187062299024E-03
+ -0.435057477234175008E-03 -0.622856729377207043E-03  0.926584500443678079E-02
+ -0.164605266534459990E-03  0.822717720592583933E-02 -0.610450247156119009E-09
+ -0.162062530200907004E-03 -0.666666666666666970E-02 -0.413549431266714970E-11
+  0.755722446015559979E-02 -0.666666666666666970E-02 -0.162059365323539989E-03
+  0.755722567800456990E-02 -0.401936380178025961E-11 -0.164612123412715989E-03
+ -0.594277800995966983E-09  0.822719391360227960E-02 -0.666666666666666970E-02
+ -0.162064601680787996E-03  0.755721765533357015E-02 -0.421563549128826973E-11
+  0.000000000000000000E+00 -0.164615555733905002E-03 -0.622103957379088996E-09
+  0.822717714640426936E-02 -0.666666666666666970E-02 -0.435031792976918995E-03
+  0.921658339535963086E-02 -0.671473400089229013E-03  0.000000000000000000E+00
+ -0.435066592901186004E-03 -0.622857239963683028E-03  0.926582531946548021E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435031751019518008E-03
+  0.921598490375117978E-02 -0.671353543894939037E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066634851121988E-03  0.000000000000000000E+00
+ -0.622806775506295953E-03  0.926515820468288079E-02 -0.162062519267639990E-03
+ -0.666666666666666970E-02  0.755721857194846958E-02 -0.413556062572659997E-11
+ -0.164605270453123994E-03 -0.610462848248556989E-09  0.822717720593840046E-02
+ -0.666666666666666970E-02 -0.162062243310156996E-03  0.755721835822449010E-02
+ -0.412518743024167011E-11 -0.164604700854507999E-03 -0.608980634819725018E-09
+  0.822717779878246944E-02 -0.164369437450745994E-03  0.822719371500816969E-02
+ -0.393313770364309007E-09 -0.164344328728707013E-03 -0.666666666666666970E-02
+ -0.391103618326530975E-09  0.822706248393468946E-02 -0.435044237783256996E-03
+ -0.666666666666666970E-02  0.921700053762436987E-02 -0.671567082152385998E-03
+ -0.435054149489181002E-03 -0.622876302108100002E-03  0.926586962058313993E-02
+ -0.162059379398541000E-03 -0.666666666666666970E-02  0.755723329728289969E-02
+ -0.401928043513828996E-11 -0.164612118388109011E-03 -0.594261933530200978E-09
+  0.822719391358646933E-02 -0.666666666666666970E-02 -0.162062495727046009E-03
+  0.755721772845492981E-02 -0.413473254943709991E-11 -0.164605146051557987E-03
+ -0.610342338260452991E-09  0.822717719432444067E-02 -0.164360315553491003E-03
+  0.822717700213156967E-02 -0.404232600081692006E-09 -0.164358887835959998E-03
+ -0.666666666666666970E-02 -0.400997831179622018E-09  0.822696488974702066E-02
+ -0.435041264130012022E-03 -0.666666666666666970E-02  0.921687154445821957E-02
+ -0.671511051258173990E-03 -0.435057122950393985E-03 -0.622839088072463958E-03
+  0.926582874613315031E-02 -0.666666666666666970E-02  0.927696108771089954E-02
+ -0.632627917074633003E-03 -0.671545420285009982E-03  0.921776198086651961E-02
+  0.922026584017879985E-02 -0.602408843182564049E-03 -0.666666666666666970E-02
+ -0.556545745722415003E-03  0.881922185436606956E-02 -0.163017068240428012E-03
+ -0.666666666666666970E-02  0.795594799175567031E-02 -0.273445423603751999E-11
+ -0.164269850461446013E-03 -0.212776812668013003E-09  0.822716866999695026E-02
+ -0.666666666666666970E-02 -0.435073828677985024E-03  0.926998697001005928E-02
+ -0.626775693155938018E-03 -0.435024555678502995E-03 -0.674658125777386950E-03
+  0.922087193677081918E-02 -0.317388619046037024E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.807555110066160951E-02 -0.161231565052261995E-03
+ -0.506658534832258992E-03 -0.311775039000433988E-03  0.895372823636290081E-02
+ -0.666666666666666970E-02 -0.163033789441224992E-03  0.757712252412545001E-02
+ -0.230784432560672012E-04  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.528851957602917964E-03 -0.189448307782970002E-03  0.832713693082491081E-02
+ -0.666666666666666970E-02 -0.161988418356935989E-03  0.755983351837515982E-02
+ -0.201738757027985980E-11 -0.164212922403823006E-03 -0.300296756336277975E-09
+  0.822710828992056012E-02  0.921859794240553060E-02 -0.671461758481382994E-03
+ -0.666666666666666970E-02 -0.645287749704886960E-03  0.929336169606828980E-02
+ -0.666666666666666970E-02 -0.192231218611419992E-03  0.825469369225220045E-02
+ -0.211366001320515989E-06 -0.166616012215181007E-03 -0.134857476054616004E-07
+  0.822777129012250072E-02 -0.666666666666666970E-02  0.902089049785161946E-02
+ -0.621512175397749989E-03 -0.611398199404824985E-03  0.924071304541716938E-02
+ -0.162013975253551008E-03 -0.666666666666666970E-02  0.758489435794543979E-02
+ -0.176583046930321999E-05 -0.529428370426132016E-03 -0.381593418223961990E-04
+  0.875176249380833077E-02 -0.666666666666666970E-02 -0.435040249760507993E-03
+  0.921686856770915011E-02 -0.671539094886240001E-03 -0.435058137234120004E-03
+ -0.622872175240221974E-03  0.926585722647491929E-02 -0.666666666666666970E-02
+ -0.435049192418392975E-03  0.921716878281565039E-02 -0.671606876372485046E-03
+ -0.435049194977138020E-03 -0.622886318490127947E-03  0.926588938673533064E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.162062559002504992E-03
+  0.755721765529914023E-02 -0.413713063586915014E-11 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164605381839701988E-03 -0.610688496265456990E-09
+  0.822717709903768996E-02 -0.666666666666666970E-02 -0.435040481539034974E-03
+  0.921687633332718068E-02 -0.671540846145312980E-03 -0.435057905476102008E-03
+ -0.622872537796382957E-03  0.926585804562602040E-02  0.822717712936564018E-02
+ -0.606929320140830976E-09 -0.666666666666666970E-02 -0.411128314549578026E-11
+  0.755721765728442022E-02 -0.666666666666666970E-02  0.755721765728442022E-02
+ -0.411128314549578026E-11 -0.606929320113726025E-09  0.822717712936564018E-02
+ -0.666666666666666970E-02 -0.162062535444855996E-03  0.755721765299004027E-02
+ -0.413623988345109963E-11 -0.164605264762881988E-03 -0.610558967689581019E-09
+  0.822717713298422998E-02 -0.164360411104607013E-03  0.822717692934940049E-02
+ -0.404392079821727997E-09 -0.164359007307861987E-03 -0.666666666666666970E-02
+ -0.400959182056933016E-09  0.822695177830758996E-02 -0.666666666666666970E-02
+ -0.435040782704709025E-03  0.921688645230588945E-02 -0.671543140332089947E-03
+ -0.435057604336271976E-03 -0.622873025362695042E-03  0.926585914058404916E-02
+ -0.666666666666666970E-02 -0.435040480198235974E-03  0.921687629980992920E-02
+ -0.671540848018962978E-03  0.000000000000000000E+00 -0.435057906816784022E-03
+ -0.622872547225220984E-03  0.926585805277986042E-02 -0.666666666666666970E-02
+ -0.435049192309392005E-03  0.921716878048172046E-02 -0.671606876991212012E-03
+ -0.435049195086140020E-03 -0.622886319711294994E-03  0.926588938772130062E-02
+ -0.666666666666666970E-02 -0.162062559108483992E-03  0.755721765240897035E-02
+ -0.413713455792400007E-11  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164605382375200988E-03 -0.610689046115110989E-09
+  0.822717709593154961E-02 -0.162062559033916009E-03 -0.666666666666666970E-02
+  0.755721765255952006E-02 -0.413713742434565006E-11 -0.164605382254266990E-03
+ -0.610689502144610012E-09  0.822717713322326967E-02 -0.666666666666666970E-02
+ -0.162062001821161990E-03  0.755721765238182019E-02 -0.411599099559748981E-11
+ -0.164602607031998988E-03 -0.607614053906671031E-09  0.822717713004775947E-02
+ -0.435050784143273998E-03  0.926588367156185033E-02 -0.622883807548416991E-03
+ -0.435047603239580017E-03 -0.666666666666666970E-02 -0.671594832030014948E-03
+  0.921711541879277965E-02 -0.435047603239578987E-03 -0.666666666666666970E-02
+  0.921711541854514961E-02 -0.671594831758394982E-03 -0.435050784143274974E-03
+ -0.622883807286240968E-03  0.926588367130479033E-02 -0.435040480204222993E-03
+ -0.666666666666666970E-02  0.921687629993508950E-02 -0.671540847984813971E-03
+ -0.435057906810797979E-03 -0.622872547158363982E-03  0.926585805272238036E-02
+ -0.666666666666666970E-02 -0.435040480199119979E-03  0.921687628837800955E-02
+ -0.671540836043143011E-03 -0.435057906815900994E-03 -0.622872535731676044E-03
+  0.926585804082549064E-02 -0.164605382251114998E-03  0.822717713322325926E-02
+ -0.610689492032270982E-09 -0.162062559042711003E-03 -0.666666666666666970E-02
+ -0.413713737102918003E-11  0.755721765729555021E-02 -0.162062559115393992E-03
+ -0.666666666666666970E-02  0.755721765728750976E-02 -0.413714006582654996E-11
+ -0.164605382612441987E-03 -0.610689891716875000E-09  0.822717713311152052E-02
+ -0.666666666666666970E-02  0.755721765237515972E-02 -0.411128320041545960E-11
+ -0.606929330558716048E-09  0.822717712936565058E-02  0.822717692676106990E-02
+ -0.401916224778854018E-09 -0.666666666666666970E-02 -0.398503993603185006E-09
+  0.822695176179710080E-02 -0.435040782183968022E-03 -0.666666666666666970E-02
+  0.921688643492006056E-02 -0.671543136482093996E-03 -0.435057604856969990E-03
+ -0.622873024631876967E-03  0.926585913880641036E-02 -0.666666666666666970E-02
+ -0.162064603069931001E-03  0.755721765252316026E-02 -0.421568944254191005E-11
+ -0.164615562667500004E-03 -0.622111785613204007E-09  0.822717714448713971E-02
+ -0.435031767249101982E-03 -0.666666666666666970E-02  0.921658397565039943E-02
+ -0.671474830574549054E-03  0.000000000000000000E+00 -0.435066618624513010E-03
+ -0.622858773502365029E-03  0.926582672349992968E-02 -0.666666666666666970E-02
+ -0.435031759837616007E-03  0.921598447232674020E-02 -0.671353608078640034E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435066626034565020E-03  0.000000000000000000E+00 -0.622806883368089994E-03
+  0.926515741217365935E-02 -0.666666666666666970E-02 -0.162062535484635997E-03
+  0.755721765240871968E-02 -0.413624136259383020E-11 -0.164605264966749998E-03
+ -0.610559194918970050E-09  0.822717713298093921E-02  0.822717692676106990E-02
+ -0.401916224724624988E-09 -0.666666666666666970E-02 -0.398503993603535989E-09
+  0.822695176179711989E-02 -0.666666666666666970E-02 -0.435040782183968998E-03
+  0.921688643492009005E-02 -0.671543136482101043E-03 -0.435057604856969014E-03
+ -0.622873024631878051E-03  0.926585913880642077E-02 -0.666666666666666970E-02
+  0.755721765237515972E-02 -0.411128320041545960E-11 -0.606929330558716048E-09
+  0.822717712936565058E-02 -0.162062535484635997E-03 -0.666666666666666970E-02
+  0.755721765240869973E-02 -0.413624144391172994E-11 -0.164605264966749998E-03
+ -0.610559194919383020E-09  0.822717713298093921E-02 -0.666666666666666970E-02
+ -0.435033689603804975E-03  0.921378384069738987E-02 -0.668589820131555040E-03
+ -0.435064696586663006E-03 -0.620092661077489989E-03  0.926283488586463963E-02
+ -0.666666666666666970E-02 -0.162162554053133000E-03  0.761060627573421043E-02
+ -0.358738548769622997E-11 -0.164556192142978992E-03 -0.502148070051966022E-09
+  0.822717405269838935E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.163092637467958997E-03  0.755846712970139993E-02  0.813151629364127964E-19
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.163377020705577990E-03
+ -0.197563428050820995E-37  0.761068726943637999E-02 -0.666666666666666970E-02
+ -0.415446954693720976E-03  0.879694625554016015E-02 -0.528338181427255972E-03
+ -0.452858816762892013E-03 -0.571958066866902980E-03  0.919346567847542998E-02
+  0.822717710541285077E-02 -0.582878441461362035E-09 -0.666666666666666970E-02
+ -0.398790879900396976E-11  0.756907608051521959E-02 -0.666666666666666970E-02
+  0.756911094403798970E-02 -0.399682676269469016E-11 -0.586791272997618962E-09
+  0.822723940155752065E-02 -0.666666666666666970E-02 -0.435082932504862979E-03
+  0.926680195170886956E-02 -0.623730586218653013E-03 -0.435015449187736995E-03
+ -0.671618387476765970E-03  0.921723408637379997E-02 -0.164605497784444998E-03
+  0.822717715380158997E-02 -0.610801293217642997E-09 -0.162062563985841001E-03
+ -0.666666666666666970E-02 -0.413787738555715978E-11  0.755721095967978960E-02
+ -0.666666666666666970E-02 -0.162067963173172003E-03  0.755721116190670970E-02
+ -0.434923998820925017E-11 -0.164635999950897013E-03 -0.641653733272013955E-09
+  0.822717987065326069E-02 -0.666666666666666970E-02 -0.162064602794511991E-03
+  0.755721765349726994E-02 -0.421567878592677010E-11  0.000000000000000000E+00
+ -0.164615561292109994E-03 -0.622110240351067001E-09  0.822717714537648040E-02
+ -0.666666666666666970E-02 -0.435031773851333981E-03  0.921658402777584054E-02
+ -0.671474653364973048E-03  0.000000000000000000E+00 -0.435066612023434006E-03
+ -0.622858559544564984E-03  0.926582657482898954E-02 -0.666666666666666970E-02
+ -0.435031756377946010E-03  0.921598454249483029E-02 -0.671353578907302054E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435066629493629977E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.622806849901958001E-03  0.926515761104865047E-02
+ -0.162062576485603999E-03 -0.666666666666666970E-02  0.755721769097717974E-02
+ -0.413780156463713995E-11 -0.164605493304321993E-03 -0.610786879472733959E-09
+  0.822717715378724034E-02 -0.666666666666666970E-02 -0.162062600946185996E-03
+  0.755721775647281990E-02 -0.413875277158846027E-11 -0.164605835862048996E-03
+ -0.610932451422013981E-09  0.822717731789045920E-02 -0.164390239995456007E-03
+  0.822717965629857044E-02 -0.424752159962523975E-09 -0.164384780603813011E-03
+ -0.666666666666666970E-02 -0.420718048853076981E-09  0.822693037781527942E-02
+ -0.435039866684628978E-03 -0.666666666666666970E-02  0.921685570981070970E-02
+ -0.671536106102198996E-03 -0.435058520274921989E-03 -0.622871476563668946E-03
+  0.926585585423385025E-02 -0.162067974342572992E-03 -0.666666666666666970E-02
+  0.755721712453114990E-02 -0.434916937807187029E-11 -0.164635995970298013E-03
+ -0.641640314610242035E-09  0.822717987063988944E-02 -0.666666666666666970E-02
+ -0.162062530993410003E-03  0.755721767391838006E-02 -0.413607275117757979E-11
+ -0.164605272998311010E-03 -0.610535653474081026E-09  0.822717715805352037E-02
+ -0.164360523631990990E-03  0.822717694987937058E-02 -0.404455003494009021E-09
+ -0.164359098316623007E-03 -0.666666666666666970E-02 -0.401040888608453987E-09
+  0.822695309202677028E-02 -0.435040829062186994E-03 -0.666666666666666970E-02
+  0.921689067116016926E-02 -0.671545174080842035E-03 -0.435057557982693015E-03
+ -0.622874572247597981E-03  0.926586218088495950E-02 -0.666666666666666970E-02
+  0.755944258225673023E-02 -0.409900186798689021E-11 -0.607550069944511006E-09
+  0.822725761878751946E-02  0.822717683107231958E-02 -0.307429333767333980E-09
+ -0.666666666666666970E-02 -0.346590492659272013E-11  0.788944072959344919E-02
+ -0.405969476545236976E-03 -0.666666666666666970E-02  0.868864828240666076E-02
+ -0.503334320458691947E-03 -0.460337026543280981E-03 -0.585683663662835011E-03
+  0.919951403822613992E-02 -0.666666666666666970E-02 -0.434971941726958005E-03
+  0.922397396166772972E-02 -0.680859150263666982E-03 -0.435126415795954993E-03
+ -0.632212694429038955E-03  0.927545373694507042E-02 -0.163258163012422003E-03
+ -0.666666666666666970E-02  0.771427725269166996E-02 -0.477633450518533990E-04
+ -0.532374012608134001E-03 -0.264816289609958973E-03  0.893589131240556979E-02
+ -0.666666666666666970E-02 -0.163022374738535991E-03  0.756010628736956004E-02
+ -0.985505903625681948E-05  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.528465266513926044E-03 -0.112888812204124995E-03
+  0.818271031310010069E-02  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.162166578301091007E-03  0.755933919789102040E-02 -0.104753730850025000E-10
+ -0.165099058739546009E-03 -0.152246080895752994E-08  0.822717141726839012E-02
+  0.822711899108566000E-02 -0.221645141852008993E-09 -0.666666666666666970E-02
+ -0.803318716673082971E-11  0.819386805739684074E-02 -0.666666666666666970E-02
+ -0.433725918784746999E-03  0.917430859668704086E-02 -0.661288978702212048E-03
+ -0.436363722694927986E-03 -0.620322134736719043E-03  0.926073953711441071E-02
+ -0.666666666666666970E-02  0.755741928796660033E-02 -0.410903028695346997E-11
+ -0.606500685244875008E-09  0.822717712893872993E-02 -0.162225051571071008E-03
+ -0.666666666666666970E-02  0.762801677615301994E-02 -0.369132365187440996E-05
+ -0.529778141305529949E-03 -0.883337217346735949E-04  0.880271406258762920E-02
+ -0.666666666666666970E-02 -0.415930173645343991E-03  0.874527436890885045E-02
+ -0.345835963857271027E-03 -0.452435882453698006E-03 -0.346466114750530014E-03
+  0.898436350079646028E-02 -0.666666666666666970E-02 -0.410333749350875977E-03
+  0.874024429470159937E-02 -0.526324780402007012E-03 -0.456977206374832004E-03
+ -0.594710214126934963E-03  0.921019309223168083E-02 -0.666666666666666970E-02
+ -0.349138390596867001E-03  0.808745258250311017E-02 -0.174389744426073004E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.493489741678000975E-03 -0.295632081822985011E-03  0.859314073796063962E-02
+ -0.666666666666666970E-02 -0.162165456144072008E-03  0.755894484280849981E-02
+ -0.104756642479310002E-10 -0.165141511878991989E-03 -0.152686212523292010E-08
+  0.822721248571258931E-02  0.925936813944002075E-02 -0.620622043504368980E-03
+ -0.666666666666666970E-02 -0.661330297689515954E-03  0.917272030438071040E-02
+ -0.666666666666666970E-02  0.917272030438071040E-02 -0.661330297689515954E-03
+ -0.620622043504368980E-03  0.925936813944002075E-02 -0.666666666666666970E-02
+ -0.162062370509243004E-03  0.755772622350076983E-02 -0.408856756901033998E-11
+ -0.164599404158889001E-03 -0.603371041254018047E-09  0.822717649469902924E-02
+ -0.164363072970947994E-03  0.822717536870335060E-02 -0.406417442325602006E-09
+ -0.164592544349163996E-03 -0.666666666666666970E-02 -0.411459868924217992E-09
+  0.822733545578738061E-02 -0.666666666666666970E-02 -0.435055225953795985E-03
+  0.926605052142811039E-02 -0.623035876737523049E-03 -0.435043161259378005E-03
+ -0.671607266693034040E-03  0.921717853066967956E-02 -0.666666666666666970E-02
+ -0.435040481662372024E-03  0.921687634507689993E-02 -0.671540853956346050E-03
+ -0.435057905352776017E-03 -0.622872544450323051E-03  0.926585805410279004E-02
+ -0.666666666666666970E-02 -0.435049192197708014E-03  0.921716876912447025E-02
+ -0.671606868732131960E-03  0.000000000000000000E+00 -0.435049195197822981E-03
+ -0.622886312493042956E-03  0.926588937934206061E-02 -0.666666666666666970E-02
+ -0.162062574467009994E-03  0.755721765477525027E-02 -0.413771902340413000E-11
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.164605458858507004E-03  0.000000000000000000E+00 -0.610774061235578027E-09
+  0.822717709793101964E-02 -0.162063129702431002E-03 -0.666666666666666970E-02
+  0.755721488286742026E-02 -0.415889664939935991E-11 -0.164608241410742993E-03
+ -0.613854083280822974E-09  0.822717557368093065E-02 -0.666666666666666970E-02
+ -0.162062006116684993E-03  0.755721765538701958E-02 -0.411615339006451024E-11
+ -0.164602628392850010E-03 -0.607637651237361961E-09  0.822717712956365019E-02
+ -0.435050838802116988E-03  0.926588470716894000E-02 -0.622885079077197044E-03
+ -0.435047548579851017E-03 -0.666666666666666970E-02 -0.671595813711089052E-03
+  0.921711477741266050E-02 -0.435047548601973024E-03 -0.666666666666666970E-02
+  0.921711980798223078E-02 -0.671600947327989010E-03 -0.435050838779994981E-03
+ -0.622889987523941044E-03  0.926588996280537994E-02 -0.435040278978450999E-03
+ -0.666666666666666970E-02  0.921687073978887973E-02 -0.671540718908447947E-03
+ -0.435058108018793015E-03 -0.622873586970733978E-03  0.926585856104042080E-02
+ -0.666666666666666970E-02 -0.435040282159379990E-03  0.921688118588532938E-02
+ -0.671551578142270001E-03 -0.435058104838147977E-03 -0.622883989557962013E-03
+  0.926586935714399963E-02 -0.164608242952759995E-03  0.822717557368590063E-02
+ -0.613859067359255033E-09 -0.162063125398168002E-03 -0.666666666666666970E-02
+ -0.415892283159718001E-11  0.755721256705853967E-02 -0.162062565682641010E-03
+ -0.666666666666666970E-02  0.755721268194402968E-02 -0.413779732258981993E-11
+ -0.164605464633444011E-03 -0.610787973952621006E-09  0.822717713109540061E-02
+ -0.666666666666666970E-02  0.895231589357397932E-02 -0.601006644779317027E-03
+ -0.606288347917921001E-03  0.923179444181716075E-02  0.921580902907083964E-02
+ -0.671118539167890951E-03 -0.666666666666666970E-02 -0.628092210114751027E-03
+  0.927168309130125927E-02 -0.174566225284372993E-03 -0.666666666666666970E-02
+  0.823503971979330958E-02 -0.510771887614783954E-06 -0.167453071907815989E-03
+ -0.287656487025054002E-06  0.822679160087593994E-02 -0.666666666666666970E-02
+ -0.162064999238019001E-03  0.755721705065465991E-02 -0.423106066889392027E-11
+ -0.164617538202307007E-03 -0.624346471180137971E-09  0.822717661766741992E-02
+ -0.435027297636500021E-03 -0.666666666666666970E-02  0.921651161171017051E-02
+ -0.671532969891762031E-03  0.000000000000000000E+00 -0.435071087356345989E-03
+ -0.622941366988572030E-03  0.926589066343610004E-02 -0.666666666666666970E-02
+ -0.435031386261265976E-03  0.921594142114651009E-02 -0.671460160302111021E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435066999544915002E-03  0.000000000000000000E+00 -0.622929217552493962E-03
+  0.926511198239901926E-02 -0.666666666666666970E-02 -0.162052976426746003E-03
+  0.755715649452497022E-02 -0.378464898874789026E-11 -0.164557847455290991E-03
+ -0.559380698263932047E-09  0.822711615636790086E-02  0.921569960061689036E-02
+ -0.671199549999528054E-03 -0.666666666666666970E-02 -0.624992481787264051E-03
+  0.926820615335867057E-02 -0.666666666666666970E-02 -0.170162248239514002E-03
+  0.823042259709222922E-02 -0.875293298501571007E-07 -0.166825903639511989E-03
+ -0.692727623674652971E-07  0.822728043870269918E-02 -0.666666666666666970E-02
+  0.900314235021752035E-02 -0.616359389498942999E-03 -0.610140243508493002E-03
+  0.923851413586198981E-02 -0.162042399166736006E-03 -0.666666666666666970E-02
+  0.755717693384080035E-02 -0.343592701208587996E-11 -0.164505035953650008E-03
+ -0.508562496748712960E-09  0.822712657547092045E-02 -0.666666666666666970E-02
+ -0.162064604852803001E-03  0.755721682250679039E-02 -0.421581532487358014E-11
+ -0.164615579255980999E-03 -0.622130465298954038E-09  0.822717706389368073E-02
+ -0.666666666666666970E-02 -0.435031888747001994E-03  0.921662497939955937E-02
+ -0.671514002290806984E-03  0.000000000000000000E+00 -0.435066497147766000E-03
+ -0.622895617170409984E-03  0.926586574231100073E-02 -0.666666666666666970E-02
+ -0.435031628074990023E-03  0.921600144320258025E-02 -0.671374321788007020E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435066757774081993E-03  0.000000000000000000E+00 -0.622827422523968997E-03
+  0.926517931888306938E-02 -0.666666666666666970E-02 -0.162168008296603987E-03
+  0.755720830543264981E-02 -0.111631538525960998E-10 -0.165130579439813012E-03
+ -0.162383789215979997E-08  0.822716472453222970E-02  0.822711943021576034E-02
+ -0.222017847039514990E-09 -0.666666666666666970E-02 -0.777760925671697071E-11
+  0.819196919955125953E-02 -0.666666666666666970E-02  0.917235354741821086E-02
+ -0.661205826288643948E-03 -0.620672923073943951E-03  0.926095692434543948E-02
+ -0.666666666666666970E-02 -0.162079935652635995E-03  0.755720153609774035E-02
+ -0.485738294134179040E-11 -0.164692040195761001E-03 -0.715311464965949002E-09
+  0.822716985670068034E-02 -0.166685978125188992E-03  0.822673928904943050E-02
+ -0.488041135101126002E-07 -0.166618503841219014E-03 -0.666666666666666970E-02
+ -0.388518789845462021E-07  0.822064815916740028E-02 -0.666666666666666970E-02
+ -0.434820802690177985E-03  0.921114685377608967E-02 -0.671312500692984007E-03
+ -0.435277323473812975E-03 -0.623870937176576037E-03  0.926677062396401990E-02
+ -0.666666666666666970E-02 -0.435040416088372018E-03  0.921688078845643056E-02
+ -0.671547258120006041E-03 -0.435057970921028016E-03 -0.622879055329303975E-03
+  0.926586475383493063E-02 -0.666666666666666970E-02 -0.435048641715767004E-03
+  0.921713990189115991E-02 -0.671590634431848974E-03  0.000000000000000000E+00
+ -0.435049745678237000E-03 -0.622873878413765037E-03  0.926587662260439961E-02
+ -0.666666666666666970E-02 -0.162065760605982007E-03  0.755721473567340981E-02
+ -0.425851668132086027E-11  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164621246760972002E-03  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.628327256337155983E-09
+  0.822716093119939956E-02 -0.162542321360015994E-03 -0.666666666666666970E-02
+  0.755650259353967020E-02 -0.562981055991819022E-09 -0.167001540638854007E-03
+ -0.773289585904606956E-07  0.822676761660280989E-02 -0.666666666666666970E-02
+ -0.162061328166368990E-03  0.755722295014580030E-02 -0.409106119013757001E-11
+ -0.164607216616360989E-03 -0.604245617915833026E-09  0.822718306150760012E-02
+ -0.435044697597018011E-03  0.921886702873469036E-02 -0.673104082698351962E-03
+ -0.435053689697235003E-03 -0.666666666666666970E-02 -0.624556261940502019E-03
+  0.926763947624762059E-02 -0.435053689776730007E-03 -0.666666666666666970E-02
+  0.926765139731208082E-02 -0.624573254433248028E-03 -0.435044697517509996E-03
+ -0.673283560783819986E-03  0.921873769877605033E-02 -0.435024190531306987E-03
+ -0.666666666666666970E-02  0.921794328133558952E-02 -0.672853471668093963E-03
+ -0.435074193732175026E-03 -0.624365177556240967E-03  0.926744434610436013E-02
+ -0.666666666666666970E-02 -0.435000466897472022E-03  0.921805866981871037E-02
+ -0.673973626915920021E-03 -0.435097908603312988E-03 -0.625447361535428985E-03
+  0.926834572012657024E-02 -0.167001904506701996E-03  0.822676773783739997E-02
+ -0.774505155958715954E-07 -0.162540904682313003E-03 -0.666666666666666970E-02
+ -0.563656255578501956E-09  0.755607546000543022E-02 -0.162061504581954011E-03
+ -0.666666666666666970E-02  0.755611035298445026E-02 -0.418942890780224988E-11
+ -0.164621967289905991E-03 -0.619216517642183046E-09  0.822718518765969999E-02
+ -0.666666666666666970E-02  0.920452787793652957E-02 -0.668731214720833010E-03
+ -0.622283241864520008E-03  0.926452697307078056E-02  0.822724025612167967E-02
+ -0.586193188243385978E-09 -0.666666666666666970E-02 -0.399370806297400039E-11
+  0.756944692405215041E-02 -0.162114939468458987E-03 -0.666666666666666970E-02
+  0.756941303972038983E-02 -0.521957340938646036E-11 -0.164756390961434999E-03
+ -0.760317731950115969E-09  0.822719049518791032E-02 -0.666666666666666970E-02
+ -0.162348256591775996E-03  0.757138537095577036E-02 -0.154279395379191983E-04
+ -0.530720557527397988E-03 -0.221706573191253990E-03  0.891607556388816086E-02
+ -0.163035589715460993E-03 -0.666666666666666970E-02  0.791125444579478081E-02
+ -0.568881226622524034E-05  0.000000000000000000E+00 -0.530385305835545987E-03
+ -0.644753151430826047E-04  0.878526723619102966E-02 -0.666666666666666970E-02
+ -0.399599558422803001E-03  0.843930969858660944E-02 -0.399745648940732980E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.464884917361518986E-03  0.000000000000000000E+00 -0.476898399198092998E-03
+  0.874546494059007078E-02 -0.666666666666666970E-02 -0.162063070389469993E-03
+  0.755351606433557997E-02 -0.447630187954819978E-11 -0.164644944937428005E-03
+ -0.661962104712968044E-09  0.822717544598733017E-02  0.926551780113546962E-02
+ -0.622722840997360001E-03 -0.666666666666666970E-02 -0.670824268645125988E-03
+  0.921370889062123923E-02 -0.666666666666666970E-02 -0.434945178559459974E-03
+  0.921154593743686925E-02 -0.668579923758682053E-03 -0.435153154613506974E-03
+ -0.620572207517385986E-03  0.926326406971371939E-02 -0.666666666666666970E-02
+  0.755816621993720978E-02 -0.411326355534032026E-11 -0.610348502456840988E-09
+  0.822725905587259020E-02 -0.402874184979866987E-03 -0.666666666666666970E-02
+  0.865106066171478054E-02 -0.472609065736268020E-03 -0.462635081887802994E-03
+ -0.557606013013843028E-03  0.917391259078658046E-02 -0.666666666666666970E-02
+ -0.162816160491244000E-03  0.755629729934226019E-02 -0.145267309791605992E-07
+ -0.168362009612906010E-03 -0.190258327748242995E-05  0.822792910350438923E-02
+ -0.666666666666666970E-02 -0.434980287177094997E-03  0.921591582004819936E-02
+ -0.672220539599956038E-03  0.000000000000000000E+00 -0.435118076428732989E-03
+ -0.623871208911141008E-03  0.926674480491154057E-02 -0.666666666666666970E-02
+ -0.391663109206836005E-03  0.857592364748300043E-02 -0.270377370496600986E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.470345934717641013E-03  0.000000000000000000E+00 -0.290023568937984979E-03
+  0.893922648721857081E-02 -0.666666666666666970E-02 -0.317994853290403008E-03
+  0.807402136257530031E-02 -0.163000017527141987E-03 -0.506466269749422031E-03
+ -0.315285081405852004E-03  0.895741238458845970E-02  0.822721524635505914E-02
+ -0.564092130899272960E-09 -0.666666666666666970E-02 -0.389036333446228008E-11
+  0.758050134032894028E-02 -0.666666666666666970E-02  0.758050134032894028E-02
+ -0.389036333446228008E-11 -0.564092130736643047E-09  0.822721524635505914E-02
+ -0.666666666666666970E-02 -0.162062220957438010E-03  0.755675930940021001E-02
+ -0.416185255233357985E-11 -0.164608241679856012E-03 -0.614515052693167971E-09
+  0.822717669744772034E-02 -0.435064264203551974E-03  0.926583708706145946E-02
+ -0.622864427576335982E-03 -0.435034122053574979E-03 -0.666666666666666970E-02
+ -0.671494672063580045E-03  0.921666478777227044E-02 -0.666666666666666970E-02
+ -0.435034122194007025E-03  0.921666834178432037E-02 -0.671498215917801949E-03
+ -0.435064264063141017E-03 -0.622867805184998033E-03  0.926584080673045936E-02
+ -0.666666666666666970E-02 -0.162064603057306009E-03  0.755721765249671960E-02
+ -0.421568895545965981E-11 -0.164615562604910991E-03 -0.622111714812958048E-09
+  0.822717714449142969E-02 -0.666666666666666970E-02 -0.435031767250997005E-03
+  0.921658395801952003E-02 -0.671474812281343020E-03  0.000000000000000000E+00
+ -0.435066618622617987E-03 -0.622858755966087044E-03  0.926582670501814067E-02
+ -0.666666666666666970E-02 -0.435031759994269993E-03  0.921598448039129964E-02
+ -0.671353611750878003E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066625877937976E-03  0.000000000000000000E+00
+ -0.622806885942570987E-03  0.926515741571650939E-02 -0.435040457277539980E-03
+ -0.666666666666666970E-02  0.921687744869944051E-02 -0.671542694130693044E-03
+ -0.435057929735475977E-03 -0.622874450455320979E-03  0.926585997023653997E-02
+ -0.666666666666666970E-02 -0.162062556267884004E-03  0.755721766013062960E-02
+ -0.413703217741733986E-11 -0.164605370008190008E-03 -0.610674234287949039E-09
+  0.822717713670985916E-02 -0.435057871804128975E-03  0.926586389852728995E-02
+ -0.622877919476854986E-03 -0.435040515213944027E-03 -0.666666666666666970E-02
+ -0.671546676413121014E-03  0.921688294468172957E-02 -0.164358973103396013E-03
+ -0.666666666666666970E-02  0.822694462574627937E-02 -0.400863023298765998E-09
+ -0.164360422527890990E-03 -0.404404639852252990E-09  0.822717693008181983E-02
+ -0.435040521994138000E-03 -0.666666666666666970E-02  0.921688317220892978E-02
+ -0.671546727788281000E-03 -0.435057865024523994E-03 -0.622877930192895020E-03
+  0.926586392288021934E-02 -0.666666666666666970E-02 -0.162062559830136006E-03
+  0.755721765142735018E-02 -0.413716779390587020E-11 -0.164605386233707005E-03
+ -0.610693931490910976E-09  0.822717713361387042E-02 -0.435057916250486973E-03
+  0.926586001868622000E-02 -0.622874471770934967E-03 -0.435040470763708000E-03
+ -0.666666666666666970E-02 -0.671542796319704962E-03  0.921687790126752074E-02
+ -0.164358941319385992E-03 -0.666666666666666970E-02  0.822694343451954926E-02
+ -0.400827305234235980E-09 -0.164360402068052007E-03 -0.404386932680776984E-09
+  0.822717693276291996E-02 -0.666666666666666970E-02  0.755853528834142963E-02
+ -0.410907237024926035E-11 -0.609517161481542976E-09  0.822725835180459011E-02
+  0.822699835769964000E-02 -0.229769146362661995E-09 -0.666666666666666970E-02
+ -0.552503881837150022E-11  0.815277410258053933E-02 -0.431800076739517009E-03
+ -0.666666666666666970E-02  0.911683250971320920E-02 -0.645107775821999975E-03
+ -0.438245882118603019E-03 -0.614761191522381957E-03  0.925173111512482028E-02
+ -0.666666666666666970E-02 -0.162066807230739002E-03  0.755721537399490963E-02
+ -0.430232877656178001E-11 -0.164626562646872987E-03 -0.634707547989976979E-09
+  0.822717705663869939E-02 -0.435011184600756027E-03 -0.666666666666666970E-02
+  0.921541718649479015E-02 -0.670832821913695961E-03  0.000000000000000000E+00
+ -0.435087195554581009E-03 -0.622361407627594967E-03  0.926525297165626920E-02
+ -0.666666666666666970E-02 -0.435024023596373988E-03  0.921463311000128051E-02
+ -0.670757573686644971E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435074360623453004E-03  0.000000000000000000E+00
+ -0.622355873651302019E-03  0.926393658659479917E-02 -0.666666666666666970E-02
+ -0.162274431703276004E-03  0.755738308037023996E-02 -0.318241339520051988E-10
+ -0.165730950021604001E-03 -0.457847648567586989E-08  0.822724281267087944E-02
+  0.822699408685815918E-02 -0.228425571690641000E-09 -0.666666666666666970E-02
+ -0.569261988097333975E-11  0.815841787771899064E-02 -0.666666666666666970E-02
+ -0.432079853778089007E-03  0.912452759982078018E-02 -0.647134253482418054E-03
+ -0.437974705900714988E-03 -0.615294574958598047E-03  0.925273449425248994E-02
+ -0.666666666666666970E-02  0.755849436233489019E-02 -0.410955655495479020E-11
+ -0.609618023537665964E-09  0.822725856237870035E-02 -0.162292777295443990E-03
+ -0.666666666666666970E-02  0.755742123691062993E-02 -0.382880604592767993E-10
+ -0.165829805992421012E-03 -0.549637808741221009E-08  0.822725133924320957E-02
+ -0.666666666666666970E-02 -0.162519834506453002E-03  0.758488071339646994E-02
+ -0.277296855958790984E-04 -0.531411092685664019E-03 -0.271305840096227018E-03
+  0.895289465200115042E-02 -0.666666666666666970E-02 -0.435144739153746991E-03
+  0.927049386352079077E-02 -0.627195195351904020E-03  0.000000000000000000E+00
+ -0.434953602544600011E-03 -0.673646030614466983E-03  0.921927161130219920E-02
+ -0.666666666666666970E-02 -0.162365832482225009E-03  0.770469472296904986E-02
+ -0.341374365570676015E-05  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.530093834704794035E-03  0.000000000000000000E+00
+ -0.670683663828369948E-04  0.878401698585971023E-02 -0.666666666666666970E-02
+ -0.406469353648676005E-03  0.868845378029185940E-02 -0.487445867647356014E-03
+ -0.459955485443886000E-03 -0.561455537392793009E-03  0.917876821487357934E-02
+  0.822724019224464927E-02 -0.586236000786575972E-09 -0.666666666666666970E-02
+ -0.399393159268467017E-11  0.756942275692288041E-02 -0.666666666666666970E-02
+  0.756938783566538965E-02 -0.398490203650706992E-11 -0.582276441424578042E-09
+  0.822717710481313952E-02 -0.666666666666666970E-02 -0.162064188321971013E-03
+  0.755638508819920975E-02 -0.426955668367828011E-11 -0.164621776697154991E-03
+ -0.630372643058242999E-09  0.822717559459556005E-02 -0.435097665976917993E-03
+  0.926770146147764921E-02 -0.624846565957563015E-03 -0.435000709641257001E-03
+ -0.666666666666666970E-02 -0.673347123713348948E-03  0.921745137370448032E-02
+ -0.666666666666666970E-02 -0.164221437092461006E-03  0.822590307325241077E-02
+ -0.301540502789892983E-09 -0.164276463598209013E-03 -0.317267646894687990E-09
+  0.822720896113244074E-02 -0.666666666666666970E-02 -0.435040484785290994E-03
+  0.921687621618137026E-02 -0.671540621958410973E-03 -0.435057902230129995E-03
+ -0.622872302682165040E-03  0.926585782249825934E-02 -0.666666666666666970E-02
+ -0.162127341684403998E-03  0.759121275229191969E-02 -0.381774204087762988E-11
+  0.000000000000000000E+00 -0.164583013632055004E-03 -0.545969305874372025E-09
+  0.822717722380156974E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166319316569640000E-03  0.755721776450155036E-02 -0.125637202975271002E-10
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166854313971363013E-03
+  0.000000000000000000E+00 -0.695766372129872957E-09  0.759121342784058965E-02
+ -0.435007341079701976E-03 -0.666666666666666970E-02  0.921767423502365020E-02
+ -0.673397510122010013E-03 -0.435091037539719026E-03 -0.624857046911454035E-03
+  0.926772529096031930E-02 -0.666666666666666970E-02 -0.435042780427774989E-03
+  0.921682237061306926E-02 -0.671414330503463958E-03 -0.435055606761622013E-03
+ -0.622737221386204017E-03  0.926573013279845085E-02 -0.164512437178604990E-03
+  0.822720912015355993E-02 -0.478141451497207985E-09 -0.162035132712287004E-03
+ -0.666666666666666970E-02 -0.321987108752259996E-11  0.755727832217654039E-02
+ -0.162065357686432991E-03 -0.666666666666666970E-02  0.755727582940853970E-02
+ -0.423979675954428002E-11 -0.164618725863945991E-03 -0.625584793967118990E-09
+  0.822717479197860972E-02 -0.162035093479730998E-03 -0.666666666666666970E-02
+  0.755725620859361031E-02 -0.322006451958004010E-11 -0.164512451389381995E-03
+ -0.478178423500335003E-09  0.822720912019037943E-02 -0.666666666666666970E-02
+ -0.162063731980656992E-03  0.755721533834744982E-02 -0.418199541718481972E-11
+ -0.164611237647245009E-03 -0.617212851732219973E-09  0.822717573520458968E-02
+ -0.435072205240009014E-03  0.926779361315624960E-02 -0.624886867601484965E-03
+ -0.435026179501724010E-03 -0.666666666666666970E-02 -0.673540811814344947E-03
+  0.921830845677890955E-02 -0.435026047901024976E-03 -0.666666666666666970E-02
+  0.921631021320170989E-02 -0.671342130465355002E-03 -0.435072336809685012E-03
+ -0.622763518226994005E-03  0.926572086598713082E-02 -0.666666666666666970E-02
+  0.755741246850622026E-02 -0.410910644194978984E-11 -0.606515170668247041E-09
+  0.822717712895315936E-02  0.822711877443555936E-02 -0.222069534738639993E-09
+ -0.666666666666666970E-02 -0.774169806703398991E-11  0.819168186272310056E-02
+ -0.433633789741900023E-03 -0.666666666666666970E-02  0.917151071300197074E-02
+ -0.660635211420097053E-03 -0.436454594164531018E-03 -0.620192759941486029E-03
+  0.926043761605101039E-02 -0.666666666666666970E-02 -0.435400642028349984E-03
+  0.924569487161715914E-02 -0.602675681802749023E-03 -0.434697126892320004E-03
+ -0.641341627123005026E-03  0.919049697819046031E-02 -0.417766802147069983E-03
+ -0.666666666666666970E-02  0.884119022200588987E-02 -0.565196699593361013E-03
+  0.000000000000000000E+00 -0.450926780658111002E-03 -0.597865634330250052E-03
+  0.921780464690032976E-02 -0.666666666666666970E-02 -0.162927680290126010E-03
+  0.761083369281832008E-02 -0.416350633852703977E-04  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.530230645664085047E-03
+  0.000000000000000000E+00 -0.269827347276391011E-03  0.866009136742803935E-02
+ -0.666666666666666970E-02 -0.162144345304456006E-03  0.760092331739828028E-02
+ -0.320930758282627016E-05 -0.529813749405098054E-03 -0.838256824281321020E-04
+  0.879776792041750931E-02  0.822717682684740922E-02 -0.303187285772573992E-09
+ -0.666666666666666970E-02 -0.349301427021032004E-11  0.790044609360178972E-02
+ -0.666666666666666970E-02 -0.407938480218889010E-03  0.871084858846607989E-02
+ -0.512295469801531947E-03 -0.458837927071481999E-03 -0.588085060255285046E-03
+  0.920289209649226975E-02 -0.666666666666666970E-02  0.755933306797651038E-02
+ -0.410021674909237023E-11 -0.607789044576712979E-09  0.822725774217296021E-02
+ -0.162167689058960993E-03 -0.666666666666666970E-02  0.755666327531443957E-02
+ -0.112741283803980995E-10 -0.165135297900302988E-03 -0.164049635527082004E-08
+  0.822716577111987958E-02 -0.666666666666666970E-02 -0.415926985437479015E-03
+  0.874524524800943982E-02 -0.345826974275356008E-03 -0.452438516787343978E-03
+ -0.346461802416189973E-03  0.898436011623418053E-02 -0.666666666666666970E-02
+ -0.410332540159175974E-03  0.874022409518679030E-02 -0.526301953402972958E-03
+  0.000000000000000000E+00 -0.456978154604879979E-03 -0.594686220254121993E-03
+  0.921017399119665979E-02 -0.666666666666666970E-02 -0.349138358539420027E-03
+  0.808745335846144074E-02 -0.174389979817568010E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.493489760022879976E-03
+  0.000000000000000000E+00 -0.295632770892417004E-03  0.859314451744194047E-02
+ -0.666666666666666970E-02 -0.162165456200227012E-03  0.755894485257972964E-02
+ -0.104756674728628994E-10 -0.165141511934384990E-03 -0.152686256191205990E-08
+  0.822721248562563977E-02  0.925936813792594023E-02 -0.620622042811125991E-03
+ -0.666666666666666970E-02 -0.661330294502400018E-03  0.917272029101801056E-02
+ -0.666666666666666970E-02  0.917272029101801056E-02 -0.661330294502400018E-03
+ -0.620622042811125991E-03  0.925936813792594023E-02 -0.666666666666666970E-02
+ -0.162059451012688989E-03  0.755773564896643039E-02 -0.398147961980706018E-11
+ -0.164599233536076003E-03 -0.588248966932235027E-09  0.822718777261857021E-02
+ -0.435010710626149014E-03  0.921784913653543939E-02 -0.671757497579554964E-03
+ -0.435087669354308976E-03 -0.666666666666666970E-02 -0.624205044887739973E-03
+  0.926731518984568077E-02 -0.666666666666666970E-02 -0.435087680804149997E-03
+  0.926689880496680933E-02 -0.623818561754816976E-03 -0.435010699170602000E-03
+ -0.671588999254513967E-03  0.921721221696823051E-02 -0.666666666666666970E-02
+ -0.162064554135907989E-03  0.755721778431040023E-02 -0.421379742453608961E-11
+ -0.164615465489623012E-03 -0.621841597575447999E-09  0.822717725634031921E-02
+ -0.666666666666666970E-02 -0.435032270858094002E-03  0.921659999696208994E-02
+ -0.671477624444696982E-03  0.000000000000000000E+00 -0.435066115102212007E-03
+ -0.622858575415140996E-03  0.926582763556333018E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435031835174660986E-03  0.921598350730943976E-02
+ -0.671332923411385946E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435066550710658023E-03  0.000000000000000000E+00 -0.622784369281448010E-03
+  0.926515556236343074E-02 -0.435081584415368018E-03 -0.666666666666666970E-02
+  0.926715509609260003E-02 -0.624058431079940984E-03 -0.435016797724696992E-03
+ -0.671759302322477020E-03  0.921784300407063953E-02 -0.666666666666666970E-02
+ -0.162062416434052993E-03  0.755721809516292017E-02 -0.413174197731702039E-11
+ -0.164605215334601997E-03 -0.609922581957833024E-09  0.822717754034535051E-02
+ -0.435056252509799003E-03  0.926585458068619923E-02 -0.622864166386205044E-03
+ -0.435042134635998002E-03 -0.666666666666666970E-02 -0.671542142824929958E-03
+  0.921692265814959971E-02 -0.164341185001132012E-03 -0.666666666666666970E-02
+  0.822700548023182036E-02 -0.388254230807434987E-09 -0.164366486837535988E-03
+ -0.391288606812102024E-09  0.822719359654751001E-02 -0.435043745553257018E-03
+ -0.666666666666666970E-02  0.921697677898357035E-02 -0.671554360684184981E-03
+ -0.435054641693514987E-03 -0.622866717131421977E-03  0.926586042794482964E-02
+ -0.666666666666666970E-02 -0.162059863500320009E-03  0.755722860146506008E-02
+ -0.403707215866304025E-11 -0.164610171594745992E-03 -0.596715205847239044E-09
+  0.822719066601751078E-02 -0.435029181351461001E-03  0.921782667546000058E-02
+ -0.671761723962046014E-03 -0.435069204038055987E-03 -0.666666666666666970E-02
+ -0.623757152955008967E-03  0.926682695577680030E-02 -0.165176244427055993E-03
+ -0.666666666666666970E-02  0.822774267939753966E-02 -0.421637432761214996E-09
+ -0.164358900259774010E-03 -0.403642926186002997E-09  0.822717766237675979E-02
+ -0.666666666666666970E-02  0.895231589389712014E-02 -0.601006644879792970E-03
+ -0.606288347943506004E-03  0.923179444186011944E-02  0.921580902981316945E-02
+ -0.671118538607027963E-03 -0.666666666666666970E-02 -0.628092231229955013E-03
+  0.927168311539479026E-02 -0.174566249107139995E-03 -0.666666666666666970E-02
+  0.823503974641245050E-02 -0.510772714070998049E-06 -0.167453071908487002E-03
+ -0.287656386989065009E-06  0.822679160061134084E-02 -0.666666666666666970E-02
+ -0.162064983519178004E-03  0.755722213005034001E-02 -0.423003049664173980E-11
+ -0.164617409769547002E-03 -0.624194129888606984E-09  0.822717667703955040E-02
+ -0.435027350434486978E-03 -0.666666666666666970E-02  0.921651333906866981E-02
+ -0.671533317952461976E-03  0.000000000000000000E+00 -0.435071034569932023E-03
+ -0.622941399756069946E-03  0.926589080870137034E-02 -0.666666666666666970E-02
+ -0.435031629619492017E-03  0.921595126475713983E-02 -0.671462224228685047E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435066756229832998E-03  0.000000000000000000E+00 -0.622929617617789958E-03
+  0.926511475629528992E-02 -0.666666666666666970E-02 -0.162052976330494004E-03
+  0.755715655404278971E-02 -0.378464136913797002E-11 -0.164557846396870007E-03
+ -0.559379553656162034E-09  0.822711615671048967E-02  0.921569960267614070E-02
+ -0.671199548462878990E-03 -0.666666666666666970E-02 -0.624992541148623954E-03
+  0.926820621881743051E-02 -0.666666666666666970E-02 -0.170162317751058011E-03
+  0.823042266464544918E-02 -0.875297413337334996E-07 -0.166825903611445003E-03
+ -0.692727437231052063E-07  0.822728043872133011E-02 -0.666666666666666970E-02
+  0.900314235197411002E-02 -0.616359390013905962E-03 -0.610140243636164963E-03
+  0.923851413610003031E-02 -0.162042399128667987E-03 -0.666666666666666970E-02
+  0.755717695498461014E-02 -0.343592447834058983E-11 -0.164505035561990999E-03
+ -0.508562114575286970E-09  0.822712657561797990E-02 -0.666666666666666970E-02
+ -0.162064626628578005E-03  0.755722018474085013E-02 -0.421639430439463977E-11
+ -0.164615674182357004E-03 -0.622213619794602018E-09  0.822717716142255980E-02
+ -0.666666666666666970E-02 -0.435031360634292993E-03  0.921661780771376032E-02
+ -0.671522125948738010E-03  0.000000000000000000E+00 -0.435067025167480974E-03
+ -0.622906555198169992E-03  0.926587475408661987E-02 -0.666666666666666970E-02
+ -0.435031920197820993E-03  0.921597580576540010E-02 -0.671351984885895973E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435066465702252001E-03  0.000000000000000000E+00 -0.622805921827540012E-03
+  0.926514231021871061E-02 -0.666666666666666970E-02 -0.162062001713370989E-03
+  0.755721764997798959E-02 -0.411598727890175012E-11 -0.164602606520337013E-03
+ -0.607613494988126020E-09  0.822717713025281072E-02  0.926588367987988062E-02
+ -0.622883813196616022E-03 -0.666666666666666970E-02 -0.671594845205971042E-03
+  0.921711546375830054E-02 -0.666666666666666970E-02  0.921711546375830054E-02
+ -0.671594845205971042E-03 -0.622883813196616022E-03  0.926588367987988062E-02
+ -0.666666666666666970E-02 -0.162080643229434011E-03  0.755719622314534028E-02
+ -0.488882760495459993E-11 -0.164695578667720994E-03 -0.719871721533777035E-09
+  0.822716411325388926E-02 -0.166686373905860005E-03  0.822673885725208956E-02
+ -0.487733280019655016E-07 -0.166616817170293989E-03 -0.666666666666666970E-02
+ -0.385414488502229971E-07  0.822045888465093924E-02 -0.666666666666666970E-02
+ -0.434813234051737991E-03  0.920937424335109035E-02 -0.669894558723525000E-03
+ -0.435284874465938004E-03 -0.622580915854730019E-03  0.926510963371094017E-02
+ -0.666666666666666970E-02 -0.435040408248740026E-03  0.921688082460559062E-02
+ -0.671547530054927020E-03 -0.435057978759970999E-03 -0.622879363153527985E-03
+  0.926586503625058920E-02 -0.666666666666666970E-02 -0.435048354812829995E-03
+  0.921714230585230984E-02 -0.671602199049514052E-03  0.000000000000000000E+00
+ -0.435050032579175011E-03 -0.622886590993160022E-03  0.926588809980974024E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.162065782294302011E-03
+  0.755721459868404031E-02 -0.425813299026563987E-11 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164621303266333996E-03  0.000000000000000000E+00
+ -0.628265660483508001E-09  0.822715314376032923E-02 -0.162542375670815992E-03
+ -0.666666666666666970E-02  0.755649757850690006E-02 -0.563432271384059970E-09
+ -0.167001893968643994E-03 -0.773903559881119996E-07  0.822676727708255076E-02
+ -0.666666666666666970E-02 -0.162062879664528998E-03  0.755721695458686990E-02
+ -0.414938028726372967E-11 -0.164606984614506006E-03 -0.612470146931394021E-09
+  0.822717691631253030E-02 -0.435061921280182988E-03  0.926591210512174059E-02
+ -0.622933977614011981E-03 -0.435036465303411009E-03 -0.666666666666666970E-02
+ -0.671580853268202043E-03  0.921680734331616937E-02 -0.435036519090423993E-03
+ -0.666666666666666970E-02  0.921831642815977917E-02 -0.673202190294888052E-03
+ -0.435061867500153025E-03 -0.624494985629193999E-03  0.926748120028762952E-02
+ -0.435006921432357974E-03 -0.666666666666666970E-02  0.921581685576767021E-02
+ -0.671357037929466998E-03 -0.435091457008391985E-03 -0.622887259624104036E-03
+  0.926580572699929952E-02 -0.666666666666666970E-02 -0.435000210414426987E-03
+  0.921805741531904997E-02 -0.673980656048901966E-03 -0.435098164960824000E-03
+ -0.625455737994957050E-03  0.926835240170618968E-02 -0.167002256657801997E-03
+  0.822676739801933045E-02 -0.775116142162273040E-07 -0.162540963549468999E-03
+ -0.666666666666666970E-02 -0.564105828717064959E-09  0.755607184337712002E-02
+ -0.162063151808471010E-03 -0.666666666666666970E-02  0.755610609873923027E-02
+ -0.425254382354864998E-11 -0.164619383929850006E-03 -0.628041057868921975E-09
+  0.822717599205483041E-02 -0.666666666666666970E-02  0.921687630329096064E-02
+ -0.671540851769114968E-03 -0.622872550846322991E-03  0.926585805654763991E-02
+  0.926585809314046070E-02 -0.622872566927465052E-03 -0.666666666666666970E-02
+ -0.671540928853904041E-03  0.921687664468394917E-02 -0.164358960620165987E-03
+ -0.666666666666666970E-02  0.822694395609338026E-02 -0.400847008848852017E-09
+ -0.164360413027222997E-03 -0.404398713049403977E-09  0.822717692924596067E-02
+ -0.666666666666666970E-02 -0.162064640983030012E-03  0.755718903530966032E-02
+ -0.421955570049609974E-11 -0.164616035944705001E-03 -0.622688578366344997E-09
+  0.822717711474709075E-02 -0.435031324368629995E-03 -0.666666666666666970E-02
+  0.921656922225446047E-02 -0.671471582389523958E-03  0.000000000000000000E+00
+ -0.435067061426646024E-03 -0.622858176290813008E-03  0.926582523607523932E-02
+ -0.666666666666666970E-02 -0.435031048387666001E-03  0.921594565739619967E-02
+ -0.671345384731177964E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435067337357646024E-03  0.000000000000000000E+00
+ -0.622804669702495955E-03  0.926513810679983028E-02 -0.666666666666666970E-02
+ -0.162062558200820996E-03  0.755721755342539988E-02 -0.413711391502012990E-11
+ -0.164605379088546999E-03 -0.610686125296504951E-09  0.822717713332750053E-02
+  0.926585808945160939E-02 -0.622872565296268030E-03 -0.666666666666666970E-02
+ -0.671540921029493000E-03  0.921687661003382959E-02 -0.666666666666666970E-02
+ -0.164358960452099987E-03  0.822694392878840081E-02 -0.400846613572565987E-09
+ -0.164360413029369988E-03 -0.404398732358932989E-09  0.822717692925123943E-02
+ -0.666666666666666970E-02  0.921687630355970053E-02 -0.671540851829773044E-03
+ -0.622872550859014987E-03  0.926585805657732971E-02 -0.162062558227438999E-03
+ -0.666666666666666970E-02  0.755721760853244034E-02 -0.413711033678103961E-11
+ -0.164605378673800013E-03 -0.610685584929304044E-09  0.822717713318102915E-02
+ -0.666666666666666970E-02 -0.162064570152263006E-03  0.755721841422958996E-02
+ -0.421433625240463005E-11 -0.164615390632404009E-03 -0.621914612647070953E-09
+  0.822717706877345960E-02 -0.666666666666666970E-02 -0.435032309673597981E-03
+  0.921659272513397979E-02 -0.671468254207514999E-03  0.000000000000000000E+00
+ -0.435066076293283009E-03 -0.622849275891169040E-03  0.926581888559491022E-02
+ -0.666666666666666970E-02 -0.435031684645659992E-03  0.921601778227170050E-02
+ -0.671377596560439956E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066701213360013E-03  0.000000000000000000E+00
+ -0.622828682917879006E-03  0.926519565944300058E-02 -0.666666666666666970E-02
+ -0.162168008367528997E-03  0.755720834454587983E-02 -0.111631510700418996E-10
+ -0.165130579339017988E-03 -0.162383743141729992E-08  0.822716472424545042E-02
+  0.822711943021554003E-02 -0.222017845615760010E-09 -0.666666666666666970E-02
+ -0.777761013879667033E-11  0.819196920656370918E-02 -0.666666666666666970E-02
+  0.917235355722251927E-02 -0.661205828628974026E-03 -0.620672923579346962E-03
+  0.926095692540444999E-02 -0.666666666666666970E-02 -0.162060886928861006E-03
+  0.755722230561571041E-02 -0.407434722062596981E-11 -0.164603197375618002E-03
+ -0.601754536458248027E-09  0.822718170485291916E-02 -0.435038460514972026E-03
+  0.921717884367924054E-02 -0.671598994291641997E-03 -0.435059926303323985E-03
+ -0.666666666666666970E-02 -0.623144536207649959E-03  0.926616805245615051E-02
+ -0.666666666666666970E-02 -0.435059886891442024E-03  0.926772984879662039E-02
+ -0.624637266544077001E-03 -0.435038499931220991E-03 -0.673028756927541039E-03
+  0.921880428832917015E-02 -0.666666666666666970E-02 -0.162064590679631006E-03
+  0.755721768483237979E-02 -0.421520958094493002E-11 -0.164615528433844000E-03
+ -0.622042930544206962E-09  0.822717716750567991E-02 -0.666666666666666970E-02
+ -0.435032046708114988E-03  0.921657978568275055E-02 -0.671461221312615953E-03
+  0.000000000000000000E+00 -0.435066339213917980E-03 -0.622843930781788978E-03
+  0.926581370321850027E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435031628760092985E-03  0.921598912055172914E-02 -0.671353905916793046E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066757089098022E-03
+  0.000000000000000000E+00 -0.622806875303639953E-03  0.926516708273138027E-02
+ -0.435042591383769009E-03 -0.666666666666666970E-02  0.921694193442373043E-02
+ -0.671550566383684018E-03 -0.435055795793300019E-03 -0.622869755363486023E-03
+  0.926586026640358919E-02 -0.666666666666666970E-02 -0.162061536519942994E-03
+  0.755722454624959024E-02 -0.409902564328030991E-11 -0.164610432002776987E-03
+ -0.605475143914787990E-09  0.822718468701886044E-02 -0.435048024953900003E-03
+  0.921879226944885034E-02 -0.673030814417662025E-03 -0.435050362434787020E-03
+ -0.666666666666666970E-02 -0.624403907899169952E-03  0.926747756344729022E-02
+ -0.164429321124159010E-03 -0.666666666666666970E-02  0.822722510694912958E-02
+ -0.405099938099922984E-09 -0.164359494664249005E-03 -0.403602349458583981E-09
+  0.822717730152964025E-02 -0.435057319269020976E-03 -0.666666666666666970E-02
+  0.926766299189552001E-02 -0.624575498879247960E-03 -0.435041067795715979E-03
+ -0.673029869981998019E-03  0.921880276298027955E-02 -0.666666666666666970E-02
+ -0.162062471480618010E-03  0.755721793381790036E-02 -0.413381786270217965E-11
+ -0.164605232829300992E-03 -0.610216091166618047E-09  0.822717734939633002E-02
+ -0.435056876315470997E-03  0.926585635253173064E-02 -0.622868044974361963E-03
+ -0.435041510784235001E-03 -0.666666666666666970E-02 -0.671542372337881029E-03
+  0.921690563981085974E-02 -0.164351356240546013E-03 -0.666666666666666970E-02
+  0.822698086373702987E-02 -0.395512544131405997E-09 -0.164365831071158996E-03
+ -0.398742006637490978E-09  0.822718605874551978E-02 -0.666666666666666970E-02
+  0.920452787846717975E-02 -0.668731214842509985E-03 -0.622283241890150981E-03
+  0.926452697312771072E-02  0.822724025612022077E-02 -0.586193181292780024E-09
+ -0.666666666666666970E-02 -0.399370802774403963E-11  0.756944692768340022E-02
+ -0.162114939474388000E-03 -0.666666666666666970E-02  0.756941304335114005E-02
+ -0.521957327686064991E-11 -0.164756390953382006E-03 -0.760317714252139982E-09
+  0.822719049518821043E-02 -0.666666666666666970E-02 -0.162348325334986989E-03
+  0.757141350179527019E-02 -0.154261696442177995E-04 -0.530720487223206049E-03
+ -0.221684447280562997E-03  0.891605622167588978E-02 -0.163035610806201990E-03
+ -0.666666666666666970E-02  0.791126088236420930E-02 -0.568886536946596036E-05
+  0.000000000000000000E+00 -0.530385316755896006E-03 -0.644747260841582991E-04
+  0.878526676544224942E-02 -0.666666666666666970E-02 -0.399601245888043025E-03
+  0.843932649296406934E-02 -0.399753779534292998E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.464883727737099017E-03
+  0.000000000000000000E+00 -0.476904229282087023E-03  0.874547695183781025E-02
+ -0.666666666666666970E-02 -0.162063070557396998E-03  0.755351614246298989E-02
+ -0.447630170821152977E-11 -0.164644944992450995E-03 -0.661962042461270997E-09
+  0.822717544552603076E-02  0.926551780075880044E-02 -0.622722840857442951E-03
+ -0.666666666666666970E-02 -0.670824267990035965E-03  0.921370888772457021E-02
+ -0.666666666666666970E-02 -0.434945178474266999E-03  0.921154593499040955E-02
+ -0.668579923648182010E-03 -0.435153154698610990E-03 -0.620572207908768009E-03
+  0.926326406979680050E-02 -0.666666666666666970E-02  0.755816621971812028E-02
+ -0.411326358485724017E-11 -0.610348502915667010E-09  0.822725905587247917E-02
+ -0.402874267847195980E-03 -0.666666666666666970E-02  0.865106149194994067E-02
+ -0.472609392802310021E-03 -0.462635021140100986E-03 -0.557606090580912007E-03
+  0.917391269482082002E-02 -0.666666666666666970E-02 -0.162519833129987003E-03
+  0.758488053948729989E-02 -0.277295971224945994E-04 -0.531411087991590981E-03
+ -0.271305613723866980E-03  0.895289446740019958E-02 -0.666666666666666970E-02
+ -0.435144823516233985E-03  0.927048838466438055E-02 -0.627189812839501980E-03
+  0.000000000000000000E+00 -0.434953518101383974E-03 -0.673638210139792994E-03
+  0.921926445312559045E-02 -0.666666666666666970E-02 -0.162365832936652005E-03
+  0.770469512965636042E-02 -0.341376585662229018E-05  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.530094362034539009E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.670698858845583960E-04
+  0.878401949557726033E-02 -0.666666666666666970E-02 -0.406469382005018019E-03
+  0.868845408484301973E-02 -0.487445981816212999E-03 -0.459955463954967011E-03
+ -0.561455560363951947E-03  0.917876824764385021E-02  0.822724019224600929E-02
+ -0.586235995361394981E-09 -0.666666666666666970E-02 -0.399393156536155037E-11
+  0.756942275977688043E-02 -0.666666666666666970E-02  0.756938783851883977E-02
+ -0.398490198193408992E-11 -0.582276435921294038E-09  0.822717710481313952E-02
+ -0.666666666666666970E-02 -0.162061804589790998E-03  0.755638842160883967E-02
+ -0.417671676259941005E-11 -0.164610667790356010E-03 -0.616892111647492956E-09
+  0.822717774708076922E-02 -0.435068239353668981E-03  0.926581925277365943E-02
+ -0.622854391016698997E-03 -0.435030146223642990E-03 -0.666666666666666970E-02
+ -0.671460680594175010E-03  0.921652801901830070E-02 -0.666666666666666970E-02
+ -0.164225650703184010E-03  0.822670097400921008E-02 -0.310328437504517979E-09
+ -0.164276003216507990E-03 -0.316769467002113981E-09  0.822720900471031989E-02
+ -0.666666666666666970E-02 -0.435040486303126981E-03  0.921687614266927975E-02
+ -0.671540506442383023E-03 -0.435057900712426009E-03 -0.622872183625812047E-03
+  0.926585769785132074E-02 -0.666666666666666970E-02 -0.162127332716032000E-03
+  0.759121301448910032E-02 -0.381743951397943036E-11  0.000000000000000000E+00
+ -0.164582975768345007E-03 -0.545926699323485040E-09  0.822717722734729075E-02
+ -0.666666666666666970E-02 -0.166319286578935988E-03  0.755721781846689991E-02
+ -0.125531050075483999E-10  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.166854284080451002E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.695180386014204010E-09  0.759121369168153960E-02
+ -0.435042566545895985E-03 -0.666666666666666970E-02  0.921694463459228966E-02
+ -0.671554769008383002E-03 -0.435055820629525977E-03 -0.622874016968664036E-03
+  0.926586380297488021E-02 -0.666666666666666970E-02 -0.435042888369998996E-03
+  0.921681962552989981E-02 -0.671408260658942963E-03 -0.435055498826277974E-03
+ -0.622730755410115045E-03  0.926572389917782943E-02 -0.164512123628282998E-03
+  0.822720916387005043E-02 -0.477817077089683953E-09 -0.162035062407616011E-03
+ -0.666666666666666970E-02 -0.321765105888902011E-11  0.755728112204505018E-02
+ -0.162062504982868002E-03 -0.666666666666666970E-02  0.755727880376912999E-02
+ -0.413009842789627016E-11 -0.164604675325971005E-03 -0.609640565546088004E-09
+  0.822717725918549990E-02 -0.162035022530451994E-03 -0.666666666666666970E-02
+  0.755725864235686981E-02 -0.321784768823900019E-11 -0.164512138073065988E-03
+ -0.477854634654902035E-09  0.822720916390744933E-02 -0.666666666666666970E-02
+ -0.162061944628342011E-03  0.755721772719453004E-02 -0.411382630968498961E-11
+ -0.164602365207998006E-03 -0.607300598025874993E-09  0.822717716454718015E-02
+ -0.435050067144334015E-03  0.926588448108721027E-02 -0.622883111379359043E-03
+ -0.435048320247372999E-03 -0.666666666666666970E-02 -0.671598373882453998E-03
+  0.921713778918750032E-02 -0.435048320032475997E-03 -0.666666666666666970E-02
+  0.921705200694796986E-02 -0.671504927255521023E-03 -0.435050067359229987E-03
+ -0.622795092030723960E-03  0.926579461935501947E-02 -0.666666666666666970E-02
+  0.755741246850972007E-02 -0.410910644191072979E-11 -0.606515170660810997E-09
+  0.822717712895315936E-02  0.822711877443554028E-02 -0.222069534616361010E-09
+ -0.666666666666666970E-02 -0.774169825927849969E-11  0.819168186428156052E-02
+ -0.433633789807636014E-03 -0.666666666666666970E-02  0.917151071499614987E-02
+ -0.660635211889124972E-03 -0.436454594099722020E-03 -0.620192760036731021E-03
+  0.926043761627064026E-02 -0.666666666666666970E-02 -0.435401639669350015E-03
+  0.924572083824642954E-02 -0.602697496588707024E-03 -0.434696125740512986E-03
+ -0.641341546899128006E-03  0.919049825465466028E-02 -0.417766807487059990E-03
+ -0.666666666666666970E-02  0.884119030057015955E-02 -0.565196723638367016E-03
+  0.000000000000000000E+00 -0.450926776155331018E-03 -0.597865637200630960E-03
+  0.921780465508625045E-02 -0.666666666666666970E-02 -0.162927680984810988E-03
+  0.761083385223151974E-02 -0.416350559257179991E-04  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.530230645905115008E-03
+  0.000000000000000000E+00 -0.269827250117159992E-03  0.866009133064084974E-02
+ -0.666666666666666970E-02 -0.162144345329803000E-03  0.760092333284733031E-02
+ -0.320930747823896000E-05 -0.529813749349416032E-03 -0.838256764278110937E-04
+  0.879776791477051949E-02  0.822717682684740922E-02 -0.303187284199640013E-09
+ -0.666666666666666970E-02 -0.349301427958089990E-11  0.790044609725218047E-02
+ -0.666666666666666970E-02 -0.407938480851945009E-03  0.871084859569853931E-02
+ -0.512295472619773030E-03 -0.458837926585046020E-03 -0.588085060897069957E-03
+  0.920289209739394085E-02 -0.666666666666666970E-02  0.755933306799180978E-02
+ -0.410021688445045033E-11 -0.607789044544431015E-09  0.822725774217296021E-02
+ -0.162167689058370998E-03 -0.666666666666666970E-02  0.755666327735462964E-02
+ -0.112741278002712006E-10 -0.165135297873814004E-03 -0.164049626541977997E-08
+  0.822716577112805013E-02 -0.666666666666666970E-02 -0.162508948582128004E-03
+  0.758386230747792964E-02 -0.268846850272863999E-04 -0.531370159696699959E-03
+ -0.268798452226511010E-03  0.895121067622118953E-02 -0.666666666666666970E-02
+ -0.435111783295980990E-03  0.926957338892927044E-02 -0.626355975132891958E-03
+  0.000000000000000000E+00 -0.434986584483092024E-03 -0.673610961683688029E-03
+  0.921919033051740028E-02 -0.666666666666666970E-02 -0.162383112105014989E-03
+  0.770842507081398020E-02 -0.352838184825081003E-05  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.529946733926804961E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.685200481248618984E-04
+  0.878522933538209046E-02 -0.666666666666666970E-02 -0.406001492103280994E-03
+  0.868371047881797038E-02 -0.486532578576920989E-03 -0.460309450350815017E-03
+ -0.562408172008129988E-03  0.917930422133162068E-02  0.822717710596379027E-02
+ -0.583431679178089002E-09 -0.666666666666666970E-02 -0.399067558641347968E-11
+  0.756879025163890995E-02 -0.666666666666666970E-02  0.756882506046350958E-02
+ -0.399965820178691986E-11 -0.587373439708311021E-09  0.822723979745249023E-02
+ -0.666666666666666970E-02 -0.162059699334328987E-03  0.755640315367181025E-02
+ -0.409724983315023040E-11 -0.164617694447567010E-03 -0.605894425669846960E-09
+  0.822719082319614027E-02 -0.435046673499388009E-03  0.921780363530167943E-02
+ -0.671765264155256970E-03 -0.435051713864318020E-03 -0.666666666666666970E-02
+ -0.623330396350863002E-03  0.926636396497984918E-02 -0.666666666666666970E-02
+ -0.164528465340170010E-03  0.822727665463759972E-02 -0.426431688727269020E-09
+ -0.164384495696545988E-03 -0.423252858884490006E-09  0.822718001752816008E-02
+ -0.666666666666666970E-02 -0.435040505956391999E-03  0.921687628114545993E-02
+ -0.671540025560727998E-03 -0.435057881060873976E-03 -0.622871599674077966E-03
+  0.926585723223368081E-02 -0.666666666666666970E-02 -0.162127506812547009E-03
+  0.759121355987753960E-02 -0.382292666777456993E-11  0.000000000000000000E+00
+ -0.164583701911017989E-03 -0.546697495368834967E-09  0.822717719106172071E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166319165150470994E-03
+  0.755721813537886013E-02 -0.125075381051564997E-10 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.166854154167308990E-03  0.000000000000000000E+00
+ -0.692662263886114992E-09  0.759121431463458986E-02 -0.435081853759711993E-03
+ -0.666666666666666970E-02  0.926716255638974050E-02 -0.624065278105386955E-03
+ -0.435016528292601001E-03 -0.671759479502790952E-03  0.921784374777362935E-02
+ -0.666666666666666970E-02 -0.435040907342144000E-03  0.921689077912838002E-02
+ -0.671540798041408006E-03 -0.435057479709268001E-03 -0.622869631331658029E-03
+  0.926586003647436943E-02 -0.164634692602426999E-03  0.822718023129201045E-02
+ -0.639569061694972014E-09 -0.162067618394035989E-03 -0.666666666666666970E-02
+ -0.433481102637178975E-11  0.755721907361002043E-02 -0.162058744590199013E-03
+ -0.666666666666666970E-02  0.755722007835428998E-02 -0.399695343689710995E-11
+ -0.164609059967339006E-03 -0.591014713275666970E-09  0.822719388300061083E-02
+ -0.162067628996523003E-03 -0.666666666666666970E-02  0.755722473679793037E-02
+ -0.433474416210013021E-11 -0.164634688823378012E-03 -0.639556359277955005E-09
+  0.822718023127935044E-02 -0.666666666666666970E-02 -0.162059007940064013E-03
+  0.755722761558702029E-02 -0.400519574062733029E-11 -0.164602739721675997E-03
+ -0.591972127525160032E-09  0.822718830313987938E-02 -0.435017256528589982E-03
+  0.921784067930350032E-02 -0.671758935795494969E-03 -0.435081125759231022E-03
+ -0.666666666666666970E-02 -0.624046237027197953E-03  0.926714180760297018E-02
+ -0.435081137126827990E-03 -0.666666666666666970E-02  0.926672070407219030E-02
+ -0.623654480728591011E-03 -0.435017245156463000E-03 -0.671584693353717023E-03
+  0.921719987714082929E-02 -0.666666666666666970E-02  0.755944258238581013E-02
+ -0.409900186657922026E-11 -0.607550069673094021E-09  0.822725761878752987E-02
+  0.822717683107230917E-02 -0.307429328465473987E-09 -0.666666666666666970E-02
+ -0.346590495801865012E-11  0.788944074290687074E-02 -0.405969478945487005E-03
+ -0.666666666666666970E-02  0.868864830866255926E-02 -0.503334330728072020E-03
+ -0.460337024732346983E-03 -0.585683665883423958E-03  0.919951404153332084E-02
+ -0.666666666666666970E-02 -0.434976282613521999E-03  0.922412114032335022E-02
+ -0.680891393674205956E-03 -0.435122078171959979E-03 -0.632218355554151046E-03
+  0.927546870226302056E-02 -0.163258162677796989E-03 -0.666666666666666970E-02
+  0.771427722386641007E-02 -0.477633227286587011E-04  0.000000000000000000E+00
+ -0.532374011844484007E-03 -0.264816258060028011E-03  0.893589130313274076E-02
+ -0.666666666666666970E-02 -0.163022377118752999E-03  0.756010678634908007E-02
+ -0.985505930025391935E-05  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.528465266297688968E-03  0.000000000000000000E+00
+ -0.112888567375700995E-03  0.818271007019684068E-02  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.162166578290559013E-03  0.755933922022227964E-02
+ -0.104753666208369006E-10 -0.165099058432877988E-03 -0.152245985024684009E-08
+  0.822717141733115068E-02  0.822711899107518921E-02 -0.221645138348997998E-09
+ -0.666666666666666970E-02 -0.803318982792332002E-11  0.819386807586616923E-02
+ -0.666666666666666970E-02 -0.433725919557807001E-03  0.917430862025363052E-02
+ -0.661288984210802052E-03 -0.436363721932067020E-03 -0.620322135837134050E-03
+  0.926073953967261956E-02 -0.666666666666666970E-02  0.755741928801698017E-02
+ -0.410903028639137974E-11 -0.606500685137881987E-09  0.822717712893872993E-02
+ -0.162225051674325002E-03 -0.666666666666666970E-02  0.762801683545900975E-02
+ -0.369132323199304980E-05 -0.529778141458443004E-03 -0.883337008316910978E-04
+  0.880271404367008979E-02 -0.666666666666666970E-02 -0.435040483790848998E-03
+  0.921687610102163955E-02 -0.671540507590834041E-03 -0.435057903224484984E-03
+ -0.622872195907222962E-03  0.926585773544527994E-02 -0.666666666666666970E-02
+ -0.435049596668294974E-03  0.926588177948937074E-02 -0.622878658890590954E-03
+  0.000000000000000000E+00 -0.435048790726422978E-03 -0.671587306492918033E-03
+  0.921715304894545062E-02 -0.666666666666666970E-02 -0.162062557266168006E-03
+  0.755721780085659000E-02 -0.413875860912190023E-11  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.164620130721840991E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.611410160549998955E-09
+  0.822718806718837972E-02 -0.666666666666666970E-02 -0.435040499350135008E-03
+  0.921687692625696060E-02 -0.671540975224692001E-03 -0.435057887666556015E-03
+ -0.622872560192916044E-03  0.926585810471101945E-02  0.822717712936562977E-02
+ -0.606929315017626020E-09 -0.666666666666666970E-02 -0.411128311855796991E-11
+  0.755721765969236993E-02 -0.666666666666666970E-02  0.755721765969236993E-02
+ -0.411128306434785967E-11 -0.606929314990520966E-09  0.822717712936562977E-02
+ -0.666666666666666970E-02 -0.162059344493046002E-03  0.755722651317146995E-02
+ -0.401759691005694024E-11 -0.164603667382743994E-03 -0.593754382165260968E-09
+  0.822718774646260045E-02 -0.435021349583778003E-03  0.921808799379093997E-02
+ -0.672054642719300036E-03 -0.435077033929117025E-03 -0.666666666666666970E-02
+ -0.624215698660223004E-03  0.926731300229175978E-02 -0.666666666666666970E-02
+ -0.435077064835482003E-03  0.926661685822192938E-02 -0.623558994505915955E-03
+ -0.435021318668023997E-03 -0.671590322333262958E-03  0.921719752068604972E-02
+ -0.666666666666666970E-02 -0.162064557104371002E-03  0.755721777860633021E-02
+ -0.421391246707965999E-11 -0.164615474626227001E-03 -0.621858123915516982E-09
+  0.822717725212621966E-02 -0.666666666666666970E-02 -0.435032241858439002E-03
+  0.921659906098254954E-02 -0.671477447178317022E-03  0.000000000000000000E+00
+ -0.435066144096944024E-03 -0.622858570800819051E-03  0.926582756926139055E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435031827990191983E-03
+  0.921598377863167006E-02 -0.671334364038543986E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066557893876995E-03  0.000000000000000000E+00
+ -0.622785914397349987E-03  0.926515598810159942E-02 -0.435075666026484009E-03
+ -0.666666666666666970E-02  0.926727820268466972E-02 -0.624183768120280973E-03
+ -0.435022717858528992E-03 -0.672055614043404951E-03  0.921808835078249955E-02
+ -0.666666666666666970E-02 -0.162062424733906989E-03  0.755721807683177975E-02
+ -0.413205652975882023E-11 -0.164605235421495002E-03 -0.609967620239067977E-09
+  0.822717752452148052E-02 -0.435056352038405984E-03  0.926585352741150087E-02
+ -0.622863300988667017E-03 -0.435042035100298007E-03 -0.666666666666666970E-02
+ -0.671540654928718013E-03  0.921691864794477005E-02 -0.164343513344571992E-03
+ -0.666666666666666970E-02  0.822700195448362002E-02 -0.389924360631551020E-09
+ -0.164367623390296991E-03 -0.392998923185469007E-09  0.822719276781523970E-02
+ -0.435043552264714024E-03 -0.666666666666666970E-02  0.921696961930900012E-02
+ -0.671552161840081006E-03 -0.435054834971315978E-03 -0.622865703361334027E-03
+  0.926585903533608951E-02 -0.666666666666666970E-02 -0.162060264459924991E-03
+  0.755722786475403977E-02 -0.405192380789587974E-11 -0.164611230243440009E-03
+ -0.598848077831665017E-09  0.822718997281340035E-02 -0.435034019503558974E-03
+  0.921807335886390064E-02 -0.672057800833721053E-03 -0.435064366738565982E-03
+ -0.666666666666666970E-02 -0.623908223929953956E-03  0.926697860106714955E-02
+ -0.164987547210470009E-03 -0.666666666666666970E-02  0.822761164777149022E-02
+ -0.417375828718230023E-09 -0.164359000422478990E-03 -0.403612204340233986E-09
+  0.822717763383406034E-02 -0.666666666666666970E-02  0.755721765245576018E-02
+ -0.411128322661895974E-11 -0.606929330387248952E-09  0.822717712936565058E-02
+  0.822717692676105082E-02 -0.401916206499038982E-09 -0.666666666666666970E-02
+ -0.398504312842813992E-09  0.822695178414330035E-02 -0.435040783009246019E-03
+ -0.666666666666666970E-02  0.921688646246055995E-02 -0.671543142540194956E-03
+ -0.435057604031761002E-03 -0.622873025744601029E-03  0.926585914161285988E-02
+ -0.666666666666666970E-02 -0.162064588603687992E-03  0.755722246650228965E-02
+ -0.421473369469323961E-11 -0.164615514901262993E-03 -0.621972777019227005E-09
+  0.822717720033716050E-02 -0.435031817237567986E-03 -0.666666666666666970E-02
+  0.921658561270818043E-02 -0.671475161541192035E-03  0.000000000000000000E+00
+ -0.435066568644765022E-03 -0.622858805909676998E-03  0.926582686247801046E-02
+ -0.666666666666666970E-02 -0.435031985152347994E-03  0.921599362553296962E-02
+ -0.671355526523753957E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435066400758952999E-03  0.000000000000000000E+00
+ -0.622807256895068981E-03  0.926516002397628075E-02 -0.666666666666666970E-02
+ -0.162062535395593997E-03  0.755721770745760007E-02 -0.413623360119686986E-11
+ -0.164605263979841011E-03 -0.610558026872576020E-09  0.822717713332134053E-02
+  0.822717692676101960E-02 -0.401916174204150005E-09 -0.666666666666666970E-02
+ -0.398504881043303978E-09  0.822695182391632029E-02 -0.666666666666666970E-02
+ -0.435040784478542004E-03  0.921688651148403967E-02 -0.671543153316058990E-03
+ -0.435057602562588996E-03 -0.622873027716196984E-03  0.926585914660035988E-02
+ -0.666666666666666970E-02  0.755721765260307984E-02 -0.411128317076067032E-11
+ -0.606929330073791030E-09  0.822717712936565058E-02 -0.162062535451855009E-03
+ -0.666666666666666970E-02  0.755721767183212961E-02 -0.413623865254959015E-11
+ -0.164605264611767001E-03 -0.610558775336228996E-09  0.822717713310498061E-02
+ -0.666666666666666970E-02 -0.435041508530557993E-03  0.921691025951686983E-02
+ -0.671547950168372028E-03 -0.435056878568975021E-03 -0.622873496063732976E-03
+  0.926586118314657974E-02 -0.666666666666666970E-02 -0.435049193102497007E-03
+  0.921716879431322932E-02 -0.671606868811568960E-03  0.000000000000000000E+00
+ -0.435049194293033988E-03 -0.622886307249136953E-03  0.926588937729599070E-02
+ -0.666666666666666970E-02 -0.162062558223755992E-03  0.755721793600594959E-02
+ -0.413708123041369019E-11  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164605375314667000E-03  0.000000000000000000E+00
+ -0.610681172826885034E-09  0.822717711923300066E-02 -0.666666666666666970E-02
+ -0.435040497369072016E-03  0.921687685990914968E-02 -0.671540960664754978E-03
+ -0.435057889647446998E-03 -0.622872557534397998E-03  0.926585809770509930E-02
+  0.822717712936562977E-02 -0.606929315458585975E-09 -0.666666666666666970E-02
+ -0.411128312059151024E-11  0.755721765951058999E-02 -0.666666666666666970E-02
+  0.755721765951058999E-02 -0.411128314769655971E-11 -0.606929315377271019E-09
+  0.822717712936562977E-02 -0.666666666666666970E-02 -0.162062535431107012E-03
+  0.755721765815363031E-02 -0.413623888851339997E-11 -0.164605264643440992E-03
+ -0.610558828302725956E-09  0.822717713301761994E-02 -0.164360410638444992E-03
+  0.822717692977796913E-02 -0.404391259451850998E-09 -0.164359006270221988E-03
+ -0.666666666666666970E-02 -0.400958477955193996E-09  0.822695178639428967E-02
+ -0.666666666666666970E-02 -0.435040782988183007E-03  0.921688646176349081E-02
+ -0.671543142412070991E-03 -0.435057604052822009E-03 -0.622873025744216029E-03
+  0.926585914154554914E-02 -0.666666666666666970E-02 -0.435040480198357025E-03
+  0.921687629980787008E-02 -0.671540848012874966E-03 -0.435057906816663025E-03
+ -0.622872547218613964E-03  0.926585805277396063E-02 -0.666666666666666970E-02
+ -0.435049192310166017E-03  0.921716878050548964E-02 -0.671606876994400976E-03
+  0.000000000000000000E+00 -0.435049195085364978E-03 -0.622886319709903963E-03
+  0.926588938772177073E-02 -0.666666666666666970E-02 -0.162062559106551998E-03
+  0.755721765241205989E-02 -0.413713446085067003E-11  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.164605382365712999E-03
+  0.000000000000000000E+00 -0.610689035956021986E-09  0.822717709595601962E-02
+ -0.162062558807452007E-03 -0.666666666666666970E-02  0.755721765294025023E-02
+ -0.413712892374228960E-11 -0.164605381125477004E-03 -0.610688254193982979E-09
+  0.822717713365142024E-02 -0.666666666666666970E-02 -0.162062001820446010E-03
+  0.755721765238299981E-02 -0.411599096858730035E-11 -0.164602607028427003E-03
+ -0.607614049978665960E-09  0.822717713004911949E-02 -0.435050784134680991E-03
+  0.926588367152923927E-02 -0.622883807494963005E-03 -0.435047603248173024E-03
+ -0.666666666666666970E-02 -0.671594832025931951E-03  0.921711541901995036E-02
+ -0.435047603248167982E-03 -0.666666666666666970E-02  0.921711541808560922E-02
+ -0.671594830956834992E-03 -0.435050784134685979E-03 -0.622883806457666020E-03
+  0.926588367056311972E-02 -0.435040480225729010E-03 -0.666666666666666970E-02
+  0.921687630059584047E-02 -0.671540848078642995E-03 -0.435057906789294022E-03
+ -0.622872547125350026E-03  0.926585805273653050E-02 -0.666666666666666970E-02
+ -0.435040480222318978E-03  0.921687628821601067E-02 -0.671540835117597997E-03
+ -0.435057906792703025E-03 -0.622872534696777965E-03  0.926585803993784998E-02
+ -0.164605381122261992E-03  0.822717713365140983E-02 -0.610688243907033049E-09
+ -0.162062558816420988E-03 -0.666666666666666970E-02 -0.413712878804995986E-11
+  0.755721765777050969E-02 -0.162062559114418996E-03 -0.666666666666666970E-02
+  0.755721765773097014E-02 -0.413714007422677992E-11 -0.164605382603210006E-03
+ -0.610689880891928972E-09  0.822717713311491017E-02 -0.666666666666666970E-02
+  0.755721765237515972E-02 -0.411128320041550968E-11 -0.606929330558722976E-09
+  0.822717712936565058E-02  0.822717692676106990E-02 -0.401916224778808988E-09
+ -0.666666666666666970E-02 -0.398503993603989025E-09  0.822695176179714938E-02
+ -0.435040782183969974E-03 -0.666666666666666970E-02  0.921688643492011954E-02
+ -0.671543136482110042E-03 -0.435057604856967008E-03 -0.622873024631880002E-03
+  0.926585913880642077E-02 -0.666666666666666970E-02 -0.162064603069876005E-03
+  0.755721765255115002E-02 -0.421568943808864994E-11 -0.164615562666949012E-03
+ -0.622111784951490950E-09  0.822717714448729931E-02 -0.435031767249488988E-03
+ -0.666666666666666970E-02  0.921658397566323985E-02 -0.671474830577318974E-03
+  0.000000000000000000E+00 -0.435066618624126004E-03 -0.622858773502819960E-03
+  0.926582672350118042E-02 -0.666666666666666970E-02 -0.435031759838531019E-03
+  0.921598447237031992E-02 -0.671353608087835044E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435066626033650983E-03
+  0.000000000000000000E+00 -0.622806883370278998E-03  0.926515741219141944E-02
+ -0.666666666666666970E-02 -0.162062535484635997E-03  0.755721765240888969E-02
+ -0.413624141678407034E-11 -0.164605264966746989E-03 -0.610559194915996025E-09
+  0.822717713298093921E-02  0.822717692676106990E-02 -0.401916224778717016E-09
+ -0.666666666666666970E-02 -0.398503993605691001E-09  0.822695176179727081E-02
+ -0.666666666666666970E-02 -0.435040782183975015E-03  0.921688643492027046E-02
+ -0.671543136482143001E-03 -0.435057604856962997E-03 -0.622873024631885965E-03
+  0.926585913880642945E-02 -0.666666666666666970E-02  0.755721765237515972E-02
+ -0.411128320041550968E-11 -0.606929330558722046E-09  0.822717712936565058E-02
+ -0.162062535484635997E-03 -0.666666666666666970E-02  0.755721765240875958E-02
+ -0.413624138969446992E-11 -0.164605264966748995E-03 -0.610559194918309029E-09
+  0.822717713298093921E-02 -0.666666666666666970E-02 -0.162064573740720012E-03
+  0.755721745014789007E-02 -0.421455659253393022E-11 -0.164615418100806996E-03
+ -0.621947116183705050E-09  0.822717706997160014E-02 -0.666666666666666970E-02
+ -0.435032332380832999E-03  0.921659235812519011E-02 -0.671466933340958026E-03
+  0.000000000000000000E+00 -0.435066053589888019E-03 -0.622847839674434958E-03
+  0.926581781614685060E-02 -0.666666666666666970E-02 -0.435031592540386985E-03
+  0.921601550080636961E-02 -0.671377077483803031E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066793302425986E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.622828624049149020E-03
+  0.926519622533869026E-02 -0.666666666666666970E-02 -0.162168008363252009E-03
+  0.755720833616017017E-02 -0.111631527935513995E-10 -0.165130579415455000E-03
+ -0.162383770179478996E-08  0.822716472435720998E-02  0.822711943027826069E-02
+ -0.222017847415367999E-09 -0.666666666666666970E-02 -0.777760943851021986E-11
+  0.819196920090728073E-02 -0.666666666666666970E-02  0.917235354987984940E-02
+ -0.661205826876668044E-03 -0.620672923200048048E-03  0.926095692459277982E-02
+ -0.666666666666666970E-02 -0.162061842522603000E-03  0.755722298561666960E-02
+ -0.411041893471449994E-11 -0.164609505054689987E-03 -0.607053573850319957E-09
+  0.822718286444302019E-02 -0.164360468540171998E-03  0.822717730472606081E-02
+ -0.403903416083619976E-09 -0.164359675113830009E-03 -0.666666666666666970E-02
+ -0.403282765837450980E-09  0.822713708288526011E-02 -0.666666666666666970E-02
+ -0.435047234874819013E-03  0.921870444385484988E-02 -0.672996776069070043E-03
+ -0.435051152501487017E-03 -0.624363760097220040E-03  0.926743532429376987E-02
+ -0.666666666666666970E-02 -0.435040488105227008E-03  0.921687626274407014E-02
+ -0.671540573261780044E-03 -0.435057898910482974E-03 -0.622872236309804958E-03
+  0.926585776729671982E-02 -0.666666666666666970E-02 -0.435049476964798998E-03
+  0.926588407933098997E-02 -0.622880979701605024E-03  0.000000000000000000E+00
+ -0.435048910430331005E-03 -0.671593403370661043E-03  0.921715753812691989E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.162062536856723012E-03
+  0.755721779084045971E-02 -0.413747795884770991E-11 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164615649658616002E-03  0.000000000000000000E+00
+ -0.611079896253776960E-09  0.822718482294398055E-02 -0.162062474956520004E-03
+ -0.666666666666666970E-02  0.755722267180921042E-02 -0.413358698004224000E-11
+ -0.164605418170935013E-03 -0.610187187425962033E-09  0.822717750858028077E-02
+ -0.666666666666666970E-02 -0.162060451582619013E-03  0.755722364032399008E-02
+ -0.405820744876872962E-11 -0.164603101435012986E-03 -0.599472333441493042E-09
+  0.822718324339584070E-02 -0.435033610082050993E-03  0.921880734854766022E-02
+ -0.673021763224571968E-03 -0.435064776097052002E-03 -0.666666666666666970E-02
+ -0.624752427388953016E-03  0.926785482617738070E-02 -0.435064833463209995E-03
+ -0.666666666666666970E-02  0.926629223154662972E-02 -0.623259249505167968E-03
+ -0.435033552706628004E-03 -0.671592121883009046E-03  0.921718022533242996E-02
+ -0.435057725512486980E-03 -0.666666666666666970E-02  0.926766886039601979E-02
+ -0.624580707385012017E-03 -0.435040661518347027E-03 -0.673023701802679037E-03
+  0.921879973034194986E-02 -0.666666666666666970E-02 -0.435040745621057988E-03
+  0.921687745366213916E-02 -0.671533468502133989E-03 -0.435057641416789994E-03
+ -0.622863794394841984E-03  0.926585102903257005E-02 -0.164605418982737001E-03
+  0.822717750858287938E-02 -0.610189797184088988E-09 -0.162062472691372001E-03
+ -0.666666666666666970E-02 -0.413360072924052010E-11  0.755722145179233036E-02
+ -0.162060895421523010E-03 -0.666666666666666970E-02  0.755722209603739998E-02
+ -0.407539453490869988E-11 -0.164609527091146007E-03 -0.602110183541000016E-09
+  0.822718635496286037E-02 -0.666666666666666970E-02  0.920452788146493978E-02
+ -0.668731215529775008E-03 -0.622283242035163997E-03  0.926452697345470956E-02
+  0.822724025612128936E-02 -0.586193175774719989E-09 -0.666666666666666970E-02
+ -0.399370800068332996E-11  0.756944693050668957E-02 -0.162114939472544992E-03
+ -0.666666666666666970E-02  0.756941304617413969E-02 -0.521957294732459965E-11
+ -0.164756390920850005E-03 -0.760317657384023048E-09  0.822719049519187069E-02
+ -0.666666666666666970E-02 -0.162348328933429009E-03  0.757141579588467995E-02
+ -0.154258694392289985E-04 -0.530720471615547980E-03 -0.221681707782894001E-03
+  0.891605396648051038E-02 -0.163035609389389007E-03 -0.666666666666666970E-02
+  0.791126060525454075E-02 -0.568885700258806994E-05  0.000000000000000000E+00
+ -0.530385314277834956E-03 -0.644746756760560056E-04  0.878526670786249943E-02
+ -0.666666666666666970E-02 -0.399601863075062019E-03  0.843933027528507955E-02
+ -0.399755487421508981E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.464883291727197006E-03  0.000000000000000000E+00
+ -0.476904595508027981E-03  0.874547668628808915E-02 -0.666666666666666970E-02
+ -0.162063070476976004E-03  0.755351620828405030E-02 -0.447629245041502000E-11
+ -0.164644943926219012E-03 -0.661960659410746001E-09  0.822717544592907989E-02
+  0.926551780966223919E-02 -0.622722844759921010E-03 -0.666666666666666970E-02
+ -0.670824286631610035E-03  0.921370896996778939E-02 -0.666666666666666970E-02
+ -0.434945180954293003E-03  0.921154601677883975E-02 -0.668579941724334022E-03
+ -0.435153152221171024E-03 -0.620572211261187045E-03  0.926326407816084954E-02
+ -0.666666666666666970E-02  0.755816621995536019E-02 -0.411326355512401980E-11
+ -0.610348502411631949E-09  0.822725905587248958E-02 -0.402874286571233979E-03
+ -0.666666666666666970E-02  0.865106167785427947E-02 -0.472609462967472005E-03
+ -0.462635007412455013E-03 -0.557606102878732972E-03  0.917391271174717964E-02
+ -0.666666666666666970E-02 -0.415928551473747994E-03  0.874525952715160017E-02
+ -0.345831822895715981E-03 -0.452437222882241992E-03 -0.346464452179155984E-03
+  0.898436212884005936E-02 -0.666666666666666970E-02 -0.410333854430279001E-03
+  0.874024567848459034E-02 -0.526325481552128048E-03  0.000000000000000000E+00
+ -0.456977123849370008E-03 -0.594710599463255975E-03  0.921019356669105019E-02
+ -0.666666666666666970E-02 -0.349138128782108001E-03  0.808745191949900022E-02
+ -0.174389554292269988E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.493489859954412968E-03  0.000000000000000000E+00
+ -0.295632059850581025E-03  0.859314138531184005E-02 -0.666666666666666970E-02
+ -0.162165456183634003E-03  0.755894482058337962E-02 -0.104756736090630003E-10
+ -0.165141512110283999E-03 -0.152686347573174001E-08  0.822721248554279978E-02
+  0.925936813557192914E-02 -0.620622041734691043E-03 -0.666666666666666970E-02
+ -0.661330289554338960E-03  0.917272027027182975E-02 -0.666666666666666970E-02
+  0.917272027027182975E-02 -0.661330289554338960E-03 -0.620622041734691043E-03
+  0.925936813557192914E-02 -0.666666666666666970E-02 -0.162062426567090008E-03
+  0.755772612433826957E-02 -0.409070240552687966E-11 -0.164599684884186013E-03
+ -0.603681595043669966E-09  0.822717664154948985E-02 -0.435043942803598025E-03
+  0.921717692320900978E-02 -0.671606591937570947E-03 -0.435054444453763980E-03
+ -0.666666666666666970E-02 -0.623016030658318006E-03  0.926602914632313965E-02
+ -0.666666666666666970E-02 -0.435054444436950010E-03  0.926603032447032016E-02
+ -0.623017161092738014E-03 -0.435043942820411995E-03 -0.671607789864231051E-03
+  0.921717803397737075E-02 -0.666666666666666970E-02 -0.162064603090850004E-03
+  0.755721765255309031E-02 -0.421569022865129025E-11 -0.164615562771554003E-03
+ -0.622111903849669951E-09  0.822717714451166003E-02 -0.666666666666666970E-02
+ -0.435031767124851004E-03  0.921658398188585082E-02 -0.671474840199063022E-03
+  0.000000000000000000E+00 -0.435066618748741979E-03 -0.622858783402943975E-03
+  0.926582673394005067E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435031759684571979E-03  0.921598447096053965E-02 -0.671353614127903012E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066626187581996E-03
+  0.000000000000000000E+00 -0.622806890458470035E-03  0.926515741526657936E-02
+ -0.435040444163285018E-03 -0.666666666666666970E-02  0.921687548937809992E-02
+ -0.671541060679635036E-03 -0.435057942848580980E-03 -0.622872964977058013E-03
+  0.926585833344844949E-02 -0.666666666666666970E-02 -0.162062561393839003E-03
+  0.755721764798701970E-02 -0.413722734407884995E-11 -0.164605394042161002E-03
+ -0.610702580324128951E-09  0.822717713158752952E-02 -0.435057935435147018E-03
+  0.926585952661160978E-02 -0.622874138897695035E-03 -0.435040451577369989E-03
+ -0.666666666666666970E-02 -0.671542323723484992E-03  0.921687685921447966E-02
+ -0.164358967946371006E-03 -0.666666666666666970E-02  0.822694291289794051E-02
+ -0.400842285147718976E-09 -0.164360426857548987E-03 -0.404409911340958016E-09
+  0.822717692852258965E-02 -0.435040431638602989E-03 -0.666666666666666970E-02
+  0.921687619006436934E-02 -0.671542172631628951E-03 -0.435057955372164984E-03
+ -0.622874107379275976E-03  0.926585945492748933E-02 -0.666666666666666970E-02
+ -0.162062560447055011E-03  0.755721764904289991E-02 -0.413719132263620016E-11
+ -0.164605389320235000E-03 -0.610697341609479036E-09  0.822717713218053953E-02
+ -0.435057923606845009E-03  0.926585840260774063E-02 -0.622872995394380970E-03
+ -0.435040463406708008E-03 -0.666666666666666970E-02 -0.671541206497535000E-03
+  0.921687613516897035E-02 -0.164358974674477006E-03 -0.666666666666666970E-02
+  0.822694323093815967E-02 -0.400850494105121994E-09 -0.164360431601409999E-03
+ -0.404413311485434000E-09  0.822717692781809069E-02 -0.666666666666666970E-02
+  0.895231589357087069E-02 -0.601006644778351003E-03 -0.606288347917667949E-03
+  0.923179444181664033E-02  0.921580902911038960E-02 -0.671118539139058980E-03
+ -0.666666666666666970E-02 -0.628092211209406968E-03  0.927168309255030007E-02
+ -0.174566226528932001E-03 -0.666666666666666970E-02  0.823503972118425076E-02
+ -0.510771931197153037E-06 -0.167453071908171011E-03 -0.287656482013064974E-06
+  0.822679160086088948E-02 -0.666666666666666970E-02 -0.162064998742904011E-03
+  0.755721692802432000E-02 -0.423105196131728011E-11 -0.164617536966823991E-03
+ -0.624345265608089027E-09  0.822717661914844009E-02 -0.435027295434020991E-03
+ -0.666666666666666970E-02  0.921651153587084958E-02 -0.671532950972917025E-03
+  0.000000000000000000E+00 -0.435071089558342007E-03 -0.622941361344091053E-03
+  0.926589065346502053E-02 -0.666666666666666970E-02 -0.435031391217050999E-03
+  0.921594150620621937E-02 -0.671460176695444977E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435066994590015989E-03
+  0.000000000000000000E+00 -0.622929213193418037E-03  0.926511191021490987E-02
+ -0.666666666666666970E-02 -0.162052976422443997E-03  0.755715649690166975E-02
+ -0.378464866461816970E-11 -0.164557847410658995E-03 -0.559380649971562964E-09
+  0.822711615636883935E-02  0.921569960071171035E-02 -0.671199549931674020E-03
+ -0.666666666666666970E-02 -0.624992484434852037E-03  0.926820615627817028E-02
+ -0.666666666666666970E-02 -0.170162251258758009E-03  0.823042260002626071E-02
+ -0.875293478015414056E-07 -0.166825903638244990E-03 -0.692727617230331017E-07
+  0.822728043872173083E-02 -0.666666666666666970E-02  0.900314235025987016E-02
+ -0.616359389511348982E-03 -0.610140243511595988E-03  0.923851413586814982E-02
+ -0.162042399164606009E-03 -0.666666666666666970E-02  0.755717693475327010E-02
+ -0.343592691293592996E-11 -0.164505035934295996E-03 -0.508562477958249036E-09
+  0.822712657547360927E-02 -0.666666666666666970E-02 -0.435040441298520982E-03
+  0.921688002980029959E-02 -0.671545791530904035E-03 -0.435057945713094023E-03
+ -0.622877507553345035E-03  0.926586316856804065E-02 -0.666666666666666970E-02
+ -0.162131095725141989E-03  0.759120433096500003E-02 -0.393925786733449011E-11
+  0.000000000000000000E+00 -0.164599730797555013E-03 -0.563072851826958952E-09
+  0.822717559878123963E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166319227572208993E-03  0.755721552651014965E-02 -0.125030561412552005E-10
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166854118797604998E-03
+  0.000000000000000000E+00 -0.692387796644269027E-09  0.759120514631322994E-02
+ -0.666666666666666970E-02 -0.162099247780077998E-03  0.755721979757678023E-02
+ -0.581369082613002011E-11 -0.164789517015928995E-03 -0.853933169653901039E-09
+  0.822717846337374034E-02  0.926237298745190979E-02 -0.621979742880892045E-03
+ -0.666666666666666970E-02 -0.667643955617555022E-03  0.919963617413947950E-02
+ -0.666666666666666970E-02  0.920119114136109043E-02 -0.667964673290031966E-03
+ -0.622121226735743030E-03  0.926416149733888007E-02 -0.666666666666666970E-02
+ -0.435000220488568974E-03  0.921804320947046917E-02 -0.673963456108928050E-03
+ -0.435098154891607001E-03 -0.625438906427754982E-03  0.926833740265235045E-02
+ -0.167002100358977013E-03  0.822676745335870936E-02 -0.774836785822800982E-07
+ -0.162540956180643993E-03 -0.666666666666666970E-02 -0.563903073588569969E-09
+  0.755607876483108988E-02 -0.666666666666666970E-02 -0.162059390613440994E-03
+  0.755611424124089017E-02 -0.410936949933791987E-11 -0.164621823131188991E-03
+ -0.607895080715946992E-09  0.822719290698344964E-02 -0.666666666666666970E-02
+ -0.162064679580902008E-03  0.755721760719274029E-02 -0.421866212845835019E-11
+  0.000000000000000000E+00 -0.164615944113866006E-03 -0.622543990942232019E-09
+  0.822717713446135049E-02 -0.666666666666666970E-02 -0.435031107052827001E-03
+  0.921656310812869056E-02 -0.671471001355917964E-03  0.000000000000000000E+00
+ -0.435067278703290996E-03 -0.622858834069494032E-03  0.926582569486444040E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435031446660309002E-03
+  0.921599853918038045E-02 -0.671396777488897042E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066939156653987E-03  0.000000000000000000E+00
+ -0.622852608399671048E-03  0.926518020755455994E-02 -0.162542334394280010E-03
+ -0.666666666666666970E-02  0.755649428756813030E-02 -0.563245912569737047E-09
+ -0.167001746375183998E-03 -0.773653708482778048E-07  0.822676733536495917E-02
+ -0.666666666666666970E-02 -0.162063551455513005E-03  0.755721718231079035E-02
+ -0.417514955388785973E-11 -0.164610567026664999E-03 -0.616225367346680050E-09
+  0.822717731694085942E-02 -0.164377985077527987E-03  0.822719270254208997E-02
+ -0.401055733513956985E-09 -0.164351625080718007E-03 -0.666666666666666970E-02
+ -0.392176583167848991E-09  0.822661805239805963E-02 -0.435027796634762003E-03
+ -0.666666666666666970E-02  0.921803071428142946E-02 -0.673143999294995021E-03
+ -0.435070588466728980E-03 -0.624488956449440998E-03  0.926745798940319977E-02
+ -0.162061399017261011E-03 -0.666666666666666970E-02  0.755719884111734960E-02
+ -0.409719349842826014E-11 -0.164621105345924009E-03 -0.605582245755702968E-09
+  0.822719290467982013E-02 -0.666666666666666970E-02 -0.162080612318650990E-03
+  0.755719632784433975E-02 -0.488742792731292979E-11 -0.164695424088829993E-03
+ -0.719668653572314038E-09  0.822716417739921026E-02 -0.166686224033829990E-03
+  0.822673892793746031E-02 -0.487607625654436989E-07 -0.166616778531177993E-03
+ -0.666666666666666970E-02 -0.385465175249295009E-07  0.822046889213087079E-02
+ -0.434813594858741027E-03 -0.666666666666666970E-02  0.920936888606463042E-02
+ -0.669875781881485994E-03 -0.435284514510405013E-03 -0.622560397625597029E-03
+  0.926509289165874939E-02 -0.666666666666666970E-02  0.927696108602655070E-02
+ -0.632627915671010992E-03 -0.671545420293352051E-03  0.921776198077606072E-02
+  0.922026582434420060E-02 -0.602408833914311001E-03 -0.666666666666666970E-02
+ -0.556545703137562042E-03  0.881922173549517033E-02 -0.163017068024577997E-03
+ -0.666666666666666970E-02  0.795594792411035041E-02 -0.273445418458471009E-11
+  0.000000000000000000E+00 -0.164269850502767994E-03 -0.212776837501969996E-09
+  0.822716867001030069E-02 -0.666666666666666970E-02 -0.435047888244456974E-03
+  0.922079054223712974E-02 -0.674652724066845955E-03 -0.435050499142537988E-03
+ -0.626174907785970051E-03  0.926933372330603945E-02 -0.317388785409780010E-03
+ -0.666666666666666970E-02  0.807555137448344972E-02 -0.161231757480141994E-03
+  0.000000000000000000E+00 -0.506658477841221994E-03 -0.311775274478785975E-03
+  0.895372834363604084E-02 -0.666666666666666970E-02 -0.163033771455105996E-03
+  0.757711880141135989E-02 -0.230783976059587012E-04  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.528851957736768035E-03
+  0.000000000000000000E+00 -0.189450184417205001E-03  0.832713861616275926E-02
+ -0.666666666666666970E-02 -0.161988418302652995E-03  0.755983344534295974E-02
+ -0.201738904538359992E-11 -0.164212922771786994E-03 -0.300297007087386020E-09
+  0.822710829050683073E-02  0.921859794089704976E-02 -0.671461758609938038E-03
+ -0.666666666666666970E-02 -0.645287728552976008E-03  0.929336166647132082E-02
+ -0.666666666666666970E-02 -0.192231196795263987E-03  0.825469366452986041E-02
+ -0.211365722510570006E-06  0.000000000000000000E+00 -0.166616012173876997E-03
+ -0.134858265397606994E-07  0.822777128965413058E-02 -0.666666666666666970E-02
+  0.902089049924758961E-02 -0.621512175798593952E-03 -0.611398199502805962E-03
+  0.924071304560030934E-02 -0.162013975003999001E-03 -0.666666666666666970E-02
+  0.758489414663056968E-02 -0.176583073381748006E-05 -0.529428370961672052E-03
+ -0.381594181184865010E-04  0.875176256452839946E-02 -0.666666666666666970E-02
+ -0.435042623232502024E-03  0.921694787103495040E-02 -0.671556689509785008E-03
+ -0.435055763946669001E-03 -0.622875549110628985E-03  0.926586538965971929E-02
+ -0.666666666666666970E-02 -0.435049192429825995E-03  0.921716878108901072E-02
+ -0.671606874278178004E-03  0.000000000000000000E+00 -0.435049194965705000E-03
+ -0.622886316419352971E-03  0.926588938457374028E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.162062558982806990E-03  0.755721781610755976E-02
+ -0.413711682785996024E-11 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.164605380148779006E-03  0.000000000000000000E+00 -0.610686395487684964E-09
+  0.822717709924931928E-02 -0.666666666666666970E-02 -0.435040485643667027E-03
+  0.921687647065858040E-02 -0.671540876851966020E-03 -0.435057901371828989E-03
+ -0.622872543909110035E-03  0.926585805994004953E-02  0.822717712936564018E-02
+ -0.606929319904127040E-09 -0.666666666666666970E-02 -0.411128314368110998E-11
+  0.755721765744662034E-02 -0.666666666666666970E-02  0.755721765744662034E-02
+ -0.411128314368110998E-11 -0.606929319768601975E-09  0.822717712936562977E-02
+ -0.666666666666666970E-02 -0.162062535498602012E-03  0.755721765388590004E-02
+ -0.413624192489096004E-11 -0.164605265022377999E-03 -0.610559256318664035E-09
+  0.822717713308667060E-02 -0.164360427767194014E-03  0.822717693070432014E-02
+ -0.404403342469463976E-09 -0.164359021906931011E-03 -0.666666666666666970E-02
+ -0.400970042731464983E-09  0.822695176085696048E-02 -0.666666666666666970E-02
+ -0.435040781993407998E-03  0.921688642847521070E-02 -0.671543134942478989E-03
+ -0.435057605047514022E-03 -0.622873024233365990E-03  0.926585913807351051E-02
+ -0.666666666666666970E-02 -0.435040480200654992E-03  0.921687629983308949E-02
+ -0.671540847971295054E-03  0.000000000000000000E+00 -0.435057906814365980E-03
+ -0.622872547165051992E-03  0.926585805272850914E-02 -0.666666666666666970E-02
+ -0.435049192316071991E-03  0.921716878067056940E-02 -0.671606877003452980E-03
+  0.000000000000000000E+00 -0.435049195079459979E-03 -0.622886319684859977E-03
+  0.926588938770844979E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.162062559106629004E-03  0.755721765243790033E-02 -0.413713440957626983E-11
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.164605382367111999E-03
+  0.000000000000000000E+00 -0.610689040449987030E-09  0.822717709614393007E-02
+ -0.162062562108060003E-03 -0.666666666666666970E-02  0.755721765207954983E-02
+ -0.413725467961124014E-11 -0.164605397579012993E-03 -0.610706551944964963E-09
+  0.822717713458392085E-02 -0.666666666666666970E-02 -0.162062001816588988E-03
+  0.755721765239816996E-02 -0.411599093120005983E-11 -0.164602607009130996E-03
+ -0.607614028771139961E-09  0.822717713005806026E-02 -0.435050784089093976E-03
+  0.926588367178806001E-02 -0.622883807627942028E-03 -0.435047603293761014E-03
+ -0.666666666666666970E-02 -0.671594832438412981E-03  0.921711542063954024E-02
+ -0.435047603293765026E-03 -0.666666666666666970E-02  0.921711542225988993E-02
+ -0.671594833533352984E-03 -0.435050784089088989E-03 -0.622883808601936025E-03
+  0.926588367352939972E-02 -0.435040480241351984E-03 -0.666666666666666970E-02
+  0.921687630121104974E-02 -0.671540848264394041E-03 -0.435057906773672024E-03
+ -0.622872547211143052E-03  0.926585805288993036E-02 -0.666666666666666970E-02
+ -0.435040480209677994E-03  0.921687629817317076E-02 -0.671540845217228006E-03
+ -0.435057906805342979E-03 -0.622872544373526951E-03  0.926585805077998967E-02
+ -0.164605397577960991E-03  0.822717713458392085E-02 -0.610706548612634985E-09
+ -0.162062562110996998E-03 -0.666666666666666970E-02 -0.413725463469866987E-11
+  0.755721765366132014E-02 -0.162062559104681993E-03 -0.666666666666666970E-02
+  0.755721765355718989E-02 -0.413713996767741984E-11 -0.164605382596239995E-03
+ -0.610689879340446991E-09  0.822717713313662023E-02 -0.666666666666666970E-02
+  0.755721765237517013E-02 -0.411128320041531016E-11 -0.606929330558691957E-09
+  0.822717712936565058E-02  0.822717692676106990E-02 -0.401916224886744012E-09
+ -0.666666666666666970E-02 -0.398503993612473003E-09  0.822695176179774959E-02
+ -0.435040782183991983E-03 -0.666666666666666970E-02  0.921688643492086027E-02
+ -0.671543136482271046E-03 -0.435057604856944999E-03 -0.622873024631908950E-03
+  0.926585913880649016E-02 -0.666666666666666970E-02 -0.162064603069654990E-03
+  0.755721765252322011E-02 -0.421568940485412024E-11 -0.164615562666129003E-03
+ -0.622111784075274967E-09  0.822717714448810943E-02 -0.435031767248445010E-03
+ -0.666666666666666970E-02  0.921658397562742995E-02 -0.671474830568459958E-03
+  0.000000000000000000E+00 -0.435066618625169982E-03 -0.622858773500244004E-03
+  0.926582672349660075E-02 -0.666666666666666970E-02 -0.435031759841376020E-03
+  0.921598447242718069E-02 -0.671353608099204964E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435066626030805982E-03
+  0.000000000000000000E+00 -0.622806883368841996E-03  0.926515741215891940E-02
+ -0.666666666666666970E-02 -0.162062535484632989E-03  0.755721765241007971E-02
+ -0.413624141658310988E-11 -0.164605264966721999E-03 -0.610559194886172028E-09
+  0.822717713298093921E-02  0.822717692676106990E-02 -0.401916224750255024E-09
+ -0.666666666666666970E-02 -0.398503993629537016E-09  0.822695176179893961E-02
+ -0.666666666666666970E-02 -0.435040782184036978E-03  0.921688643492233999E-02
+ -0.671543136482596957E-03 -0.435057604856900980E-03 -0.622873024631969015E-03
+  0.926585913880663935E-02 -0.666666666666666970E-02  0.755721765237517013E-02
+ -0.411128320041521968E-11 -0.606929330558677998E-09  0.822717712936565058E-02
+ -0.162062535484634994E-03 -0.666666666666666970E-02  0.755721765240914990E-02
+ -0.413624147093784990E-11 -0.164605264966739996E-03 -0.610559194907676036E-09
+  0.822717713298093921E-02 -0.666666666666666970E-02 -0.435040482273178026E-03
+  0.921687613607913986E-02 -0.671540613305316998E-03 -0.435057904742022978E-03
+ -0.622872309434753979E-03  0.926585781733488961E-02 -0.666666666666666970E-02
+ -0.162127387857516993E-03  0.759121301019225012E-02 -0.381918248612579987E-11
+  0.000000000000000000E+00 -0.164583146079131992E-03 -0.546169842951880971E-09
+  0.822717717071064919E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166319269560149001E-03  0.755721776272199035E-02 -0.125468739914664006E-10
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166854266156419006E-03
+  0.000000000000000000E+00 -0.694836199054006039E-09  0.759121369204142967E-02
+ -0.666666666666666970E-02 -0.162091523337547997E-03  0.755721898424006021E-02
+ -0.540934677579967032E-11  0.000000000000000000E+00 -0.164750533637444990E-03
+ -0.795365337200214006E-09  0.822717800588011056E-02  0.926452030343002941E-02
+ -0.622281047419194033E-03 -0.666666666666666970E-02 -0.668721216430620004E-03
+  0.920448407564962955E-02 -0.666666666666666970E-02  0.920291934243665927E-02
+ -0.668399806909403954E-03 -0.622140086487220031E-03  0.926273191293569033E-02
+ -0.666666666666666970E-02 -0.435042276336174984E-03  0.921682348368780928E-02
+ -0.671431373639223947E-03 -0.435056110819545018E-03 -0.622756694074473051E-03
+  0.926574695221287978E-02 -0.164604656270789013E-03  0.822717725536971020E-02
+ -0.609640574535708954E-09 -0.162062479874466994E-03 -0.666666666666666970E-02
+ -0.413006085521291964E-11  0.755726759055258979E-02 -0.666666666666666970E-02
+ -0.162006162226876999E-03  0.755727139187646006E-02 -0.248123275689687997E-11
+ -0.164398193105456003E-03 -0.370429927246309000E-09  0.822722962912524058E-02
+ -0.666666666666666970E-02 -0.162064599546255997E-03  0.755721765602699984E-02
+ -0.421555276492109033E-11  0.000000000000000000E+00 -0.164615545095072007E-03
+ -0.622091901725074988E-09  0.822717714627387020E-02 -0.666666666666666970E-02
+ -0.435031818763029980E-03  0.921658104872762930E-02 -0.671470214801774966E-03
+  0.000000000000000000E+00 -0.435066567119568007E-03 -0.622854034507566018E-03
+  0.926582206155607963E-02 -0.666666666666666970E-02 -0.435031752461292008E-03
+  0.921598570305219057E-02 -0.671353593598220024E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066633409600010E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.622806719992244047E-03  0.926515905834469043E-02 -0.162062393641909007E-03
+ -0.666666666666666970E-02  0.755722114123722009E-02 -0.413058279052571960E-11
+ -0.164604687177714008E-03 -0.609739841581688048E-09  0.822717725546858077E-02
+ -0.666666666666666970E-02 -0.162060731985976013E-03  0.755721979203457985E-02
+ -0.406827649258389981E-11 -0.164598252833624004E-03 -0.600736958280085992E-09
+  0.822717859850459997E-02 -0.164089877345694000E-03  0.822722950591394005E-02
+ -0.245723983761991010E-09 -0.164740360809531990E-03 -0.666666666666666970E-02
+ -0.253032242462855987E-09  0.822760867483479999E-02 -0.435063218163868012E-03
+ -0.666666666666666970E-02  0.926625279535238031E-02 -0.623222898203098044E-03
+ -0.435035168246062981E-03 -0.671596115313023018E-03  0.921718087666609925E-02
+ -0.162006191893605003E-03 -0.666666666666666970E-02  0.755728897650767041E-02
+ -0.248111433259074991E-11 -0.164398182179902004E-03 -0.370407208686471976E-09
+  0.822722962910260938E-02 -0.666666666666666970E-02 -0.162062408593543990E-03
+  0.755721778752635031E-02 -0.413141811987293010E-11 -0.164604734980651994E-03
+ -0.609861061658230019E-09  0.822717721123178945E-02 -0.164359758086323008E-03
+  0.822717705188595952E-02 -0.403734889040530024E-09 -0.164358443027807014E-03
+ -0.666666666666666970E-02 -0.400953602691749013E-09  0.822699461577017926E-02
+ -0.435042369262157974E-03 -0.666666666666666970E-02  0.921678605589727924E-02
+ -0.671388823848062012E-03 -0.435056017899960990E-03 -0.622715238128926981E-03
+  0.926570506586136926E-02 -0.666666666666666970E-02  0.897119442339463016E-02
+ -0.606814776206859971E-03 -0.607757345455318975E-03  0.923427126754335081E-02
+  0.921865858631707932E-02 -0.671454241685535990E-03 -0.666666666666666970E-02
+ -0.646212508082314027E-03  0.929466349621911950E-02 -0.192798550395747987E-03
+ -0.666666666666666970E-02  0.825745095197302943E-02 -0.932632147974648003E-06
+  0.000000000000000000E+00 -0.167240534661016013E-03 -0.225014932097518994E-07
+  0.822685297960725967E-02 -0.666666666666666970E-02 -0.435853333045042000E-03
+  0.921982672920109922E-02 -0.576029289608052963E-03 -0.434241817042410974E-03
+ -0.604491260295870981E-03  0.915701687929209966E-02 -0.413213785007987978E-03
+ -0.666666666666666970E-02  0.877723644056935953E-02 -0.541755128362727992E-03
+  0.000000000000000000E+00 -0.454683425554934008E-03 -0.590990666065788033E-03
+  0.920776909864309996E-02 -0.666666666666666970E-02 -0.163098438357499011E-03
+  0.763192860819197037E-02 -0.471912218750803974E-04  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.530165770547245995E-03
+  0.000000000000000000E+00 -0.266001580375158011E-03  0.861802239077489977E-02
+ -0.666666666666666970E-02 -0.162004212070115012E-03  0.757730725874160033E-02
+ -0.176948864124387004E-05 -0.529438965863178955E-03 -0.405266342026753998E-04
+  0.875391878258711953E-02  0.921979766533967057E-02 -0.602143233936805987E-03
+ -0.666666666666666970E-02 -0.555336769390312010E-03  0.881585236637339939E-02
+ -0.666666666666666970E-02 -0.163010868918803012E-03  0.795402864518519014E-02
+ -0.273140611218113010E-11  0.000000000000000000E+00 -0.164270868404786012E-03
+ -0.213417891562774994E-09  0.822716826171294990E-02 -0.666666666666666970E-02
+  0.927693305629184034E-02 -0.632604555455441018E-03 -0.671545581847466972E-03
+  0.921776054189130979E-02 -0.161978982117798992E-03 -0.666666666666666970E-02
+  0.755685740523827972E-02 -0.195386055106566009E-11 -0.164191968508161013E-03
+ -0.291698868024501998E-09  0.822713227244221967E-02 -0.666666666666666970E-02
+ -0.435040480198179975E-03  0.921687629980929950E-02 -0.671540848019947000E-03
+ -0.435057906816840021E-03 -0.622872547226495030E-03  0.926585805278096024E-02
+ -0.666666666666666970E-02 -0.435049192309293017E-03  0.921716878045670054E-02
+ -0.671606876968350959E-03  0.000000000000000000E+00 -0.435049195086237978E-03
+ -0.622886319690002022E-03  0.926588938769827043E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.162062559109026988E-03  0.755721765240835973E-02
+ -0.413713441548758037E-11 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.164605382377888996E-03  0.000000000000000000E+00 -0.610689049049815970E-09
+  0.822717709592826925E-02 -0.666666666666666970E-02 -0.435040480192435005E-03
+  0.921687628880354937E-02 -0.671540836694003019E-03 -0.435057906822584992E-03
+ -0.622872536396895009E-03  0.926585804147755931E-02  0.822717712936564018E-02
+ -0.606929320743386971E-09 -0.666666666666666970E-02 -0.411128314894905010E-11
+  0.755721765697572965E-02 -0.666666666666666970E-02  0.755721765697572965E-02
+ -0.411128314894905010E-11 -0.606929320770492025E-09  0.822717712936564018E-02
+ -0.666666666666666970E-02 -0.162062535444674989E-03  0.755721765241194020E-02
+ -0.413623989629973001E-11 -0.164605264767690994E-03 -0.610558973788768004E-09
+  0.822717713298089064E-02 -0.164360410611332999E-03  0.822717692931345009E-02
+ -0.404391742677632997E-09 -0.164359006864496997E-03 -0.666666666666666970E-02
+ -0.400958843987352986E-09  0.822695177799862010E-02 -0.666666666666666970E-02
+ -0.435040782694775998E-03  0.921688645197418083E-02 -0.671543140259736040E-03
+ -0.435057604346205003E-03 -0.622873025349945041E-03  0.926585914054997052E-02
+ -0.666666666666666970E-02 -0.435040480198194016E-03  0.921687629980919021E-02
+ -0.671540848019547038E-03  0.000000000000000000E+00 -0.435057906816825981E-03
+ -0.622872547226044002E-03  0.926585805278038951E-02 -0.666666666666666970E-02
+ -0.435049192309309985E-03  0.921716878047955032E-02 -0.671606876991275004E-03
+  0.000000000000000000E+00 -0.435049195086221010E-03 -0.622886319711830048E-03
+  0.926588938772157991E-02 -0.666666666666666970E-02 -0.162062559108390995E-03
+  0.755721765240854968E-02 -0.413713449972606017E-11  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.164605382374721012E-03
+  0.000000000000000000E+00 -0.610689045533098955E-09  0.822717709592844966E-02
+ -0.162062558941643005E-03 -0.666666666666666970E-02  0.755721765259115014E-02
+ -0.413713398632515962E-11 -0.164605381794141997E-03 -0.610688990283018023E-09
+  0.822717713318714926E-02 -0.666666666666666970E-02 -0.162062001821220998E-03
+  0.755721765238152009E-02 -0.411599110620314974E-11 -0.164602607032291994E-03
+ -0.607614054224395964E-09  0.822717713004737956E-02 -0.435050784143862991E-03
+  0.926588367156189023E-02 -0.622883807550479035E-03 -0.435047603238991024E-03
+ -0.666666666666666970E-02 -0.671594832028562985E-03  0.921711541877510976E-02
+ -0.435047603238989018E-03 -0.666666666666666970E-02  0.921711541842836976E-02
+ -0.671594831671507020E-03 -0.435050784143864997E-03 -0.622883807208665977E-03
+  0.926588367119991069E-02 -0.435040480204769973E-03 -0.666666666666666970E-02
+  0.921687629995552975E-02 -0.671540847991965043E-03 -0.435057906810250024E-03
+ -0.622872547162217020E-03  0.926585805272642921E-02 -0.666666666666666970E-02
+ -0.435040480200663991E-03  0.921687628816156984E-02 -0.671540835793514026E-03
+ -0.435057906814356981E-03 -0.622872535485858030E-03  0.926585804054953083E-02
+ -0.164605381790940998E-03  0.822717713318714058E-02 -0.610688980014658047E-09
+ -0.162062558950573009E-03 -0.666666666666666970E-02 -0.413713385087310001E-11
+  0.755721765740006036E-02 -0.162062559115579987E-03 -0.666666666666666970E-02
+  0.755721765739483971E-02 -0.413714011819061037E-11 -0.164605382612299008E-03
+ -0.610689891393699981E-09  0.822717713311063928E-02 -0.666666666666666970E-02
+  0.755721765237515972E-02 -0.411128320041545960E-11 -0.606929330558715014E-09
+  0.822717712936565058E-02  0.822717692676106990E-02 -0.401916224751758011E-09
+ -0.666666666666666970E-02 -0.398503993602972989E-09  0.822695176179707999E-02
+ -0.435040782183968022E-03 -0.666666666666666970E-02  0.921688643492003974E-02
+ -0.671543136482089984E-03 -0.435057604856969990E-03 -0.622873024631875991E-03
+  0.926585913880641036E-02 -0.666666666666666970E-02 -0.162064603069938997E-03
+  0.755721765252195983E-02 -0.421568947004117974E-11 -0.164615562667550012E-03
+ -0.622111785671113991E-09  0.822717714448712063E-02 -0.435031767249090978E-03
+ -0.666666666666666970E-02  0.921658397565005075E-02 -0.671474830574493001E-03
+  0.000000000000000000E+00 -0.435066618624524015E-03 -0.622858773502375004E-03
+  0.926582672349991927E-02 -0.666666666666666970E-02 -0.435031759837511002E-03
+  0.921598447232290993E-02 -0.671353608077845964E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435066626034670025E-03
+  0.000000000000000000E+00 -0.622806883367964985E-03  0.926515741217292035E-02
+ -0.666666666666666970E-02 -0.162062535484635997E-03  0.755721765240869019E-02
+ -0.413624141680835988E-11 -0.164605264966749998E-03 -0.610559194919630036E-09
+  0.822717713298093921E-02  0.822717692676106990E-02 -0.401916224778863996E-09
+ -0.666666666666666970E-02 -0.398503993603033011E-09  0.822695176179709040E-02
+ -0.666666666666666970E-02 -0.435040782183968022E-03  0.921688643492003974E-02
+ -0.671543136482092044E-03 -0.435057604856969990E-03 -0.622873024631875991E-03
+  0.926585913880641036E-02 -0.666666666666666970E-02  0.755721765237515972E-02
+ -0.411128320041545960E-11 -0.606929330558716048E-09  0.822717712936565058E-02
+ -0.162062535484635997E-03 -0.666666666666666970E-02  0.755721765240869019E-02
+ -0.413624144391367996E-11 -0.164605264966749998E-03 -0.610559194919673050E-09
+  0.822717713298093921E-02 -0.666666666666666970E-02 -0.162064603069936991E-03
+  0.755721765251413016E-02 -0.421568947063007983E-11 -0.164615562667617992E-03
+ -0.622111785760815008E-09  0.822717714448719002E-02 -0.666666666666666970E-02
+ -0.435031767248966023E-03  0.921658397564588047E-02 -0.671474830573548986E-03
+  0.000000000000000000E+00 -0.435066618624647993E-03 -0.622858773502177028E-03
+  0.926582672349948039E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435031759837441992E-03  0.921598447233066935E-02 -0.671353608091401961E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066626034738980E-03
+  0.000000000000000000E+00 -0.622806883381814042E-03  0.926515741218287940E-02
+ -0.666666666666666970E-02 -0.162062001821317004E-03  0.755721765238150968E-02
+ -0.411599108277345033E-11 -0.164602607032771997E-03 -0.607614054759129973E-09
+  0.822717713004762069E-02  0.926588367541716050E-02 -0.622883811249623975E-03
+ -0.666666666666666970E-02 -0.671594835878273032E-03  0.921711542243204042E-02
+ -0.666666666666666970E-02  0.921711542243204042E-02 -0.671594835878273032E-03
+ -0.622883811249622999E-03  0.926588367541716050E-02 -0.666666666666666970E-02
+ -0.162062535450388002E-03  0.755721765240867979E-02 -0.413624016808590973E-11
+ -0.164605264796174991E-03 -0.610559005432624996E-09  0.822717713298075012E-02
+ -0.164360411311726003E-03  0.822717692923711011E-02 -0.404392344956488014E-09
+ -0.164359007643746004E-03 -0.666666666666666970E-02 -0.400959412732088008E-09
+  0.822695177596773942E-02 -0.666666666666666970E-02 -0.435040782621792983E-03
+  0.921688644954029961E-02 -0.671543139722759002E-03 -0.435057604419182000E-03
+ -0.622873025249980950E-03  0.926585914030375081E-02 -0.666666666666666970E-02
+ -0.435040480198179975E-03  0.921687629980933072E-02 -0.671540848019966949E-03
+  0.000000000000000000E+00 -0.435057906816840021E-03 -0.622872547226514003E-03
+  0.926585805278100014E-02 -0.666666666666666970E-02 -0.435049192309251980E-03
+  0.921716878047779999E-02 -0.671606876990949960E-03  0.000000000000000000E+00
+ -0.435049195086279015E-03 -0.622886319711839047E-03  0.926588938772160073E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.162062559109029997E-03
+  0.755721765240835019E-02 -0.413713457806971004E-11 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164605382377896992E-03  0.000000000000000000E+00
+ -0.610689049042042024E-09  0.822717709592721974E-02 -0.162062559106823997E-03
+ -0.666666666666666970E-02  0.755721765240988975E-02 -0.413714016632381966E-11
+ -0.164605382618112988E-03 -0.610689904846968049E-09  0.822717713311111980E-02
+ -0.666666666666666970E-02 -0.162062001821342998E-03  0.755721765238148973E-02
+ -0.411599100244394014E-11 -0.164602607032901993E-03 -0.607614054902494021E-09
+  0.822717713004762069E-02 -0.435050784145525995E-03  0.926588367157234020E-02
+ -0.622883807563274030E-03 -0.435047603237328020E-03 -0.666666666666666970E-02
+ -0.671594832032081004E-03  0.921711541873500989E-02 -0.435047603237328020E-03
+ -0.666666666666666970E-02  0.921711541879069972E-02 -0.671594832090159004E-03
+ -0.435050784145525995E-03 -0.622883807618967002E-03  0.926588367163041007E-02
+ -0.435040480198147991E-03 -0.666666666666666970E-02  0.921687629974903000E-02
+ -0.671540847957907000E-03 -0.435057906816872005E-03 -0.622872547167174968E-03
+  0.926585805271907051E-02 -0.666666666666666970E-02 -0.435040480192394998E-03
+  0.921687628872891983E-02 -0.671540836617235976E-03 -0.435057906822624999E-03
+ -0.622872536323500051E-03  0.926585804140095046E-02 -0.164605382615034992E-03
+  0.822717713311110939E-02 -0.610689894920461967E-09 -0.162062559115410987E-03
+ -0.666666666666666970E-02 -0.413714014137465006E-11  0.755721765703380993E-02
+ -0.162062559115412993E-03 -0.666666666666666970E-02  0.755721765703380993E-02
+ -0.413714014146466007E-11 -0.164605382615047000E-03 -0.610689894960657982E-09
+  0.822717713311110939E-02 -0.666666666666666970E-02  0.921687630349792009E-02
+ -0.671540851815856984E-03 -0.622872550856052947E-03  0.926585805656933957E-02
+  0.926585809685315925E-02 -0.622872568564181964E-03 -0.666666666666666970E-02
+ -0.671540936702172951E-03  0.921687667944103006E-02 -0.164358960784040002E-03
+ -0.666666666666666970E-02  0.822694398364712948E-02 -0.400847403514244019E-09
+  0.000000000000000000E+00 -0.164360413019343991E-03 -0.404398689239843015E-09
+  0.822717692924298041E-02 -0.666666666666666970E-02 -0.162064603069942006E-03
+  0.755721765251538002E-02 -0.421568947070519009E-11 -0.164615562667628997E-03
+ -0.622111785771050024E-09  0.822717714448712931E-02 -0.435031767249002019E-03
+ -0.666666666666666970E-02  0.921658397564648069E-02 -0.671474830573193042E-03
+  0.000000000000000000E+00 -0.435066618624612973E-03 -0.622858773501632000E-03
+  0.926582672349898079E-02 -0.666666666666666970E-02 -0.435031759837399004E-03
+  0.921598447231553042E-02 -0.671353608075763971E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435066626034782023E-03
+  0.000000000000000000E+00 -0.622806883366930982E-03  0.926515741216851069E-02
+ -0.666666666666666970E-02 -0.162062558230456009E-03  0.755721765240984032E-02
+ -0.413710687281216962E-11 -0.164605378253319007E-03 -0.610685055063975992E-09
+  0.822717713310629033E-02  0.926585809685315925E-02 -0.622872568564181964E-03
+ -0.666666666666666970E-02 -0.671540936702172951E-03  0.921687667944103006E-02
+ -0.666666666666666970E-02 -0.164358960784040002E-03  0.822694398364712948E-02
+ -0.400847403514244019E-09  0.000000000000000000E+00 -0.164360413019343991E-03
+ -0.404398689239843015E-09  0.822717692924298041E-02 -0.666666666666666970E-02
+  0.921687630349792009E-02 -0.671540851815856984E-03 -0.622872550856052947E-03
+  0.926585805656933957E-02 -0.162062558230456009E-03 -0.666666666666666970E-02
+  0.755721765240984032E-02 -0.413710687281216962E-11  0.000000000000000000E+00
+ -0.164605378253319007E-03 -0.610685055063975992E-09  0.822717713310629033E-02
+ -0.666666666666666970E-02 -0.162064604275187002E-03  0.755721677996594975E-02
+ -0.421580947944237963E-11 -0.164615577356503002E-03 -0.622129690587314019E-09
+  0.822717714566923060E-02 -0.666666666666666970E-02 -0.435031753444996012E-03
+  0.921658351635084964E-02 -0.671474729715682982E-03  0.000000000000000000E+00
+ -0.435066632426207010E-03 -0.622858755231564033E-03  0.926582667773451045E-02
+ -0.666666666666666970E-02 -0.435031736935309978E-03  0.921598323977216964E-02
+ -0.671353347083358007E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066648932866005E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.622806813899395959E-03  0.926515680964767017E-02
+ -0.666666666666666970E-02 -0.162062535483415999E-03  0.755721764896765975E-02
+ -0.413624165743050033E-11 -0.164605264995012004E-03 -0.610559231671319039E-09
+  0.822717713301475070E-02  0.822717692676106990E-02 -0.401916226534076012E-09
+ -0.666666666666666970E-02 -0.398503963674476984E-09  0.822695175970213945E-02
+ -0.666666666666666970E-02  0.921688643582909035E-02 -0.671543139638232974E-03
+ -0.622873028116618000E-03  0.926585914228755976E-02 -0.666666666666666970E-02
+ -0.162062558230619994E-03  0.755721765087292996E-02 -0.413710692525393008E-11
+ -0.164605378269444997E-03 -0.610685075298003985E-09  0.822717713311651999E-02
+ -0.435057895657227020E-03  0.926585809286093083E-02 -0.622872564813008981E-03
+ -0.435040491358768000E-03 -0.666666666666666970E-02 -0.671540932535798999E-03
+  0.921687667431765958E-02 -0.666666666666666970E-02 -0.164358960778226998E-03
+  0.822694398251594058E-02 -0.400847388083247026E-09 -0.164360413020582990E-03
+ -0.404398690916013012E-09  0.822717692924319031E-02 -0.666666666666666970E-02
+ -0.435040480197697993E-03  0.921687629888584027E-02 -0.671540847069607041E-03
+  0.000000000000000000E+00 -0.435057906817322003E-03 -0.622872546317801987E-03
+  0.926585805183252967E-02 -0.666666666666666970E-02 -0.162127546311873996E-03
+  0.759121249955739975E-02 -0.382423929799540036E-11  0.000000000000000000E+00
+ -0.164583726855931989E-03 -0.546878175870651968E-09  0.822717706955712960E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166319285307283012E-03
+  0.755721766126241958E-02 -0.125511383854575001E-10 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.166854277196956007E-03  0.000000000000000000E+00
+ -0.695070274241472012E-09  0.759121319033947018E-02 -0.435040480198167995E-03
+ -0.666666666666666970E-02  0.921687629975386988E-02 -0.671540847960958053E-03
+ -0.435057906816852002E-03 -0.622872547169800038E-03  0.926585805272362069E-02
+ -0.666666666666666970E-02 -0.435040480192397004E-03  0.921687628872910024E-02
+ -0.671540836617267960E-03 -0.435057906822622017E-03 -0.622872536323503955E-03
+  0.926585804140104934E-02 -0.164605382615351010E-03  0.822717713311131062E-02
+ -0.610689895410060039E-09 -0.162062559115474006E-03 -0.666666666666666970E-02
+ -0.413714014380830024E-11  0.755721765703380993E-02 -0.162062559115409008E-03
+ -0.666666666666666970E-02  0.755721765703380038E-02 -0.413714008712550966E-11
+ -0.164605382615029002E-03 -0.610689894941937954E-09  0.822717713311117010E-02
+ -0.162062559106887993E-03 -0.666666666666666970E-02  0.755721765240996001E-02
+ -0.413714022296702998E-11 -0.164605382618428003E-03 -0.610689905200928972E-09
+  0.822717713311131929E-02 -0.666666666666666970E-02 -0.162062001821339013E-03
+  0.755721765238150968E-02 -0.411599102940498021E-11 -0.164602607032881989E-03
+ -0.607614054881568006E-09  0.822717713004766059E-02 -0.435050784145494011E-03
+  0.926588367157693028E-02 -0.622883807565917966E-03 -0.435047603237360004E-03
+ -0.666666666666666970E-02 -0.671594832035220962E-03  0.921711541874025049E-02
+ -0.435047603237360004E-03 -0.666666666666666970E-02  0.921711541879170933E-02
+ -0.671594832090262003E-03 -0.435050784145494011E-03 -0.622883807618874953E-03
+  0.926588367163047079E-02 -0.666666666666666970E-02  0.921687630349792009E-02
+ -0.671540851815856984E-03 -0.622872550856052947E-03  0.926585805656933957E-02
+  0.822717712936564018E-02 -0.606929320872887989E-09 -0.666666666666666970E-02
+ -0.411128314877483972E-11  0.755721765699129966E-02 -0.162062559115238003E-03
+ -0.666666666666666970E-02  0.755721765701823991E-02 -0.413714005475985968E-11
+ -0.164605382614328987E-03 -0.610689894184682021E-09  0.822717713311110939E-02
+ -0.666666666666666970E-02 -0.435040480197263987E-03  0.921687629887128941E-02
+ -0.671540847066263036E-03 -0.435057906817756009E-03 -0.622872546317053020E-03
+  0.926585805183098923E-02 -0.162127546311872993E-03 -0.666666666666666970E-02
+  0.759121249955737026E-02 -0.382423935215729017E-11  0.000000000000000000E+00
+ -0.164583726855924996E-03 -0.546878175863874980E-09  0.822717706955712960E-02
+ -0.666666666666666970E-02 -0.166319285307283988E-03  0.755721766126244039E-02
+ -0.125511383583243006E-10  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166854277196956007E-03  0.000000000000000000E+00
+ -0.695070274239683027E-09  0.759121319033943982E-02 -0.666666666666666970E-02
+ -0.162062001821339989E-03  0.755721765238148973E-02 -0.411599111074019016E-11
+ -0.164602607032884998E-03 -0.607614054884462001E-09  0.822717713004762069E-02
+  0.926588367541610926E-02 -0.622883811249163948E-03 -0.666666666666666970E-02
+ -0.671594835876070041E-03  0.921711542242228087E-02 -0.666666666666666970E-02
+ -0.435047603237369979E-03  0.921711541881043914E-02 -0.671594832109593978E-03
+ -0.435050784145483982E-03 -0.622883807637366021E-03  0.926588367164967938E-02
+ -0.666666666666666970E-02  0.755721765237515972E-02 -0.411128320041545960E-11
+ -0.606929330558716048E-09  0.822717712936565058E-02 -0.435040480192415977E-03
+ -0.666666666666666970E-02  0.921687628876629965E-02 -0.671540836655632996E-03
+ -0.435057906822602990E-03 -0.622872536360199970E-03  0.926585804143926009E-02
+ -0.666666666666666970E-02 -0.162816160491764011E-03  0.755629729934281964E-02
+ -0.145267310755072998E-07 -0.168362009615544009E-03 -0.190258328993087994E-05
+  0.822792910352069043E-02 -0.666666666666666970E-02 -0.434980287163910990E-03
+  0.921591582112040938E-02 -0.672220540750253994E-03  0.000000000000000000E+00
+ -0.435118076441907997E-03 -0.623871210047872993E-03  0.926674480647901068E-02
+ -0.666666666666666970E-02 -0.391663109199982980E-03  0.857592364724167958E-02
+ -0.270377370428809022E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.470345934721972021E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.290023568862184014E-03  0.893922648671382006E-02
+ -0.666666666666666970E-02 -0.317994853278870024E-03  0.807402136254166923E-02
+ -0.163000017519030990E-03 -0.506466269753455046E-03 -0.315285081402617016E-03
+  0.895741238458851001E-02  0.822721524635505914E-02 -0.564092130791223009E-09
+ -0.666666666666666970E-02 -0.389036333446409034E-11  0.758050134032871043E-02
+ -0.666666666666666970E-02  0.758050134032871043E-02 -0.389036336156913981E-11
+ -0.564092130737013004E-09  0.822721524635505914E-02 -0.666666666666666970E-02
+ -0.162062221237022010E-03  0.755675930893081986E-02 -0.416186344820719000E-11
+ -0.164608243083756012E-03 -0.614516634575204050E-09  0.822717669815483006E-02
+ -0.435064268078357019E-03  0.926583720685853970E-02 -0.622864535863296978E-03
+ -0.435034118178185007E-03 -0.666666666666666970E-02 -0.671494763562757962E-03
+  0.921666478489410919E-02 -0.666666666666666970E-02 -0.435034118314003989E-03
+  0.921666821285587953E-02 -0.671498187612958996E-03 -0.435064267942557987E-03
+ -0.622867800074395988E-03  0.926584079408090955E-02 -0.666666666666666970E-02
+ -0.162064603060627001E-03  0.755721765249593985E-02 -0.421568916772846982E-11
+  0.000000000000000000E+00 -0.164615562621545010E-03 -0.622111733862203048E-09
+  0.822717714450407062E-02 -0.666666666666666970E-02 -0.435031767219332013E-03
+  0.921658395702201066E-02 -0.671474812089471003E-03  0.000000000000000000E+00
+ -0.435066618654277992E-03 -0.622858755959619019E-03  0.926582670497335011E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435031759978188998E-03
+  0.921598448078022985E-02 -0.671353613199229967E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066625894017019E-03  0.000000000000000000E+00
+ -0.622806887499908037E-03  0.926515741657525996E-02 -0.435040451653885985E-03
+ -0.666666666666666970E-02  0.921687738713831972E-02 -0.671542772385395025E-03
+ -0.435057935358636985E-03 -0.622874555977533041E-03  0.926586008373840933E-02
+ -0.666666666666666970E-02 -0.162062556280765003E-03  0.755721766011261970E-02
+ -0.413703259232341988E-11 -0.164605370112559998E-03 -0.610674307777927029E-09
+  0.822717713673958018E-02 -0.435057871981406005E-03  0.926586389913170057E-02
+ -0.622877920215035986E-03 -0.435040515036651005E-03 -0.666666666666666970E-02
+ -0.671546676151164976E-03  0.921688293990952957E-02 -0.164358975070530991E-03
+ -0.666666666666666970E-02  0.822694462180378026E-02 -0.400864471382896996E-09
+ -0.164360425623554987E-03 -0.404406184767255019E-09  0.822717693084428978E-02
+ -0.435040521728764014E-03 -0.666666666666666970E-02  0.921688316448704067E-02
+ -0.671546726860146044E-03 -0.435057865289874995E-03 -0.622877930792390950E-03
+  0.926586392317507029E-02 -0.666666666666666970E-02 -0.162062560143134006E-03
+  0.755721765072739013E-02 -0.413717992975032974E-11 -0.164605387803321994E-03
+ -0.610695685352230028E-09  0.822717713417366048E-02 -0.435057920486110002E-03
+  0.926586013716025961E-02 -0.622874579485410002E-03 -0.435040466527716017E-03
+ -0.666666666666666970E-02 -0.671542885086807029E-03  0.921687788626173940E-02
+ -0.164358940726689012E-03 -0.666666666666666970E-02  0.822694332169499921E-02
+ -0.400825748326332007E-09 -0.164360402303031009E-03 -0.404387092386443002E-09
+  0.822717693284892929E-02 -0.666666666666666970E-02  0.755853528834142963E-02
+ -0.410907237024926035E-11 -0.609517161481544010E-09  0.822725835180459011E-02
+  0.822699835769964000E-02 -0.229769146389950011E-09 -0.666666666666666970E-02
+ -0.552503881834807986E-11  0.815277410257971014E-02 -0.431800076739472991E-03
+ -0.666666666666666970E-02  0.911683250971202959E-02 -0.645107775821699976E-03
+ -0.438245882118646008E-03 -0.614761191522313001E-03  0.925173111512469018E-02
+ -0.666666666666666970E-02 -0.162066807231452001E-03  0.755721537354540028E-02
+ -0.430232887169997028E-11 -0.164626562654960999E-03 -0.634707558119433995E-09
+  0.822717705664591063E-02 -0.435011184592201997E-03 -0.666666666666666970E-02
+  0.921541718621200941E-02 -0.670832821851847973E-03  0.000000000000000000E+00
+ -0.435087195563132003E-03 -0.622361407616723998E-03  0.926525297162917975E-02
+ -0.666666666666666970E-02 -0.435024023581949979E-03  0.921463310923579040E-02
+ -0.670757573524478033E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435074360637873977E-03  0.000000000000000000E+00
+ -0.622355873608441013E-03  0.926393658622583042E-02 -0.666666666666666970E-02
+ -0.162274431703270989E-03  0.755738308036761966E-02 -0.318241338988657979E-10
+ -0.165730950021739988E-03 -0.457847648587293988E-08  0.822724281267098005E-02
+  0.822699408685815918E-02 -0.228425571392821005E-09 -0.666666666666666970E-02
+ -0.569261988092565005E-11  0.815841787771750919E-02 -0.666666666666666970E-02
+ -0.432079853778008993E-03  0.912452759981858923E-02 -0.647134253481845053E-03
+ -0.437974705900792997E-03 -0.615294574958452005E-03  0.925273449425225922E-02
+ -0.666666666666666970E-02  0.755849436233489019E-02 -0.410955652784976012E-11
+ -0.609618023537668031E-09  0.822725856237870035E-02 -0.162292777295442987E-03
+ -0.666666666666666970E-02  0.755742123690941042E-02 -0.382880604873281030E-10
+ -0.165829805992468013E-03 -0.549637808756437989E-08  0.822725133924324080E-02
+ -0.666666666666666970E-02 -0.162519832090693000E-03  0.758487998779587991E-02
+ -0.277297390858832013E-04 -0.531411093209674000E-03 -0.271306402284292975E-03
+  0.895289512037131946E-02 -0.666666666666666970E-02 -0.435144737424841995E-03
+  0.927049381409235027E-02 -0.627195150452744977E-03  0.000000000000000000E+00
+ -0.434953604275157982E-03 -0.673646028178072039E-03  0.921927160555094036E-02
+ -0.666666666666666970E-02 -0.162365832331587992E-03  0.770469465208874042E-02
+ -0.341374299192897006E-05  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.530093823283070008E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.670683422235064935E-04  0.878401693892019025E-02
+ -0.666666666666666970E-02 -0.406469352273000986E-03  0.868845376516133062E-02
+ -0.487445861240936023E-03 -0.459955486486156979E-03 -0.561455535102076998E-03
+  0.917876821200271954E-02  0.822724019224538999E-02 -0.586236000132447954E-09
+ -0.666666666666666970E-02 -0.399393158957938003E-11  0.756942275725619972E-02
+ -0.666666666666666970E-02  0.756938783599863958E-02 -0.398490214171924003E-11
+ -0.582276440781839991E-09  0.822717710481313952E-02 -0.666666666666666970E-02
+ -0.162064188320470992E-03  0.755638508915526964E-02 -0.426955665163783005E-11
+ -0.164621776680132990E-03 -0.630372622134120969E-09  0.822717559459638058E-02
+ -0.435097665935211988E-03  0.926770146162825964E-02 -0.624846566023037008E-03
+ -0.435000709682981980E-03 -0.666666666666666970E-02 -0.673347124029987048E-03
+  0.921745137510697997E-02 -0.666666666666666970E-02 -0.164221437098695006E-03
+  0.822590307436798021E-02 -0.301540515380110992E-09 -0.164276463597796989E-03
+ -0.317267646597123000E-09  0.822720896113241992E-02 -0.666666666666666970E-02
+ -0.435040484785290994E-03  0.921687621618138067E-02 -0.671540621958418996E-03
+ -0.435057902230129995E-03 -0.622872302682172954E-03  0.926585782249826975E-02
+ -0.666666666666666970E-02 -0.162127341684405001E-03  0.759121275229191969E-02
+ -0.381774206802719039E-11  0.000000000000000000E+00 -0.164583013632061998E-03
+ -0.545969305880644018E-09  0.822717722380156974E-02 -0.666666666666666970E-02
+ -0.166319316569640000E-03  0.755721776450153995E-02 -0.125637203248046992E-10
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166854313971363013E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.695766372139392979E-09  0.759121342784058965E-02 -0.435007341079486002E-03
+ -0.666666666666666970E-02  0.921767423501631059E-02 -0.673397510119870990E-03
+ -0.435091037539934999E-03 -0.624857046910582011E-03  0.926772529095947969E-02
+ -0.666666666666666970E-02 -0.435042780427758998E-03  0.921682237061272926E-02
+ -0.671414330503569993E-03 -0.435055606761638005E-03 -0.622737221386400041E-03
+  0.926573013279861045E-02 -0.164512437178424009E-03  0.822720912015354085E-02
+ -0.478141451603002007E-09 -0.162035132712254993E-03 -0.666666666666666970E-02
+ -0.321987108658194010E-11  0.755727832217645019E-02 -0.162065357686449010E-03
+ -0.666666666666666970E-02  0.755727582940845036E-02 -0.423979678728262026E-11
+ -0.164618725864026005E-03 -0.625584794059249953E-09  0.822717479197863921E-02
+ -0.162035093479698011E-03 -0.666666666666666970E-02  0.755725620859342036E-02
+ -0.322006454574526012E-11 -0.164512451389200987E-03 -0.478178423362356003E-09
+  0.822720912019037076E-02 -0.666666666666666970E-02 -0.162063731980670002E-03
+  0.755721533834740992E-02 -0.418199544479783967E-11 -0.164611237647310007E-03
+ -0.617212851806125045E-09  0.822717573520461917E-02 -0.435072205240186985E-03
+  0.926779361315555050E-02 -0.624886867600672031E-03 -0.435026179501545984E-03
+ -0.666666666666666970E-02 -0.673540811812492046E-03  0.921830845677282934E-02
+ -0.435026047900845974E-03 -0.666666666666666970E-02  0.921631021319617959E-02
+ -0.671342130464456957E-03 -0.435072336809864014E-03 -0.622763518227145035E-03
+  0.926572086598697990E-02 -0.666666666666666970E-02  0.755741246850622026E-02
+ -0.410910644194978984E-11 -0.606515170668247041E-09  0.822717712895315936E-02
+  0.822711877443555936E-02 -0.222069534738638002E-09 -0.666666666666666970E-02
+ -0.774169806703297047E-11  0.819168186272309015E-02 -0.433633789741900023E-03
+ -0.666666666666666970E-02  0.917151071300196033E-02 -0.660635211420094017E-03
+ -0.436454594164531018E-03 -0.620192759941485053E-03  0.926043761605101039E-02
+ -0.666666666666666970E-02 -0.435400642028797977E-03  0.924569487162926924E-02
+ -0.602675681812970990E-03 -0.434697126891870982E-03 -0.641341627123363030E-03
+  0.919049697819149941E-02 -0.417766802147073019E-03 -0.666666666666666970E-02
+  0.884119022200594018E-02 -0.565196699593391045E-03  0.000000000000000000E+00
+ -0.450926780658108997E-03 -0.597865634330268050E-03  0.921780464690035925E-02
+ -0.666666666666666970E-02 -0.162927680290126010E-03  0.761083369281826978E-02
+ -0.416350633852817005E-04  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.530230645664085047E-03  0.000000000000000000E+00
+ -0.269827347276449016E-03  0.866009136742811915E-02 -0.666666666666666970E-02
+ -0.162144345304456006E-03  0.760092331739826033E-02 -0.320930758282650987E-05
+ -0.529813749405098054E-03 -0.838256824281413042E-04  0.879776792041751972E-02
+  0.822717682684740922E-02 -0.303187285528630003E-09 -0.666666666666666970E-02
+ -0.349301427021029015E-11  0.790044609360177931E-02 -0.666666666666666970E-02
+ -0.407938480218887980E-03  0.871084858846606080E-02 -0.512295469801526959E-03
+ -0.458837927071482975E-03 -0.588085060255285046E-03  0.920289209649226975E-02
+ -0.666666666666666970E-02  0.755933306797651038E-02 -0.410021685751258990E-11
+ -0.607789044576714013E-09  0.822725774217296021E-02 -0.162167689058960993E-03
+ -0.666666666666666970E-02  0.755666327531443957E-02 -0.112741283804007006E-10
+ -0.165135297900302988E-03 -0.164049635527119992E-08  0.822716577111987958E-02
+ -0.666666666666666970E-02 -0.162816151525113997E-03  0.755629731334936036E-02
+ -0.145250995281190996E-07 -0.168361965007562991E-03 -0.190237271058338006E-05
+  0.822792892080568931E-02 -0.666666666666666970E-02 -0.434980646616620992E-03
+  0.921590961843673021E-02 -0.672201796129614040E-03  0.000000000000000000E+00
+ -0.435117717236683025E-03 -0.623850865689214011E-03  0.926672719210277010E-02
+ -0.666666666666666970E-02 -0.391662568195668987E-03  0.857592608264677939E-02
+ -0.270377852525331014E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.470346296009015001E-03  0.000000000000000000E+00
+ -0.290024881064321978E-03  0.893923985221735037E-02 -0.666666666666666970E-02
+ -0.317995102343517999E-03  0.807402209586039962E-02 -0.163000191627275006E-03
+ -0.506466182711903996E-03 -0.315285148839635984E-03  0.895741240163453953E-02
+  0.822721524635562987E-02 -0.564092115954154982E-09 -0.666666666666666970E-02
+ -0.389036326356696026E-11  0.758050134875735016E-02 -0.666666666666666970E-02
+  0.758050134875735016E-02 -0.389036331777706000E-11 -0.564092115791524967E-09
+  0.822721524635562987E-02 -0.666666666666666970E-02 -0.162059230234564013E-03
+  0.755676875879613980E-02 -0.404977791069632030E-11 -0.164607827252304003E-03
+ -0.598672124761886979E-09  0.822718791318045005E-02 -0.435030727375933994E-03
+  0.921782544139550068E-02 -0.671763596290095993E-03 -0.435067658311500988E-03
+ -0.666666666666666970E-02 -0.623720661515962042E-03  0.926678721368117027E-02
+ -0.666666666666666970E-02 -0.435067665337827991E-03  0.926637336566514015E-02
+ -0.623334437794025985E-03 -0.435030720348006980E-03 -0.671597031953381036E-03
+  0.921719062426287934E-02 -0.666666666666666970E-02 -0.162064554103597002E-03
+  0.755721778425398963E-02 -0.421379617299167968E-11 -0.164615465305043013E-03
+ -0.621841414828084976E-09  0.822717725632217053E-02 -0.666666666666666970E-02
+ -0.435032270976548998E-03  0.921659997236582915E-02 -0.671477595984673039E-03
+  0.000000000000000000E+00 -0.435066114983777015E-03 -0.622858547503055051E-03
+  0.926582760612120956E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435031835475485022E-03  0.921598351720689934E-02 -0.671332921939591038E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066550409885975E-03
+  0.000000000000000000E+00 -0.622784365706614980E-03  0.926515556356406061E-02
+ -0.435081591675335006E-03 -0.666666666666666970E-02  0.926715723864761018E-02
+ -0.624060476111682047E-03 -0.435016790462375985E-03 -0.671761223543174959E-03
+  0.921784490296891079E-02 -0.666666666666666970E-02 -0.162062411240925002E-03
+  0.755721810753046033E-02 -0.413154453738800000E-11 -0.164605196762570010E-03
+ -0.609894095762123987E-09  0.822717754580637971E-02 -0.435056188182044996E-03
+  0.926585891365665042E-02 -0.622867894213649997E-03 -0.435042198968283974E-03
+ -0.666666666666666970E-02 -0.671546446351377053E-03  0.921692872689685964E-02
+ -0.164341192321288998E-03 -0.666666666666666970E-02  0.822700721314235071E-02
+ -0.388276097998368010E-09 -0.164366485757635008E-03 -0.391285012821627015E-09
+  0.822719359814269927E-02 -0.435043836697356014E-03 -0.666666666666666970E-02
+  0.921698374849288030E-02 -0.671558867523572019E-03 -0.435054550554351985E-03
+ -0.622870487342790016E-03  0.926586485739499914E-02 -0.666666666666666970E-02
+ -0.162059863256517012E-03  0.755722860343106036E-02 -0.403706320675834994E-11
+ -0.164610172315551000E-03 -0.596713959736372018E-09  0.822719066745875027E-02
+ -0.435029178909017021E-03  0.921782856828711958E-02 -0.671763646252384965E-03
+ -0.435069206480010991E-03 -0.666666666666666970E-02 -0.623759080059059956E-03
+  0.926682896934285030E-02 -0.165176323153183011E-03 -0.666666666666666970E-02
+  0.822774275548648036E-02 -0.421612183862497016E-09 -0.164358865204686000E-03
+ -0.403616558608118980E-09  0.822717766773221025E-02 -0.666666666666666970E-02
+  0.755853528843129958E-02 -0.410907228794313989E-11 -0.609517161291164049E-09
+  0.822725835180460052E-02  0.822699835770025062E-02 -0.229769141627132009E-09
+ -0.666666666666666970E-02 -0.552503936541298003E-11  0.815277412222070036E-02
+ -0.431800077704224978E-03 -0.666666666666666970E-02  0.911683253562737932E-02
+ -0.645107782251850984E-03 -0.438245881184861979E-03 -0.614761192822427965E-03
+  0.925173111806255020E-02 -0.666666666666666970E-02 -0.162066792004203990E-03
+  0.755722059760233992E-02 -0.430128633401459982E-11 -0.164626435077370987E-03
+ -0.634553335781814010E-09  0.822717711494816949E-02 -0.435011239879684001E-03
+ -0.666666666666666970E-02  0.921541899333368046E-02 -0.670833188949691958E-03
+  0.000000000000000000E+00 -0.435087140296697020E-03 -0.622361445147185050E-03
+  0.926525312625075051E-02 -0.666666666666666970E-02 -0.435024263236271974E-03
+  0.921464288704679918E-02 -0.670759624758249955E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435074121043737996E-03
+  0.000000000000000000E+00 -0.622356277676372038E-03  0.926393943568542926E-02
+ -0.666666666666666970E-02 -0.162274431660521006E-03  0.755738313912261012E-02
+ -0.318240683403240976E-10 -0.165730949472054013E-03 -0.457846708968348003E-08
+  0.822724281302687072E-02  0.822699408685964931E-02 -0.228425558756176994E-09
+ -0.666666666666666970E-02 -0.569262164845434979E-11  0.815841793296116953E-02
+ -0.666666666666666970E-02 -0.432079856432105011E-03  0.912452767230425080E-02
+ -0.647134271290311994E-03 -0.437974703324669004E-03 -0.615294578525974042E-03
+  0.925273450239525071E-02 -0.666666666666666970E-02  0.755849436258760991E-02
+ -0.410955652506049982E-11 -0.609618023001794995E-09  0.822725856237871075E-02
+ -0.162292777280510000E-03 -0.666666666666666970E-02  0.755742125771192031E-02
+ -0.382880319923928003E-10 -0.165829805796784997E-03 -0.549637401523144008E-08
+  0.822725133937288015E-02 -0.666666666666666970E-02 -0.162519791749040997E-03
+  0.758486654085705012E-02 -0.277312982546554016E-04 -0.531411130485370960E-03
+ -0.271318851749606982E-03  0.895290555306309938E-02 -0.666666666666666970E-02
+ -0.435144726093062025E-03  0.927049355419245005E-02 -0.627194917416272046E-03
+  0.000000000000000000E+00 -0.434953615617772005E-03 -0.673646073339009962E-03
+  0.921927163146945942E-02 -0.666666666666666970E-02 -0.162365825870953989E-03
+  0.770469126010004009E-02 -0.341374404077266992E-05  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.530093736436846978E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.670688877278153049E-04
+  0.878401724365305073E-02 -0.666666666666666970E-02 -0.406469238190150994E-03
+  0.868845253580060976E-02 -0.487445397507600979E-03 -0.459955572937245996E-03
+ -0.561455436984956992E-03  0.917876806761750916E-02  0.822724019224498060E-02
+ -0.586236017994280044E-09 -0.666666666666666970E-02 -0.399393167827312004E-11
+  0.756942274805354979E-02 -0.666666666666666970E-02  0.756938782679772958E-02
+ -0.398490217607864004E-11 -0.582276458527030000E-09  0.822717710481316034E-02
+ -0.666666666666666970E-02 -0.162064188279850002E-03  0.755638507164865976E-02
+ -0.426955655883516001E-11 -0.164621776652662993E-03 -0.630372617736862023E-09
+  0.822717559470338006E-02 -0.435097665884428989E-03  0.926770143129381982E-02
+ -0.624846533095091048E-03 -0.435000709733791000E-03 -0.666666666666666970E-02
+ -0.673347090449469992E-03  0.921745134737769929E-02 -0.666666666666666970E-02
+ -0.164221437102121003E-03  0.822590307607582068E-02 -0.301540531002169025E-09
+ -0.164276463596261000E-03 -0.317267642602591980E-09  0.822720896113589978E-02
+ -0.666666666666666970E-02 -0.435040484785382989E-03  0.921687621618049943E-02
+ -0.671540621954609001E-03 -0.435057902230038000E-03 -0.622872302677949011E-03
+  0.926585782249450020E-02 -0.666666666666666970E-02 -0.162127341684184989E-03
+  0.759121275229850991E-02 -0.381774206063630980E-11  0.000000000000000000E+00
+ -0.164583013631449993E-03 -0.545969304848883959E-09  0.822717722380189066E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166319316568878998E-03
+  0.755721776450365024E-02 -0.125637200248759998E-10 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.166854313970594991E-03  0.000000000000000000E+00
+ -0.695766357075677050E-09  0.759121342784747997E-02 -0.435007341721630001E-03
+ -0.666666666666666970E-02  0.921767422715922928E-02 -0.673397481031394029E-03
+ -0.435091036898059991E-03 -0.624857014917886951E-03  0.926772526275846005E-02
+ -0.666666666666666970E-02 -0.435042780430306005E-03  0.921682237057486024E-02
+ -0.671414330383147007E-03 -0.435055606759090997E-03 -0.622737221255020975E-03
+  0.926573013268000914E-02 -0.164512437176590000E-03  0.822720912015702070E-02
+ -0.478141445901109973E-09 -0.162035132711022011E-03 -0.666666666666666970E-02
+ -0.321987104749231018E-11  0.755727832223314962E-02 -0.162065357632703997E-03
+ -0.666666666666666970E-02  0.755727582947162985E-02 -0.423979469854183965E-11
+ -0.164618725596388991E-03 -0.625584490453962995E-09  0.822717479206993944E-02
+ -0.162035093478465002E-03 -0.666666666666666970E-02  0.755725620865007990E-02
+ -0.322006453375858990E-11 -0.164512451387367005E-03 -0.478178417741383017E-09
+  0.822720912019385062E-02 -0.666666666666666970E-02 -0.162063731945476990E-03
+  0.755721533840931960E-02 -0.418199409455997034E-11 -0.164611237471814999E-03
+ -0.617212655483738035E-09  0.822717573526372987E-02 -0.435072204818992981E-03
+  0.926779358414435948E-02 -0.624886835257608040E-03 -0.435026179922837024E-03
+ -0.666666666666666970E-02 -0.673540781035909049E-03  0.921830844145678924E-02
+ -0.435026048326556981E-03 -0.666666666666666970E-02  0.921631022735146070E-02
+ -0.671342133550576967E-03 -0.435072336384253024E-03 -0.622763518765494998E-03
+  0.926572086739899971E-02 -0.666666666666666970E-02  0.755741246850622026E-02
+ -0.410910644194978984E-11 -0.606515170668247041E-09  0.822717712895315936E-02
+  0.822711877443555936E-02 -0.222069534792841002E-09 -0.666666666666666970E-02
+ -0.774169806704296934E-11  0.819168186272316995E-02 -0.433633789741903004E-03
+ -0.666666666666666970E-02  0.917151071300205921E-02 -0.660635211420117002E-03
+ -0.436454594164527982E-03 -0.620192759941490040E-03  0.926043761605102080E-02
+ -0.666666666666666970E-02 -0.435400642045188024E-03  0.924569487205297025E-02
+ -0.602675682168614998E-03 -0.434697126875424015E-03 -0.641341627119159036E-03
+  0.919049697820984064E-02 -0.417766802147158021E-03 -0.666666666666666970E-02
+  0.884119022200715969E-02 -0.565196699593685948E-03  0.000000000000000000E+00
+ -0.450926780658037006E-03 -0.597865634330208961E-03  0.921780464690040956E-02
+ -0.666666666666666970E-02 -0.162927680290140999E-03  0.761083369282181989E-02
+ -0.416350633850501013E-04  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.530230645664087974E-03  0.000000000000000000E+00
+ -0.269827347274097977E-03  0.866009136742689964E-02 -0.666666666666666970E-02
+ -0.162144345304457009E-03  0.760092331739868968E-02 -0.320930758282241997E-05
+ -0.529813749405099029E-03 -0.838256824279560954E-04  0.879776792041734972E-02
+  0.822717682684740922E-02 -0.303187285555688993E-09 -0.666666666666666970E-02
+ -0.349301427021062013E-11  0.790044609360190074E-02 -0.666666666666666970E-02
+ -0.407938480218909014E-03  0.871084858846631060E-02 -0.512295469801619008E-03
+ -0.458837927071466983E-03 -0.588085060255303044E-03  0.920289209649229924E-02
+ -0.666666666666666970E-02  0.755933306797651038E-02 -0.410021674909242031E-11
+ -0.607789044576720010E-09  0.822725774217296021E-02 -0.162167689058960993E-03
+ -0.666666666666666970E-02  0.755666327531452024E-02 -0.112741284887906999E-10
+ -0.165135297900302012E-03 -0.164049635526676995E-08  0.822716577111987958E-02
+ -0.666666666666666970E-02 -0.162064381314455013E-03  0.755721681642354016E-02
+ -0.420708647347467005E-11 -0.164614463272000995E-03 -0.620861048269696965E-09
+  0.822717666428922009E-02 -0.666666666666666970E-02 -0.435032929063819981E-03
+  0.921662843024339054E-02 -0.671487062718633038E-03  0.000000000000000000E+00
+ -0.435065457005957975E-03 -0.622863607135336990E-03  0.926583682639911012E-02
+ -0.666666666666666970E-02 -0.435033606033594978E-03  0.921549100307902068E-02
+ -0.670764254703717011E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435064780144060010E-03  0.000000000000000000E+00
+ -0.622228101134277012E-03  0.926458636134656031E-02 -0.666666666666666970E-02
+ -0.162412248298241007E-03  0.755747465195114043E-02 -0.132892288474876992E-09
+ -0.166453306564733994E-03 -0.187762010463222004E-07  0.822729692880195054E-02
+  0.822701112868542972E-02 -0.238165066447310994E-09 -0.666666666666666970E-02
+ -0.481872285692390998E-11  0.811743485472943976E-02 -0.666666666666666970E-02
+  0.907030660461941030E-02 -0.635289343114582030E-03 -0.614711342014916053E-03
+  0.924712095234279940E-02 -0.666666666666666970E-02 -0.162067491346243997E-03
+  0.755721660010066996E-02 -0.433042344878489997E-11 -0.164637989129536011E-03
+ -0.639067222491722952E-09  0.822718313255254956E-02 -0.435123629441636984E-03
+  0.926610912903469072E-02 -0.623064810739766984E-03 -0.434974730176162019E-03
+ -0.666666666666666970E-02 -0.671125339170808039E-03  0.921519637816755992E-02
+ -0.666666666666666970E-02 -0.166000837408311988E-03  0.822475961254992040E-02
+ -0.105963041200035007E-07 -0.166020054886163010E-03 -0.114023574905160996E-07
+  0.822673401751872028E-02 -0.666666666666666970E-02 -0.435040411541543013E-03
+  0.921687888628672063E-02 -0.671545404567263976E-03 -0.435057975467457981E-03
+ -0.622877304336513001E-03  0.926586291132954973E-02 -0.666666666666666970E-02
+ -0.162130707049625006E-03  0.759120585643323006E-02 -0.392634071196178009E-11
+  0.000000000000000000E+00 -0.164597967653741013E-03 -0.561253610552618950E-09
+  0.822717507317871979E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166319143409792997E-03  0.755721565731708005E-02 -0.124778474586887002E-10
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166854049995611003E-03
+  0.000000000000000000E+00 -0.691000750970318969E-09  0.759120679480445974E-02
+ -0.435051311183006005E-03 -0.666666666666666970E-02  0.926639102794180047E-02
+ -0.623356751569072967E-03 -0.435047076190057990E-03 -0.671803857406816016E-03
+  0.921783849635480004E-02 -0.666666666666666970E-02 -0.435004887685989992E-03
+  0.921760501413688982E-02 -0.673382235862111022E-03 -0.435093489874442013E-03
+ -0.624855064858449022E-03  0.926773123149998018E-02 -0.166320189393602005E-03
+  0.822674016833681992E-02 -0.176036669988993003E-07 -0.162405090688964003E-03
+ -0.666666666666666970E-02 -0.125356963990057004E-09  0.755633079584900041E-02
+ -0.162059610339947000E-03 -0.666666666666666970E-02  0.755636445209539962E-02
+ -0.409725503580782012E-11 -0.164619621806747006E-03 -0.605978989258597974E-09
+  0.822719229488561021E-02 -0.162405940044832003E-03 -0.666666666666666970E-02
+  0.755662367990892966E-02 -0.125254866802548012E-09 -0.166319952568922996E-03
+ -0.175849783175382999E-07  0.822674014969775047E-02 -0.666666666666666970E-02
+ -0.162059866198233001E-03  0.755722656090779960E-02 -0.403692298108515966E-11
+ -0.164606538526818004E-03 -0.596577171893270041E-09  0.822718795529438045E-02
+ -0.435028073525391026E-03  0.921786254908986934E-02 -0.671799805131304017E-03
+ -0.435070311635940977E-03 -0.666666666666666970E-02 -0.623819803880377045E-03
+  0.926689343860783929E-02 -0.435070247954426976E-03 -0.666666666666666970E-02
+  0.926779304085995070E-02 -0.624694668192695042E-03 -0.435028137220242991E-03
+ -0.672979588001651041E-03  0.921846396898064017E-02 -0.666666666666666970E-02
+  0.894946699024212929E-02 -0.600122097862237002E-03 -0.606052261066108988E-03
+  0.923126173183646918E-02  0.822707325920852932E-02 -0.439334748407924013E-09
+ -0.666666666666666970E-02 -0.340850404729477990E-11  0.767218067483432017E-02
+ -0.163197277611309987E-03 -0.666666666666666970E-02  0.767185136575801018E-02
+ -0.332821151740468006E-08 -0.167963560070563987E-03 -0.390028261492079976E-06
+  0.822779264793695002E-02 -0.666666666666666970E-02 -0.424429129575981982E-03
+  0.884286864444589968E-02 -0.396957256272790994E-03 -0.445115815542256987E-03
+ -0.385983334887198980E-03  0.902131707790924942E-02 -0.407763611263302025E-03
+ -0.666666666666666970E-02  0.870515217458626978E-02 -0.498571990041084007E-03
+  0.000000000000000000E+00 -0.458970290280038998E-03 -0.570346192985176997E-03
+  0.918907244525967067E-02 -0.666666666666666970E-02 -0.322071566303724021E-03
+  0.801064459323599064E-02 -0.151974252254838009E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.504268248287133031E-03
+  0.000000000000000000E+00 -0.280047589106463980E-03  0.858275366599436061E-02
+ -0.666666666666666970E-02 -0.162136502882231009E-03  0.755886084998397007E-02
+ -0.797239446559356004E-11 -0.165003120614180988E-03 -0.116699722701056003E-08
+  0.822721467884345026E-02  0.926072857339886937E-02 -0.621240348529511033E-03
+ -0.666666666666666970E-02 -0.664187000217392946E-03  0.918478778839808968E-02
+ -0.666666666666666970E-02 -0.434091712912973990E-03  0.918381835623783074E-02
+ -0.661965111513909037E-03 -0.436002089949920021E-03 -0.618955328845967952E-03
+  0.925997826239693955E-02 -0.666666666666666970E-02  0.755828330701808961E-02
+ -0.411187050494477969E-11 -0.610057942551154034E-09  0.822725843300332015E-02
+ -0.162643320272068002E-03 -0.666666666666666970E-02  0.760168784361417004E-02
+ -0.350264221860245974E-04 -0.531763987837332967E-03 -0.285598197078351006E-03
+  0.896123574808329985E-02 -0.666666666666666970E-02 -0.162064603067110991E-03
+  0.755721765251989031E-02 -0.421568927958496012E-11  0.000000000000000000E+00
+ -0.164615562653504011E-03 -0.622111769806188027E-09  0.822717714448987017E-02
+ -0.666666666666666970E-02 -0.435031767299973019E-03  0.921658397588867064E-02
+ -0.671474829222033949E-03  0.000000000000000000E+00 -0.435066618573651026E-03
+ -0.622858771889590011E-03  0.926582672216797083E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435031759821466978E-03  0.921598447351964015E-02
+ -0.671353608292523957E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435066626050711989E-03  0.000000000000000000E+00 -0.622806883477106956E-03
+  0.926515741405328948E-02 -0.666666666666666970E-02 -0.162062535484502993E-03
+  0.755721765241602027E-02 -0.413624146533982998E-11 -0.164605264966013012E-03
+ -0.610559194090089995E-09  0.822717713298093921E-02  0.822717692676106990E-02
+ -0.401916224683843019E-09 -0.666666666666666970E-02 -0.398503994321098006E-09
+  0.822695176184735054E-02 -0.666666666666666970E-02  0.921688643859516939E-02
+ -0.671543140262901043E-03 -0.622873028246735054E-03  0.926585914257952933E-02
+ -0.666666666666666970E-02 -0.162062560104490004E-03  0.755721765317601043E-02
+ -0.413717829056970014E-11 -0.164605387585608994E-03 -0.610695437742429953E-09
+  0.822717713403002017E-02 -0.435057919911320979E-03  0.926585996691042975E-02
+ -0.622874416570012033E-03 -0.435040467102555022E-03 -0.666666666666666970E-02
+ -0.671542718462655053E-03  0.921687774051401995E-02 -0.666666666666666970E-02
+ -0.164358956864967009E-03  0.822694333429544941E-02 -0.400838058665778016E-09
+ -0.164360413187303008E-03 -0.404399206698849991E-09  0.822717692928500062E-02
+ -0.666666666666666970E-02 -0.435040480198734978E-03  0.921687629883806946E-02
+ -0.671540846988837015E-03  0.000000000000000000E+00 -0.435057906816285018E-03
+ -0.622872546234125974E-03  0.926585805175031071E-02 -0.666666666666666970E-02
+ -0.162127546305523010E-03  0.759121249965894005E-02 -0.382423911661819962E-11
+  0.000000000000000000E+00 -0.164583726826487010E-03 -0.546878146458841976E-09
+  0.822717706956129988E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166319285296618989E-03  0.755721766130180994E-02 -0.125511345494907995E-10
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166854277186167003E-03
+  0.000000000000000000E+00 -0.695070062425207962E-09  0.759121319044449988E-02
+ -0.435040455888697000E-03 -0.666666666666666970E-02  0.921687736420685050E-02
+ -0.671542633493042978E-03 -0.435057931124196984E-03 -0.622874398846488971E-03
+  0.926585992663164944E-02 -0.666666666666666970E-02 -0.435040480266267004E-03
+  0.921687629128858064E-02 -0.671540837058780997E-03 -0.435057906748759010E-03
+ -0.622872536301050019E-03  0.926585804176803009E-02 -0.164605382599736005E-03
+  0.822717713315264040E-02 -0.610689878464067020E-09 -0.162062559112368987E-03
+ -0.666666666666666970E-02 -0.413714002853130017E-11  0.755721765707715026E-02
+ -0.162062560979487995E-03 -0.666666666666666970E-02  0.755721765716044040E-02
+ -0.413721119467013017E-11 -0.164605391905931007E-03 -0.610700236970853951E-09
+  0.822717713430983974E-02 -0.162062559104255008E-03 -0.666666666666666970E-02
+  0.755721765270791958E-02 -0.413714010482357039E-11 -0.164605382602643998E-03
+ -0.610689887899547005E-09  0.822717713315264040E-02 -0.666666666666666970E-02
+ -0.162062002688690993E-03  0.755721765325861970E-02 -0.411602396034761983E-11
+ -0.164602611349319008E-03 -0.607618836657478970E-09  0.822717713082904076E-02
+ -0.435050795545137013E-03  0.926588559200255957E-02 -0.622885679652955006E-03
+ -0.435047591837533988E-03 -0.666666666666666970E-02 -0.671596715586648962E-03
+  0.921711691727978080E-02 -0.435047591829257026E-03 -0.666666666666666970E-02
+  0.921711503527408021E-02 -0.671594744907884041E-03 -0.435050795553413975E-03
+ -0.622883788869598950E-03  0.926588363014568990E-02 -0.666666666666666970E-02
+  0.921687630349790968E-02 -0.671540851815855032E-03 -0.622872550856051971E-03
+  0.926585805656933957E-02  0.822717712936564018E-02 -0.606929320764278968E-09
+ -0.666666666666666970E-02 -0.411128314877385017E-11  0.755721765699138986E-02
+ -0.162062559115238003E-03 -0.666666666666666970E-02  0.755721765701833012E-02
+ -0.413714010896968962E-11 -0.164605382614328987E-03 -0.610689894184595994E-09
+  0.822717713311110939E-02 -0.666666666666666970E-02 -0.435040480219567002E-03
+  0.921687629951207024E-02 -0.671540847113225037E-03 -0.435057906795455001E-03
+ -0.622872546234027962E-03  0.926585805179963064E-02 -0.162127546311748987E-03
+ -0.666666666666666970E-02  0.759121249955777965E-02 -0.382423940240163967E-11
+  0.000000000000000000E+00 -0.164583726855365006E-03 -0.546878175305287047E-09
+  0.822717706955716950E-02 -0.666666666666666970E-02 -0.166319285308019998E-03
+  0.755721766131254007E-02 -0.125511384896241995E-10  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166854277196930989E-03
+  0.000000000000000000E+00 -0.695070273739865966E-09  0.759121319033985009E-02
+ -0.666666666666666970E-02 -0.162062001821340992E-03  0.755721765238263031E-02
+ -0.411599097514774006E-11 -0.164602607032878005E-03 -0.607614054874119969E-09
+  0.822717713004762069E-02  0.926588367541618038E-02 -0.622883811249193005E-03
+ -0.666666666666666970E-02 -0.671594835876209036E-03  0.921711542242288975E-02
+ -0.666666666666666970E-02 -0.435047603237387977E-03  0.921711541881104976E-02
+ -0.671594832109726034E-03 -0.435050784145465008E-03 -0.622883807637388031E-03
+  0.926588367164974010E-02 -0.666666666666666970E-02  0.755721765237517013E-02
+ -0.411128320041531016E-11 -0.606929330558691026E-09  0.822717712936565058E-02
+ -0.435040480193840022E-03 -0.666666666666666970E-02  0.921687628881386924E-02
+ -0.671540836666198004E-03 -0.435057906821179974E-03 -0.622872536362236969E-03
+  0.926585804144416936E-02 -0.166666666666667011E-01 -0.435069629630581997E-03
+  0.194633698669611013E-01 -0.118122796424918004E-02 -0.435069154494976983E-03
+ -0.125613726459116001E-02  0.193884422615678009E-01 -0.166666666666667011E-01
+ -0.139671085486954007E-03  0.172964744640674989E-01 -0.239732978890832008E-07
+ -0.141641486800695994E-03 -0.121305359387927990E-05  0.178147059436198005E-01
+ -0.141641814770074998E-03  0.178147059816575991E-01 -0.121308374397845994E-05
+ -0.166666666666667011E-01 -0.139671142305139987E-03 -0.239739107040978989E-07
+  0.172964744651716990E-01 -0.166666666666667011E-01 -0.435068671068379023E-03
+  0.000000000000000000E+00  0.193848780140391010E-01 -0.125250859257347992E-02
+ -0.435070113055705985E-03 -0.117767774816952006E-02  0.194597410164681003E-01
+ -0.166666666666667011E-01  0.172969944943952014E-01 -0.779115312058558027E-07
+ -0.166666666666667011E-01 -0.755324935173564974E-05  0.179004996941746000E-01
+ -0.166666666666667011E-01  0.178976999748920000E-01 -0.387667548978359018E-05
+ -0.233628102655817991E-06  0.178135930268625009E-01 -0.166666666666667011E-01
+ -0.435068785870240020E-03  0.193888089807052987E-01 -0.125652619301678007E-02
+ -0.435069998254330982E-03 -0.118160505927794005E-02  0.194637572604987007E-01
+ -0.435033736890810016E-03 -0.166666666666667011E-01  0.193943160872508992E-01
+ -0.126292016425918010E-02  0.000000000000000000E+00 -0.435105043203752020E-03
+ -0.118838524744318999E-02  0.194704567784157005E-01 -0.166666666666667011E-01
+ -0.139670810780983011E-03  0.000000000000000000E+00  0.172964393749100998E-01
+ -0.239726372267789987E-07 -0.141640262301834013E-03 -0.121310025861443004E-05
+  0.178147059964809014E-01 -0.166666666666667011E-01 -0.435069265953867983E-03
+  0.193887440338862994E-01 -0.125644950186007995E-02 -0.435069518171818988E-03
+ -0.118152293796727997E-02  0.194636762447145013E-01 -0.140348173477511997E-03
+  0.178144318724902002E-01 -0.933813955352833025E-06 -0.166666666666667011E-01
+ -0.140348162575281002E-03 -0.933775488811803979E-06  0.178144292153939003E-01
+ -0.166666666666667011E-01 -0.139703533487054989E-03  0.172964344581288988E-01
+ -0.243260553268771015E-07 -0.141829183573385009E-03 -0.123048280177751006E-05
+  0.178147222203411011E-01 -0.435051982399214975E-03 -0.166666666666667011E-01
+  0.193933724724300004E-01 -0.126258834031118010E-02 -0.435086800763898984E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.118819422876379008E-02
+  0.194685490108550997E-01  0.000000000000000000E+00 -0.435025946733501019E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193872403842459996E-01 -0.125578622091835007E-02 -0.435112831393176015E-03
+ -0.118149517659288990E-02  0.194634742330271997E-01 -0.166666666666667011E-01
+  0.172964350463347995E-01 -0.239721729446109010E-07 -0.121308719330683991E-05
+  0.178147059846293990E-01 -0.166666666666667011E-01 -0.435069341371819001E-03
+  0.193887901446231983E-01 -0.125649495627712989E-02 -0.435069442753911013E-03
+ -0.118156624050173007E-02  0.194637211303591989E-01 -0.166666666666667011E-01
+  0.172964137870020990E-01 -0.947281877378979960E-08  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.426824290648908980E-06  0.177495617930084996E-01
+  0.000000000000000000E+00 -0.140229676531456008E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.177491875365273014E-01
+ -0.500657647124818999E-07 -0.140479994479306001E-03 -0.522090189834258032E-06
+  0.178140176711019983E-01 -0.435033290824524978E-03 -0.166666666666667011E-01
+  0.193943004566385011E-01 -0.126291331604961991E-02 -0.435105489168504994E-03
+ -0.118838501644233006E-02  0.194704545970095996E-01 -0.435268249897141982E-03
+ -0.166666666666667011E-01  0.194803294031692005E-01 -0.120221696823341996E-02
+ -0.434870408625407019E-03 -0.126728096347563995E-02  0.194000952173805016E-01
+ -0.166666666666667011E-01  0.172950455057520984E-01 -0.209695222919133997E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.104309476531645996E-05
+  0.178040398390986003E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.173086281050639987E-01 -0.234055546331482985E-07 -0.116921175397781010E-05
+  0.178149613334784990E-01 -0.166666666666667011E-01 -0.141165505762467993E-03
+  0.172962556489167016E-01 -0.473446914579739040E-05 -0.544658362791339019E-03
+ -0.346186312977599982E-04  0.183333438873188002E-01 -0.139754712611172996E-03
+ -0.166666666666667011E-01  0.176170424650436003E-01 -0.246162966049732998E-07
+ -0.140206415574070008E-03 -0.567757815085220049E-06  0.178140588186939987E-01
+ -0.165095245016433009E-03  0.178223322459852007E-01 -0.681146904710235036E-05
+ -0.166666666666667011E-01 -0.150451686795303991E-03 -0.294774223703715983E-06
+  0.176167018986788000E-01 -0.166666666666667011E-01 -0.421169644089434015E-03
+  0.189193940368806990E-01 -0.945437876210229950E-03 -0.448291096827642985E-03
+ -0.100548436568255006E-02  0.192537240725896008E-01 -0.166666666666667011E-01
+  0.173199119819372992E-01 -0.235984471741585014E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.123676172594726990E-05
+  0.178172740172132006E-01 -0.166666666666667011E-01 -0.435171428458813985E-03
+  0.194709389718316986E-01 -0.119092785784032009E-02 -0.434967322589985023E-03
+ -0.126074169121839991E-02  0.193933111634515994E-01 -0.166666666666667011E-01
+  0.172958818583838006E-01 -0.240054389113918998E-07 -0.121597095988231006E-05
+  0.178147088549027999E-01 -0.434490077538828977E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.192819870996423989E-01
+ -0.115810409218265005E-02 -0.435647599097352997E-03 -0.109302488191087002E-02
+  0.193702791631770004E-01 -0.166666666666667011E-01  0.172964231611969985E-01
+ -0.237439154341176999E-07  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.119986668002372008E-05
+  0.178139470404297014E-01 -0.166666666666667011E-01 -0.435069730380088988E-03
+  0.194633683526404012E-01 -0.118122901996727003E-02 -0.435069053745286004E-03
+ -0.125613277918751993E-02  0.193884385258774015E-01 -0.166666666666667011E-01
+ -0.139670526915550006E-03  0.172964750459083001E-01 -0.239672729870900994E-07
+ -0.141638257586788001E-03 -0.121275616494760993E-05  0.178147056890299998E-01
+ -0.141820432945145006E-03  0.178147229316946987E-01 -0.122951654344512007E-05
+ -0.166666666666667011E-01 -0.139702093156581013E-03 -0.243082078767443003E-07
+  0.172964754242462999E-01 -0.166666666666667011E-01 -0.435009064920768992E-03
+  0.193710715232028007E-01 -0.123960904534558994E-02 -0.435129707573877990E-03
+ -0.116591990546044996E-02  0.194474141497074995E-01 -0.166666666666667011E-01
+  0.172985028689811998E-01 -0.223410986359397003E-07 -0.166666666666667011E-01
+ -0.111564513924942993E-05  0.178095672953987004E-01 -0.166666666666667011E-01
+  0.193790139692226991E-01 -0.125224418475917997E-02 -0.118134652118899995E-02
+  0.194616264856153007E-01 -0.166666666666667011E-01 -0.435069813890499976E-03
+  0.194637397133314016E-01 -0.118159504407725001E-02 -0.435068970234672974E-03
+ -0.125650151292971989E-02  0.193888007309931006E-01 -0.435063659072025002E-03
+ -0.166666666666667011E-01  0.193957932538242998E-01 -0.126373923029710002E-02
+  0.000000000000000000E+00 -0.435075124949578019E-03 -0.118877268529867007E-02
+  0.194710559190932990E-01 -0.166666666666667011E-01 -0.139670655296777991E-03
+  0.172964394256112999E-01 -0.239709674501792008E-07 -0.141639364432354998E-03
+ -0.121301807421782997E-05  0.178147059272172996E-01 -0.166666666666667011E-01
+ -0.435069551479130975E-03  0.194636473054102983E-01 -0.118149814806462004E-02
+ -0.435069232646526994E-03 -0.125641682842023996E-02  0.193887163382647013E-01
+ -0.140347813456026012E-03  0.178144318351523001E-01 -0.933769427979961972E-06
+ -0.166666666666667011E-01 -0.140367975523149991E-03 -0.933880353406906957E-06
+  0.178144352914434007E-01 -0.166666666666667011E-01 -0.139694549084838003E-03
+  0.172964390046605986E-01 -0.242291801426243003E-07 -0.141777300691703004E-03
+ -0.122571812837964004E-05  0.178147201764872996E-01 -0.435047850309519990E-03
+ -0.166666666666667011E-01  0.193943737538610009E-01 -0.126316205126679004E-02
+  0.000000000000000000E+00 -0.435090932343969020E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.118860422039771007E-02
+  0.194699012602338010E-01  0.000000000000000000E+00 -0.435045457437436027E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193878817571609997E-01
+ -0.125604652907840004E-02 -0.435093324868215015E-03 -0.118146947412013995E-02
+  0.194635297377060007E-01 -0.139670603133990989E-03 -0.166666666666667011E-01
+  0.172964396533356012E-01 -0.239704033946370987E-07 -0.141639061330424995E-03
+ -0.121298992344433003E-05  0.178147059349155999E-01 -0.166666666666667011E-01
+ -0.435069549163963982E-03  0.194637077506569015E-01 -0.118155678661577995E-02
+ -0.435069234961695992E-03 -0.125647743137780000E-02  0.193887749967509987E-01
+ -0.139700055968098002E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964161546672983E-01 -0.947619561016756952E-08 -0.141286732534576009E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.426994435274359988E-06
+  0.177495882894109988E-01  0.000000000000000000E+00 -0.140193998721740011E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.177492151422632000E-01
+ -0.498003721032979000E-07 -0.140435023641922004E-03 -0.519120801586516000E-06
+  0.178140167474564008E-01 -0.435057327600382997E-03 -0.166666666666667011E-01
+  0.193955726939472016E-01 -0.126364253425448000E-02 -0.435081456064144001E-03
+ -0.118875145877563992E-02  0.194710320647638012E-01 -0.434101287475744974E-03
+ -0.166666666666667011E-01  0.193457297954395986E-01 -0.123123427263146005E-02
+ -0.436034461490277003E-03 -0.117022370416927993E-02  0.194484060865461987E-01
+ -0.166666666666667011E-01  0.172978496798759998E-01 -0.927299841537387054E-08
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.415800227862941001E-06  0.177484703644168011E-01 -0.166666666666667011E-01
+  0.193674050743437992E-01 -0.124713171719898993E-02 -0.118115734794936003E-02
+  0.194600874520639011E-01 -0.166666666666667011E-01 -0.141885046454967004E-03
+  0.173019565602799998E-01 -0.104149302119304992E-04 -0.548756258214203995E-03
+ -0.211535543706460008E-03  0.185091585802883984E-01 -0.140739916525857997E-03
+ -0.166666666666667011E-01  0.175467171312232995E-01 -0.284331238335905014E-07
+ -0.142759739737579006E-03 -0.795488176607448023E-06  0.178133916912854989E-01
+ -0.159214075622715010E-03  0.178183546415562992E-01 -0.506772449993064000E-05
+ -0.166666666666667011E-01 -0.147688204700038996E-03 -0.193011674962318992E-06
+  0.175471635134074010E-01 -0.166666666666667011E-01 -0.390248902500896981E-03
+  0.183597810468748013E-01 -0.389827888875239994E-03 -0.472748684898519024E-03
+ -0.455376277614669004E-03  0.187227106527687003E-01 -0.166666666666667011E-01
+  0.184527954209229988E-01 -0.738253265195438005E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.918001551148692977E-03  0.188197523488804995E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.405995886601362996E-03  0.000000000000000000E+00  0.187454059199008011E-01
+ -0.889550329385700050E-03 -0.461282908632618994E-03 -0.106896425909096997E-02
+  0.192970827972911013E-01 -0.166666666666667011E-01  0.173071968997474991E-01
+ -0.234787153019127012E-07 -0.117638457460658001E-05  0.178149781133057003E-01
+ -0.141084828730741996E-03 -0.166666666666667011E-01  0.177480973766994017E-01
+ -0.558748334339287969E-07 -0.141568067840571008E-03 -0.593631384362742965E-06
+  0.178140924667235002E-01 -0.139670647362560011E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172964277750680996E-01 -0.239358302523397992E-07
+ -0.141638396626583005E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.121100320738714998E-05  0.178145893661829993E-01
+ -0.166666666666667011E-01 -0.139670792138544994E-03  0.172964364510580990E-01
+ -0.239726277356469989E-07 -0.141640184766718993E-03 -0.121310638575149002E-05
+  0.178147060027850010E-01 -0.166666666666667011E-01 -0.140348083609600002E-03
+  0.178144298829951003E-01 -0.933774429165531998E-06 -0.140348091728343988E-03
+ -0.933803076573281956E-06  0.178144318618429984E-01 -0.435069485970576990E-03
+  0.194636360494852001E-01 -0.118148312544618999E-02 -0.166666666666667011E-01
+ -0.435069298155133020E-03 -0.125640935261609995E-02  0.193887056174587014E-01
+ -0.166666666666667011E-01 -0.139670745411716994E-03  0.172964393771044000E-01
+ -0.239719351623946988E-07 -0.141639884975219995E-03 -0.121306573571080992E-05
+  0.178147059632727985E-01 -0.166666666666667011E-01  0.193887976520876991E-01
+ -0.125650164472155992E-02 -0.166666666666667011E-01 -0.118157208029700004E-02
+  0.194637273004373001E-01 -0.166666666666667011E-01  0.193887977203392008E-01
+ -0.125650167109372002E-02 -0.118157208884185996E-02  0.194637273863749009E-01
+ -0.166666666666667011E-01 -0.435069389748935983E-03  0.193887814175709994E-01
+ -0.125648501071033009E-02 -0.435069394376801999E-03 -0.118155580222921008E-02
+  0.194637107296697989E-01 -0.140178146762424992E-03 -0.166666666666667011E-01
+  0.177492251547946012E-01 -0.496800708557238970E-07  0.000000000000000000E+00
+ -0.140415043495099005E-03 -0.517779583567776022E-06  0.178140139542000003E-01
+ -0.166666666666667011E-01 -0.435069391170850978E-03  0.193884418595630000E-01
+ -0.125613777078238003E-02 -0.435069392954887005E-03 -0.118121619467206006E-02
+  0.194633634756040015E-01 -0.166666666666667011E-01 -0.139670761471267005E-03
+  0.172964743992243992E-01 -0.239698239143705995E-07 -0.141639617585322011E-03
+ -0.121288295374220008E-05  0.178147057813429002E-01 -0.141639619900249991E-03
+  0.178147057815410993E-01 -0.121288316535549008E-05 -0.166666666666667011E-01
+ -0.139670761872411997E-03 -0.239698282191903987E-07  0.172964743992280005E-01
+ -0.166666666666667011E-01 -0.435069391157123027E-03  0.193887088952683015E-01
+ -0.125641082509626010E-02 -0.435069392968615010E-03 -0.118148323088716994E-02
+  0.194636365300335998E-01 -0.139670783971901987E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.172964393406603995E-01 -0.239438477767506995E-07
+ -0.141639278555754001E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.121143526565240011E-05  0.178146131748509011E-01  0.000000000000000000E+00
+ -0.435064994003248019E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193885566142219003E-01 -0.125634391561759004E-02 -0.435073790061057985E-03
+ -0.118148087424704991E-02  0.194636165590847988E-01 -0.139670564073325994E-03
+ -0.166666666666667011E-01  0.172964393993381010E-01 -0.239699852667253984E-07
+ -0.141638838042723001E-03 -0.121296980432410002E-05  0.178147058666821984E-01
+ -0.166666666666667011E-01 -0.435069390409400993E-03  0.193887087203693000E-01
+ -0.125641066243104005E-02 -0.435069393716336990E-03 -0.118148308295969000E-02
+  0.194636363738771995E-01 -0.139670603963582999E-03 -0.166666666666667011E-01
+  0.172964393580567997E-01 -0.239419221487562986E-07 -0.141638239923676007E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.121134061697532001E-05  0.178146131028351987E-01
+  0.000000000000000000E+00 -0.435064995135525016E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193885567877557993E-01 -0.125634406896881997E-02
+ -0.435073788928811995E-03 -0.118148100749493009E-02  0.194636167020634000E-01
+ -0.139670744060081990E-03 -0.166666666666667011E-01  0.172964393838881993E-01
+ -0.239719201961261003E-07 -0.141639877103680014E-03 -0.121306498448360998E-05
+  0.178147059625157998E-01 -0.435069391187604992E-03 -0.166666666666667011E-01
+  0.193887118630177990E-01 -0.125641385907748998E-02 -0.435069392938132991E-03
+ -0.118148619756011002E-02  0.194636395637266015E-01 -0.166666666666667011E-01
+  0.172965077542917012E-01 -0.927568247376278936E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.102046200973755008E-05  0.173051339160060005E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.193887977549710017E-01
+ -0.125650168627743997E-02 -0.118157208932238010E-02  0.194637273908665996E-01
+ -0.166666666666667011E-01 -0.435069390872567011E-03  0.193884418484903988E-01
+ -0.125613776548786000E-02 -0.435069393253171026E-03 -0.118121619377422010E-02
+  0.194633634734872989E-01 -0.139670761455125999E-03 -0.166666666666667011E-01
+  0.172964743993230009E-01 -0.239698237345604007E-07 -0.141639617491158998E-03
+ -0.121288294467888995E-05  0.178147057813337999E-01 -0.141639617495969007E-03
+  0.178147057813342995E-01 -0.121288294511917005E-05 -0.166666666666667011E-01
+ -0.139670761455959995E-03 -0.239698237435694994E-07  0.172964743993230009E-01
+ -0.166666666666667011E-01 -0.435069391186033983E-03  0.193887118629635993E-01
+ -0.125641385905383009E-02 -0.435069392939704000E-03 -0.118148619755951002E-02
+  0.194636395637197007E-01 -0.166666666666667011E-01  0.172965077542917012E-01
+ -0.927568247375443947E-07  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.102046200973782007E-05  0.173051339160060005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.139674680709096002E-03
+  0.000000000000000000E+00  0.173041376249854009E-01 -0.235410513736254011E-07
+ -0.141584640574228987E-03 -0.117464392253343004E-05  0.178146677207011994E-01
+ -0.166666666666667011E-01  0.193887977549710017E-01 -0.125650168627745992E-02
+ -0.118157208932238010E-02  0.194637273908665996E-01 -0.139674680709097004E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.173041376249854009E-01 -0.235410513736612015E-07 -0.141584640574232999E-03
+ -0.117464392253384995E-05  0.178146677207011994E-01 -0.139700009140160993E-03
+ -0.166666666666667011E-01  0.172964151618227001E-01 -0.947777749626817048E-08
+ -0.141286602860218007E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.427075077777844986E-06
+  0.177495996071506991E-01 -0.166666666666667011E-01 -0.139670794784869993E-03
+  0.172964386794358001E-01 -0.239725105714824009E-07 -0.141640177118637010E-03
+ -0.121309559766888995E-05  0.178147059913013987E-01 -0.166666666666667011E-01
+ -0.435069295710136016E-03  0.193887241621244003E-01 -0.125642846904032001E-02
+ -0.435069488415573019E-03 -0.118150189715466995E-02  0.194636550484258014E-01
+ -0.140349307522104008E-03  0.178144320174323002E-01 -0.933958710839114049E-06
+ -0.166666666666667011E-01 -0.140349299171436005E-03 -0.933929273088272012E-06
+  0.178144299843184008E-01 -0.166666666666667011E-01 -0.435068659479091984E-03
+  0.193848776262358011E-01 -0.125250842953309989E-02 -0.435070124644940006E-03
+ -0.117767775442942997E-02  0.194597409762624005E-01 -0.166666666666667011E-01
+  0.172969944928271988E-01 -0.779115314441779059E-07 -0.166666666666667011E-01
+ -0.755324939016156960E-05  0.179004996942138013E-01 -0.166666666666667011E-01
+  0.178976999748929992E-01 -0.387667548981126021E-05 -0.233628102656007991E-06
+  0.178135930268625009E-01 -0.166666666666667011E-01 -0.139686009556357012E-03
+  0.172964370559683007E-01 -0.241363115163189999E-07 -0.141728010347422987E-03
+ -0.122115135534191003E-05  0.178147135015842006E-01 -0.435039396134656023E-03
+ -0.166666666666667011E-01  0.193948601305157994E-01 -0.126336416797764991E-02
+  0.000000000000000000E+00 -0.435099385138650019E-03 -0.118873902082604994E-02
+  0.194708368874158995E-01 -0.166666666666667011E-01 -0.435069389255753991E-03
+  0.193884422504904987E-01 -0.125613821100521001E-02 -0.435069394869983992E-03
+ -0.118121665337718993E-02  0.194633639337775007E-01 -0.166666666666667011E-01
+ -0.139671380124092994E-03  0.172964743448792008E-01 -0.239764688220868006E-07
+ -0.141643188367998986E-03 -0.121320975071519003E-05  0.178147060922408014E-01
+ -0.141639844172888998E-03  0.178147058017989017E-01 -0.121290395200246995E-05
+ -0.166666666666667011E-01 -0.139670800627354991E-03 -0.239702482700813011E-07
+  0.172964743394126014E-01 -0.166666666666667011E-01 -0.435068235431626006E-03
+  0.193889053227481005E-01 -0.125663654535791008E-02 -0.435070548689864018E-03
+ -0.118172116532760008E-02  0.194638725033138005E-01 -0.161678337891911993E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172964935301388005E-01
+ -0.110214153508716999E-06 -0.162422470311612014E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.120622187566448997E-05  0.173052150067069012E-01
+  0.000000000000000000E+00 -0.139675704206961997E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173040340098522989E-01 -0.235576501589924013E-07
+ -0.141591382930956008E-03 -0.117568258902886992E-05  0.178146687262424989E-01
+ -0.435067411942580010E-03 -0.166666666666667011E-01  0.193886625538398985E-01
+ -0.125640355926819004E-02 -0.435071372170706019E-03 -0.118150457761581010E-02
+  0.194636501951265008E-01 -0.166666666666667011E-01 -0.435069317670580980E-03
+  0.193887257892274983E-01 -0.125642966436278992E-02 -0.435069466455139981E-03
+ -0.118150274119342008E-02  0.194636560434883997E-01 -0.161601801934664006E-03
+ -0.166666666666667011E-01  0.172965054748792993E-01 -0.926502681628198944E-07
+ -0.162351306455949001E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.101988138676949999E-05
+  0.173051393629537996E-01  0.000000000000000000E+00 -0.139695639988150999E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.173041438368814991E-01
+ -0.237586532645376999E-07 -0.141701626147540003E-03 -0.118517343244098004E-05
+  0.178146776002247008E-01 -0.435031309394906995E-03 -0.166666666666667011E-01
+  0.193945788398839983E-01 -0.126324081474057004E-02 -0.435107470132360024E-03
+ -0.118873511395483993E-02  0.194708001084720993E-01 -0.435268203178909021E-03
+ -0.166666666666667011E-01  0.194803280841454006E-01 -0.120221471825894999E-02
+ -0.434870455402753989E-03 -0.126728100453280998E-02  0.194000950994309002E-01
+ -0.166666666666667011E-01  0.172950455023187996E-01 -0.209695226067755011E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.104309478839939992E-05
+  0.178040398395854990E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.173086281051332003E-01 -0.234055546297331013E-07 -0.116921175366159004E-05
+  0.178149613334783012E-01 -0.166666666666667011E-01 -0.141165531303238012E-03
+  0.172962108365329013E-01 -0.473445694664380988E-05 -0.544660594244451043E-03
+ -0.346550124406059992E-04  0.183333673483180998E-01 -0.139754726222278999E-03
+ -0.166666666666667011E-01  0.176170084219189997E-01 -0.246149781576928007E-07
+ -0.140206540519620008E-03 -0.567788699156871954E-06  0.178140588497450007E-01
+ -0.165095492820405989E-03  0.178223326710732002E-01 -0.681196695094317026E-05
+ -0.166666666666667011E-01 -0.150450799069272987E-03 -0.294765827682543002E-06
+  0.176166680355560011E-01 -0.166666666666667011E-01 -0.421169440873413019E-03
+  0.189193905894523007E-01 -0.945436157371731009E-03 -0.448291280314589975E-03
+ -0.100548420996315999E-02  0.192537236307863992E-01 -0.166666666666667011E-01
+  0.173199119681975988E-01 -0.235984477489150007E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.123676178966298004E-05
+  0.178172740173286014E-01 -0.166666666666667011E-01 -0.435171428461128974E-03
+  0.194709389729267983E-01 -0.119092785894799004E-02 -0.434967322587667975E-03
+ -0.126074169224678996E-02  0.193933111644412001E-01 -0.166666666666667011E-01
+  0.172958818582512990E-01 -0.240054389194565998E-07 -0.121597096057541996E-05
+  0.178147088549035007E-01 -0.434490077563752020E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.192819871000558009E-01
+ -0.115810409202821997E-02 -0.435647599072524984E-03 -0.109302488139546004E-02
+  0.193702791629623006E-01 -0.435071227105071981E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.194623470555931009E-01 -0.118137412384183003E-02
+ -0.435067557009957988E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.125600599101201998E-02  0.193876331668423017E-01
+ -0.166666666666667011E-01 -0.139670755029073005E-03  0.172964393832697010E-01
+ -0.239720378568450991E-07 -0.141639940417128004E-03 -0.121307076941768003E-05
+  0.178147059675646015E-01 -0.166666666666667011E-01 -0.435069370632784017E-03
+  0.193887126423957987E-01 -0.125641513719206011E-02 -0.435069413492952990E-03
+ -0.118148776859817008E-02  0.194636409705596994E-01 -0.140350151817307989E-03
+  0.178144321253345984E-01 -0.934063912396124954E-06 -0.166666666666667011E-01
+ -0.140350134684648006E-03 -0.934057299894995007E-06  0.178144316718926998E-01
+ -0.166666666666667011E-01 -0.435068157528115975E-03  0.193838368298109992E-01
+ -0.125145454139508993E-02 -0.435070626592773998E-03 -0.117665452507550007E-02
+  0.194586918508348992E-01 -0.166666666666667011E-01  0.172970581684627998E-01
+ -0.692387039594140975E-18 -0.166666666666667011E-01 -0.116362368857371003E-16
+  0.173226956581742017E-01 -0.166666666666667011E-01  0.173227088035448999E-01
+ -0.227142423077076990E-07 -0.109342148655364995E-05  0.178145868761214014E-01
+ -0.166666666666667011E-01 -0.139672882575824002E-03  0.172964387861730996E-01
+ -0.239948730000151010E-07 -0.141652224900244998E-03 -0.121419432935652996E-05
+  0.178147068681758017E-01 -0.435063160018721003E-03 -0.166666666666667011E-01
+  0.193888839247167016E-01 -0.125673333404555001E-02  0.000000000000000000E+00
+ -0.435075623983676026E-03 -0.118189505882440008E-02  0.194640013018383996E-01
+ -0.166666666666667011E-01 -0.435069389648734019E-03  0.000000000000000000E+00
+  0.193884419847699004E-01 -0.125613793152611996E-02 -0.435069394477004018E-03
+ -0.118121637448865007E-02  0.194633636498748007E-01 -0.166666666666667011E-01
+ -0.139670964419093003E-03  0.172964743771341009E-01 -0.239720016692284015E-07
+ -0.141640788952219994E-03 -0.121299004622935002E-05  0.178147058760876990E-01
+ -0.141639744525080005E-03  0.178147057920136007E-01 -0.121289465915329990E-05
+ -0.166666666666667011E-01 -0.139670783426588003E-03 -0.239700609664733988E-07
+  0.172964743758160996E-01 -0.166666666666667011E-01 -0.435069015984241008E-03
+  0.193887360934393016E-01 -0.125644691154631004E-02 -0.435069768141048007E-03
+ -0.118152418161517991E-02  0.194636756633727002E-01 -0.161605811424037006E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172965033247536003E-01
+ -0.935932238155109053E-07 -0.162355789101947996E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.103050516658616990E-05  0.173051502951346997E-01
+  0.000000000000000000E+00 -0.139674971988557002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173041442833562001E-01 -0.235436840180659000E-07
+ -0.141586201352507011E-03 -0.117475642480550998E-05  0.178146678139698006E-01
+ -0.435068868942317015E-03 -0.166666666666667011E-01  0.193886960011524015E-01
+ -0.125640827839761994E-02 -0.435069915182551979E-03 -0.118148826831425001E-02
+  0.194636394345385003E-01 -0.166666666666667011E-01 -0.435069350378355024E-03
+  0.193887155593089992E-01 -0.125641852657909004E-02 -0.435069433747378026E-03
+ -0.118149137295133996E-02  0.194636445794403000E-01 -0.161601725677973997E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172965069829389001E-01
+ -0.926650122837621983E-07 -0.162351498594983989E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.102023342811811993E-05  0.173051439965640000E-01
+  0.000000000000000000E+00 -0.139679244153132989E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173041479614229991E-01 -0.235877316475454016E-07
+ -0.141610020722497009E-03 -0.117688204166686002E-05  0.178146696122764014E-01
+ -0.435061149026493986E-03 -0.166666666666667011E-01  0.193888142839695986E-01
+ -0.125670280436314006E-02 -0.435077634883449980E-03 -0.118189407884959998E-02
+  0.194639921507386016E-01 -0.415397337630780993E-03 -0.166666666666667011E-01
+  0.188278985006655984E-01 -0.898332650045155024E-03 -0.453392799083327013E-03
+ -0.100006104572014989E-02  0.192410591480840984E-01 -0.166666666666667011E-01
+  0.173196351833987995E-01 -0.236101000931807013E-07  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.123805476906965002E-05
+  0.178172764972891993E-01 -0.166666666666667011E-01  0.172958803102313992E-01
+ -0.240055324643047990E-07 -0.121597905454717007E-05  0.178147088629614994E-01
+ -0.166666666666667011E-01 -0.139361332584411003E-03  0.172963780471655990E-01
+ -0.206406041294607990E-07 -0.139853947917575004E-03 -0.104886506051046009E-05
+  0.178144680355106985E-01 -0.482519288981585995E-03 -0.166666666666667011E-01
+  0.184437042794661984E-01 -0.182960367117892994E-03 -0.375083574717954013E-03
+ -0.210197986405779994E-03  0.183325256678834002E-01 -0.143207812762676998E-03
+  0.178148150557865988E-01 -0.128505524246536990E-05 -0.166666666666667011E-01
+ -0.199761155404034002E-03 -0.199498340007143991E-05  0.178294573883331017E-01
+ -0.166666666666667011E-01 -0.433603776206709980E-03  0.193497604591337004E-01
+ -0.124503388545287009E-02 -0.436528045694049990E-03 -0.119072564830830993E-02
+  0.194675615509681994E-01 -0.166666666666667011E-01  0.172951192308572997E-01
+ -0.214703230748565997E-07  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.107141838335927994E-05  0.178058595670048010E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.434603197488181014E-03
+  0.000000000000000000E+00  0.193058537142562006E-01 -0.118063268562849000E-02
+ -0.435534875266554988E-03 -0.111369468800605006E-02  0.193919603585517991E-01
+ -0.166666666666667011E-01  0.173056689022044012E-01 -0.235558337256563013E-07
+ -0.118363055389575001E-05  0.178149857852359997E-01 -0.435171466645126008E-03
+ -0.166666666666667011E-01  0.194709518251613006E-01 -0.119094077856059005E-02
+ -0.434967284379077007E-03 -0.126075351291434000E-02  0.193933220870970008E-01
+ -0.435071725054391020E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.194634518078929016E-01 -0.118146952991411008E-02 -0.435067059054058004E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.125632584199591004E-02  0.193884910636192009E-01 -0.166666666666667011E-01
+ -0.139670745136767009E-03  0.172964396823254013E-01 -0.239719121263430998E-07
+ -0.141639880244726003E-03 -0.121306391322725994E-05  0.178147059609039017E-01
+ -0.166666666666667011E-01 -0.140348087747312988E-03  0.178144318030939997E-01
+ -0.933800582768691986E-06 -0.140348087981950005E-03 -0.933801410703608978E-06
+  0.178144318602838012E-01 -0.435069394684114985E-03  0.194636364171987992E-01
+ -0.118148312778892993E-02 -0.166666666666667011E-01 -0.435069389441622998E-03
+ -0.125641069437981007E-02  0.193887087332374995E-01 -0.166666666666667011E-01
+ -0.139670745366996012E-03  0.172964393825588009E-01 -0.239719343252174001E-07
+ -0.141639884660971991E-03 -0.121306568223363996E-05  0.178147059632162985E-01
+ -0.166666666666667011E-01  0.193887976553534999E-01 -0.125650164603376005E-02
+ -0.166666666666667011E-01 -0.118157208059822003E-02  0.194637273034572005E-01
+ -0.166666666666667011E-01  0.193887977225202998E-01 -0.125650167205010994E-02
+ -0.118157208887189995E-02  0.194637273866554994E-01 -0.166666666666667011E-01
+ -0.435069390858170975E-03  0.193887118569476997E-01 -0.125641385953187998E-02
+ -0.435069393267567008E-03 -0.118148620273378011E-02  0.194636395676883005E-01
+ -0.139674686768931008E-03 -0.166666666666667011E-01  0.173041492305231991E-01
+ -0.235404449719397016E-07  0.000000000000000000E+00 -0.141584559768768005E-03
+ -0.117458831050516992E-05  0.178146676653484007E-01 -0.166666666666667011E-01
+ -0.139670743889800991E-03  0.000000000000000000E+00  0.172964393844355011E-01
+ -0.239719183336094004E-07 -0.141639876115250000E-03 -0.121306489168701010E-05
+  0.178147059624313986E-01 -0.166666666666667011E-01 -0.140348087304844010E-03
+  0.178144318418145993E-01 -0.933801043769264029E-06 -0.140348087380234008E-03
+ -0.933801309788676977E-06  0.178144318601899006E-01 -0.435069392936427975E-03
+  0.194636364108725993E-01 -0.118148311418031005E-02 -0.166666666666667011E-01
+ -0.435069391189310008E-03 -0.125641070624249993E-02  0.193887087796953995E-01
+ -0.166666666666667011E-01 -0.435069391192276006E-03  0.193887815134154012E-01
+ -0.125648507879496003E-02 -0.435069392933461977E-03 -0.118155584780088997E-02
+  0.194637107833945007E-01 -0.139700198050289988E-03 -0.166666666666667011E-01
+  0.172964151566850008E-01 -0.947849280649777055E-08 -0.141287467726875003E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.427106383900035001E-06  0.177495992954622012E-01  0.000000000000000000E+00
+ -0.140178145665099013E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.177492248107081994E-01 -0.496797031657574988E-07 -0.140415043450978997E-03
+ -0.517779495462221951E-06  0.178140139541173997E-01 -0.435069391108214992E-03
+ -0.166666666666667011E-01  0.193887087862484007E-01 -0.125641071469571007E-02
+ -0.435069393017522990E-03 -0.118148312365616006E-02  0.194636364200341007E-01
+ -0.166666666666667011E-01 -0.139670745360696987E-03  0.172964393844495003E-01
+ -0.239719341341385992E-07 -0.141639884605172008E-03 -0.121306566857870991E-05
+  0.178147059632026983E-01 -0.435069390272270994E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.193887086899636012E-01 -0.125641067599288007E-02
+ -0.435069393853468019E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.118148311480717992E-02  0.194636363313261986E-01  0.000000000000000000E+00
+ -0.435069390212755017E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193887087459734993E-01 -0.125641069148615990E-02 -0.435069393912983020E-03
+ -0.118148311376017998E-02  0.194636364065413001E-01 -0.139670743899566996E-03
+ -0.166666666666667011E-01  0.172964393844273999E-01 -0.239719184390481014E-07
+ -0.141639876171704000E-03 -0.121306489688977996E-05  0.178147059624365993E-01
+ -0.435069391188922026E-03 -0.166666666666667011E-01  0.193887118630630996E-01
+ -0.125641385909724002E-02 -0.435069392936816011E-03 -0.118148619756053004E-02
+  0.194636395637322983E-01 -0.166666666666667011E-01  0.172965077542917012E-01
+ -0.927568247378449987E-07  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.102046200973719009E-05  0.173051339160060005E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.193887977549708004E-01 -0.125650168627737991E-02
+ -0.118157208932236990E-02  0.194637273908664990E-01 -0.166666666666667011E-01
+ -0.435069391170655984E-03  0.193884418583357004E-01 -0.125613776952834995E-02
+ -0.435069392955081998E-03 -0.118121619344725010E-02  0.194633634743562010E-01
+ -0.139670761454458998E-03 -0.166666666666667011E-01  0.172964743993847016E-01
+ -0.239698237234471001E-07 -0.141639617486673003E-03 -0.121288294398923001E-05
+  0.178147057813330990E-01 -0.141639617486714989E-03  0.178147057813330990E-01
+ -0.121288294399298003E-05 -0.166666666666667011E-01 -0.139670761454465991E-03
+ -0.239698237234977997E-07  0.172964743993847016E-01 -0.166666666666667011E-01
+ -0.435069391188907986E-03  0.193887118630626000E-01 -0.125641385909704010E-02
+ -0.435069392936829997E-03 -0.118148619756053004E-02  0.194636395637322011E-01
+ -0.166666666666667011E-01  0.172965077542917012E-01 -0.927568247378397048E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.102046200973721995E-05
+  0.173051339160060005E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.139674680709099010E-03  0.000000000000000000E+00
+  0.173041376249854009E-01 -0.235410513737083011E-07 -0.141584640574244004E-03
+ -0.117464392253482002E-05  0.178146677207011994E-01 -0.166666666666667011E-01
+  0.193887977549708004E-01 -0.125650168627737991E-02 -0.118157208932236990E-02
+  0.194637273908664990E-01 -0.139674680709099010E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.173041376249854009E-01
+ -0.235410513737084004E-07 -0.141584640574244004E-03 -0.117464392253482002E-05
+  0.178146677207011994E-01 -0.161601720191613987E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172965076669214998E-01 -0.926701710590744007E-07
+ -0.162351552424590994E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.102033061970779005E-05  0.173051453907301991E-01
+ -0.166666666666667011E-01 -0.140025227157220011E-03  0.172964538609525001E-01
+ -0.280836077054835003E-07 -0.143700853603787992E-03 -0.141493042418215998E-05
+  0.178149166877635996E-01 -0.166666666666667011E-01 -0.434397138667670006E-03
+  0.193663283421353014E-01 -0.124681061178207996E-02 -0.435740193435873980E-03
+ -0.118155329481477991E-02  0.194610974716315013E-01 -0.435888399868572985E-03
+  0.193018909827777987E-01 -0.102736762162752003E-02 -0.166666666666667011E-01
+ -0.434248105255362999E-03 -0.108835030296548004E-02  0.192105826235978994E-01
+ -0.166666666666667011E-01 -0.142766725250475000E-03  0.173186084279148000E-01
+ -0.107223373775717997E-06 -0.158331602080537010E-03 -0.502583390911269042E-05
+  0.178188218385040009E-01 -0.166666666666667011E-01  0.191887710237797007E-01
+ -0.116955156282324006E-02 -0.166666666666667011E-01 -0.115321249659207001E-02
+  0.193095009677901989E-01 -0.166666666666667011E-01  0.192716415760125007E-01
+ -0.120400166465171009E-02 -0.117888093553793010E-02  0.194469246141813004E-01
+ -0.166666666666667011E-01 -0.139668701380672987E-03  0.172963839829421012E-01
+ -0.239535905300788984E-07 -0.141628653565274010E-03 -0.121228849838231003E-05
+  0.178147051859353991E-01 -0.435070871110831980E-03 -0.166666666666667011E-01
+  0.194629509412433989E-01 -0.118084397985067997E-02  0.000000000000000000E+00
+ -0.435067913007957025E-03 -0.125568351559249996E-02  0.193879990665461986E-01
+ -0.166666666666667011E-01 -0.435069390650039015E-03  0.000000000000000000E+00
+  0.193884418750694017E-01 -0.125613779809562003E-02 -0.435069393475699022E-03
+ -0.118121622922623989E-02  0.194633635071832997E-01 -0.166666666666667011E-01
+ -0.139670790826866987E-03  0.172964743947118006E-01 -0.239701379014204989E-07
+ -0.141639787000618012E-03 -0.121289838766225000E-05  0.178147057914929997E-01
+ -0.141639653189466996E-03  0.178147057839792011E-01 -0.121288622158938009E-05
+ -0.166666666666667011E-01 -0.139670767633548004E-03 -0.239698902061346000E-07
+  0.172964743947353998E-01 -0.166666666666667011E-01 -0.435069338865947026E-03
+  0.193887012207536007E-01 -0.125640423249911005E-02 -0.435069445259782012E-03
+ -0.118147761261617004E-02  0.194636302225951012E-01 -0.161594071152451998E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172965071943530987E-01
+ -0.911198389499341023E-07 -0.162344730250092007E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.100404518995791990E-05  0.173051412553598995E-01
+  0.000000000000000000E+00 -0.139674707461267010E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173041614291120985E-01 -0.235399540925235988E-07
+ -0.141584554759037012E-03 -0.117453693858401004E-05  0.178146676069954012E-01
+ -0.435069368728934001E-03 -0.166666666666667011E-01  0.193887057094737983E-01
+ -0.125640804230569007E-02 -0.435069415396801976E-03 -0.118148083984231998E-02
+  0.194636339567480984E-01 -0.166666666666667011E-01 -0.435069379906770002E-03
+  0.193887120222560987E-01 -0.125641427645285992E-02 -0.435069404218967981E-03
+ -0.118148677817681990E-02  0.194636400648426008E-01 -0.161601688606437998E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172965075862324992E-01
+ -0.926646811008730053E-07 -0.162351530879915987E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.102027850567167008E-05  0.173051453753680015E-01
+  0.000000000000000000E+00 -0.139674509841842008E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173041492584409991E-01 -0.235385724457535989E-07
+ -0.141583570403258998E-03 -0.117449718296302996E-05  0.178146674600714004E-01
+ -0.435069784091323016E-03 -0.166666666666667011E-01  0.194629197189570002E-01
+ -0.118078836231266999E-02 -0.435069000033927022E-03 -0.125568468813201011E-02
+  0.193879970350075009E-01 -0.371043793435626003E-03 -0.166666666666667011E-01
+  0.180438947424557995E-01 -0.288534286733046986E-03 -0.486290336507850022E-03
+ -0.560900198964180987E-03  0.187957070663157012E-01 -0.166666666666667011E-01
+  0.177374439856716991E-01 -0.214133613745063999E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.267021979533145016E-03
+  0.179665664602573008E-01 -0.166666666666667011E-01  0.193465910019366000E-01
+ -0.123791262554833999E-02 -0.118077342883706003E-02  0.194573127658026999E-01
+ -0.166666666666667011E-01 -0.139258097457669987E-03  0.172965222480018985E-01
+ -0.139396279238899008E-07 -0.139258097457669987E-03 -0.716212299733630966E-06
+  0.178142098387169996E-01 -0.448746699171803979E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.187014712985399990E-01 -0.470424366568322023E-03
+ -0.420614096697062990E-03 -0.416028776742435001E-03  0.185800358532766004E-01
+ -0.143647797085317999E-03  0.178142596258036995E-01 -0.784777563840381003E-06
+ -0.166666666666667011E-01 -0.289432939916457006E-03 -0.392675356000534004E-05
+  0.178691669573259014E-01 -0.166666666666667011E-01 -0.433838074751410983E-03
+  0.193348477920875991E-01 -0.122492972126329001E-02 -0.436295777836767011E-03
+ -0.116751654000400999E-02  0.194447290230824014E-01 -0.166666666666667011E-01
+  0.173080151788632987E-01 -0.244742519208693003E-04  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.133996458919445003E-03  0.178636488290151996E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.139757971205888996E-03  0.000000000000000000E+00  0.173348621828142008E-01
+ -0.229201529246951995E-07 -0.141718205587288988E-03 -0.107549290031851995E-05
+  0.178145692111312988E-01 -0.166666666666667011E-01  0.193848567630524014E-01
+ -0.125477293338469996E-02 -0.118151635484324991E-02  0.194632155620537983E-01
+ -0.140442473096456008E-03 -0.166666666666667011E-01  0.173484955878829006E-01
+ -0.294133603610693987E-07 -0.144782161123405996E-03 -0.133037267977017010E-05
+  0.178148273618939992E-01 -0.435071708554150990E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.194637484731423997E-01 -0.118167963393035996E-02
+ -0.435067075554543010E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.125644827298293993E-02  0.193888013904018990E-01
+ -0.166666666666667011E-01 -0.143972797957216006E-03  0.173505484374359004E-01
+ -0.481359947088405975E-04 -0.558385337699288022E-03 -0.438375372304428026E-03
+  0.187218716475071011E-01 -0.166666666666667011E-01 -0.139937861588156992E-03
+  0.174078894816094010E-01 -0.227986700629730998E-07 -0.141801880833368992E-03
+ -0.908696204756822957E-06  0.178144027207976993E-01 -0.143597614564550997E-03
+  0.178145584123696014E-01 -0.106316430421145007E-05 -0.166666666666667011E-01
+ -0.140417723829495007E-03 -0.268131472654641992E-07  0.174078938787147995E-01
+ -0.166666666666667011E-01 -0.434071246446273990E-03  0.193423068719833988E-01
+ -0.122826345821061989E-02 -0.436064307367708980E-03 -0.116769061492820003E-02
+  0.194456917758701990E-01 -0.166666666666667011E-01  0.172982030065722998E-01
+ -0.280949763883982991E-07 -0.166666666666667011E-01 -0.189517763026067993E-05
+  0.178271977135994013E-01 -0.166666666666667011E-01  0.178265008932220990E-01
+ -0.134657557070520009E-05 -0.925862840555429982E-06  0.178144080134048990E-01
+ -0.166666666666667011E-01 -0.434780858347314024E-03  0.193789566829400996E-01
+ -0.125221572856264000E-02 -0.435357659993625004E-03 -0.118148634697722000E-02
+  0.194624944824026008E-01 -0.139674508339166013E-03 -0.166666666666667011E-01
+  0.173041495432516984E-01 -0.235385803949071996E-07  0.000000000000000000E+00
+ -0.141583560353484997E-03 -0.117449727533117009E-05  0.178146675901048983E-01
+ -0.166666666666667011E-01 -0.139670744554657995E-03  0.172964393825064990E-01
+ -0.239719255799683993E-07 -0.141639879972026009E-03 -0.121306525215112003E-05
+  0.178147059627193002E-01 -0.166666666666667011E-01 -0.140348090038086001E-03
+  0.178144318165064999E-01 -0.933801051164943002E-06 -0.140348090679100993E-03
+ -0.933801691611406014E-06  0.178144318606493005E-01 -0.435069394145568998E-03
+  0.194636343488718000E-01 -0.118148110868380009E-02 -0.166666666666667011E-01
+ -0.435069389980168985E-03 -0.125640863633868995E-02  0.193887067279608985E-01
+ -0.166666666666667011E-01 -0.435069389466851027E-03  0.193887808952679999E-01
+ -0.125648448071444003E-02 -0.435069394658887010E-03 -0.118155528731557990E-02
+  0.194637102049162007E-01 -0.139700012758421009E-03 -0.166666666666667011E-01
+  0.172964152304735004E-01 -0.947803391050105932E-08  0.000000000000000000E+00
+ -0.141286629556396993E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.427087720755819975E-06  0.177496011881496003E-01
+  0.000000000000000000E+00 -0.140176041318299010E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.177492266992976995E-01 -0.496642740357883019E-07
+ -0.140412389918406994E-03 -0.517602927754366013E-06  0.178140137848851986E-01
+ -0.435070897570947004E-03 -0.166666666666667011E-01  0.194636050822587989E-01
+ -0.118148880805278990E-02 -0.435067886547593015E-03 -0.125633531890120007E-02
+  0.193886425635053997E-01 -0.166666666666667011E-01 -0.139669237663479995E-03
+  0.172964396325578015E-01 -0.239557312619153985E-07 -0.141631179766294000E-03
+ -0.121226845353324010E-05  0.178147051921049987E-01 -0.435070355283276994E-03
+ -0.166666666666667011E-01  0.194636498694088986E-01 -0.118156237601487000E-02
+ -0.435068428839514019E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.125638673489450003E-02
+  0.193887513139221999E-01  0.000000000000000000E+00 -0.435071250841686976E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.194636876903557000E-01
+ -0.118157615129436001E-02 -0.435067533273078989E-03 -0.125640800692181000E-02
+  0.193887125978184013E-01 -0.139670568327848998E-03 -0.166666666666667011E-01
+  0.172964396537817998E-01 -0.239700195293086008E-07 -0.141638860134678002E-03
+ -0.121297096046396990E-05  0.178147058843058995E-01 -0.435090508141796988E-03
+ -0.166666666666667011E-01  0.194669892927517990E-01 -0.118530039028566991E-02
+ -0.435048274568398019E-03 -0.125913754523842994E-02  0.193915247098193008E-01
+ -0.166666666666667011E-01  0.172960984745374989E-01 -0.213609391405068995E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.106367312891359006E-05
+  0.178056546914439999E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.173098191165470990E-01 -0.233466429807202985E-07 -0.116342624068746999E-05
+  0.178149495945450001E-01 -0.166666666666667011E-01 -0.435069713252911978E-03
+  0.194633204071541013E-01 -0.118118203409811997E-02 -0.435069070872497979E-03
+ -0.125608526726808992E-02  0.193883924273124987E-01 -0.139659157492490009E-03
+ -0.166666666666667011E-01  0.172964811115927999E-01 -0.238453400472707005E-07
+ -0.141572592558481008E-03 -0.120675012864579991E-05  0.178147008765491000E-01
+ -0.141538194636943009E-03  0.178146978498745015E-01 -0.120361727918045010E-05
+ -0.166666666666667011E-01 -0.139653196804214987E-03 -0.237816257834915005E-07
+  0.172964810525967003E-01 -0.166666666666667011E-01 -0.435101710749684009E-03
+  0.194659176105031993E-01 -0.118451620585209995E-02 -0.435037070057877023E-03
+ -0.125773914717291005E-02  0.193902105162142985E-01 -0.166666666666667011E-01
+  0.172962783354724016E-01 -0.218342668490586010E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.109032448231275997E-05  0.178073744716654991E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.434677413069518002E-03  0.000000000000000000E+00  0.193010409680071000E-01
+ -0.117438144583154004E-02 -0.435460867464983019E-03 -0.110662888765544992E-02
+  0.193848162965538015E-01 -0.166666666666667011E-01  0.173066829056823003E-01
+ -0.235026937709481004E-07 -0.117850668810495007E-05  0.178149746842341007E-01
+ -0.434571088710914019E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.192763238295202005E-01 -0.115086473839517999E-02
+ -0.435566874653087015E-03 -0.108492248964389996E-02  0.193620926343772010E-01
+ -0.161599193400744999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172964883975278011E-01 -0.923842913120058025E-07 -0.162350867043699006E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.101863211158590009E-05  0.173051440367491989E-01 -0.166666666666667011E-01
+ -0.435069857911570023E-03  0.194633765819712985E-01 -0.118123987997616003E-02
+ -0.435068926213479003E-03 -0.125613733340555994E-02  0.193884432348929991E-01
+ -0.166666666666667011E-01 -0.139666077646513989E-03  0.172964743925189991E-01
+ -0.239195397112311000E-07 -0.141612586198800003E-03 -0.121041044884479009E-05
+  0.178147032886921998E-01 -0.141624350828127994E-03  0.178147048678200005E-01
+ -0.121149428933203998E-05 -0.166666666666667011E-01 -0.139668115474050994E-03
+ -0.239415570468690011E-07  0.172964744463235016E-01 -0.166666666666667011E-01
+ -0.435074201017785996E-03  0.194493813662804001E-01 -0.116767077284960006E-02
+ -0.435064583034155982E-03 -0.124201442563575998E-02  0.193746726576543987E-01
+ -0.166666666666667011E-01  0.173081303334150990E-01 -0.244868683853623015E-04
+ -0.166666666666667011E-01 -0.133924491696816997E-03  0.178635877956252014E-01
+ -0.166666666666667011E-01  0.173348654764515983E-01 -0.223062641979707998E-07
+ -0.104752868600600992E-05  0.178145411980304008E-01 -0.166666666666667011E-01
+ -0.435078250390669987E-03  0.194615680563809991E-01 -0.117967106973176005E-02
+ -0.435060533485638016E-03 -0.125408416347905010E-02  0.193864744856198992E-01
+ -0.139507036577244005E-03 -0.166666666666667011E-01  0.173052664826232015E-01
+ -0.217638922762432993E-07  0.000000000000000000E+00 -0.140641592938044988E-03
+ -0.108598665496889000E-05  0.178145535889315011E-01 -0.166666666666667011E-01
+ -0.139669898838359002E-03  0.172964394567505994E-01 -0.239628351519636015E-07
+ -0.141634997663802990E-03 -0.121261808844727004E-05  0.178147055141749007E-01
+ -0.166666666666667011E-01 -0.140550312154719988E-03  0.178144664524045003E-01
+ -0.935528049442035979E-06 -0.140352831077276009E-03 -0.934435514417231040E-06
+  0.178144323723362016E-01 -0.435067799539816001E-03  0.193882988824195003E-01
+ -0.125598375468558010E-02 -0.166666666666667011E-01 -0.435070984577867010E-03
+ -0.118114930153955010E-02  0.194632560036250013E-01 -0.166666666666667011E-01
+ -0.435066606879862022E-03  0.193887645192647000E-01 -0.125653020767648002E-02
+ -0.435072177221240990E-03 -0.118164241777987005E-02  0.194637770279468998E-01
+ -0.139525322491481995E-03 -0.166666666666667011E-01  0.172964018029032988E-01
+ -0.889114618565385951E-08  0.000000000000000000E+00 -0.140487892558472998E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.401772060623745995E-06  0.177503217815941985E-01  0.000000000000000000E+00
+ -0.139680840737954994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.177499929679008003E-01 -0.461429010297917004E-07 -0.139808785077553992E-03
+ -0.474216741479885991E-06  0.178139754223665993E-01 -0.435490707175785019E-03
+ -0.166666666666667011E-01  0.194038024150119011E-01 -0.113136000392566002E-02
+ -0.434647496176783011E-03 -0.118521045340098008E-02  0.193210677155511990E-01
+ -0.166666666666667011E-01 -0.139356047055132996E-03  0.172964913696200991E-01
+ -0.205948470749521995E-07 -0.139855780445013013E-03 -0.104675229678023004E-05
+  0.178145500428797988E-01 -0.435278470720009021E-03 -0.166666666666667011E-01
+  0.194122813378182005E-01 -0.114346492577733008E-02 -0.434860167096455020E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.119664955180343990E-02  0.193456144785291995E-01
+  0.000000000000000000E+00 -0.435507216634916999E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.194759659828977992E-01 -0.120235295974980010E-02
+ -0.434630954199676010E-03 -0.125613199041933006E-02  0.193898784861901013E-01
+ -0.139505276818091995E-03 -0.166666666666667011E-01  0.172964538951550992E-01
+ -0.222158902483321995E-07 -0.140684567793642991E-03 -0.112653151422048998E-05
+  0.178145939457222992E-01 -0.142980894339064004E-03 -0.166666666666667011E-01
+  0.173314540736370995E-01 -0.109865934611584997E-06 -0.158534103555776988E-03
+ -0.501444248842295042E-05  0.178188430592681017E-01 -0.166666666666667011E-01
+  0.191833679361077016E-01 -0.116695965850981001E-02  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.115255810703607004E-02  0.193073017106333011E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172969314941423015E-01
+ -0.239425929155972008E-07 -0.121051376505055003E-05  0.178147034232610003E-01
+ -0.166666666666667011E-01 -0.435705239999077025E-03  0.190415627118516000E-01
+ -0.780602612534949010E-03 -0.434432059542404973E-03 -0.823587259891796967E-03
+  0.189719852118625000E-01 -0.147270631146331993E-03 -0.166666666666667011E-01
+  0.174669495140968994E-01 -0.100890785221647000E-03 -0.564704614718581016E-03
+ -0.477370877328551027E-03  0.187551209753295985E-01 -0.139439405915424988E-03
+  0.178140970873433002E-01 -0.575941857140058983E-06 -0.166666666666667011E-01
+ -0.139258097457669987E-03 -0.129861811110761993E-07  0.173691987550076005E-01
+ -0.166666666666667011E-01 -0.437167563287824015E-03  0.195228637533087016E-01
+ -0.126248812464285006E-02 -0.432956764514017976E-03 -0.125491318191218006E-02
+  0.193932789336802996E-01 -0.166666666666667011E-01  0.172966662190472999E-01
+ -0.121413434559184010E-06  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.972291627814046991E-05  0.179398884476568016E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.350105792758167024E-03
+  0.000000000000000000E+00  0.179358412564323000E-01 -0.564926759529481039E-05
+ -0.144639853954665995E-03 -0.314581136116586985E-06  0.178138319445949010E-01
+ -0.166666666666667011E-01  0.193703856784436003E-01 -0.124844632146220002E-02
+ -0.118120734927350994E-02  0.194604805412335984E-01 -0.431301131288118977E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.192675040792685999E-01 -0.119991065467065991E-02 -0.438790367540799003E-03
+ -0.117513383093279010E-02  0.194438480854111986E-01 -0.160851119958975013E-03
+ -0.166666666666667011E-01  0.172967403401987005E-01 -0.176667226349783012E-07
+ -0.161653225232298001E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.204715316608505000E-06
+  0.173054598817215001E-01 -0.166666666666667011E-01 -0.435630521898672989E-03
+  0.193599440196800983E-01 -0.109077905797679009E-02 -0.434507214005793005E-03
+ -0.113826801298030997E-02  0.192764807436608988E-01 -0.166666666666667011E-01
+ -0.139536670190099001E-03  0.173126649368885999E-01 -0.217199105747271010E-07
+ -0.140761521185631995E-03 -0.106837091099441006E-05  0.178145668888614987E-01
+ -0.559739137192076996E-03  0.187342109887928994E-01 -0.453812468447541010E-03
+ -0.166666666666667011E-01 -0.144437867213115988E-03 -0.570571543832924991E-04
+  0.173662394826720017E-01 -0.166666666666667011E-01 -0.141554480273217009E-03
+  0.173902726296936011E-01 -0.612977187708155032E-05 -0.543269256103985979E-03
+ -0.155576690925164998E-04  0.183343607413548014E-01 -0.166666666666667011E-01
+  0.190435224881784007E-01 -0.109812319463792004E-02 -0.166666666666667011E-01
+ -0.111910148431190000E-02  0.192002058717034990E-01 -0.166666666666667011E-01
+  0.191810336689013997E-01 -0.116112314020396998E-02 -0.117547931931091007E-02
+  0.194335300141191998E-01 -0.166666666666667011E-01 -0.435330233904635021E-03
+  0.194710205889397983E-01 -0.119426072742205005E-02 -0.434808333182332976E-03
+ -0.125626108720282010E-02  0.193894446405454002E-01 -0.140176366522793013E-03
+ -0.166666666666667011E-01  0.177492265794550012E-01 -0.496668406936042969E-07
+  0.000000000000000000E+00 -0.140412799232444004E-03 -0.517630044729356947E-06
+  0.178140137898683999E-01 -0.166666666666667011E-01 -0.435069391185710023E-03
+  0.193884418604889988E-01 -0.125613777143264004E-02 -0.435069392940028014E-03
+ -0.118121619509623009E-02  0.194633634760910008E-01 -0.166666666666667011E-01
+ -0.139670763696215990E-03  0.172964743991942983E-01 -0.239698478178091986E-07
+ -0.141639630426265001E-03 -0.121288412910185994E-05  0.178147057825197991E-01
+ -0.141639615804873989E-03  0.178147057811398994E-01 -0.121288279038434004E-05
+ -0.166666666666667011E-01 -0.139670761162715996E-03 -0.239698205914474004E-07
+  0.172964743991641003E-01 -0.166666666666667011E-01 -0.435069386965028005E-03
+  0.193887089999775983E-01 -0.125641102065624996E-02 -0.435069397160709978E-03
+ -0.118148348379683993E-02  0.194636367651009994E-01 -0.139670105523366001E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172964392990828005E-01
+ -0.239365198406219005E-07 -0.141635362805994999E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.121107464035148993E-05  0.178146126504961991E-01
+  0.000000000000000000E+00 -0.435064986808901988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193885562632203010E-01 -0.125634370176468005E-02
+ -0.435073797255201974E-03 -0.118148076820924009E-02  0.194636164222987997E-01
+ -0.139670495544523001E-03 -0.166666666666667011E-01  0.172964394121789995E-01
+ -0.239692490633551009E-07 -0.141638442382618007E-03 -0.121293358252935993E-05
+  0.178147058331974001E-01 -0.166666666666667011E-01 -0.435069391760568977E-03
+  0.193887087038398000E-01 -0.125641061773689995E-02 -0.435069392365169006E-03
+ -0.118148301966637995E-02  0.194636363154463005E-01 -0.139670535416434004E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172964393658350986E-01
+ -0.239411949136597000E-07 -0.141637844578005988E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.121130492256924999E-05  0.178146130952495999E-01
+  0.000000000000000000E+00 -0.435064996378640002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193885572616325011E-01 -0.125634453230068005E-02
+ -0.435073787685731975E-03 -0.118148144433873992E-02  0.194636171468661988E-01
+ -0.139670065580594011E-03 -0.166666666666667011E-01  0.172964393185634988E-01
+ -0.239646313460681002E-07 -0.141635961417651987E-03 -0.121270668854361004E-05
+  0.178147055904161987E-01 -0.434952598750324988E-03 -0.166666666666667011E-01
+  0.193531525591995983E-01 -0.122237998046724992E-02 -0.435186141504625024E-03
+ -0.114985437531919007E-02  0.194307215840397000E-01 -0.166666666666667011E-01
+  0.173112910791177986E-01 -0.253865277378717003E-04  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.138259363976047007E-03  0.178692219185750990E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.193634106903348008E-01
+ -0.124536794862644003E-02 -0.118108829657000993E-02  0.194595580860945985E-01
+ -0.166666666666667011E-01 -0.435070677392060000E-03  0.194631053165025998E-01
+ -0.118099345597813001E-02 -0.435068106728430011E-03 -0.125584229113298996E-02
+  0.193881574845413997E-01 -0.139730287993371005E-03 -0.166666666666667011E-01
+  0.172965129995373001E-01 -0.246134321847894014E-07 -0.141982715458390012E-03
+ -0.124443001739989997E-05  0.178147384368371013E-01 -0.141918001872325990E-03
+  0.178147321831834994E-01 -0.123839758886950008E-05 -0.166666666666667011E-01
+ -0.139719073147956993E-03 -0.244906574666029988E-07  0.172965128605244989E-01
+ -0.166666666666667011E-01 -0.434974886831416001E-03  0.193565238209871988E-01
+ -0.122539181987882991E-02 -0.435163868606227981E-03 -0.115249078341496999E-02
+  0.194335198475120013E-01 -0.166666666666667011E-01  0.173108110594775988E-01
+ -0.252486229488127016E-04  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.137635746082909991E-03  0.178684051236191015E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.140103968298785998E-03
+  0.000000000000000000E+00  0.173357792454707990E-01 -0.263227882027079996E-07
+ -0.143405882709686988E-03 -0.122767817853369000E-05  0.178147220760386006E-01
+ -0.166666666666667011E-01  0.193641798191389013E-01 -0.124570775063777002E-02
+ -0.118110177773261002E-02  0.194596602746646002E-01 -0.140117598274002998E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.173359378634117996E-01 -0.264605452289240990E-07 -0.143470065352009998E-03
+ -0.123349701123618009E-05  0.178147283869216008E-01 -0.139699946083262993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172964334581704009E-01
+ -0.947724450009741998E-08 -0.141286163109395011E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.427036318179888015E-06
+  0.177496010208582009E-01 -0.166666666666667011E-01 -0.435115619333969008E-03
+  0.188851999358167003E-01 -0.625665684806612967E-03 -0.435023156520779992E-03
+ -0.676915827627737983E-03  0.188324700803031986E-01 -0.166666666666667011E-01
+ -0.139445450795412993E-03  0.174062618030441998E-01 -0.190292129986386985E-07
+ -0.139963260485348001E-03 -0.765758079538392050E-06  0.178142630176348002E-01
+ -0.532898920971015019E-03  0.187657876890418990E-01 -0.512856477472066019E-03
+ -0.166666666666667011E-01 -0.274567299328894003E-03 -0.183874244209534995E-03
+  0.177330719708947010E-01 -0.166666666666667011E-01 -0.142934406904232013E-03
+  0.173279989505247985E-01 -0.109940048199017001E-06 -0.158611614219584011E-03
+ -0.505928267052957010E-05  0.178189142473271986E-01 -0.166666666666667011E-01
+  0.191837857182392997E-01 -0.116715933513682007E-02 -0.166666666666667011E-01
+ -0.115261551048108000E-02  0.193075009319374005E-01 -0.166666666666667011E-01
+  0.192715504217954997E-01 -0.120392392385177999E-02 -0.117898318247606007E-02
+  0.194477863404796010E-01 -0.166666666666667011E-01 -0.435541716493012012E-03
+  0.194766610463366005E-01 -0.120362759000766000E-02 -0.434596353391778010E-03
+ -0.125584159818315009E-02  0.193897157787361017E-01 -0.435053772892073011E-03
+ -0.166666666666667011E-01  0.193916866829711015E-01 -0.125980517520123993E-02
+  0.000000000000000000E+00 -0.435085010459558010E-03 -0.118504095738582995E-02
+  0.194671524144862003E-01 -0.166666666666667011E-01 -0.139670776269951012E-03
+  0.172964393783541989E-01 -0.239722661869433015E-07 -0.141640063065600003E-03
+ -0.121308200605217006E-05  0.178147059781510013E-01 -0.166666666666667011E-01
+ -0.435069330991406000E-03  0.193887306635154993E-01 -0.125643445841802995E-02
+ -0.435069453134320002E-03 -0.118150726982913000E-02  0.194636605867374995E-01
+ -0.140346027430110010E-03  0.178144315994148983E-01 -0.933532497936932975E-06
+ -0.166666666666667011E-01 -0.140345973839162006E-03 -0.933513688391224014E-06
+  0.178144303098709995E-01 -0.166666666666667011E-01 -0.139688634719759995E-03
+  0.172964363725470992E-01 -0.241646447299009009E-07 -0.141743168416124011E-03
+ -0.122254521804739008E-05  0.178147146809073016E-01 -0.435059870297027000E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.193910369733675987E-01
+ -0.125956508747497001E-02 -0.435078913540753987E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118493531740329009E-02  0.194660938610287000E-01
+  0.000000000000000000E+00 -0.435045710174179019E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193875426795061985E-01 -0.125569402526470994E-02
+ -0.435093072169510985E-03 -0.118112078544253991E-02  0.194631751705502017E-01
+ -0.139419879842034002E-03 -0.166666666666667011E-01  0.172964832727320016E-01
+ -0.213161584771942993E-07 -0.140230214929088989E-03 -0.108242332728665990E-05
+  0.178145871202937990E-01 -0.166666666666667011E-01 -0.435070648727867019E-03
+  0.194635173256733005E-01 -0.118139703309711993E-02 -0.435068135392855987E-03
+ -0.125625483745250995E-02  0.193885626932315015E-01 -0.139434374869669994E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172964444175748994E-01
+ -0.849107878174372025E-08 -0.140068522154330002E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.383811097196054990E-06  0.177500579653602011E-01
+  0.000000000000000000E+00 -0.140208513794569996E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.177497183126398006E-01 -0.504600381193170004E-07
+ -0.140451251998329987E-03 -0.520324482162036022E-06  0.178140160927331009E-01
+ -0.435048849909038004E-03 -0.166666666666667011E-01  0.193915158406327004E-01
+ -0.125973026995557995E-02 -0.435089932877583986E-03 -0.118503856970240008E-02
+  0.194671300235447001E-01 -0.435388199753574995E-03 -0.166666666666667011E-01
+  0.194580790950913997E-01 -0.118263497490539004E-02 -0.434750258397955996E-03
+ -0.124178531210186001E-02  0.193754724111637015E-01 -0.166666666666667011E-01
+  0.173081480358036992E-01 -0.244895773689186006E-04  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.133922698934494993E-03  0.178635891343980995E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.193849898137051016E-01
+ -0.125483132565947994E-02 -0.118151827368068998E-02  0.194632328928430985E-01
+ -0.166666666666667011E-01 -0.142602728611152993E-03  0.172960572372911001E-01
+ -0.117224517748718006E-06 -0.158587369045204001E-03 -0.567482454003337982E-05
+  0.178189579824735989E-01 -0.139720580884210006E-03 -0.166666666666667011E-01
+  0.177036794314483006E-01 -0.306504126162400988E-07 -0.139932852514187012E-03
+ -0.499375406901578031E-06  0.178139944442497998E-01 -0.546160778925738975E-03
+  0.184421024561669013E-01 -0.894165394996089942E-04 -0.166666666666667011E-01
+ -0.147442848397534993E-03 -0.240246210218283011E-04  0.177137736588957005E-01
+ -0.166666666666667011E-01 -0.400413710403848991E-03  0.186079235147509001E-01
+ -0.750380380436909983E-03 -0.465646679664707009E-03 -0.925379800643584029E-03
+  0.191586075383834986E-01 -0.166666666666667011E-01  0.177029162142693999E-01
+ -0.193968097316907988E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.245143979550067003E-03  0.179475102930650998E-01
+ -0.166666666666667011E-01 -0.140401212229714989E-03  0.173460971401947009E-01
+ -0.290712390925789007E-07 -0.144639789824670004E-03 -0.132223121416674009E-05
+  0.178148169771682983E-01 -0.166666666666667011E-01  0.193485878399233015E-01
+ -0.123880031430750989E-02 -0.118081271609363995E-02  0.194575794108941998E-01
+ -0.139755721134129007E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173348680988957005E-01 -0.228988835753830997E-07
+ -0.141707074016602990E-03 -0.107451126571115990E-05  0.178145682418863992E-01
+ -0.139419950330123000E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172964916757554002E-01 -0.212265241915647985E-07 -0.140190964958966001E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.107696768404692002E-05  0.178142558899385996E-01 -0.166666666666667011E-01
+ -0.143972832788044007E-03  0.173505501235237010E-01 -0.481361186863988992E-04
+ -0.558385355651620026E-03 -0.438374207210451001E-03  0.187218708288803999E-01
+ -0.166666666666667011E-01 -0.139941367226140996E-03  0.174078891209918998E-01
+ -0.228259192699078984E-07 -0.141815000445607008E-03 -0.909747619509022010E-06
+  0.178144035116544995E-01 -0.143597630564741997E-03  0.178145584149138995E-01
+ -0.106316648366329002E-05 -0.166666666666667011E-01 -0.140417726652687994E-03
+ -0.268131811843163990E-07  0.174078935255252004E-01 -0.166666666666667011E-01
+ -0.434071240888807990E-03  0.193423066886313985E-01 -0.122826337417718994E-02
+ -0.436064312888945010E-03 -0.116769060685687994E-02  0.194456917479709016E-01
+ -0.166666666666667011E-01  0.172982030071983997E-01 -0.280949763384092005E-07
+ -0.166666666666667011E-01 -0.189517762455052993E-05  0.178271977135736996E-01
+ -0.166666666666667011E-01  0.178265008932003005E-01 -0.134657556992853991E-05
+ -0.925862840600532052E-06  0.178144080134048990E-01 -0.166666666666667011E-01
+ -0.434776065018225015E-03  0.193791533646124996E-01 -0.125252599578476996E-02
+ -0.435362444422488025E-03 -0.118186375443277993E-02  0.194628395316310994E-01
+ -0.140181958380352012E-03 -0.166666666666667011E-01  0.177492492651218983E-01
+ -0.497367681715309972E-07  0.000000000000000000E+00 -0.140419741913999988E-03
+ -0.518097724412703002E-06  0.178140141000072000E-01 -0.166666666666667011E-01
+ -0.435069320521429986E-03  0.193884456522794985E-01 -0.125614319532225006E-02
+ -0.435069463604292005E-03 -0.118122256043727992E-02  0.194633694896899009E-01
+ -0.166666666666667011E-01 -0.139670889730207013E-03  0.172964735664216003E-01
+ -0.239712525939313013E-07 -0.141640366257597987E-03 -0.121295504750436001E-05
+  0.178147058424521984E-01 -0.141657676433903007E-03  0.178147071049579997E-01
+ -0.121453449198612004E-05 -0.166666666666667011E-01 -0.139673889655509000E-03
+ -0.240033947134963004E-07  0.172964735804191985E-01 -0.166666666666667011E-01
+ -0.435069155034934006E-03  0.193887186536986986E-01 -0.125642600972210999E-02
+ -0.435069629090626005E-03 -0.118150164198114007E-02  0.194636536348553983E-01
+ -0.139672279648728003E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964368900011996E-01 -0.239116221440321008E-07 -0.141646519897784998E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.120943312255959009E-05
+  0.178144561861929007E-01  0.000000000000000000E+00 -0.435057590298097978E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193887674296497996E-01
+ -0.125672688730438004E-02 -0.435081193385279013E-03 -0.118196873889853007E-02
+  0.194640540700679009E-01 -0.139679127767890996E-03 -0.166666666666667011E-01
+  0.172963729770981983E-01 -0.240508992733560004E-07 -0.141688504708434011E-03
+ -0.121696226254563010E-05  0.178146602550901985E-01 -0.166666666666667011E-01
+ -0.435063688039146997E-03  0.193888428280263002E-01 -0.125667755640774008E-02
+ -0.435075095983264025E-03 -0.118183170521553998E-02  0.194639441233023999E-01
+ -0.139679366822631002E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172963909933037986E-01 -0.238904374634853986E-07 -0.141684903032989010E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.120762219986995010E-05
+  0.178141341272457995E-01  0.000000000000000000E+00 -0.435044663859789993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193878878622800004E-01
+ -0.125607230703734000E-02 -0.435094118323148986E-03 -0.118150745952062992E-02
+  0.194635591497266010E-01 -0.139672170490270990E-03 -0.166666666666667011E-01
+  0.172964346444417008E-01 -0.239875039867713010E-07 -0.141648157868298996E-03
+ -0.121384147158235009E-05  0.178147065643662998E-01 -0.435090496096461013E-03
+ -0.166666666666667011E-01  0.194669889560541987E-01 -0.118529978595172006E-02
+ -0.435048286615349021E-03 -0.125913755896959003E-02  0.193915246813111015E-01
+ -0.166666666666667011E-01  0.172960984732630010E-01 -0.213609392236131997E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.106367313562377998E-05
+  0.178056546915008017E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.173098191165334987E-01 -0.233466429813017999E-07 -0.116342624074817007E-05
+  0.178149495945451007E-01 -0.166666666666667011E-01 -0.435066797805972990E-03
+  0.193883041569685999E-01 -0.125604896999190001E-02 -0.435071986298389026E-03
+ -0.118116640188501007E-02  0.194633028406514010E-01 -0.139659163853909999E-03
+ -0.166666666666667011E-01  0.172964806846983010E-01 -0.238454355521033004E-07
+ -0.141572633531096987E-03 -0.120675578133157002E-05  0.178147008816651985E-01
+ -0.141538304155846992E-03  0.178146978602927997E-01 -0.120362914934832999E-05
+ -0.166666666666667011E-01 -0.139653215055702006E-03 -0.237818478129986986E-07
+  0.172964806257772004E-01 -0.166666666666667011E-01 -0.435101676794290987E-03
+  0.194659166622981016E-01 -0.118451450992946000E-02 -0.435037104020240977E-03
+ -0.125773918484995990E-02  0.193902104354441995E-01 -0.166666666666667011E-01
+  0.172962783320088007E-01 -0.218342670137599985E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.109032449721754008E-05  0.178073744715902017E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.434677413065023984E-03  0.000000000000000000E+00  0.193010409686150998E-01
+ -0.117438144655306010E-02 -0.435460867469465978E-03 -0.110662888842583997E-02
+  0.193848162973076985E-01 -0.166666666666667011E-01  0.173066829055725999E-01
+ -0.235026937765211008E-07 -0.117850668861997998E-05  0.178149746842344997E-01
+ -0.434571088713672988E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.192763238297126993E-01 -0.115086473852541002E-02
+ -0.435566874650336990E-03 -0.108492248973003007E-02  0.193620926345084016E-01
+ -0.139708981740112000E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172963531511250990E-01 -0.951377049798812932E-08 -0.141328242724759995E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.428708426390729985E-06  0.177495866410289013E-01 -0.166666666666667011E-01
+ -0.435067939506500023E-03  0.193883890526193002E-01 -0.125611318806702994E-02
+ -0.435070844612536994E-03 -0.118121301451965996E-02  0.194633542459594008E-01
+ -0.166666666666667011E-01 -0.139664557957332988E-03  0.172964746611408010E-01
+ -0.239032606349812988E-07 -0.141603814090319997E-03 -0.120960968951782991E-05
+  0.178147026032577996E-01 -0.141639613175400988E-03  0.178147057784965000E-01
+ -0.121288106723124010E-05 -0.166666666666667011E-01 -0.139670761286573002E-03
+ -0.239698004783898005E-07  0.172964747236447995E-01 -0.166666666666667011E-01
+ -0.435069392365180986E-03  0.194637108092800999E-01 -0.118155587635168993E-02
+ -0.435069391760556996E-03 -0.125648510526804006E-02  0.193887815570613006E-01
+ -0.166666666666667011E-01  0.172964151549298006E-01 -0.947849091229853943E-08
+ -0.166666666666667011E-01 -0.427106302281655022E-06  0.177495992959328004E-01
+ -0.166666666666667011E-01  0.177492248112503005E-01 -0.496796927091242970E-07
+ -0.517779383578906051E-06  0.178140139540073003E-01 -0.166666666666667011E-01
+ -0.435081057490304974E-03  0.194615601858393016E-01 -0.117973336574162002E-02
+ -0.435057726202830975E-03 -0.125399377351724994E-02  0.193864040661351003E-01
+ -0.139666404223663996E-03 -0.166666666666667011E-01  0.173052378719510992E-01
+ -0.233928603649598996E-07  0.000000000000000000E+00 -0.141527802484294987E-03
+ -0.116498797383902993E-05  0.178146582832621990E-01 -0.166666666666667011E-01
+ -0.139669857630702005E-03  0.172964394671825013E-01 -0.239623924701885009E-07
+ -0.141634759721171003E-03 -0.121259630273193999E-05  0.178147054943494990E-01
+ -0.166666666666667011E-01 -0.140555314552680993E-03  0.178144675283670000E-01
+ -0.934892070952238007E-06 -0.140347413389892010E-03 -0.933745297868121043E-06
+  0.178144318010057985E-01 -0.435067722986518012E-03  0.193882964019922999E-01
+ -0.125598089226015993E-02 -0.166666666666667011E-01 -0.435071061130372014E-03
+ -0.118115048591624990E-02  0.194632553716142000E-01 -0.166666666666667011E-01
+ -0.435069777018943008E-03  0.194636048172950014E-01 -0.118146131478344993E-02
+ -0.435069007106324973E-03 -0.125636789473562000E-02  0.193886685693907997E-01
+ -0.139690885024778004E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964302280185014E-01 -0.950505359387174008E-08 -0.141247379876703009E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.428634134774421003E-06
+  0.177500186390967991E-01  0.000000000000000000E+00 -0.139677196785329999E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.177496423494717011E-01
+ -0.457508561380249031E-07 -0.139805680326738989E-03 -0.473830440639680986E-06
+  0.178139751639271003E-01 -0.435493815113026025E-03 -0.166666666666667011E-01
+  0.194038726929433997E-01 -0.113147962792324000E-02 -0.434644379614391009E-03
+ -0.118519815382812010E-02  0.193210650330794993E-01 -0.166666666666667011E-01
+ -0.139355161737402005E-03  0.172964917336673983E-01 -0.205843400324393006E-07
+ -0.139851518567708007E-03 -0.104623713006894006E-05  0.178145496587001004E-01
+ -0.435276969900176027E-03 -0.166666666666667011E-01  0.194123409556366984E-01
+ -0.114355309658945004E-02 -0.434861669984602990E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.119672311455852002E-02
+  0.193457917807220992E-01  0.000000000000000000E+00 -0.435510762531134978E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.194758550382455008E-01
+ -0.120231103100156000E-02 -0.434627398233908020E-03 -0.125592048119514991E-02
+  0.193896956616587991E-01 -0.139662076715157995E-03 -0.166666666666667011E-01
+  0.172964848493273003E-01 -0.238760385660033009E-07 -0.141589392729580995E-03
+ -0.120824848732212997E-05  0.178147013421607013E-01 -0.435069416060821000E-03
+ -0.166666666666667011E-01  0.194636372101129017E-01 -0.118148444615658998E-02
+ -0.435069368064916007E-03 -0.125641081782009996E-02  0.193887089888112006E-01
+ -0.166666666666667011E-01  0.172964393423152007E-01 -0.239438465741853011E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.121143525352024005E-05
+  0.178146131933784001E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964393844348002E-01 -0.239719133473768010E-07 -0.121306464651348996E-05
+  0.178147059621872987E-01 -0.166666666666667011E-01 -0.435085659385655980E-03
+  0.194638181335962009E-01 -0.118203438802173995E-02 -0.435053123899394999E-03
+ -0.125612026912269993E-02  0.193884809592554004E-01 -0.139670749044936993E-03
+ -0.166666666666667011E-01  0.172964759607952996E-01 -0.239695886158460013E-07
+ -0.141639529816893012E-03 -0.121286786314684000E-05  0.178147057662445991E-01
+ -0.141639434640671995E-03  0.178147057572842007E-01 -0.121285914937942990E-05
+ -0.166666666666667011E-01 -0.139670732553277014E-03 -0.239694113970498999E-07
+  0.172964759605990989E-01 -0.166666666666667011E-01 -0.435069447260804027E-03
+  0.194636380955877999E-01 -0.118148603206807003E-02 -0.435069336864923981E-03
+ -0.125641079713200007E-02  0.193887090761660011E-01 -0.166666666666667011E-01
+  0.172964393436225994E-01 -0.239438464742388014E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.121143524546324995E-05  0.178146131932993002E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.435064994893337995E-03  0.000000000000000000E+00  0.193885567753522003E-01
+ -0.125634406111458990E-02 -0.435073789170992999E-03 -0.118148100326300994E-02
+  0.194636166968780999E-01 -0.166666666666667011E-01  0.172964393844214012E-01
+ -0.239719133482090010E-07 -0.121306464658341994E-05  0.178147059621873993E-01
+ -0.435064994896760008E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193885567753732009E-01 -0.125634406106166999E-02
+ -0.435073789167570986E-03 -0.118148100316007990E-02  0.194636166967958983E-01
+ -0.160852226257028993E-03 -0.166666666666667011E-01  0.172967522287055010E-01
+ -0.175965589473195009E-07 -0.161650188610159987E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.203407230368285005E-06  0.173054266458900000E-01 -0.166666666666667011E-01
+ -0.435068952667249981E-03  0.193884313397972990E-01 -0.125613577038263011E-02
+ -0.435069831457874994E-03 -0.118122048876310006E-02  0.194633662905455002E-01
+ -0.166666666666667011E-01 -0.139672097032745988E-03  0.172964738095684016E-01
+ -0.239841856617656007E-07 -0.141647330681414008E-03 -0.121359018892023999E-05
+  0.178147064035918991E-01 -0.141624421727624012E-03  0.178147048779919992E-01
+ -0.121150370057904998E-05 -0.166666666666667011E-01 -0.139668126615946994E-03
+ -0.239417188530348007E-07  0.172964738011583997E-01 -0.166666666666667011E-01
+ -0.435074178605607004E-03  0.194493807353944004E-01 -0.116766964883922007E-02
+ -0.435064605447021003E-03 -0.124201443598458009E-02  0.193746725929095989E-01
+ -0.166666666666667011E-01  0.173081303330447009E-01 -0.244868682847540015E-04
+ -0.166666666666667011E-01 -0.133924491202099991E-03  0.178635877949816016E-01
+ -0.166666666666667011E-01  0.173348654763293003E-01 -0.223062642015643999E-07
+ -0.104752868644245000E-05  0.178145411980309004E-01 -0.166666666666667011E-01
+ -0.435066924736750998E-03  0.193888351972144012E-01 -0.125659460890097997E-02
+ -0.435071859369655009E-03 -0.118170026599907991E-02  0.194638400334165990E-01
+ -0.139784932051159995E-03 -0.166666666666667011E-01  0.177494904443947013E-01
+ -0.465951791816761021E-07  0.000000000000000000E+00 -0.139919920565161006E-03
+ -0.483545322941620994E-06  0.178139539234989999E-01 -0.166666666666667011E-01
+ -0.435069373706029993E-03  0.193884428434829004E-01 -0.125613915345877008E-02
+ -0.435069410419707014E-03 -0.118121780684916997E-02  0.194633650118269004E-01
+ -0.166666666666667011E-01 -0.139672737807363003E-03  0.172964741998577996E-01
+ -0.239910227158871999E-07 -0.141651023984394999E-03 -0.121392525480250999E-05
+  0.178147066608579999E-01 -0.141640091614955997E-03  0.178147058209647997E-01
+ -0.121292720982870991E-05 -0.166666666666667011E-01 -0.139670843240139006E-03
+ -0.239707145982510011E-07  0.172964741884388991E-01 -0.166666666666667011E-01
+ -0.435065762561208998E-03  0.193887058048313986E-01 -0.125648894894022007E-02
+ -0.435073021522692984E-03 -0.118161486512369002E-02  0.194637424061168997E-01
+ -0.139514021925975012E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964189492996988E-01 -0.222147158226326007E-07 -0.140733410708959988E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.112570816185458990E-05
+  0.178142652186864016E-01  0.000000000000000000E+00 -0.435053679420991009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193881873511392994E-01
+ -0.125619573968341011E-02 -0.435085103920473015E-03 -0.118149860448210999E-02
+  0.194635877202401011E-01 -0.139671537483522003E-03 -0.166666666666667011E-01
+  0.172964363686913987E-01 -0.239806171193915986E-07 -0.141644487121937989E-03
+ -0.121349920180918992E-05  0.178147063153519004E-01 -0.166666666666667011E-01
+ -0.435069239606207980E-03  0.193887146861679988E-01 -0.125642003417049007E-02
+ -0.435069544519456981E-03 -0.118149450125770001E-02  0.194636470456143011E-01
+ -0.139671620203623008E-03 -0.166666666666667011E-01  0.172964378382645008E-01
+ -0.239229803350932009E-07 -0.141643246303753000E-03  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.121015091751334009E-05  0.178145160915291999E-01  0.000000000000000000E+00
+ -0.435060404326353023E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193885964889095007E-01 -0.125648696022081000E-02 -0.435078379542822014E-03
+ -0.118169041399521007E-02  0.194637954413787000E-01 -0.139513918934682001E-03
+ -0.166666666666667011E-01  0.172964088839709998E-01 -0.223091853795638985E-07
+ -0.140734728439723009E-03 -0.113122244037598008E-05  0.178145985581429009E-01
+ -0.142980894195082010E-03 -0.166666666666667011E-01  0.173314539890347015E-01
+ -0.109866000013417994E-06 -0.158534109366701998E-03 -0.501444632546631029E-05
+  0.178188430632978990E-01 -0.166666666666667011E-01  0.191833678841652998E-01
+ -0.116695963416674989E-02  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.115255809689225998E-02  0.193073016724906003E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.172969314941213009E-01 -0.239425929167627016E-07
+ -0.121051376515920001E-05  0.178147034232611010E-01 -0.166666666666667011E-01
+ -0.435672877594180986E-03  0.190410730770498998E-01 -0.779817530267083964E-03
+ -0.434464569145039006E-03 -0.823623092840541026E-03  0.189719892294968988E-01
+ -0.147270553185889990E-03 -0.166666666666667011E-01  0.174669464297294991E-01
+ -0.100890525843168994E-03 -0.564704630029780048E-03 -0.477372644101759012E-03
+  0.187551220717942987E-01 -0.139439417619302005E-03  0.178140970993464001E-01
+ -0.575952369555323042E-06 -0.166666666666667011E-01 -0.139258097457669987E-03
+ -0.129863402849587998E-07  0.173691959079540006E-01 -0.166666666666667011E-01
+ -0.437167519508436999E-03  0.195228624474010001E-01 -0.126248680382140001E-02
+ -0.432956808905804989E-03 -0.125491318727090004E-02  0.193932788356039991E-01
+ -0.166666666666667011E-01  0.172966662193905010E-01 -0.121413434668133993E-06
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.972291627722414967E-05
+  0.179398884478061994E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.350105795458797975E-03  0.000000000000000000E+00
+  0.179358412564393013E-01 -0.564926743849099038E-05 -0.144639853703507987E-03
+ -0.314581126948723974E-06  0.178138319445937006E-01 -0.166666666666667011E-01
+  0.193703856826768009E-01 -0.124844632332796993E-02 -0.118120734934456010E-02
+  0.194604805418001001E-01 -0.431301130210841007E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.192675040489116989E-01
+ -0.119991064076258990E-02 -0.438790368590655981E-03 -0.117513383015891001E-02
+  0.194438480812311985E-01 -0.139700882958903997E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172963963463645017E-01 -0.952678627103132059E-08
+ -0.141292751882909993E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.429511932811560008E-06  0.177498907292438987E-01
+ -0.166666666666667011E-01 -0.435115740933037984E-03  0.188852022804432002E-01
+ -0.625668905066499950E-03 -0.435023034878147990E-03 -0.676916698779699965E-03
+  0.188324708841117010E-01 -0.166666666666667011E-01 -0.139446455078796010E-03
+  0.174062615201245989E-01 -0.190374494518299010E-07 -0.139967041096937007E-03
+ -0.766079367871497027E-06  0.178142633180293011E-01 -0.532899100111402005E-03
+  0.187657876777795017E-01 -0.512856379296787999E-03 -0.166666666666667011E-01
+ -0.274566803915548998E-03 -0.183873918065665000E-03  0.177330707817049994E-01
+ -0.166666666666667011E-01 -0.142934407180007997E-03  0.173279989208002007E-01
+ -0.109940092441766996E-06 -0.158611617255726990E-03 -0.505928492620163000E-05
+  0.178189142493529011E-01 -0.166666666666667011E-01  0.191837856777059002E-01
+ -0.116715931604291999E-02 -0.166666666666667011E-01 -0.115261550320967993E-02
+  0.193075009050143985E-01 -0.166666666666667011E-01  0.192715504007066017E-01
+ -0.120392391412785997E-02 -0.117898318183162006E-02  0.194477863374666986E-01
+ -0.166666666666667011E-01 -0.435539737045843010E-03  0.194769265765691001E-01
+ -0.120385500091470992E-02 -0.434598338907063995E-03 -0.125615984296152997E-02
+  0.193900185880402989E-01 -0.435045514645459013E-03 -0.166666666666667011E-01
+  0.193914099137533005E-01 -0.125968948646175004E-02  0.000000000000000000E+00
+ -0.435093267670908987E-03 -0.118504666533010995E-02  0.194671248940259008E-01
+ -0.166666666666667011E-01 -0.139670791769998004E-03  0.172964393766727002E-01
+ -0.239724329252018989E-07 -0.141640152553009998E-03 -0.121309020944367992E-05
+  0.178147059866567009E-01 -0.166666666666667011E-01 -0.435069301629178995E-03
+  0.193887307626826010E-01 -0.125643515612083000E-02 -0.435069482496533021E-03
+ -0.118150837470912999E-02  0.194636615934981015E-01 -0.140348157921430997E-03
+  0.178144318702273992E-01 -0.933811502946043969E-06 -0.166666666666667011E-01
+ -0.140348150101401000E-03 -0.933783910990536035E-06  0.178144299643035985E-01
+ -0.166666666666667011E-01 -0.139689488845466003E-03  0.172964362338874009E-01
+ -0.241738769572596000E-07 -0.141748099793582007E-03 -0.122299932206489994E-05
+  0.178147151092019999E-01 -0.435059916514922017E-03 -0.166666666666667011E-01
+  0.193909887083489017E-01 -0.125955268129309002E-02  0.000000000000000000E+00
+ -0.435078867325628023E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118493749346423004E-02  0.194660289027400006E-01
+  0.000000000000000000E+00 -0.435044083341360984E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193878554391240000E-01 -0.125604934766983993E-02
+ -0.435094698749261975E-03 -0.118149272049225995E-02  0.194635444483519000E-01
+ -0.139670978031983007E-03 -0.166666666666667011E-01  0.172964365263250999E-01
+ -0.239746177115109000E-07 -0.141641256924305989E-03 -0.121320404495395004E-05
+  0.178147060925782016E-01 -0.166666666666667011E-01 -0.435069350149236000E-03
+  0.193887875206789004E-01 -0.125649208526834003E-02 -0.435069433976496020E-03
+ -0.118156330259116009E-02  0.194637181802562002E-01 -0.139700455923195987E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172964141750712001E-01
+ -0.947598200812492946E-08 -0.141288502520823991E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.426976078434958026E-06  0.177495760715989000E-01
+  0.000000000000000000E+00 -0.140209281941629005E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.177492016410450010E-01 -0.499125605911364029E-07
+ -0.140454288917710998E-03 -0.520385719864523011E-06  0.178140161378314010E-01
+ -0.435047474599787026E-03 -0.166666666666667011E-01  0.193914777822561009E-01
+ -0.125971925119921992E-02 -0.435091308001488001E-03 -0.118504760094160990E-02
+  0.194671336437930997E-01 -0.435388198571750988E-03 -0.166666666666667011E-01
+  0.194580790627339011E-01 -0.118263492130232990E-02 -0.434750259582202979E-03
+ -0.124178531317212996E-02  0.193754724084427010E-01 -0.166666666666667011E-01
+  0.173081480357068010E-01 -0.244895773575376999E-04  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.133922698986129010E-03  0.178635891344391985E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.193849898136359000E-01
+ -0.125483132562911989E-02 -0.118151827367967994E-02  0.194632328928340016E-01
+ -0.166666666666667011E-01 -0.142602747236407002E-03  0.172960559757211994E-01
+ -0.117227408613948994E-06 -0.158587582136584995E-03 -0.567497435835260990E-05
+  0.178189581300063002E-01 -0.139720580991034007E-03 -0.166666666666667011E-01
+  0.177036782302840007E-01 -0.306502532037200970E-07 -0.139932855252091994E-03
+ -0.499376182467009957E-06  0.178139944449866999E-01 -0.546160785305606963E-03
+  0.184421025597138015E-01 -0.894167397427545998E-04 -0.166666666666667011E-01
+ -0.147442821218409995E-03 -0.240244287551271008E-04  0.177137723141376005E-01
+ -0.166666666666667011E-01 -0.400413696341282016E-03  0.186079233649920003E-01
+ -0.750380296538879967E-03 -0.465646690569393017E-03 -0.925379786421282957E-03
+  0.191586075172780999E-01 -0.166666666666667011E-01  0.177029162002804996E-01
+ -0.193968089936665005E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.245143973844672025E-03  0.179475102895724006E-01
+ -0.166666666666667011E-01 -0.140401212232880995E-03  0.173460971398785996E-01
+ -0.290712391494978008E-07 -0.144639789846284008E-03 -0.132223121759111006E-05
+  0.178148169771744011E-01 -0.166666666666667011E-01  0.193485878396606990E-01
+ -0.123880031419069990E-02 -0.118081271608876993E-02  0.194575794108615002E-01
+ -0.139755721135340007E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173348680988911000E-01 -0.228988835869186992E-07
+ -0.141707074022604998E-03 -0.107451126624275990E-05  0.178145682418867010E-01
+ -0.139671204369828987E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172964527346999991E-01 -0.238206247996137016E-07 -0.141637869293981998E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.120424066353106008E-05  0.178141993541914984E-01 -0.166666666666667011E-01
+ -0.435293328042460988E-03  0.188969200156620006E-01 -0.638325692737096986E-03
+ -0.434845262794537977E-03 -0.686401912695584044E-03  0.188415679710590006E-01
+ -0.166666666666667011E-01 -0.139450260715297009E-03  0.174035477820680996E-01
+ -0.190790185253572011E-07 -0.139988379398403009E-03 -0.772546665100815971E-06
+  0.178142695812380999E-01 -0.535400152422887033E-03  0.187643522383528008E-01
+ -0.509589395081856949E-03 -0.166666666666667011E-01 -0.267380606913843026E-03
+ -0.178487068761980011E-03  0.177166953062542001E-01 -0.166666666666667011E-01
+ -0.142812108775019998E-03  0.173285847433682014E-01 -0.101845505783431005E-06
+ -0.158027855199464992E-03 -0.469583501144497018E-05  0.178185786022877005E-01
+ -0.166666666666667011E-01  0.191928643172403005E-01 -0.117144676995123995E-02
+ -0.166666666666667011E-01 -0.115407859919587998E-02  0.193128636528600017E-01
+ -0.166666666666667011E-01  0.192743847992538006E-01 -0.120526566976207009E-02
+ -0.117896448649838000E-02  0.194473150974792015E-01 -0.166666666666667011E-01
+ -0.435531752463573995E-03  0.194769704025496010E-01 -0.120375472484332991E-02
+ -0.434606347505284997E-03 -0.125643108232954001E-02  0.193902401795961014E-01
+ -0.435044468242076017E-03 -0.166666666666667011E-01  0.193871801282229986E-01
+ -0.125535484215398997E-02  0.000000000000000000E+00 -0.435094313909530008E-03
+ -0.118080938637835000E-02  0.194628399719979014E-01 -0.166666666666667011E-01
+ -0.139670871572238991E-03  0.172964393487799011E-01 -0.239732892684089006E-07
+ -0.141640613379916011E-03 -0.121313235466317992E-05  0.178147060195563006E-01
+ -0.166666666666667011E-01 -0.435069156529857982E-03  0.193887053622597005E-01
+ -0.125641208621197001E-02 -0.435069627595703981E-03 -0.118148788386283009E-02
+  0.194636401109469989E-01 -0.140357102716023998E-03  0.178144327480211015E-01
+ -0.934953983487829978E-06 -0.166666666666667011E-01 -0.140357082074367998E-03
+ -0.934881658679828005E-06  0.178144277579760993E-01 -0.166666666666667011E-01
+ -0.139671248276079991E-03  0.172964387342726990E-01 -0.239772668643883999E-07
+ -0.141642790859008988E-03 -0.121332835924946005E-05  0.178147058633731994E-01
+ -0.435073048109683026E-03 -0.166666666666667011E-01  0.194629028419433005E-01
+ -0.118081642190051008E-02 -0.435065735973592016E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.125566691830197993E-02
+  0.193878887757382989E-01  0.000000000000000000E+00 -0.435064995098833989E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193890263762267999E-01
+ -0.125684228342590990E-02 -0.435073788965510016E-03 -0.118197542174123009E-02
+  0.194640901826458011E-01 -0.139679109099943003E-03 -0.166666666666667011E-01
+  0.172963725697189014E-01 -0.240507205201775987E-07 -0.141688401087265996E-03
+ -0.121695435575037005E-05  0.178146602328799995E-01 -0.166666666666667011E-01
+ -0.435064159301769998E-03  0.193889744409617010E-01 -0.125680005533066992E-02
+ -0.435074624737013995E-03 -0.118194373351253996E-02  0.194640651337368992E-01
+ -0.139708967713254010E-03 -0.166666666666667011E-01  0.172963616356281991E-01
+ -0.951324367715890009E-08 -0.141328092009716008E-03  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.428676150398848976E-06  0.177495851017135005E-01  0.000000000000000000E+00
+ -0.140184006518702988E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.177492476417109996E-01 -0.497520252347835031E-07 -0.140422322794949995E-03
+ -0.518268041627564046E-06  0.178140140868086015E-01 -0.435065432183330027E-03
+ -0.166666666666667011E-01  0.193879055107330014E-01 -0.125567283667538999E-02
+ -0.435073351892596022E-03 -0.118081969348847004E-02  0.194629358214771994E-01
+ -0.371043726257224002E-03 -0.166666666666667011E-01  0.180438943525561002E-01
+ -0.288534096345938022E-03 -0.486290379332660018E-03 -0.560900084076317983E-03
+  0.187957069582666003E-01 -0.166666666666667011E-01  0.177374439328937990E-01
+ -0.214133585214810996E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.267021959071755976E-03  0.179665664489489993E-01
+ -0.166666666666667011E-01  0.193465909995582004E-01 -0.123791262449076994E-02
+ -0.118077342878954009E-02  0.194573127654812002E-01 -0.166666666666667011E-01
+ -0.139258097457669987E-03  0.172965161025564994E-01 -0.139408267359583998E-07
+ -0.139258097457669987E-03 -0.716280208726827027E-06  0.178142098987628013E-01
+ -0.448745061591116986E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.187014625769068993E-01 -0.470417770715891002E-03 -0.420615920070051990E-03
+ -0.416030208289801002E-03  0.185800388700857987E-01 -0.143647586674343006E-03
+  0.178142597734856009E-01 -0.784926914613626987E-06 -0.166666666666667011E-01
+ -0.289422112454956976E-03 -0.392634159972045978E-05  0.178691591832313007E-01
+ -0.166666666666667011E-01 -0.433838061791930977E-03  0.193348473865251995E-01
+ -0.122492954566329995E-02 -0.436295790691586000E-03 -0.116751653970171990E-02
+  0.194447289756928994E-01 -0.166666666666667011E-01  0.173080151777424002E-01
+ -0.244742517923652000E-04  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.133996459548231008E-03  0.178636488295263011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.139757971222783007E-03
+  0.000000000000000000E+00  0.173348621827651983E-01 -0.229201530844114011E-07
+ -0.141718205670910005E-03 -0.107549290769848001E-05  0.178145692111379012E-01
+ -0.166666666666667011E-01  0.193848567620656005E-01 -0.125477293295166007E-02
+ -0.118151635482893996E-02  0.194632155619245996E-01 -0.140442473137899012E-03
+ -0.166666666666667011E-01  0.173484955870056995E-01 -0.294133609017818009E-07
+ -0.144782161335538988E-03 -0.133037270602234991E-05  0.178148273619183999E-01
+ -0.139679362257274988E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172964143215187001E-01 -0.238876115774515004E-07 -0.141684595140730989E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.120742002760442999E-05  0.178141299090062988E-01 -0.166666666666667011E-01
+ -0.139670790012750987E-03  0.172964393796226010E-01 -0.239724130311146011E-07
+ -0.141640142355812013E-03 -0.121308921736943007E-05  0.178147059830380990E-01
+ -0.166666666666667011E-01 -0.140356055867897991E-03  0.178144308211048014E-01
+ -0.934793117616238968E-06 -0.140356063365803011E-03 -0.934819410434324011E-06
+  0.178144326353903010E-01 -0.435069477705365992E-03  0.194636362328108985E-01
+ -0.118148328063351999E-02 -0.166666666666667011E-01 -0.435069306420349006E-03
+ -0.125640963064629994E-02  0.193887060480272995E-01 -0.166666666666667011E-01
+ -0.139670746102450000E-03  0.172964393573767014E-01 -0.239719438513679986E-07
+ -0.141639889164474000E-03 -0.121306620727653000E-05  0.178147059636829010E-01
+ -0.166666666666667011E-01  0.193887976067478006E-01 -0.125650162570866992E-02
+ -0.166666666666667011E-01 -0.118157207781601002E-02  0.194637272757679988E-01
+ -0.166666666666667011E-01  0.193887977046092001E-01 -0.125650166419828007E-02
+ -0.118157208862118990E-02  0.194637273843102990E-01 -0.166666666666667011E-01
+ -0.435064765067235998E-03  0.193889522097447989E-01 -0.125676448751559002E-02
+ -0.435074018990514973E-03 -0.118190001722009996E-02  0.194640239141652995E-01
+ -0.140182361260136993E-03 -0.166666666666667011E-01  0.177493225039018002E-01
+ -0.498169409981402992E-07  0.000000000000000000E+00 -0.140419962900806993E-03
+ -0.518131012618996021E-06  0.178140141298323007E-01 -0.166666666666667011E-01
+ -0.435069323598363008E-03  0.193884453764502987E-01 -0.125614284670238011E-02
+ -0.435069460527360988E-03 -0.118122217360852990E-02  0.194633691143310007E-01
+ -0.166666666666667011E-01 -0.139670884426293997E-03  0.172964736171049016E-01
+ -0.239711924341550007E-07 -0.141640335129922002E-03 -0.121295197632909002E-05
+  0.178147058397751003E-01 -0.141656453358155989E-03  0.178147070060390988E-01
+ -0.121442246005117002E-05 -0.166666666666667011E-01 -0.139673677795910993E-03
+ -0.240011176184461992E-07  0.172964736295889016E-01 -0.166666666666667011E-01
+ -0.435069164848058006E-03  0.193887179318738001E-01 -0.125642505656990006E-02
+ -0.435069619277515991E-03 -0.118150056235400997E-02  0.194636526000948017E-01
+ -0.139672366859797993E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964370349440004E-01 -0.239153496141802001E-07 -0.141647103173812004E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.120964050265571001E-05
+  0.178144653667999010E-01  0.000000000000000000E+00 -0.435058020368536008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193887196651586005E-01
+ -0.125666847383042006E-02 -0.435080763346489007E-03 -0.118190508477706993E-02
+  0.194639922621226008E-01 -0.139641248248434992E-03 -0.166666666666667011E-01
+  0.172963810772421012E-01 -0.236457807710709016E-07 -0.141469789611806004E-03
+ -0.119702907967459002E-05  0.178146434627204017E-01 -0.166666666666667011E-01
+ -0.435064079024843019E-03  0.193888050589719994E-01 -0.125663030165246007E-02
+ -0.435074705011247002E-03 -0.118177958688183999E-02  0.194638936988976996E-01
+ -0.139641455576729011E-03 -0.166666666666667011E-01  0.172963976004947007E-01
+ -0.234988035352734987E-07 -0.141466765709830994E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.118846831550546996E-05
+  0.178141535165701990E-01  0.000000000000000000E+00 -0.435046365527533990E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193879446262342006E-01
+ -0.125609596684336995E-02 -0.435092416913644979E-03 -0.118150619229764994E-02
+  0.194635647529617997E-01 -0.139672261692974013E-03 -0.166666666666667011E-01
+  0.172964349185803015E-01 -0.239884652187998984E-07 -0.141648681442598987E-03
+ -0.121388810715753001E-05  0.178147066071378986E-01 -0.435069380083255997E-03
+ -0.166666666666667011E-01  0.193887114872825991E-01 -0.125641369962899998E-02
+ -0.435069404042481985E-03 -0.118148620112529007E-02  0.194636395221940012E-01
+ -0.166666666666667011E-01  0.172965077531164017E-01 -0.927568178720202994E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.102046202213369007E-05
+  0.173051339160117008E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.193887977533772002E-01 -0.125650168557878009E-02 -0.118157208930007003E-02
+  0.194637273906578985E-01 -0.166666666666667011E-01 -0.435066628630517986E-03
+  0.193883490811541000E-01 -0.125609889832018992E-02 -0.435072155470964984E-03
+ -0.118121788505638997E-02  0.194633537877374994E-01 -0.139670767304303994E-03
+ -0.166666666666667011E-01  0.172964740023511983E-01 -0.239699122889327989E-07
+ -0.141639655324462003E-03 -0.121288819251166995E-05  0.178147057860663996E-01
+ -0.141639719254381006E-03  0.178147057910134007E-01 -0.121289402770353995E-05
+ -0.166666666666667011E-01 -0.139670778383217008E-03 -0.239700310192222999E-07
+  0.172964740024197997E-01 -0.166666666666667011E-01 -0.435069359721298977E-03
+  0.193887107978404985E-01 -0.125641340675414995E-02 -0.435069424404436024E-03
+ -0.118148620716793996E-02  0.194636394455750990E-01 -0.166666666666667011E-01
+  0.172965077510262993E-01 -0.927568056586548005E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.102046204410674010E-05  0.173051339160213008E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.139674680780886991E-03  0.000000000000000000E+00  0.173041376249668984E-01
+ -0.235410521181961013E-07 -0.141584640975313996E-03 -0.117464395864609995E-05
+  0.178146677207322995E-01 -0.166666666666667011E-01  0.193887977504637009E-01
+ -0.125650168430156001E-02 -0.118157208925927996E-02  0.194637273902763010E-01
+ -0.139674680734473000E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173041376249788992E-01 -0.235410516367698016E-07
+ -0.141584640716005010E-03 -0.117464393529904008E-05  0.178146677207122010E-01
+ -0.139669171022264988E-03 -0.166666666666667011E-01  0.172963674074106989E-01
+ -0.936361616323369989E-08 -0.141145821435217987E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.422126237945786021E-06  0.177496556211017009E-01 -0.166666666666667011E-01
+ -0.139670831173391990E-03  0.172964378151377017E-01 -0.239729563898503003E-07
+ -0.141640396000908991E-03 -0.121311945483194002E-05  0.178147060102482983E-01
+ -0.166666666666667011E-01 -0.140348082156047988E-03  0.178144284125189012E-01
+ -0.933754616232120984E-06 -0.140348096313297011E-03 -0.933804570413684980E-06
+  0.178144318631820002E-01 -0.435069554992271979E-03  0.194636358566133011E-01
+ -0.118148321892500995E-02 -0.166666666666667011E-01 -0.435069229133381978E-03
+ -0.125640843271979000E-02  0.193887033474266987E-01 -0.166666666666667011E-01
+ -0.139670746039409987E-03  0.172964393604615012E-01 -0.239719429806933989E-07
+ -0.141639888769111005E-03 -0.121306615757996010E-05  0.178147059636586010E-01
+ -0.166666666666667011E-01  0.193887976105126016E-01 -0.125650162733728990E-02
+ -0.166666666666667011E-01 -0.118157207791531990E-02  0.194637272767340004E-01
+ -0.166666666666667011E-01  0.193887977049908011E-01 -0.125650166436534001E-02
+ -0.118157208862703007E-02  0.194637273843653001E-01 -0.166666666666667011E-01
+ -0.435069387915653001E-03  0.193887117581042004E-01 -0.125641381798179000E-02
+ -0.435069396210086012E-03 -0.118148620435052005E-02  0.194636395574343014E-01
+ -0.139674686777477991E-03 -0.166666666666667011E-01  0.173041492300615996E-01
+ -0.235404450870160989E-07  0.000000000000000000E+00 -0.141584559821053004E-03
+ -0.117458831712666003E-05  0.178146676653543994E-01 -0.166666666666667011E-01
+ -0.139670743890679005E-03  0.172964393844352998E-01 -0.239719183429222001E-07
+ -0.141639876120318997E-03 -0.121306489215045994E-05  0.178147059624318010E-01
+ -0.166666666666667011E-01 -0.140348087305803990E-03  0.178144318417805016E-01
+ -0.933801043435799969E-06 -0.140348087381334013E-03 -0.933801309950123998E-06
+  0.178144318601900983E-01 -0.435069392938043002E-03  0.194636364111164008E-01
+ -0.118148311443157998E-02 -0.166666666666667011E-01 -0.435069391187694980E-03
+ -0.125641070647420998E-02  0.193887087798863995E-01 -0.166666666666667011E-01
+ -0.435069391191658010E-03  0.193887815134855014E-01 -0.125648507888060008E-02
+ -0.435069392934080027E-03 -0.118155584789411011E-02  0.194637107834846994E-01
+ -0.139700198059465998E-03 -0.166666666666667011E-01  0.172964151566727987E-01
+ -0.947849280924833965E-08 -0.141287467767569990E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.427106383838012019E-06
+  0.177495992952378008E-01  0.000000000000000000E+00 -0.140178145950843993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.177492248104791014E-01
+ -0.496797052902892032E-07 -0.140415043811154997E-03 -0.517779519367424970E-06
+  0.178140139541318986E-01 -0.435069390908532979E-03 -0.166666666666667011E-01
+  0.193887088082677003E-01 -0.125641074170613001E-02 -0.435069393217205004E-03
+ -0.118148315311942002E-02  0.194636364485415991E-01 -0.166666666666667011E-01
+ -0.139670745499349992E-03  0.172964393844174009E-01 -0.239719356219104995E-07
+ -0.141639885405697995E-03 -0.121306574176735002E-05  0.178147059632627995E-01
+ -0.435069390223737009E-03 -0.166666666666667011E-01  0.193887087097285998E-01
+ -0.125641070240488004E-02 -0.435069393902000974E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.118148314340501995E-02
+  0.194636363510784990E-01  0.000000000000000000E+00 -0.435069390007495975E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193887087390940994E-01
+ -0.125641068860798000E-02 -0.435069394118242007E-03 -0.118148311389442004E-02
+  0.194636364058402012E-01 -0.139670743908197002E-03 -0.166666666666667011E-01
+  0.172964393843969000E-01 -0.239719185336623003E-07 -0.141639876221824010E-03
+ -0.121306490160525007E-05  0.178147059624407002E-01 -0.435069391188885001E-03
+ -0.166666666666667011E-01  0.193887118630617986E-01 -0.125641385909668990E-02
+ -0.435069392936854012E-03 -0.118148619756053004E-02  0.194636395637321005E-01
+ -0.166666666666667011E-01  0.172965077542917012E-01 -0.927568247378030971E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.102046200973725997E-05
+  0.173051339160060005E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.193887977549708004E-01 -0.125650168627737991E-02 -0.118157208932236990E-02
+  0.194637273908664990E-01 -0.166666666666667011E-01 -0.435069391154679994E-03
+  0.193884418577966004E-01 -0.125613776930019998E-02 -0.435069392971057988E-03
+ -0.118121619345346995E-02  0.194633634742979005E-01 -0.139670761454477999E-03
+ -0.166666666666667011E-01  0.172964743993829009E-01 -0.239698237237728008E-07
+ -0.141639617486802999E-03 -0.121288294400935000E-05  0.178147057813331997E-01
+ -0.141639617487035994E-03  0.178147057813331997E-01 -0.121288294403079010E-05
+ -0.166666666666667011E-01 -0.139670761454519009E-03 -0.239698237241789995E-07
+  0.172964743993829009E-01 -0.166666666666667011E-01 -0.435069391188809974E-03
+  0.193887118630593006E-01 -0.125641385909560006E-02 -0.435069392936928985E-03
+ -0.118148619756054001E-02  0.194636395637317987E-01 -0.166666666666667011E-01
+  0.172965077542917012E-01 -0.927568247377395960E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.102046200973731990E-05  0.173051339160060005E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.139674680709099010E-03  0.000000000000000000E+00  0.173041376249854009E-01
+ -0.235410513736555998E-07 -0.141584640574245006E-03 -0.117464392253488990E-05
+  0.178146677207011994E-01 -0.166666666666667011E-01  0.193887977549708004E-01
+ -0.125650168627737991E-02 -0.118157208932236990E-02  0.194637273908664990E-01
+ -0.139674680709099010E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173041376249854009E-01 -0.235410513736546998E-07
+ -0.141584640574244004E-03 -0.117464392253485009E-05  0.178146677207011994E-01
+ -0.161601720479338990E-03 -0.166666666666667011E-01  0.172965076665808001E-01
+ -0.926702349005646988E-07 -0.162351552716187005E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.102033131897240007E-05  0.173051453909592000E-01 -0.166666666666667011E-01
+ -0.435069503154383993E-03  0.194633707007618000E-01 -0.118122593631263999E-02
+ -0.435069280971315012E-03 -0.125614175474171000E-02  0.193884463223708003E-01
+ -0.166666666666667011E-01 -0.139672289609427992E-03  0.172964739021653005E-01
+ -0.239862388766482005E-07 -0.141648440824267006E-03 -0.121369084068463994E-05
+  0.178147064711290000E-01 -0.141624407193420988E-03  0.178147048759245002E-01
+ -0.121150194851005999E-05 -0.166666666666667011E-01 -0.139668124261626994E-03
+ -0.239416874711979002E-07  0.172964738935757999E-01 -0.166666666666667011E-01
+ -0.435074183386577979E-03  0.194493808683343997E-01 -0.116766988693369996E-02
+ -0.435064600665905016E-03 -0.124201443215284004E-02  0.193746726050244011E-01
+ -0.166666666666667011E-01  0.173081303333391008E-01 -0.244868683301465988E-04
+ -0.166666666666667011E-01 -0.133924491176626011E-03  0.178635877950088992E-01
+ -0.166666666666667011E-01  0.173348654763634015E-01 -0.223062642005740000E-07
+ -0.104752868632071993E-05  0.178145411980307998E-01 -0.166666666666667011E-01
+ -0.435066580651087987E-03  0.193886968271455989E-01 -0.125646132097252000E-02
+ -0.435072203449548981E-03 -0.118157534474708009E-02  0.194637086880425011E-01
+ -0.139516419388416990E-03 -0.166666666666667011E-01  0.173041769139335999E-01
+ -0.219137423564159007E-07  0.000000000000000000E+00 -0.140700386591277008E-03
+ -0.109554334382343009E-05  0.178145628168012993E-01 -0.166666666666667011E-01
+ -0.139670785511788992E-03  0.172964393739205996E-01 -0.239723655938119001E-07
+ -0.141640116450570004E-03 -0.121308690248200008E-05  0.178147059824802015E-01
+ -0.166666666666667011E-01 -0.140353492284304998E-03  0.178144307839981988E-01
+ -0.934469196153570009E-06 -0.140353499090780005E-03 -0.934493112001855014E-06
+  0.178144324348224990E-01 -0.435069470216686990E-03  0.194636389371891996E-01
+ -0.118148595888806001E-02 -0.166666666666667011E-01 -0.435069313909032019E-03
+ -0.125641246449570009E-02  0.193887089321175994E-01 -0.166666666666667011E-01
+ -0.435066224778556999E-03  0.193888655360086003E-01 -0.125664138810585991E-02
+ -0.435072559315323980E-03 -0.118175670116711000E-02  0.194638920667745997E-01
+ -0.139535529617076010E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172963867645380999E-01 -0.887106097806382040E-08 -0.140533144432323988E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.400546896298884985E-06
+  0.177498997117815013E-01  0.000000000000000000E+00 -0.140183011740981989E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.177495724693359017E-01
+ -0.500888730287148026E-07 -0.140419805775519006E-03 -0.518188128586851950E-06
+  0.178140142381220008E-01 -0.435066535168476983E-03 -0.166666666666667011E-01
+  0.193886214195925013E-01 -0.125637957080158998E-02 -0.435072248931340003E-03
+ -0.118149384028357995E-02  0.194636350622922998E-01 -0.166666666666667011E-01
+ -0.139671934380400999E-03  0.172964390135027998E-01 -0.239846931501482010E-07
+ -0.141646750301238991E-03 -0.121369351008339006E-05  0.178147064557130988E-01
+ -0.435070730673722014E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.194635424129478984E-01 -0.118148467355335004E-02 -0.435068053446324991E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.125637211589557001E-02
+  0.193885950416877000E-01  0.000000000000000000E+00 -0.435065887914215998E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193887904637096015E-01
+ -0.125657470314374992E-02 -0.435072896172526973E-03 -0.118169761602073009E-02
+  0.194638244685289008E-01 -0.139514756609392007E-03 -0.166666666666667011E-01
+  0.172964082847710011E-01 -0.223179439037086001E-07 -0.140739566181214998E-03
+ -0.113165464823919991E-05  0.178145987602886004E-01 -0.142980894581813998E-03
+ -0.166666666666667011E-01  0.173314539689373992E-01 -0.109866042397868997E-06
+ -0.158534112254571006E-03 -0.501444838217852000E-05  0.178188430651418997E-01
+ -0.166666666666667011E-01  0.191833678439739010E-01 -0.116695961504079994E-02
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.115255809083636006E-02
+  0.193073016509602985E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172969314939965015E-01 -0.239425929242464991E-07 -0.121051376580430009E-05
+  0.178147034232617012E-01 -0.166666666666667011E-01 -0.435670020554572023E-03
+  0.190410313364221991E-01 -0.779749213569466953E-03 -0.434467438812183976E-03
+ -0.823627788704149036E-03  0.189719905685246000E-01 -0.147270534866566013E-03
+ -0.166666666666667011E-01  0.174669457049126987E-01 -0.100890465185484999E-03
+ -0.564704633717408051E-03 -0.477373059999760001E-03  0.187551223307545999E-01
+ -0.139439401685823998E-03  0.178140971035927985E-01 -0.575958645398553022E-06
+ -0.166666666666667011E-01 -0.139258097457669987E-03 -0.129864673787176997E-07
+  0.173691952385951996E-01 -0.166666666666667011E-01 -0.437167482025401009E-03
+  0.195228613586728994E-01 -0.126248570169920009E-02 -0.432956846913321015E-03
+ -0.125491321979356992E-02  0.193932787767274011E-01 -0.166666666666667011E-01
+  0.172966662158898984E-01 -0.121413435136702005E-06  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.972291637013934920E-05  0.179398884476381013E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.350105792551318000E-03  0.000000000000000000E+00  0.179358412563318005E-01
+ -0.564926760193745028E-05 -0.144639853967615001E-03 -0.314581136545178986E-06
+  0.178138319445972984E-01 -0.166666666666667011E-01  0.193703856781149014E-01
+ -0.124844632131722007E-02 -0.118120734926823010E-02  0.194604805411918991E-01
+ -0.431301129654014016E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.192675040339660014E-01 -0.119991063440438991E-02
+ -0.438790369133317979E-03 -0.117513383061983992E-02  0.194438480799019008E-01
+ -0.161600618881618991E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172964964024612013E-01 -0.929759484961031952E-07 -0.162354606074604997E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.102674406185603999E-05  0.173051827906571011E-01 -0.166666666666667011E-01
+ -0.143972810900854002E-03  0.173505492137603010E-01 -0.481358872158661014E-04
+ -0.558385319861011051E-03 -0.438374229680275993E-03  0.187218708148952986E-01
+ -0.166666666666667011E-01 -0.139937822338344987E-03  0.174078896544059003E-01
+ -0.227983603063990998E-07 -0.141801732328176988E-03 -0.908683871102670049E-06
+  0.178144027016026996E-01 -0.143597578982917004E-03  0.178145584095829000E-01
+ -0.106316067196835009E-05 -0.166666666666667011E-01 -0.140417715032831007E-03
+ -0.268130638269802001E-07  0.174078940527996016E-01 -0.166666666666667011E-01
+ -0.434071259447730998E-03  0.193423072809523998E-01 -0.122826363249218997E-02
+ -0.436064294450999015E-03 -0.116769061111692994E-02  0.194456918207231988E-01
+ -0.166666666666667011E-01  0.172982030081412011E-01 -0.280949762784711011E-07
+ -0.166666666666667011E-01 -0.189517761832509992E-05  0.178271977135750007E-01
+ -0.166666666666667011E-01  0.178265008932085994E-01 -0.134657557022567991E-05
+ -0.925862840583118960E-06  0.178144080134048990E-01 -0.166666666666667011E-01
+ -0.434780916834727992E-03  0.193789511951828995E-01 -0.125220905631005992E-02
+ -0.435357601613535017E-03 -0.118147901946294995E-02  0.194624869766942994E-01
+ -0.140177592469087011E-03 -0.166666666666667011E-01  0.177492253272222983E-01
+ -0.496756672234455021E-07  0.000000000000000000E+00 -0.140414345842645999E-03
+ -0.517733155226927036E-06  0.178140139187708997E-01 -0.166666666666667011E-01
+ -0.435069391154579001E-03  0.193884418550169003E-01 -0.125613776648296005E-02
+ -0.435069392971158982E-03 -0.118121619070897998E-02  0.194633634714499008E-01
+ -0.166666666666667011E-01 -0.139670760166673013E-03  0.172964743997286001E-01
+ -0.239698098600330014E-07 -0.141639610050856989E-03 -0.121288226150826994E-05
+  0.178147057806271984E-01 -0.141639611820286996E-03  0.178147057807866993E-01
+ -0.121288242338980006E-05 -0.166666666666667011E-01 -0.139670760473278990E-03
+ -0.239698131526809996E-07  0.172964743997317989E-01 -0.166666666666667011E-01
+ -0.435069393628180999E-03  0.194636361556436016E-01 -0.118148289887844995E-02
+ -0.435069390497556984E-03 -0.125641041453345998E-02  0.193887085193421987E-01
+ -0.139670572892313988E-03 -0.166666666666667011E-01  0.172964393935715992E-01
+ -0.239416569730802012E-07  0.000000000000000000E+00 -0.141638062327426994E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.121132811726249994E-05  0.178146133160351008E-01  0.000000000000000000E+00
+ -0.435065005443445983E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193885567700676983E-01 -0.125634384183789002E-02 -0.435073778621178993E-03
+ -0.118148063714991003E-02  0.194636163660859007E-01 -0.139670682267266008E-03
+ -0.166666666666667011E-01  0.172964394328304003E-01 -0.239712554208225998E-07
+ -0.141639519995227992E-03 -0.121303220633213010E-05  0.178147059371244997E-01
+ -0.166666666666667011E-01 -0.435069393052231988E-03  0.194636362649158005E-01
+ -0.118148299187565992E-02 -0.435069391073505994E-03 -0.125641054065752994E-02
+  0.193887086398788003E-01 -0.139670722089359006E-03 -0.166666666666667011E-01
+  0.172964393787637984E-01 -0.239432406825611007E-07 -0.141638922827194003E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.121140585286092006E-05  0.178146133355806986E-01  0.000000000000000000E+00
+ -0.435065002826116019E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193885565492021017E-01 -0.125634366816502010E-02 -0.435073781238435990E-03
+ -0.118148050457107996E-02  0.194636162212637985E-01 -0.139670533104681001E-03
+ -0.166666666666667011E-01  0.172964394486108987E-01 -0.239696530874210007E-07
+ -0.141638658891901002E-03 -0.121295339218169004E-05  0.178147058622277997E-01
+ -0.435090507624198979E-03 -0.166666666666667011E-01  0.194669892783514015E-01
+ -0.118530036438423006E-02 -0.435048275086066013E-03 -0.125913754589597991E-02
+  0.193915247086616990E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172960984744740011E-01 -0.213609391462743998E-07 -0.166666666666667011E-01
+ -0.106367312934038010E-05  0.178056546914526007E-01 -0.166666666666667011E-01
+  0.173098191165480010E-01 -0.233466429807012998E-07 -0.116342624068299999E-05
+  0.178149495945450001E-01 -0.166666666666667011E-01 -0.435069781984934981E-03
+  0.194633224227003000E-01 -0.118118559577364002E-02 -0.435069002140319990E-03
+ -0.125608528610373000E-02  0.193883926903337007E-01 -0.139659157768774999E-03
+ -0.166666666666667011E-01  0.172964811071458988E-01 -0.238453432888061996E-07
+ -0.141572594197268990E-03 -0.120675029799499991E-05  0.178147008767080006E-01
+ -0.141538198842383991E-03  0.178146978502537016E-01 -0.120361768174549009E-05
+ -0.166666666666667011E-01 -0.139653197525385994E-03 -0.237816337684043011E-07
+  0.172964810481539000E-01 -0.166666666666667011E-01 -0.435101709399810996E-03
+  0.194659175728730989E-01 -0.118451613849551002E-02 -0.435037071408026995E-03
+ -0.125773914873593992E-02  0.193902105130663999E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172962783353253005E-01 -0.218342668577251013E-07
+ -0.166666666666667011E-01 -0.109032448304442004E-05  0.178073744716683996E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.434677413069656997E-03
+  0.193010409680305015E-01 -0.117438144585429007E-02 -0.435460867464844024E-03
+ -0.110662888767641991E-02  0.193848162965736988E-01 -0.166666666666667011E-01
+  0.173066829056793998E-01 -0.235026937711214014E-07 -0.117850668811869000E-05
+  0.178149746842341007E-01 -0.434571088711364992E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.192763238295290996E-01 -0.115086473839449998E-02
+ -0.435566874652637017E-03 -0.108492248963690989E-02  0.193620926343746995E-01
+ -0.139700128428728001E-03 -0.166666666666667011E-01  0.172964062536986006E-01
+ -0.947853442084885955E-08 -0.141287229661082013E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.427116700035002018E-06  0.177495998236214987E-01 -0.166666666666667011E-01
+ -0.139670101692640009E-03  0.172964394288557984E-01 -0.239650162888405991E-07
+ -0.141636168849436004E-03 -0.121272540531506999E-05  0.178147056226493003E-01
+ -0.166666666666667011E-01 -0.435070601601427007E-03  0.194632424803467992E-01
+ -0.118112723413059005E-02 -0.435068182519665007E-03 -0.125598135252497989E-02
+  0.193882951902035992E-01 -0.140350019130283996E-03  0.178144321328113989E-01
+ -0.934070655061730960E-06 -0.166666666666667011E-01 -0.140500665652292997E-03
+ -0.934901878808751052E-06  0.178144580209842994E-01 -0.166666666666667011E-01
+ -0.435068213377728019E-03  0.193838387473541998E-01 -0.125145538065708989E-02
+ -0.435070570743591027E-03 -0.117665454921121997E-02  0.194586920952139995E-01
+ -0.166666666666667011E-01  0.172970581688366015E-01 -0.800807256662095042E-18
+ -0.166666666666667011E-01 -0.116362368362492996E-16  0.173226956581742017E-01
+ -0.166666666666667011E-01  0.173227088035448999E-01 -0.227142423077076990E-07
+ -0.109342148655364995E-05  0.178145868761214014E-01 -0.166666666666667011E-01
+ -0.139356858353544992E-03  0.172964916979878001E-01 -0.206044330373597012E-07
+ -0.139859905327302997E-03 -0.104722219521747004E-05  0.178145504281299986E-01
+ -0.435504189131306018E-03 -0.166666666666667011E-01  0.194761567651129015E-01
+ -0.120248342376421996E-02  0.000000000000000000E+00 -0.434633990251299008E-03
+ -0.125640745877263009E-02  0.193901277969210986E-01 -0.166666666666667011E-01
+ -0.435069416312690977E-03  0.000000000000000000E+00  0.194633539559233987E-01
+ -0.118120745095251002E-02 -0.435069367813045000E-03 -0.125612755432325993E-02
+  0.193884319840324990E-01 -0.166666666666667011E-01 -0.139670622926057006E-03
+  0.172964757091009988E-01 -0.239682454273153985E-07 -0.141638804410660994E-03
+ -0.121280234601187995E-05  0.178147056852437993E-01 -0.141603983295218001E-03
+  0.178147026131165002E-01 -0.120962066737389997E-05 -0.166666666666667011E-01
+ -0.139664589011853001E-03 -0.239035291403939993E-07  0.172964756492609006E-01
+ -0.166666666666667011E-01 -0.435069662231408008E-03  0.194635113006111984E-01
+ -0.118136680130286010E-02 -0.435069121894098010E-03 -0.125627780048987998E-02
+  0.193885794815036004E-01 -0.161604530780638008E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.172965258603989015E-01 -0.937092380803772050E-07
+ -0.162357917896980013E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.103420292441458007E-05  0.173052119919553000E-01 -0.139467894615960005E-03
+ -0.166666666666667011E-01  0.173042004359115001E-01 -0.214254882847721013E-07
+ -0.140431934037818008E-03 -0.107186911344032006E-05  0.178145703665167007E-01
+ -0.435495844207702974E-03 -0.166666666666667011E-01  0.194039307261216001E-01
+ -0.113156930142864010E-02 -0.434642344857009977E-03 -0.118520221101701007E-02
+  0.193210744918359996E-01 -0.166666666666667011E-01 -0.435081000308743025E-03
+  0.194615626446394997E-01 -0.117973440054902992E-02 -0.435057783388624999E-03
+ -0.125399790053982996E-02  0.193864078136761001E-01 -0.160852239681984991E-03
+ -0.166666666666667011E-01  0.172967516936828013E-01 -0.175963431721074013E-07
+ -0.161650172616747013E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.203401553309515001E-06  0.173054257802153014E-01
+ -0.139669192621242992E-03 -0.166666666666667011E-01  0.173052370218566010E-01
+ -0.234215439038825986E-07 -0.141543308308260998E-03 -0.116637645863834998E-05
+  0.178146593404366997E-01 -0.435080679695976980E-03 -0.166666666666667011E-01
+  0.194640875795392999E-01 -0.118216803173680995E-02 -0.435058104025100973E-03
+ -0.125653847166064996E-02  0.193888504672653991E-01 -0.415397501230701984E-03
+ -0.166666666666667011E-01  0.188279009057511007E-01 -0.898333912356110006E-03
+ -0.453392657545613985E-03 -0.100006119280005999E-02  0.192410594709606017E-01
+ -0.166666666666667011E-01  0.173196351930705005E-01 -0.236100996593661996E-07
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.123805471953726010E-05  0.178172764971170003E-01
+ -0.166666666666667011E-01  0.172958803103070990E-01 -0.240055324597324986E-07
+ -0.121597905415154999E-05  0.178147088629611004E-01 -0.166666666666667011E-01
+ -0.139360440123895005E-03  0.172964135806168012E-01 -0.206282988179274991E-07
+ -0.139848706255163002E-03 -0.104818854429194995E-05  0.178144673884889013E-01
+ -0.482623514256753016E-03 -0.166666666666667011E-01  0.184436867195339992E-01
+ -0.182833591562523009E-03 -0.374919021333009015E-03 -0.210075563706943995E-03
+  0.183321751743828996E-01 -0.143208310342222002E-03  0.178148148672475012E-01
+ -0.128486278788734005E-05 -0.166666666666667011E-01 -0.199920559280411991E-03
+ -0.199737151953955009E-05  0.178295061512010987E-01 -0.166666666666667011E-01
+ -0.433603816949246982E-03  0.193497617607279009E-01 -0.124503446125193997E-02
+ -0.436528005342376002E-03 -0.119072566655100006E-02  0.194675617176627014E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172951192319563997E-01
+ -0.214703229569797989E-07 -0.166666666666667011E-01 -0.107141837494050999E-05
+  0.178058595667948995E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.434603197477276975E-03  0.000000000000000000E+00  0.193058537144648011E-01
+ -0.118063268602804994E-02 -0.435534875277424982E-03 -0.111369468853339993E-02
+  0.193919603590869995E-01 -0.166666666666667011E-01  0.173056689021276987E-01
+ -0.235558337294687006E-07 -0.118363055424740992E-05  0.178149857852361003E-01
+ -0.435171466639532013E-03 -0.166666666666667011E-01  0.194709518243956006E-01
+ -0.119094077767639001E-02 -0.434967284384675014E-03 -0.126075351231549003E-02
+  0.193933220864733989E-01 -0.435279548009691016E-03 -0.166666666666667011E-01
+  0.194122108470178995E-01 -0.114336878037841995E-02 -0.434859088315804978E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.119657203671675997E-02  0.193454506408375984E-01
+ -0.166666666666667011E-01 -0.139670792135546010E-03  0.172964364510711996E-01
+ -0.239726277026338997E-07 -0.141640184749274994E-03 -0.121310638409450998E-05
+  0.178147060027833010E-01 -0.166666666666667011E-01 -0.140348082594213009E-03
+  0.178144298829873010E-01 -0.933774300842211964E-06 -0.140348090712458993E-03
+ -0.933802946513419972E-06  0.178144318617154997E-01 -0.435069485964877014E-03
+  0.194636360495127995E-01 -0.118148312545114003E-02 -0.166666666666667011E-01
+ -0.435069298160832996E-03 -0.125640935270473000E-02  0.193887056176578997E-01
+ -0.166666666666667011E-01 -0.139670745411772993E-03  0.172964393771041017E-01
+ -0.239719351629878006E-07 -0.141639884975546990E-03 -0.121306573574195002E-05
+  0.178147059632727985E-01 -0.166666666666667011E-01  0.193887976520839000E-01
+ -0.125650164472002990E-02 -0.166666666666667011E-01 -0.118157208029668996E-02
+  0.194637273004341985E-01 -0.166666666666667011E-01  0.193887977203370011E-01
+ -0.125650167109275009E-02 -0.118157208884183003E-02  0.194637273863745991E-01
+ -0.166666666666667011E-01 -0.435069390354664992E-03  0.193887118325760006E-01
+ -0.125641384488361994E-02 -0.435069393771072991E-03 -0.118148619567291996E-02
+  0.194636395582745009E-01 -0.139674686773609991E-03 -0.166666666666667011E-01
+  0.173041492371790000E-01 -0.235404446357735999E-07  0.000000000000000000E+00
+ -0.141584559729122987E-03 -0.117458827916782993E-05  0.178146676653142995E-01
+ -0.166666666666667011E-01 -0.139670743898058004E-03  0.000000000000000000E+00
+  0.172964393844322988E-01 -0.239719184224240012E-07 -0.141639876162939988E-03
+ -0.121306489605736002E-05  0.178147059624353017E-01 -0.166666666666667011E-01
+ -0.140348087318788991E-03  0.178144318414940016E-01 -0.933801041250177032E-06
+ -0.140348087395501987E-03 -0.933801311940696952E-06  0.178144318601919996E-01
+ -0.435069392951675977E-03  0.194636364109944011E-01 -0.118148311441449989E-02
+ -0.166666666666667011E-01 -0.435069391174062005E-03 -0.125641070624533989E-02
+  0.193887087793657986E-01 -0.166666666666667011E-01 -0.435069391183394005E-03
+  0.193887815136928009E-01 -0.125648507926827001E-02 -0.435069392942343978E-03
+ -0.118155584839527996E-02  0.194637107839485991E-01 -0.139700198051380994E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172964151566069001E-01
+ -0.947849285071832001E-08 -0.141287467734219009E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.427106386139514992E-06  0.177495992957071996E-01
+ -0.140178145272992012E-03 -0.166666666666667011E-01  0.177492248109030991E-01
+ -0.496797001355079032E-07 -0.140415042956877005E-03 -0.517779462070214031E-06
+  0.178140139540239988E-01 -0.435069391423600026E-03 -0.166666666666667011E-01
+  0.193887083934986999E-01 -0.125641030816468009E-02 -0.435069392702138011E-03
+ -0.118148272213002004E-02  0.194636360081575990E-01 -0.166666666666667011E-01
+ -0.139670744267195992E-03  0.172964393844029993E-01 -0.239719223657696016E-07
+ -0.141639878293219989E-03 -0.121306508981309993E-05  0.178147059625506990E-01
+ -0.435069391344719005E-03 -0.166666666666667011E-01  0.193887083608434013E-01
+ -0.125641029540731006E-02 -0.435069392781018977E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.118148271833985003E-02
+  0.194636359700672001E-01 -0.435069391241579015E-03 -0.166666666666667011E-01
+  0.193887087819448016E-01 -0.125641070755089994E-02 -0.435069392884159022E-03
+ -0.118148311473540010E-02  0.194636364115348993E-01 -0.139670743900726009E-03
+ -0.166666666666667011E-01  0.172964393843643011E-01 -0.239719184546282004E-07
+ -0.141639876179015995E-03 -0.121306489779411998E-05  0.178147059624346009E-01
+ -0.435069391188663010E-03 -0.166666666666667011E-01  0.193887118630544017E-01
+ -0.125641385909356002E-02 -0.435069392937075027E-03 -0.118148619756065993E-02
+  0.194636395637313997E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172965077542917012E-01 -0.927568247375718968E-07 -0.166666666666667011E-01
+ -0.102046200973759010E-05  0.173051339160060005E-01 -0.166666666666667011E-01
+  0.193887977549708004E-01 -0.125650168627735996E-02 -0.118157208932236990E-02
+  0.194637273908664990E-01 -0.166666666666667011E-01 -0.435069391170066991E-03
+  0.193884418583467992E-01 -0.125613776955457008E-02 -0.435069392955670991E-03
+ -0.118121619348252006E-02  0.194633634743846991E-01 -0.139670761454596990E-03
+ -0.166666666666667011E-01  0.172964743993801011E-01 -0.239698237251752988E-07
+ -0.141639617487515998E-03 -0.121288294408737000E-05  0.178147057813331997E-01
+ -0.141639617489162006E-03  0.178147057813334009E-01 -0.121288294423791994E-05
+ -0.166666666666667011E-01 -0.139670761454882000E-03 -0.239698237282632997E-07
+  0.172964743993801011E-01 -0.166666666666667011E-01 -0.435069391188130992E-03
+  0.193887118630363987E-01 -0.125641385908596996E-02 -0.435069392937606990E-03
+ -0.118148619756086007E-02  0.194636395637294013E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172965077542917012E-01 -0.927568247372701942E-07
+ -0.166666666666667011E-01 -0.102046200973822008E-05  0.173051339160060005E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.139674680709100989E-03
+  0.173041376249854009E-01 -0.235410513736760014E-07 -0.141584640574256011E-03
+ -0.117464392253588008E-05  0.178146677207011994E-01 -0.166666666666667011E-01
+  0.193887977549706998E-01 -0.125650168627732006E-02 -0.118157208932236990E-02
+  0.194637273908664990E-01 -0.139674680709099010E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.173041376249854009E-01 -0.235410513737710014E-07
+ -0.141584640574248991E-03 -0.117464392253522998E-05  0.178146677207011994E-01
+ -0.161601715880123993E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172965076669070010E-01 -0.926692831361944028E-07 -0.162351548531821993E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.102032125850803006E-05  0.173051453880029987E-01 -0.166666666666667011E-01
+ -0.139670774820815007E-03  0.172964393819063991E-01 -0.239722506017439995E-07
+ -0.141640054670844003E-03 -0.121308123358901007E-05  0.178147059780696011E-01
+ -0.166666666666667011E-01 -0.435069333337653979E-03  0.193887254401867992E-01
+ -0.125642901506816990E-02 -0.435069450788072999E-03 -0.118150189097284992E-02
+  0.194636551938420013E-01 -0.140349304427809005E-03  0.178144320166155994E-01
+ -0.933957824287751043E-06 -0.166666666666667011E-01 -0.140349299344085009E-03
+ -0.933939903071356043E-06  0.178144307788948991E-01 -0.166666666666667011E-01
+ -0.435068660852071026E-03  0.000000000000000000E+00  0.193848776728661985E-01
+ -0.125250844957367998E-02 -0.435070123271966982E-03 -0.117767775440582993E-02
+  0.194597409817202013E-01 -0.166666666666667011E-01  0.172969944929179005E-01
+ -0.779115314308188993E-07 -0.166666666666667011E-01 -0.755324938795983959E-05
+  0.179004996942120007E-01 -0.166666666666667011E-01  0.178976999748934017E-01
+ -0.387667548982099008E-05 -0.233628102656076998E-06  0.178135930268625009E-01
+ -0.166666666666667011E-01 -0.139686009466056008E-03  0.172964370591520006E-01
+ -0.241363103321880989E-07 -0.141728009792282988E-03 -0.122115128991173003E-05
+  0.178147135015222988E-01 -0.435039396203468979E-03 -0.166666666666667011E-01
+  0.193948601328561010E-01 -0.126336416897148991E-02  0.000000000000000000E+00
+ -0.435099385069850019E-03 -0.118873902080404996E-02  0.194708368876741998E-01
+ -0.166666666666667011E-01 -0.435069389255753991E-03  0.000000000000000000E+00
+  0.193884422504904987E-01 -0.125613821100519006E-02 -0.435069394869983992E-03
+ -0.118121665337716998E-02  0.194633639337775007E-01 -0.166666666666667011E-01
+ -0.139671380124086001E-03  0.172964743448792008E-01 -0.239764688219899014E-07
+ -0.141643188367961012E-03 -0.121320975071175002E-05  0.178147060922408014E-01
+ -0.141639844172828988E-03  0.178147058017989017E-01 -0.121290395199703009E-05
+ -0.166666666666667011E-01 -0.139670800627344013E-03 -0.239702482699698004E-07
+  0.172964743394126014E-01 -0.166666666666667011E-01 -0.435068235431639017E-03
+  0.193889053227464005E-01 -0.125663654535589997E-02 -0.435070548689851008E-03
+ -0.118172116532546008E-02  0.194638725033117015E-01 -0.161678337891334005E-03
+ -0.166666666666667011E-01  0.172964935301390017E-01 -0.110214153364595997E-06
+ -0.162422470311079996E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.120622187414538000E-05  0.173052150067064016E-01
+ -0.139675704206640992E-03 -0.166666666666667011E-01  0.173040340098533016E-01
+ -0.235576501556051000E-07 -0.141591382929153007E-03 -0.117568258886253002E-05
+  0.178146687262423983E-01 -0.435067411943173014E-03 -0.166666666666667011E-01
+  0.193886625537611004E-01 -0.125640355917502997E-02 -0.435071372170113015E-03
+ -0.118150457751595010E-02  0.194636501950278991E-01 -0.166666666666667011E-01
+ -0.435069317670600008E-03  0.193887257892246985E-01 -0.125642966435953992E-02
+ -0.435069466455119977E-03 -0.118150274118994998E-02  0.194636560434848990E-01
+ -0.161601801933608996E-03 -0.166666666666667011E-01  0.172965054748796011E-01
+ -0.926502679423437959E-07 -0.162351306454973002E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.101988138442591992E-05
+  0.173051393629530988E-01 -0.139695639987958987E-03 -0.166666666666667011E-01
+  0.173041438368830985E-01 -0.237586532625412003E-07 -0.141701626146447995E-03
+ -0.118517343233528007E-05  0.178146776002246002E-01 -0.435031309395256976E-03
+ -0.166666666666667011E-01  0.193945788398414004E-01 -0.126324081468956006E-02
+ -0.435107470132009989E-03 -0.118873511389969004E-02  0.194708001084177990E-01
+ -0.435268203178918020E-03 -0.166666666666667011E-01  0.194803280841455984E-01
+ -0.120221471825937998E-02 -0.434870455402744990E-03 -0.126728100453280001E-02
+  0.194000950994309002E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172950455023187996E-01 -0.209695226068837989E-07 -0.166666666666667011E-01
+ -0.104309478839938997E-05  0.178040398395854990E-01 -0.166666666666667011E-01
+  0.173086281051332003E-01 -0.234055546297331013E-07 -0.116921175366159004E-05
+  0.178149613334783012E-01 -0.166666666666667011E-01 -0.141165531303230992E-03
+  0.172962108365398992E-01 -0.473445694663900975E-05 -0.544660594244081005E-03
+ -0.346550124345780994E-04  0.183333673483142001E-01 -0.139754726222275990E-03
+ -0.166666666666667011E-01  0.176170084219247000E-01 -0.246149781579885013E-07
+ -0.140206540519596996E-03 -0.567788699151559999E-06  0.178140588497450007E-01
+ -0.165095492820345002E-03  0.178223326710730996E-01 -0.681196695085373968E-05
+ -0.166666666666667011E-01 -0.150450799069420005E-03 -0.294765827683784011E-06
+  0.176166680355617014E-01 -0.166666666666667011E-01 -0.421169440873449990E-03
+  0.189193905894529009E-01 -0.945436157372043042E-03 -0.448291280314557015E-03
+ -0.100548420996317994E-02  0.192537236307864998E-01 -0.166666666666667011E-01
+  0.173199119681975988E-01 -0.235984477489147988E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.123676178966295993E-05  0.178172740173286014E-01 -0.166666666666667011E-01
+ -0.435171428461128974E-03  0.194709389729267983E-01 -0.119092785894799004E-02
+ -0.434967322587667975E-03 -0.126074169224678996E-02  0.193933111644412001E-01
+ -0.166666666666667011E-01  0.172958818582512990E-01 -0.240054389194565998E-07
+ -0.121597096057541996E-05  0.178147088549035007E-01 -0.434490077563752020E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.192819871000558009E-01 -0.115810409202821997E-02 -0.435647599072524984E-03
+ -0.109302488139546004E-02  0.193702791629623006E-01 -0.435071227206677987E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.194623470628700994E-01
+ -0.118137412448114003E-02 -0.435067556908351005E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.125600599177529007E-02
+  0.193876331694533000E-01 -0.166666666666667011E-01 -0.139670743889624998E-03
+  0.172964393844356017E-01 -0.239719183317204011E-07 -0.141639876114236000E-03
+ -0.121306489159398005E-05  0.178147059624313014E-01 -0.166666666666667011E-01
+ -0.140348087313353994E-03  0.178144318418225998E-01 -0.933801044951077008E-06
+ -0.140348087388714990E-03 -0.933801310870910963E-06  0.178144318601911010E-01
+ -0.435069392936102985E-03  0.194636364108025997E-01 -0.118148311410958993E-02
+ -0.166666666666667011E-01 -0.435069391189634998E-03 -0.125641070617519006E-02
+  0.193887087796365994E-01 -0.166666666666667011E-01 -0.139670745273937991E-03
+  0.172964393844580004E-01 -0.239719332022523000E-07 -0.141639884104326007E-03
+ -0.121306562274284990E-05  0.178147059631592990E-01 -0.166666666666667011E-01
+  0.193887976615549004E-01 -0.125650164862212989E-02 -0.166666666666667011E-01
+ -0.118157208096353003E-02  0.194637273070949017E-01 -0.166666666666667011E-01
+  0.193887977248977002E-01 -0.125650167309232009E-02 -0.118157208890513009E-02
+  0.194637273869662994E-01 -0.166666666666667011E-01 -0.435069391187089996E-03
+  0.193887118682170012E-01 -0.125641386440402997E-02 -0.435069392938647987E-03
+ -0.118148620277655990E-02  0.194636395690588986E-01 -0.139674686768426990E-03
+ -0.166666666666667011E-01  0.173041492303226997E-01 -0.235404449782952015E-07
+  0.000000000000000000E+00 -0.141584559767933007E-03 -0.117458831126650007E-05
+  0.178146676653491987E-01 -0.166666666666667011E-01 -0.139670743889684006E-03
+  0.000000000000000000E+00  0.172964393844356017E-01 -0.239719183323613007E-07
+ -0.141639876114578011E-03 -0.121306489162555998E-05  0.178147059624313986E-01
+ -0.166666666666667011E-01 -0.140348087304587000E-03  0.178144318418189986E-01
+ -0.933801043795399019E-06 -0.140348087379959000E-03 -0.933801309750847956E-06
+  0.178144318601899006E-01 -0.435069392936221976E-03  0.194636364108877989E-01
+ -0.118148311419265998E-02 -0.166666666666667011E-01 -0.435069391189516982E-03
+ -0.125641070625854005E-02  0.193887087797160983E-01 -0.166666666666667011E-01
+ -0.435069391192432022E-03  0.193887815134137984E-01 -0.125648507878967997E-02
+ -0.435069392933306015E-03 -0.118155584779333005E-02  0.194637107833882002E-01
+ -0.139700198049885988E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964151566860000E-01 -0.947849280219223922E-08 -0.141287467724896009E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.427106383695399994E-06
+  0.177495992954452009E-01 -0.140178145690196993E-03 -0.166666666666667011E-01
+  0.177492248106932010E-01 -0.496797033573775009E-07 -0.140415043482606990E-03
+ -0.517779497585701998E-06  0.178140139541216984E-01 -0.435069391089031011E-03
+ -0.166666666666667011E-01  0.193887087977068009E-01 -0.125641072675559008E-02
+ -0.435069393036708002E-03 -0.118148313570800006E-02  0.194636364323594983E-01
+ -0.166666666666667011E-01 -0.139670745392212007E-03  0.172964393844482998E-01
+ -0.239719344734911991E-07 -0.141639884787109996E-03 -0.121306568527222010E-05
+  0.178147059632214992E-01 -0.435069390245067006E-03 -0.166666666666667011E-01
+  0.193887086999318010E-01 -0.125641068746266005E-02 -0.435069393880670977E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.118148312670310991E-02  0.194636363420862997E-01 -0.435069390179441007E-03
+ -0.166666666666667011E-01  0.193887087448267985E-01 -0.125641069098422005E-02
+ -0.435069393946296975E-03 -0.118148311374637006E-02  0.194636364063980986E-01
+ -0.139670743899172997E-03 -0.166666666666667011E-01  0.172964393844271015E-01
+ -0.239719184348657009E-07 -0.141639876169436012E-03 -0.121306489668503007E-05
+  0.178147059624364015E-01 -0.435069391188929019E-03 -0.166666666666667011E-01
+  0.193887118630633008E-01 -0.125641385909733001E-02 -0.435069392936809993E-03
+ -0.118148619756053004E-02  0.194636395637322983E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172965077542917983E-01 -0.927568247378536014E-07
+ -0.166666666666667011E-01 -0.102046200973724006E-05  0.173051339160060005E-01
+ -0.166666666666667011E-01  0.193887977549708004E-01 -0.125650168627737991E-02
+ -0.118157208932236990E-02  0.194637273908664990E-01 -0.166666666666667011E-01
+ -0.435069391171351012E-03  0.193884418583585988E-01 -0.125613776953759993E-02
+ -0.435069392954388001E-03 -0.118121619344631009E-02  0.194633634743581994E-01
+ -0.139670761454455990E-03 -0.166666666666667011E-01  0.172964743993848993E-01
+ -0.239698237233977009E-07 -0.141639617486649991E-03 -0.121288294398642993E-05
+  0.178147057813330990E-01 -0.141639617486657987E-03  0.178147057813330990E-01
+ -0.121288294398714991E-05 -0.166666666666667011E-01 -0.139670761454456992E-03
+ -0.239698237233852998E-07  0.172964743993848993E-01 -0.166666666666667011E-01
+ -0.435069391188925984E-03  0.193887118630632002E-01 -0.125641385909728990E-02
+ -0.435069392936811999E-03 -0.118148619756053004E-02  0.194636395637322983E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172965077542917983E-01
+ -0.927568247378527941E-07 -0.166666666666667011E-01 -0.102046200973724006E-05
+  0.173051339160060005E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.139674680709099010E-03  0.173041376249854009E-01 -0.235410513736267014E-07
+ -0.141584640574244004E-03 -0.117464392253481007E-05  0.178146677207011994E-01
+ -0.166666666666667011E-01  0.193887977549708004E-01 -0.125650168627737991E-02
+ -0.118157208932236990E-02  0.194637273908664990E-01 -0.139674680709099010E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.173041376249854009E-01
+ -0.235410513735998015E-07 -0.141584640574244004E-03 -0.117464392253481007E-05
+  0.178146677207011994E-01 -0.161601720324868993E-03 -0.166666666666667011E-01
+  0.172965076669642989E-01 -0.926701979568355970E-07 -0.162351552541144001E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.102033090018047992E-05  0.173051453908108985E-01
+ -0.166666666666667011E-01 -0.435069391175761004E-03  0.193884418585090999E-01
+ -0.125613776960263991E-02 -0.435069392949976978E-03 -0.118121619344674009E-02
+  0.194633634743759006E-01 -0.166666666666667011E-01 -0.139670761454455990E-03
+  0.172964743993851006E-01 -0.239698237233875994E-07 -0.141639617486650994E-03
+ -0.121288294398543001E-05  0.178147057813330990E-01 -0.141639617476079996E-03
+  0.178147057813322005E-01 -0.121288294301813998E-05 -0.166666666666667011E-01
+ -0.139670761452624013E-03 -0.239698237036860001E-07  0.172964743993851006E-01
+ -0.166666666666667011E-01 -0.435069391192448990E-03  0.193887815134013014E-01
+ -0.125648507877692997E-02 -0.435069392933288993E-03 -0.118155584778073010E-02
+  0.194637107833748012E-01 -0.166666666666667011E-01  0.172964151566857016E-01
+ -0.947849086778118929E-08 -0.166666666666667011E-01 -0.427106298729459992E-06
+  0.177495992959284983E-01 -0.166666666666667011E-01  0.177492248112494991E-01
+ -0.496796927082655975E-07 -0.517779383578772961E-06  0.178140139540073003E-01
+ -0.166666666666667011E-01 -0.435069391188929019E-03  0.193887118675337006E-01
+ -0.125641386366846992E-02 -0.435069392936809993E-03 -0.118148620203093000E-02
+  0.194636395683033987E-01 -0.139674686766604988E-03 -0.166666666666667011E-01
+  0.173041492306688985E-01 -0.235404449394736996E-07  0.000000000000000000E+00
+ -0.141584559754340987E-03 -0.117458830859994993E-05  0.178146676653464994E-01
+ -0.166666666666667011E-01 -0.139670743889622992E-03  0.000000000000000000E+00
+  0.172964393844356017E-01 -0.239719183316415013E-07 -0.141639876114222989E-03
+ -0.121306489159275990E-05  0.178147059624313014E-01 -0.166666666666667011E-01
+ -0.140348087304557998E-03  0.178144318418215000E-01 -0.933801043825826983E-06
+ -0.140348087379918993E-03 -0.933801309744065975E-06  0.178144318601899006E-01
+ -0.435069392936097998E-03  0.194636364108022007E-01 -0.118148311410921003E-02
+ -0.166666666666667011E-01 -0.435069391189641015E-03 -0.125641070617487000E-02
+  0.193887087796364016E-01 -0.166666666666667011E-01 -0.435069391192461025E-03
+  0.193887815133983003E-01 -0.125648507877364007E-02 -0.435069392933277012E-03
+ -0.118155584777734001E-02  0.194637107833714011E-01 -0.139700198047769001E-03
+ -0.166666666666667011E-01  0.172964151566880990E-01 -0.947849280983375026E-08
+ -0.141287467715877995E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.427106384123253011E-06  0.177495992955506998E-01
+ -0.140178145551705995E-03 -0.166666666666667011E-01  0.177492248107965003E-01
+ -0.496797023203806995E-07 -0.140415043308048001E-03 -0.517779485956588989E-06
+  0.178140139541092986E-01 -0.435069391188775008E-03 -0.166666666666667011E-01
+  0.193887087796628006E-01 -0.125641070621944004E-02 -0.435069392936964004E-03
+ -0.118148311416520994E-02  0.194636364108560014E-01 -0.166666666666667011E-01
+ -0.139670745283318996E-03  0.172964393844581010E-01 -0.239719333030583992E-07
+ -0.141639884158474994E-03 -0.121306562769789989E-05  0.178147059631642984E-01
+ -0.435069390314679995E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.193887086857719992E-01 -0.125641066837268002E-02 -0.435069393811057987E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.118148310576349008E-02
+  0.194636363266590985E-01  0.000000000000000000E+00 -0.435069390314679995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193887087494363994E-01
+ -0.125641069296695000E-02 -0.435069393811057987E-03 -0.118148311374545001E-02
+  0.194636364069358005E-01 -0.139670743897207989E-03 -0.166666666666667011E-01
+  0.172964393844356988E-01 -0.239719184131230996E-07 -0.141639876158003994E-03
+ -0.121306489559900995E-05  0.178147059624353017E-01 -0.435069391188779996E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193887088940556986E-01
+ -0.125641082318989007E-02 -0.435069392936957987E-03 -0.118148322855787997E-02
+  0.194636365278264001E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172964393409659987E-01 -0.239438466648548014E-07 -0.166666666666667011E-01
+ -0.121143526110419994E-05  0.178146131934183993E-01 -0.166666666666667011E-01
+  0.172964393844281007E-01 -0.239719133478075007E-07 -0.121306464654854992E-05
+  0.178147059621872987E-01 -0.166666666666667011E-01 -0.435069391175080993E-03
+  0.193884418584855008E-01 -0.125613776959208998E-02 -0.435069392950656990E-03
+ -0.118121619344611992E-02  0.194633634743727989E-01 -0.139670761454532995E-03
+ -0.166666666666667011E-01  0.172964743993852012E-01 -0.239698237241572016E-07
+ -0.141639617487093999E-03 -0.121288294402584004E-05  0.178147057813331997E-01
+ -0.141639617487093999E-03  0.178147057813331997E-01 -0.121288294402580997E-05
+ -0.166666666666667011E-01 -0.139670761454532995E-03 -0.239698237241842008E-07
+  0.172964743993852012E-01 -0.166666666666667011E-01 -0.435069391188779996E-03
+  0.193887088940556986E-01 -0.125641082318989007E-02 -0.435069392936957987E-03
+ -0.118148322855787997E-02  0.194636365278264001E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172964393409659987E-01 -0.239438466648548014E-07
+ -0.166666666666667011E-01 -0.121143526110419994E-05  0.178146131934183993E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.435064994898318983E-03
+  0.000000000000000000E+00  0.193885567754814997E-01 -0.125634406113793993E-02
+ -0.435073789166010982E-03 -0.118148100321111005E-02  0.194636166968597014E-01
+ -0.166666666666667011E-01  0.172964393844281007E-01 -0.239719133478075007E-07
+ -0.121306464654854992E-05  0.178147059621872987E-01 -0.435064994898318983E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193885567754814997E-01 -0.125634406113793993E-02 -0.435073789166010982E-03
+ -0.118148100321111005E-02  0.194636166968597014E-01 -0.161601720097545002E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172965076670156016E-01
+ -0.926701503617335961E-07 -0.162351552330414998E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.102033039390967000E-05
+  0.173051453906522996E-01 -0.166666666666667011E-01 -0.435069886305821003E-03
+  0.194633772872509006E-01 -0.118124110147639996E-02 -0.435068897819141016E-03
+ -0.125613724075302000E-02  0.193884430666162004E-01 -0.166666666666667011E-01
+ -0.139670761026265005E-03  0.172964744480312016E-01 -0.239698159440932015E-07
+ -0.141639614515128010E-03 -0.121288245172214010E-05  0.178147057808154992E-01
+ -0.141639611356817998E-03  0.178147057804570984E-01 -0.121288216154248992E-05
+ -0.166666666666667011E-01 -0.139670760479100993E-03 -0.239698100456143004E-07
+  0.172964744480210014E-01 -0.166666666666667011E-01 -0.435069393124601020E-03
+  0.194636395974100984E-01 -0.118148625156355995E-02 -0.435069391001137993E-03
+ -0.125641387127325998E-02  0.193887118958671993E-01 -0.166666666666667011E-01
+  0.172965077543725011E-01 -0.927568140336766949E-07 -0.166666666666667011E-01
+ -0.102046181269095990E-05  0.173051339147857995E-01 -0.166666666666667011E-01
+  0.173041376239574003E-01 -0.235410465544014012E-07 -0.117464369127346007E-05
+  0.178146677204709010E-01 -0.166666666666667011E-01 -0.435069392055963979E-03
+  0.193887089236092006E-01 -0.125641083591904006E-02 -0.435069392069774004E-03
+ -0.118148322857497003E-02  0.194636365312624987E-01 -0.435064994897326992E-03
+ -0.166666666666667011E-01  0.193885567754197990E-01 -0.125634406109512999E-02
+  0.000000000000000000E+00 -0.435073789167003027E-03 -0.118148100318356004E-02
+  0.194636166968270991E-01 -0.166666666666667011E-01 -0.139670744005780998E-03
+  0.000000000000000000E+00  0.172964393844374995E-01 -0.239719195794844008E-07
+ -0.141639876784672010E-03 -0.121306495294346998E-05  0.178147059624923984E-01
+ -0.166666666666667011E-01 -0.435069390971013978E-03  0.193887087720762998E-01
+ -0.125641070286031001E-02 -0.435069393154724005E-03 -0.118148311400433992E-02
+  0.194636364098218009E-01 -0.140348087384463996E-03  0.178144318601930994E-01
+ -0.933801313166044989E-06 -0.166666666666667011E-01 -0.140348087290235009E-03
+ -0.933800980670895958E-06  0.178144318372258983E-01 -0.166666666666667011E-01
+ -0.139670745283319999E-03  0.172964393844581010E-01 -0.239719333030395990E-07
+ -0.141639884158481011E-03 -0.121306562769829991E-05  0.178147059631642984E-01
+ -0.435069390314711004E-03 -0.166666666666667011E-01  0.193887086857633985E-01
+ -0.125641066836418010E-02  0.000000000000000000E+00 -0.435069393811026979E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.118148310575508990E-02  0.194636363266489990E-01 -0.435069390314652023E-03
+ -0.166666666666667011E-01  0.193887087494357992E-01 -0.125641069296695998E-02
+ -0.435069393811086014E-03 -0.118148311374589995E-02  0.194636364069360017E-01
+ -0.139670743896895007E-03 -0.166666666666667011E-01  0.172964393844356017E-01
+ -0.239719184097173985E-07 -0.141639876156198012E-03 -0.121306489543125995E-05
+  0.178147059624349999E-01 -0.166666666666667011E-01 -0.435069391192448990E-03
+  0.193887815133984009E-01 -0.125648507877395990E-02 -0.435069392933288993E-03
+ -0.118155584777784005E-02  0.194637107833718001E-01 -0.139700198047441003E-03
+ -0.166666666666667011E-01  0.172964151566879984E-01 -0.947849280858640987E-08
+ -0.141287467714373990E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.427106384068703984E-06
+  0.177495992955511994E-01 -0.140178145551758009E-03 -0.166666666666667011E-01
+  0.177492248107972012E-01 -0.496797023215807996E-07 -0.140415043308110993E-03
+ -0.517779485960914998E-06  0.178140139541092986E-01 -0.435069391188743024E-03
+ -0.166666666666667011E-01  0.193887087796531001E-01 -0.125641070621038001E-02
+ -0.435069392936995988E-03 -0.118148311415690994E-02  0.194636364108470017E-01
+ -0.139670745278628006E-03 -0.166666666666667011E-01  0.172964393844581010E-01
+ -0.239719332526419989E-07 -0.141639884131401002E-03 -0.121306562522040010E-05
+  0.178147059631618004E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.193887976612414012E-01 -0.125650164849316990E-02  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118157208094097992E-02  0.194637273068695993E-01
+ -0.166666666666667011E-01  0.172964393844281007E-01 -0.239719133478075007E-07
+ -0.121306464654854992E-05  0.178147059621872987E-01 -0.166666666666667011E-01
+ -0.139670744005789997E-03  0.172964393844378984E-01 -0.239719195795233015E-07
+ -0.141639876784720013E-03 -0.121306495294561001E-05  0.178147059624923984E-01
+ -0.435069390970999992E-03 -0.166666666666667011E-01  0.193887087720760014E-01
+ -0.125641070286029006E-02 -0.435069393154737991E-03 -0.118148311400450992E-02
+  0.194636364098219015E-01 -0.140348087384448004E-03  0.178144318601930994E-01
+ -0.933801313164241021E-06 -0.166666666666667011E-01 -0.140348087290217987E-03
+ -0.933800980664578045E-06  0.178144318372256000E-01 -0.166666666666667011E-01
+ -0.435069391192457989E-03  0.193887815133986993E-01 -0.125648507877402994E-02
+ -0.435069392933279994E-03 -0.118155584777777001E-02  0.194637107833719007E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172964151566877000E-01
+ -0.947849244191978996E-08 -0.166666666666667011E-01 -0.427106377284980004E-06
+  0.177495993062723005E-01 -0.166666666666667011E-01 -0.140178145569362010E-03
+  0.177492248215363989E-01 -0.496797136596620033E-07 -0.140415043288559006E-03
+ -0.517779487315692952E-06  0.178140139541107002E-01 -0.166666666666667011E-01
+  0.193887977549708004E-01 -0.125650168627737991E-02 -0.118157208932236990E-02
+  0.194637273908664990E-01 -0.435069390319028026E-03 -0.166666666666667011E-01
+  0.193887087496057986E-01 -0.125641069305237993E-02 -0.435069393806710987E-03
+ -0.118148311376662990E-02  0.194636364069748005E-01 -0.139670783800487997E-03
+ -0.166666666666667011E-01  0.172964393410236991E-01 -0.239438517116492010E-07
+ -0.141639277731674996E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.121143550903037006E-05
+  0.178146131936444997E-01 -0.166666666666667011E-01 -0.435630521898627018E-03
+  0.193599440195765006E-01 -0.109077905787125992E-02 -0.434507214005837999E-03
+ -0.113826801288032008E-02  0.192764807435569993E-01 -0.166666666666667011E-01
+ -0.139536670047989993E-03  0.173126649368985017E-01 -0.217199091649137011E-07
+ -0.140761520198303991E-03 -0.106837084216960004E-05  0.178145668887513993E-01
+ -0.559739137191329981E-03  0.187342109887733005E-01 -0.453812468425848997E-03
+ -0.166666666666667011E-01 -0.144437867213283009E-03 -0.570571543797145032E-04
+  0.173662394826822990E-01 -0.166666666666667011E-01 -0.141554480273248993E-03
+  0.173902726296997004E-01 -0.612977187720352984E-05 -0.543269256103840045E-03
+ -0.155576690918531002E-04  0.183343607413550998E-01 -0.166666666666667011E-01
+  0.190435224881807010E-01 -0.109812319463910009E-02 -0.166666666666667011E-01
+ -0.111910148431286992E-02  0.192002058717063995E-01 -0.166666666666667011E-01
+  0.191810336689041995E-01 -0.116112314020530008E-02 -0.117547931931102998E-02
+  0.194335300141195988E-01 -0.166666666666667011E-01 -0.435330234208519001E-03
+  0.194710205585382004E-01 -0.119426070267823997E-02 -0.434808332877938987E-03
+ -0.125626104852707998E-02  0.193894446024132014E-01 -0.140176366249097992E-03
+ -0.166666666666667011E-01  0.177492266081854984E-01 -0.496668683742545015E-07
+  0.000000000000000000E+00 -0.140412798776869000E-03 -0.517630021288783052E-06
+  0.178140137898258992E-01 -0.166666666666667011E-01 -0.435069391186561989E-03
+  0.000000000000000000E+00  0.193884418600598005E-01 -0.125613777099214995E-02
+ -0.435069392939175994E-03 -0.118121619465940005E-02  0.194633634756198985E-01
+ -0.166666666666667011E-01 -0.139670763690922996E-03  0.172964743992523005E-01
+ -0.239698477564660998E-07 -0.141639630395104001E-03 -0.121288412595031991E-05
+  0.178147057825145012E-01 -0.141639614589797013E-03  0.178147057809878995E-01
+ -0.121288267825071997E-05 -0.166666666666667011E-01 -0.139670760952332988E-03
+ -0.239698183155154984E-07  0.172964743992174985E-01 -0.166666666666667011E-01
+ -0.435069386976256979E-03  0.193887089986198996E-01 -0.125641101907938003E-02
+ -0.435069397149481980E-03 -0.118148348210879005E-02  0.194636367633517007E-01
+ -0.139670105391945004E-03 -0.166666666666667011E-01  0.172964392993067012E-01
+ -0.239365206872189997E-07  0.000000000000000000E+00 -0.141635362111391987E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.121107470134998995E-05  0.178146126578259990E-01  0.000000000000000000E+00
+ -0.435064987171317002E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193885562197462988E-01 -0.125634365104461995E-02 -0.435073796892796990E-03
+ -0.118148071382402999E-02  0.194636163662551993E-01 -0.139670481067569999E-03
+ -0.166666666666667011E-01  0.172964394190999009E-01 -0.239690924292986984E-07
+ -0.141638358730468997E-03 -0.121292585922250005E-05  0.178147058233791983E-01
+ -0.166666666666667011E-01 -0.435069392178805006E-03  0.194636362774779012E-01
+ -0.118148298166642000E-02 -0.435069391946934006E-03 -0.125641057978546000E-02
+  0.193887086723010006E-01 -0.139670520926417992E-03 -0.166666666666667011E-01
+  0.172964393709954013E-01 -0.239410464318514008E-07 -0.141637761129918003E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.121129767514461000E-05  0.178146131114593001E-01
+  0.000000000000000000E+00 -0.435064997641161015E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193885573025995017E-01 -0.125634454891261995E-02
+ -0.435073786423245982E-03 -0.118148144256120997E-02  0.194636171497057989E-01
+ -0.139670065452413992E-03 -0.166666666666667011E-01  0.172964393188763006E-01
+ -0.239646299428580996E-07 -0.141635960674409009E-03 -0.121270661878923007E-05
+  0.178147055903274988E-01 -0.434952598751025003E-03 -0.166666666666667011E-01
+  0.193531525592223995E-01 -0.122237998047711009E-02 -0.435186141503925985E-03
+ -0.114985437531923995E-02  0.194307215840422015E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173112910791177986E-01 -0.253865277378783987E-04
+ -0.166666666666667011E-01 -0.138259363976029009E-03  0.178692219185750990E-01
+ -0.166666666666667011E-01  0.193634106903348008E-01 -0.124536794862644003E-02
+ -0.118108829657000993E-02  0.194595580860945985E-01 -0.166666666666667011E-01
+ -0.435070677644360025E-03  0.194631053235192995E-01 -0.118099346864504007E-02
+ -0.435068106476127980E-03 -0.125584229082854990E-02  0.193881574850998002E-01
+ -0.139730287993055990E-03 -0.166666666666667011E-01  0.172965129995646012E-01
+ -0.246134321794111988E-07 -0.141982715456250990E-03 -0.124443001707338994E-05
+  0.178147384368366989E-01 -0.141918001868253997E-03  0.178147321831826008E-01
+ -0.123839758836049003E-05 -0.166666666666667011E-01 -0.139719073147306011E-03
+ -0.244906574575426016E-07  0.172965128605519006E-01 -0.166666666666667011E-01
+ -0.434974886832897021E-03  0.193565238210357989E-01 -0.122539181989999007E-02
+ -0.435163868604747015E-03 -0.115249078341532994E-02  0.194335198475173998E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.173108110594776994E-01
+ -0.252486229488234996E-04 -0.166666666666667011E-01 -0.137635746082908988E-03
+  0.178684051236191986E-01 -0.166666666666667011E-01 -0.140103968298790010E-03
+  0.173357792454707990E-01 -0.263227882029069994E-07 -0.143405882709704010E-03
+ -0.122767817853530995E-05  0.178147220760386006E-01 -0.166666666666667011E-01
+  0.193641798191387000E-01 -0.124570775063769998E-02 -0.118110177773261002E-02
+  0.194596602746646002E-01 -0.140117598274002998E-03 -0.166666666666667011E-01
+  0.173359378634117996E-01 -0.264605452288720992E-07 -0.143470065352011001E-03
+ -0.123349701123627008E-05  0.178147283869216008E-01 -0.139699930872859987E-03
+ -0.166666666666667011E-01  0.172964334628613985E-01 -0.947718670749511931E-08
+ -0.141286093437001994E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.427033784487718019E-06
+  0.177496010453740000E-01 -0.166666666666667011E-01 -0.435116790829975981E-03
+  0.188852131512914009E-01 -0.625688270833019968E-03 -0.435021984600298978E-03
+ -0.676914763138863006E-03  0.188324695108036989E-01 -0.166666666666667011E-01
+ -0.139445451109797009E-03  0.174062618540683015E-01 -0.190292153995733009E-07
+ -0.139963261540874002E-03 -0.765758082281971048E-06  0.178142630176260988E-01
+ -0.532898885828498952E-03  0.187657876969587016E-01 -0.512856505158025013E-03
+ -0.166666666666667011E-01 -0.274567397040450026E-03 -0.183874311667027007E-03
+  0.177330722022739987E-01 -0.166666666666667011E-01 -0.142934406988999004E-03
+  0.173279989518831008E-01 -0.109940052362783001E-06 -0.158611614292560006E-03
+ -0.505928282170762005E-05  0.178189142473604983E-01 -0.166666666666667011E-01
+  0.191837857135102006E-01 -0.116715933292911990E-02 -0.166666666666667011E-01
+ -0.115261550950918989E-02  0.193075009282458986E-01 -0.166666666666667011E-01
+  0.192715504188021997E-01 -0.120392392247149001E-02 -0.117898318238484007E-02
+  0.194477863400541011E-01 -0.166666666666667011E-01 -0.435541715949862990E-03
+  0.194766610313947992E-01 -0.120362756621229003E-02 -0.434596353936575995E-03
+ -0.125584159877691992E-02  0.193897157775975992E-01 -0.435053772892238027E-03
+ -0.166666666666667011E-01  0.193916866829808993E-01 -0.125980517520797000E-02
+ -0.435085010459394024E-03 -0.118504095739005010E-02  0.194671524144910991E-01
+ -0.166666666666667011E-01 -0.139670776269950009E-03  0.172964393783541989E-01
+ -0.239722661868781990E-07 -0.141640063065594013E-03 -0.121308200605163008E-05
+  0.178147059781510013E-01 -0.166666666666667011E-01 -0.435069330991408006E-03
+  0.193887306635153987E-01 -0.125643445841789009E-02 -0.435069453134317996E-03
+ -0.118150726982896000E-02  0.194636605867372983E-01 -0.140346027430057995E-03
+  0.178144315994148983E-01 -0.933532497931826955E-06 -0.166666666666667011E-01
+ -0.140345973839123002E-03 -0.933513688386790009E-06  0.178144303098709995E-01
+ -0.166666666666667011E-01 -0.139688634719731995E-03  0.172964363725470992E-01
+ -0.241646447296168007E-07 -0.141743168415957993E-03 -0.122254521803203994E-05
+  0.178147146809073016E-01 -0.435059870297020007E-03 -0.166666666666667011E-01
+  0.193910369733737015E-01 -0.125956508747994997E-02  0.000000000000000000E+00
+ -0.435078913540760980E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118493531740769997E-02  0.194660938610356007E-01
+  0.000000000000000000E+00 -0.435045710174240005E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193875426795061985E-01 -0.125569402526346007E-02
+ -0.435093072169449998E-03 -0.118112078544045998E-02  0.194631751705481998E-01
+ -0.139419879842097997E-03 -0.166666666666667011E-01  0.172964832727322999E-01
+ -0.213161584778305005E-07 -0.140230214929005993E-03 -0.108242332731610002E-05
+  0.178145871202936984E-01 -0.166666666666667011E-01 -0.435070648727890980E-03
+  0.194635173256722006E-01 -0.118139703309657999E-02 -0.435068135392832026E-03
+ -0.125625483745070996E-02  0.193885626932298015E-01 -0.139434374869739004E-03
+ -0.166666666666667011E-01  0.172964444175751006E-01 -0.849107878205794962E-08
+ -0.140068522154645993E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.383811097207312005E-06
+  0.177500579653601005E-01 -0.140208513794423006E-03 -0.166666666666667011E-01
+  0.177497183126398006E-01 -0.504600381181250993E-07 -0.140451251998147000E-03
+ -0.520324482149772044E-06  0.178140160927331009E-01 -0.435048849909143985E-03
+ -0.166666666666667011E-01  0.193915158406403991E-01 -0.125973026996141989E-02
+ -0.435089932877478006E-03 -0.118503856970659008E-02  0.194671300235494012E-01
+ -0.435388199753579007E-03 -0.166666666666667011E-01  0.194580790950915003E-01
+ -0.118263497490555007E-02 -0.434750258397953015E-03 -0.124178531210186001E-02
+  0.193754724111637015E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.173081480358036992E-01 -0.244895773689184990E-04 -0.166666666666667011E-01
+ -0.133922698934493990E-03  0.178635891343980995E-01 -0.166666666666667011E-01
+  0.193849898137051016E-01 -0.125483132565947994E-02 -0.118151827368068998E-02
+  0.194632328928430985E-01 -0.166666666666667011E-01 -0.142602728611135999E-03
+  0.172960572372906005E-01 -0.117224517748023995E-06 -0.158587369045146999E-03
+ -0.567482454000282988E-05  0.178189579824735989E-01 -0.139720580884206998E-03
+ -0.166666666666667011E-01  0.177036794314488002E-01 -0.306504126161977009E-07
+ -0.139932852514181998E-03 -0.499375406900951968E-06  0.178139944442497998E-01
+ -0.546160778925739950E-03  0.184421024561668007E-01 -0.894165394995071063E-04
+ -0.166666666666667011E-01 -0.147442848397543992E-03 -0.240246210218879017E-04
+  0.177137736588962001E-01 -0.166666666666667011E-01 -0.400413710403862977E-03
+  0.186079235147511013E-01 -0.750380380436990973E-03 -0.465646679664696004E-03
+ -0.925379800643594003E-03  0.191586075383834986E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.177029162142695005E-01 -0.193968097316926012E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.245143979550076002E-03
+  0.179475102930650998E-01 -0.166666666666667011E-01 -0.140401212229714989E-03
+  0.173460971401947009E-01 -0.290712390926593986E-07 -0.144639789824670004E-03
+ -0.132223121416671002E-05  0.178148169771682983E-01 -0.166666666666667011E-01
+  0.193485878399233015E-01 -0.123880031430750989E-02 -0.118081271609363995E-02
+  0.194575794108941998E-01 -0.139755721134129007E-03 -0.166666666666667011E-01
+  0.173348680988957005E-01 -0.228988835753830005E-07 -0.141707074016601988E-03
+ -0.107451126571115990E-05  0.178145682418863992E-01 -0.139419950330182008E-03
+ -0.166666666666667011E-01  0.172964916756991986E-01 -0.212265241962166009E-07
+ -0.140190964959546998E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.107696768439272990E-05  0.178142558899423015E-01
+ -0.166666666666667011E-01 -0.435630511714110973E-03  0.193599454053489002E-01
+ -0.109078026057308000E-02 -0.434507224229087005E-03 -0.113826963448308990E-02
+  0.192764822629944987E-01 -0.166666666666667011E-01 -0.139539348987166009E-03
+  0.173126647078671002E-01 -0.217461924902823005E-07 -0.140774574657894001E-03
+ -0.106961747895517997E-05  0.178145678787864983E-01 -0.559739151621855982E-03
+  0.187342113122531996E-01 -0.453812905743477017E-03 -0.166666666666667011E-01
+ -0.144437865847829008E-03 -0.570572508916922003E-04  0.173662393449596011E-01
+ -0.166666666666667011E-01 -0.141554479709531003E-03  0.173902724631262995E-01
+ -0.612976885063492026E-05 -0.543269258690614047E-03 -0.155576910986749004E-04
+  0.183343607371364015E-01 -0.166666666666667011E-01  0.190435223900498989E-01
+ -0.109812314311903995E-02 -0.166666666666667011E-01 -0.111910146500585994E-02
+  0.192002058225646012E-01 -0.166666666666667011E-01  0.191810336309519991E-01
+ -0.116112312176440999E-02 -0.117547931761240003E-02  0.194335300082579994E-01
+ -0.166666666666667011E-01 -0.435325182482829002E-03  0.194712654108410016E-01
+ -0.119439354715173991E-02 -0.434813392979988976E-03 -0.125665050313407000E-02
+  0.193897929840027988E-01 -0.140180749732543006E-03 -0.166666666666667011E-01
+  0.177492505912984003E-01 -0.497281655052260991E-07  0.000000000000000000E+00
+ -0.140418216862212009E-03 -0.517996101551560007E-06  0.178140139712071009E-01
+ -0.166666666666667011E-01 -0.435069320552657990E-03  0.193884456575911003E-01
+ -0.125614320010757996E-02 -0.435069463573064001E-03 -0.118122256466309992E-02
+  0.194633694941631005E-01 -0.166666666666667011E-01 -0.139670893318960006E-03
+  0.172964735659094995E-01 -0.239712911883834992E-07 -0.141640386974580993E-03
+ -0.121295694635543998E-05  0.178147058443772002E-01 -0.141657679701159996E-03
+  0.178147071052390006E-01 -0.121453479340950007E-05 -0.166666666666667011E-01
+ -0.139673890220687005E-03 -0.240034008201301999E-07  0.172964735798710016E-01
+ -0.166666666666667011E-01 -0.435069148260108026E-03  0.193887190916836004E-01
+ -0.125642659983513007E-02 -0.435069635865442011E-03 -0.118150231844085991E-02
+  0.194636542898556017E-01 -0.139671819173706988E-03 -0.166666666666667011E-01
+  0.172964367943914989E-01 -0.239065653681595988E-07  0.000000000000000000E+00
+ -0.141643861658325012E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.120918365724227991E-05  0.178144555196989990E-01
+  0.000000000000000000E+00 -0.435057571499996014E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193887668592458984E-01 -0.125672668568808997E-02
+ -0.435081212181971026E-03 -0.118196881219207990E-02  0.194640540662835010E-01
+ -0.139678894290856009E-03 -0.166666666666667011E-01  0.172963729635307005E-01
+ -0.240483845199339011E-07 -0.141687157028328993E-03 -0.121683863414542000E-05
+  0.178146601266745003E-01 -0.166666666666667011E-01 -0.435063686986125991E-03
+  0.193888428213916005E-01 -0.125667757246277997E-02 -0.435075097036248006E-03
+ -0.118183173664951000E-02  0.194639441484454993E-01 -0.139679133300664996E-03
+ -0.166666666666667011E-01  0.172963909863181990E-01 -0.238879039025341995E-07
+ -0.141683556731170989E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.120749743554915009E-05  0.178141338836764994E-01
+  0.000000000000000000E+00 -0.435044658394895018E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193878886191347008E-01 -0.125607319657829999E-02
+ -0.435094123787185003E-03 -0.118150841007670995E-02  0.194635600909605008E-01
+ -0.139671709913231998E-03 -0.166666666666667011E-01  0.172964345130826001E-01
+ -0.239825539017364991E-07 -0.141645500444457010E-03 -0.121359829368120991E-05
+  0.178147062946855013E-01 -0.434952586391279994E-03 -0.166666666666667011E-01
+  0.193531521576786013E-01 -0.122237980995592004E-02 -0.435186153854376021E-03
+ -0.114985437868289999E-02  0.194307215405399997E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173112910777010014E-01 -0.253865275619669011E-04
+ -0.166666666666667011E-01 -0.138259364869725991E-03  0.178692219193366010E-01
+ -0.166666666666667011E-01  0.193634106886295017E-01 -0.124536794787297005E-02
+ -0.118108829653989990E-02  0.194595580858667010E-01 -0.166666666666667011E-01
+ -0.435067679897617021E-03  0.193880971906136984E-01 -0.125581995532966003E-02
+ -0.435071104218809021E-03 -0.118092995363368010E-02  0.194630638101403984E-01
+ -0.139730294231680001E-03 -0.166666666666667011E-01  0.172965125754464995E-01
+ -0.246135293977140997E-07 -0.141982756437381995E-03 -0.124443577379881005E-05
+  0.178147384420490988E-01 -0.141918111126178997E-03  0.178147321937730009E-01
+ -0.123840965646412996E-05 -0.166666666666667011E-01 -0.139719091240698002E-03
+ -0.244908832673502001E-07  0.172965124365098005E-01 -0.166666666666667011E-01
+ -0.434974852046012982E-03  0.193565226857273995E-01 -0.122539133752111993E-02
+ -0.435163903370489979E-03 -0.115249079254224011E-02  0.194335197239201007E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.173108110555817013E-01
+ -0.252486224725561010E-04 -0.166666666666667011E-01 -0.137635748516305002E-03
+  0.178684051256846992E-01 -0.166666666666667011E-01 -0.140103968384359013E-03
+  0.173357792453362990E-01 -0.263227891208892990E-07 -0.143405883131451987E-03
+ -0.122767822040640990E-05  0.178147220760749986E-01 -0.166666666666667011E-01
+  0.193641798143363991E-01 -0.124570774851647990E-02 -0.118110177764830008E-02
+  0.194596602740228983E-01 -0.140117598304397007E-03 -0.166666666666667011E-01
+  0.173359378633641988E-01 -0.264605455567292014E-07 -0.143470065501732002E-03
+ -0.123349702618485005E-05  0.178147283869347015E-01 -0.139708750660181013E-03
+ -0.166666666666667011E-01  0.172963799080418992E-01 -0.951229877766877080E-08
+ -0.141326952140999990E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.428620398176468988E-06
+  0.177495879055481996E-01 -0.166666666666667011E-01 -0.435139436933837007E-03
+  0.188854634743449989E-01 -0.626119914025915028E-03 -0.434999328204409004E-03
+ -0.676888990402598964E-03  0.188324542389332987E-01 -0.166666666666667011E-01
+ -0.139445446199876991E-03  0.174062644632392988E-01 -0.190291660309238009E-07
+ -0.139963236529218007E-03 -0.765751518464875966E-06  0.178142630110634005E-01
+ -0.532897137108844952E-03  0.187657880254245013E-01 -0.512857774623407003E-03
+ -0.166666666666667011E-01 -0.274572252963939997E-03 -0.183877623767562004E-03
+  0.177330837430943015E-01 -0.166666666666667011E-01 -0.142934406314033008E-03
+  0.173279990759568001E-01 -0.109939898040709003E-06 -0.158611602798506006E-03
+ -0.505927466613655039E-05  0.178189142396055003E-01 -0.166666666666667011E-01
+  0.191837858462441009E-01 -0.116715939594670996E-02 -0.166666666666667011E-01
+ -0.115261553028868000E-02  0.193075010029012985E-01 -0.166666666666667011E-01
+  0.192715504745154011E-01 -0.120392394815805009E-02 -0.117898318409425003E-02
+  0.194477863480717016E-01 -0.166666666666667011E-01 -0.435541725864183982E-03
+  0.194766613131175000E-01 -0.120362800929753994E-02 -0.434596343992153973E-03
+ -0.125584159683252996E-02  0.193897158068569997E-01 -0.435053772712346986E-03
+ -0.166666666666667011E-01  0.193916866769025011E-01 -0.125980517264371998E-02
+ -0.435085010639267014E-03 -0.118504095747396995E-02  0.194671524138403003E-01
+ -0.166666666666667011E-01 -0.139670776270514011E-03  0.172964393783540983E-01
+ -0.239722661929319005E-07 -0.141640063068849004E-03 -0.121308200634946994E-05
+  0.178147059781512997E-01 -0.166666666666667011E-01 -0.435069330990369015E-03
+  0.193887306635063990E-01 -0.125643445843000995E-02 -0.435069453135356987E-03
+ -0.118150726985585992E-02  0.194636605867599989E-01 -0.140346027478982998E-03
+  0.178144315994209004E-01 -0.933532505083072018E-06 -0.166666666666667011E-01
+ -0.140345973895093990E-03 -0.933513695241501967E-06  0.178144303098549985E-01
+ -0.166666666666667011E-01 -0.139688634737237009E-03  0.172964363725436991E-01
+ -0.241646449186064015E-07 -0.141743168517025991E-03 -0.122254522732879002E-05
+  0.178147146809153993E-01 -0.435059870304747008E-03 -0.166666666666667011E-01
+  0.193910369723532990E-01 -0.125956508720873007E-02  0.000000000000000000E+00
+ -0.435078913533033980E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118493531739844002E-02  0.194660938594058003E-01
+  0.000000000000000000E+00 -0.435045710134425011E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193875426850105004E-01 -0.125569403179517989E-02
+ -0.435093072209260005E-03 -0.118112079243948005E-02  0.194631751773721995E-01
+ -0.139419883587548995E-03 -0.166666666666667011E-01  0.172964832718376996E-01
+ -0.213161991804850005E-07 -0.140230234433833000E-03 -0.108242532544913993E-05
+  0.178145871219505987E-01 -0.166666666666667011E-01 -0.435070648695012982E-03
+  0.194635173293556986E-01 -0.118139703587940995E-02 -0.435068135425710023E-03
+ -0.125625484210198000E-02  0.193885626975772996E-01 -0.139434378884568000E-03
+ -0.166666666666667011E-01  0.172964444169705009E-01 -0.849109393347174922E-08
+ -0.140068540591421997E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.383811762342743974E-06
+  0.177500579574621994E-01 -0.140208513821624988E-03 -0.166666666666667011E-01
+  0.177497183043298992E-01 -0.504600292032754029E-07 -0.140451252065581990E-03
+ -0.520324484157912038E-06  0.178140160927338989E-01 -0.435048849877068026E-03
+ -0.166666666666667011E-01  0.193915158396935003E-01 -0.125973026964695009E-02
+ -0.435089932909550007E-03 -0.118503856986224010E-02  0.194671300235720011E-01
+ -0.435388199753534012E-03 -0.166666666666667011E-01  0.194580790950902999E-01
+ -0.118263497490352998E-02 -0.434750258397998009E-03 -0.124178531210191010E-02
+  0.193754724111636009E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.173081480358036992E-01 -0.244895773689180992E-04 -0.166666666666667011E-01
+ -0.133922698934496999E-03  0.178635891343980995E-01 -0.166666666666667011E-01
+  0.193849898137051016E-01 -0.125483132565947994E-02 -0.118151827368068998E-02
+  0.194632328928430985E-01 -0.166666666666667011E-01 -0.142602728611646007E-03
+  0.172960572372683995E-01 -0.117224517813049999E-06 -0.158587369049941992E-03
+ -0.567482454330607033E-05  0.178189579824768012E-01 -0.139720580884228004E-03
+ -0.166666666666667011E-01  0.177036794314212008E-01 -0.306504126126849030E-07
+ -0.139932852514272990E-03 -0.499375406920812032E-06  0.178139944442497998E-01
+ -0.546160778925799039E-03  0.184421024561692987E-01 -0.894165395041286943E-04
+ -0.166666666666667011E-01 -0.147442848396934996E-03 -0.240246210175965007E-04
+  0.177137736588654990E-01 -0.166666666666667011E-01 -0.400413710403472989E-03
+  0.186079235147468998E-01 -0.750380380434690947E-03 -0.465646679664998009E-03
+ -0.925379800643236976E-03  0.191586075383828984E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.177029162142689003E-01 -0.193968097316642006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.245143979549885995E-03
+  0.179475102930649992E-01 -0.166666666666667011E-01 -0.140401212229714989E-03
+  0.173460971401947009E-01 -0.290712390925843005E-07 -0.144639789824673013E-03
+ -0.132223121416698996E-05  0.178148169771682983E-01 -0.166666666666667011E-01
+  0.193485878399233015E-01 -0.123880031430749992E-02 -0.118081271609363995E-02
+  0.194575794108941998E-01 -0.139755721134129007E-03 -0.166666666666667011E-01
+  0.173348680988957005E-01 -0.228988835753840990E-07 -0.141707074016602990E-03
+ -0.107451126571121009E-05  0.178145682418863992E-01 -0.139419954077844995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172964916755905009E-01
+ -0.212265636585413010E-07 -0.140190986545845990E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.107696962013746004E-05  0.178142558877735016E-01
+ -0.166666666666667011E-01 -0.435069846225575999E-03  0.194634066063343984E-01
+ -0.118126932115139994E-02 -0.435068937899507017E-03 -0.125616762178684996E-02
+  0.193884733114783013E-01 -0.166666666666667011E-01 -0.139670406363162002E-03
+  0.172964705688706984E-01 -0.239661812368390999E-07 -0.141637605266820997E-03
+ -0.121271180256684004E-05  0.178147053610641995E-01 -0.142072630916558992E-03
+  0.178147467538722001E-01 -0.125303449493682997E-05 -0.166666666666667011E-01
+ -0.139745783560114997E-03 -0.247866380946762004E-07  0.172964714736013984E-01
+ -0.166666666666667011E-01 -0.434919264262457973E-03  0.193344334821136006E-01
+ -0.120384249982520997E-02 -0.435219446910729000E-03 -0.113217480938375002E-02
+  0.194124575348414985E-01 -0.166666666666667011E-01  0.173153477755207987E-01
+ -0.270609262604330985E-04 -0.166666666666667011E-01 -0.148526207309566006E-03
+  0.178819709504901003E-01 -0.166666666666667011E-01  0.173381030673696000E-01
+ -0.222893784193113984E-07 -0.104900286329099006E-05  0.178148086104071984E-01
+ -0.166666666666667011E-01 -0.435070197675724009E-03  0.194629909526723016E-01
+ -0.118086398896642989E-02 -0.435068586447951984E-03 -0.125574648914054000E-02
+  0.193880551117544993E-01 -0.435351553302722974E-03 -0.166666666666667011E-01
+  0.194221659534600001E-01 -0.114701564680454995E-02 -0.434786972534015011E-03
+ -0.120710615390273008E-02  0.193420430171151007E-01 -0.166666666666667011E-01
+ -0.139670121825768003E-03  0.172964394653271000E-01 -0.239652296206953015E-07
+ -0.141636284668589994E-03 -0.121273580815533010E-05  0.178147056312705013E-01
+ -0.166666666666667011E-01 -0.435070564583170979E-03  0.194632895210796984E-01
+ -0.118117290840524997E-02 -0.435068219538200975E-03 -0.125602935372259991E-02
+  0.193883427513572015E-01 -0.140356103029070988E-03  0.178144326340144016E-01
+ -0.934844351950365017E-06 -0.166666666666667011E-01 -0.140500373954539995E-03
+ -0.935645690836052991E-06  0.178144577216279992E-01 -0.166666666666667011E-01
+ -0.139419788136554014E-03  0.172964863338696991E-01 -0.213147023989074007E-07
+ -0.140223522691785012E-03 -0.108230807367247995E-05  0.178145859435818003E-01
+ -0.435214369892329993E-03 -0.166666666666667011E-01  0.194296915933112983E-01
+ -0.115819918280740002E-02  0.000000000000000000E+00 -0.434924344707075003E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.121631545945458997E-02  0.193618156735865000E-01  0.000000000000000000E+00
+ -0.435418791966529978E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.194737697407992016E-01 -0.119862135208015992E-02 -0.434719602197164011E-03
+ -0.125646773106410990E-02  0.193899154748590000E-01 -0.139674312472573998E-03
+ -0.166666666666667011E-01  0.172964142795966995E-01 -0.239964322444068015E-07
+ -0.141660280806240994E-03 -0.121419135139913995E-05  0.178146576430187990E-01
+ -0.166666666666667011E-01 -0.435064741241858997E-03  0.193888829829566983E-01
+ -0.125669444198150005E-02 -0.435074042815188978E-03 -0.118183195684604006E-02
+  0.194639537546182004E-01 -0.139703753692154993E-03 -0.166666666666667011E-01
+  0.172963763508955995E-01 -0.954521092502657923E-08 -0.141306390167940008E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.430379165314042026E-06  0.177499308188320999E-01
+ -0.139758462452603012E-03 -0.166666666666667011E-01  0.177495927166552991E-01
+ -0.464544378539779993E-07 -0.139910115331338996E-03 -0.481441843847050964E-06
+  0.178139831899020995E-01 -0.435407966296089978E-03 -0.166666666666667011E-01
+  0.194236847721564986E-01 -0.114950911636884008E-02 -0.434730445651113973E-03
+ -0.120708640519090991E-02  0.193421924466846001E-01 -0.416148250027581024E-03
+ -0.166666666666667011E-01  0.185549551170714998E-01 -0.503294505756778955E-03
+ -0.452623601339539015E-03 -0.518655138222936054E-03  0.187784353169859995E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.182536262402052986E-01
+ -0.562462921602506017E-03 -0.166666666666667011E-01 -0.776619709920083967E-03
+  0.186715804513362012E-01 -0.166666666666667011E-01  0.173100319756059016E-01
+ -0.233382192352905001E-07 -0.116274420326912994E-05  0.178149543965509007E-01
+ -0.166666666666667011E-01 -0.143574443571915002E-03  0.173387404346211985E-01
+ -0.394234709446026970E-04 -0.556824893407507947E-03 -0.414251929427215991E-03
+  0.187007563880755012E-01 -0.143222946652751988E-03 -0.166666666666667011E-01
+  0.174281634389273007E-01 -0.707336094321511021E-07 -0.153074744284785989E-03
+ -0.259053351753128992E-05  0.178159320930474999E-01 -0.143343235945712010E-03
+  0.178139865856733991E-01 -0.995973199358266933E-06 -0.166666666666667011E-01
+ -0.140432036984107007E-03 -0.263668001789773996E-07  0.174281413140941997E-01
+ -0.166666666666667011E-01 -0.434113176607574020E-03  0.193474145478290012E-01
+ -0.123277679909749010E-02 -0.436022648061583991E-03 -0.117159827098749008E-02
+  0.194498157758568983E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172976663726301001E-01 -0.749716013957179010E-08 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.328919335683087000E-06  0.177355871461097984E-01
+ -0.166666666666667011E-01 -0.140992307333851993E-03  0.177352147833274992E-01
+ -0.465109349848994022E-07 -0.141543817822521994E-03 -0.593601105003849030E-06
+  0.178140918400187011E-01 -0.166666666666667011E-01  0.193681748787879991E-01
+ -0.124747135029532009E-02 -0.118117041740436004E-02  0.194601894329670989E-01
+ -0.402827943802412983E-03 -0.166666666666667011E-01  0.186951349558098986E-01
+ -0.851875272769232967E-03 -0.463811391972161985E-03 -0.104547015467971994E-02
+  0.192724136708169007E-01 -0.139676914109847992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172965234506337016E-01 -0.258479543060504998E-07
+ -0.166751637452680000E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.152816050945488003E-05  0.178205642139422984E-01 -0.166666666666667011E-01
+ -0.435069391172627010E-03  0.000000000000000000E+00  0.193884418587671990E-01
+ -0.125613776993248002E-02 -0.435069392953110973E-03 -0.118121619381530009E-02
+  0.194633634747354983E-01 -0.166666666666667011E-01 -0.139670761673773989E-03
+  0.172964743993378987E-01 -0.239698260758715003E-07 -0.141639618752670998E-03
+ -0.121288305970079009E-05  0.178147057814278011E-01 -0.141639617499855004E-03
+  0.178147057813345007E-01 -0.121288294541192010E-05 -0.166666666666667011E-01
+ -0.139670761456658004E-03 -0.239698237501548993E-07  0.172964743993366983E-01
+ -0.166666666666667011E-01 -0.435069391184759991E-03  0.193887118629383001E-01
+ -0.125641385905375007E-02 -0.435069392940977992E-03 -0.118148619757772006E-02
+  0.194636395637332003E-01 -0.166666666666667011E-01  0.172965077542778997E-01
+ -0.927568134856068057E-07 -0.166666666666667011E-01 -0.102046181376419003E-05
+  0.173051339147867016E-01 -0.166666666666667011E-01  0.173041376239571991E-01
+ -0.235410465543548011E-07 -0.117464369127414998E-05  0.178146677204709010E-01
+ -0.166666666666667011E-01 -0.435069390787829998E-03  0.193887086342375004E-01
+ -0.125641056657932004E-02 -0.435069393337907985E-03 -0.118148298373717004E-02
+  0.194636362741715009E-01 -0.435064997112617009E-03 -0.166666666666667011E-01
+  0.193885568522766984E-01 -0.125634409504329003E-02  0.000000000000000000E+00
+ -0.435073786951775026E-03 -0.118148100464545011E-02  0.194636167070038994E-01
+ -0.166666666666667011E-01 -0.139670744016726994E-03  0.000000000000000000E+00
+  0.172964393844361013E-01 -0.239719196971529010E-07 -0.141639876847867010E-03
+ -0.121306495872900994E-05  0.178147059624979009E-01 -0.166666666666667011E-01
+ -0.435069390950542994E-03  0.193887087721348016E-01 -0.125641070333806998E-02
+ -0.435069393175194989E-03 -0.118148311476701999E-02  0.194636364105120994E-01
+ -0.140348088180879989E-03  0.178144318602812997E-01 -0.933801415168445956E-06
+ -0.166666666666667011E-01 -0.140348088084880012E-03 -0.933801076426891949E-06
+  0.178144318368827007E-01 -0.166666666666667011E-01 -0.139670745303743007E-03
+  0.172964393843559987E-01 -0.239719335227778987E-07 -0.141639884277225002E-03
+ -0.121306563867630006E-05  0.178147059631548997E-01 -0.435069390608568011E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.193887086845648017E-01
+ -0.125641066869512999E-02 -0.435069393517170026E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118148310483521006E-02  0.194636363135432984E-01
+ -0.435069389993829011E-03 -0.166666666666667011E-01  0.193887084930986983E-01
+ -0.125641043857266008E-02 -0.435069394131910002E-03 -0.118148287005216007E-02
+  0.194636361542637991E-01 -0.139670502006070010E-03 -0.166666666666667011E-01
+  0.172964394138492988E-01 -0.239693181051815999E-07 -0.141638479654202995E-03
+ -0.121293697133386993E-05  0.178147058357341001E-01 -0.166666666666667011E-01
+ -0.435069390724191016E-03  0.193887813829540998E-01 -0.125648495565761011E-02
+ -0.435069393401547020E-03 -0.118155573441382005E-02  0.194637106641438011E-01
+ -0.139699943939322992E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964151706326008E-01 -0.947752965014964063E-08 -0.141286304262072990E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.427064219430228011E-06
+  0.177495997096570012E-01 -0.140178146812241995E-03 -0.166666666666667011E-01
+  0.177492252668765005E-01 -0.496801880864810970E-07 -0.140415043122611998E-03
+ -0.517779586424547950E-06  0.178140139541973011E-01 -0.435069391018619994E-03
+ -0.166666666666667011E-01  0.193887087741046009E-01 -0.125641070402467006E-02
+ -0.435069393107117989E-03 -0.118148311448062994E-02  0.194636364104064999E-01
+ -0.139670745278655002E-03 -0.166666666666667011E-01  0.172964393844560992E-01
+ -0.239719332530519000E-07 -0.141639884131573011E-03 -0.121306562524490010E-05
+  0.178147059631618004E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.193887976612396006E-01 -0.125650164849242007E-02  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118157208094086998E-02  0.194637273068684995E-01
+ -0.166666666666667011E-01  0.172964393844281007E-01 -0.239719133479159011E-07
+ -0.121306464654854992E-05  0.178147059621872987E-01 -0.166666666666667011E-01
+ -0.139670744021023010E-03  0.172964393843868004E-01 -0.239719197464163008E-07
+ -0.141639876873170012E-03 -0.121306496126475003E-05  0.178147059625002983E-01
+ -0.435069390942508026E-03 -0.166666666666667011E-01  0.193887087711050003E-01
+ -0.125641070244318989E-02 -0.435069393183230011E-03 -0.118148311400550999E-02
+  0.194636364097085998E-01 -0.140348087386128002E-03  0.178144318601935990E-01
+ -0.933801313750166004E-06 -0.166666666666667011E-01 -0.140348087289432998E-03
+ -0.933800972552974967E-06  0.178144318366253995E-01 -0.166666666666667011E-01
+ -0.435069391191810016E-03  0.193887815133764012E-01 -0.125648507876439009E-02
+ -0.435069392933928997E-03 -0.118155584777763990E-02  0.194637107833691009E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172964151566877000E-01
+ -0.947849244181166918E-08 -0.166666666666667011E-01 -0.427106377285012985E-06
+  0.177495993062723005E-01 -0.166666666666667011E-01 -0.140178145569364992E-03
+  0.177492248215362983E-01 -0.496797136596528977E-07 -0.140415043288561988E-03
+ -0.517779487315898039E-06  0.178140139541107002E-01 -0.166666666666667011E-01
+  0.193887977549708004E-01 -0.125650168627734998E-02 -0.118157208932236990E-02
+  0.194637273908664990E-01 -0.435069390319006992E-03 -0.166666666666667011E-01
+  0.193887087496051012E-01 -0.125641069305206009E-02 -0.435069393806730991E-03
+ -0.118148311376660995E-02  0.194636364069746999E-01 -0.139670541873138993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172964393705303011E-01
+ -0.239412684749743987E-07 -0.141637881918781994E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.121130856871430998E-05
+  0.178146131135742992E-01
+  0.186264861206935986E-17  0.188469719401502014E-16  0.000000000000000000E+00
+  0.552421818745575958E-18  0.917870006576351024E-18  0.914518973837806942E-17
+ -0.462479989200847985E-18 -0.203714864886006990E-16 -0.195088628411609992E-16
+  0.361312491563162037E-20  0.221189708273807991E-17 -0.223400286947376007E-18
+ -0.740148597608108019E-18  0.715802397433987990E-18  0.210593204619582009E-19
+ -0.247092745603879988E-17  0.234522247271034021E-18  0.359989002583078020E-20
+  0.130125436521942010E-18  0.523865972357081977E-17  0.000000000000000000E+00
+  0.502798757490152989E-17 -0.774865740148233950E-17  0.305533254404098983E-16
+  0.000000000000000000E+00  0.162630325872825993E-17 -0.590890184004599994E-17
+  0.202691596146165005E-15 -0.691124674850884999E-15 -0.677626357803439970E-20
+ -0.638832248819193008E-17 -0.354921197842210034E-17  0.180212876973253014E-17
+  0.592118381778119037E-17  0.987885261959830012E-19 -0.448662764248762018E-18
+  0.394717353420504035E-17  0.000000000000000000E+00 -0.120292707493299994E-16
+  0.852284551527277001E-17  0.000000000000000000E+00  0.303397988982417995E-17
+  0.101048383629478005E-18  0.740148804403260964E-18 -0.135433289076572003E-16
+  0.223533152833267012E-19 -0.447899111107253025E-17  0.212817027997642990E-19
+  0.000000000000000000E+00 -0.403081803774639981E-18 -0.672555409776245036E-17
+  0.000000000000000000E+00  0.491279109407494004E-18 -0.205371608391277999E-16
+  0.173167415736669007E-16  0.462479989200847985E-18 -0.704731412115577978E-18
+  0.000000000000000000E+00 -0.474338450462407997E-18 -0.612574227454310005E-17
+ -0.184314369322535985E-17  0.765717784317887969E-17  0.648342840630083007E-17
+ -0.304990590835039002E-18 -0.592118946587380969E-17  0.339088701131534005E-19
+ -0.112557736932424994E-16 -0.381335904017217022E-18  0.000000000000000000E+00
+ -0.669901779474492986E-17  0.138243891116723001E-17 -0.462599103209056006E-18
+  0.620975527371875958E-25  0.182632162877623988E-17  0.000000000000000000E+00
+  0.430176265969362029E-19 -0.395986452044138980E-18 -0.425221003356408028E-17
+  0.763810107027342937E-17  0.449986253228846961E-21  0.149920187933444999E-16
+  0.933333695763632040E-17 -0.449986253228846961E-21 -0.102263638350942002E-18
+  0.716941713442839936E-17 -0.185604230015463005E-18  0.148029860142325988E-16
+  0.765722441914230953E-17  0.000000000000000000E+00  0.395087089926695984E-18
+ -0.112484787433542003E-16  0.148029860142325988E-16  0.186249855764197994E-18
+ -0.123980477187407996E-17 -0.177475529531162003E-18  0.000000000000000000E+00
+  0.468189851533179025E-17  0.846067343136115030E-19  0.251873187799946985E-17
+  0.000000000000000000E+00  0.427222242771388022E-18 -0.126919416816583993E-16
+  0.000000000000000000E+00 -0.639166659846272950E-25 -0.577339967784366969E-17
+  0.000000000000000000E+00 -0.299518791083001013E-18  0.885930391776569904E-18
+ -0.415350546620027017E-18  0.777912411780388954E-17  0.148029595444530002E-16
+ -0.870855748895828018E-19  0.370461440010243996E-17  0.000000000000000000E+00
+  0.925637604759498948E-17 -0.157717534778751000E-16  0.313749473992465019E-16
+  0.575982404132923990E-19  0.219550939928314995E-17 -0.590890184004599994E-17
+  0.200929767615876011E-15 -0.698361724352226037E-15 -0.762329652528870004E-20
+  0.762499059118321011E-17  0.420595399031252993E-17 -0.953539688950861913E-18
+  0.740148701005684010E-18  0.731844738233841023E-17  0.151625515003419992E-18
+  0.117147568281026001E-16  0.000000000000000000E+00  0.912148605074475063E-18
+  0.159581007262709991E-17  0.449986253228846961E-21 -0.130862557845303000E-24
+  0.171921983655087999E-17  0.000000000000000000E+00  0.158448927876369990E-18
+  0.945956303890129087E-18  0.984593083121869034E-18 -0.108044346378205993E-16
+ -0.148029595444530002E-16  0.326160624252108976E-18 -0.372472316079360021E-17
+  0.148029595444530002E-16 -0.106167109608854004E-16  0.550232602536393962E-17
+  0.123056946577105000E-16  0.677626357803439970E-20  0.603087458445062034E-18
+ -0.292734586571086005E-17  0.670850094225405965E-18 -0.124971241037898994E-16
+  0.000000000000000000E+00  0.128342432167972003E-16  0.179743617411750996E-16
+ -0.477944389246467971E-18  0.592118946587380969E-17 -0.141023146889066998E-19
+ -0.776697175539328012E-17 -0.238961064843469015E-19  0.000000000000000000E+00
+  0.212532690638741003E-16 -0.147509485406731005E-17  0.000000000000000000E+00
+  0.635022970263750003E-17 -0.216767304995182992E-18  0.592118946264263003E-17
+  0.350644635857248989E-19 -0.606692755904139014E-18 -0.229492316153697017E-18
+  0.119207909163859998E-16 -0.462599103209056006E-18  0.119335121574955004E-17
+ -0.283978943926022000E-18  0.000000000000000000E+00  0.526661030130147969E-18
+  0.834631058858251966E-18  0.572340905439509016E-18 -0.148029860142325988E-16
+ -0.952426877165385946E-17  0.592118381778119037E-17 -0.152714247567623012E-19
+ -0.136949449939554005E-17  0.000000000000000000E+00 -0.453732450462848006E-18
+ -0.649938968340065021E-17 -0.212605269760829000E-18  0.148061359180052007E-17
+  0.173938215718671006E-16 -0.149755425074559992E-16 -0.176182853028893989E-18
+  0.000000000000000000E+00  0.314858028362183973E-19  0.222314384918731995E-17
+ -0.224993126614423998E-21  0.577572400366616003E-25  0.196242353951357996E-16
+  0.592118381778119037E-17  0.846496107036753039E-18 -0.256018086451873006E-19
+  0.809438002004071989E-18 -0.177860077526130999E-16  0.000000000000000000E+00
+ -0.222683638344174993E-18  0.123060118814754002E-16  0.148029595444530002E-16
+  0.169406589450859996E-18 -0.813151629364127964E-19 -0.921571846612678961E-18
+  0.000000000000000000E+00  0.831362837730095975E-18  0.000000000000000000E+00
+ -0.542101086242751976E-19 -0.159835117146885992E-17  0.000000000000000000E+00
+  0.726754268744189980E-18 -0.753520509877426044E-17  0.777915058758348969E-17
+  0.000000000000000000E+00 -0.513301966036106005E-18 -0.420128341838133016E-18
+  0.608169656128588029E-17  0.578099986501059981E-19 -0.201217970576179000E-17
+ -0.369654608040760013E-19 -0.148029595444530002E-16  0.144305243943295996E-16
+ -0.705346343352828992E-19  0.000000000000000000E+00 -0.147612593664461005E-18
+  0.183565822566964011E-16  0.278007338868100017E-18 -0.662135360669951994E-17
+  0.462585868319254965E-18  0.382512344841314970E-17 -0.174270411452671991E-18
+  0.000000000000000000E+00 -0.413352078260099028E-17  0.111698234754425002E-16
+  0.240451477901814990E-18  0.000000000000000000E+00 -0.117367664498414002E-16
+  0.000000000000000000E+00 -0.428175154837049014E-18 -0.174319380544935010E-17
+  0.575982404132923990E-19 -0.271728169479180011E-17  0.735145188877927989E-18
+  0.530165449030040969E-17  0.000000000000000000E+00 -0.422661168873770002E-18
+  0.147489611940655008E-17  0.542185789537478032E-17  0.000000000000000000E+00
+  0.160646151193882004E-16 -0.132814766129474004E-17  0.148027477862161989E-16
+  0.314846080772212007E-17 -0.793361300897053979E-19  0.000000000000000000E+00
+ -0.798196203889169946E-19 -0.312772209729571009E-17  0.473491417515153986E-18
+  0.994130806456849964E-17 -0.462585868319254965E-18  0.140167012111641990E-16
+  0.389635155736977975E-19 -0.575982404132923990E-19  0.106920968931909993E-16
+ -0.542101086242751976E-19 -0.921571846612678961E-18 -0.460785923306338999E-18
+  0.693889390390722992E-17  0.590890184004599994E-17 -0.176182853028893989E-18
+ -0.708119543904594929E-17  0.000000000000000000E+00 -0.790175860669855044E-18
+ -0.677626357803440031E-19 -0.899887803162968988E-17  0.000000000000000000E+00
+  0.282909004382935976E-18 -0.319839640883224001E-17 -0.189735380184963011E-18
+ -0.921571846612678961E-18  0.940111280245706959E-17 -0.136473551414918999E-17
+ -0.462585868319254965E-18  0.947011739499720023E-25 -0.774842994750098042E-17
+ -0.592118381778119037E-17  0.380929522460731986E-18 -0.412849204146454994E-18
+ -0.790655128793651072E-17  0.574799928767763978E-17  0.449986253228846961E-21
+ -0.184669492435164995E-16 -0.668551742520389010E-17  0.000000000000000000E+00
+  0.757247454845344985E-18 -0.758433300971500986E-17 -0.956088439213291989E-19
+ -0.739967982721356930E-17  0.674366604445490992E-17  0.000000000000000000E+00
+ -0.129066645337873990E-18  0.567512074660380996E-17  0.740306795900258034E-17
+  0.289685267960971005E-18 -0.853809210832335031E-18 -0.346944695195360995E-17
+  0.000000000000000000E+00 -0.440457132572235996E-19  0.140946282423116000E-17
+  0.126038502551440006E-17 -0.460785923306338999E-18 -0.334096264176697982E-17
+ -0.121481465295212001E-16 -0.179994501291539010E-20  0.348945878064791996E-18
+  0.402866943610530006E-18  0.000000000000000000E+00  0.102462200732453003E-16
+  0.324395420824920984E-19 -0.824904211507282051E-17  0.260886147754324986E-18
+ -0.169406589450859993E-20 -0.300220240242445980E-18 -0.155398517387845997E-16
+ -0.462612338098855988E-18 -0.113502414932076005E-18  0.853279815240300958E-17
+  0.118584612615602002E-19  0.000000000000000000E+00 -0.420128341838133016E-18
+ -0.590890184004599994E-17 -0.325260651745650993E-18 -0.414707330975704984E-17
+ -0.739967982721356930E-17 -0.609863722023095955E-18 -0.775204553327135978E-17
+ -0.110046520507278993E-16 -0.596311194867026976E-17  0.506525702458071988E-18
+  0.406575814682063982E-19 -0.585807986321074037E-17  0.000000000000000000E+00
+ -0.957507219399941999E-17 -0.993410828451683924E-18  0.000000000000000000E+00
+ -0.258493941422821001E-23 -0.140905965640059988E-16  0.000000000000000000E+00
+ -0.455999856282107988E-19  0.130766005188302993E-17  0.851098705401120981E-17
+  0.412593057330019984E-17  0.913207396258542929E-21 -0.199420672541224012E-17
+  0.107126374421619007E-16  0.169406589450859993E-20 -0.169813165265542009E-16
+  0.111740586401786993E-16 -0.220546203641338017E-18 -0.580217568869195973E-19
+  0.745639133001020026E-17  0.000000000000000000E+00  0.633051248954183005E-18
+ -0.106607566741425994E-16  0.580217568869195973E-19 -0.175505226671090992E-17
+ -0.127986678330124993E-17 -0.686052019522905026E-18  0.000000000000000000E+00
+  0.469395219121791008E-18 -0.219520603079348977E-18  0.373882580183111030E-17
+  0.462599103209056006E-18  0.597412337698458029E-17 -0.221075599233372003E-18
+  0.000000000000000000E+00  0.234092940533119000E-17 -0.456210270350426032E-18
+  0.000000000000000000E+00  0.247588805817228019E-18  0.115207598134018006E-16
+  0.293496916223615019E-18 -0.516984574123192008E-17  0.462585868319254965E-18
+ -0.979837125471933953E-17  0.542101086242752024E-18  0.000000000000000000E+00
+ -0.111998538839297005E-16 -0.886573158126109934E-17  0.107314690359872997E-18
+  0.000000000000000000E+00  0.132318325413044006E-16  0.000000000000000000E+00
+ -0.490891988880818995E-18  0.300838917864701994E-17 -0.449986253228846961E-21
+  0.153224066374812011E-16  0.752300914782277012E-18 -0.202894665677824002E-16
+ -0.592118381778119037E-17 -0.607071160275117030E-18  0.706332833781481025E-18
+  0.183276753962148989E-16  0.740306795900258034E-17 -0.673708830422389036E-18
+  0.996110745971057006E-18  0.000000000000000000E+00  0.562224322594635998E-24
+ -0.143283150371639000E-17 -0.592118381778119037E-17  0.695175821679365018E-18
+ -0.377743814046069017E-18  0.851295905259153931E-17 -0.617021150427395005E-17
+  0.000000000000000000E+00 -0.699522159489963935E-17 -0.230593470233653008E-17
+  0.899972506457693922E-21  0.136092783635348004E-16 -0.203287907341032009E-18
+ -0.235813972515596983E-17  0.000000000000000000E+00  0.151788304147971006E-17
+  0.000000000000000000E+00  0.718283939271646976E-18  0.339279047022710001E-17
+  0.000000000000000000E+00  0.740095037663445042E-19 -0.151788304147971006E-17
+  0.498732999343331963E-17 -0.592076030130756014E-17  0.124344436656931001E-17
+  0.311708124589582987E-17 -0.189735380184963011E-18 -0.460785923306338999E-18
+  0.145657373796625989E-16  0.661871544984509977E-17 -0.179994501291539010E-20
+  0.232356033872243014E-17  0.178769240009195011E-19  0.740148597608108019E-18
+ -0.485518623621675008E-18  0.132951964373823990E-18 -0.604535355389274969E-17
+ -0.332301613119702991E-18 -0.169406589450859993E-20  0.379629579047537022E-18
+ -0.785816701559762074E-17  0.000000000000000000E+00 -0.108081404069648993E-17
+ -0.867086452280546054E-17 -0.128749007982653989E-18  0.000000000000000000E+00
+  0.406575814682063982E-19 -0.596311194867026976E-17 -0.157209315010398008E-17
+  0.542101086242752024E-18  0.739967982721356930E-17  0.348977574268771980E-18
+ -0.242590236093632002E-17  0.357786716920216001E-17  0.000000000000000000E+00
+  0.508219768352580023E-19 -0.327971157176865014E-17  0.108420217248549998E-18
+  0.460785923306338999E-18  0.269695290405768997E-17  0.105729828849834003E-16
+  0.000000000000000000E+00  0.605520075134427981E-17  0.295792342423450008E-18
+  0.740148804403260964E-18  0.887040033888795054E-17 -0.196630757771205002E-18
+ -0.695752862874681990E-17 -0.545383338913363016E-18  0.000000000000000000E+00
+  0.539586457180591013E-18 -0.226178972740580010E-17  0.000000000000000000E+00
+  0.101643953670515996E-19 -0.219550939928314995E-17  0.157209315010398008E-17
+ -0.740306795900258034E-17  0.156701095242045996E-18  0.000000000000000000E+00
+  0.182959116606928994E-18  0.556924162819702011E-17  0.000000000000000000E+00
+  0.492126142354748016E-18  0.129542647455343004E-16 -0.267003768769613992E-18
+  0.000000000000000000E+00  0.511303497676177987E-18 -0.249632582995209015E-17
+ -0.104854969288279003E-17  0.219719850209398022E-24 -0.877420254237048056E-17
+ -0.186868703054100004E-17  0.000000000000000000E+00  0.310192729707385011E-23
+ -0.646582600286570010E-17  0.000000000000000000E+00 -0.831029897533542997E-18
+  0.106143816202805005E-18 -0.664497347120999001E-18 -0.396502740054638028E-17
+  0.913207396258542929E-21  0.321382829034003006E-17  0.242124367972641983E-17
+ -0.190582413132217990E-20 -0.144015989742860999E-18 -0.112475097000963994E-16
+  0.329502123734093982E-18  0.148029595444530002E-16 -0.150205510589462992E-16
+  0.000000000000000000E+00  0.901573514533289918E-18  0.229584989725403987E-17
+  0.000000000000000000E+00 -0.629603722527569038E-18  0.415692006776889032E-17
+ -0.259039880627106002E-18  0.000000000000000000E+00  0.526221725872965983E-18
+  0.153314307621524007E-16 -0.121059189334008992E-17  0.219719850209398022E-24
+  0.518485626953725989E-17  0.124986814033672995E-17 -0.148029595444530002E-16
+  0.200429726146822988E-17  0.922342050742921017E-19  0.000000000000000000E+00
+  0.356920501310487004E-18  0.132035399740042995E-16 -0.379419114896017021E-18
+  0.180196383238973012E-16  0.000000000000000000E+00  0.117761188624817996E-16
+  0.427189116772794014E-18  0.000000000000000000E+00 -0.945056279636684015E-18
+ -0.306948887770498012E-17 -0.627647260344651004E-18 -0.148029860142325988E-16
+ -0.211552721983768995E-16  0.000000000000000000E+00 -0.363690567209131027E-19
+  0.235090141659442018E-17  0.000000000000000000E+00 -0.154382013937734992E-18
+  0.104986094217475005E-17 -0.202012618175241010E-16  0.000000000000000000E+00
+ -0.201818834573138001E-18 -0.277787102030008014E-18  0.922418879559933069E-18
+ -0.740306795900258034E-17 -0.161027316020146994E-16  0.159919820441612000E-17
+  0.000000000000000000E+00  0.155950028687531998E-16 -0.217486462555504989E-20
+  0.592118919445516966E-17 -0.163450889040478004E-18  0.138365478911949995E-17
+ -0.688214269644119025E-20  0.137618501731589997E-16  0.462638807878457974E-18
+ -0.300866102864727018E-17 -0.425819344452497991E-18  0.000000000000000000E+00
+ -0.534138976538562025E-17 -0.230392961653169981E-18 -0.693889390390722992E-17
+  0.000000000000000000E+00 -0.124683249835832991E-17 -0.593600689435813986E-17
+  0.203287907341032009E-18 -0.146240238343454992E-17 -0.462585868319254965E-18
+  0.804681299891585021E-20 -0.921571846612678961E-18  0.758941520739853008E-18
+ -0.596311194867026976E-17  0.103338019565025002E-18 -0.121972744404619008E-18
+ -0.641119237776780004E-17  0.578099986501059981E-19  0.904419429430779033E-17
+ -0.317770699320046013E-17  0.462599103209056006E-18  0.723480113396294003E-25
+  0.177058483496122985E-16 -0.592123675734038991E-17  0.353544910182120027E-20
+ -0.227547188927349024E-18 -0.648947871549377031E-17  0.181164310453569009E-17
+ -0.449986253228846961E-21 -0.153640401864531989E-16 -0.669340050167796020E-17
+ -0.449986253228846961E-21  0.501443504774546032E-17 -0.146028480106641009E-16
+ -0.454644934438745996E-18  0.580217568869195973E-19  0.102203789509092007E-17
+  0.000000000000000000E+00 -0.394823232538910986E-18  0.152398167869993994E-16
+  0.000000000000000000E+00  0.485858098545066983E-17  0.339523416855172998E-18
+ -0.196963000034115994E-18  0.000000000000000000E+00  0.163869376255981004E-16
+ -0.375984607678321967E-20  0.602139840335321016E-17  0.000000000000000000E+00
+  0.374388562686400991E-18 -0.187956610995729014E-17  0.000000000000000000E+00
+  0.117614743347384008E-23 -0.156049686373756991E-16  0.592118381778119037E-17
+ -0.111927793918019994E-17 -0.125932623433033007E-17  0.107700239243383999E-17
+ -0.680955925205089026E-17  0.000000000000000000E+00 -0.474338450462408009E-19
+ -0.198459819541683001E-16  0.148031713026898014E-16 -0.589365524699542022E-17
+ -0.100559751498030998E-16  0.119262238973404995E-16  0.000000000000000000E+00
+ -0.406575814682063982E-19 -0.298155597433513989E-17  0.291379333855478980E-18
+ -0.143173979074393986E-16 -0.580217568869195973E-19  0.793500464987829020E-17
+  0.509551198266546019E-17 -0.207274917893538006E-19  0.592118946587380969E-17
+ -0.249908860027332019E-18  0.726475167665756951E-17  0.143720452004045010E-18
+  0.219719850209398022E-24 -0.713783873983173068E-17  0.273122128532314001E-17
+ -0.148029860142325988E-16  0.616308195352442029E-17 -0.103685693916237999E-18
+  0.000000000000000000E+00  0.172732113104808003E-18 -0.119086404205086997E-17
+  0.148812751953918994E-18 -0.973629116658004042E-17  0.462585868319254965E-18
+ -0.128415437610183997E-16  0.364164300122516006E-18  0.581611368201348043E-25
+  0.965757993893369943E-18 -0.318378260621606992E-17  0.523869464800819009E-18
+  0.000000000000000000E+00 -0.125934272098304005E-16 -0.592118381778119037E-17
+  0.170794585972679995E-18 -0.638088177192808014E-18  0.148029860142325988E-16
+  0.636134305731703948E-19 -0.115664869328127008E-16  0.103770143886900998E-17
+ -0.740148701005684010E-18  0.109459983278530000E-16  0.860412658285831038E-19
+  0.453890545720096961E-18 -0.462479989200847985E-18  0.574764794271245981E-18
+ -0.417079023228017025E-17 -0.359989002583078020E-20 -0.123844851232456994E-24
+ -0.997743062832839024E-17 -0.592118381778119037E-17 -0.350289482720280986E-18
+  0.671115955244159048E-18 -0.462718217217262968E-18  0.366619682373303022E-17
+  0.148029595444530002E-16 -0.269938812378104984E-18 -0.623165778889683993E-17
+ -0.148029595444530002E-16 -0.428259858131774020E-17  0.948338087745914968E-17
+  0.300036010576418020E-16  0.000000000000000000E+00  0.409286320113278001E-17
+ -0.590890184004599994E-17  0.201607393973679993E-15 -0.699147770927278027E-15
+  0.762329652528870004E-20  0.813490442543030046E-17 -0.681799066366914999E-17
+ -0.607060631336584989E-19 -0.592118946264263003E-17  0.138582841111243012E-18
+ -0.269407654562161003E-17  0.361013453120626983E-18  0.232644547280538988E-24
+ -0.191654297740858989E-16 -0.185276296712765011E-18  0.000000000000000000E+00
+  0.658091044686770012E-17  0.383052929495380021E-18  0.000000000000000000E+00
+ -0.177559921692520007E-18 -0.413370558966372027E-17 -0.446111077151846004E-19
+ -0.468735550811414966E-17  0.000000000000000000E+00 -0.165755308426161985E-16
+  0.248649031945594005E-18 -0.219719850209398022E-24 -0.274940472223534988E-18
+ -0.658549193252238029E-19  0.554334983005379011E-18  0.000000000000000000E+00
+  0.614026536820119009E-17 -0.592118381778119037E-17  0.286291090607452007E-18
+ -0.122457478756202001E-16  0.148029860142325988E-16 -0.552535673821147045E-18
+  0.768758534050744985E-19  0.492023165039092978E-18  0.592118381778119037E-17
+  0.874500688343341996E-18  0.104618365722086995E-19  0.111289438122280006E-16
+ -0.462585868319254965E-18  0.884185636357436999E-17  0.155484871948446003E-18
+ -0.148029595444530002E-16 -0.592115734800158020E-17 -0.959692361744608029E-20
+  0.592118946587380969E-17 -0.191465157017479006E-18 -0.592118381778119037E-17
+  0.220854337068372988E-18  0.888178896156158063E-17  0.000000000000000000E+00
+ -0.806069459784106008E-17  0.127742485722792993E-18  0.219719850209398022E-24
+ -0.296056543911099002E-17  0.296059190889059018E-17  0.719732306674833017E-19
+  0.000000000000000000E+00 -0.120712129288665994E-16  0.000000000000000000E+00
+  0.154282173561696008E-19  0.296059190889059018E-17  0.449986253228846961E-21
+ -0.296056543911099002E-17 -0.166744391367418999E-17  0.868868912763262986E-17
+  0.000000000000000000E+00 -0.440416500555818969E-18 -0.236804406269326013E-18
+  0.139891281099502011E-16  0.000000000000000000E+00  0.569938098259822986E-17
+  0.131207118545926006E-16 -0.449986253228846961E-21  0.720490237761527062E-17
+  0.609070955769230024E-18  0.000000000000000000E+00 -0.856757678897806949E-17
+  0.148001282823888015E-19  0.148369108142005008E-16 -0.160937199231215010E-18
+  0.000000000000000000E+00  0.250599984904598978E-18 -0.593031173923045011E-17
+  0.000000000000000000E+00 -0.747366643183191062E-17 -0.539419458477777017E-18
+ -0.728712515589093926E-17  0.000000000000000000E+00 -0.489944721671871044E-18
+ -0.740148688080987035E-18  0.126086815672215012E-18  0.800808902944986061E-17
+  0.000000000000000000E+00 -0.401836344027630001E-18 -0.161952699515021994E-17
+ -0.221719344273285996E-16  0.591906623541305000E-17 -0.113502414932076005E-18
+  0.769783542464707994E-17 -0.243945488809238016E-18  0.000000000000000000E+00
+ -0.169496586701505991E-16  0.197941011861488998E-18  0.000000000000000000E+00
+ -0.302670152115197982E-25  0.196007899393148981E-17  0.000000000000000000E+00
+  0.268184722574811015E-18 -0.507787081806380982E-18 -0.568046702169790025E-17
+  0.335786576739177007E-17  0.000000000000000000E+00  0.116899459592193998E-16
+  0.128688313346714001E-16  0.000000000000000000E+00 -0.259192081859816000E-18
+ -0.276302147394353002E-17  0.527066251428988001E-18  0.740306795900258034E-17
+ -0.150450521486901014E-16 -0.592118381778119037E-17  0.143254447204384002E-18
+ -0.742678488152571000E-17 -0.740306795900258034E-17  0.813151629364127964E-19
+ -0.108420217248549998E-18 -0.135525271560687996E-17 -0.592076030130756014E-17
+ -0.118245799436700000E-17 -0.325260651745650993E-18  0.962229428080884993E-18
+  0.000000000000000000E+00  0.177156676220440991E-16  0.103507426154475994E-16
+ -0.179994501291539010E-20  0.233949856058605991E-17  0.612251172066077035E-18
+ -0.740148597608108019E-18 -0.667834855456528034E-17 -0.108665062709865995E-18
+  0.525176309165426997E-17 -0.475608999883290026E-18  0.190582413132217990E-20
+ -0.340083728322602019E-18  0.438821300192851001E-17 -0.462585868319254965E-18
+ -0.132984172718925009E-18  0.124611252035315994E-16 -0.315096256378600015E-18
+  0.740306795900258034E-17 -0.191090632900570002E-17  0.590890184004599994E-17
+ -0.216840434497100983E-18  0.314418630020796016E-17 -0.739967982721356930E-17
+ -0.860585474410368952E-18 -0.244623115167042015E-17  0.140133130793750986E-16
+ -0.591906623541305000E-17 -0.271050543121376012E-18 -0.395733792957208964E-17
+  0.256142763249699999E-17  0.000000000000000000E+00  0.588772601636464012E-17
+  0.292279306361937011E-18 -0.462585868319254965E-18  0.305022850878928992E-23
+  0.125181543436834992E-17  0.592123675734038991E-17 -0.840797659796876002E-18
+ -0.119751929896036996E-17 -0.803410750470703968E-18  0.990969653027917929E-17
+  0.000000000000000000E+00 -0.615012094155625980E-17 -0.804469541654772036E-18
+ -0.190582413132217990E-20 -0.342201310690737000E-18  0.120498907076396997E-16
+  0.361683068477586004E-18  0.000000000000000000E+00  0.190463629996254992E-16
+  0.000000000000000000E+00 -0.736389268519206972E-18 -0.564293349460814963E-17
+  0.000000000000000000E+00  0.467562186884373980E-18 -0.188380127469356010E-17
+  0.650521303491303046E-18  0.000000000000000000E+00  0.298155597433514008E-18
+  0.254787510534094004E-17  0.474338450462407997E-18  0.000000000000000000E+00
+  0.253944448053778995E-17 -0.988826262624670062E-17  0.179994501291539010E-20
+  0.443980138421951982E-17  0.118181258669405998E-18  0.740148597608108019E-18
+  0.269978517047506999E-17  0.191455088678460996E-18 -0.363737123374678000E-17
+ -0.190370654895404001E-18  0.000000000000000000E+00  0.452156775156186007E-18
+  0.744150207898425038E-17  0.000000000000000000E+00 -0.131290106824417010E-18
+ -0.699437456195238967E-17 -0.154159996400282999E-18 -0.740306795900258034E-17
+  0.120617491689012002E-17  0.000000000000000000E+00 -0.704731412115577978E-18
+  0.135525271560687996E-17 -0.739967982721356930E-17 -0.304931861011547978E-18
+ -0.133796794952696997E-17  0.309348476086989009E-16  0.000000000000000000E+00
+ -0.444083492377669035E-18  0.396729056670233018E-18 -0.410980386007786987E-17
+  0.000000000000000000E+00 -0.829202903714597009E-17  0.612574227454310005E-17
+  0.000000000000000000E+00  0.876900950248488035E-17 -0.586046090260397985E-19
+  0.000000000000000000E+00  0.543265756545227034E-17  0.562482816536058969E-20
+  0.418476627590987005E-17 -0.160089227031063005E-18  0.304931861011548002E-19
+ -0.397258452262267003E-18  0.144321708717923994E-16  0.000000000000000000E+00
+  0.203287907341032004E-17 -0.487890977618476995E-17 -0.101643953670516005E-18
+  0.184314369322535985E-17 -0.609863722023096013E-17  0.000000000000000000E+00
+  0.298968749062877971E-16 -0.674661742488049989E-17  0.000000000000000000E+00
+ -0.346859991900635989E-18 -0.512412489595360027E-25 -0.229649779221929994E-16
+  0.592118381778119037E-17  0.968136764274826967E-18  0.248448055279030997E-19
+ -0.107229265609408996E-16  0.148029595444530002E-16  0.106183896722397993E-16
+  0.555835236088826021E-17  0.000000000000000000E+00 -0.102162426050442001E-16
+  0.105082980687594002E-17  0.000000000000000000E+00  0.760580847189921965E-17
+  0.559763864169073973E-19 -0.733749295744850035E-18  0.839903522791146024E-19
+  0.219719850209398022E-24  0.117210647846692996E-18 -0.125995396489508994E-16
+  0.462585868319254965E-18  0.128018813561245007E-16  0.318308925156784023E-18
+ -0.818753745432010944E-17  0.000000000000000000E+00 -0.817699696977523956E-19
+  0.740148681618638981E-18 -0.599551467812316950E-19  0.108906402847091006E-16
+ -0.462599103209056006E-18 -0.494820159361191037E-18 -0.232935714856157988E-18
+ -0.685910313144217059E-19  0.143819535639848992E-17 -0.400925499014078976E-17
+ -0.410781759263196998E-17 -0.735211518423296988E-18 -0.686166170447438018E-18
+  0.184201211014737999E-17  0.565444743514457976E-18 -0.496970112021859027E-17
+ -0.300019069917472988E-17  0.677626357803439982E-18 -0.639762992444437982E-18
+  0.112363387228590996E-17 -0.391203242024195008E-17  0.452220302627229964E-17
+  0.251521139731243992E-17 -0.321237245246193018E-17 -0.204498708472292984E-17
+  0.573610877316734989E-18  0.449273430335979002E-18 -0.296543215624495000E-17
+ -0.273389396303369975E-19  0.644925013057074959E-18  0.106550458191936003E-17
+  0.459232167923234995E-18  0.257365667067299011E-17 -0.726396926719566976E-18
+ -0.219646231134880988E-17  0.282697246146122979E-18 -0.721963238636283036E-18
+ -0.174012786050891999E-17  0.322031338634244021E-18 -0.812092838180059969E-19
+ -0.176127328530276993E-18  0.192190328165929010E-17  0.695940524306275957E-18
+  0.238706623117694998E-17 -0.975821659906356921E-18 -0.341196120810363013E-17
+ -0.296800344717906993E-17 -0.310014058695074020E-18  0.106340354316346995E-17
+  0.127478458561772008E-17  0.185161402269790016E-17  0.246253653590506004E-17
+  0.468239813242176996E-17  0.147044919643347001E-17 -0.303020954682541977E-15
+  0.120558517020060009E-15  0.514724981387492984E-16 -0.395303500220003983E-15
+ -0.937401198330966936E-15 -0.695346287059999988E-16  0.360751332235607007E-16
+ -0.841069835305629941E-16 -0.465122731996280971E-16  0.138223865591080994E-17
+  0.316557388212613009E-17  0.153530511954130000E-17 -0.114158865466198001E-17
+  0.251918186425269985E-17 -0.826679341101821022E-18 -0.242299109877044003E-17
+ -0.276451039904612007E-17 -0.219837847523779012E-17  0.189542150793871004E-17
+ -0.349685640873116977E-17 -0.322931311140702004E-19  0.296090954624581016E-17
+  0.262580213648832990E-17 -0.138913403349705005E-17  0.200852687617675992E-18
+  0.140967458246796992E-17 -0.665692623106138030E-18  0.296959494267761993E-18
+ -0.141501031100923995E-17  0.514096059424157008E-18  0.474857258142600987E-17
+  0.234265490408897985E-17  0.238700305525767010E-17  0.123478461273650007E-17
+ -0.777840529785158018E-18 -0.393563436465992003E-18  0.400450542246108993E-19
+ -0.231993556138459993E-18 -0.105675301103853995E-17  0.725377840204900971E-18
+  0.508905335644263999E-17 -0.168930133418030005E-17  0.151288025313498993E-17
+ -0.167222832633720003E-18  0.128336079420867007E-17  0.258071334847867999E-17
+  0.101915268911433000E-18  0.208333047333116009E-17  0.371393755278652995E-18
+ -0.927165655505069004E-18 -0.352746209139563026E-17  0.397578736595447006E-17
+ -0.321124748682885999E-17  0.311549305911972006E-18 -0.439610099624981972E-18
+ -0.860585474410368952E-18  0.169745402629762000E-17  0.229715335295365982E-17
+  0.248350060134961013E-17 -0.577305893113008979E-18  0.141623908780918997E-17
+ -0.112485975395371008E-17 -0.291379333855478980E-18  0.351052804989545007E-17
+ -0.548877349820787005E-18  0.332036915323686022E-18  0.230392961653169981E-18
+ -0.317806761809812986E-17 -0.157886941368202007E-17  0.170761842166467006E-17
+  0.134170018845081000E-17 -0.340422541501502998E-17 -0.157209315010398008E-17
+ -0.237169225231204018E-17  0.648827237596793963E-18 -0.152211820621598002E-17
+ -0.254906293670056003E-17 -0.265743104157052001E-17 -0.576081633090791036E-18
+  0.295951293498425985E-17  0.491017591086957017E-18  0.570295000582212027E-17
+  0.425298666891756003E-17  0.322490243264190984E-17  0.170575850282616993E-17
+  0.316149982129022996E-18  0.316149982129022996E-18  0.170575850282616993E-17
+ -0.104501833606318004E-18  0.278429414238003988E-19  0.201993100848688014E-17
+  0.689958412277429955E-18  0.253745521456217996E-17  0.317289218488110982E-17
+  0.416205158728287997E-19 -0.223087532941556984E-17  0.148681372554603001E-17
+  0.224561764616170982E-18 -0.161663087507475004E-17  0.252552248152889988E-19
+  0.181779337333578008E-17  0.362974215988189973E-17  0.105511449044334006E-18
+  0.811011020335495983E-18  0.212430842451632013E-17  0.291664890900878997E-18
+ -0.253106873399842011E-17  0.740117494324606029E-18 -0.573664210352185016E-17
+ -0.213444055721414999E-17  0.191567131261783994E-18 -0.154648946414654008E-17
+  0.911779475726123009E-19  0.386130223460529005E-18 -0.437674703786760990E-19
+  0.153868974389064991E-17  0.341944482530724993E-19  0.120185332147335992E-17
+  0.429533419052512003E-18 -0.521260203663573975E-17 -0.244979246689270001E-17
+  0.353401443498422013E-17  0.108347743827260008E-17 -0.297276516321888987E-17
+ -0.502982861212076016E-18  0.717817057257475013E-18 -0.404118796699791005E-18
+ -0.824222669082385976E-18 -0.496792572966965000E-18 -0.145300107449145002E-17
+  0.429727421330391006E-18  0.674705684382426013E-18  0.516237685549988988E-18
+  0.152970157584615000E-17  0.114320869044779997E-17  0.120185320191991991E-17
+ -0.132446040544864991E-17  0.212862970992680982E-17 -0.506722571443859031E-18
+ -0.328233673284015989E-17  0.174357265416989999E-19 -0.250028140764131988E-17
+  0.304621564884264023E-18  0.287174051018836004E-18 -0.360316566105648980E-18
+  0.246878009516860991E-17  0.307227700801689012E-17 -0.398859873928170002E-18
+  0.135186458381786007E-17  0.163646765409530999E-17 -0.134905216973517997E-17
+ -0.277534977379301996E-17  0.107718664691529003E-17 -0.868460233841873968E-18
+ -0.314460981668159001E-18 -0.295265097501007996E-17  0.587343884942705016E-18
+ -0.103889092135869005E-17  0.220358991989202002E-17  0.172080450780937999E-17
+  0.203458402189975985E-17 -0.188128398314380995E-17  0.435593310570423997E-20
+  0.150997560841400997E-17  0.338780091677218015E-19 -0.474444329580814977E-18
+  0.134286485875329006E-17  0.260568510399103997E-18  0.163964402764750996E-17
+  0.391340595364909024E-18 -0.555124217806786988E-18 -0.134424128729257002E-17
+  0.120085428439502995E-18 -0.232039433643183985E-17  0.311023860107363000E-17
+  0.161539357362928995E-17 -0.227030637899263991E-17 -0.123412700414952002E-17
+  0.460785923306338999E-18 -0.167034897198548008E-17 -0.772738893357237991E-18
+  0.655603501174828000E-17  0.355923244436257002E-17  0.966041076343529922E-18
+  0.461463549664142979E-17 -0.813151629364127964E-19 -0.300879655391883993E-15
+  0.116570897662623010E-15  0.517977587904949990E-16 -0.397920832027020018E-15
+ -0.938173692378863056E-15 -0.685385179600290032E-16  0.332324906525752010E-16
+ -0.811796376648520952E-16 -0.500833641052523013E-16 -0.327264414061500016E-17
+ -0.249197093082215006E-17  0.355081340008800993E-17  0.577464711790618967E-18
+  0.309632893868808988E-17  0.237164602067062017E-17 -0.236675814580756013E-17
+  0.129894498034675006E-18 -0.177843283691745018E-17  0.905876559778580015E-18
+  0.334760614285670004E-18  0.702903342961835980E-18  0.194244010831745001E-17
+ -0.311708124589582987E-17 -0.384044738285100005E-17 -0.259964999424184991E-17
+ -0.230437174456910998E-17 -0.203085578963201996E-17 -0.186536838192342997E-17
+ -0.292541894847392008E-17  0.444950377659624030E-18  0.473120840600730037E-18
+  0.144694403214716009E-17 -0.387457071246265019E-18 -0.268733332021511007E-17
+  0.480630192996640042E-18 -0.968667995173845007E-18 -0.225453915329464011E-17
+ -0.172670305520930993E-17  0.397513637481238987E-18 -0.223802472500957004E-17
+ -0.103900998689928001E-17  0.125363275017413008E-17  0.283438399974970010E-18
+  0.289612476067066011E-17  0.115254714341709003E-17 -0.905774764864446938E-18
+  0.102320256539338992E-17  0.130652185136016004E-17 -0.227171558456370018E-18
+ -0.315659132105927017E-17 -0.140885526007124000E-17  0.456135927492873002E-17
+ -0.136525167485142000E-17 -0.181712389987688988E-17  0.477260714130435988E-17
+ -0.161968912255027995E-17 -0.847032947254299978E-19 -0.283247817561837995E-17
+ -0.347622321553165013E-17  0.308735568340312003E-17  0.609016689075842040E-18
+ -0.308828212568918019E-17 -0.265629532258948987E-16  0.869394617061814022E-17
+  0.707441917546791951E-17 -0.104625509644850993E-16 -0.644592072860523040E-18
+  0.182366193543851017E-17  0.220253050832250004E-17  0.181095644122968989E-17
+  0.465952824284591002E-17 -0.179909797996813007E-17 -0.443167638003450014E-17
+ -0.110114283143058996E-18 -0.271050543121376012E-18 -0.184653182501437012E-17
+ -0.338094772539718024E-18  0.535365328665338963E-18 -0.193755392891210995E-18
+  0.238648297443423004E-17  0.185387667186595989E-18 -0.583221945298563973E-17
+  0.392947092559352014E-17  0.206476279333813002E-17  0.168586566858662003E-17
+  0.805104553428405950E-19  0.805104553428405950E-19  0.168586566858662003E-17
+  0.710517439930086985E-19 -0.850266110884137972E-18  0.661505608924907992E-18
+ -0.318501041740767019E-17 -0.399340393980533022E-17  0.329464601164237984E-17
+  0.105759985244622995E-17 -0.263066876232583997E-17 -0.462818734786794995E-17
+  0.282065931092569004E-17  0.273173264807079987E-18 -0.191766477016721988E-17
+ -0.325548795749949012E-18 -0.339916505461695014E-17  0.329796223641775006E-19
+  0.184871977884447985E-17  0.476420507970624038E-18 -0.416450631384511008E-17
+  0.953001031626930087E-18  0.133898346897670997E-17 -0.128899552551295991E-17
+ -0.313369156609463019E-18  0.327541076635822000E-17  0.831022609569495005E-18
+ -0.292570682126152987E-17 -0.143599266409235012E-18  0.746109032496585016E-18
+  0.369159449897113977E-18  0.213209126763412006E-18  0.120185446207787991E-17
+  0.429056143033479976E-18  0.100975456627066995E-17 -0.214856782078001993E-17
+  0.104065643190709998E-17 -0.985474136165822018E-19 -0.219472349794965002E-17
+ -0.121077973535169001E-17 -0.306481117342981015E-17 -0.729646695292633014E-18
+ -0.417870234395862012E-17 -0.982270248064390015E-19 -0.222740183391410013E-17
+  0.429074284915790001E-18 -0.757393922655926042E-18 -0.222277203054574005E-17
+ -0.366790309232547969E-17 -0.513331296435003975E-18  0.120185407756813993E-17
+ -0.768679095514467055E-19 -0.849541118088088029E-18  0.468620978068442048E-18
+ -0.329866393396347012E-17 -0.130866590350788998E-17 -0.196412382089491007E-17
+  0.201593841446522999E-18 -0.493820208249257001E-18  0.525838053655470036E-17
+ -0.339024937138534000E-18 -0.388951895548529013E-17 -0.206676039130048994E-18
+  0.863973606199385961E-18 -0.153736479926655993E-18 -0.531777872198089969E-17
+ -0.115969894699322004E-17  0.228376918290164009E-17  0.239277618139722024E-18
+ -0.675919057019131015E-18  0.872979948708863904E-18  0.700124022566006054E-19
+  0.160436471636099012E-18 -0.430442219081631023E-17  0.141636781779202008E-17
+ -0.264584619614596010E-17 -0.214590875462231992E-17  0.136936648480856995E-18
+ -0.261616426743056012E-17 -0.140853348681295003E-18  0.109188833473735006E-17
+  0.822558327290080971E-18  0.590700594208202994E-17  0.211440599458354992E-17
+  0.103192880386794999E-17  0.259586853807156981E-18  0.285297240247498991E-17
+  0.138834510998783011E-19  0.738126141010415962E-18 -0.284059369630692984E-17
+ -0.500243348423792998E-17 -0.177634224131680013E-17 -0.282456371151746988E-17
+ -0.224463731022390023E-18 -0.116551733542191996E-17  0.295162620673421996E-17
+  0.304931861011547978E-18  0.155854062294790993E-17  0.683820286230236957E-18
+  0.203340846900235013E-17 -0.972340883888733010E-18  0.813151629364127964E-19
+  0.219550939928314995E-17  0.636214387616585012E-18 -0.135525271560688006E-18
+ -0.958841296291867984E-18 -0.337270652495431019E-17  0.881761298091727038E-18
+ -0.346436475427008983E-18  0.455327854752469024E-17 -0.188188717875612010E-17
+  0.140212904092025990E-18  0.250969594776295999E-17 -0.298155597433514008E-18
+ -0.317806761809812986E-17  0.271050543121375993E-17  0.335425047112703011E-17
+ -0.105879118406788005E-17  0.387941089842470018E-17 -0.111130722679763993E-17
+  0.620875150337401956E-18  0.674555863369643047E-18  0.227798923252203012E-17
+  0.182620303428026999E-17 -0.209725357740164995E-17 -0.209725357740164995E-17
+  0.182620303428026999E-17 -0.414781446358590017E-19  0.219658225253763007E-17
+ -0.306111089692803988E-17  0.206912281912994004E-17 -0.255938945946767010E-17
+  0.304542693212857005E-17 -0.388794043270335027E-17 -0.294961296034002983E-17
+ -0.640841332232825959E-18 -0.176859620984863995E-17 -0.351940186256116005E-18
+ -0.669305072706347014E-18  0.115719589845962996E-17  0.249115036765450015E-17
+ -0.267263072927983979E-18 -0.327855516196318992E-17 -0.658166933893129994E-20
+  0.288553457408329992E-17 -0.288969575785353004E-17  0.633241671747305985E-19
+  0.180772237182975992E-17  0.283017902710578996E-17 -0.565930133097904035E-17
+ -0.921226912297244036E-19  0.248537333825643019E-17  0.337764313885002993E-18
+  0.278072479341339001E-18  0.197859530774522996E-17  0.566495635123675990E-17
+  0.179570984817912009E-18 -0.119601052152306996E-17 -0.127224348677596007E-17
+ -0.247841840366608010E-17 -0.225030184305866019E-17  0.475991984506901981E-18
+  0.205724781425613012E-18 -0.942906488971646074E-18 -0.319654352426012007E-17
+ -0.119902807639766995E-17 -0.333492753201779013E-18  0.201170324972895992E-20
+ -0.324868899007545985E-17  0.537018888559226030E-18  0.262834323533009010E-17
+  0.974087889342445029E-19 -0.954182615081968947E-18  0.422838847269347037E-17
+  0.232256434137128999E-17  0.579741112836364996E-18 -0.817757371014823969E-18
+  0.861889628043636043E-19  0.258000547574478990E-17 -0.228948704303652010E-18
+ -0.986816337813258049E-18 -0.235678976639628984E-17 -0.976768781707729945E-18
+ -0.542140790912154955E-18  0.304632752502048986E-17  0.286085377935140017E-17
+  0.137515798986736007E-17  0.341100167859307016E-17 -0.366794382918673990E-17
+  0.388735183230519980E-18 -0.792068449911377045E-18  0.321449003483007017E-17
+ -0.494074318133433003E-17 -0.477853637193513999E-17 -0.292649883276361008E-18
+ -0.408307537973959009E-17 -0.194192967971836990E-17 -0.100750598608959000E-18
+  0.430052854827544002E-18  0.212506008087322998E-18 -0.402770783864319995E-18
+ -0.447233396150270964E-18 -0.198459819541682993E-17 -0.236970701884191007E-19
+ -0.220994204661097016E-17  0.296419179891642018E-17 -0.212986434587094013E-17
+ -0.359565486109450012E-18 -0.322052514457926014E-17 -0.491003823699637011E-17
+  0.156785798536770993E-17  0.182249726513603012E-17  0.101220437196889006E-17
+  0.126123205846164993E-17 -0.939359538505019076E-18  0.159242194083808006E-18
+ -0.338384368472172993E-17 -0.302051948990884002E-17 -0.450578514547435023E-18
+ -0.156574040299956994E-17  0.863973606199385985E-19  0.408269880576573004E-18
+ -0.350142244571246017E-18  0.166018457661843011E-18  0.184314369322535985E-17
+ -0.124683249835832991E-17  0.245300741524844993E-17 -0.271050543121375988E-19
+ -0.525838053655470036E-17  0.500596471827291982E-18 -0.289852854753073998E-17
+ -0.358234790318279013E-17 -0.745680161159402964E-18  0.395130447418413017E-17
+ -0.782221691899545996E-18 -0.116890546721092989E-18  0.241870258088465000E-17
+  0.352365706057788989E-18  0.645439105807776955E-18  0.299849663328021983E-18
+  0.216671027907649988E-17 -0.233781093442186990E-18 -0.794834541879753984E-18
+ -0.102999206386122998E-17 -0.467562186884373980E-18  0.241234983378025006E-17
+ -0.226581313390524994E-17  0.247037159066717002E-17  0.853809210832335031E-18
+  0.853809210832335031E-18  0.250594697445184986E-17  0.696049324406221015E-18
+  0.310955224796036990E-17  0.556959317995735977E-19 -0.495739267270380026E-18
+ -0.152187997819956003E-18 -0.313020110589274015E-17  0.360892352575412997E-19
+  0.274798235917439008E-17  0.219805416308431992E-17  0.128987869551562998E-17
+ -0.181271491937153990E-17  0.479946121031822013E-17  0.521859986993336979E-17
+ -0.162858689760435992E-17 -0.410148944122608997E-18 -0.129790139506878007E-17
+  0.180246481185638019E-17  0.252833916722396005E-17  0.184847156355081005E-17
+  0.234299073457090997E-17  0.381342421941949979E-17 -0.210785886003519001E-17
+ -0.186922460423510001E-18  0.283411363932078009E-17  0.245599891393374988E-17
+  0.115727546289480009E-17 -0.267567233540009983E-17 -0.230406371026381001E-18
+  0.847032947254299978E-19 -0.199476259078387999E-17 -0.149691897603515996E-17
+  0.169237182861408996E-17  0.202949094162130015E-17  0.258247110728035017E-17
+ -0.483304261121869975E-18 -0.267057163278154011E-17 -0.105757357420620006E-17
+  0.236854234853944018E-17  0.219443737320928007E-17 -0.769317674343717994E-18
+  0.107361426064483001E-18 -0.208888912704750988E-17 -0.317531476101955994E-17
+ -0.239075049362525982E-18  0.460785923306338999E-18  0.297573262282276003E-18
+ -0.931736241979730072E-20  0.330131091192363981E-17 -0.233781093442186990E-18
+  0.243924312985556986E-17  0.508219768352580011E-18  0.596311194867026957E-18
+ -0.164154985177883001E-17 -0.290601122335189004E-17  0.116551733542191996E-17
+  0.240557357020220988E-17 -0.636968776335234048E-18 -0.350774872303727035E-17
+ -0.278843246236116018E-17 -0.640356908124250960E-18  0.105032085459533001E-18
+ -0.916951328718438058E-18  0.397946666531910978E-18 -0.123324688397775994E-17
+  0.324032453972133018E-17 -0.997275416273532054E-18  0.182323841896488004E-18
+ -0.127499634385454002E-17 -0.769560981766082995E-18 -0.242439037817415004E-17
+  0.214918066753528012E-19 -0.279227223476463018E-17 -0.537361858320706021E-18
+ -0.899122578378295995E-19  0.277170356165288005E-17  0.464597571568984025E-18
+ -0.136671413017441992E-17  0.274447939333253981E-17 -0.310972264716655005E-17
+ -0.120358087848916004E-18 -0.688214269644119025E-21  0.293847640803337985E-18
+ -0.641415699308319028E-18 -0.881761298091727038E-18  0.296628622022741023E-18
+ -0.105601516593214997E-17 -0.284306608745905986E-17 -0.338283783309686005E-18
+  0.266548033611127006E-17  0.116345931005788996E-17 -0.166613820907728004E-18
+  0.215419523463735996E-17  0.626804380968182010E-19  0.639509875176997045E-18
+ -0.297890899637496982E-17  0.119222803137702003E-17  0.431986803099692980E-18
+  0.235475159336694994E-17  0.261563774112127983E-17  0.307684718090124983E-17
+  0.745388993583784009E-18  0.363483013490502018E-17  0.542101086242752024E-18
+  0.819927892942163017E-18 -0.137908119795569001E-17  0.153143556863578002E-17
+  0.243945488809237997E-17 -0.169967748778416003E-17  0.284603070277445010E-18
+  0.346055310600744018E-17 -0.934065582584680084E-18 -0.214364186615361989E-17
+  0.121972744404618998E-17 -0.187702501111552994E-17  0.129087821161555006E-17
+ -0.375923809903299015E-17  0.745388993583784009E-18  0.496530713680471013E-17
+ -0.156912853478858993E-18  0.322888959493338981E-17 -0.216332214728747998E-17
+ -0.179570984817912009E-18 -0.179570984817912009E-18 -0.216332214728747998E-17
+  0.159764972230942001E-18 -0.124635604232549994E-17  0.583605700658213030E-18
+  0.123751513593852991E-17 -0.106874382119810999E-17  0.503334853245148010E-18
+ -0.570046969647549980E-18  0.186124902247292013E-17 -0.224104734636542000E-17
+  0.346772455512312988E-17  0.205875328297097990E-17  0.145346221537408007E-17
+  0.586146799499976027E-18  0.264697796016969010E-17 -0.372201497146810978E-18
+ -0.208443972253258984E-17  0.831015421872823993E-18 -0.985304458448664060E-18
+ -0.122467729283171000E-17 -0.128923708528025005E-17 -0.334881093141887986E-17
+ -0.263824293290113013E-17  0.237010406553594010E-18 -0.229704747383526007E-18
+  0.129257227751006001E-17  0.914372066561017023E-18 -0.293793377755153996E-17
+ -0.146488971553649996E-17  0.586824425857778967E-17 -0.383028298748394999E-17
+  0.325260651745650984E-17  0.116043513773838992E-17 -0.711507675693612044E-19
+  0.270785845325358986E-17  0.158620154263168996E-18  0.269861719145015013E-17
+  0.383335348191774008E-18 -0.514254878101766997E-18  0.233966381899399013E-18
+ -0.790076598996348969E-18  0.667991358028422990E-18 -0.269652938758407014E-17
+  0.390482188684231994E-17  0.140692172538938997E-17  0.380910716380259017E-17
+ -0.247757137071882994E-18 -0.199222149194211016E-17 -0.181773270480773007E-17
+ -0.490537955578646973E-18  0.115196480826585000E-17 -0.847198383376810945E-19
+ -0.315405916610788012E-17  0.144814344403536006E-17 -0.268858018189743015E-17
+ -0.495129841954081968E-18 -0.158074907822647006E-17  0.403686653748175000E-18
+ -0.171910438467481003E-17  0.160230685720048001E-18 -0.580566047782802009E-18
+ -0.580566047782802009E-18  0.160230685720048001E-18 -0.945765225168629963E-19
+  0.838986134255384004E-18  0.121803337815168004E-17  0.381164826264434972E-19
+ -0.338474365722817992E-17  0.347055868269689008E-17 -0.280497979692496991E-18
+ -0.317614840398065008E-17 -0.181985442307893013E-17  0.225148543511765008E-17
+  0.429147919237410989E-18  0.940467960525839941E-18 -0.454856692675558993E-18
+  0.142979161496525994E-17  0.279498538717379988E-18  0.440419909444671001E-18
+  0.234553680134310998E-17  0.186512684518457011E-19  0.991875581234786082E-18
+  0.226687192508932013E-17  0.186688708552808003E-17  0.291392237873034990E-17
+ -0.296599174392933981E-17 -0.949735692108884063E-18 -0.128833711277379005E-17
+  0.571747239396652970E-19  0.453162626781050970E-18  0.663129190387836005E-18
+  0.217093571646361010E-17 -0.296568412519281004E-17  0.396069170862700993E-18
+ -0.465832944811199039E-18  0.173788617681510005E-17  0.292773924322348985E-17
+  0.747797743527539004E-18  0.310340345257573986E-17 -0.667401578251672007E-18
+  0.168471584229874004E-17 -0.578829807955515980E-17  0.479071164337129953E-18
+ -0.370114100408684010E-18 -0.957904246464584045E-18  0.356807490403558990E-18
+ -0.988544004688977926E-18  0.260777962546842991E-17 -0.169343680949112000E-18
+ -0.331726748645320978E-18  0.152314043111593995E-17  0.754985966476637967E-18
+ -0.297055098494334995E-17 -0.629786780666742000E-18 -0.471802637821747000E-18
+  0.474004104058814025E-17 -0.183018673611032999E-17  0.454297022273105981E-17
+  0.143070440877121008E-17  0.922260060882323081E-18  0.793934569373296001E-18
+ -0.330004036250274988E-17 -0.365155903561329030E-17 -0.558321767182671991E-17
+  0.340233613449596002E-17 -0.638672768397092996E-18 -0.904571630663488944E-18
+  0.737624249173749028E-18  0.273475174932892005E-17 -0.610287238496723009E-18
+ -0.285280696635248008E-17  0.452885521275844996E-19  0.143734418754817006E-18
+  0.210147054416443989E-17  0.147160394056858999E-17 -0.253326999085539019E-17
+  0.491174926009342989E-17 -0.494642756650379982E-17  0.321644052671446983E-17
+  0.534345440819455030E-18  0.892521263499816052E-18 -0.104905030517445001E-17
+ -0.843221298991655988E-18  0.269229422284779016E-17 -0.344928028861956983E-17
+  0.700708005616119955E-18 -0.147680194353786994E-17 -0.533011198491406996E-18
+ -0.702080298252345043E-18 -0.236921071047437994E-17  0.328175636224288007E-18
+  0.405514376520035964E-17  0.228538753592071018E-17 -0.130887766174470991E-17
+  0.848968549887674039E-18  0.757247454845344985E-18  0.107234371122393999E-17
+  0.672544160119913988E-18 -0.233955793987558016E-17 -0.857197342621352039E-18
+ -0.433680868994201966E-18 -0.257498015965307014E-18 -0.151110677790167007E-17
+  0.113841228110978004E-17 -0.813151629364128037E-18  0.508219768352580023E-19
+  0.102167724434385003E-17  0.247117416265649998E-17 -0.302836777956074006E-17
+  0.121851355649726996E-17 -0.366020141865323971E-17  0.252415818281781983E-18
+  0.178808655165382994E-17 -0.542101086242752024E-18 -0.133068876013651007E-17
+ -0.149077798716756994E-17 -0.214129929065887010E-17  0.226581313390524994E-17
+ -0.329051124184614016E-17  0.433680868994201966E-18  0.429276297668479027E-17
+  0.542101086242751976E-19  0.131970380160180008E-17 -0.479420648145933992E-18
+  0.660685698858354024E-19 -0.711507675693611996E-18 -0.669083120031099994E-18
+ -0.599011112386401003E-18 -0.343234624711939003E-17 -0.195054689191077018E-17
+ -0.204929033676336999E-18 -0.454904338278842028E-17  0.175727572819745014E-17
+  0.357017653500514010E-18 -0.114005842222755001E-17 -0.367261574528644008E-18
+ -0.318721147938444995E-17 -0.278055444590598997E-18 -0.108154216643068992E-17
+  0.286835796186848008E-17  0.144473814824863007E-17  0.175732205031176005E-17
+ -0.276777279938202999E-17  0.315270956923971003E-17  0.331322231274440013E-18
+ -0.406575814682063982E-19 -0.190650738250813994E-17 -0.344001255703653014E-18
+  0.401387737880132007E-18 -0.550238940807836995E-18  0.166319913296329992E-18
+ -0.231558231661587994E-17  0.179282091988977989E-17 -0.379113418345304005E-19
+ -0.283504574423973992E-17 -0.850505037875492034E-19  0.215451811295723997E-17
+ -0.600969876076925994E-18  0.965194043396275044E-18  0.422912962652231018E-17
+  0.270344316993893994E-17 -0.545658624621219970E-17 -0.227004829864152009E-18
+  0.253432257818487008E-17  0.476667791067357997E-18 -0.109775469964156997E-17
+  0.393478567735145006E-17 -0.542101086242752024E-18 -0.286974762529756993E-17
+ -0.131324333068139999E-17 -0.230392961653169981E-18  0.265629532258949010E-17
+  0.313508069602497990E-17 -0.453162626781050970E-18 -0.273549290315775986E-17
+  0.514307817660970006E-18 -0.307181792277692011E-17 -0.460785923306338999E-18
+ -0.569206140554890020E-18 -0.328648783534668994E-17  0.174804770128381011E-17
+  0.894466792300540965E-18  0.303576608295941010E-17 -0.894466792300540965E-18
+  0.361767771772311983E-17 -0.211059806315422997E-17 -0.100288700954909006E-17
+ -0.830092288309213971E-19 -0.346436475427008993E-17  0.565288613173838978E-18
+  0.395511446808555009E-18  0.201424434857073006E-17 -0.695414049695780964E-18
+ -0.493777856601893997E-17 -0.332585501505931016E-17  0.490182939999496966E-18
+  0.203239589653500981E-17 -0.274086295969446009E-18  0.384873697335969983E-18
+ -0.622858729446303948E-19 -0.186535018394995997E-18  0.517960647246005043E-18
+  0.502290537721800005E-18 -0.168068542091994001E-17  0.219370283682532990E-17
+ -0.834135547143374013E-18  0.702852057763857018E-18  0.742186150251978975E-18
+  0.299452616633997011E-18 -0.852618070750257997E-18 -0.208606347807502990E-17
+ -0.405305265261183001E-18 -0.237848175077988005E-17 -0.266900081679829998E-17
+  0.347918783084704018E-17 -0.211758236813574988E-18 -0.154132203131700996E-17
+  0.376929661528163988E-18  0.262664916943558997E-17  0.316693613899383005E-17
+ -0.806375365786094019E-18 -0.826704156520197035E-18 -0.734112867473461040E-18
+ -0.262071993880480987E-17 -0.917283742317204066E-18 -0.243945488809238016E-18
+  0.172117094882074002E-17 -0.476614851508154026E-18 -0.271050543121375988E-19
+  0.149924831664010991E-18 -0.293611728892637004E-17 -0.138701645112892008E-17
+  0.199306852488936985E-17 -0.197300443195128015E-17 -0.137409919868329007E-17
+  0.401330662417864981E-18  0.251442911077089996E-17 -0.623416249179164953E-18
+ -0.179232171639009990E-17 -0.450060497858702980E-17  0.117250396136948002E-17
+  0.139756465830018993E-17 -0.331705050461928996E-17 -0.113723582348357996E-18
+ -0.464527261216917018E-18  0.124683311874379009E-17  0.389591589814325992E-17
+  0.189226867995338999E-17 -0.708799730321770986E-18 -0.708799730321770986E-18
+  0.189226867995338999E-17  0.730089460974003017E-18 -0.241616148204289018E-17
+ -0.642898006966013957E-18  0.193208215268705984E-17  0.114116513818835998E-17
+  0.940812894841274938E-18 -0.213640693092592999E-18  0.191601065377061008E-17
+  0.197291178772267999E-17  0.162956090277563995E-17  0.467328921951633987E-17
+ -0.169970726628621002E-17 -0.373541529739146016E-18  0.309167025747820009E-18
+ -0.140413908980875996E-19 -0.890122439723413042E-18  0.444510317573746012E-18
+  0.159521450258606000E-17  0.588714368121340047E-18  0.247807429653125996E-17
+ -0.150405258163781997E-17  0.211586183246164007E-18  0.825751244454536043E-18
+  0.100479283368041005E-18 -0.744541960636529998E-18 -0.301924894048795009E-17
+  0.654121193517132955E-18 -0.872008011489114096E-18  0.347595984938333010E-18
+  0.120877279715728994E-17  0.108739371704337996E-17  0.222961127874442003E-17
+ -0.224159359253126005E-18 -0.703638131074452959E-19  0.364852824584889013E-18
+ -0.251407909045142986E-17 -0.133273603215256995E-17  0.305124807351774998E-17
+  0.349674804807092995E-17 -0.668332363235947976E-18 -0.168391328646527988E-18
+  0.109031825667353006E-17 -0.648693735224327031E-18 -0.364916181449932026E-17
+ -0.650813065602986984E-18  0.385502589719925017E-17 -0.120598315164507008E-17
+  0.122143819895578996E-17 -0.100551719832776992E-17 -0.315327560109728015E-17
+ -0.315810940427846024E-18 -0.218116670784693986E-17  0.142069090570837004E-18
+ -0.386443752054550038E-17  0.128694000471918996E-18  0.282494506116690973E-19
+ -0.308648228407384022E-18  0.201310440079036986E-17  0.188112236626928001E-17
+  0.514747278364022024E-18  0.514747278364022024E-18  0.188112236626928001E-17
+  0.364092852397105992E-19  0.183392682345548997E-17  0.100813833416394001E-17
+ -0.180547891482710983E-17 -0.148124239527669002E-17 -0.121626326668438009E-17
+ -0.819908795219576046E-18  0.141548838806682992E-17 -0.102046261316098992E-17
+ -0.176117589589950993E-17  0.674229731042702993E-18  0.876369010356629040E-20
+  0.392250486364678998E-18  0.305615136027426994E-17  0.253348435073654018E-18
+  0.509936297130785021E-18  0.346726570183350994E-17 -0.498687128533197966E-17
+ -0.382091095733158007E-17 -0.130344191058839003E-18 -0.132234278800973008E-17
+ -0.290958106810184998E-17 -0.547838083491907977E-17 -0.148880064826396994E-17
+ -0.257873617436497015E-17 -0.166142330785849998E-18 -0.315692570397513015E-18
+ -0.292263116495110999E-17 -0.485469728861570009E-18  0.120185407756813993E-17
+  0.429089928317842021E-18  0.125501397022973005E-17  0.805596610804541017E-19
+ -0.244340208613324994E-17  0.286470517184008993E-18 -0.183891871488211999E-17
+  0.127540845256485994E-17  0.319791558987090013E-18  0.369842425732445988E-17
+ -0.145110647130905996E-17 -0.142820325539914005E-17  0.130250556227022007E-17
+  0.429164057413021993E-18  0.923727205858856957E-18 -0.206702571605096997E-17
+  0.375403241563379036E-17 -0.302615707496255018E-20  0.120185410341754004E-17
+  0.833068366285379940E-19  0.230376963355947018E-17 -0.555518782958974975E-18
+ -0.887232412834819968E-18 -0.103930280883612994E-18 -0.255683181701366009E-18
+  0.252700699284744995E-17  0.441434446465967964E-18  0.708476885929217991E-18
+ -0.800512309604317966E-18  0.294386300818232006E-17  0.223320236543596015E-17
+  0.855079760253216000E-18 -0.338161691451272986E-17 -0.735145188877927989E-18
+ -0.866514705041148958E-18  0.259976331798577006E-17  0.576172986546055986E-17
+  0.350862222576413020E-17  0.354864453252188973E-17 -0.407577010831044005E-17
+  0.449193684955051025E-18 -0.545627853502432998E-17 -0.652112798989854975E-18
+ -0.316399912411029001E-18 -0.733328286662454975E-18 -0.102626644238229001E-18
+  0.179973821776224996E-18  0.426600202950748001E-18  0.133225047713300999E-17
+ -0.149374260248296000E-17 -0.580937546874361965E-17 -0.151407139321705993E-18
+ -0.290677388387667010E-17  0.242759642683082015E-17 -0.291633443739655992E-17
+ -0.190359074366827988E-19  0.564585509653169000E-18  0.270940528099907018E-17
+  0.271364871754146018E-18  0.104165200177577996E-17  0.286929763904433993E-17
+  0.548919701468149027E-17  0.101931481651438992E-17  0.179570984817912019E-17
+ -0.677626357803439970E-20 -0.691178884959508980E-18 -0.161023610251002992E-17
+  0.125360876193635993E-18  0.285280696635248008E-17  0.387602276663567989E-17
+  0.254787510534094004E-17  0.243945488809238016E-18  0.224971950790742016E-17
+ -0.589534931288993036E-18 -0.846030404351885973E-18  0.133284687435465996E-17
+  0.132183461885973996E-17  0.130259894730511999E-17 -0.676196989704949025E-18
+  0.109267250195805009E-18 -0.239032697715164018E-17  0.211419423634673018E-17
+ -0.355330321373178992E-17  0.124683249835832991E-17 -0.117229359899994994E-17
+  0.204812566646090015E-17  0.154366460681175994E-17 -0.102999206386122998E-17
+  0.473067901041526970E-17 -0.445751088492576035E-18  0.175706396996064004E-17
+  0.234119906621088999E-17 -0.149416611895659004E-17 -0.149416611895659004E-17
+  0.234119906621088999E-17 -0.545767150717587043E-18  0.293270268735776000E-17
+  0.335791322687940991E-17 -0.198208356635465987E-17 -0.400158216617632984E-17
+  0.149475408927575000E-17 -0.266451620510562989E-18  0.215783878432394017E-17
+  0.296575751287548994E-17 -0.132635612379727001E-17  0.125805661614397001E-17
+  0.329964597375880009E-17 -0.725804251810673020E-18 -0.202273142845066994E-17
+ -0.361750742191450980E-18  0.238477798464633989E-17 -0.615240396004691044E-19
+ -0.294892886904875015E-18 -0.136428694961263993E-17 -0.460427427752502973E-18
+  0.408633405776274967E-17  0.241370248087692008E-17  0.212034179096043994E-18
+  0.920172551053097029E-18 -0.155108110818391000E-17  0.871671969365264044E-18
+ -0.135194748282488005E-18  0.579682825683748995E-18  0.307642366442761999E-17
+ -0.397258452262267013E-17  0.241912609735827984E-17  0.178893358460108000E-17
+ -0.498055372985528985E-18  0.296678583731738995E-17 -0.628909555627129947E-19
+ -0.257821443584816018E-18 -0.909587653898011082E-18  0.284076321663370992E-17
+ -0.257331256353816998E-17  0.642289202035175025E-18  0.575876525014516962E-18
+ -0.132401837567687995E-17  0.105540305227885996E-17 -0.410048649765807025E-17
+ -0.330766365902804013E-17  0.314333926726071010E-17  0.249366499671665981E-17
+ -0.134678238613434004E-17 -0.326266503370516004E-18 -0.803834266944331022E-18
+  0.584275019020739044E-18 -0.521181830722954026E-18 -0.246360856197893001E-18
+ -0.890764331878753999E-18  0.548568871229028054E-19  0.885333063976731012E-18
+ -0.171898057455870004E-18  0.173926304317850004E-17 -0.285280696635248008E-17
+  0.132475952950572997E-17 -0.372813610800100016E-17  0.343791462020794002E-17
+ -0.608794342927187998E-17  0.192763522971397014E-17 -0.638662842229741974E-18
+  0.836445035413621970E-18 -0.484502845829459987E-18 -0.647503748616709023E-18
+ -0.270095252911453979E-18  0.150473208466164007E-17 -0.401671791702481975E-17
+  0.289321267132416987E-17  0.217652064114137995E-17 -0.225518220867271997E-17
+ -0.839409650729012021E-18 -0.231409401189875007E-17 -0.246802570644996992E-18
+  0.105003309913973996E-17  0.491384988525900984E-18  0.217168659764162005E-17
+ -0.437069000783218975E-18  0.285079526310274996E-17  0.445195223120940015E-18
+  0.207681890754913995E-17  0.875990886138556925E-18 -0.940788906603511093E-18
+ -0.431139770152439017E-18  0.545235108147592974E-17  0.266137752027301012E-17
+ -0.111469866730911004E-17  0.519358251608974022E-17 -0.341298856642441984E-17
+  0.329834629660825014E-17  0.162630325872826002E-18  0.349655200626575026E-17
+ -0.140792757701426004E-17  0.634427677493470954E-18 -0.722519104007917997E-18
+ -0.278911008871896016E-16  0.584113920426565976E-17  0.682708555486965996E-17
+ -0.992044987824237019E-17  0.376929661528164012E-19 -0.102173349262550002E-18
+ -0.187074505590503005E-17  0.622484512937185011E-17  0.489839153397161963E-17
+ -0.178723951870656986E-17 -0.215485181781494006E-17 -0.132137139771670997E-18
+  0.650521303491303046E-18  0.110114283143059006E-17 -0.937384231202242061E-18
+  0.769355052567648023E-18 -0.277877270876202001E-17 -0.341527911112772982E-17
+  0.188585358734149006E-17 -0.307333598216620014E-17 -0.275261078554908998E-17
+  0.577606530023473009E-18  0.170577906601920992E-17 -0.163191393435032013E-19
+ -0.163191393435032013E-19  0.170577906601920992E-17  0.448670475468240001E-18
+ -0.291729493222043008E-17  0.269717696660611989E-17 -0.178990335531848991E-18
+  0.256607323391346977E-18  0.134101628714247995E-17  0.230979308011910995E-17
+ -0.403170793523692000E-17  0.147556225905089993E-18 -0.337953223511012990E-17
+  0.117050351108804007E-18 -0.178627074576569006E-18  0.405489664315588988E-18
+ -0.296848046543621999E-17  0.780119362087592001E-20  0.311241529209519983E-17
+  0.188868776166629986E-17  0.299491679632580009E-17  0.416857321052395034E-17
+ -0.175461764953809003E-17  0.476829251835777018E-17 -0.883650515905821941E-18
+ -0.237576416358402012E-17 -0.452084617380231971E-18 -0.281131514415595014E-17
+ -0.148642275037992002E-17  0.311120674489479000E-18  0.238552529446801001E-17
+ -0.793446617904934958E-19  0.120185555098361008E-17  0.429438776165916018E-18
+ -0.198933798085102997E-17  0.251437329702588004E-17  0.193038858216918014E-17
+ -0.381279230735173007E-18 -0.241638795747386001E-17  0.338070251827785015E-17
+ -0.883954289874912006E-18  0.230546801479983986E-17  0.280509763384942007E-17
+  0.255304094930993989E-19 -0.243909807437984001E-19  0.430172629206993001E-18
+  0.270270744496287993E-17  0.151309846511602999E-17 -0.110864972489187992E-17
+ -0.322256764930377990E-20  0.120185576747228005E-17 -0.704689046369850962E-19
+ -0.210033616755281988E-17  0.380112200160870019E-18 -0.169167342968315994E-17
+ -0.103844874485367003E-18  0.251310187995528003E-17 -0.687636018697156054E-19
+  0.690260094094115994E-18  0.443239768152865043E-18  0.344151637139014981E-17
+  0.106920187246231007E-17 -0.219024191314240976E-18 -0.110791909500862003E-17
+  0.171270061934819991E-17 -0.592029708016452967E-18 -0.107287972426087994E-17
+ -0.206796869538028005E-17  0.276609196837732003E-20 -0.759576795450294022E-18
+  0.875726188342539956E-18  0.519314816225762026E-18 -0.109032911543856009E-17
+  0.142830372383842998E-17  0.100573850791572003E-17  0.604142927981981971E-17
+  0.479671630253418989E-17  0.110844683623943006E-18 -0.235422778124405997E-17
+ -0.161887517682752998E-18  0.235435785539538009E-17 -0.177241644212961999E-17
+  0.210064170919065983E-17  0.814342769446204974E-18 -0.282994638255850981E-17
+  0.217052192733914010E-19  0.234776357155211005E-17 -0.308228175752572013E-20
+ -0.145160674586253994E-17 -0.227329022625726990E-17 -0.495544052645817993E-18
+  0.544329179940724975E-17  0.237253928525929987E-17 -0.201932654625425008E-17
+ -0.155176435936987995E-17  0.160208341039270009E-19  0.892772726406032943E-18
+ -0.179570984817912019E-17  0.110643678735093001E-17  0.243945488809238016E-18
+ -0.147722546001149998E-17 -0.300791563965369003E-15  0.123197871683703990E-15
+  0.519874941706799006E-16 -0.398570506297564017E-15 -0.938756451046574014E-15
+ -0.674170463378642972E-16  0.300205417165869003E-16 -0.819385791855919976E-16
+ -0.474473975733969001E-16 -0.327433820650950991E-17 -0.162460919283374998E-17
+  0.252287936159080996E-17  0.695202291458967003E-18  0.296090954624581016E-17
+  0.361391397802216010E-17  0.408203287321918007E-18  0.495238006025435965E-17
+  0.275835450342189021E-18  0.364880639030068002E-19  0.194083312525632001E-17
+  0.278119002058286018E-18 -0.641258404061236032E-18  0.165955267525392004E-17
+  0.475587792606270030E-18  0.475587792606270030E-18  0.165955267525392004E-17
+  0.499313877686834030E-18  0.118594492762701009E-17 -0.532072040726810014E-18
+  0.597479878085993991E-17  0.616095653587975001E-17  0.267973686332527998E-17
+  0.237255357212338008E-17  0.415645509636498002E-18  0.406581717756872996E-17
+ -0.216855405062163982E-17  0.128431581959682994E-18 -0.333001210673627003E-17
+ -0.529555996006423025E-18  0.158490256251231992E-17  0.779339004809818954E-19
+ -0.184932006891523006E-17 -0.121805361876716993E-17  0.305020810095135983E-17
+  0.451000953589631032E-17 -0.143328195155579993E-17  0.261449190125046997E-17
+  0.213311232586893987E-17  0.308627464263071995E-18 -0.142468127993853007E-17
+  0.138447030256762007E-17 -0.416491634298082012E-18 -0.634896151970501991E-18
+  0.766861496138588998E-18 -0.678597100070076063E-20  0.120185447177140002E-17
+  0.429046681675597000E-18 -0.176084645720124014E-17  0.144825178137232999E-17
+  0.314218888807008010E-17  0.734785045431920036E-18 -0.117162594263929007E-17
+  0.608863462349075997E-17 -0.662627775282060006E-19 -0.127825823877199996E-17
+ -0.171021320481935997E-18  0.496852428684837022E-19 -0.322174981279925987E-17
+  0.429050616589522023E-18  0.104417858731019005E-17  0.163454775014506994E-17
+  0.178823488869088012E-17  0.581478820662879978E-19  0.120185445884671008E-17
+ -0.129816506770870004E-18 -0.127996804138450008E-17 -0.154357189784606990E-19
+  0.214940359022386997E-17 -0.331603456810303990E-17  0.189241510509073018E-17
+  0.197721030105515002E-18 -0.416669355256307981E-17 -0.851010143081933984E-19
+ -0.125601149543838993E-17 -0.238461833303634009E-17 -0.169618840330709007E-17
+ -0.169618840330709007E-17 -0.238461833303634009E-17  0.400549807943398004E-18
+ -0.271531580761243988E-17  0.199875144619239998E-17  0.743049289256009004E-18
+ -0.307195681303235978E-18 -0.282936086004326019E-17 -0.136888465210175001E-17
+  0.725986645135740000E-18 -0.163159410949012006E-17  0.172113641403015996E-17
+  0.328205462052660005E-19 -0.427613578091477998E-17 -0.808777579770756965E-20
+  0.234837217968203999E-17 -0.253264027376469004E-19  0.574118313848444962E-18
+ -0.393557272547890009E-18 -0.326626289571594991E-17 -0.775608933612533044E-18
+  0.105058856049080991E-17 -0.972396842945079028E-18  0.134638939547039006E-17
+ -0.102121372777366003E-18 -0.136083563029536003E-17  0.281128595079716003E-17
+  0.405831033256048975E-19  0.242827227208932005E-19 -0.170833281491397008E-17
+ -0.127819918718633995E-17  0.635195301101919971E-18  0.626566152951766979E-18
+ -0.199179797546848994E-17  0.132218081797795004E-18 -0.149746601790291009E-17
+  0.212336552384022990E-19 -0.292432710560827011E-17  0.729454186263562966E-18
+ -0.214066401594843015E-17  0.303724839061711014E-17 -0.113084192414368992E-17
+ -0.340000958562557999E-20 -0.173530847038446989E-17  0.626566152951766979E-18
+ -0.199179797546848994E-17  0.132218081797795004E-18 -0.149746601790291009E-17
+ -0.127819918718633995E-17  0.635195301101919971E-18 -0.108475496177924000E-19
+ -0.173530847038446989E-17  0.407397618620635981E-18  0.303835815115187008E-17
+ -0.182128580743015985E-17  0.358929916256484033E-17 -0.155546948389484994E-17
+  0.165743789754505008E-17 -0.733902645660011989E-18 -0.390963850189295982E-18
+ -0.130426075704324999E-17 -0.336457008402797007E-17 -0.323961040779898006E-17
+  0.134737673440338004E-17  0.209549852390063008E-18  0.950982283458043069E-18
+  0.311317695677142983E-17 -0.147193675656687989E-18 -0.592093288710995990E-17
+  0.271635327503675007E-17  0.162966204193836999E-17 -0.257298989513354013E-17
+  0.145506049662061992E-17 -0.187935630575551008E-17 -0.341302337267962986E-19
+ -0.249774864015655006E-18 -0.557204531224242037E-17 -0.140921796185665985E-19
+ -0.191936809076605013E-17  0.109847716108033996E-17  0.307745161082448993E-17
+ -0.606673286482521039E-18 -0.725513167447356014E-18  0.223354010499117018E-17
+  0.660402286256383978E-18  0.261454292557767008E-17 -0.573978120970277956E-18
+  0.106320334455071992E-17 -0.381807789389963005E-18 -0.201346560792103982E-17
+ -0.501719392383432000E-18  0.170682457038868001E-18  0.276999327163805993E-17
+  0.143519550603408003E-17  0.166110181328899991E-17  0.857665614334221004E-19
+  0.132134526597416992E-18 -0.163342014702957008E-17 -0.159162656391022990E-18
+  0.333481192424111018E-18  0.110073055806264995E-17 -0.165744961785494992E-17
+ -0.723664504377990964E-18 -0.373996400265941977E-18 -0.369427622167534017E-19
+  0.137017753030935008E-17  0.219171077410990982E-17 -0.120449150112210007E-17
+ -0.380275891434780971E-17  0.355095637040657016E-17 -0.545627094050522035E-18
+  0.120185446207787991E-17  0.297192801651785003E-17  0.587392766169290955E-18
+  0.135525271560688006E-18 -0.279859685772820986E-17 -0.256481576428601989E-17
+  0.100246349307546002E-17 -0.813151629364128037E-18 -0.449943901581484032E-17
+ -0.105709711817336990E-17  0.673391193067168962E-18  0.214299335655337986E-17
+ -0.399799551104030001E-18 -0.399799551104030001E-18  0.214299335655337986E-17
+  0.815057453495450983E-18  0.164519606391896997E-18 -0.132860922807655002E-17
+  0.338956115711568991E-17  0.276652871974074985E-17  0.209062470706900983E-17
+  0.640476975732763991E-19  0.210311470833367019E-17  0.292498933315886017E-18
+ -0.266081026581987999E-18 -0.339530842093640994E-18  0.891473360161341033E-18
+  0.127055950214517004E-17  0.864474929349381958E-18 -0.173876150030865007E-18
+ -0.268677531257492988E-17  0.262439820419368005E-17 -0.394640487662082006E-17
+  0.239815370840882017E-17 -0.692232213035683983E-18  0.895380620082258923E-18
+  0.271301013410857011E-17 -0.270990856870302012E-18 -0.462501914737739018E-18
+ -0.282121724895181004E-17 -0.224758925933616008E-17 -0.213500811600352005E-17
+ -0.540097564326268985E-19 -0.571747239396653018E-18 -0.201000918383445008E-17
+ -0.203393786459438984E-17  0.320178454062126010E-18 -0.899548989984066960E-18
+ -0.643639160794860989E-18 -0.373628383703464999E-18  0.290045029488884995E-17
+  0.443262929210015969E-17 -0.124590605607226994E-17  0.105385457017215992E-17
+  0.250840866395480999E-17 -0.133195930955738998E-17  0.189142457121884996E-17
+ -0.166018457661843011E-18  0.441388868814215984E-17  0.256481576428601989E-17
+ -0.336229728412594982E-17  0.516690097825123026E-19  0.324032453972133018E-17
+  0.344954167769314006E-18 -0.224103742019807013E-17 -0.501443504774545993E-18
+ -0.975781955236953991E-18  0.473322010925703029E-17 -0.283733538017529017E-17
+ -0.116551733542191996E-17  0.253093444639585018E-17  0.203287907341032009E-18
+ -0.279870273684662012E-17 -0.303322498411764990E-17 -0.289685267960971005E-18
+  0.198205709657505990E-18 -0.884990779140575957E-18 -0.239498565836152988E-18
+ -0.148604651406377009E-17 -0.211101786279453002E-17 -0.360401931144864038E-17
+ -0.530348504099598992E-18  0.888749319906575016E-18 -0.337449898656761984E-17
+  0.172225982869958989E-19  0.542094468797851985E-18  0.101984131697427992E-17
+  0.152297909443849011E-18  0.134546096510577998E-18  0.167670171908989002E-17
+ -0.190413006542766986E-17  0.281780730027413984E-17 -0.127285229170680006E-17
+ -0.171465938303872008E-17  0.128071381624849999E-17 -0.709098925749857965E-18
+  0.231568881611974999E-17  0.245639554703747008E-19 -0.577358832672211987E-18
+  0.380971266001097985E-18  0.689428570783346966E-18 -0.300781399570002012E-17
+ -0.515313669285834969E-18 -0.334866534763107014E-17 -0.885606033578872933E-18
+ -0.755712621218751975E-18  0.215419523463735996E-17  0.672544160119913988E-18
+ -0.336399135002044994E-17 -0.127245524501276999E-17  0.482123254002297968E-18
+  0.163985578588433008E-17 -0.132475952950572997E-17 -0.288668828424266018E-17
+ -0.171100655345368996E-17  0.948676900924816019E-19 -0.311083437790982016E-17
+ -0.542101086242751976E-19 -0.274438674910393021E-18 -0.137908147745226995E-17
+ -0.582758667710959018E-18 -0.948676900924816019E-19  0.260526158751741003E-17
+  0.896160858195049951E-18  0.346055310600744018E-17  0.406628754241267990E-18
+ -0.158687652201152991E-17  0.481114714040442978E-18 -0.347283508374262985E-17
+  0.670850094225405965E-18 -0.124582664673347005E-17 -0.243945488809237997E-17
+  0.605120337518472008E-17  0.677626357803439982E-18  0.306668278553419014E-17
+  0.286720652645581002E-18 -0.311708124589583006E-18 -0.779270311473956023E-18
+ -0.112231865511195007E-17  0.422245924206269008E-18  0.785411300341549989E-18
+ -0.177876918923402994E-17 -0.271008191474013008E-17 -0.191577676845240994E-17
+  0.192117660349115996E-17 -0.172907527995913996E-18 -0.196332952071170988E-17
+ -0.295853553788778990E-17  0.297413657783084012E-17  0.238766345557921019E-17
+ -0.258331152278270982E-17  0.434993770062446016E-17 -0.391244518336761018E-17
+  0.249172443099960983E-18  0.295345913046855004E-17  0.180478236513759987E-17
+ -0.333194968181260020E-18 -0.398801640413046019E-17  0.127766979159431005E-17
+  0.406976831843030025E-17  0.125052834133521992E-17  0.584942424528099012E-18
+  0.105310018145351002E-17 -0.248180653545509999E-18 -0.766564817265142048E-19
+ -0.351391618168446033E-17  0.116087933372732994E-17 -0.435374934888709989E-18
+ -0.120702194983737991E-17 -0.184250841851491989E-17  0.169237182861408996E-17
+ -0.340337838206777992E-17 -0.248543289526053016E-17  0.456352235223054964E-18
+  0.241855596312107998E-17 -0.178644542531852011E-17 -0.843300708330460982E-18
+ -0.535150122119347000E-17  0.259308548890062985E-17 -0.842797782518028958E-19
+  0.840044925439451976E-18  0.223701401369860989E-17  0.350290475337015982E-17
+  0.140776875833665005E-17 -0.137781820271733003E-17  0.193970544921234994E-18
+ -0.692872950854017966E-18  0.230816478126796993E-19  0.914583824797830984E-18
+ -0.332036915323686022E-18 -0.124683249835832991E-17  0.221583819001725007E-17
+ -0.271333769763113996E-17 -0.102999206386122998E-17 -0.175844039849992982E-17
+ -0.120617491689012002E-17 -0.683979104907848005E-18 -0.119558700504943992E-17
+  0.169406589450860008E-19 -0.143995601033231009E-18 -0.298933170139291000E-17
+ -0.111914228155973999E-18  0.259477955479513999E-17 -0.505043394800376962E-18
+ -0.261627301583172017E-17 -0.559571140779871969E-18 -0.249122977699329985E-17
+ -0.244248420444131008E-17  0.763575231435010022E-18 -0.498453660450472978E-18
+  0.131165285269981993E-17 -0.409226721750142997E-18 -0.166919670939219005E-18
+  0.356664398265104977E-17  0.123031535588687009E-17  0.140753053032022997E-18
+ -0.174446435487023001E-17 -0.348347593514250969E-17  0.608010837450976981E-19
+ -0.288229430082877002E-18 -0.270444385168497019E-17 -0.511978477056020969E-18
+ -0.180010383159299994E-17 -0.519787062038522031E-18 -0.778333943020545996E-18
+  0.223510818956729003E-17 -0.461209439779967016E-18 -0.254326936369024016E-17
+ -0.754514450101469011E-18 -0.729630168938883045E-18  0.215419524756206011E-17
+  0.423516473627149977E-18  0.118034041199887003E-17  0.526282745952778004E-17
+  0.189749380216831006E-17  0.362191288245938979E-17 -0.373710936328597001E-17
+  0.325260651745650993E-18  0.315943289325853988E-17 -0.894466792300540965E-18
+ -0.152582397536022009E-17 -0.189735380184963011E-18  0.109436656785256008E-17
+ -0.137907918816530007E-17  0.548877349820787005E-18  0.989334482393022989E-18
+  0.359374903696317968E-17  0.372694496791892029E-19  0.346055310600744018E-17
+  0.325207712186447985E-18 -0.166962105304642995E-17 -0.189715527850262003E-18
+  0.189418900882599987E-17 -0.165532075461661994E-17  0.311165494107747979E-18
+ -0.215505034116195009E-17  0.118796370852416008E-17  0.601181634313739955E-18
+ -0.347389387492670004E-17 -0.208708918203459989E-17  0.337288519596662010E-17
+  0.931736241979729988E-18 -0.145689666927739995E-18  0.227220724004029024E-18
+ -0.162143943672644003E-17  0.174742897018562006E-17  0.323481882556416991E-17
+ -0.635274710440724965E-18  0.375235595633655002E-18 -0.145080924035446995E-17
+  0.143389882520053003E-17 -0.235633978014306013E-18 -0.112496563307211995E-17
+  0.293844332080887002E-18 -0.161068608876326000E-19 -0.307600014795399014E-17
+  0.258090939028385005E-17 -0.476763082273924010E-17 -0.228127148519264015E-17
+ -0.309844652105622987E-17  0.174912303608012996E-18  0.531513174402072981E-18
+ -0.382435375685316988E-18  0.900819539404948025E-18  0.326277091282355990E-17
+ -0.264724265796570986E-18 -0.879365783037773085E-18 -0.260547334575423015E-17
+ -0.143360326322790005E-17 -0.504831636563563002E-18  0.179242759550851016E-17
+  0.298155597433513989E-17  0.101982766849417992E-17 -0.297393267780985003E-17
+  0.257498015965306995E-17  0.151788304147971006E-17 -0.326954717640159979E-18
+  0.230392961653170000E-17  0.139591029707509004E-17 -0.742847894742021012E-18
+  0.256650983018053003E-17 -0.153143556863578002E-17  0.409540429997454021E-18
+ -0.142725051612349992E-18 -0.443845264361252992E-18  0.113671821521527009E-17
+  0.263935466364439986E-17  0.180841534238793007E-17 -0.220183567660795011E-17
+ -0.102829799796672003E-17  0.334027442749732992E-17 -0.454009659728304982E-18
+ -0.325260651745650984E-17 -0.522448571488674997E-18 -0.336345389745121001E-17
+ -0.457041606071333975E-17 -0.304355831294945982E-17  0.427237952841651978E-17
+ -0.186941463343081017E-17 -0.129563254076186006E-17 -0.112984180780269004E-17
+  0.463510525760183965E-18  0.543403608029869993E-17  0.628157090539567034E-17
+  0.107863850412424995E-18 -0.496782957561507984E-18  0.143475058212455998E-17
+ -0.157547458689992002E-17 -0.229157071099953982E-17 -0.353871744751775981E-17
+ -0.655427268083501956E-18 -0.373057479213008010E-18 -0.288389363397911984E-17
+  0.267562859761936013E-18  0.221342377338269010E-17  0.759040992641635024E-19
+ -0.427051187586338014E-18 -0.246546273902076005E-17  0.311034949497450000E-17
+ -0.250275745587688010E-17  0.149384832852448007E-17  0.785170068645730031E-18
+ -0.124784693198205006E-18 -0.251252152874503999E-19  0.306800057149170005E-17
+  0.512280225515675992E-18  0.352936752798798985E-18 -0.468641674547420009E-18
+  0.342326765393769015E-17 -0.232885905658585013E-17  0.440296799120129026E-17
+ -0.682831107464595018E-18 -0.159760541806344992E-17 -0.278081237108601981E-17
+  0.216042711637734983E-17  0.166110979752061002E-17  0.117815888393792008E-17
+ -0.169403964681784996E-18  0.249326037070838015E-17 -0.597174026640863979E-18
+ -0.197246651090594008E-18 -0.770422236383328028E-18 -0.405218297824251983E-18
+ -0.384955849310639988E-19 -0.747681707350008986E-18 -0.163334696595593989E-19
+  0.145763242918459997E-17  0.291859303545338004E-17 -0.294086889444445013E-17
+  0.108848259833857008E-17  0.139201626084264003E-17  0.337320625108601995E-20
+  0.120185558006418003E-17  0.379317687846262973E-17 -0.300611688448085986E-17
+ -0.418480598057927000E-18 -0.261770197033990986E-18 -0.125675852095236000E-16
+  0.569849893866608996E-20 -0.467203818658997981E-18 -0.969974113361066045E-18
+ -0.308892401784452019E-18 -0.128391665958031008E-18 -0.149140333571066003E-17
+  0.168559556503606008E-18  0.264274279543341995E-18 -0.103443898683430996E-18
+ -0.166316242682362005E-18  0.493529040673638011E-19  0.366553507924297990E-18
+ -0.965617559869902025E-19 -0.628914596258986970E-18  0.680130388614003019E-19
+  0.250558797806600002E-18 -0.222187329976643995E-18  0.573591561356962000E-19
+  0.337396632102723016E-19 -0.405891570879359990E-17 -0.890046339107058019E-19
+  0.484291087592646026E-18  0.594828887209332028E-18 -0.110802497412702995E-18
+ -0.409245953699384999E-18  0.386593778060743030E-17 -0.846764863036188956E-19
+  0.417633151520367001E-19  0.269234881676822002E-19 -0.653364819565015025E-17
+ -0.108858622973204005E-18 -0.264863232139479010E-19 -0.124010917433949999E-19
+  0.374812079160027996E-18  0.271262301358190021E-18  0.288414718540088996E-17
+ -0.396919639083364985E-17  0.240811466904398010E-17 -0.235051642863068013E-19
+ -0.148738985537855005E-17 -0.355753837846806022E-19  0.407761660808220029E-17
+  0.397766672030618977E-17  0.493311988480904980E-17  0.739565642071410984E-19
+  0.343477958748839018E-14 -0.366744259744019982E-14  0.123993155153046002E-13
+ -0.668060306510329026E-14  0.135525271560687994E-19  0.152465930505773989E-18
+  0.242251422914729993E-18 -0.542312844479565985E-18  0.462585868319255004E-17
+ -0.919879898349006038E-19  0.210064170919066003E-18 -0.643745039913268017E-19
+  0.125157058890702999E-16 -0.582864546829365060E-19  0.246041564481448014E-18
+  0.101296537813243998E-18 -0.238514220907215010E-17 -0.797809496952801045E-19
+ -0.192534559377843010E-18 -0.187360213774079013E-17 -0.389635155736977975E-19
+ -0.200746808499269012E-18 -0.245893664587923003E-17 -0.948676900924816019E-19
+  0.000000000000000000E+00 -0.108420217248549998E-18  0.261375838676955993E-18
+  0.725966792801038968E-19 -0.167646349107347008E-18 -0.805740091075652957E-19
+ -0.511324047944521005E-17 -0.804386616798363005E-20  0.803374354523752020E-19
+  0.233691757936031022E-18  0.322467056021906979E-19  0.692362580416071978E-19
+  0.782403671634307027E-17  0.714617874796812030E-19 -0.205034912794744004E-18
+ -0.370259277068535988E-18 -0.219593291575676978E-18 -0.226587930835426006E-18
+  0.589328467008100031E-17 -0.458175341293122029E-19 -0.555484868553859964E-19
+  0.307568912804366982E-18  0.664220076179670995E-17 -0.838628792230762059E-19
+  0.133440776417054003E-19  0.513182852027898008E-19  0.130496013436366007E-19
+ -0.467879824239593957E-18  0.114281685243550001E-16 -0.521729943861285986E-17
+  0.141818726358787999E-16 -0.217687467444354995E-18  0.338813178901719991E-18
+  0.218534500391609006E-18 -0.564123942871364026E-17  0.155854062294790998E-18
+  0.738612730005749992E-18  0.132137139771670997E-18  0.652215369385811020E-19
+  0.528548559086683026E-18  0.393023287525995020E-18  0.813151629364127964E-19
+  0.287313575708658983E-17  0.687790753170491972E-18  0.106387338175140007E-17
+ -0.813151629364127964E-19 -0.669156028330897037E-17 -0.169406589450859993E-20
+ -0.383705925106198005E-18  0.159877468794249008E-19  0.637946834691517017E-17
+ -0.251992301808153990E-19  0.130597260343342002E-17 -0.397914845927722009E-19
+ -0.771823101021564955E-19 -0.678629956492047021E-19  0.244403275117150019E-18
+  0.616306942007001003E-19  0.578090437734863976E-17  0.519520791676689028E-19
+  0.250034502269737982E-18  0.145212009554413988E-18 -0.175005708853890006E-18
+  0.394009615144800003E-18 -0.110346462401244998E-18  0.222948572015913991E-18
+  0.171997642246803001E-18  0.319375928924822986E-20  0.101563716873418997E-18
+  0.901803069674074007E-19 -0.274728422129961016E-17 -0.421283293404232023E-21
+  0.608978550920113979E-19  0.763010629372050983E-18 -0.286952614910220011E-18
+  0.730908685021870015E-19 -0.451919320602110971E-17  0.589814511873686954E-22
+  0.126157748391557989E-18  0.141953434272505006E-18  0.100040548709848005E-17
+  0.102890605326513008E-20 -0.935924522377375949E-19  0.847890824022396966E-19
+ -0.879092467376866968E-17  0.476821187834545992E-19 -0.753033951901363993E-19
+  0.168555449681110991E-19 -0.100415019031744998E-16  0.329480295536328014E-20
+ -0.430020650141573979E-19 -0.930238733580653041E-19  0.843287605802546953E-19
+ -0.745054355771767074E-17 -0.127373076708930990E-18  0.142668167903681001E-19
+ -0.212738678080100992E-18 -0.124931763730073991E-18  0.141197446866016998E-17
+ -0.623935018560200991E-21 -0.208825078665083006E-19 -0.702882739988749020E-17
+  0.135013553282889001E-18 -0.273662399296958983E-18  0.979870564908593936E-17
+ -0.967980574380733076E-17 -0.586955097351761970E-17  0.928385604696770927E-21
+  0.588030583085907051E-19 -0.303121635649031013E-20 -0.785213004720344977E-17
+ -0.143126772192240001E-21  0.442508069709277023E-17  0.348261833625540021E-18
+  0.403209644538310013E-17  0.158323610013567989E-18 -0.736025309049684025E-19
+  0.252063439340834008E-18  0.327873715300706015E-21  0.308873893618246024E-19
+ -0.782545946699665940E-19 -0.916516118708754965E-20 -0.330581739190081984E-17
+  0.194817577868489006E-18 -0.215146368602591997E-18  0.701978555037001045E-19
+ -0.378514539581815021E-17  0.926442286059390996E-20  0.154159996400282999E-18
+  0.327166475876973987E-18  0.149444776103045990E-18 -0.384315608916437980E-19
+  0.541531313897068035E-18  0.866120139888960971E-19  0.110405247154699008E-18
+  0.129192718002985003E-17 -0.571019320457605944E-19  0.236706004088174008E-19
+ -0.840441972133477969E-17 -0.208661272600177011E-17 -0.128778654135807999E-16
+ -0.241127284462270001E-19 -0.124068820076829006E-18 -0.257368975789749001E-19
+  0.361217365792718986E-18  0.122756911625320001E-18 -0.415100531279882990E-17
+ -0.172979513388722001E-17  0.192235450868344009E-17 -0.817419881324901948E-19
+ -0.542101086242751976E-19 -0.247333620598255988E-18  0.251695840276614999E-17
+ -0.439123055680311038E-17  0.812389299711598993E-17  0.749624158320056041E-19
+ -0.872105122493027937E-17 -0.372694496791892029E-19  0.467901000063275970E-17
+  0.399121924746226012E-17  0.660008072500550978E-17 -0.527542707461818990E-19
+  0.343843876982052995E-14 -0.367179973492087989E-14  0.124130984354223001E-13
+ -0.668730478978197023E-14  0.811118750290718005E-17  0.122819777351874007E-18
+  0.342201310690737000E-18 -0.599540507978433977E-18 -0.469610947825544972E-17
+ -0.214310597913247990E-19  0.372694496791892029E-19 -0.345589442479755020E-18
+  0.437836624391667992E-18  0.205643717725583008E-18 -0.312684304341145982E-18
+ -0.472009937038071041E-21  0.120475133905591992E-18 -0.260656398339187997E-18
+  0.673638442522140059E-19 -0.964679226862538027E-19 -0.287342692466221023E-18
+ -0.221571907600904013E-18 -0.434358495352004983E-17 -0.228529489169210000E-17
+  0.277911509994136018E-17  0.156800809279949005E-22 -0.476438579319618995E-17
+  0.793257935632228010E-19 -0.191429446079471997E-18 -0.792505201274804995E-19
+ -0.761249535594635047E-19 -0.759284593898908991E-19  0.245210185927285981E-17
+  0.326322751652168995E-21 -0.219505881849384995E-18 -0.351708553712615972E-17
+ -0.957527733479134017E-19  0.210332177437534002E-18  0.632850078629209974E-17
+  0.783703999557241003E-17  0.712107216201591027E-17  0.931799314501438023E-19
+ -0.162733723449395010E-18  0.273630519451929002E-18  0.405238470426719007E-18
+  0.290400572395675995E-18  0.309034676849811019E-19 -0.982194259345465061E-17
+ -0.459295178406396010E-17  0.378501304692013982E-19  0.660156303266319966E-19
+ -0.393711501795638995E-18  0.111130722679763993E-17 -0.731836466427715011E-18
+  0.133492392487278003E-17  0.170253622398114007E-18  0.678102813836270969E-17
+  0.116382328043262001E-19  0.652850644096251986E-18  0.135101755087061000E-18
+  0.135525271560687994E-19  0.406575814682063982E-19 -0.688214269644119025E-20
+ -0.338813178901719985E-20  0.291379333855478980E-18 -0.673391193067169010E-19
+  0.575558887659297003E-17 -0.529395592033938013E-20 -0.101643953670516005E-18
+  0.225310763969643986E-18 -0.860585474410368990E-17 -0.355753837846806022E-19
+  0.111384832563939989E-18 -0.619022265765282981E-18  0.659097512082252000E-20
+  0.367135843075535984E-19 -0.148258098918869009E-16 -0.403656384107634971E-20
+  0.301976499779261008E-19  0.189408714322993997E-20  0.272869291952943018E-18
+ -0.557349643847283955E-19  0.426147018319669965E-17  0.254580724327268992E-21
+ -0.869775563299138967E-19 -0.110887275953398993E-18 -0.998341549897226014E-19
+ -0.481248056751729986E-20 -0.185349767128022002E-18 -0.145910104401558996E-18
+ -0.131668031408930009E-17 -0.607341738615819030E-21 -0.624901402953516003E-17
+ -0.116001184478416006E-20 -0.833449305952115017E-20 -0.220151721333313981E-21
+  0.165335936503532999E-18 -0.129960524270914003E-18 -0.396916978071411982E-18
+  0.219508923152721984E-18 -0.827237810600491973E-17  0.341451599951967009E-18
+  0.159674085227868006E-18  0.188315067856254009E-20 -0.847491822467939984E-19
+ -0.455167206907106011E-19 -0.209933309261154010E-17 -0.145404646964087995E-21
+  0.168411398277699010E-18 -0.313218061614544984E-18  0.560594904015392966E-19
+  0.153058648530649989E-18 -0.122794367412262001E-17  0.111914706524383009E-20
+  0.171517425742714004E-18 -0.183906875580670001E-18  0.219862189175055999E-18
+  0.897452583439613029E-17 -0.352102288514519020E-19  0.153068958878854008E-18
+  0.146666444626402991E-18 -0.303260900504276985E-19  0.235125754644082987E-18
+  0.538382462411191964E-21  0.130959794039688006E-19  0.881466470414765984E-17
+ -0.176754757224459993E-18 -0.338370818476285984E-19  0.458932469699724984E-17
+ -0.582300740523848969E-17  0.582615730901109008E-17 -0.949456505418176036E-21
+  0.113959685864603008E-18  0.144076135613940989E-19  0.582626318812949995E-17
+ -0.116571720284226000E-20 -0.141483631752557998E-16  0.100448206267148993E-17
+ -0.113650645697845998E-17 -0.471162076910205015E-20  0.673391193067169010E-19
+  0.104026233834669001E-18  0.535324822664718007E-18  0.223616698075135001E-18
+  0.861263100768173047E-17  0.000000000000000000E+00  0.243945488809238016E-18
+ -0.174488787134385990E-18  0.271050543121375988E-19  0.487890977618476995E-18
+ -0.125527106409534999E-16  0.100472472860477996E-18 -0.955823741417275020E-19
+  0.107162902717469998E-18 -0.101127663721311999E-18  0.305449075076592991E-19
+ -0.416346654315768998E-17 -0.370398036616291987E-19  0.639750339166004974E-19
+ -0.124552860677330003E-16  0.295344837712059013E-19  0.438212164389766970E-18
+  0.101705744062273998E-16 -0.262887759400580013E-17  0.254481784579693981E-17
+  0.126166498411475007E-18 -0.267108613912254992E-18 -0.118145276312759989E-18
+  0.181096822855342012E-18  0.187530220069472992E-18 -0.423863827445876037E-17
+ -0.640878176666766969E-17 -0.881828403118920017E-17 -0.228818836947481010E-19
+  0.146804706393460995E-18 -0.227785688362402982E-18  0.113841228110978004E-17
+  0.179570984817912009E-18 -0.121972744404618998E-17  0.237846851589007997E-17
+ -0.233781093442186990E-18 -0.337119113007212016E-18 -0.477943634444158998E-17
+  0.130072496962737990E-18  0.000000000000000000E+00 -0.406575814682063982E-19
+ -0.595040645446145988E-19  0.148442524006315990E-18  0.293073399749988014E-18
+  0.321449003483006979E-18 -0.327462937408513012E-17 -0.140607469244213991E-17
+  0.440457132572235984E-18 -0.105549829437158004E-18 -0.242918130488448022E-18
+ -0.188834994088198992E-18 -0.239202104304613992E-17  0.119405175783255012E-18
+  0.120003227366130008E-18 -0.403498702803366996E-18  0.243945488809237997E-17
+ -0.745388993583784058E-19 -0.514996031930614991E-17 -0.502876181595488006E-19
+  0.874958564734091035E-19 -0.126038502551440006E-17  0.198205709657505990E-18
+  0.262368455412020012E-18 -0.435120825004534007E-17 -0.847032947254300038E-20
+ -0.933853824347866003E-19 -0.218110983917982006E-19 -0.158951026508189991E-18
+ -0.123216824045899002E-19  0.105479069047138999E-16 -0.221602720078721986E-19
+ -0.112094436116716006E-16  0.552668609373801021E-20  0.600580946092660953E-19
+ -0.150321091163205985E-19 -0.566633402054696966E-18  0.471817359051710964E-20
+ -0.309740174670613979E-18 -0.713982335130080985E-19  0.717113685500036983E-17
+ -0.388401777744873018E-18 -0.159250724383875008E-18  0.102895322840944002E-18
+  0.202123237038557009E-18  0.107599654080898005E-19 -0.115560962453870006E-16
+  0.115264413034391004E-18 -0.323460706732736002E-19  0.217151454407421018E-18
+ -0.162863590805566010E-19 -0.447140234933781975E-19 -0.458895903494596031E-17
+ -0.264017439963143991E-19  0.132772414482112011E-18 -0.946559318556680990E-19
+ -0.602663941971434999E-17  0.533630756770209021E-18 -0.107064964532944006E-17
+ -0.152465930505774001E-19  0.198642461020933989E-18  0.271326821445969012E-18
+  0.764647908913610958E-17 -0.285476573004300971E-19 -0.423615735300657014E-18
+  0.405577242246589994E-17 -0.569841415265331035E-18 -0.202440874393777998E-18
+ -0.138405183581353003E-17  0.741153828847513037E-19  0.440457132572235996E-19
+ -0.145689666927739995E-18 -0.201593841446523018E-17  0.118584612615602002E-19
+  0.100288700954909006E-17 -0.389635155736978012E-18  0.253678923077150013E-17
+  0.463253320355523965E-17 -0.542861329849173998E-18 -0.578182497767161953E-18
+  0.378034402595259001E-17 -0.390167653256308987E-19 -0.809119123877932027E-18
+ -0.637257875959320984E-17 -0.796210970419041958E-19  0.108526096366957005E-19
+ -0.806968288849171933E-17  0.477726582251425006E-18 -0.325260651745650993E-18
+ -0.114409418473740993E-21 -0.136584062744755990E-19  0.204770214998727011E-18
+ -0.769571784227895035E-17  0.308108234563751989E-19  0.136366586621958005E-17
+  0.456872337957895004E-19  0.560083992759655039E-19  0.178381499097059997E-19
+  0.290512448573523984E-18  0.169088952095639994E-18  0.173985861321953990E-18
+ -0.136689941863163006E-18 -0.742890246389383958E-17  0.225469582647254023E-18
+  0.414781446358590005E-18  0.579688173277162020E-20  0.298155597433514008E-18
+  0.166018457661843011E-18 -0.339766090967380983E-18  0.405517023497995986E-19
+  0.152465930505773989E-18  0.265968345437850018E-18 -0.671697127172659976E-18
+ -0.315784470648243990E-18 -0.822315467062235941E-17 -0.257815653320528003E-18
+  0.697955148537542998E-18 -0.406575814682063982E-19 -0.247333620598255988E-18
+  0.582758667710959018E-18 -0.727897763222982996E-17  0.448338738054221020E-20
+ -0.542101086242751976E-19  0.189735380184963011E-18 -0.758941520739853008E-18
+  0.460785923306338999E-18  0.111490711682346993E-18 -0.542101086242751986E-17
+  0.787740640946499026E-18  0.358824332280602981E-18  0.387587718284786979E-17
+  0.202017357920150992E-18 -0.887267012248879966E-19 -0.146410968421885989E-19
+ -0.105032085459533001E-18  0.372694496791892029E-19 -0.189735380184963011E-18
+ -0.355922582691767008E-18 -0.767073037033493925E-17 -0.682708555486965996E-17
+  0.203287907341032009E-18  0.345589442479755020E-18  0.662040951573960991E-17
+ -0.326107684692905992E-19  0.393023287525995020E-18  0.105709711817336990E-17
+  0.792822838630025021E-18 -0.203287907341031991E-19 -0.542101086242751976E-19
+  0.542101086242752024E-18  0.474338450462407997E-18 -0.137219337455196992E-18
+ -0.847032947254299978E-19  0.326173859141910017E-18  0.731219906678634021E-19
+ -0.946472464592363042E-19  0.139237548491848992E-18  0.612680171563180001E-19
+ -0.188808491239547999E-17  0.145513853081274004E-20  0.351754011649361006E-19
+  0.103319340142544008E-16  0.706026828653623967E-19 -0.227702440388566992E-18
+ -0.768408726927613926E-17 -0.144477278643678006E-18  0.338346241908282006E-18
+  0.257484777844331994E-18  0.254997244763346007E-17  0.210498854330963010E-19
+ -0.183381484398103994E-18 -0.755890011070466999E-19  0.802413433023315002E-17
+  0.699252607177727046E-19  0.213594500225259994E-18 -0.235617227606901007E-21
+  0.130534219874883997E-16 -0.939645122611503009E-19  0.181688567186046989E-18
+  0.550571415715294980E-19  0.367633474932048013E-17  0.264951905901144992E-17
+ -0.711507675693612044E-19 -0.796210970419041958E-19 -0.555997720533642962E-19
+  0.694178241860625994E-19 -0.971685756843591012E-18  0.352246592049581004E-19
+ -0.177016651086348013E-19 -0.809573907778713970E-17 -0.165171424714588999E-18
+  0.330342849429177012E-19 -0.540915240116595965E-17 -0.352026892878886990E-17
+ -0.329326409892472010E-17  0.622238343986890020E-19 -0.660685698858354024E-19
+ -0.982558218814988032E-19  0.528548559086683026E-17 -0.313402190484091005E-19
+ -0.866032624180153034E-17  0.952700100629121048E-19  0.000000000000000000E+00
+  0.121972744404619008E-18  0.146367293285543002E-17 -0.155774652955986003E-18
+ -0.406575814682063982E-19  0.127393755267047002E-17  0.142301535138722000E-18
+  0.508219768352579978E-20 -0.738612730005749992E-18 -0.115196480826584991E-18
+ -0.121972744404619008E-18  0.894678550537355034E-20 -0.120066920273297001E-18
+  0.246976278573632984E-18 -0.799387343971245945E-19 -0.190053017540184000E-18
+  0.346884975340074986E-19 -0.175978087053195992E-19 -0.495753329340794005E-18
+ -0.285070427323536989E-18 -0.441968288153793989E-19  0.812601554256781032E-21
+ -0.748406217586133030E-17 -0.742510405052100003E-19  0.347283508374263019E-19
+  0.168771314740419006E-18 -0.142883870289960005E-18  0.396517298433418985E-19
+  0.398084309385840033E-17 -0.227640104574593005E-19 -0.514307817660970006E-19
+  0.349851076995627978E-18  0.116316152503736994E-16 -0.578894079889111008E-19
+  0.347336447933467014E-18  0.295353109518184001E-19 -0.312025761944802984E-18
+  0.281532575843647979E-18  0.110855436971906990E-18  0.133195930955738993E-18
+ -0.630616029230826988E-17 -0.385230584411256025E-17  0.125369147999762005E-18
+ -0.152512769607959997E-18  0.905901737088474059E-17 -0.838562617781756975E-19
+ -0.264274279543341995E-18 -0.777020380207811956E-19  0.542101086242751976E-19
+ -0.542101086242751976E-19 -0.542101086242751976E-19  0.406575814682063982E-19
+  0.840256683676266033E-18  0.551587855251999996E-17  0.158225754547102995E-17
+  0.362371282747230003E-19 -0.216840434497100983E-18 -0.745388993583784058E-19
+  0.400138364282931018E-17  0.162247341249214004E-19  0.161979169294623999E-17
+ -0.595570041038180046E-19  0.894466792300540965E-18  0.284603070277445010E-18
+  0.502798757490152989E-17 -0.613933450636857020E-19 -0.294972606436410006E-18
+  0.599699326656045025E-17  0.148230765769502993E-18  0.350671640163280003E-18
+  0.628498446862691006E-17  0.169406589450860008E-19  0.381164826264434972E-19
+ -0.220228566286117998E-19 -0.199529198637591003E-18  0.108631975485363994E-18
+ -0.129066645337873990E-18  0.144456340634422998E-18  0.530040792911729013E-19
+  0.403618230401881005E-18  0.737014865216481058E-17  0.110059844318994989E-18
+ -0.454074179816084022E-18  0.352795095513764987E-17  0.327364999223985992E-19
+  0.128494236353987006E-18 -0.206400753422191982E-17  0.908972231522271051E-19
+ -0.196988099795828006E-18 -0.510866746312750012E-19 -0.832093403647102925E-17
+  0.508219768352579978E-20 -0.156595216123638992E-18  0.304349525860310983E-18
+ -0.762382592088074007E-17  0.127054942088145006E-19 -0.180629776001980005E-18
+ -0.293741761684931004E-18  0.590206271074140996E-17 -0.421489881126395011E-19
+ -0.346224717190195023E-18 -0.299214388617581980E-18 -0.100318347108062999E-16
+  0.568359107607636009E-18  0.107471540347625997E-16 -0.118584612615602002E-19
+ -0.106064406863998999E-18 -0.429267033245619011E-18 -0.243921335135351988E-17
+ -0.434104385467828984E-19  0.210838411972415998E-18 -0.518765659418141033E-17
+ -0.190582413132218010E-18 -0.912678000666508976E-19  0.540618778585057037E-17
+  0.694567016748526037E-19  0.345589442479755020E-18 -0.406575814682063982E-19
+  0.202271467804326998E-17 -0.711507675693612044E-19  0.118923425794503999E-17
+ -0.203287907341031991E-19  0.797629750605692964E-17  0.188464830764082006E-19
+ -0.451061588025165991E-18 -0.189821406968669007E-18 -0.400002748021503026E-17
+  0.124839752407727995E-18  0.327464209198704022E-18  0.545653123870147032E-17
+ -0.251365764192934006E-18 -0.179010689506355000E-18 -0.904977154343205932E-18
+ -0.183495918530850001E-18 -0.399801975895928995E-18  0.278820123303884006E-23
+  0.562694574772872009E-18  0.202758511748997993E-19  0.854656243779589042E-18
+ -0.390636044278167019E-19 -0.107188002479181997E-17 -0.245956339028960991E-20
+ -0.580863803722753026E-19 -0.791746263062787051E-20 -0.574261868458814053E-19
+  0.236017789818529992E-18  0.373936922071946989E-18  0.518031784778684025E-18
+  0.245470727140724987E-17 -0.691522992094331016E-19 -0.285741270800317976E-18
+  0.316578564036294992E-19  0.110114283143058996E-18 -0.169406589450859996E-18
+ -0.754330485133237969E-17  0.942324153820408977E-20  0.281214938488428001E-18
+ -0.111808349037568006E-18  0.279944389067546021E-18 -0.269065309651248993E-19
+  0.764138861964645979E-17 -0.310440883891150985E-19  0.204959664400601989E-18
+  0.515554994382785973E-19  0.579622776531413976E-17 -0.229418597622752010E-19
+  0.920281259736054051E-17 -0.399130950750337972E-23 -0.120874248551148997E-18
+ -0.174419097167777988E-18 -0.868654828080977068E-18 -0.900415783177556089E-21
+  0.404465583235387984E-19 -0.681480957419390977E-17  0.149009478768038989E-19
+  0.338276207821614990E-18 -0.482449426342788030E-17  0.360144848996880021E-20
+ -0.141509621152472008E-18 -0.132682285823646993E-18 -0.142110385631281993E-16
+  0.189967376932361008E-22  0.110892271134246991E-18  0.633346199603779993E-19
+ -0.300611992980550998E-17 -0.269779993700494995E-18  0.391392087358040979E-18
+  0.589549975636383965E-19  0.134263796311126002E-16 -0.559058288800089044E-19
+ -0.227134697220322979E-18  0.286999495230071982E-17 -0.530824960132428959E-18
+ -0.419810704482912991E-18 -0.874646221334790992E-17 -0.519739416435238996E-17
+ -0.217856874033806009E-17 -0.364476457406177966E-23  0.131634213959238997E-18
+ -0.338019085513668988E-19  0.242463181151542991E-19  0.205405489709168013E-19
+  0.396478007354323017E-19  0.290554014399305015E-18 -0.223490056723352997E-17
+ -0.582914177666118979E-20 -0.155517709978212000E-20  0.518526149271776997E-17
+  0.184335545146216991E-18  0.465497544075441045E-18  0.198713929425859014E-17
+  0.592923063078010012E-20 -0.474073752666390980E-19 -0.119365471113852009E-18
+ -0.367304091612129012E-17  0.149091033606558009E-19 -0.402266203690663009E-19
+ -0.295418456786575992E-18 -0.928032954377330083E-18  0.123415347392912000E-19
+ -0.403637669146276001E-18 -0.193017632855574003E-18  0.642437432800943969E-17
+ -0.115514118181804992E-18  0.261563774112127983E-17 -0.575982404132923990E-19
+ -0.514996031930614991E-18 -0.508219768352580011E-18 -0.113336184716178005E-16
+ -0.258810717670098996E-20 -0.318484388167617024E-18  0.813151629364127964E-19
+  0.115196480826585000E-17 -0.162630325872826002E-18 -0.178829830989063991E-18
+ -0.260208521396520987E-17 -0.135525271560687994E-19  0.731095312598867980E-19
+ -0.169050570915216994E-17  0.107679063419703003E-18 -0.465868120989865006E-19
+  0.981367078732912033E-20  0.423516473627149989E-19 -0.982558218814988032E-19
+ -0.182959116606928994E-18 -0.101988060805338004E-18 -0.209992702514142018E-17
+  0.590276085117841030E-19  0.271050543121375988E-19  0.162630325872826002E-18
+ -0.382181265801140005E-17  0.383811804224604982E-21  0.423516473627149977E-18
+ -0.149077798716756994E-17 -0.279182059415017007E-17  0.592923063078009997E-19
+ -0.338813178901719985E-20  0.308319992800564986E-18  0.150771864611265003E-18
+ -0.740624433255478979E-19 -0.387094056895215014E-18  0.676501392170368033E-19
+ -0.121549227930992002E-18  0.295402740354937003E-19 -0.291416832279092011E-17
+ -0.460872725571868987E-19  0.758375729200866953E-19  0.521371940092172967E-20
+ -0.406504324951388992E-19 -0.651424377925056947E-19 -0.108738516348261003E-17
+ -0.957544277091384987E-20 -0.357024387267687978E-18 -0.120808074102144995E-18
+ -0.257471546185705991E-18  0.344363560811950999E-18  0.515776973146925997E-17
+  0.290371414279082971E-19  0.268688528882230986E-18  0.191512370935880005E-18
+  0.454950577675083974E-17  0.264846688527228016E-18 -0.236094717615498001E-18
+ -0.111563503576251997E-18  0.117314063194721006E-18 -0.277085652870562998E-18
+ -0.285873619698326003E-19  0.390852765598656020E-18  0.429106891079029014E-17
+ -0.138235776991902008E-17  0.236143521271638010E-19  0.298686647386773004E-19
+  0.674238226014422974E-18  0.847032947254300038E-20  0.176182853028893989E-18
+ -0.202096767258955992E-19 -0.745388993583784058E-19  0.325260651745650993E-18
+  0.119262238973405006E-17  0.307472959853311023E-18 -0.216840434497101002E-17
+ -0.154498809579183997E-17 -0.527193306371076993E-17  0.337489689921634990E-19
+  0.389635155736978012E-18  0.267662411332359003E-18  0.125360876193636007E-17
+  0.162317651601281012E-19 -0.846415539645091067E-17  0.115143541267381001E-20
+ -0.813151629364127964E-19 -0.243945488809238016E-18 -0.454009659728304982E-17
+ -0.104290931630686000E-19  0.338813178901719991E-18  0.623416249179164953E-18
+  0.313063377305188982E-17  0.220228566286117998E-19  0.609863722023096003E-19
+  0.155854062294790998E-18 -0.355753837846805998E-18  0.374388562686400991E-18
+ -0.416740210049116008E-18 -0.124831480601602994E-18  0.223034362923898006E-18
+ -0.571879588294660973E-19 -0.533562697900371999E-17 -0.143723355214125000E-18
+  0.107285742269608005E-18 -0.225737175575414976E-18 -0.854858075849051981E-19
+  0.293756650935957019E-19 -0.710708536503824959E-17 -0.797732982746140040E-20
+  0.214140516977727985E-19 -0.228315083954436005E-18 -0.165965518102638991E-18
+ -0.213240544471269990E-18  0.618450518525887022E-17  0.116043513773839002E-18
+ -0.139892785194967992E-18 -0.506313944221258028E-18  0.133937084784586000E-18
+  0.204346698525099987E-19  0.536807130322413020E-19  0.124523769413732997E-19
+  0.148952067263648992E-18 -0.301517259442928983E-18 -0.127393755267047002E-17
+  0.398105485209521015E-18  0.211419423634673018E-17  0.340168431617327016E-17
+ -0.430292737205185006E-18 -0.193123511973979996E-18 -0.107814853389060005E-16
+  0.631039545704453993E-19  0.230392961653169981E-18 -0.135525271560687994E-19
+  0.976205471710581020E-19 -0.194817577868488988E-19 -0.813151629364127964E-19
+ -0.117102304957906998E-18 -0.428768077900126986E-17 -0.606475590234079043E-18
+  0.101643953670515996E-19 -0.106005024343306003E-18  0.744917500634628952E-19
+  0.810409525633515039E-20  0.448588648865876997E-17  0.121760986167806011E-18
+  0.375955573638820974E-17  0.914795583034644005E-19 -0.891027874206852923E-17
+ -0.104166389249707997E-18  0.901362169886782956E-19 -0.633570718378866017E-19
+  0.112392829688519002E-18  0.194071254160812988E-18  0.147344671802759991E-17
+  0.697367299791314956E-19  0.211231296913984995E-19  0.479296835202547044E-18
+ -0.566753257617398987E-19  0.323987061890757993E-18  0.259933235688662995E-19
+  0.986793383551260015E-19 -0.179676863936318002E-18 -0.919808297546715955E-19
+ -0.606703892083144040E-19  0.386989004957421020E-18  0.314651646799352012E-17
+  0.101994729949027002E-18  0.684812902965300976E-18 -0.806073444862511971E-18
+  0.110753693756562992E-18 -0.925118797079306001E-20  0.301637366067872000E-17
+  0.452103835596982987E-19 -0.753329927464293003E-19 -0.465974000108271974E-18
+  0.212139401639839982E-17  0.984675801183124024E-19 -0.342677766723567977E-18
+ -0.147701370177469013E-19  0.162418567636011994E-18  0.148230765769503011E-19
+ -0.840150804557859028E-19  0.448834817816173048E-18 -0.330239617288731015E-17
+ -0.107028899458235995E-19 -0.720116040930874954E-19  0.171179072713674006E-18
+  0.947754072384057926E-17 -0.104498004108727002E-16 -0.159287360851361995E-18
+ -0.546449624875564009E-18 -0.169016160201734991E-18 -0.141622419855816012E-18
+  0.105248409733328002E-16 -0.382470559941918998E-19 -0.297665499382925019E-18
+  0.962848986359687018E-18 -0.178557947061475997E-18  0.440432446400829996E-19
+  0.263750093250461983E-17  0.518265773643397017E-17 -0.455663906451196969E-17
+ -0.368948450555535963E-20 -0.373638438713888985E-18  0.213542219055679009E-18
+ -0.448714132358229986E-17 -0.219963878966174016E-20  0.576077695339490031E-17
+  0.158077523781333993E-18 -0.930746933967116952E-18 -0.950560029288081993E-19
+  0.423162853915283991E-19  0.142072256667234998E-18  0.287333593479481977E-19
+  0.170800388783012010E-19 -0.143679075460453001E-16 -0.306660014178995024E-19
+ -0.150184396739703993E-18 -0.853679844308174942E-19 -0.189294334590933006E-18
+ -0.272474283524034976E-18  0.125462270442160009E-18 -0.751674786718394965E-19
+  0.306550050694755012E-17 -0.309179488361727017E-21 -0.225680824731844995E-17
+  0.535668041239238007E-21  0.109157572264066003E-20 -0.602222147500702957E-21
+ -0.399748891870335016E-18  0.651654608810402999E-19 -0.115821846718304011E-18
+  0.728333770199589983E-19 -0.105682434112270003E-16  0.785310085019301966E-19
+ -0.113602115086027998E-18 -0.212083351005076997E-19 -0.204710795319530007E-18
+ -0.245404952064859993E-18 -0.800663773371987988E-17 -0.216121373216464000E-19
+ -0.990499766618607977E-19  0.114815399364617001E-19 -0.876268280833858963E-19
+ -0.244044218853604000E-18 -0.920838732195320949E-17 -0.554290201001359983E-20
+ -0.815213370858219041E-19 -0.323977921931644982E-19  0.244566550410973989E-17
+  0.922736264539544005E-17 -0.225107239567582988E-18  0.146285264029161001E-20
+ -0.131179966443482995E-19  0.227861500072650001E-18  0.104859413987007006E-16
+ -0.325575790958343987E-21 -0.128914520652134002E-20  0.712380445010489047E-17
+ -0.689612758111431019E-19 -0.518476950651943026E-19 -0.987352703162770031E-18
+ -0.651557083079744989E-17  0.618057019281626989E-17 -0.256050918324738993E-22
+ -0.579315073000275023E-20  0.116825754961274992E-20  0.488482179479582027E-17
+ -0.112839648754171999E-20 -0.171478222420645004E-17  0.121003483029333003E-16
+  0.281361349456850020E-19  0.196479900706991004E-18  0.446695067007620992E-17
+  0.296023125814352014E-19 -0.525838715399959994E-19  0.134402804013066001E-16
+ -0.533630756770208973E-19 -0.135525271560687994E-19 -0.307388256558586017E-17
+  0.418095462764723033E-17  0.325938278103455002E-17 -0.804756263134598058E-22
+  0.212102343948397000E-18  0.553747789267498967E-19  0.178660424399612991E-17
+ -0.110855436971906990E-18 -0.115760817367797994E-16 -0.210618278531899992E-19
+ -0.934068891307130031E-19  0.132841484063259997E-18  0.281252988796605007E-18
+ -0.193943299659809005E-18 -0.345377684242941011E-18 -0.582891016608967021E-18
+ -0.584484497340988962E-17 -0.130601892554772000E-18 -0.345854140275771989E-18
+  0.152100316675026008E-18 -0.102914503091396998E-18  0.114150924532318007E-21
+ -0.415920887652381993E-18 -0.147870942203041992E-18 -0.441264460850088028E-18
+ -0.741908217566160965E-18  0.214934610365779012E-19  0.377141419764976985E-18
+  0.129262521706926995E-17 -0.169406589450860008E-19  0.576660030490727979E-17
+  0.296461531539004998E-19  0.508219768352580011E-18  0.572594272343907029E-18
+  0.357553782859722012E-18 -0.169662498452868990E-21 -0.162630325872826002E-18
+ -0.745388993583784058E-19 -0.197866896478604983E-17  0.162630325872826002E-18
+ -0.582335151237331002E-19  0.208708918203459989E-17 -0.508219768352580023E-19
+  0.100691041604855001E-18 -0.138156367653096994E-17  0.225522522206456984E-18
+  0.317637355220363012E-18  0.468614360623540998E-19 -0.230392961653169981E-18
+ -0.332036915323686022E-18 -0.677626357803439970E-20  0.210729224131558998E-18
+  0.668009886874144024E-17 -0.181651509494605009E-17 -0.511607900141597020E-17
+  0.677626357803439970E-20  0.169406589450859991E-17 -0.870194004405784926E-21
+ -0.119153712877039012E-18 -0.291379333855478980E-18  0.481961746987696989E-18
+  0.290320542671411008E-18  0.758687410855676968E-17 -0.385399991000707015E-19
+ -0.444692297308507979E-20  0.647980204649540000E-19  0.108234928791338999E-18
+  0.201157090083095001E-18 -0.197605409177339990E-18  0.198550747370517010E-19
+ -0.101671722281198999E-18  0.310643151866217981E-18 -0.125183016637982994E-16
+ -0.145837243138382015E-19  0.171651875903994001E-19 -0.615441504747079020E-17
+  0.123241691163063998E-18 -0.110059088224217001E-18 -0.109498990014289995E-17
+ -0.483878427864559008E-19  0.347403720981722015E-18 -0.340587449003903991E-18
+  0.741306893449987060E-17 -0.363633250169256998E-19  0.364955640550089989E-18
+  0.328622055261124998E-20  0.494646450528802992E-17  0.588947426258927007E-19
+  0.299974477127637992E-18  0.552209517671485973E-18  0.121661835129705993E-16
+ -0.100268920998511004E-18  0.115831755537026005E-18 -0.310225816931887018E-18
+  0.122421671866663997E-16  0.579370535921941047E-18 -0.727770708280895005E-17
+ -0.135525271560687994E-19  0.698851812321550961E-19  0.225434841061526989E-18
+ -0.346550460915418993E-17  0.144465439621161009E-18 -0.453050130217742988E-18
+  0.304348202371331014E-17  0.166441974135469993E-18 -0.925383494875323000E-19
+ -0.872359232377204073E-17 -0.423516473627150019E-20  0.172794721239877004E-18
+ -0.105032085459533001E-18 -0.633919457725117989E-17  0.457397791517322003E-19
+  0.704731412115577978E-18 -0.674238226014422974E-18 -0.404116110412576017E-17
+ -0.362753453114926982E-18 -0.475570224499606972E-17 -0.176475368004182995E-18
+  0.178340140066433007E-19 -0.163225895913863989E-18  0.482835275563946972E-19
+  0.399079748874743998E-20  0.367605681663466019E-18  0.163914110183507994E-19
+ -0.579031722743040020E-17 -0.228529489169210000E-17  0.277911509994136018E-17
+  0.156800809279949005E-22 -0.101942532784422997E-16 -0.200111533788828004E-19
+  0.681861522539712014E-19 -0.438868945796134002E-19 -0.998186762513412040E-19
+ -0.723376683507983022E-19  0.104239077746036003E-16 -0.222340978775424991E-19
+ -0.146386980184122005E-18  0.409358532980754034E-17  0.223196490323958014E-18
+  0.222899946074357995E-18  0.351381030256605971E-17 -0.227693044133796995E-17
+  0.248805340344110007E-17  0.166759611490690003E-20  0.826704156520197035E-18
+  0.189735380184963011E-18 -0.925912890467357058E-19  0.229545928705915007E-18
+  0.121727104849915995E-16 -0.743059652978834972E-17 -0.681480357713447030E-17
+  0.102237042169716995E-18  0.239339747158542999E-18  0.388946941467333989E-18
+  0.338813178901719991E-18 -0.921571846612678961E-18  0.655264687995926973E-17
+  0.770799982001412947E-19  0.954817889792410057E-18  0.202589700059119992E-19
+ -0.249620609555842002E-17 -0.114772964352958009E-18 -0.135525271560688006E-18
+  0.338813178901720015E-19 -0.451045044412914991E-19  0.101643953670516005E-18
+  0.627651413915437033E-18  0.282061971435682013E-18  0.234119906621088999E-17
+  0.834327453045485954E-19  0.216840434497100983E-18  0.237169225231203999E-18
+ -0.646794358523383970E-17 -0.677626357803440031E-19  0.243521972335611010E-18
+ -0.633315946750199974E-18  0.176182853028893989E-18  0.211758236813574988E-18
+ -0.549982711273340999E-17 -0.421382929258131981E-19 -0.755877126763073964E-19
+ -0.927923781985524985E-20  0.544312695782081001E-18  0.417435145161237000E-19
+ -0.461341442457651995E-17  0.536108318836156013E-19  0.197084421782326988E-18
+ -0.190691339550880991E-18 -0.193154897758039006E-18 -0.127537878015355010E-18
+  0.298736053454965990E-18  0.127528573266872008E-18  0.609637342641941969E-17
+  0.396854887229514001E-20 -0.817053422952606007E-17 -0.752305314909567013E-21
+ -0.193634152214976995E-19  0.401511567862913983E-22  0.140692652691434998E-18
+  0.125895048470979998E-19 -0.964089261394241035E-19  0.637270598203130045E-19
+  0.357904713618655020E-17 -0.192415561086064008E-18 -0.335339833979068015E-18
+ -0.144439750170145000E-20  0.293628202065289017E-18 -0.755801452661721952E-19
+  0.339411342868876015E-17 -0.400752487403498007E-20  0.763312882702686022E-19
+  0.247337834049500987E-18  0.791484376388382995E-19  0.117214281504105993E-18
+  0.710182978617857932E-17  0.342936365526693036E-20  0.555760486582942945E-19
+ -0.781423606007173046E-21  0.298447611490521017E-17 -0.609211529548263984E-17
+  0.371005047806836004E-19  0.187569128895034009E-18 -0.189293009203637990E-18
+ -0.886637171818198977E-19  0.294046943769591991E-17  0.259566158502791993E-21
+  0.291867570968205015E-20  0.719829617311043049E-17  0.113028268052886998E-19
+  0.773855520595252012E-19 -0.378454053841608976E-17 -0.480794137786766970E-17
+ -0.101370917403036996E-16  0.115757992860620994E-21 -0.592809693679449974E-20
+  0.167889785464364989E-18  0.611388863593793994E-17  0.327757493780598990E-21
+  0.452478537105359005E-17  0.863084305453742042E-17  0.186191808557267986E-17
+  0.626132329032225955E-19  0.646801472276652056E-19  0.138210713419341001E-18
+  0.227784990428761008E-19 -0.145303063388947988E-19  0.114819286467261003E-18
+ -0.105892353296588006E-18 -0.137844024253797010E-17  0.171947688292622993E-18
+  0.230392961653169981E-18 -0.409752188234268030E-19 -0.101232067356301001E-16
+  0.159844381569747008E-19 -0.165171424714588999E-18  0.324837135272023987E-18
+  0.247031279622018984E-18  0.105941867811068004E-18  0.165709810725904997E-17
+  0.250228339151884989E-19  0.121222449583768003E-18  0.220343942471932981E-17
+ -0.175540960873553008E-18 -0.443011466303800022E-18 -0.485540461189846004E-17
+ -0.599881968135295963E-17 -0.920317179059558028E-17 -0.319285512590875016E-19
+ -0.109833041734791000E-19 -0.160489582447537996E-18  0.197370257238828010E-18
+  0.281536711746710985E-18  0.502518425980557979E-18  0.685177358743191994E-17
+  0.732594660177182060E-17 -0.814276594997199962E-19 -0.277826806699410993E-18
+ -0.214934610365779000E-18  0.267620059684996000E-17 -0.444713473132189036E-17
+ -0.670701863459635981E-17  0.264697796016969006E-19  0.585130359963270982E-17
+ -0.372694496791892029E-19  0.801293168102568024E-18  0.410641572828885035E-17
+  0.502798757490152989E-17  0.178988649666674003E-18  0.343835745465758998E-14
+ -0.367193526019243979E-14  0.124117838402881993E-13 -0.668735222362701016E-14
+  0.823316024731179930E-17  0.113502414932076005E-18  0.248604170019137005E-18
+ -0.680961550033253983E-18 -0.504233419544565038E-17 -0.217204263911860001E-19
+ -0.210064170919066003E-18 -0.277826806699410993E-18  0.184997289636259991E-18
+  0.124672661923991994E-18  0.142419093985962009E-17  0.318871176989522014E-22
+  0.362796009892000020E-20  0.475428702862992037E-20  0.114993378999595012E-18
+  0.296998248186471012E-18  0.137377009476437004E-17  0.107906108735239008E-20
+  0.344697057730012976E-18  0.233888414833785013E-18 -0.114499755441505997E-18
+  0.343701109734176990E-18  0.239648173231097987E-18  0.662407103109341977E-18
+ -0.377078204440514013E-17 -0.211556892294861001E-20 -0.821331087183475025E-17
+  0.141328819223921000E-20 -0.122986833084708995E-19 -0.578785195122303022E-22
+  0.117628290555894005E-18 -0.482852030302985980E-19  0.152715824767208010E-18
+  0.191404425117076001E-18 -0.265067067422130010E-17 -0.664820294833819956E-19
+  0.731756791723990014E-19  0.711274425411596942E-19 -0.103554248939230999E-18
+  0.528365257801646029E-18  0.607076619005779013E-17 -0.156159844440068998E-18
+  0.131687040427342994E-19 -0.227879910615741999E-18  0.316422725621148005E-18
+  0.120637954277358989E-18  0.119435717232415004E-16  0.515901036912107984E-20
+  0.767837756490412976E-19 -0.195591700332982007E-18  0.102080243701931009E-17
+ -0.582123393000517985E-17 -0.113091510730573003E-19 -0.737094946711110960E-24
+  0.812485213421242952E-19  0.293564361805019011E-18 -0.257901768391306992E-17
+  0.755454438035129986E-21 -0.102984835529547994E-20 -0.299206409188393008E-17
+  0.118414324366844011E-18  0.106238035287570000E-18  0.865029155483104026E-17
+ -0.605451096465977991E-17 -0.908673286122591033E-17 -0.245986784410216017E-21
+ -0.231756677129184016E-18 -0.175645901796425989E-18  0.582748079799118032E-17
+  0.756879467374077992E-22  0.697679309046806012E-17  0.378114738427846991E-17
+  0.185184588830498988E-18  0.240027296595149010E-19  0.612515214688381968E-17
+  0.116178223514405005E-20  0.107214617423943998E-19 -0.748335452127508986E-17
+  0.186841398172223001E-18 -0.219407572781849989E-18 -0.827097614197910050E-17
+  0.295813021938763978E-18  0.167050779066308997E-18  0.000000000000000000E+00
+ -0.185963535048316991E-18 -0.266565349779309023E-18  0.395342870760552980E-17
+  0.710830601573883961E-22  0.714369058868556026E-17  0.340525277702111998E-22
+ -0.382282094159552979E-20  0.149830506514481011E-21  0.883994619572126023E-19
+ -0.758302221231688993E-19  0.109190432258762008E-18 -0.703507417453546983E-20
+ -0.569691861010580966E-17 -0.419320786566922027E-19  0.150383337888478994E-18
+  0.240575431966751987E-21  0.280064166101323972E-19  0.105607927978014003E-18
+  0.524266083943383965E-17  0.504615464422202999E-23  0.269899549811437999E-18
+ -0.169841572392483002E-18  0.110246530172922989E-18  0.595056711102719968E-19
+  0.830106334405030930E-17  0.722278172990987047E-22 -0.260802680060638985E-19
+ -0.210156414482063007E-19 -0.129291638464487995E-17  0.652706764278169958E-22
+  0.409368376430043030E-17  0.000000000000000000E+00  0.231210476845850008E-18
+  0.729891645869777022E-19  0.462908799630395012E-17 -0.770274702091042971E-23
+ -0.714684049245816005E-21  0.462959092211639007E-17 -0.138590428094594993E-19
+ -0.208516916658589005E-19 -0.129291638464487995E-17 -0.149795744801019001E-21
+ -0.553218393675464984E-20 -0.532042569994107024E-20  0.409368376430043030E-17
+  0.000000000000000000E+00 -0.494984878551731986E-20 -0.489690922631392007E-20
+ -0.739443137510889016E-17  0.111000775673736002E-16  0.222679088850805999E-18
+  0.106845003637134005E-18 -0.490050851211016962E-17  0.293770518732017028E-19
+ -0.802142049281503955E-19 -0.297911234386516011E-17  0.270478431405522001E-18
+  0.169875903679790995E-18 -0.867533249122434064E-17 -0.585861028395123963E-17
+  0.499436149037106027E-17 -0.308114995328776013E-22 -0.130599392574966007E-14
+  0.130515766480161008E-14  0.769254928189150031E-19 -0.103944874968919999E-18
+ -0.878111951636581074E-17 -0.350432956441832991E-22  0.633450637144740049E-19
+ -0.820001218346395020E-19 -0.398999557912142013E-21  0.148399850494123999E-19
+  0.909624493601297957E-17  0.302645293357789021E-21 -0.219210703865852003E-18
+  0.308119015134121004E-18 -0.275701318827977019E-18 -0.156380108301380001E-18
+ -0.956776784104850991E-17 -0.220370776999360005E-18  0.316081995140841987E-18
+  0.198833102388086990E-19  0.717056862337457936E-17  0.855578145811417952E-19
+ -0.993211691437939944E-19  0.833220302850867983E-21 -0.217506104734504998E-19
+ -0.778452855386439975E-19  0.409724525050444025E-17 -0.504842952642287961E-21
+ -0.346727880643630006E-18 -0.411653532459296998E-18  0.396489180937604980E-17
+ -0.136437880709918003E-23 -0.124957018075439001E-16  0.302207630763774979E-21
+  0.274303192419421975E-19  0.735547307649483054E-19  0.845852113709716048E-22
+ -0.134921032453032995E-18 -0.229587702135863011E-18 -0.161215647846060990E-19
+ -0.410673872542583016E-17  0.980567686783852952E-19 -0.139877417969035991E-18
+ -0.665224868865051959E-21  0.287046018190561014E-19 -0.154866630141438997E-19
+  0.223870818551690020E-18  0.124778581831986997E-18  0.153250017406549997E-17
+ -0.448632604218867018E-19  0.711507675693612044E-19 -0.677626357803439970E-20
+  0.107945878798088004E-16 -0.300431998479260013E-19  0.433680868994201966E-18
+  0.118313562072480995E-16 -0.637646402693036987E-17  0.271050543121375988E-19
+ -0.101643953670516005E-18  0.677626357803439970E-20  0.813151629364127964E-19
+  0.279520872593919016E-18 -0.745388993583784058E-19 -0.100849860282464999E-19
+ -0.592051421507532003E-19  0.780283607724334012E-19 -0.664858447910062962E-19
+  0.437413849068639009E-19 -0.516544150489818993E-17  0.503227866641761002E-20
+ -0.301136200599181013E-19  0.266177796878770016E-17  0.102904964664959001E-18
+  0.312088382102112999E-18 -0.297903979430933015E-17  0.202876384986286988E-20
+  0.543951385875457012E-19 -0.239374042842221994E-18  0.418443806615245012E-17
+ -0.562255471114576966E-19 -0.295243281097027992E-18 -0.799691946769469941E-19
+ -0.852375263513670959E-17  0.140981303182299992E-19 -0.492761908203678022E-18
+  0.254715677652711988E-18 -0.117430723578436001E-16 -0.309172195626648020E-19
+  0.796210970419041958E-19  0.158818677610181001E-18  0.208899500616592014E-17
+ -0.443506451182351966E-17 -0.105032085459533001E-18  0.115196480826584991E-18
+  0.152282296409787000E-18 -0.133506123685445993E-18  0.772670733674762928E-17
+  0.347283508374263019E-19 -0.532836663382158006E-19  0.582929066917145019E-17
+  0.359989002583077981E-18 -0.155854062294790998E-18 -0.347495266611077003E-18
+  0.220906192643921991E-17  0.498055372985528985E-17 -0.167957369017666995E-18
+  0.338813178901720015E-19 -0.116890546721092989E-18 -0.293751026107791011E-17
+  0.733609941661028944E-19  0.100944390482868002E-16 -0.267894332096603011E-19
+  0.216840434497100983E-18  0.135525271560687994E-19  0.151788304147971006E-17
+ -0.247227741479849002E-19 -0.138913403349704991E-18  0.379470760369926985E-18
+  0.643745039913267969E-18  0.203287907341031991E-19  0.304931861011548002E-19
+ -0.182959116606928994E-18  0.347283508374262995E-18  0.196776341559015009E-18
+ -0.427751638363421972E-19 -0.163649412387490995E-19  0.690331852012255017E-19
+ -0.531407295283666964E-18 -0.422299341979263986E-18 -0.157579974642883004E-19
+  0.192321973960417003E-18  0.223317258693390997E-18  0.423311746425542994E-19
+  0.683894318895060959E-19  0.540270370111050010E-17  0.298115892764110993E-19
+ -0.196617522881404009E-18 -0.811034046995993056E-19  0.129860738725924993E-18
+  0.422987078035115991E-19 -0.456513700878625969E-17 -0.224993126614424009E-19
+ -0.459581548334461984E-18 -0.133004025053625993E-18  0.735806933367970026E-18
+  0.405093507024368993E-18  0.194632289411277007E-18 -0.352809074866117003E-19
+ -0.232722302258119018E-18 -0.601393392550553000E-19 -0.627863172152250006E-19
+ -0.931736241979730072E-20 -0.765971894202063951E-17 -0.399121924746226012E-17
+  0.190077832958559989E-18  0.179066508041831014E-19 -0.574288338238416015E-17
+  0.880914265144471992E-19  0.182959116606928994E-18  0.153140909885616995E-18
+  0.203287907341031991E-19  0.325260651745650993E-18  0.379470760369926985E-18
+ -0.207523072077303992E-18 -0.361852475067037028E-17  0.301543729222530998E-17
+  0.394039727062700980E-17  0.350989277518501004E-19  0.393023287525995020E-18
+  0.272744609015884998E-18 -0.357447903741314974E-17  0.155423101195651015E-19
+ -0.123801276779504007E-16  0.770270586409379010E-19 -0.443845264361252992E-18
+ -0.250721752387272997E-18 -0.393700913883799029E-17 -0.601128694754536031E-19
+  0.103676832743925996E-17 -0.806375365786094077E-17  0.270372916763573015E-17
+  0.271050543121375988E-19 -0.168051336735252995E-17 -0.514996031930614991E-18
+ -0.155854062294790998E-18  0.172794721239877004E-18 -0.387517573368842020E-18
+ -0.207523072077304004E-19  0.390799826039453012E-18 -0.925342135844694988E-19
+  0.106281541774795000E-18  0.967048582329619004E-19 -0.834571884916494946E-17
+ -0.954318272702428028E-19  0.207248448113936002E-18  0.517443990235404036E-17
+ -0.244696568805436984E-18 -0.116783013241462011E-18 -0.578651881405754976E-17
+ -0.731624708190902038E-19 -0.335583865790313019E-18 -0.197623374506268996E-18
+ -0.163191485200382005E-17  0.101432195433701996E-18  0.148336644887909010E-18
+ -0.173324116831910990E-18  0.539199998398406007E-17 -0.169406589450860008E-19
+ -0.844915364886164949E-19 -0.416303458685688009E-19  0.233338717250593997E-17
+  0.273292202581394994E-19 -0.728448334638698051E-19 -0.525160427297666030E-19
+ -0.110323923797504006E-16 -0.193123511973980015E-17 -0.372694496791892029E-19
+  0.115196480826584991E-18  0.242873462735370015E-18  0.910537153843644961E-20
+ -0.105840274005222006E-16 -0.469176843440076996E-20  0.193136746863780989E-18
+ -0.343708102894564009E-17 -0.209217137971811991E-18  0.414199111207353010E-18
+  0.394293836946877000E-18  0.304931861011548002E-19 -0.304931861011547978E-18
+  0.611915129942227994E-19  0.237169225231204005E-19 -0.299849663328021983E-18
+  0.207692478666753981E-17 -0.319490239792480988E-19 -0.133513568310958995E-17
+ -0.374812079160028020E-19 -0.406575814682063982E-19  0.406575814682063982E-19
+ -0.420128341838133016E-18  0.463088794131687014E-19 -0.138913403349704991E-18
+  0.127393755267047002E-17 -0.117906986257798992E-17  0.169406589450859993E-20
+ -0.332036915323686022E-18 -0.227004829864152009E-18 -0.559041745187837983E-19
+ -0.709390093325477015E-19 -0.947618109740748986E-19  0.242489650931144976E-18
+  0.359989002583078005E-19 -0.180312138646758992E-18 -0.229976452627177011E-17
+ -0.157796657189281015E-19  0.349872583691553995E-19  0.723435620126626944E-19
+ -0.457635088955547972E-19 -0.933145550948367955E-19 -0.123819011531837992E-17
+ -0.782909906169189995E-19 -0.184229666027809993E-19 -0.157336369952485999E-18
+  0.346171777630992014E-18  0.105217373916745000E-19 -0.111834818817168995E-17
+ -0.155642304057977988E-19 -0.955956090315283023E-19 -0.117479499317231010E-18
+ -0.362238933849222033E-17  0.115699406639016991E-18 -0.613040095575299991E-19
+  0.294757539477146002E-19  0.720825038113409974E-18  0.197676314065472005E-18
+  0.436963121664811995E-18  0.347865843525499989E-18 -0.652215369385810972E-18
+  0.381842452622238978E-17  0.131349663828520009E-18 -0.153847115333583998E-18
+  0.736833960816516019E-17 -0.262580213648833015E-19 -0.284603070277445010E-18
+ -0.197663079175671012E-19 -0.101643953670516005E-18 -0.298155597433514008E-18
+  0.840256683676266033E-18 -0.957147230397359022E-19 -0.220906192643921991E-17
+  0.447233396150270964E-17  0.315435069557500984E-17  0.371635705607823973E-19
+  0.216840434497100983E-18  0.643745039913267969E-18  0.358125530099118029E-17
+  0.155665051524822986E-19 -0.103592129449201009E-17  0.619922238271740940E-19
+ -0.238439774652085979E-18 -0.407277263841508986E-18 -0.147777239183251985E-16
+ -0.557205404227970984E-19  0.336503690631471986E-18 -0.140650350287169006E-16
+  0.180418017765165996E-18  0.145689666927739995E-18  0.814210420548196008E-18
+ -0.687790753170491972E-18  0.718283939271646976E-18 -0.423516473627150019E-20
+ -0.565295230618739017E-18 -0.740359735459462010E-19 -0.399799551104030001E-18
+  0.152465930505773989E-18 -0.679733946795801001E-17 -0.208960686132527017E-20
+ -0.487083649340625048E-18 -0.251436436436519005E-18 -0.289066536862780991E-19
+  0.571135125743363007E-19 -0.514784273693801031E-18  0.586040920381569047E-19
+ -0.225310763969643986E-18 -0.897854924089557998E-19 -0.374812079160027996E-18
+ -0.124990299279213007E-18 -0.103052145945326007E-17  0.125149117956822995E-18
+ -0.194764638309285997E-18  0.287673564711241993E-18 -0.105034203041900993E-16
+ -0.128113733272213002E-19  0.546336250979023959E-19 -0.391223342513080029E-19
+ -0.282909004382935976E-18 -0.487890977618476995E-18  0.135525271560688006E-18
+  0.948676900924816019E-19 -0.650521303491303046E-18 -0.664073830647370984E-18
+ -0.546505657568474964E-17 -0.508219768352579978E-20 -0.460785923306338999E-17
+ -0.338813178901720015E-19  0.482469966756049011E-17  0.372694496791892029E-19
+  0.184653182501436992E-18  0.528548559086683026E-18  0.178529134292789006E-16
+ -0.885043550762337067E-17  0.203626720519933994E-17  0.109267250195805009E-18
+ -0.243521972335611010E-18 -0.253712837482265010E-19 -0.291379333855478980E-18
+ -0.145689666927739995E-18 -0.109097843606353999E-17  0.118584612615601999E-18
+  0.419895407777638036E-17 -0.436221967835964976E-19  0.166770584104119993E-18
+  0.301564784753128000E-18  0.105479300460810000E-16  0.153246012714406002E-21
+ -0.721442795365178040E-20  0.785191282370076000E-17  0.611660841639171959E-19
+ -0.269408879070079001E-19  0.980216235171613067E-20 -0.642700149301778963E-17
+ -0.852508928293581066E-17  0.697544054984669996E-24  0.250693589472354980E-18
+  0.114838828609231990E-18  0.137684826882562005E-19 -0.881975394890917019E-19
+  0.988901378360807009E-17 -0.584661507095537965E-20 -0.378455001863782989E-18
+  0.568235859210977989E-20 -0.215342517844311983E-20 -0.145015040856607996E-18
+ -0.469036545974539019E-17 -0.514100323766396972E-20 -0.305076832491833999E-18
+ -0.196493615426171004E-18  0.205358492279442992E-18  0.117772935177185998E-18
+  0.205010377842294006E-17  0.202435071204792989E-19 -0.619429451884161010E-19
+  0.515229817082411962E-18  0.798642516943490968E-17 -0.375603283878283021E-18
+ -0.257011107083719992E-18 -0.135643988942254000E-20 -0.402490229080583990E-19
+ -0.873766277585149009E-19  0.487430530707361009E-17  0.786805309075646029E-21
+  0.927211481210010019E-19 -0.946732143119615016E-19 -0.966803394206032053E-17
+  0.210728270163397999E-24 -0.182379220144080012E-17  0.301144665518455994E-21
+  0.598805976239300004E-20  0.413513361367576011E-19  0.379595377543990018E-21
+ -0.148122463346125989E-18 -0.690407715361336988E-19 -0.320930310026529998E-19
+ -0.913936660975153941E-17  0.247663640323067001E-18  0.222959731044695013E-18
+ -0.869297579538561051E-22  0.634741170484694975E-20 -0.391093459440610985E-21
+ -0.235888633395294979E-18 -0.133873622544535990E-18 -0.597176764414821015E-17
+ -0.972390876617005020E-20
diff --git a/EXAMPLE/cg20.cua b/EXAMPLE/cg20.cua
new file mode 100644
index 0000000..ea07dba
--- /dev/null
+++ b/EXAMPLE/cg20.cua
@@ -0,0 +1,918 @@
+complex g20, symm. permuted by SYMMMD                                   sym     
+           914            26           120           768             0
+CUA                      400           400          1920             0
+(16I5)          (16I5)          (5E15.8)            (5E15.8)            
+    1    6   11   16   21   26   31   36   41   46   51   56   61   66   71   76
+   81   86   91   96  101  106  111  116  121  126  131  136  141  146  151  156
+  161  166  171  176  181  186  191  196  201  206  210  214  218  223  228  231
+  235  239  243  248  253  257  262  266  271  276  281  286  291  295  300  304
+  309  313  318  323  328  332  337  342  347  352  357  362  367  372  377  382
+  387  392  397  402  407  412  417  422  427  432  437  442  447  452  457  462
+  467  472  477  482  487  492  497  501  505  510  514  519  524  529  534  539
+  544  549  554  559  564  568  572  576  580  583  588  593  597  601  606  610
+  614  619  624  629  634  639  644  649  654  659  664  669  674  678  682  686
+  691  696  701  706  710  714  718  723  728  732  737  742  747  752  757  762
+  767  772  777  781  786  791  796  800  805  810  815  820  825  830  835  839
+  844  849  854  859  864  869  874  879  884  889  894  899  904  909  914  919
+  923  928  932  937  941  946  951  956  961  966  971  976  981  986  991  996
+ 1001 1006 1011 1015 1020 1024 1029 1033 1038 1043 1048 1053 1058 1063 1068 1073
+ 1078 1083 1088 1093 1098 1103 1108 1113 1117 1122 1126 1130 1133 1138 1142 1147
+ 1152 1156 1161 1165 1170 1174 1179 1183 1188 1193 1198 1203 1208 1212 1217 1221
+ 1226 1230 1235 1240 1245 1249 1254 1259 1264 1269 1274 1279 1284 1288 1293 1298
+ 1303 1308 1313 1317 1322 1327 1332 1337 1342 1347 1352 1357 1362 1367 1372 1377
+ 1382 1387 1392 1397 1402 1407 1412 1417 1422 1427 1432 1437 1442 1446 1451 1455
+ 1459 1464 1468 1472 1477 1482 1487 1492 1497 1501 1506 1510 1515 1519 1522 1526
+ 1530 1535 1539 1544 1549 1554 1559 1563 1568 1573 1578 1583 1587 1592 1596 1601
+ 1605 1610 1615 1620 1625 1630 1635 1640 1645 1650 1655 1660 1665 1670 1674 1679
+ 1684 1689 1694 1699 1704 1709 1714 1719 1724 1729 1734 1738 1743 1748 1753 1758
+ 1763 1768 1773 1778 1783 1788 1792 1797 1802 1807 1811 1816 1821 1826 1831 1836
+ 1841 1846 1851 1856 1861 1866 1871 1876 1881 1886 1891 1896 1901 1906 1911 1916
+ 1921
+    1    9   32  391  395    2    9  392  395  400    3    8  389  393  394    4
+    8    9  392  394    5    7    8  381  389    6    7    8    9   32    5    6
+    7   33  382    3    4    5    6    8    1    2    4    6    9   10   31  396
+  398  399   11   18   29   30   31   12   18   31  397  399   13   17  386  387
+  388   14   17   18   30  387   15   17  383  388  390   16   17   18  390  397
+   13   14   15   16   17   11   12   14   16   18   19   28   33  380  382   20
+   22   28   29   33   21   22   29   31  398   20   21   22   32  391   23   27
+  379  384  385   24   27   28  379  380   25   27   30  385  387   26   27   28
+   29   30   23   24   25   26   27   19   20   24   26   28   11   20   21   26
+   29   11   14   25   26   30   10   11   12   21   31    1    6   22   32   33
+    7   19   20   32   33   34   74   92  370  372   35   40   70  174  176   36
+   40   91  174  177   37   39   40   70   71   38   39   40   90   91   37   38
+   39   74   92   35   36   37   38   40   41   56   70   71   73   42   44   55
+   56   43   44  175  176   42   43   44   45   44   45   56   70  176   46   54
+   55   56   73   47   49   53   48   49   54   55   47   48   49   52   50   51
+   53   69   50   51   52   68   72   49   51   52   53   54   47   50   52   53
+   46   48   52   54   72   42   46   48   55   41   42   45   46   56   57   59
+   67   68   72   58   59   67   74  372   57   58   59   71   73   60   66   67
+  372  373   61   62   65   69   61   62   66   67   68   63   64   65  374   63
+   64   66  371  373   61   63   65   66   60   62   64   65   66   57   58   60
+   62   67   51   57   62   68   69   50   61   68   69   35   37   41   45   70
+   37   41   59   71   74   51   54   57   72   73   41   46   59   72   73   34
+   39   58   71   74   75   89  369  381  389   76   78   92  370  375   77   78
+   89  369  375   76   77   78   88   90   79   87   91  177  178   80   87   88
+   90   91   81   86   87  173  178   82   86  173  379  380   83   85   86   87
+   88   84   85   86  380  382   83   84   85   89  381   81   82   83   84   86
+   79   80   81   83   87   78   80   83   88   89   75   77   85   88   89   38
+   78   80   90   92   36   38   79   80   91   34   39   76   90   92   93  172
+  385  386  387   94   98  168  169  170   95   98  169  172  386   96   98  109
+  170  171   97   98  109  386  388   94   95   96   97   98   99  108  377  383
+  390  100  108  376  377  378  101  107  109  167  171  102  107  109  383  388
+  103  106  107  167  104  106  108  378  105  106  107  108  383  103  104  105
+  106  101  102  103  105  107   99  100  104  105  108   96   97  101  102  109
+  110  166  173  379  384  111  132  138  162  164  112  132  138  168  169  113
+  131  132  163  164  114  130  131  132  168  115  129  130  168  170  116  123
+  128  129  130  117  121  123  128  118  120  131  163  119  120  121  123  118
+  119  120  122  117  119  121  120  122  123  130  131  116  117  119  122  123
+  124  127  128  129  125  127  167  171  126  127  129  170  171  124  125  126
+  127  116  117  124  128  115  116  124  126  129  114  115  116  122  130  113
+  114  118  122  131  111  112  113  114  132  133  137  138  162  165  134  137
+  165  166  384  135  137  138  169  172  136  137  172  384  385  133  134  135
+  136  137  111  112  133  135  138  139  145  174  175  176  140  145  161  174
+  177  141  143  145  175  142  143  153  160  141  142  143  144  143  144  145
+  160  161  139  140  141  144  145  146  152  153  159  160  147  152  159  162
+  165  148  150  152  153  149  150  163  164  148  149  150  151  150  151  152
+  162  164  146  147  148  151  152  142  146  148  153  154  158  159  160  161
+  155  158  161  177  178  156  158  159  165  166  157  158  166  173  178  154
+  155  156  157  158  146  147  154  156  159  142  144  146  154  160  140  144
+  154  155  161  111  133  147  151  162  113  118  149  163  111  113  149  151
+  164  133  134  147  156  165  110  134  156  157  166  101  103  125  167   94
+  112  114  115  168   94   95  112  135  169   94   96  115  126  170   96  101
+  125  126  171   93   95  135  136  172   81   82  110  157  173   35   36  139
+  140  174   43  139  141  175   35   43   45  139  176   36   79  140  155  177
+   79   81  155  157  178  179  183  201  369  375  180  183  200  201  274  181
+  183  369  389  393  182  183  272  274  393  179  180  181  182  183  184  188
+  370  372  373  185  188  199  371  373  186  188  201  370  375  187  188  199
+  200  201  184  185  186  187  188  189  198  200  274  277  190  198  273  276
+  277  191  197  198  199  200  192  193  196  374  192  193  197  199  371  194
+  195  196  275  194  195  197  198  276  192  194  196  197  191  193  195  196
+  197  189  190  191  195  198  185  187  191  193  199  180  187  189  191  200
+  179  180  186  187  201  202  271  272  393  394  203  207  272  274  277  204
+  207  218  273  277  205  207  267  271  272  206  207  218  266  267  203  204
+  205  206  207  208  217  218  266  268  209  217  265  268  270  210  216  217
+  218  273  211  212  215  275  211  212  216  273  276  213  214  215  269  213
+  214  216  217  270  211  213  215  216  210  212  214  215  216  208  209  210
+  214  217  204  206  208  210  218  219  223  271  392  394  220  223  229  267
+  271  221  223  365  392  400  222  223  229  365  366  219  220  221  222  223
+  224  228  229  266  267  225  228  263  266  268  226  228  229  264  366  227
+  228  262  263  264  224  225  226  227  228  220  222  224  226  229  230  249
+  263  265  268  231  239  248  249  265  232  233  238  269  232  233  239  265
+  270  234  236  237  238  235  236  237  247  234  235  236  234  235  237  239
+  248  232  234  238  239  231  233  237  238  239  240  246  249  262  263  241
+  242  245  247  241  242  246  248  249  243  244  245  261  243  244  246  260
+  262  241  243  245  246  240  242  244  245  246  235  241  247  248  231  237
+  242  247  248  230  231  240  242  249  250  259  264  364  366  251  259  260
+  262  264  252  258  259  364  367  253  254  257  261  253  254  258  259  260
+  255  256  257  363  255  256  258  367  368  253  255  257  258  252  254  256
+  257  258  250  251  252  254  259  244  251  254  260  261  243  253  260  261
+  227  240  244  251  262  225  227  230  240  263  226  227  250  251  264  209
+  230  231  233  265  206  208  224  225  266  205  206  220  224  267  208  209
+  225  230  268  213  232  269  270  209  214  233  269  270  202  205  219  220
+  271  182  202  203  205  272  190  204  210  212  273  180  182  189  203  274
+  194  211  275  276  190  195  212  275  276  189  190  203  204  277  278  283
+  289  395  400  279  283  289  361  362  280  282  283  391  395  281  282  283
+  359  361  280  281  282  396  398  278  279  280  281  283  284  288  289  365
+  400  285  288  364  365  366  286  288  289  360  362  287  288  360  364  367
+  284  285  286  287  288  278  279  284  286  289  290  292  298  396  399  291
+  292  298  355  357  290  291  292  358  359  293  297  377  390  397  294  297
+  298  397  399  295  297  356  376  377  296  297  298  355  356  293  294  295
+  296  297  290  291  294  296  298  299  328  351  355  357  300  308  328  355
+  356  301  307  326  327  302  307  308  326  328  303  306  376  378  304  306
+  307  308  305  306  308  356  376  303  304  305  306  301  302  304  307  300
+  302  304  305  308  309  311  326  328  351  310  311  324  326  327  309  310
+  311  325  353  312  323  325  352  353  313  314  322  350  313  314  323  352
+  354  315  316  320  322  315  316  321  323  325  317  319  324  327  318  319
+  320  317  318  319  321  315  318  320  321  316  319  320  321  324  313  315
+  322  323  312  314  316  322  323  310  317  321  324  325  311  312  316  324
+  325  301  302  309  310  326  301  310  317  327  299  300  302  309  328  329
+  338  349  360  362  330  338  360  367  368  331  337  338  348  349  332  333
+  336  363  332  333  337  338  368  334  335  336  350  334  335  337  348  354
+  332  334  336  337  331  333  335  336  337  329  330  331  333  338  339  347
+  358  359  361  340  347  349  361  362  341  346  347  348  349  342  346  348
+  352  354  343  345  346  347  358  344  345  346  352  353  343  344  345  351
+  357  341  342  343  344  346  339  340  341  343  347  331  335  341  342  348
+  329  331  340  341  349  313  334  350  354  299  309  345  351  353  312  314
+  342  344  352  311  312  344  351  353  314  335  342  350  354  291  296  299
+  300  355  295  296  300  305  356  291  299  345  357  358  292  339  343  357
+  358  281  292  339  359  396  286  287  329  330  360  279  281  339  340  361
+  279  286  329  340  362  255  332  363  368  250  252  285  287  364  221  222
+  284  285  365  222  226  250  285  366  252  256  287  330  367  256  330  333
+  363  368   75   77  179  181  369   34   76  184  186  370   64  185  193  371
+  374   34   58   60  184  372   60   64  184  185  373   63  192  371  374   76
+   77  179  186  375  100  295  303  305  376   99  100  293  295  377  100  104
+  303  378   23   24   82  110  379   19   24   82   84  380    5   75   85  381
+  382    7   19   84  381  382   15   99  102  105  383   23  110  134  136  384
+   23   25   93  136  385   13   93   95   97  386   13   14   25   93  387   13
+   15   97  102  388    3    5   75  181  389   15   16   99  293  390    1   22
+  280  391  398    2    4  219  221  392    3  181  182  202  393    3    4  202
+  219  394    1    2  278  280  395   10  282  290  359  396   12   16  293  294
+  397   10   21  282  391  398   10   12  290  294  399    2  221  278  284  400
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+ 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00 4.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
+-1.00000000E+00 1.00000000E+00-1.00000000E+00 1.00000000E+00-1.00000000E+00
+ 1.00000000E+00-1.00000000E+00 1.00000000E+00 4.00000000E+00 1.00000000E+00
diff --git a/EXAMPLE/dcreate_matrix.c b/EXAMPLE/dcreate_matrix.c
new file mode 100644
index 0000000..77292d7
--- /dev/null
+++ b/EXAMPLE/dcreate_matrix.c
@@ -0,0 +1,230 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Read the matrix from data file
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/* \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ * 
+ * DCREATE_MATRIX reads the matrix from a data file in Harwell-Boeing format,
+ * and distributes it to processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ *
+ * Arguments   
+ * =========      
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format. 
+ *
+ * NRHS  (input) int
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) double**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) double**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID  (input) gridinfo_t*
+ *       The 2D process mesh.
+ * </pre>
+ */
+
+int dcreate_matrix(SuperMatrix *A, int nrhs, double **rhs,
+                   int *ldb, double **x, int *ldx,
+                   FILE *fp, gridinfo_t *grid)
+{
+    SuperMatrix GA;              /* global A, compressed column storage */
+    double   *b_global, *xtrue_global;  /* replicated on all processes */
+    int_t    *rowind, *colptr;	 /* global */
+    double   *nzval;             /* global */
+    double   *nzval_loc;         /* local */
+    int_t    *colind, *rowptr;	 /* local */
+    int_t    m, n, nnz;
+    int_t    m_loc, fst_row, nnz_loc;
+    int_t    m_loc_fst; /* Record m_loc of the first p-1 processors,
+			   when mod(m, p) is not zero. */ 
+    int_t    row, i, j, relpos;
+    int      iam;
+    char     trans[1];
+    int_t      *marker; /* per-row counts, later per-local-row insert cursors */
+
+    iam = grid->iam;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter dcreate_matrix()");
+#endif
+
+    if ( !iam ) {
+        /* Read the matrix stored on disk in Harwell-Boeing format. */
+        dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+
+	/* Broadcast matrix A to the other PEs. */
+	MPI_Bcast( &m,     1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &n,     1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &nnz,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( nzval,  nnz, MPI_DOUBLE, 0, grid->comm );
+	MPI_Bcast( rowind, nnz, mpi_int_t,  0, grid->comm );
+	MPI_Bcast( colptr, n+1, mpi_int_t,  0, grid->comm );
+    } else {
+	/* Receive the dimensions first; they size the arrays below. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid->comm );
+
+	/* Allocate storage for compressed column representation. */
+	dallocateA_dist(n, nnz, &nzval, &rowind, &colptr);
+
+	MPI_Bcast( nzval,   nnz, MPI_DOUBLE, 0, grid->comm );
+	MPI_Bcast( rowind,  nnz, mpi_int_t,  0, grid->comm );
+	MPI_Bcast( colptr,  n+1, mpi_int_t,  0, grid->comm );
+    }
+
+#if 0
+    nzval[0]=0.1;
+#endif
+
+    /* Compute the number of rows to be distributed to local process */
+    m_loc = m / (grid->nprow * grid->npcol); 
+    m_loc_fst = m_loc;
+    /* When m / procs is not an integer */
+    if ((m_loc * grid->nprow * grid->npcol) != m) {
+        /*m_loc = m_loc+1;
+          m_loc_fst = m_loc;*/
+      if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. takes the rest */
+	  m_loc = m - m_loc * (grid->nprow * grid->npcol - 1);
+    }
+
+    /* Create compressed column matrix for GA. */
+    dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
+				SLU_NC, SLU_D, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if ( !(b_global = doubleMalloc_dist(m*nrhs)) )
+        ABORT("Malloc fails for b[]");
+    if ( !(xtrue_global = doubleMalloc_dist(n*nrhs)) )
+        ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+
+    dGenXtrue_dist(n, nrhs, xtrue_global, n);
+    dFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m);
+
+    /*************************************************
+     * Change GA to a local A with NR_loc format     *
+     *************************************************/
+
+    rowptr = (int_t *) intMalloc_dist(m_loc+1);
+    marker = (int_t *) intCalloc_dist(m); /* indexed by row index: m entries */
+
+    /* Get counts of each row of GA */
+    for (i = 0; i < n; ++i)
+      for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]];
+    /* Set up row pointers for the m_loc rows owned by this process */
+    rowptr[0] = 0;
+    fst_row = iam * m_loc_fst;
+    nnz_loc = 0;
+    for (j = 0; j < m_loc; ++j) {
+      row = fst_row + j;
+      rowptr[j+1] = rowptr[j] + marker[row];
+      marker[j] = rowptr[j];  /* reuse marker[] as insert cursor of local row j */
+    }
+    nnz_loc = rowptr[m_loc];
+
+    nzval_loc = (double *) doubleMalloc_dist(nnz_loc);
+    colind = (int_t *) intMalloc_dist(nnz_loc);
+
+    /* Transfer the matrix into the compressed row storage */
+    for (i = 0; i < n; ++i) {
+      for (j = colptr[i]; j < colptr[i+1]; ++j) {
+	row = rowind[j];
+	if ( (row>=fst_row) && (row<fst_row+m_loc) ) { /* row owned locally */
+	  row = row - fst_row;
+	  relpos = marker[row];
+	  colind[relpos] = i;
+	  nzval_loc[relpos] = nzval[j];
+	  ++marker[row];
+	}
+      }
+    }
+
+#if ( DEBUGlevel>=2 )
+    if ( !iam ) dPrint_CompCol_Matrix_dist(&GA);
+#endif   
+
+    /* Destroy GA */
+    Destroy_CompCol_Matrix_dist(&GA);
+
+    /******************************************************/
+    /* Build the outputs: local A, local B and local X    */
+    /******************************************************/
+
+    /* Set up the local A in NR_loc format */
+    dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
+				   nzval_loc, colind, rowptr,
+				   SLU_NR_loc, SLU_D, SLU_GE);
+    
+    /* Get the local B */
+    if ( !((*rhs) = doubleMalloc_dist(m_loc*nrhs)) )
+        ABORT("Malloc fails for rhs[]");
+    for (j =0; j < nrhs; ++j) {
+	for (i = 0; i < m_loc; ++i) {
+	    row = fst_row + i;
+	    (*rhs)[j*m_loc+i] = b_global[j*m+row]; /* b_global has leading dim m */
+	}
+    }
+    *ldb = m_loc;
+
+    /* Set the true X */    
+    *ldx = m_loc;
+    if ( !((*x) = doubleMalloc_dist(*ldx * nrhs)) )
+        ABORT("Malloc fails for x_loc[]");
+
+    /* Get the local part of xtrue_global */
+    for (j = 0; j < nrhs; ++j) {
+      for (i = 0; i < m_loc; ++i)
+	(*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n];
+    }
+
+    SUPERLU_FREE(b_global);
+    SUPERLU_FREE(xtrue_global);
+    SUPERLU_FREE(marker);
+
+#if ( DEBUGlevel>=1 )
+    printf("sizeof(NRformat_loc) %lu\n", (unsigned long) sizeof(NRformat_loc));
+    CHECK_MALLOC(iam, "Exit dcreate_matrix()");
+#endif
+    return 0;
+}
diff --git a/EXAMPLE/dcreate_matrix_perturbed.c b/EXAMPLE/dcreate_matrix_perturbed.c
new file mode 100644
index 0000000..d4ea5e1
--- /dev/null
+++ b/EXAMPLE/dcreate_matrix_perturbed.c
@@ -0,0 +1,230 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Read the matrix from data file
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.1.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * December 31, 2016
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/* \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ * 
+ * DCREATE_MATRIX_PERTURBED reads the matrix from a data file in
+ * Harwell-Boeing format, scales down its first and last stored values,
+ * and distributes it to processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and right-hand side RHS.
+ *
+ * Arguments   
+ * =========      
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format. 
+ *
+ * NRHS  (input) int
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) double**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) double**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID  (input) gridinfo_t*
+ *       The 2D process mesh.
+ * </pre>
+ */
+
+int dcreate_matrix_perturbed(SuperMatrix *A, int nrhs, double **rhs,
+                   int *ldb, double **x, int *ldx,
+                   FILE *fp, gridinfo_t *grid)
+{
+    SuperMatrix GA;              /* global A, compressed column storage */
+    double   *b_global, *xtrue_global;  /* replicated on all processes */
+    int_t    *rowind, *colptr;	 /* global */
+    double   *nzval;             /* global */
+    double   *nzval_loc;         /* local */
+    int_t    *colind, *rowptr;	 /* local */
+    int_t    m, n, nnz;
+    int_t    m_loc, fst_row, nnz_loc;
+    int_t    m_loc_fst; /* Record m_loc of the first p-1 processors,
+			   when mod(m, p) is not zero. */ 
+    int_t    row, i, j, relpos;
+    int      iam;
+    char     trans[1];
+    int_t      *marker; /* per-row counts, later per-local-row insert cursors */
+
+    iam = grid->iam;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter dcreate_matrix_perturbed()");
+#endif
+
+    if ( !iam ) {
+        /* Read the matrix stored on disk in Harwell-Boeing format. */
+        dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+
+	/* Broadcast matrix A to the other PEs. */
+	MPI_Bcast( &m,     1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &n,     1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &nnz,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( nzval,  nnz, MPI_DOUBLE, 0, grid->comm );
+	MPI_Bcast( rowind, nnz, mpi_int_t,  0, grid->comm );
+	MPI_Bcast( colptr, n+1, mpi_int_t,  0, grid->comm );
+    } else {
+	/* Receive the dimensions first; they size the arrays below. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid->comm );
+
+	/* Allocate storage for compressed column representation. */
+	dallocateA_dist(n, nnz, &nzval, &rowind, &colptr);
+
+	MPI_Bcast( nzval,   nnz, MPI_DOUBLE, 0, grid->comm );
+	MPI_Bcast( rowind,  nnz, mpi_int_t,  0, grid->comm );
+	MPI_Bcast( colptr,  n+1, mpi_int_t,  0, grid->comm );
+    }
+
+    /* Scale down the first and last stored values of the matrix (presumably
+       the 1st and last diagonal). Intention is to change perm_r[].   */
+    if ( nnz > 0 ) nzval[0] *= 0.01;          /* guard: no entries to scale */
+    if ( nnz > 0 ) nzval[nnz-1] *= 0.0001; 
+
+    /* Compute the number of rows to be distributed to local process */
+    m_loc = m / (grid->nprow * grid->npcol); 
+    m_loc_fst = m_loc;
+    /* When m / procs is not an integer */
+    if ((m_loc * grid->nprow * grid->npcol) != m) {
+        /*m_loc = m_loc+1;
+          m_loc_fst = m_loc;*/
+      if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. takes the rest */
+	  m_loc = m - m_loc * (grid->nprow * grid->npcol - 1);
+    }
+
+    /* Create compressed column matrix for GA. */
+    dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
+				SLU_NC, SLU_D, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if ( !(b_global = doubleMalloc_dist(m*nrhs)) )
+        ABORT("Malloc fails for b[]");
+    if ( !(xtrue_global = doubleMalloc_dist(n*nrhs)) )
+        ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+
+    dGenXtrue_dist(n, nrhs, xtrue_global, n);
+    dFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m);
+
+    /*************************************************
+     * Change GA to a local A with NR_loc format     *
+     *************************************************/
+
+    rowptr = (int_t *) intMalloc_dist(m_loc+1);
+    marker = (int_t *) intCalloc_dist(m); /* indexed by row index: m entries */
+
+    /* Get counts of each row of GA */
+    for (i = 0; i < n; ++i)
+      for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]];
+    /* Set up row pointers for the m_loc rows owned by this process */
+    rowptr[0] = 0;
+    fst_row = iam * m_loc_fst;
+    nnz_loc = 0;
+    for (j = 0; j < m_loc; ++j) {
+      row = fst_row + j;
+      rowptr[j+1] = rowptr[j] + marker[row];
+      marker[j] = rowptr[j];  /* reuse marker[] as insert cursor of local row j */
+    }
+    nnz_loc = rowptr[m_loc];
+
+    nzval_loc = (double *) doubleMalloc_dist(nnz_loc);
+    colind = (int_t *) intMalloc_dist(nnz_loc);
+
+    /* Transfer the matrix into the compressed row storage */
+    for (i = 0; i < n; ++i) {
+      for (j = colptr[i]; j < colptr[i+1]; ++j) {
+	row = rowind[j];
+	if ( (row>=fst_row) && (row<fst_row+m_loc) ) { /* row owned locally */
+	  row = row - fst_row;
+	  relpos = marker[row];
+	  colind[relpos] = i;
+	  nzval_loc[relpos] = nzval[j];
+	  ++marker[row];
+	}
+      }
+    }
+
+#if ( DEBUGlevel>=2 )
+    if ( !iam ) dPrint_CompCol_Matrix_dist(&GA);
+#endif   
+
+    /* Destroy GA */
+    Destroy_CompCol_Matrix_dist(&GA);
+
+    /******************************************************/
+    /* Build the outputs: local A, local B and local X    */
+    /******************************************************/
+
+    /* Set up the local A in NR_loc format */
+    dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
+				   nzval_loc, colind, rowptr,
+				   SLU_NR_loc, SLU_D, SLU_GE);
+    
+    /* Get the local B */
+    if ( !((*rhs) = doubleMalloc_dist(m_loc*nrhs)) )
+        ABORT("Malloc fails for rhs[]");
+    for (j =0; j < nrhs; ++j) {
+	for (i = 0; i < m_loc; ++i) {
+	    row = fst_row + i;
+	    (*rhs)[j*m_loc+i] = b_global[j*m+row]; /* b_global has leading dim m */
+	}
+    }
+    *ldb = m_loc;
+
+    /* Set the true X */    
+    *ldx = m_loc;
+    if ( !((*x) = doubleMalloc_dist(*ldx * nrhs)) )
+        ABORT("Malloc fails for x_loc[]");
+
+    /* Get the local part of xtrue_global */
+    for (j = 0; j < nrhs; ++j) {
+      for (i = 0; i < m_loc; ++i)
+	(*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n];
+    }
+
+    SUPERLU_FREE(b_global);
+    SUPERLU_FREE(xtrue_global);
+    SUPERLU_FREE(marker);
+
+#if ( DEBUGlevel>=1 )
+    printf("sizeof(NRformat_loc) %lu\n", (unsigned long) sizeof(NRformat_loc));
+    CHECK_MALLOC(iam, "Exit dcreate_matrix_perturbed()");
+#endif
+    return 0;
+}
diff --git a/EXAMPLE/dreadhb.c b/EXAMPLE/dreadhb.c
new file mode 100644
index 0000000..a540177
--- /dev/null
+++ b/EXAMPLE/dreadhb.c
@@ -0,0 +1,389 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Read a DOUBLE PRECISION matrix stored in Harwell-Boeing format
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "superlu_ddefs.h"
+
+/*
+ * Prototypes
+ */
+static void ReadVector(FILE *, int_t, int_t *, int_t, int_t);
+static void dReadValues(FILE *, int_t, double *, int_t, int_t);
+extern void FormFullA(int_t, int_t *, double **, int_t **, int_t **);
+static int DumpLine(FILE *);
+static int ParseIntFormat(char *, int_t *, int_t *);
+static int ParseFloatFormat(char *, int_t *, int_t *);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ * 
+ * Read a DOUBLE PRECISION matrix stored in Harwell-Boeing format 
+ * as described below.
+ * 
+ * Line 1 (A72,A8) 
+ *  	Col. 1 - 72   Title (TITLE) 
+ *	Col. 73 - 80  Key (KEY) 
+ * 
+ * Line 2 (5I14) 
+ * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
+ * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
+ * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
+ * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
+ *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
+ *                    (including starting guesses and solution vectors 
+ *		       if present) 
+ *           	      (zero indicates no right-hand side data is present) 
+ *
+ * Line 3 (A3, 11X, 4I14) 
+ *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
+ * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
+ * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
+ *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
+ *	              (equal to number of entries for assembled matrices) 
+ * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
+ *	              (zero in the case of assembled matrices) 
+ * Line 4 (2A16, 2A20) 
+ * 	Col. 1 - 16   Format for pointers (PTRFMT) 
+ *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
+ *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
+ * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
+ *
+ * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
+ *    	Col. 1 	      Right-hand side type: 
+ *	         	  F for full storage or M for same format as matrix 
+ *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
+ *    	Col. 3        X if an exact solution vector(s) is supplied. 
+ *	Col. 15 - 28  Number of right-hand sides (NRHS) 
+ *	Col. 29 - 42  Number of row indices (NRHSIX) 
+ *          	      (ignored in case of unassembled matrices) 
+ *
+ * The three character type field on line 3 describes the matrix type. 
+ * The following table lists the permitted values for each of the three 
+ * characters. As an example of the type field, RSA denotes that the matrix 
+ * is real, symmetric, and assembled. 
+ *
+ * First Character: 
+ *	R Real matrix 
+ *	C Complex matrix 
+ *	P Pattern only (no numerical values supplied) 
+ *
+ * Second Character: 
+ *	S Symmetric 
+ *	U Unsymmetric 
+ *	H Hermitian 
+ *	Z Skew symmetric 
+ *	R Rectangular 
+ *
+ * Third Character: 
+ *	A Assembled 
+ *	E Elemental matrices (unassembled) 
+ * </pre>
+ */
+
+void
+dreadhb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz,
+	     double **nzval, int_t **rowind, int_t **colptr)
+{
+    /* Parse the header lines, then the pointer/index/value sections of fp. */
+    register int_t i, numer_lines, rhscrd = 0;
+    int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
+    char buf[100], type[4];
+    int_t sym;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(0, "Enter dreadhb_dist()");
+#endif
+
+    /* Line 1: title and key; the contents are not used */
+    fgets(buf, 100, fp);
+
+    /* Line 2: five 14-character counts; only the 4th and 5th are kept */
+    for (i=0; i<5; i++) {
+	fscanf(fp, "%14c", buf); buf[14] = 0;
+	tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/
+	if (i == 3) numer_lines = tmp;     /* lines of numerical values */
+	if (i == 4 && tmp) rhscrd = tmp;   /* lines of right-hand sides */
+    }
+    DumpLine(fp);
+
+    /* Line 3: 3-character matrix type, 11 pad characters, four counts */
+    fscanf(fp, "%3c", type);
+    fscanf(fp, "%11c", buf); /* pad */
+    type[3] = 0;
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) printf("Matrix type %s\n", type);
+#endif
+    /* The %14c reads below rely on buf[14] == 0, set in the Line 2 loop. */
+    fscanf(fp, "%14c", buf); *nrow = atoi(buf); 
+    fscanf(fp, "%14c", buf); *ncol = atoi(buf); 
+    fscanf(fp, "%14c", buf); *nonz = atoi(buf); 
+    fscanf(fp, "%14c", buf); tmp = atoi(buf);   
+    
+    if (tmp != 0)   /* warning only; reading continues */
+	if ( !iam ) printf("This is not an assembled matrix!\n");
+    if (*nrow != *ncol)   /* warning only; reading continues */
+	if ( !iam ) printf("Matrix is not square.\n");
+    DumpLine(fp);
+
+    /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
+    dallocateA_dist(*ncol, *nonz, nzval, rowind, colptr);
+
+    /* Line 4: format statements for pointers, indices, values and RHS */
+    fscanf(fp, "%16c", buf);
+    ParseIntFormat(buf, &colnum, &colsize);
+    fscanf(fp, "%16c", buf);
+    ParseIntFormat(buf, &rownum, &rowsize);
+    fscanf(fp, "%20c", buf);
+    ParseFloatFormat(buf, &valnum, &valsize);
+    fscanf(fp, "%20c", buf);   /* fourth field is read but not parsed */
+    DumpLine(fp);
+
+    /* Line 5: right-hand side */    
+    if ( rhscrd ) DumpLine(fp); /* skip the right-hand side header line */
+
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) {
+	printf("%d rows, %d nonzeros\n", *nrow, *nonz);
+	printf("colnum %d, colsize %d\n", colnum, colsize);
+	printf("rownum %d, rowsize %d\n", rownum, rowsize);
+	printf("valnum %d, valsize %d\n", valnum, valsize);
+    }
+#endif
+    /* Data sections: column pointers, row indices, then numerical values. */
+    ReadVector(fp, *ncol+1, *colptr, colnum, colsize);
+#if ( DEBUGlevel>=1 )
+    if ( !iam )	printf("read colptr[%d] = %d\n", *ncol, (*colptr)[*ncol]);
+#endif
+    ReadVector(fp, *nonz, *rowind, rownum, rowsize);
+#if ( DEBUGlevel>=1 )
+    if ( !iam )	printf("read rowind[%d] = %d\n", *nonz-1, (*rowind)[*nonz-1]);
+#endif
+    if ( numer_lines ) {   /* values are read only if the header lists any */
+        dReadValues(fp, *nonz, *nzval, valnum, valsize);
+#if ( DEBUGlevel>=1 )
+	if ( !iam ) printf("read nzval[%d] = %e\n", *nonz-1, (*nzval)[*nonz-1]);
+#endif
+    }
+
+    sym = (type[1] == 'S' || type[1] == 's');
+    if ( sym ) {   /* symmetric type: expand the stored triangle to full A */
+	FormFullA(*ncol, nonz, nzval, rowind, colptr);
+    }
+    fclose(fp);   /* the stream is closed here; callers must not reuse fp */
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(0, "Exit dreadhb_dist()");
+#endif
+}
+
+/* Eat up the rest of the current line; also stops at end-of-file. */
+static int DumpLine(FILE *fp)
+{
+    register int c;
+    while ((c = fgetc(fp)) != '\n' && c != EOF) ;  /* EOF: no newline left */
+    return 0;
+}
+
+static int ParseIntFormat(char *buf, int_t *num, int_t *size)
+{
+    char *cursor = buf;
+
+    /* Parse an integer format such as "(13I6)": count 13, width 6. */
+    for ( ; *cursor != '('; ++cursor) ;      /* locate the opening paren */
+    ++cursor;                                /* step past it */
+    *num = atoi(cursor);                     /* repeat count before the I */
+    for ( ; *cursor != 'I' && *cursor != 'i'; ++cursor) ;
+    *size = atoi(cursor + 1);                /* field width after the I */
+    return 0;
+}
+
+static int ParseFloatFormat(char *buf, int_t *num, int_t *size)
+{
+    char *tmp, *period;
+    /* Extract repeat count (*num) and field width (*size) of a real format. */
+    tmp = buf;
+    while (*tmp++ != '(') ;     /* tmp ends one past the opening paren */
+    *num = atoi(tmp); 
+    while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
+	   && *tmp != 'F' && *tmp != 'f') {
+       /* May find kP before nE/nD/nF, like (1P6F13.6) or (1P,6F13.6). The
+           num picked up so far refers to P; re-read it after the P. */
+        if (*tmp=='p' || *tmp=='P') {
+           ++tmp;
+           if (*tmp == ',') ++tmp;  /* comma after kP is optional */
+           *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
+        } else {
+           ++tmp;
+        }
+    }
+    ++tmp;                      /* step past the E/D/F descriptor letter */
+    period = tmp;
+    while (*period != '.' && *period != ')') ++period ;
+    *period = '\0';             /* NOTE: modifies the caller's buffer */
+    *size = atoi(tmp); 
+    return 0;
+}
+
+static void
+ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize)
+{
+    register int_t i, j, item;
+    char tmp, buf[100];
+    /* Read n 1-based integers, perline fields of persize chars per line. */
+    i = 0;
+    while (i < n) {
+	if ( !fgets(buf, 100, fp) ) ABORT("Unexpected end of file in ReadVector()");
+	for (j=0; j<perline && i<n; j++) {
+	    tmp = buf[(j+1)*persize];     /* save the char at that place */
+	    buf[(j+1)*persize] = 0;       /* null terminate */
+	    item = atoi(&buf[j*persize]); 
+	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
+	    where[i++] = item - 1;        /* store as 0-based */
+	}
+    }
+}
+
+static void
+dReadValues(FILE *fp, int_t n, double *destination, 
+             int_t perline, int_t persize)
+{
+    register int_t i, j, k, s;
+    char tmp, buf[100];
+    /* Read n reals, perline fields of persize characters per input line. */
+    i = 0;
+    while (i < n) {
+	if ( !fgets(buf, 100, fp) ) ABORT("Unexpected end of file in dReadValues()");
+	for (j=0; j<perline && i<n; j++) {
+	    tmp = buf[(j+1)*persize];     /* save the char at that place */
+	    buf[(j+1)*persize] = 0;       /* null terminate */
+	    s = j*persize;
+	    for (k = 0; k < persize; ++k) /* No D_ format in C */
+		if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E';
+	    destination[i++] = atof(&buf[s]);
+	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
+	}
+    }
+}
+
+/*! \brief
+ *
+ * <pre>
+ * On input, nonz/nzval/rowind/colptr represents lower part of a symmetric
+ * matrix. On exit, it represents the full matrix with lower and upper parts.
+ * </pre>
+ */
+extern void
+FormFullA(int_t n, int_t *nonz, double **nzval, int_t **rowind, int_t **colptr)
+{
+    register int_t i, j, k, col, new_nnz;
+    int_t *t_rowind, *t_colptr, *al_rowind, *al_colptr, *a_rowind, *a_colptr;
+    int_t *marker;
+    double *t_val, *al_val, *a_val;
+
+    al_rowind = *rowind;   /* al_* : the stored triangle passed in */
+    al_colptr = *colptr;
+    al_val = *nzval;
+
+    if ( !(marker =(int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for marker[]");
+    if ( !(t_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC t_colptr[]");
+    if ( !(t_rowind = (int_t *) SUPERLU_MALLOC( *nonz * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for t_rowind[]");
+    if ( !(t_val = (double*) SUPERLU_MALLOC( *nonz * sizeof(double)) ) )
+	ABORT("SUPERLU_MALLOC fails for t_val[]");
+
+    /* Get counts of each column of T, and set up column pointers */
+    for (i = 0; i < n; ++i) marker[i] = 0;
+    for (j = 0; j < n; ++j) {
+	for (i = al_colptr[j]; i < al_colptr[j+1]; ++i)
+	    ++marker[al_rowind[i]];
+    }
+    t_colptr[0] = 0;
+    for (i = 0; i < n; ++i) {
+	t_colptr[i+1] = t_colptr[i] + marker[i];
+	marker[i] = t_colptr[i];   /* becomes the insert cursor of column i */
+    }
+
+    /* Transpose matrix A to T */
+    for (j = 0; j < n; ++j)
+	for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) {
+	    col = al_rowind[i];
+	    t_rowind[marker[col]] = j;
+	    t_val[marker[col]] = al_val[i];
+	    ++marker[col];
+	}
+
+    new_nnz = *nonz * 2 - n;   /* expected count if every diagonal is stored */
+    if ( !(a_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC a_colptr[]");
+    if ( !(a_rowind = (int_t *) SUPERLU_MALLOC( (2 * *nonz) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for a_rowind[]");
+    if ( !(a_val = (double*) SUPERLU_MALLOC( (2 * *nonz) * sizeof(double)) ) )
+	ABORT("SUPERLU_MALLOC fails for a_val[]");
+    /* a_rowind/a_val hold 2*nonz entries: the bound when no diagonal is stored */
+    a_colptr[0] = 0;
+    k = 0;
+    for (j = 0; j < n; ++j) {
+      for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
+	if ( t_rowind[i] != j ) { /* not diagonal */
+	  a_rowind[k] = t_rowind[i];
+	  a_val[k] = t_val[i];
+#ifdef DEBUG
+	  if ( fabs(a_val[k]) < 4.047e-300 )
+	      printf("%5d: %e\n", (int) k, a_val[k]);
+#endif
+	  ++k;
+	}
+      }
+
+      for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) {
+	a_rowind[k] = al_rowind[i];
+	a_val[k] = al_val[i];
+#ifdef DEBUG
+	if ( fabs(a_val[k]) < 4.047e-300 )
+	    printf("%5d: %e\n", (int) k, a_val[k]);
+#endif
+	++k;
+      }
+      
+      a_colptr[j+1] = k;
+    }
+
+    printf("FormFullA: new_nnz = %lld, k = %lld\n", (long long) new_nnz, (long long) k);
+
+    SUPERLU_FREE(al_val);
+    SUPERLU_FREE(al_rowind);
+    SUPERLU_FREE(al_colptr);
+    SUPERLU_FREE(marker);
+    SUPERLU_FREE(t_val);
+    SUPERLU_FREE(t_rowind);
+    SUPERLU_FREE(t_colptr);
+
+    *nzval = a_val;
+    *rowind = a_rowind;
+    *colptr = a_colptr;
+    *nonz = k;   /* entries actually written; equals new_nnz in the usual case */
+}
diff --git a/EXAMPLE/dreadtriple.c b/EXAMPLE/dreadtriple.c
new file mode 100644
index 0000000..c39969f
--- /dev/null
+++ b/EXAMPLE/dreadtriple.c
@@ -0,0 +1,180 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Read a sparse matrix and a right-hand side from plain-text files.
+ *
+ */
+#include <stdio.h>
+#include "superlu_ddefs.h"
+
+#undef EXPAND_SYM
+
+/*! \brief Read a square sparse matrix in triplet format from fp and return it
+ *  in compressed-column form. Terminates the program on a truncated or
+ *  malformed file, or when an index falls outside the matrix.
+ *
+ * <pre>
+ * File format: first line "#rows #non-zeros", then one "row col value"
+ *   triplet per line. Indices are taken as 0-based if the first triplet has
+ *   a zero row or column index, and as 1-based otherwise.
+ *
+ * Output parameters
+ * =================
+ *   m, n, nonz: matrix dimensions (m == n) and number of stored nonzeros.
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *      column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ * </pre>
+ */
+
+void
+dreadtriple(FILE *fp, int_t *m, int_t *n, int_t *nonz,
+	    double **nzval, int_t **rowind, int_t **colptr)
+{
+    int_t    j, k, jsize, nnz, nz, new_nonz;
+    long long ln, lnnz, lrow, lcol; /* wide enough for either int_t width */
+    double *a, *val;
+    int_t    *asub, *xa, *row, *col;
+    int_t    zero_base = 0;
+
+    /* Read through long long temporaries so that one format string is
+       correct whether or not _LONGINT is defined. */
+    if ( fscanf(fp, "%lld%lld", &ln, &lnnz) != 2 || ln < 0 || lnnz < 0 )
+	ABORT("dreadtriple: cannot read matrix size and nonzero count");
+    *n = (int_t) ln;
+    *nonz = (int_t) lnnz;
+
+#ifdef EXPAND_SYM
+    /* Mirroring every off-diagonal entry yields at most 2*nonz entries. */
+    new_nonz = 2 * *nonz;
+#else
+    new_nonz = *nonz;
+#endif
+    *m = *n;
+    printf("m %lld, n %lld, nonz %lld\n", ln, ln, lnnz);
+    dallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
+    a    = *nzval;
+    asub = *rowind;
+    xa   = *colptr;
+
+    if ( !(val = (double *) SUPERLU_MALLOC(new_nonz * sizeof(double))) )
+        ABORT("Malloc fails for val[]");
+    if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for row[]");
+    if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for col[]");
+
+    for (j = 0; j < *n; ++j) xa[j] = 0; /* per-column entry counts */
+
+    /* Read into the triplet array from a file */
+    for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
+	if ( fscanf(fp, "%lld%lld%lf\n", &lrow, &lcol, &val[nz]) != 3 )
+	    ABORT("dreadtriple: truncated or malformed triplet");
+
+	if ( nnz == 0 ) { /* first nonzero */
+	    if ( lrow == 0 || lcol == 0 ) {
+		zero_base = 1;
+		printf("triplet file: row/col indices are zero-based.\n");
+	    } else
+		printf("triplet file: row/col indices are one-based.\n");
+	}
+
+	if ( !zero_base ) {
+	    /* Change to 0-based indexing. */
+	    --lrow;
+	    --lcol;
+	}
+
+	/* Validate before narrowing to int_t and before indexing xa[]. */
+	if (lrow < 0 || lrow >= *m || lcol < 0 || lcol >= *n) {
+	    fprintf(stderr, "nz %lld, (%lld, %lld) = %e out of bound, removed\n",
+		    (long long) nz, lrow, lcol, val[nz]);
+	    exit(-1);
+	}
+	row[nz] = (int_t) lrow;
+	col[nz] = (int_t) lcol;
+	++xa[col[nz]];
+#ifdef EXPAND_SYM
+	if ( row[nz] != col[nz] ) { /* Excluding diagonal */
+	    ++nz;
+	    row[nz] = col[nz-1];
+	    col[nz] = row[nz-1];
+	    val[nz] = val[nz-1];
+	    ++xa[col[nz]];
+	}
+#endif
+	++nz;
+    }
+
+    *nonz = nz;
+#ifdef EXPAND_SYM
+    printf("new_nonz after symmetric expansion:\t%lld\n", (long long) *nonz);
+#endif
+
+    /* Turn the per-column counts into column start offsets. */
+    k = 0;
+    jsize = xa[0];
+    xa[0] = 0;
+    for (j = 1; j < *n; ++j) {
+	k += jsize;
+	jsize = xa[j];
+	xa[j] = k;
+    }
+
+    /* Copy the triplets into the column oriented storage */
+    for (nz = 0; nz < *nonz; ++nz) {
+	j = col[nz];
+	k = xa[j];
+	asub[k] = row[nz];
+	a[k] = val[nz];
+	++xa[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column */
+    for (j = *n; j > 0; --j)
+	xa[j] = xa[j-1];
+    xa[0] = 0;
+
+    SUPERLU_FREE(val);
+    SUPERLU_FREE(row);
+    SUPERLU_FREE(col);
+
+#ifdef CHK_INPUT
+    for (j = 0; j < *n; j++) {
+	printf("Col %lld, xa %lld\n", (long long) j, (long long) xa[j]);
+	for (k = xa[j]; k < xa[j+1]; k++)
+	    printf("%lld\t%16.10f\n", (long long) asub[k], a[k]);
+    }
+#endif
+}
+
+
+/*! \brief Read m right-hand-side values from the text file "b.dat" into b[]. */
+void dreadrhs(int m, double *b)
+{
+    FILE *fp;
+    int i;
+    if ( !(fp = fopen("b.dat", "r")) ) {
+        fprintf(stderr, "dreadrhs: file does not exist\n");
+	exit(-1);
+    }
+    for (i = 0; i < m; ++i)
+	if ( fscanf(fp, "%lf\n", &b[i]) != 1 ) { /* short or malformed file */
+	    fprintf(stderr, "dreadrhs: cannot read b[%d]\n", i);
+	    exit(-1);
+	}
+    fclose(fp);
+}
+
+
diff --git a/EXAMPLE/g20.rua b/EXAMPLE/g20.rua
new file mode 100644
index 0000000..382c9c4
--- /dev/null
+++ b/EXAMPLE/g20.rua
@@ -0,0 +1,534 @@
+g20, symm permuted by SYMMMD                                            SYM     
+           530            26           120           384             0
+RUA                      400           400          1920             0
+(16I5)          (16I5)          (5E15.8)            (5E15.8)            
+    1    6   11   16   21   26   31   36   41   46   51   56   61   66   71   76
+   81   86   91   96  101  106  111  116  121  126  131  136  141  146  151  156
+  161  166  171  176  181  186  191  196  201  206  210  214  218  223  228  231
+  235  239  243  248  253  257  262  266  271  276  281  286  291  295  300  304
+  309  313  318  323  328  332  337  342  347  352  357  362  367  372  377  382
+  387  392  397  402  407  412  417  422  427  432  437  442  447  452  457  462
+  467  472  477  482  487  492  497  501  505  510  514  519  524  529  534  539
+  544  549  554  559  564  568  572  576  580  583  588  593  597  601  606  610
+  614  619  624  629  634  639  644  649  654  659  664  669  674  678  682  686
+  691  696  701  706  710  714  718  723  728  732  737  742  747  752  757  762
+  767  772  777  781  786  791  796  800  805  810  815  820  825  830  835  839
+  844  849  854  859  864  869  874  879  884  889  894  899  904  909  914  919
+  923  928  932  937  941  946  951  956  961  966  971  976  981  986  991  996
+ 1001 1006 1011 1015 1020 1024 1029 1033 1038 1043 1048 1053 1058 1063 1068 1073
+ 1078 1083 1088 1093 1098 1103 1108 1113 1117 1122 1126 1130 1133 1138 1142 1147
+ 1152 1156 1161 1165 1170 1174 1179 1183 1188 1193 1198 1203 1208 1212 1217 1221
+ 1226 1230 1235 1240 1245 1249 1254 1259 1264 1269 1274 1279 1284 1288 1293 1298
+ 1303 1308 1313 1317 1322 1327 1332 1337 1342 1347 1352 1357 1362 1367 1372 1377
+ 1382 1387 1392 1397 1402 1407 1412 1417 1422 1427 1432 1437 1442 1446 1451 1455
+ 1459 1464 1468 1472 1477 1482 1487 1492 1497 1501 1506 1510 1515 1519 1522 1526
+ 1530 1535 1539 1544 1549 1554 1559 1563 1568 1573 1578 1583 1587 1592 1596 1601
+ 1605 1610 1615 1620 1625 1630 1635 1640 1645 1650 1655 1660 1665 1670 1674 1679
+ 1684 1689 1694 1699 1704 1709 1714 1719 1724 1729 1734 1738 1743 1748 1753 1758
+ 1763 1768 1773 1778 1783 1788 1792 1797 1802 1807 1811 1816 1821 1826 1831 1836
+ 1841 1846 1851 1856 1861 1866 1871 1876 1881 1886 1891 1896 1901 1906 1911 1916
+ 1921
+    1    9   32  391  395    2    9  392  395  400    3    8  389  393  394    4
+    8    9  392  394    5    7    8  381  389    6    7    8    9   32    5    6
+    7   33  382    3    4    5    6    8    1    2    4    6    9   10   31  396
+  398  399   11   18   29   30   31   12   18   31  397  399   13   17  386  387
+  388   14   17   18   30  387   15   17  383  388  390   16   17   18  390  397
+   13   14   15   16   17   11   12   14   16   18   19   28   33  380  382   20
+   22   28   29   33   21   22   29   31  398   20   21   22   32  391   23   27
+  379  384  385   24   27   28  379  380   25   27   30  385  387   26   27   28
+   29   30   23   24   25   26   27   19   20   24   26   28   11   20   21   26
+   29   11   14   25   26   30   10   11   12   21   31    1    6   22   32   33
+    7   19   20   32   33   34   74   92  370  372   35   40   70  174  176   36
+   40   91  174  177   37   39   40   70   71   38   39   40   90   91   37   38
+   39   74   92   35   36   37   38   40   41   56   70   71   73   42   44   55
+   56   43   44  175  176   42   43   44   45   44   45   56   70  176   46   54
+   55   56   73   47   49   53   48   49   54   55   47   48   49   52   50   51
+   53   69   50   51   52   68   72   49   51   52   53   54   47   50   52   53
+   46   48   52   54   72   42   46   48   55   41   42   45   46   56   57   59
+   67   68   72   58   59   67   74  372   57   58   59   71   73   60   66   67
+  372  373   61   62   65   69   61   62   66   67   68   63   64   65  374   63
+   64   66  371  373   61   63   65   66   60   62   64   65   66   57   58   60
+   62   67   51   57   62   68   69   50   61   68   69   35   37   41   45   70
+   37   41   59   71   74   51   54   57   72   73   41   46   59   72   73   34
+   39   58   71   74   75   89  369  381  389   76   78   92  370  375   77   78
+   89  369  375   76   77   78   88   90   79   87   91  177  178   80   87   88
+   90   91   81   86   87  173  178   82   86  173  379  380   83   85   86   87
+   88   84   85   86  380  382   83   84   85   89  381   81   82   83   84   86
+   79   80   81   83   87   78   80   83   88   89   75   77   85   88   89   38
+   78   80   90   92   36   38   79   80   91   34   39   76   90   92   93  172
+  385  386  387   94   98  168  169  170   95   98  169  172  386   96   98  109
+  170  171   97   98  109  386  388   94   95   96   97   98   99  108  377  383
+  390  100  108  376  377  378  101  107  109  167  171  102  107  109  383  388
+  103  106  107  167  104  106  108  378  105  106  107  108  383  103  104  105
+  106  101  102  103  105  107   99  100  104  105  108   96   97  101  102  109
+  110  166  173  379  384  111  132  138  162  164  112  132  138  168  169  113
+  131  132  163  164  114  130  131  132  168  115  129  130  168  170  116  123
+  128  129  130  117  121  123  128  118  120  131  163  119  120  121  123  118
+  119  120  122  117  119  121  120  122  123  130  131  116  117  119  122  123
+  124  127  128  129  125  127  167  171  126  127  129  170  171  124  125  126
+  127  116  117  124  128  115  116  124  126  129  114  115  116  122  130  113
+  114  118  122  131  111  112  113  114  132  133  137  138  162  165  134  137
+  165  166  384  135  137  138  169  172  136  137  172  384  385  133  134  135
+  136  137  111  112  133  135  138  139  145  174  175  176  140  145  161  174
+  177  141  143  145  175  142  143  153  160  141  142  143  144  143  144  145
+  160  161  139  140  141  144  145  146  152  153  159  160  147  152  159  162
+  165  148  150  152  153  149  150  163  164  148  149  150  151  150  151  152
+  162  164  146  147  148  151  152  142  146  148  153  154  158  159  160  161
+  155  158  161  177  178  156  158  159  165  166  157  158  166  173  178  154
+  155  156  157  158  146  147  154  156  159  142  144  146  154  160  140  144
+  154  155  161  111  133  147  151  162  113  118  149  163  111  113  149  151
+  164  133  134  147  156  165  110  134  156  157  166  101  103  125  167   94
+  112  114  115  168   94   95  112  135  169   94   96  115  126  170   96  101
+  125  126  171   93   95  135  136  172   81   82  110  157  173   35   36  139
+  140  174   43  139  141  175   35   43   45  139  176   36   79  140  155  177
+   79   81  155  157  178  179  183  201  369  375  180  183  200  201  274  181
+  183  369  389  393  182  183  272  274  393  179  180  181  182  183  184  188
+  370  372  373  185  188  199  371  373  186  188  201  370  375  187  188  199
+  200  201  184  185  186  187  188  189  198  200  274  277  190  198  273  276
+  277  191  197  198  199  200  192  193  196  374  192  193  197  199  371  194
+  195  196  275  194  195  197  198  276  192  194  196  197  191  193  195  196
+  197  189  190  191  195  198  185  187  191  193  199  180  187  189  191  200
+  179  180  186  187  201  202  271  272  393  394  203  207  272  274  277  204
+  207  218  273  277  205  207  267  271  272  206  207  218  266  267  203  204
+  205  206  207  208  217  218  266  268  209  217  265  268  270  210  216  217
+  218  273  211  212  215  275  211  212  216  273  276  213  214  215  269  213
+  214  216  217  270  211  213  215  216  210  212  214  215  216  208  209  210
+  214  217  204  206  208  210  218  219  223  271  392  394  220  223  229  267
+  271  221  223  365  392  400  222  223  229  365  366  219  220  221  222  223
+  224  228  229  266  267  225  228  263  266  268  226  228  229  264  366  227
+  228  262  263  264  224  225  226  227  228  220  222  224  226  229  230  249
+  263  265  268  231  239  248  249  265  232  233  238  269  232  233  239  265
+  270  234  236  237  238  235  236  237  247  234  235  236  234  235  237  239
+  248  232  234  238  239  231  233  237  238  239  240  246  249  262  263  241
+  242  245  247  241  242  246  248  249  243  244  245  261  243  244  246  260
+  262  241  243  245  246  240  242  244  245  246  235  241  247  248  231  237
+  242  247  248  230  231  240  242  249  250  259  264  364  366  251  259  260
+  262  264  252  258  259  364  367  253  254  257  261  253  254  258  259  260
+  255  256  257  363  255  256  258  367  368  253  255  257  258  252  254  256
+  257  258  250  251  252  254  259  244  251  254  260  261  243  253  260  261
+  227  240  244  251  262  225  227  230  240  263  226  227  250  251  264  209
+  230  231  233  265  206  208  224  225  266  205  206  220  224  267  208  209
+  225  230  268  213  232  269  270  209  214  233  269  270  202  205  219  220
+  271  182  202  203  205  272  190  204  210  212  273  180  182  189  203  274
+  194  211  275  276  190  195  212  275  276  189  190  203  204  277  278  283
+  289  395  400  279  283  289  361  362  280  282  283  391  395  281  282  283
+  359  361  280  281  282  396  398  278  279  280  281  283  284  288  289  365
+  400  285  288  364  365  366  286  288  289  360  362  287  288  360  364  367
+  284  285  286  287  288  278  279  284  286  289  290  292  298  396  399  291
+  292  298  355  357  290  291  292  358  359  293  297  377  390  397  294  297
+  298  397  399  295  297  356  376  377  296  297  298  355  356  293  294  295
+  296  297  290  291  294  296  298  299  328  351  355  357  300  308  328  355
+  356  301  307  326  327  302  307  308  326  328  303  306  376  378  304  306
+  307  308  305  306  308  356  376  303  304  305  306  301  302  304  307  300
+  302  304  305  308  309  311  326  328  351  310  311  324  326  327  309  310
+  311  325  353  312  323  325  352  353  313  314  322  350  313  314  323  352
+  354  315  316  320  322  315  316  321  323  325  317  319  324  327  318  319
+  320  317  318  319  321  315  318  320  321  316  319  320  321  324  313  315
+  322  323  312  314  316  322  323  310  317  321  324  325  311  312  316  324
+  325  301  302  309  310  326  301  310  317  327  299  300  302  309  328  329
+  338  349  360  362  330  338  360  367  368  331  337  338  348  349  332  333
+  336  363  332  333  337  338  368  334  335  336  350  334  335  337  348  354
+  332  334  336  337  331  333  335  336  337  329  330  331  333  338  339  347
+  358  359  361  340  347  349  361  362  341  346  347  348  349  342  346  348
+  352  354  343  345  346  347  358  344  345  346  352  353  343  344  345  351
+  357  341  342  343  344  346  339  340  341  343  347  331  335  341  342  348
+  329  331  340  341  349  313  334  350  354  299  309  345  351  353  312  314
+  342  344  352  311  312  344  351  353  314  335  342  350  354  291  296  299
+  300  355  295  296  300  305  356  291  299  345  357  358  292  339  343  357
+  358  281  292  339  359  396  286  287  329  330  360  279  281  339  340  361
+  279  286  329  340  362  255  332  363  368  250  252  285  287  364  221  222
+  284  285  365  222  226  250  285  366  252  256  287  330  367  256  330  333
+  363  368   75   77  179  181  369   34   76  184  186  370   64  185  193  371
+  374   34   58   60  184  372   60   64  184  185  373   63  192  371  374   76
+   77  179  186  375  100  295  303  305  376   99  100  293  295  377  100  104
+  303  378   23   24   82  110  379   19   24   82   84  380    5   75   85  381
+  382    7   19   84  381  382   15   99  102  105  383   23  110  134  136  384
+   23   25   93  136  385   13   93   95   97  386   13   14   25   93  387   13
+   15   97  102  388    3    5   75  181  389   15   16   99  293  390    1   22
+  280  391  398    2    4  219  221  392    3  181  182  202  393    3    4  202
+  219  394    1    2  278  280  395   10  282  290  359  396   12   16  293  294
+  397   10   21  282  391  398   10   12  290  294  399    2  221  278  284  400
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
diff --git a/EXAMPLE/g4.rua b/EXAMPLE/g4.rua
new file mode 100644
index 0000000..8c6766d
--- /dev/null
+++ b/EXAMPLE/g4.rua
@@ -0,0 +1,21 @@
+g4, symm. permuted by SYMMMD                                            sym     
+            17             1             3            13             0
+RUA                       16            16            64             0
+(17I3)          (26I3)          (5E15.8)            (5E15.8)            
+  1  4  8 11 15 19 22 27 31 35 38 43 47 51 55 60 65
+  1 13 14  2  6 14 15  3  5 12  4  5  6 15  3  4  5 16  2  4  6  7 11 13 14 15
+  8 10 11 12  9 10 11 13  8  9 10  7  8  9 11 16  3  8 12 16  1  7  9 13  1  2
+  7 14  2  4  7 15 16  5 11 12 15 16
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00
+ 4.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00 4.00000000E+00-1.00000000E+00-1.00000000E+00
+-1.00000000E+00-1.00000000E+00-1.00000000E+00 4.00000000E+00
\ No newline at end of file
diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c
new file mode 100644
index 0000000..3ebca24
--- /dev/null
+++ b/EXAMPLE/pddrive.c
@@ -0,0 +1,234 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Driver program for PDGSSVX example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief The driver program PDDRIVE.
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * This example illustrates how to use PDGSSVX with the full
+ * (default) options to solve a linear system.
+ *
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call pdgssvx
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * Options -r <int> / -c <int> set the process grid shape (default 1 x 1),
+ * -h prints the usage; the first non-option argument is the matrix file.
+ * With MPICH,  program may be run by typing:
+ *    mpiexec -n <np> pddrive -r <proc rows> -c <proc columns> big.rua
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    double   *berr;
+    double   *b, *xtrue;
+    int    m, n;
+    int      nprow, npcol;
+    int      iam, info, ldb, ldx, nrhs;
+    char     **cpp, c;
+    FILE *fp = NULL, *fopen(); /* fp stays NULL until a matrix file is opened */
+    extern int cpp_defs();
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]: "-x <value>" pairs, then the file name. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    if ( !*++cpp && c != 'h' ) ABORT("Option requires a value");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default %4d)\n", nprow);
+		  printf("\t-c <int>: process columns (default %4d)\n", npcol);
+		  MPI_Finalize(); /* leave MPI cleanly before exiting */
+		  exit(0);
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist");
+	    break;
+	}
+    }
+    if ( !fp ) /* no file name was found on the command line */
+	ABORT("No input matrix file given");
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID. 
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )	goto out;
+    if ( !iam ) { /* only process 0 reports the setup */
+	printf("Input matrix file: %s\n", *cpp);
+        printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+    }
+
+#if ( VAMPIR>=1 )
+    VT_traceoff();
+#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+       ------------------------------------------------------------*/
+    dcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE THE LINEAR SYSTEM.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact              = DOFACT;
+        options.Equil             = YES;
+        options.ParSymbFact       = NO;
+        options.ColPerm           = METIS_AT_PLUS_A;
+        options.RowPerm           = LargeDiag;
+        options.ReplaceTinyPivot  = YES;
+        options.IterRefine        = DOUBLE;
+        options.Trans             = NOTRANS;
+        options.SolveInitialized  = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat         = YES;
+     */
+    set_default_options_dist(&options);
+#if 0
+    options.RowPerm = NOROWPERM;
+    options.IterRefine = NOREFINE;
+    options.ColPerm = NATURAL;
+    options.Equil = NO; 
+    options.ReplaceTinyPivot = NO;
+#endif
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    m = A.nrow;
+    n = A.ncol;
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver. */
+    pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+
+    /* Check the accuracy of the solution. */
+    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+		     nrhs, b, ldb, xtrue, ldx, &grid);
+
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A);
+    ScalePermstructFree(&ScalePermstruct);
+    Destroy_LU(n, &grid, &LUstruct);
+    LUstructFree(&LUstruct);
+    if ( options.SolveInitialized ) {
+        dSolveFinalize(&options, &SOLVEstruct);
+    }
+    SUPERLU_FREE(b);
+    SUPERLU_FREE(xtrue);
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out: /* processes outside the grid jump straight here */
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+    return 0;
+}
+
+/* Print each CPP setting listed below whose value is >= 1; returns 0. */
+int cpp_defs()
+{
+    fputs(".. CPP definitions:\n", stdout);
+#if PRNTlevel >= 1
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif /* PRNTlevel */
+#if DEBUGlevel >= 1
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif /* DEBUGlevel */
+#if PROFlevel >= 1
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif /* PROFlevel */
+#if StaticPivot >= 1
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif /* StaticPivot */
+    fputs("....\n", stdout);
+    return 0;
+}
diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c
new file mode 100644
index 0000000..9c01607
--- /dev/null
+++ b/EXAMPLE/pddrive1.c
@@ -0,0 +1,247 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Driver program for PDGSSVX example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief The driver program PDDRIVE1.
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * This example illustrates how to use PDGSSVX to
+ * solve systems with the same A but different right-hand side.
+ * In this case, we factorize A only once in the first call to
+ * PDGSSVX, and reuse the following data structures
+ * in the subsequent call to PDGSSVX:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ *
+ * Options -r <int> / -c <int> set the process grid shape (default 1 x 1),
+ * -h prints the usage; the first non-option argument is the matrix file.
+ * With MPICH,  program may be run by typing:
+ *    mpiexec -n <np> pddrive1 -r <proc rows> -c <proc columns> big.rua
+ * </pre>
+ */
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    double   *berr;
+    double   *b, *xtrue, *b1;
+    int    i, j, m, n;
+    int    nprow, npcol;
+    int    iam, info, ldb, ldx, nrhs;
+    char     **cpp, c;
+    FILE *fp = NULL, *fopen(); /* fp stays NULL until a matrix file is opened */
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]: "-x <value>" pairs, then the file name. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    if ( !*++cpp && c != 'h' ) ABORT("Option requires a value");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default %d)\n", nprow);
+		  printf("\t-c <int>: process columns (default %d)\n", npcol);
+		  MPI_Finalize(); /* leave MPI cleanly before exiting */
+		  exit(0);
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist");
+	    break;
+	}
+    }
+    if ( !fp ) /* no file name was found on the command line */
+	ABORT("No input matrix file given");
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID. 
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )	goto out;
+    if ( !iam ) { /* only process 0 reports the setup */
+	printf("Input matrix file: %s\n", *cpp);
+        printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+    }
+
+#if ( VAMPIR>=1 )
+    VT_traceoff();
+#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+       ------------------------------------------------------------*/
+    dcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+    if ( !(b1 = doubleMalloc_dist(ldb * nrhs)) )
+        ABORT("Malloc fails for b1[]");
+    for (j = 0; j < nrhs; ++j) /* keep a copy of b in b1 for the second solve */
+        for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb];
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    m = A.nrow;
+    n = A.ncol;
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver. */
+    pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the first system:\n");
+    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+		     nrhs, b, ldb, xtrue, ldx, &grid);
+
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+    PStatFree(&stat);
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
+       RIGHT-HAND SIDE,  WE WILL USE THE EXISTING L AND U FACTORS IN
+       LUSTRUCT OBTAINED FROM A PREVIOUS FACTORIZATION.
+       ------------------------------------------------------------*/
+    options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the system with a different B:\n");
+    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+		     nrhs, b1, ldb, xtrue, ldx, &grid);
+
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A);
+    ScalePermstructFree(&ScalePermstruct);
+    Destroy_LU(n, &grid, &LUstruct);
+    LUstructFree(&LUstruct);
+    if ( options.SolveInitialized ) {
+        dSolveFinalize(&options, &SOLVEstruct);
+    }
+    SUPERLU_FREE(b);
+    SUPERLU_FREE(b1);
+    SUPERLU_FREE(xtrue);
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out: /* processes outside the grid jump straight here */
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+    return 0;
+}
+
+/* Print each CPP setting listed below whose value is >= 1; returns 0. */
+int cpp_defs()
+{
+    fputs(".. CPP definitions:\n", stdout);
+#if PRNTlevel >= 1
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif /* PRNTlevel */
+#if DEBUGlevel >= 1
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif /* DEBUGlevel */
+#if PROFlevel >= 1
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif /* PROFlevel */
+#if StaticPivot >= 1
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif /* StaticPivot */
+    fputs("....\n", stdout);
+    return 0;
+}
diff --git a/EXAMPLE/pddrive1_ABglobal.c b/EXAMPLE/pddrive1_ABglobal.c
new file mode 100644
index 0000000..66d4b4a
--- /dev/null
+++ b/EXAMPLE/pddrive1_ABglobal.c
@@ -0,0 +1,285 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Driver program for pdgssvx_ABglobal example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program pddrive1_ABglobal.
+ *
+ * This example illustrates how to use pdgssvx_ABglobal to
+ * solve systems with the same A but different right-hand side.
+ * In this case, we factorize A only once in the first call to
+ * pdgssvx_ABglobal, and reuse the following data structures
+ * in the subsequent call to pdgssvx_ABglobal:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ * 
+ * On an IBM SP, the program may be run by typing:
+ *    poe pddrive1_ABglobal -r <proc rows> -c <proc columns> <input_matrix> -procs <p>
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{   /* Solve A*X=B twice with pdgssvx_ABglobal; the second call sets Fact = FACTORED. */
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    gridinfo_t grid;
+    double   *berr;
+    double   *a, *b, *b1, *xtrue;
+    int_t    *asub, *xa;
+    int_t    i, j, m, n, nnz;
+    int_t    nprow, npcol;
+    int      iam, info, ldb, ldx, nrhs;
+    char     trans[1];
+    char     **cpp, c;
+    FILE *fp, *fopen();  /* NOTE(review): fp is assigned only if a file name argument is parsed below */
+    extern int cpp_defs();
+
+    /* prototypes */
+    extern void LUstructInit(const int_t, LUstruct_t *);
+    extern void LUstructFree(LUstruct_t *);
+    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]: "-r <n>", "-c <n>", "-h", then the matrix file name. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp;  /* step to the option's value; NOTE(review): not checked for NULL */
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default " IFMT ")\n", nprow);
+		  printf("\t-c <int>: process columns (default " IFMT ")\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;  /* cpp keeps pointing at the file name, which is printed later */
+	}
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )
+	goto out;
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+    
+    /* ------------------------------------------------------------
+       PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
+       THE OTHER PROCESSES.
+       ------------------------------------------------------------*/
+    if ( !iam ) {
+	/* Print the CPP definitions. */
+	cpp_defs();
+	
+	/* Read the matrix stored on disk in Harwell-Boeing format. */
+	dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+	
+	printf("Input matrix file: %s\n", *cpp);
+	printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz);
+	printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+
+	/* Broadcast matrix A to the other PEs: sizes first, then the arrays. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    } else {
+	/* Receive matrix A from PE 0; the sizes are needed before allocating. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+
+	/* Allocate storage for compressed column representation. */
+	dallocateA_dist(n, nnz, &a, &asub, &xa);
+
+	MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    }
+	
+    /* Create compressed column matrix for A. */
+    dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+				SLU_NC, SLU_D, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if ( !(b = doubleMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b[]");
+    if ( !(b1 = doubleMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]");
+    if ( !(xtrue = doubleMalloc_dist(n*nrhs)) ) ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+    ldx = n;
+    ldb = m;
+    dGenXtrue_dist(n, nrhs, xtrue, ldx);
+    dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+    for (j = 0; j < nrhs; ++j)  /* keep a copy of b in b1 for the second solve */
+	for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb];
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver: factorize and solve. */
+    pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution (process 0 only; b is compared with xtrue). */
+    if ( !iam ) {
+	dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
+    }
+
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+    PStatFree(&stat);
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
+       RIGHT-HAND SIDE,  WE WILL USE THE EXISTING L AND U FACTORS IN
+       LUSTRUCT OBTAINED FROM A PREVIOUS FACTORIZATION.
+       ------------------------------------------------------------*/
+    options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+	printf("Solve the system with a different B.\n");
+	dinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid);
+    }
+
+    /* Print the statistics. */
+    PStatPrint(&options, &stat, &grid);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A);
+    Destroy_LU(n, &grid, &LUstruct);
+    ScalePermstructFree(&ScalePermstruct);
+    LUstructFree(&LUstruct);
+    SUPERLU_FREE(b);
+    SUPERLU_FREE(b1);
+    SUPERLU_FREE(xtrue);
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       (Processes outside the grid jump straight here.)
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+
+int cpp_defs()  /* Print the compile-time CPP settings to stdout; always returns 0. */
+{   /* Each macro below is printed only when its #if test (value >= 1) holds. */
+    printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+    printf("....\n");  /* closing marker of the listing */
+    return 0;
+}
diff --git a/EXAMPLE/pddrive2.c b/EXAMPLE/pddrive2.c
new file mode 100644
index 0000000..0cf3191
--- /dev/null
+++ b/EXAMPLE/pddrive2.c
@@ -0,0 +1,273 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Driver program for PDGSSVX example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.1.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * December 31, 2016 version 5.1.3
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE2.
+ *
+ * This example illustrates how to use PDGSSVX to solve
+ * systems repeatedly with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PDGSSVX:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n <np> pddrive2 -r <proc rows> -c <proc columns> g20.rua
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{   /* Solve two systems with pdgssvx; the second run sets Fact = SamePattern on a perturbed matrix. */
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    NRformat_loc *Astore;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    double   *berr;
+    double   *b, *b1, *xtrue, *xtrue1;
+    int_t    *colind, *colind1, *rowptr, *rowptr1;
+    int_t    i, j, m, n, nnz_loc, m_loc;
+    int      nprow, npcol;
+    int      iam, info, ldb, ldx, nrhs;
+    char     **cpp, c;
+    FILE *fp, *fopen();  /* NOTE(review): fp is assigned only if a file name argument is parsed below */
+
+    /* prototypes */
+    extern int dcreate_matrix_perturbed
+        (SuperMatrix *, int, double **, int *, double **, int *,
+         FILE *, gridinfo_t *);
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]: "-r <n>", "-c <n>", "-h", then the matrix file name. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp;  /* step to the option's value; NOTE(review): not checked for NULL */
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default %4d)\n", nprow);
+		  printf("\t-c <int>: process columns (default %4d)\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;  /* cpp keeps pointing at the file name; it is printed and reopened later */
+	}
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )	goto out;
+    if ( !iam ) {
+	printf("Input matrix file: %s\n", *cpp);
+        printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+    }
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT-HAND SIDE.
+       ------------------------------------------------------------*/
+    dcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+    m = A.nrow;
+    n = A.ncol;
+    Astore = (NRformat_loc *) A.Store;
+    m_loc = Astore->m_loc;  /* saved here and reused for both accuracy checks */
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver: factorize and solve. */
+    pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution (called on every process in the grid). */
+    pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid);
+    
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A.  */
+    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with 
+					the L and U matrices.               */
+    SUPERLU_FREE(b);                 /* Free storage of right-hand side.    */
+    SUPERLU_FREE(xtrue);             /* Free storage of the exact solution. */
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER LINEAR SYSTEM.
+       ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME.
+       ------------------------------------------------------------*/
+    options.Fact = SamePattern;
+
+    if (iam==0) {
+	print_options_dist(&options);
+#if ( PRNTlevel>=2 )
+	PrintInt10("perm_r", m, ScalePermstruct.perm_r);
+	PrintInt10("perm_c", n, ScalePermstruct.perm_c);
+#endif
+    }
+
+    /* Get the matrix from file, with some diagonal entries perturbed to force
+       a different perm_r[]. Set up the right-hand side.   */
+    if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist");
+    dcreate_matrix_perturbed(&A, nrhs, &b1, &ldb, &xtrue1, &ldx, fp, &grid);
+
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    /* Solve the linear system. */
+    pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
+    pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, &grid);
+
+#if ( PRNTlevel>=2 )
+    if (iam==0) {
+	PrintInt10("new perm_r", m, ScalePermstruct.perm_r);
+	PrintInt10("new perm_c", n, ScalePermstruct.perm_c);
+    }
+#endif
+    /* Print the statistics. */
+    PStatPrint(&options, &stat, &grid);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A.  */
+    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with    
+					the L and U matrices.               */
+    ScalePermstructFree(&ScalePermstruct);
+    LUstructFree(&LUstruct);         /* Deallocate the structure of L and U.*/
+    if ( options.SolveInitialized ) {
+        dSolveFinalize(&options, &SOLVEstruct);
+    }
+    SUPERLU_FREE(b1);	             /* Free storage of right-hand side.    */
+    SUPERLU_FREE(xtrue1);             /* Free storage of the exact solution. */
+    SUPERLU_FREE(berr);
+
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       (Processes outside the grid jump straight here.)
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+
+int cpp_defs()  /* Print the compile-time CPP settings to stdout; always returns 0. */
+{   /* Each macro below is printed only when its #if test (value >= 1) holds. */
+    printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+    printf("....\n");  /* closing marker of the listing */
+    return 0;
+}
+
+
diff --git a/EXAMPLE/pddrive2_ABglobal.c b/EXAMPLE/pddrive2_ABglobal.c
new file mode 100644
index 0000000..28c943b
--- /dev/null
+++ b/EXAMPLE/pddrive2_ABglobal.c
@@ -0,0 +1,305 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Driver program for pdgssvx_ABglobal example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program pddrive2_ABglobal.
+ *
+ * This example illustrates how to use pdgssvx_ABglobal to solve
+ * systems repeatedly with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once.  The following data structures will be reused in the
+ * subsequent call to pdgssvx_ABglobal:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *
+ * On an IBM SP, the program may be run by typing:
+ *    poe pddrive2_ABglobal -r <proc rows> -c <proc columns> <input_matrix> -procs <p>
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{   /* Solve two systems with pdgssvx_ABglobal; the second run sets Fact = SamePattern on a saved copy of A. */
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    gridinfo_t grid;
+    double   *berr;
+    double   *a, *a1, *b, *b1, *xtrue;
+    int_t    *asub, *asub1, *xa, *xa1;
+    int_t    i, j, m, n, nnz;
+    int_t    nprow, npcol;  /* int_t, so they must be printed with IFMT rather than %d */
+    int      iam, info, ldb, ldx, nrhs;
+    char     trans[1];
+    char     **cpp, c;
+    FILE *fp, *fopen();  /* NOTE(review): fp is assigned only if a file name argument is parsed below */
+    extern int cpp_defs();
+
+    /* prototypes */
+    extern void LUstructInit(const int_t, LUstruct_t *);
+    extern void LUstructFree(LUstruct_t *);
+    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]: "-r <n>", "-c <n>", "-h", then the matrix file name. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp;  /* step to the option's value; NOTE(review): not checked for NULL */
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default " IFMT ")\n", nprow);
+		  printf("\t-c <int>: process columns (default " IFMT ")\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;  /* cpp keeps pointing at the file name, which is printed later */
+	}
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )
+	goto out;
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       Process 0 reads the matrix A, and then broadcasts it to all
+       the other processes.
+       ------------------------------------------------------------*/
+    if ( !iam ) {
+	/* Print the CPP definitions. */
+	cpp_defs();
+	
+	/* Read the matrix stored on disk in Harwell-Boeing format. */
+	dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+	
+	printf("Input matrix file: %s\n", *cpp);
+	printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz);
+	printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+
+	/* Broadcast matrix A to the other PEs: sizes first, then the arrays. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    } else {
+	/* Receive matrix A from PE 0; the sizes are needed before allocating. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+
+	/* Allocate storage for compressed column representation. */
+	dallocateA_dist(n, nnz, &a, &asub, &xa);
+
+	MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    }
+	
+    /* Create compressed column matrix for A. */
+    dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+				SLU_NC, SLU_D, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if (!(b=doubleMalloc_dist(m * nrhs))) ABORT("Malloc fails for b[]");
+    if (!(xtrue=doubleMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+    ldx = n;
+    ldb = m;
+    dGenXtrue_dist(n, nrhs, xtrue, ldx);
+    dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+
+    /* Save a copy of the right-hand side; b1 is passed to the second solve. */
+    if ( !(b1 = doubleMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]");
+    for (j = 0; j < nrhs; ++j)
+	for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb];
+    
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+
+    /* Save a copy of the matrix A; the second solve rebuilds A from a1/asub1/xa1. */
+    dallocateA_dist(n, nnz, &a1, &asub1, &xa1);
+    for (i = 0; i < nnz; ++i) { a1[i] = a[i]; asub1[i] = asub[i]; }
+    for (i = 0; i < n+1; ++i) xa1[i] = xa[i];
+
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver: factorize and solve. */
+    pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution (process 0 only; b is compared with xtrue). */
+    if ( !iam ) {
+	dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
+    }
+    
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A.     */
+    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with 
+					the L and U matrices.               */
+    SUPERLU_FREE(b);                 /* Free storage of right-hand side.    */
+
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER LINEAR SYSTEM.
+       ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME.
+       ------------------------------------------------------------*/
+    options.Fact = SamePattern;
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    /* Create compressed column matrix for A from the saved copy. */
+    dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1,
+				SLU_NC, SLU_D, SLU_GE);
+
+    /* Solve the linear system. */
+    pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+	printf("Solve the system with the same sparsity pattern.\n");
+	dinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid);
+    }
+
+    /* Print the statistics. */
+    PStatPrint(&options, &stat, &grid);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A.     */
+    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with    
+					the L and U matrices.               */
+    ScalePermstructFree(&ScalePermstruct);
+    LUstructFree(&LUstruct);         /* Deallocate the structure of L and U.*/
+    SUPERLU_FREE(b1);	             /* Free storage of right-hand side.    */
+    SUPERLU_FREE(xtrue);             /* Free storage of the exact solution. */
+    SUPERLU_FREE(berr);
+
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       (Processes outside the grid jump straight here.)
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+
+int cpp_defs()  /* Print the compile-time CPP settings to stdout; always returns 0. */
+{   /* Each macro below is printed only when its #if test (value >= 1) holds. */
+    printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+    printf("....\n");  /* closing marker of the listing */
+    return 0;
+}
+
+
diff --git a/EXAMPLE/pddrive3.c b/EXAMPLE/pddrive3.c
new file mode 100644
index 0000000..e591f39
--- /dev/null
+++ b/EXAMPLE/pddrive3.c
@@ -0,0 +1,277 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Driver program for PDGSSVX example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3.
+ *
+ * This example illustrates how to use PDGSSVX to solve
+ * systems repeatedly with the same sparsity pattern and similar
+ * numerical values of matrix A.
+ * In this case, the column permutation vector and symbolic factorization are
+ * computed only once. The following data structures will be reused in the
+ * subsequent call to PDGSSVX:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PDGSSVX, and reused in the subsequent call.
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n <np> pddrive3 -r <proc rows> -c <proc columns> big.rua
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    NRformat_loc *Astore;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    double   *berr;
+    double   *b, *b1, *xtrue, *nzval, *nzval1;
+    int_t    *colind, *colind1, *rowptr, *rowptr1;
+    int_t    i, j, m, n, nnz_loc, m_loc, fst_row;
+    int      nprow, npcol;
+    int      iam, info, ldb, ldx, nrhs;
+    char     **cpp, c;
+    FILE     *fp = NULL; /* Stays NULL until an input file is opened. */
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp; /* Now at the option's value, or at argv's NULL terminator. */
+	    if ( !*cpp && c != 'h' ) ABORT("Option is missing its value");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default %d)\n", nprow);
+		  printf("\t-c <int>: process columns (default %d)\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;
+	}
+    }
+    if ( !fp ) ABORT("No input matrix file was given");
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )	goto out;
+    if ( !iam ) {
+	printf("Input matrix file: %s\n", *cpp);
+        printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+       ------------------------------------------------------------*/
+    dcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+
+    if ( !(b1 = doubleMalloc_dist(ldb * nrhs)) )
+        ABORT("Malloc fails for b1[]");
+    for (j = 0; j < nrhs; ++j)
+        for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb];
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+    m = A.nrow;
+    n = A.ncol;
+
+    /* Save a copy of the matrix A; the original is destroyed after solve 1. */
+    Astore = (NRformat_loc *) A.Store;
+    nnz_loc = Astore->nnz_loc;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    nzval = Astore->nzval;
+    colind = Astore->colind;
+    rowptr = Astore->rowptr;
+    nzval1 = doubleMalloc_dist(nnz_loc);
+    colind1 = intMalloc_dist(nnz_loc);
+    rowptr1 = intMalloc_dist(m_loc+1);
+    if ( !nzval1 || !colind1 || !rowptr1 ) ABORT("Malloc fails for copy of A");
+    for (i = 0; i < nnz_loc; ++i) {
+        nzval1[i] = nzval[i];
+        colind1[i] = colind[i];
+    }
+    for (i = 0; i < m_loc+1; ++i) rowptr1[i] = rowptr[i];
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver: factorize and solve. */
+    pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid);
+
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A.  */
+    SUPERLU_FREE(b);                 /* Free storage of right-hand side.    */
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER LINEAR SYSTEM.
+       THE MATRIX A HAS THE SAME SPARSITY PATTERN AND SIMILAR
+       NUMERICAL VALUES AS THE PREVIOUS SYSTEM.
+       ------------------------------------------------------------*/
+    options.Fact = SamePattern_SameRowPerm;
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    /* Set up the local A in NR_loc format, from the saved copy. */
+    dCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row,
+				   nzval1, colind1, rowptr1,
+				   SLU_NR_loc, SLU_D, SLU_GE);
+
+    /* Solve the linear system. */
+    pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam )
+        printf("Solve a system with the same pattern and similar values.\n");
+    pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, &grid);
+
+    /* Print the statistics. */
+    PStatPrint(&options, &stat, &grid);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A.  */
+    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with
+					the L and U matrices.               */
+    ScalePermstructFree(&ScalePermstruct);
+    LUstructFree(&LUstruct);         /* Deallocate the structure of L and U.*/
+    if ( options.SolveInitialized ) {
+        dSolveFinalize(&options, &SOLVEstruct);
+    }
+    SUPERLU_FREE(b1);	             /* Free storage of right-hand side.    */
+    SUPERLU_FREE(xtrue);             /* Free storage of the exact solution. */
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+    return 0;
+}
+
+
+int cpp_defs()
+{   /* List each of the four macros below whose value is >= 1; returns 0. */
+    fputs(".. CPP definitions:\n", stdout);
+#if PRNTlevel >= 1
+    fprintf(stdout, "\tPRNTlevel = %d\n", PRNTlevel);
+#endif /* PRNTlevel */
+#if DEBUGlevel >= 1
+    fprintf(stdout, "\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif /* DEBUGlevel */
+#if PROFlevel >= 1
+    fprintf(stdout, "\tPROFlevel = %d\n", PROFlevel);
+#endif /* PROFlevel */
+#if StaticPivot >= 1
+    fprintf(stdout, "\tStaticPivot = %d\n", StaticPivot);
+#endif /* StaticPivot */
+    fputs("....\n", stdout);
+    return 0;
+}
+
+
diff --git a/EXAMPLE/pddrive3_ABglobal.c b/EXAMPLE/pddrive3_ABglobal.c
new file mode 100644
index 0000000..8455456
--- /dev/null
+++ b/EXAMPLE/pddrive3_ABglobal.c
@@ -0,0 +1,310 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Driver program for pdgssvx_ABglobal example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program pddrive3_ABglobal.
+ *
+ * This example illustrates how to use pdgssvx_ABglobal to solve
+ * systems repeatedly with the same sparsity pattern and similar
+ * numerical values of matrix A.
+ * In this case, the column permutation vector and symbolic factorization are
+ * computed only once. The following data structures will be reused in the
+ * subsequent call to pdgssvx_ABglobal:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to pdgssvx_ABglobal, and reused in the subsequent call.
+ *
+ * On an IBM SP, the program may be run by typing:
+ *    poe pddrive3_ABglobal -r <proc rows> -c <proc columns> <input_matrix>  -procs <p>
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    gridinfo_t grid;
+    double   *berr;
+    double   *a, *a1, *b, *b1, *xtrue;
+    int_t    *asub, *asub1, *xa, *xa1;
+    int_t    i, j, m, n, nnz;
+    int      nprow, npcol; /* Plain int so the %d in the -h text matches. */
+    int      iam, info, ldb, ldx, nrhs;
+    char     trans[1];
+    char     **cpp, c;
+    FILE     *fp = NULL;   /* Stays NULL until an input file is opened.   */
+    extern int cpp_defs();
+
+    /* prototypes */
+    extern void LUstructInit(const int_t, LUstruct_t *);
+    extern void LUstructFree(LUstruct_t *);
+    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp; /* Now at the option's value, or at argv's NULL terminator. */
+	    if ( !*cpp && c != 'h' ) ABORT("Option is missing its value");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default %d)\n", nprow);
+		  printf("\t-c <int>: process columns (default %d)\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;
+	}
+    }
+    if ( !fp ) ABORT("No input matrix file was given");
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )
+	goto out;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
+       THE OTHER PROCESSES.
+       ------------------------------------------------------------*/
+    if ( !iam ) {
+	/* Print the CPP definitions. */
+	cpp_defs();
+
+	/* Read the matrix stored on disk in Harwell-Boeing format. */
+	dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+
+	printf("Input matrix file: %s\n", *cpp);
+	printf("\tDimension\t%ldx%ld\t # nonzeros %ld\n",
+	       (long) m, (long) n, (long) nnz); /* int_t need not be int. */
+	printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+
+	/* Broadcast matrix A to the other PEs. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    } else {
+	/* Receive matrix A from PE 0. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+
+	/* Allocate storage for compressed column representation. */
+	dallocateA_dist(n, nnz, &a, &asub, &xa);
+
+	MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    }
+
+    /* Create compressed column matrix for A. */
+    dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+				SLU_NC, SLU_D, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if (!(b=doubleMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
+    if (!(xtrue=doubleMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+    ldx = n;
+    ldb = m;
+    dGenXtrue_dist(n, nrhs, xtrue, ldx);
+    dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+
+    /* Save a copy of the right-hand side. */
+    if ( !(b1 = doubleMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]");
+    for (j = 0; j < nrhs; ++j)
+	for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb];
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+
+    /* Save a copy of the matrix A; the original is destroyed after solve 1. */
+    dallocateA_dist(n, nnz, &a1, &asub1, &xa1);
+    for (i = 0; i < nnz; ++i) { a1[i] = a[i]; asub1[i] = asub[i]; }
+    for (i = 0; i < n+1; ++i) xa1[i] = xa[i];
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver: factorize and solve. */
+    pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+	dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
+    }
+
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A.     */
+    SUPERLU_FREE(b);                 /* Free storage of right-hand side.    */
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER LINEAR SYSTEM.
+       THE MATRIX A HAS THE SAME SPARSITY PATTERN AND SIMILAR
+       NUMERICAL VALUES AS THE PREVIOUS SYSTEM.
+       ------------------------------------------------------------*/
+    options.Fact = SamePattern_SameRowPerm;
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    /* Create compressed column matrix for A, from the saved copy. */
+    dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1,
+				SLU_NC, SLU_D, SLU_GE);
+
+    /* Solve the linear system. */
+    pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+	printf("Solve a system with the same pattern and similar values.\n");
+	dinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid);
+    }
+
+    /* Print the statistics. */
+    PStatPrint(&options, &stat, &grid);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A.     */
+    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with
+					the L and U matrices.               */
+    ScalePermstructFree(&ScalePermstruct);
+    LUstructFree(&LUstruct);         /* Deallocate the structure of L and U.*/
+    SUPERLU_FREE(b1);	             /* Free storage of right-hand side.    */
+    SUPERLU_FREE(xtrue);             /* Free storage of the exact solution. */
+    SUPERLU_FREE(berr);
+
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+    return 0;
+}
+
+
+int cpp_defs()
+{   /* List each of the four macros below whose value is >= 1; returns 0. */
+    fputs(".. CPP definitions:\n", stdout);
+#if PRNTlevel >= 1
+    fprintf(stdout, "\tPRNTlevel = %d\n", PRNTlevel);
+#endif /* PRNTlevel */
+#if DEBUGlevel >= 1
+    fprintf(stdout, "\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif /* DEBUGlevel */
+#if PROFlevel >= 1
+    fprintf(stdout, "\tPROFlevel = %d\n", PROFlevel);
+#endif /* PROFlevel */
+#if StaticPivot >= 1
+    fprintf(stdout, "\tStaticPivot = %d\n", StaticPivot);
+#endif /* StaticPivot */
+    fputs("....\n", stdout);
+    return 0;
+}
diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c
new file mode 100644
index 0000000..d0192ec
--- /dev/null
+++ b/EXAMPLE/pddrive4.c
@@ -0,0 +1,288 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief This example illustrates how to divide up the processes into subgroups
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE4.
+ *
+ * This example illustrates how to divide up the processes into
+ * subgroups (multiple grids) such that each subgroup solves a linear
+ * system independently from the other.
+ *
+ * In this example, there are 2 subgroups:
+ *  1. subgroup 1 consists of processes 0 to 5 arranged as
+ *     a 2-by-3 process grid.
+ *  2. subgroup 2 consists of processes 6 to 9 arranged as
+ *     a 2-by-2 process grid.
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n 10 pddrive4 big.rua
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid1, grid2;
+    double   *berr;
+    double   *b, *xtrue;
+    int_t    i, j, m, n;
+    int      nprow, npcol, ldumap, p;
+    int_t    usermap[6];
+    int      iam, info, ldb, ldx, nprocs;
+    int      nrhs = 1;   /* Number of right-hand side. */
+    char     **cpp, c;
+    FILE     *fp = NULL; /* Stays NULL until an input file is opened. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+    MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
+    if ( nprocs < 10 ) {
+	fprintf(stderr, "Requires at least 10 processes\n");
+	MPI_Finalize(); /* Every rank takes this branch, so shut MPI down. */
+	exit(-1);
+    }
+
+    /* Parse command line argv[]. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp; /* Now at the option's value, or at argv's NULL terminator. */
+	    if ( !*cpp && c != 'h' ) ABORT("Option is missing its value");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (ignored: fixed grids)\n");
+		  printf("\t-c <int>: process columns (ignored: fixed grids)\n");
+		  exit(0);
+		  break;
+	      /* -r/-c are accepted, but the fixed grid shapes below win. */
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;
+	}
+    }
+    if ( !fp ) ABORT("No input matrix file was given");
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID 1.
+       ------------------------------------------------------------*/
+    nprow = 2;
+    npcol = 3;
+    ldumap = 2;
+    p = 0;    /* Grid 1 starts from process 0. */
+    for (i = 0; i < nprow; ++i)
+	for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
+    superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid1);
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID 2.
+       ------------------------------------------------------------*/
+    nprow = 2;
+    npcol = 2;
+    ldumap = 2;
+    p = 6;   /* Grid 2 starts from process 6. */
+    for (i = 0; i < nprow; ++i)
+	for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
+    superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid2);
+
+    /* Bail out if I do not belong in any of the 2 grids. */
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    if ( iam >= 10 ) goto out;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    if ( iam >= 0 && iam < 6 ) { /* I am in grid 1. */
+	iam = grid1.iam;  /* Get the logical number in the new grid. */
+
+        /* ------------------------------------------------------------
+           GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+           ------------------------------------------------------------*/
+        dcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid1);
+
+	if ( !(berr = doubleMalloc_dist(nrhs)) )
+	    ABORT("Malloc fails for berr[].");
+
+	/* ------------------------------------------------------------
+	   NOW WE SOLVE THE LINEAR SYSTEM.
+	   ------------------------------------------------------------*/
+        /* Set the default input options:
+            options.Fact = DOFACT;
+            options.Equil = YES;
+            options.ColPerm = METIS_AT_PLUS_A;
+            options.RowPerm = LargeDiag;
+            options.ReplaceTinyPivot = YES;
+            options.Trans = NOTRANS;
+            options.IterRefine = DOUBLE;
+            options.SolveInitialized = NO;
+            options.RefineInitialized = NO;
+            options.PrintStat = YES;
+         */
+	set_default_options_dist(&options);
+
+        if (!iam) {
+	    print_sp_ienv_dist(&options);
+    	    print_options_dist(&options);
+        }
+
+        m = A.nrow;
+        n = A.ncol;
+
+	/* Initialize ScalePermstruct and LUstruct. */
+	ScalePermstructInit(m, n, &ScalePermstruct);
+	LUstructInit(n, &LUstruct);
+
+	/* Initialize the statistics variables. */
+	PStatInit(&stat);
+
+	/* Call the linear equation solver. */
+	pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1,
+                &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+        /* Check the accuracy of the solution. */
+        pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+                         nrhs, b, ldb, xtrue, ldx, &grid1);
+
+	/* Print the statistics. */
+	PStatPrint(&options, &stat, &grid1);
+
+	/* ------------------------------------------------------------
+	   DEALLOCATE STORAGE.
+	   ------------------------------------------------------------*/
+	PStatFree(&stat);
+        Destroy_CompRowLoc_Matrix_dist(&A);
+        ScalePermstructFree(&ScalePermstruct);
+	Destroy_LU(n, &grid1, &LUstruct);
+	LUstructFree(&LUstruct);
+        if ( options.SolveInitialized ) {
+            dSolveFinalize(&options, &SOLVEstruct);
+        }
+	SUPERLU_FREE(b);
+	SUPERLU_FREE(xtrue);
+	SUPERLU_FREE(berr);
+
+    } else { /* I am in grid 2. */
+	iam = grid2.iam;  /* Get the logical number in the new grid. */
+
+        /* ------------------------------------------------------------
+           GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+           ------------------------------------------------------------*/
+        dcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid2);
+
+	if ( !(berr = doubleMalloc_dist(nrhs)) )
+	    ABORT("Malloc fails for berr[].");
+
+	/* ------------------------------------------------------------
+	   NOW WE SOLVE THE LINEAR SYSTEM.
+	   ------------------------------------------------------------*/
+        /* Set the default input options:
+            options.Fact = DOFACT;
+            options.Equil = YES;
+            options.ColPerm = MMD_AT_PLUS_A;
+            options.RowPerm = LargeDiag;
+            options.ReplaceTinyPivot = YES;
+            options.Trans = NOTRANS;
+            options.IterRefine = DOUBLE;
+            options.SolveInitialized = NO;
+            options.RefineInitialized = NO;
+            options.PrintStat = YES;
+         */
+	set_default_options_dist(&options);
+
+        m = A.nrow;
+        n = A.ncol;
+
+	/* Initialize ScalePermstruct and LUstruct. */
+	ScalePermstructInit(m, n, &ScalePermstruct);
+	LUstructInit(n, &LUstruct);
+
+	/* Initialize the statistics variables. */
+	PStatInit(&stat);
+
+	/* Call the linear equation solver. */
+	pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2,
+                &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+        /* Check the accuracy of the solution. */
+        pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+                         nrhs, b, ldb, xtrue, ldx, &grid2);
+
+	/* Print the statistics. */
+	PStatPrint(&options, &stat, &grid2);
+
+	/* ------------------------------------------------------------
+	   DEALLOCATE STORAGE.
+	   ------------------------------------------------------------*/
+	PStatFree(&stat);
+        Destroy_CompRowLoc_Matrix_dist(&A);
+        ScalePermstructFree(&ScalePermstruct);
+	Destroy_LU(n, &grid2, &LUstruct);
+	LUstructFree(&LUstruct);
+        if ( options.SolveInitialized ) {
+            dSolveFinalize(&options, &SOLVEstruct);
+        }
+	SUPERLU_FREE(b);
+	SUPERLU_FREE(xtrue);
+	SUPERLU_FREE(berr);
+    }
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRIDS.
+       ------------------------------------------------------------*/
+    superlu_gridexit(&grid1);
+    superlu_gridexit(&grid2);
+
+out:
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+    return 0;
+}
diff --git a/EXAMPLE/pddrive4_ABglobal.c b/EXAMPLE/pddrive4_ABglobal.c
new file mode 100644
index 0000000..34e13ac
--- /dev/null
+++ b/EXAMPLE/pddrive4_ABglobal.c
@@ -0,0 +1,364 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief This example illustrates how to divide up the processes into subgroups
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program pddrive4_ABglobal.
+ *
+ * This example illustrates how to divide up the processes into
+ * subgroups (multiple grids) such that each subgroup solves a linear
+ * system independently from the other.
+ *
+ * In this example, there are 2 subgroups:
+ *  1. subgroup 1 consists of processes 0 to 5 arranged as
+ *     a 2-by-3 process grid.
+ *  2. subgroup 2 consists of processes 6 to 9 arranged as
+ *     a 2-by-2 process grid.
+ *
+ * On an IBM SP, the program may be run by typing
+ *    poe pddrive4_ABglobal <input_file> -procs 10
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    gridinfo_t grid1, grid2;
+    double   *berr;
+    double   *a, *b, *xtrue;
+    int_t    *asub, *xa;
+    int_t    i, j, m, n, nnz;
+    int_t    nprow = 2, npcol = 3, ldumap, p; /* Preset to the grid-1 shape so -h prints defined values. */
+    int_t    usermap[6];  /* Process map; filled for the 2x3 grid, then refilled for the 2x2 grid. */
+    int      iam, info, ldb, ldx, nprocs;
+    int      nrhs = 1;   /* Number of right-hand sides. */
+    char     trans[1];
+    char     **cpp, c;
+    FILE *fp, *fopen();
+
+    /* prototypes */
+    extern void LUstructInit(const int_t, LUstruct_t *);
+    extern void LUstructFree(LUstruct_t *);
+    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+    MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
+    if ( nprocs < 10 ) {
+	fprintf(stderr, "Requires at least 10 processes\n");
+	exit(-1);
+    }
+
+    /* Parse command line argv[]; -r/-c are accepted, but the grid shapes are fixed further down. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp;  /* Step to the option's value. */
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default " IFMT ")\n", nprow);
+		  printf("\t-c <int>: process columns (default " IFMT ")\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;
+	}
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID 1. 
+       ------------------------------------------------------------*/
+    nprow = 2;  /* Fixed 2-by-3 shape; replaces any -r/-c value. */
+    npcol = 3;
+    ldumap = 2;
+    p = 0;    /* Grid 1 starts from process 0. */
+    for (i = 0; i < nprow; ++i)  /* Ranks are handed out row by row, stored with leading dimension ldumap. */
+	for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
+    superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid1);
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID 2. 
+       ------------------------------------------------------------*/
+    nprow = 2;
+    npcol = 2;
+    ldumap = 2;
+    p = 6;   /* Grid 2 starts from process 6. */
+    for (i = 0; i < nprow; ++i)
+	for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
+    superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid2);
+
+    /* Bail out if I do not belong in any of the 2 grids. */
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    if ( iam >= 10 ) goto out;
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    if ( iam >= 0 && iam < 6 ) { /* I am in grid 1. */
+	iam = grid1.iam;  /* Get the logical number in the new grid. */
+
+	/* ------------------------------------------------------------
+	   PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
+	   THE OTHER PROCESSES.
+	   ------------------------------------------------------------*/
+	if ( !iam ) {
+	    /* Read the matrix stored on disk in Harwell-Boeing format. */
+	    dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+	
+	    printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz);
+	    printf("\tProcess grid\t%d X %d\n", (int) grid1.nprow, (int) grid1.npcol);
+
+	    /* Broadcast matrix A to the other PEs. */
+	    MPI_Bcast( &m,   1,   mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( &n,   1,   mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid1.comm );
+	    MPI_Bcast( asub, nnz, mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid1.comm );
+	} else {
+	    /* Receive matrix A from PE 0 (same broadcasts, same order). */
+	    MPI_Bcast( &m,   1,   mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( &n,   1,   mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid1.comm );
+
+	    /* Allocate storage for compressed column representation. */
+	    dallocateA_dist(n, nnz, &a, &asub, &xa);
+	    
+	    MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid1.comm );
+	    MPI_Bcast( asub, nnz, mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid1.comm );
+	}
+	
+	/* Create compressed column matrix for A. */
+	dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+				    SLU_NC, SLU_D, SLU_GE);
+
+	/* Generate the exact solution and compute the right-hand side. */
+	if (!(b=doubleMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
+	if (!(xtrue=doubleMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
+	*trans = 'N';
+	ldx = n;
+	ldb = m;
+	dGenXtrue_dist(n, nrhs, xtrue, ldx);
+	dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+
+	if ( !(berr = doubleMalloc_dist(nrhs)) )
+	    ABORT("Malloc fails for berr[].");
+
+	/* ------------------------------------------------------------
+	   NOW WE SOLVE THE LINEAR SYSTEM.
+	   ------------------------------------------------------------*/
+	
+        /* Set the default input options:
+            options.Fact = DOFACT;
+            options.Equil = YES;
+            options.ColPerm = METIS_AT_PLUS_A;
+            options.RowPerm = LargeDiag;
+            options.ReplaceTinyPivot = YES;
+            options.Trans = NOTRANS;
+            options.IterRefine = DOUBLE;
+            options.SolveInitialized = NO;
+            options.RefineInitialized = NO;
+            options.PrintStat = YES;
+         */
+	set_default_options_dist(&options);
+
+        if (!iam) {
+	    print_sp_ienv_dist(&options);
+	    print_options_dist(&options);
+        }
+
+	/* Initialize ScalePermstruct and LUstruct. */
+	ScalePermstructInit(m, n, &ScalePermstruct);
+	LUstructInit(n, &LUstruct);
+
+	/* Initialize the statistics variables. */
+	PStatInit(&stat);
+	
+	/* Call the linear equation solver: factorize and solve. */
+	pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1,
+			 &LUstruct, berr, &stat, &info);
+
+	/* Check the accuracy of the solution. */
+	if ( !iam ) {
+	    dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid1);
+	}
+    
+    
+	/* Print the statistics. */
+	PStatPrint(&options, &stat, &grid1);
+
+	/* ------------------------------------------------------------
+	   DEALLOCATE STORAGE.
+	   ------------------------------------------------------------*/
+	PStatFree(&stat);
+	Destroy_CompCol_Matrix_dist(&A); 
+	Destroy_LU(n, &grid1, &LUstruct);
+	ScalePermstructFree(&ScalePermstruct);
+	LUstructFree(&LUstruct);
+	SUPERLU_FREE(b);
+	SUPERLU_FREE(xtrue);
+	SUPERLU_FREE(berr);
+
+    } else { /* I am in grid 2. */
+	iam = grid2.iam;  /* Get the logical number in the new grid. */
+
+	/* ------------------------------------------------------------
+	   PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
+	   THE OTHER PROCESSES.
+	   ------------------------------------------------------------*/
+	if ( !iam ) {
+	    /* Read the matrix stored on disk in Harwell-Boeing format. */
+	    dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+	
+	    printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz);
+	    printf("\tProcess grid\t%d X %d\n", (int) grid2.nprow, (int) grid2.npcol);
+
+	    /* Broadcast matrix A to the other PEs. */
+	    MPI_Bcast( &m,   1,   mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( &n,   1,   mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid2.comm );
+	    MPI_Bcast( asub, nnz, mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid2.comm );
+	} else {
+	    /* Receive matrix A from PE 0 (same broadcasts, same order). */
+	    MPI_Bcast( &m,   1,   mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( &n,   1,   mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid2.comm );
+
+	    /* Allocate storage for compressed column representation. */
+	    dallocateA_dist(n, nnz, &a, &asub, &xa);
+	    
+	    MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid2.comm );
+	    MPI_Bcast( asub, nnz, mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid2.comm );
+	}
+	
+	/* Create compressed column matrix for A. */
+	dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+				    SLU_NC, SLU_D, SLU_GE);
+
+	/* Generate the exact solution and compute the right-hand side. */
+	if (!(b=doubleMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
+	if (!(xtrue=doubleMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
+	*trans = 'N';
+	ldx = n;
+	ldb = m;
+	dGenXtrue_dist(n, nrhs, xtrue, ldx);
+	dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+
+	if ( !(berr = doubleMalloc_dist(nrhs)) )
+	    ABORT("Malloc fails for berr[].");
+
+	/* ------------------------------------------------------------
+	   NOW WE SOLVE THE LINEAR SYSTEM.
+	   ------------------------------------------------------------*/
+	
+        /* Set the default input options:
+            options.Fact = DOFACT;
+            options.Equil = YES;
+            options.ColPerm = MMD_AT_PLUS_A;
+            options.RowPerm = LargeDiag;
+            options.ReplaceTinyPivot = YES;
+            options.Trans = NOTRANS;
+            options.IterRefine = DOUBLE;
+            options.SolveInitialized = NO;
+            options.RefineInitialized = NO;
+            options.PrintStat = YES;
+         */
+	set_default_options_dist(&options);
+	
+	/* Initialize ScalePermstruct and LUstruct. */
+	ScalePermstructInit(m, n, &ScalePermstruct);
+	LUstructInit(n, &LUstruct);
+
+	/* Initialize the statistics variables. */
+	PStatInit(&stat);
+	
+	/* Call the linear equation solver: factorize and solve. */
+	pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2,
+			 &LUstruct, berr, &stat, &info);
+
+	/* Check the accuracy of the solution. */
+	if ( !iam ) {
+	    dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid2);
+	}
+    
+    
+	/* Print the statistics. */
+	PStatPrint(&options, &stat, &grid2);
+
+	/* ------------------------------------------------------------
+	   DEALLOCATE STORAGE.
+	   ------------------------------------------------------------*/
+	PStatFree(&stat);
+	Destroy_CompCol_Matrix_dist(&A); 
+	Destroy_LU(n, &grid2, &LUstruct);
+	ScalePermstructFree(&ScalePermstruct);
+	LUstructFree(&LUstruct);
+	SUPERLU_FREE(b);
+	SUPERLU_FREE(xtrue);
+	SUPERLU_FREE(berr);
+    }
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRIDS.
+       ------------------------------------------------------------*/
+    superlu_gridexit(&grid1);
+    superlu_gridexit(&grid2);
+
+out:  /* Processes with rank >= 10 jump here, skipping the grid release above. */
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
diff --git a/EXAMPLE/pddrive_ABglobal.c b/EXAMPLE/pddrive_ABglobal.c
new file mode 100644
index 0000000..cfb9349
--- /dev/null
+++ b/EXAMPLE/pddrive_ABglobal.c
@@ -0,0 +1,264 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Driver program for pdgssvx_ABglobal example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program pddrive_ABglobal.
+ *
+ * This example illustrates how to use pdgssvx_ABglobal with the full
+ * (default) options to solve a linear system.
+ * 
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call pdgssvx_ABglobal
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * On an IBM SP, the program may be run by typing
+ *    poe pddrive_ABglobal -r <proc rows> -c <proc columns> <input_file> -procs <p>
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    gridinfo_t grid;
+    double   *berr;
+    double   *a, *b, *xtrue;
+    int_t    *asub, *xa;
+    int_t    m, n, nnz;
+    int_t    nprow, npcol;
+    int      iam, info, ldb, ldx, nrhs;
+    char     trans[1];
+    char     **cpp, c;
+    FILE *fp, *fopen();
+    extern int cpp_defs();
+
+    /* prototypes */
+    extern void LUstructInit(const int_t, LUstruct_t *);
+    extern void LUstructFree(LUstruct_t *);
+    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand sides. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]: -h, -r <int>, -c <int>, then the matrix file name. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp;  /* Step to the option's value. */
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default " IFMT ")\n", nprow);
+		  printf("\t-c <int>: process columns (default " IFMT ")\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;  /* cpp keeps pointing at the file name, which is printed later. */
+	}
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID. 
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )
+	goto out;
+
+#if ( VAMPIR>=1 )
+    VT_traceoff();
+#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+    
+    /* ------------------------------------------------------------
+       PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
+       THE OTHER PROCESSES.
+       ------------------------------------------------------------*/
+    if ( !iam ) {
+	/* Print the CPP definitions. */
+	cpp_defs();
+	
+	/* Read the matrix stored on disk in Harwell-Boeing format. */
+	dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+	
+	printf("Input matrix file: %s\n", *cpp);
+	printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz);
+	printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+
+	/* Broadcast matrix A to the other PEs: sizes first, then the three arrays. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    } else {
+	/* Receive matrix A from PE 0 (same broadcasts, same order). */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+
+	/* Allocate storage for compressed column representation, using the sizes just received. */
+	dallocateA_dist(n, nnz, &a, &asub, &xa);
+
+	MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    }
+	
+    /* Create compressed column matrix for A. */
+    dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+				SLU_NC, SLU_D, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if (!(b=doubleMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
+    if (!(xtrue=doubleMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+    ldx = n;  /* Passed along with xtrue. */
+    ldb = m;  /* Passed along with b. */
+    dGenXtrue_dist(n, nrhs, xtrue, ldx);
+    dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE THE LINEAR SYSTEM.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {  /* Only process 0 prints the settings. */
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver. */
+    pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution (process 0 only). */
+    if ( !iam ) {
+	dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
+    }
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A);
+    Destroy_LU(n, &grid, &LUstruct);
+    ScalePermstructFree(&ScalePermstruct);
+    LUstructFree(&LUstruct);
+    SUPERLU_FREE(b);
+    SUPERLU_FREE(xtrue);
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:  /* Processes outside the grid jump here and still release the grid and finalize MPI. */
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+
+int cpp_defs()
+{   /* Write the enabled compile-time CPP switches to stdout; always returns 0. */
+    fputs(".. CPP definitions:\n", stdout);
+#if ( PRNTlevel > 0 )
+    fprintf(stdout, "\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel > 0 )
+    fprintf(stdout, "\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel > 0 )
+    fprintf(stdout, "\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot > 0 )
+    fprintf(stdout, "\tStaticPivot = %d\n", StaticPivot);
+#endif
+    fputs("....\n", stdout);
+    return 0;
+}
diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c
new file mode 100644
index 0000000..33e0a9d
--- /dev/null
+++ b/EXAMPLE/pzdrive.c
@@ -0,0 +1,233 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Driver program for PZGSSVX example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE.
+ *
+ * This example illustrates how to use PZGSSVX with the full
+ * (default) options to solve a linear system.
+ * 
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call pzgssvx
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * With MPICH,  program may be run by typing:
+ *    mpiexec -n <np> pzdrive -r <proc rows> -c <proc columns> big.rua
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    double   *berr;
+    doublecomplex   *b, *xtrue;
+    int    m, n;
+    int      nprow, npcol;
+    int      iam, info, ldb, ldx, nrhs;
+    char     **cpp, c;
+    FILE *fp, *fopen();
+    extern int cpp_defs();
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand sides. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]: -h, -r <int>, -c <int>, then the matrix file name. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp;  /* Step to the option's value. */
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default %4d)\n", nprow);
+		  printf("\t-c <int>: process columns (default %4d)\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;  /* cpp keeps pointing at the file name, which is printed later. */
+	}
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID. 
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )	goto out;
+    if ( !iam ) {  /* Only process 0 reports the run configuration. */
+	printf("Input matrix file: %s\n", *cpp);
+        printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+    }
+
+#if ( VAMPIR>=1 )
+    VT_traceoff();
+#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+       ------------------------------------------------------------*/
+    zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE THE LINEAR SYSTEM.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact              = DOFACT;
+        options.Equil             = YES;
+        options.ParSymbFact       = NO;
+        options.ColPerm           = METIS_AT_PLUS_A;
+        options.RowPerm           = LargeDiag;
+        options.ReplaceTinyPivot  = YES;
+        options.IterRefine        = DOUBLE;
+        options.Trans             = NOTRANS;
+        options.SolveInitialized  = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat         = YES;
+     */
+    set_default_options_dist(&options);
+#if 0 /* Compiled out: alternative option overrides. */
+    options.RowPerm = NOROWPERM;
+    options.IterRefine = NOREFINE;
+    options.ColPerm = NATURAL;
+    options.Equil = NO; 
+    options.ReplaceTinyPivot = NO;
+#endif
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    m = A.nrow;  /* Matrix dimensions as recorded in A by zcreate_matrix. */
+    n = A.ncol;
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver. */
+    pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+
+    /* Check the accuracy of the solution; the row count is read from A's NRformat_loc store. */
+    pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+		     nrhs, b, ldb, xtrue, ldx, &grid);
+
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A);
+    ScalePermstructFree(&ScalePermstruct);
+    Destroy_LU(n, &grid, &LUstruct);
+    LUstructFree(&LUstruct);
+    if ( options.SolveInitialized ) {  /* Finalize SOLVEstruct only when this flag is set. */
+        zSolveFinalize(&options, &SOLVEstruct);
+    }
+    SUPERLU_FREE(b);
+    SUPERLU_FREE(xtrue);
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:  /* Processes outside the grid jump here and still release the grid and finalize MPI. */
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+
+int cpp_defs()
+{   /* Write the enabled compile-time CPP switches to stdout; always returns 0. */
+    fputs(".. CPP definitions:\n", stdout);
+#if ( PRNTlevel > 0 )
+    fprintf(stdout, "\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel > 0 )
+    fprintf(stdout, "\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel > 0 )
+    fprintf(stdout, "\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot > 0 )
+    fprintf(stdout, "\tStaticPivot = %d\n", StaticPivot);
+#endif
+    fputs("....\n", stdout);
+    return 0;
+}
diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c
new file mode 100644
index 0000000..402a133
--- /dev/null
+++ b/EXAMPLE/pzdrive1.c
@@ -0,0 +1,246 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Driver program for PZGSSVX example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE1.
+ *
+ * This example illustrates how to use PZGSSVX to
+ * solve systems with the same A but different right-hand side.
+ * In this case, we factorize A only once in the first call to
+ * PZGSSVX, and reuse the following data structures
+ * in the subsequent call to PZGSSVX:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ * 
+ * With MPICH,  program may be run by typing:
+ *    mpiexec -n <np> pzdrive1 -r <proc rows> -c <proc columns> big.rua
+ * </pre>
+ */
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    double   *berr;
+    doublecomplex   *b, *xtrue, *b1;
+    int    i, j, m, n;
+    int    nprow, npcol;
+    int    iam, info, ldb, ldx, nrhs;
+    char     **cpp, c;
+    FILE *fp, *fopen();
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand sides. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]: -h, -r <int>, -c <int>, then the matrix file name. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp;  /* Step to the option's value. */
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default %d)\n", nprow);
+		  printf("\t-c <int>: process columns (default %d)\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;  /* cpp keeps pointing at the file name, which is printed later. */
+	}
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID. 
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )	goto out;
+    if ( !iam ) {  /* Only process 0 reports the run configuration. */
+	printf("Input matrix file: %s\n", *cpp);
+        printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+    }
+
+#if ( VAMPIR>=1 )
+    VT_traceoff();
+#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+       ------------------------------------------------------------*/
+    zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+    if ( !(b1 = doublecomplexMalloc_dist(ldb * nrhs)) )
+        ABORT("Malloc fails for b1[]");
+    for (j = 0; j < nrhs; ++j)  /* Copy b into b1; b1 is the right-hand side given to the second solve. */
+        for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb];
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    m = A.nrow;  /* Matrix dimensions as recorded in A by zcreate_matrix. */
+    n = A.ncol;
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver. */
+    pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the first system:\n");
+    pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+		     nrhs, b, ldb, xtrue, ldx, &grid);
+
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+    PStatFree(&stat);  /* Released here; re-initialized before the second solve. */
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
+       RIGHT-HAND SIDE.  WE WILL USE THE EXISTING L AND U FACTORS IN
+       LUSTRUCT OBTAINED FROM THE PREVIOUS FACTORIZATION.
+       ------------------------------------------------------------*/
+    options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the system with a different B:\n");
+    pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+		     nrhs, b1, ldb, xtrue, ldx, &grid);
+
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A);
+    ScalePermstructFree(&ScalePermstruct);
+    Destroy_LU(n, &grid, &LUstruct);
+    LUstructFree(&LUstruct);
+    if ( options.SolveInitialized ) {  /* Finalize SOLVEstruct only when this flag is set. */
+        zSolveFinalize(&options, &SOLVEstruct);
+    }
+    SUPERLU_FREE(b);
+    SUPERLU_FREE(b1);
+    SUPERLU_FREE(xtrue);
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:  /* Processes outside the grid jump here and still release the grid and finalize MPI. */
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+
+int cpp_defs()
+{   /* Write the enabled compile-time CPP switches to stdout; always returns 0. */
+    fputs(".. CPP definitions:\n", stdout);
+#if ( PRNTlevel > 0 )
+    fprintf(stdout, "\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel > 0 )
+    fprintf(stdout, "\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel > 0 )
+    fprintf(stdout, "\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot > 0 )
+    fprintf(stdout, "\tStaticPivot = %d\n", StaticPivot);
+#endif
+    fputs("....\n", stdout);
+    return 0;
+}
diff --git a/EXAMPLE/pzdrive1_ABglobal.c b/EXAMPLE/pzdrive1_ABglobal.c
new file mode 100644
index 0000000..f2ee46d
--- /dev/null
+++ b/EXAMPLE/pzdrive1_ABglobal.c
@@ -0,0 +1,284 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Driver program for pzgssvx_ABglobal example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program pzdrive1_ABglobal.
+ *
+ * This example illustrates how to use pzgssvx_ABglobal to
+ * solve systems with the same A but different right-hand side.
+ * In this case, we factorize A only once in the first call to
+ * pzgssvx_ABglobal, and reuse the following data structures
+ * in the subsequent call to pzgssvx_ABglobal:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ * 
+ * On an IBM SP, the program may be run by typing:
+ *    poe pzdrive1_ABglobal -r <proc rows> -c <proc columns> <input_matrix> -procs <p>
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    gridinfo_t grid;
+    double   *berr;
+    doublecomplex   *a, *b, *b1, *xtrue;
+    int_t    *asub, *xa;
+    int_t    i, j, m, n, nnz;
+    int_t    nprow, npcol;
+    int      iam, info, ldb, ldx, nrhs;
+    char     trans[1];
+    char     **cpp, c;
+    FILE *fp = NULL, *fopen();  /* fp stays NULL until a matrix file is opened */
+    extern int cpp_defs();
+
+    /* prototypes */
+    extern void LUstructInit(const int_t, LUstruct_t *);
+    extern void LUstructFree(LUstruct_t *);
+    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse argv[]: every option except -h consumes the next word as its value. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    if ( c != 'h' && !*++cpp ) ABORT("Option requires a value");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default " IFMT ")\n", nprow);
+		  printf("\t-c <int>: process columns (default " IFMT ")\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* First non-option arg is the filename; parsing stops here */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;
+	}
+    }
+    if ( !fp ) ABORT("No input matrix file given");
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol ) goto out;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
+       THE OTHER PROCESSES.
+       ------------------------------------------------------------*/
+    if ( !iam ) {
+	/* Print the CPP definitions. */
+	cpp_defs();
+	
+	/* Read the matrix stored on disk in Harwell-Boeing format. */
+	zreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+	
+	printf("Input matrix file: %s\n", *cpp);
+	printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz);
+	printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+
+	/* Broadcast matrix A to the other PEs. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    } else {
+	/* Receive matrix A from PE 0: the sizes first, then the arrays. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+
+	/* Allocate storage for compressed column representation. */
+	zallocateA_dist(n, nnz, &a, &asub, &xa);
+
+	MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    }
+	
+    /* Create compressed column matrix for A. */
+    zCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+				SLU_NC, SLU_Z, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if ( !(b = doublecomplexMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b[]");
+    if ( !(b1 = doublecomplexMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]");
+    if ( !(xtrue = doublecomplexMalloc_dist(n*nrhs)) ) ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+    ldx = n;
+    ldb = m;
+    zGenXtrue_dist(n, nrhs, xtrue, ldx);
+    zFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+    for (j = 0; j < nrhs; ++j)   /* b1 keeps a copy of b for the second solve */
+	for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb];
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver: factorize and solve. */
+    pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+	zinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
+    }
+
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+    PStatFree(&stat);
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
+       RIGHT-HAND SIDE,  WE WILL USE THE EXISTING L AND U FACTORS IN
+       LUSTRUCT OBTAINED FROM A PREVIOUS FACTORIZATION.
+       ------------------------------------------------------------*/
+    options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+	printf("Solve the system with a different B.\n");
+	zinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid);
+    }
+
+    /* Print the statistics. */
+    PStatPrint(&options, &stat, &grid);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A);
+    Destroy_LU(n, &grid, &LUstruct);
+    ScalePermstructFree(&ScalePermstruct);
+    LUstructFree(&LUstruct);
+    SUPERLU_FREE(b);
+    SUPERLU_FREE(b1);
+    SUPERLU_FREE(xtrue);
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+    return 0;
+}
+
+
+int cpp_defs()  /* Print to stdout each CPP macro below whose value is >= 1. */
+{
+    fputs(".. CPP definitions:\n", stdout);
+#if PRNTlevel >= 1
+    fprintf(stdout, "\tPRNTlevel = %d\n", PRNTlevel);
+#endif /* PRNTlevel */
+#if DEBUGlevel >= 1
+    fprintf(stdout, "\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif /* DEBUGlevel */
+#if PROFlevel >= 1
+    fprintf(stdout, "\tPROFlevel = %d\n", PROFlevel);
+#endif /* PROFlevel */
+#if StaticPivot >= 1
+    fprintf(stdout, "\tStaticPivot = %d\n", StaticPivot);
+#endif /* StaticPivot */
+    fputs("....\n", stdout);
+    return 0;   /* the return value is always 0 */
+}
diff --git a/EXAMPLE/pzdrive2.c b/EXAMPLE/pzdrive2.c
new file mode 100644
index 0000000..b75f6ef
--- /dev/null
+++ b/EXAMPLE/pzdrive2.c
@@ -0,0 +1,272 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Driver program for PZGSSVX example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.1.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * December 31, 2016 version 5.1.3
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE2.
+ *
+ * This example illustrates how to use PZGSSVX to solve
+ * systems repeatedly with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PZGSSVX:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n <np> pzdrive2 -r <proc rows> -c <proc columns> g20.rua
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    NRformat_loc *Astore;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    double   *berr;
+    doublecomplex   *b, *b1, *xtrue, *xtrue1;
+    int_t    *colind, *colind1, *rowptr, *rowptr1;
+    int_t    i, j, m, n, nnz_loc, m_loc;
+    int      nprow, npcol;
+    int      iam, info, ldb, ldx, nrhs;
+    char     **cpp, c;
+    FILE *fp = NULL, *fopen();  /* fp stays NULL until a matrix file is opened */
+
+    /* prototypes */
+    extern int zcreate_matrix_perturbed
+        (SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *,
+         FILE *, gridinfo_t *);
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse argv[]: every option except -h consumes the next word as its value. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    if ( c != 'h' && !*++cpp ) ABORT("Option requires a value");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default %4d)\n", nprow);
+		  printf("\t-c <int>: process columns (default %4d)\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* First non-option arg is the filename; parsing stops here */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;
+	}
+    }
+    if ( !fp ) ABORT("No input matrix file given");
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )	goto out;
+    if ( !iam ) {
+	printf("Input matrix file: %s\n", *cpp);
+        printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT-HAND SIDE.
+       ------------------------------------------------------------*/
+    zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+    m = A.nrow;
+    n = A.ncol;
+    Astore = (NRformat_loc *) A.Store;
+    m_loc = Astore->m_loc;   /* saved now; still used after A is destroyed */
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver: factorize and solve. */
+    pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid);
+
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A.  */
+    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with
+					the L and U matrices.               */
+    SUPERLU_FREE(b);                 /* Free storage of right-hand side.    */
+    SUPERLU_FREE(xtrue);             /* Free storage of the exact solution. */
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER LINEAR SYSTEM.
+       ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME.
+       ------------------------------------------------------------*/
+    options.Fact = SamePattern;
+
+    if (iam==0) {
+	print_options_dist(&options);
+#if ( PRNTlevel>=2 )
+	PrintInt10("perm_r", m, ScalePermstruct.perm_r);
+	PrintInt10("perm_c", n, ScalePermstruct.perm_c);
+#endif
+    }
+
+    /* Get the matrix from file, with some diagonal entries perturbed to force
+       a different perm_r[]. Set up the right-hand side.   */
+    if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist");
+    zcreate_matrix_perturbed(&A, nrhs, &b1, &ldb, &xtrue1, &ldx, fp, &grid);
+
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    /* Solve the linear system. */
+    pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
+    pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, &grid);
+
+#if ( PRNTlevel>=2 )
+    if (iam==0) {
+	PrintInt10("new perm_r", m, ScalePermstruct.perm_r);
+	PrintInt10("new perm_c", n, ScalePermstruct.perm_c);
+    }
+#endif
+    /* Print the statistics. */
+    PStatPrint(&options, &stat, &grid);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A.  */
+    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with
+					the L and U matrices.               */
+    ScalePermstructFree(&ScalePermstruct);
+    LUstructFree(&LUstruct);         /* Deallocate the structure of L and U.*/
+    if ( options.SolveInitialized ) {
+        zSolveFinalize(&options, &SOLVEstruct);
+    }
+    SUPERLU_FREE(b1);	             /* Free storage of right-hand side.    */
+    SUPERLU_FREE(xtrue1);             /* Free storage of the exact solution. */
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+    return 0;
+}
+
+
+int cpp_defs()  /* Print to stdout each CPP macro below whose value is >= 1. */
+{
+    fputs(".. CPP definitions:\n", stdout);
+#if PRNTlevel >= 1
+    fprintf(stdout, "\tPRNTlevel = %d\n", PRNTlevel);
+#endif /* PRNTlevel */
+#if DEBUGlevel >= 1
+    fprintf(stdout, "\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif /* DEBUGlevel */
+#if PROFlevel >= 1
+    fprintf(stdout, "\tPROFlevel = %d\n", PROFlevel);
+#endif /* PROFlevel */
+#if StaticPivot >= 1
+    fprintf(stdout, "\tStaticPivot = %d\n", StaticPivot);
+#endif /* StaticPivot */
+    fputs("....\n", stdout);
+    return 0;   /* the return value is always 0 */
+}
+
+
diff --git a/EXAMPLE/pzdrive2_ABglobal.c b/EXAMPLE/pzdrive2_ABglobal.c
new file mode 100644
index 0000000..7079f2b
--- /dev/null
+++ b/EXAMPLE/pzdrive2_ABglobal.c
@@ -0,0 +1,304 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Driver program for pzgssvx_ABglobal example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program pzdrive2_ABglobal.
+ *
+ * This example illustrates how to use pzgssvx_ABglobal to solve
+ * systems repeatedly with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once.  The following data structures will be reused in the
+ * subsequent call to pzgssvx_ABglobal:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *
+ * On an IBM SP, the program may be run by typing:
+ *    poe pzdrive2_ABglobal -r <proc rows> -c <proc columns> <input_matrix> -procs <p>
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    gridinfo_t grid;
+    double   *berr;
+    doublecomplex   *a, *a1, *b, *b1, *xtrue;
+    int_t    *asub, *asub1, *xa, *xa1;
+    int_t    i, j, m, n, nnz;
+    int_t    nprow, npcol;
+    int      iam, info, ldb, ldx, nrhs;
+    char     trans[1];
+    char     **cpp, c;
+    FILE *fp = NULL, *fopen();  /* fp stays NULL until a matrix file is opened */
+    extern int cpp_defs();
+
+    /* prototypes */
+    extern void LUstructInit(const int_t, LUstruct_t *);
+    extern void LUstructFree(LUstruct_t *);
+    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse argv[]: every option except -h consumes the next word as its value. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    if ( c != 'h' && !*++cpp ) ABORT("Option requires a value");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default " IFMT ")\n", nprow);
+		  printf("\t-c <int>: process columns (default " IFMT ")\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* First non-option arg is the filename; parsing stops here */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;
+	}
+    }
+    if ( !fp ) ABORT("No input matrix file given");
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol ) goto out;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       Process 0 reads the matrix A, and then broadcasts it to all
+       the other processes.
+       ------------------------------------------------------------*/
+    if ( !iam ) {
+	/* Print the CPP definitions. */
+	cpp_defs();
+	
+	/* Read the matrix stored on disk in Harwell-Boeing format. */
+	zreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+	
+	printf("Input matrix file: %s\n", *cpp);
+	printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz);
+	printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+
+	/* Broadcast matrix A to the other PEs. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    } else {
+	/* Receive matrix A from PE 0: the sizes first, then the arrays. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+
+	/* Allocate storage for compressed column representation. */
+	zallocateA_dist(n, nnz, &a, &asub, &xa);
+
+	MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    }
+	
+    /* Create compressed column matrix for A. */
+    zCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+				SLU_NC, SLU_Z, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if (!(b=doublecomplexMalloc_dist(m * nrhs))) ABORT("Malloc fails for b[]");
+    if (!(xtrue=doublecomplexMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+    ldx = n;
+    ldb = m;
+    zGenXtrue_dist(n, nrhs, xtrue, ldx);
+    zFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+
+    /* Save a copy of the right-hand side. */
+    if ( !(b1 = doublecomplexMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]");
+    for (j = 0; j < nrhs; ++j)
+	for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb];
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+
+    /* Save a copy of the matrix A. */
+    zallocateA_dist(n, nnz, &a1, &asub1, &xa1);
+    for (i = 0; i < nnz; ++i) { a1[i] = a[i]; asub1[i] = asub[i]; }
+    for (i = 0; i < n+1; ++i) xa1[i] = xa[i];
+
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver: factorize and solve. */
+    pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+	zinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
+    }
+
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A.     */
+    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with
+					the L and U matrices.               */
+    SUPERLU_FREE(b);                 /* Free storage of right-hand side.    */
+
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER LINEAR SYSTEM.
+       ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME.
+       ------------------------------------------------------------*/
+    options.Fact = SamePattern;
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    /* Create compressed column matrix for A from the saved copy a1/asub1/xa1. */
+    zCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1,
+				SLU_NC, SLU_Z, SLU_GE);
+
+    /* Solve the linear system. */
+    pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+	printf("Solve the system with the same sparsity pattern.\n");
+	zinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid);
+    }
+
+    /* Print the statistics. */
+    PStatPrint(&options, &stat, &grid);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A.     */
+    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with
+					the L and U matrices.               */
+    ScalePermstructFree(&ScalePermstruct);
+    LUstructFree(&LUstruct);         /* Deallocate the structure of L and U.*/
+    SUPERLU_FREE(b1);	             /* Free storage of right-hand side.    */
+    SUPERLU_FREE(xtrue);             /* Free storage of the exact solution. */
+    SUPERLU_FREE(berr);
+
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+    return 0;
+}
+
+
+int cpp_defs()  /* Print to stdout each CPP macro below whose value is >= 1. */
+{
+    fputs(".. CPP definitions:\n", stdout);
+#if PRNTlevel >= 1
+    fprintf(stdout, "\tPRNTlevel = %d\n", PRNTlevel);
+#endif /* PRNTlevel */
+#if DEBUGlevel >= 1
+    fprintf(stdout, "\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif /* DEBUGlevel */
+#if PROFlevel >= 1
+    fprintf(stdout, "\tPROFlevel = %d\n", PROFlevel);
+#endif /* PROFlevel */
+#if StaticPivot >= 1
+    fprintf(stdout, "\tStaticPivot = %d\n", StaticPivot);
+#endif /* StaticPivot */
+    fputs("....\n", stdout);
+    return 0;   /* the return value is always 0 */
+}
+
+
diff --git a/EXAMPLE/pzdrive3.c b/EXAMPLE/pzdrive3.c
new file mode 100644
index 0000000..f251587
--- /dev/null
+++ b/EXAMPLE/pzdrive3.c
@@ -0,0 +1,276 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Driver program for PZGSSVX example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3.
+ *
+ * This example illustrates how to use PZGSSVX to solve
+ * systems repeatedly with the same sparsity pattern and similar
+ * numerical values of matrix A.
+ * In this case, the column permutation vector and symbolic factorization are
+ * computed only once. The following data structures will be reused in the
+ * subsequent call to PZGSSVX:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PZGSSVX, and reused in the subsequent call.
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n <np> pzdrive3 -r <proc rows> -c <proc columns> big.rua
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    NRformat_loc *Astore;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    double   *berr;
+    doublecomplex   *b, *b1, *xtrue, *nzval, *nzval1;
+    int_t    *colind, *colind1, *rowptr, *rowptr1;
+    int_t    i, j, m, n, nnz_loc, m_loc, fst_row;
+    int      nprow, npcol;
+    int      iam, info, ldb, ldx, nrhs;
+    char     **cpp, c;
+    FILE     *fp = NULL; /* Stays NULL until a file name argument is opened. */
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp; /* Now at the option's value, or at argv's terminating NULL. */
+	    if ( !*cpp && c != 'h' ) ABORT("Option is missing its argument");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default %d)\n", nprow);
+		  printf("\t-c <int>: process columns (default %d)\n", npcol);
+		  MPI_Finalize(); /* Shut MPI down before leaving. */
+		  exit(0);
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;
+	}
+    }
+    if ( !fp ) ABORT("No input matrix file was given");
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID. 
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )	goto out;
+    if ( !iam ) {
+	printf("Input matrix file: %s\n", *cpp);
+        printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+    }
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+       ------------------------------------------------------------*/
+    zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+
+    if ( !(b1 = doublecomplexMalloc_dist(ldb * nrhs)) )
+        ABORT("Malloc fails for b1[]");
+    for (j = 0; j < nrhs; ++j)
+        for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb];
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+    m = A.nrow;
+    n = A.ncol;
+
+    /* Save a copy of the matrix A. */
+    Astore = (NRformat_loc *) A.Store;
+    nnz_loc = Astore->nnz_loc;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    nzval = Astore->nzval;
+    colind = Astore->colind;
+    rowptr = Astore->rowptr;
+    nzval1 = doublecomplexMalloc_dist(nnz_loc);
+    colind1 = intMalloc_dist(nnz_loc);
+    rowptr1 = intMalloc_dist(m_loc+1);
+    if ( !nzval1 || !colind1 || !rowptr1 ) ABORT("Malloc fails for the copy of A");
+    for (i = 0; i < nnz_loc; ++i) {
+        nzval1[i] = nzval[i];
+        colind1[i] = colind[i];
+    }
+    for (i = 0; i < m_loc+1; ++i) rowptr1[i] = rowptr[i];
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver: factorize and solve. */
+    pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid);
+    
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A.  */
+    SUPERLU_FREE(b);                 /* Free storage of right-hand side.    */
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER LINEAR SYSTEM.
+       THE MATRIX A HAS THE SAME SPARSITY PATTERN AND THE SIMILAR
+       NUMERICAL VALUES AS THAT IN A PREVIOUS SYSTEM.
+       ------------------------------------------------------------*/
+    options.Fact = SamePattern_SameRowPerm;
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    /* Set up the local A in NR_loc format */
+    zCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row,
+				   nzval1, colind1, rowptr1,
+				   SLU_NR_loc, SLU_Z, SLU_GE);
+
+    /* Solve the linear system. */
+    pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam )
+        printf("Solve a system with the same pattern and similar values.\n");
+    pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, &grid);
+
+    /* Print the statistics. */
+    PStatPrint(&options, &stat, &grid);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A.  */
+    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with    
+					the L and U matrices.               */
+    ScalePermstructFree(&ScalePermstruct);
+    LUstructFree(&LUstruct);         /* Deallocate the structure of L and U.*/
+    if ( options.SolveInitialized ) {
+        zSolveFinalize(&options, &SOLVEstruct);
+    }
+    SUPERLU_FREE(b1);	             /* Free storage of right-hand side.    */
+    SUPERLU_FREE(xtrue);             /* Free storage of the exact solution. */
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+}
+
+/*! \brief Print each of PRNTlevel, DEBUGlevel, PROFlevel, StaticPivot that is >= 1 at compile time; returns 0. */
+int cpp_defs(void)
+{
+    printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+    printf("....\n");
+    return 0;
+}
+
+
diff --git a/EXAMPLE/pzdrive3_ABglobal.c b/EXAMPLE/pzdrive3_ABglobal.c
new file mode 100644
index 0000000..9d87af1
--- /dev/null
+++ b/EXAMPLE/pzdrive3_ABglobal.c
@@ -0,0 +1,309 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Driver program for pzgssvx_ABglobal example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program pzdrive3_ABglobal.
+ *
+ * This example illustrates how to use pzgssvx_ABglobal to solve
+ * systems repeatedly with the same sparsity pattern and similar
+ * numerical values of matrix A.
+ * In this case, the column permutation vector and symbolic factorization are
+ * computed only once. The following data structures will be reused in the
+ * subsequent call to pzgssvx_ABglobal:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to pzgssvx_ABglobal, and reused in the subsequent call.
+ *
+ * On an IBM SP, the program may be run by typing:
+ *    poe pzdrive3_ABglobal -r <proc rows> -c <proc columns> <input_matrix>  -procs <p>
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    gridinfo_t grid;
+    double   *berr;
+    doublecomplex   *a, *a1, *b, *b1, *xtrue;
+    int_t    *asub, *asub1, *xa, *xa1;
+    int_t    i, j, m, n, nnz;
+    int_t    nprow, npcol;
+    int      iam, info, ldb, ldx, nrhs;
+    char     trans[1];
+    char     **cpp, c;
+    FILE     *fp = NULL; /* Stays NULL until a file name argument is opened. */
+    extern int cpp_defs();
+
+    /* prototypes */
+    extern void LUstructInit(const int_t, LUstruct_t *);
+    extern void LUstructFree(LUstruct_t *);
+    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp; /* Now at the option's value, or at argv's terminating NULL. */
+	    if ( !*cpp && c != 'h' ) ABORT("Option is missing its argument");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default %d)\n", (int) nprow);
+		  printf("\t-c <int>: process columns (default %d)\n", (int) npcol);
+		  MPI_Finalize(); /* Shut MPI down before leaving. */
+		  exit(0);
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;
+	}
+    }
+    if ( !fp ) ABORT("No input matrix file was given");
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID. 
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )
+	goto out;
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
+       THE OTHER PROCESSES.
+       ------------------------------------------------------------*/
+    if ( !iam ) {
+	/* Print the CPP definitions. */
+	cpp_defs();
+	
+	/* Read the matrix stored on disk in Harwell-Boeing format. */
+	zreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+	
+	printf("Input matrix file: %s\n", *cpp);
+	printf("\tDimension\t%lldx%lld\t # nonzeros %lld\n", /* int_t may be 64-bit */
+	       (long long) m, (long long) n, (long long) nnz);
+	printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+
+	/* Broadcast matrix A to the other PEs. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    } else {
+	/* Receive matrix A from PE 0. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+
+	/* Allocate storage for compressed column representation. */
+	zallocateA_dist(n, nnz, &a, &asub, &xa);
+
+	MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    }
+	
+    /* Create compressed column matrix for A. */
+    zCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+				SLU_NC, SLU_Z, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if (!(b=doublecomplexMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
+    if (!(xtrue=doublecomplexMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+    ldx = n;
+    ldb = m;
+    zGenXtrue_dist(n, nrhs, xtrue, ldx);
+    zFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+
+    /* Save a copy of the right-hand side. */  
+    if ( !(b1 = doublecomplexMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]");
+    for (j = 0; j < nrhs; ++j)
+	for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb];
+    
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+
+    /* Save a copy of the matrix A. */
+    zallocateA_dist(n, nnz, &a1, &asub1, &xa1);
+    for (i = 0; i < nnz; ++i) { a1[i] = a[i]; asub1[i] = asub[i]; }
+    for (i = 0; i < n+1; ++i) xa1[i] = xa[i];
+
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver: factorize and solve. */
+    pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+	zinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
+    }
+    
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A.     */
+    SUPERLU_FREE(b);                 /* Free storage of right-hand side.    */
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER LINEAR SYSTEM.
+       THE MATRIX A HAS THE SAME SPARSITY PATTERN AND THE SIMILAR
+       NUMERICAL VALUES AS THAT IN A PREVIOUS SYSTEM.
+       ------------------------------------------------------------*/
+    options.Fact = SamePattern_SameRowPerm;
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    /* Create compressed column matrix for A. */
+    zCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1,
+				SLU_NC, SLU_Z, SLU_GE);
+
+    /* Solve the linear system. */
+    pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+	printf("Solve a system with the same pattern and similar values.\n");
+	zinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid);
+    }
+
+    /* Print the statistics. */
+    PStatPrint(&options, &stat, &grid);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A.     */
+    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with    
+					the L and U matrices.               */
+    ScalePermstructFree(&ScalePermstruct);
+    LUstructFree(&LUstruct);         /* Deallocate the structure of L and U.*/
+    SUPERLU_FREE(b1);	             /* Free storage of right-hand side.    */
+    SUPERLU_FREE(xtrue);             /* Free storage of the exact solution. */
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+/*! \brief Print each of PRNTlevel, DEBUGlevel, PROFlevel, StaticPivot that is >= 1 at compile time; returns 0. */
+int cpp_defs(void)
+{
+    printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+    printf("....\n");
+    return 0;
+}
diff --git a/EXAMPLE/pzdrive4.c b/EXAMPLE/pzdrive4.c
new file mode 100644
index 0000000..8a1caad
--- /dev/null
+++ b/EXAMPLE/pzdrive4.c
@@ -0,0 +1,287 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief This example illustrates how to divide up the processes into subgroups
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE4.
+ *
+ * This example illustrates how to divide up the processes into
+ * subgroups (multiple grids) such that each subgroup solves a linear
+ * system independently from the other.
+ *
+ * In this example, there are 2 subgroups:
+ *  1. subgroup 1 consists of processes 0 to 5 arranged as
+ *     a 2-by-3 process grid.
+ *  2. subgroup 2 consists of processes 6 to 9 arranged as
+ *     a 2-by-2 process grid.
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n 10 pzdrive4 big.rua
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid1, grid2;
+    double   *berr;
+    doublecomplex   *a, *b, *xtrue;
+    int_t    *asub, *xa;
+    int_t    i, j, m, n;
+    int      nprow = 2, npcol = 3, ldumap, p; /* 2 x 3 = grid 1; read by -h. */
+    int_t    usermap[6];
+    int      iam, info, ldb, ldx, nprocs;
+    int      nrhs = 1;   /* Number of right-hand side. */
+    char     **cpp, c;
+    FILE     *fp = NULL; /* Stays NULL until a file name argument is opened. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+    MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
+    if ( nprocs < 10 ) {
+	fprintf(stderr, "Requires at least 10 processes\n");
+	MPI_Finalize(); /* Every rank takes this branch; finalize before exit. */
+	exit(-1);
+    }
+
+    /* Parse command line argv[]. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    ++cpp; /* Now at the option's value, or at argv's terminating NULL. */
+	    if ( !*cpp && c != 'h' ) ABORT("Option is missing its argument");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default %d)\n", nprow);
+		  printf("\t-c <int>: process columns (default %d)\n", npcol);
+		  MPI_Finalize(); /* Shut MPI down before leaving. */
+		  exit(0);
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;
+	}
+    }
+    if ( !fp ) ABORT("No input matrix file was given");
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID 1. 
+       ------------------------------------------------------------*/
+    nprow = 2;
+    npcol = 3;
+    ldumap = 2;
+    p = 0;    /* Grid 1 starts from process 0. */
+    for (i = 0; i < nprow; ++i)
+	for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
+    superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid1);
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID 2. 
+       ------------------------------------------------------------*/
+    nprow = 2;
+    npcol = 2;
+    ldumap = 2;
+    p = 6;   /* Grid 2 starts from process 6. */
+    for (i = 0; i < nprow; ++i)
+	for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
+    superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid2);
+
+    /* Bail out if I do not belong in any of the 2 grids. */
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    if ( iam >= 10 ) goto out;
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    if ( iam >= 0 && iam < 6 ) { /* I am in grid 1. */
+	iam = grid1.iam;  /* Get the logical number in the new grid. */
+
+        /* ------------------------------------------------------------
+           GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+           ------------------------------------------------------------*/
+        zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid1);
+	
+	if ( !(berr = doubleMalloc_dist(nrhs)) )
+	    ABORT("Malloc fails for berr[].");
+
+	/* ------------------------------------------------------------
+	   NOW WE SOLVE THE LINEAR SYSTEM.
+	   ------------------------------------------------------------*/
+	
+        /* Set the default input options:
+            options.Fact = DOFACT;
+            options.Equil = YES;
+            options.ColPerm = METIS_AT_PLUS_A;
+            options.RowPerm = LargeDiag;
+            options.ReplaceTinyPivot = YES;
+            options.Trans = NOTRANS;
+            options.IterRefine = DOUBLE;
+            options.SolveInitialized = NO;
+            options.RefineInitialized = NO;
+            options.PrintStat = YES;
+         */
+	set_default_options_dist(&options);
+
+        if (!iam) {
+	    print_sp_ienv_dist(&options);
+    	    print_options_dist(&options);
+        }
+
+        m = A.nrow;
+        n = A.ncol;
+
+	/* Initialize ScalePermstruct and LUstruct. */
+	ScalePermstructInit(m, n, &ScalePermstruct);
+	LUstructInit(n, &LUstruct);
+
+	/* Initialize the statistics variables. */
+	PStatInit(&stat);
+	
+	/* Call the linear equation solver. */
+	pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1,
+                &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+        /* Check the accuracy of the solution. */
+        pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+                         nrhs, b, ldb, xtrue, ldx, &grid1);
+    
+	/* Print the statistics. */
+	PStatPrint(&options, &stat, &grid1);
+
+	/* ------------------------------------------------------------
+	   DEALLOCATE STORAGE.
+	   ------------------------------------------------------------*/
+	PStatFree(&stat);
+        Destroy_CompRowLoc_Matrix_dist(&A);
+        ScalePermstructFree(&ScalePermstruct);
+	Destroy_LU(n, &grid1, &LUstruct);
+	LUstructFree(&LUstruct);
+        if ( options.SolveInitialized ) {
+            zSolveFinalize(&options, &SOLVEstruct);
+        }
+	SUPERLU_FREE(b);
+	SUPERLU_FREE(xtrue);
+	SUPERLU_FREE(berr);
+
+    } else { /* I am in grid 2. */
+	iam = grid2.iam;  /* Get the logical number in the new grid. */
+
+        /* ------------------------------------------------------------
+           GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+           ------------------------------------------------------------*/
+        zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid2);
+
+	if ( !(berr = doubleMalloc_dist(nrhs)) )
+	    ABORT("Malloc fails for berr[].");
+
+	/* ------------------------------------------------------------
+	   NOW WE SOLVE THE LINEAR SYSTEM.
+	   ------------------------------------------------------------*/
+	
+        /* Set the default input options:
+            options.Fact = DOFACT;
+            options.Equil = YES;
+            options.ColPerm = MMD_AT_PLUS_A;
+            options.RowPerm = LargeDiag;
+            options.ReplaceTinyPivot = YES;
+            options.Trans = NOTRANS;
+            options.IterRefine = DOUBLE;
+            options.SolveInitialized = NO;
+            options.RefineInitialized = NO;
+            options.PrintStat = YES;
+         */
+	set_default_options_dist(&options);
+	
+        m = A.nrow;
+        n = A.ncol;
+
+	/* Initialize ScalePermstruct and LUstruct. */
+	ScalePermstructInit(m, n, &ScalePermstruct);
+	LUstructInit(n, &LUstruct);
+
+	/* Initialize the statistics variables. */
+	PStatInit(&stat);
+	
+	/* Call the linear equation solver. */
+	pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2,
+                &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+        /* Check the accuracy of the solution. */
+        pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+                         nrhs, b, ldb, xtrue, ldx, &grid2);
+    
+	/* Print the statistics. */
+	PStatPrint(&options, &stat, &grid2);
+
+	/* ------------------------------------------------------------
+	   DEALLOCATE STORAGE.
+	   ------------------------------------------------------------*/
+	PStatFree(&stat);
+        Destroy_CompRowLoc_Matrix_dist(&A);
+        ScalePermstructFree(&ScalePermstruct);
+	Destroy_LU(n, &grid2, &LUstruct);
+	LUstructFree(&LUstruct);
+        if ( options.SolveInitialized ) {
+            zSolveFinalize(&options, &SOLVEstruct);
+        }
+	SUPERLU_FREE(b);
+	SUPERLU_FREE(xtrue);
+	SUPERLU_FREE(berr);
+    }
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRIDS.
+       ------------------------------------------------------------*/
+    superlu_gridexit(&grid1);
+    superlu_gridexit(&grid2);
+
+out:
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+}
diff --git a/EXAMPLE/pzdrive4_ABglobal.c b/EXAMPLE/pzdrive4_ABglobal.c
new file mode 100644
index 0000000..94147f7
--- /dev/null
+++ b/EXAMPLE/pzdrive4_ABglobal.c
@@ -0,0 +1,363 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief This example illustrates how to divide up the processes into subgroups
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program pzdrive4_ABglobal.
+ *
+ * This example illustrates how to divide up the processes into
+ * subgroups (multiple grids) such that each subgroup solves a linear
+ * system independently from the other.
+ *
+ * In this example, there are 2 subgroups:
+ *  1. subgroup 1 consists of processes 0 to 5 arranged as
+ *     a 2-by-3 process grid.
+ *  2. subgroup 2 consists of processes 6 to 9 arranged as
+ *     a 2-by-2 process grid.
+ *
+ * On an IBM SP, the program may be run by typing
+ *    poe pzdrive4_ABglobal <input_file> -procs 10
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    gridinfo_t grid1, grid2;
+    double   *berr;
+    doublecomplex   *a, *b, *xtrue;
+    int_t    *asub, *xa;
+    int_t    i, j, m, n, nnz;
+    int_t    nprow = 2, npcol = 3, ldumap, p; /* Grid-1 shape; printed by -h. */
+    int_t    usermap[6];
+    int      iam, info, ldb, ldx, nprocs;
+    int      nrhs = 1;   /* Number of right-hand side. */
+    char     trans[1];
+    char     **cpp, c;
+    FILE *fp = NULL, *fopen();  /* fp stays NULL until a file is opened. */
+
+    /* prototypes */
+    extern void LUstructInit(const int_t, LUstruct_t *);
+    extern void LUstructFree(LUstruct_t *);
+    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+    MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
+    if ( nprocs < 10 ) { /* Same test on every process: all of them leave here. */
+	fprintf(stderr, "Requires at least 10 processes\n");
+	MPI_Finalize(); exit(-1);
+    }
+
+    /* Parse command line argv[].  -r/-c are read, but the grid shapes are set below. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    if ( c != 'h' && !*++cpp ) ABORT("Missing value after option");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default " IFMT ")\n", nprow);
+		  printf("\t-c <int>: process columns (default " IFMT ")\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;
+	}
+    }
+    if ( !fp ) ABORT("No input matrix file given");
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID 1. 
+       ------------------------------------------------------------*/
+    nprow = 2;
+    npcol = 3;
+    ldumap = 2;
+    p = 0;    /* Grid 1 starts from process 0. */
+    for (i = 0; i < nprow; ++i)
+	for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
+    superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid1);
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID 2. 
+       ------------------------------------------------------------*/
+    nprow = 2;
+    npcol = 2;
+    ldumap = 2;
+    p = 6;   /* Grid 2 starts from process 6. */
+    for (i = 0; i < nprow; ++i)
+	for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
+    superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid2);
+
+    /* Bail out if I do not belong in any of the 2 grids. */
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    if ( iam >= 10 ) goto out;
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    if ( iam >= 0 && iam < 6 ) { /* I am in grid 1. */
+	iam = grid1.iam;  /* Get the logical number in the new grid. */
+
+	/* ------------------------------------------------------------
+	   PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
+	   THE OTHER PROCESSES.
+	   ------------------------------------------------------------*/
+	if ( !iam ) {
+	    /* Read the matrix stored on disk in Harwell-Boeing format. */
+	    zreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+	
+	    printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz);
+	    printf("\tProcess grid\t%d X %d\n", (int) grid1.nprow, (int) grid1.npcol);
+
+	    /* Broadcast matrix A to the other PEs. */
+	    MPI_Bcast( &m,   1,   mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( &n,   1,   mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid1.comm );
+	    MPI_Bcast( asub, nnz, mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid1.comm );
+	} else {
+	    /* Receive matrix A from PE 0. */
+	    MPI_Bcast( &m,   1,   mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( &n,   1,   mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid1.comm );
+
+	    /* Allocate storage for compressed column representation. */
+	    zallocateA_dist(n, nnz, &a, &asub, &xa);
+	    
+	    MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid1.comm );
+	    MPI_Bcast( asub, nnz, mpi_int_t,  0, grid1.comm );
+	    MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid1.comm );
+	}
+	
+	/* Create compressed column matrix for A. */
+	zCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+				    SLU_NC, SLU_Z, SLU_GE);
+
+	/* Generate the exact solution and compute the right-hand side. */
+	if (!(b=doublecomplexMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
+	if (!(xtrue=doublecomplexMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
+	*trans = 'N';
+	ldx = n;
+	ldb = m;
+	zGenXtrue_dist(n, nrhs, xtrue, ldx);
+	zFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+
+	if ( !(berr = doubleMalloc_dist(nrhs)) )
+	    ABORT("Malloc fails for berr[].");
+
+	/* ------------------------------------------------------------
+	   NOW WE SOLVE THE LINEAR SYSTEM.
+	   ------------------------------------------------------------*/
+	
+        /* Set the default input options:
+            options.Fact = DOFACT;
+            options.Equil = YES;
+            options.ColPerm = METIS_AT_PLUS_A;
+            options.RowPerm = LargeDiag;
+            options.ReplaceTinyPivot = YES;
+            options.Trans = NOTRANS;
+            options.IterRefine = DOUBLE;
+            options.SolveInitialized = NO;
+            options.RefineInitialized = NO;
+            options.PrintStat = YES;
+         */
+	set_default_options_dist(&options);
+
+        if (!iam) {
+	    print_sp_ienv_dist(&options);
+	    print_options_dist(&options);
+        }
+
+	/* Initialize ScalePermstruct and LUstruct. */
+	ScalePermstructInit(m, n, &ScalePermstruct);
+	LUstructInit(n, &LUstruct);
+
+	/* Initialize the statistics variables. */
+	PStatInit(&stat);
+	
+	/* Call the linear equation solver: factorize and solve. */
+	pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1,
+			 &LUstruct, berr, &stat, &info);
+
+	/* Check the accuracy of the solution. */
+	if ( !iam ) {
+	    zinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid1);
+	}
+    
+    
+	/* Print the statistics. */
+	PStatPrint(&options, &stat, &grid1);
+
+	/* ------------------------------------------------------------
+	   DEALLOCATE STORAGE.
+	   ------------------------------------------------------------*/
+	PStatFree(&stat);
+	Destroy_CompCol_Matrix_dist(&A); 
+	Destroy_LU(n, &grid1, &LUstruct);
+	ScalePermstructFree(&ScalePermstruct);
+	LUstructFree(&LUstruct);
+	SUPERLU_FREE(b);
+	SUPERLU_FREE(xtrue);
+	SUPERLU_FREE(berr);
+
+    } else { /* I am in grid 2. */
+	iam = grid2.iam;  /* Get the logical number in the new grid. */
+
+	/* ------------------------------------------------------------
+	   PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
+	   THE OTHER PROCESSES.
+	   ------------------------------------------------------------*/
+	if ( !iam ) {
+	    /* Read the matrix stored on disk in Harwell-Boeing format. */
+	    zreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+	
+	    printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz);
+	    printf("\tProcess grid\t%d X %d\n", (int) grid2.nprow, (int) grid2.npcol);
+
+	    /* Broadcast matrix A to the other PEs. */
+	    MPI_Bcast( &m,   1,   mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( &n,   1,   mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid2.comm );
+	    MPI_Bcast( asub, nnz, mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid2.comm );
+	} else {
+	    /* Receive matrix A from PE 0. */
+	    MPI_Bcast( &m,   1,   mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( &n,   1,   mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid2.comm );
+
+	    /* Allocate storage for compressed column representation. */
+	    zallocateA_dist(n, nnz, &a, &asub, &xa);
+	    
+	    MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid2.comm );
+	    MPI_Bcast( asub, nnz, mpi_int_t,  0, grid2.comm );
+	    MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid2.comm );
+	}
+	
+	/* Create compressed column matrix for A. */
+	zCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+				    SLU_NC, SLU_Z, SLU_GE);
+
+	/* Generate the exact solution and compute the right-hand side. */
+	if (!(b=doublecomplexMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
+	if (!(xtrue=doublecomplexMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
+	*trans = 'N';
+	ldx = n;
+	ldb = m;
+	zGenXtrue_dist(n, nrhs, xtrue, ldx);
+	zFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+
+	if ( !(berr = doubleMalloc_dist(nrhs)) )
+	    ABORT("Malloc fails for berr[].");
+
+	/* ------------------------------------------------------------
+	   NOW WE SOLVE THE LINEAR SYSTEM.
+	   ------------------------------------------------------------*/
+	
+        /* Set the default input options:
+            options.Fact = DOFACT;
+            options.Equil = YES;
+            options.ColPerm = MMD_AT_PLUS_A;
+            options.RowPerm = LargeDiag;
+            options.ReplaceTinyPivot = YES;
+            options.Trans = NOTRANS;
+            options.IterRefine = DOUBLE;
+            options.SolveInitialized = NO;
+            options.RefineInitialized = NO;
+            options.PrintStat = YES;
+         */
+	set_default_options_dist(&options);
+	
+	/* Initialize ScalePermstruct and LUstruct. */
+	ScalePermstructInit(m, n, &ScalePermstruct);
+	LUstructInit(n, &LUstruct);
+
+	/* Initialize the statistics variables. */
+	PStatInit(&stat);
+	
+	/* Call the linear equation solver: factorize and solve. */
+	pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2,
+			 &LUstruct, berr, &stat, &info);
+
+	/* Check the accuracy of the solution. */
+	if ( !iam ) {
+	    zinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid2);
+	}
+    
+    
+	/* Print the statistics. */
+	PStatPrint(&options, &stat, &grid2);
+
+	/* ------------------------------------------------------------
+	   DEALLOCATE STORAGE.
+	   ------------------------------------------------------------*/
+	PStatFree(&stat);
+	Destroy_CompCol_Matrix_dist(&A); 
+	Destroy_LU(n, &grid2, &LUstruct);
+	ScalePermstructFree(&ScalePermstruct);
+	LUstructFree(&LUstruct);
+	SUPERLU_FREE(b);
+	SUPERLU_FREE(xtrue);
+	SUPERLU_FREE(berr);
+    }
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRIDS.
+       ------------------------------------------------------------*/
+    superlu_gridexit(&grid1);
+    superlu_gridexit(&grid2);
+
+out:
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
diff --git a/EXAMPLE/pzdrive_ABglobal.c b/EXAMPLE/pzdrive_ABglobal.c
new file mode 100644
index 0000000..06ea87b
--- /dev/null
+++ b/EXAMPLE/pzdrive_ABglobal.c
@@ -0,0 +1,260 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Driver program for pzgssvx_ABglobal example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program pzdrive_ABglobal.
+ *
+ * This example illustrates how to use pzgssvx_ABglobal with the full
+ * (default) options to solve a linear system.
+ * 
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call pzgssvx_ABglobal
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * On an IBM SP, the program may be run by typing
+ *    poe pzdrive_ABglobal -r <proc rows> -c <proc columns> <input_file> -procs <p>
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    gridinfo_t grid;
+    double   *berr;
+    doublecomplex   *a, *b, *xtrue;
+    int_t    *asub, *xa;
+    int_t    m, n, nnz;
+    int_t    nprow, npcol;
+    int      iam, info, ldb, ldx, nrhs;
+    char     trans[1];
+    char     **cpp, c;
+    FILE *fp = NULL, *fopen();  /* fp stays NULL until a file is opened. */
+    extern int cpp_defs();
+
+    /* prototypes */
+    extern void LUstructInit(const int_t, LUstruct_t *);
+    extern void LUstructFree(LUstruct_t *);
+    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+	if ( **cpp == '-' ) {
+	    c = *(*cpp+1);
+	    if ( c != 'h' && !*++cpp ) ABORT("Missing value after option");
+	    switch (c) {
+	      case 'h':
+		  printf("Options:\n");
+		  printf("\t-r <int>: process rows    (default " IFMT ")\n", nprow);
+		  printf("\t-c <int>: process columns (default " IFMT ")\n", npcol);
+		  exit(0);
+		  break;
+	      case 'r': nprow = atoi(*cpp);
+		        break;
+	      case 'c': npcol = atoi(*cpp);
+		        break;
+	    }
+	} else { /* Last arg is considered a filename */
+	    if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+	    break;
+	}
+    }
+    if ( !fp ) ABORT("No input matrix file given"); /* Also keeps *cpp non-NULL below. */
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID. 
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )
+	goto out;
+
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+    
+    /* ------------------------------------------------------------
+       PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
+       THE OTHER PROCESSES.
+       ------------------------------------------------------------*/
+    if ( !iam ) {
+	/* Print the CPP definitions. */
+	cpp_defs();
+	
+	/* Read the matrix stored on disk in Harwell-Boeing format. */
+	zreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+	
+	printf("Input matrix file: %s\n", *cpp);
+	printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz);
+	printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+
+	/* Broadcast matrix A to the other PEs. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    } else {
+	/* Receive matrix A from PE 0. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
+
+	/* Allocate storage for compressed column representation. */
+	zallocateA_dist(n, nnz, &a, &asub, &xa);
+
+	MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm );
+	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
+	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
+    }
+	
+    /* Create compressed column matrix for A. */
+    zCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+				SLU_NC, SLU_Z, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if (!(b=doublecomplexMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
+    if (!(xtrue=doublecomplexMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+    ldx = n;
+    ldb = m;
+    zGenXtrue_dist(n, nrhs, xtrue, ldx);
+    zFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE THE LINEAR SYSTEM.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+        options.Fact = DOFACT;
+        options.Equil = YES;
+        options.ColPerm = METIS_AT_PLUS_A;
+        options.RowPerm = LargeDiag;
+        options.ReplaceTinyPivot = YES;
+        options.Trans = NOTRANS;
+        options.IterRefine = DOUBLE;
+        options.SolveInitialized = NO;
+        options.RefineInitialized = NO;
+        options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit(m, n, &ScalePermstruct);
+    LUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver. */
+    pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+		     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+	zinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
+    }
+    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A);
+    Destroy_LU(n, &grid, &LUstruct);
+    ScalePermstructFree(&ScalePermstruct);
+    LUstructFree(&LUstruct);
+    SUPERLU_FREE(b);
+    SUPERLU_FREE(xtrue);
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+/* Prints each compile-time level macro whose value is 1 or more; always returns 0. */
+int cpp_defs()
+{
+    fputs(".. CPP definitions:\n", stdout);
+#if PRNTlevel >= 1
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif /* PRNTlevel */
+#if DEBUGlevel >= 1
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif /* DEBUGlevel */
+#if PROFlevel >= 1
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif /* PROFlevel */
+#if StaticPivot >= 1
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif /* StaticPivot */
+    fputs("....\n", stdout);
+    return 0;
+}
diff --git a/EXAMPLE/pzgsmv.c b/EXAMPLE/pzgsmv.c
new file mode 100644
index 0000000..0beda04
--- /dev/null
+++ b/EXAMPLE/pzgsmv.c
@@ -0,0 +1,374 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief 
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+/* Sets up the index exchange and buffers that pzgsmv() later reads from gsmv_comm. */
+void pzgsmv_init
+(
+ SuperMatrix *A,       /* Matrix A permuted by columns (input/output).
+			  Its colind[] and nzval[] are rewritten in place. Type:
+			  Stype = NR_loc; Dtype = SLU_Z; Mtype = GE. */
+ int_t *row_to_proc,   /* Input. Mapping between rows and processes. */
+ gridinfo_t *grid,     /* Input */
+ pzgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */
+ )
+{
+    NRformat_loc *Astore;
+    int iam, p, procs;
+    int *SendCounts, *RecvCounts;
+    int_t i, j, k, l, m, m_loc, n, fst_row, jcol;
+    int_t TotalIndSend, TotalValSend;
+    int_t *colind, *rowptr;
+    int_t *ind_tosend = NULL, *ind_torecv = NULL;
+    int_t *ptr_ind_tosend, *ptr_ind_torecv;
+    int_t *extern_start, *spa, *itemp;
+    doublecomplex *nzval, *val_tosend = NULL, *val_torecv = NULL, t;
+    MPI_Request *send_req, *recv_req;
+    MPI_Status status;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pzgsmv_init()");
+#endif
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    iam = grid->iam;
+    procs = grid->nprow * grid->npcol;
+    Astore = (NRformat_loc *) A->Store;
+    m = A->nrow;
+    n = A->ncol;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    colind = Astore->colind;
+    rowptr = Astore->rowptr;
+    nzval = Astore->nzval;
+    if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) )
+        ABORT("Malloc fails for SendCounts[]");
+    /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/
+    RecvCounts = SendCounts + procs; /* Second half of the SendCounts allocation. */
+    if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) )
+        ABORT("Malloc fails for ptr_ind_tosend[]");
+    ptr_ind_torecv = ptr_ind_tosend + procs + 1; /* Second half of the same allocation. */
+    if ( !(extern_start = intMalloc_dist(m_loc)) )
+        ABORT("Malloc fails for extern_start[]");
+    for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i];
+
+    /* ------------------------------------------------------------
+       COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS.
+       THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS.
+       SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE
+       LOCAL PART OF X.
+       THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A.
+       ------------------------------------------------------------*/
+    if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */
+        ABORT("Malloc fails for spa[]");
+    for (p = 0; p < procs; ++p) SendCounts[p] = 0;
+    for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+        k = extern_start[i]; /* Next slot in row i for a locally owned column. */
+        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */
+	    jcol = colind[j];
+            p = row_to_proc[jcol];
+	    if ( p != iam ) { /* External */
+	        if ( spa[jcol] == 0 ) { /* First time see this index */
+		    ++SendCounts[p];
+		    spa[jcol] = 1;
+                }
+	    } else { /* Swap to beginning the part of A corresponding
+			to the local part of X */
+	        l = colind[k];
+		t = nzval[k];
+		colind[k] = jcol;
+		nzval[k] = nzval[j];
+		colind[j] = l;
+		nzval[j] = t;
+		++k;
+	    }
+	}
+	extern_start[i] = k; /* Row i: entries from here to rowptr[i+1] are external. */
+    }
+
+    /* ------------------------------------------------------------
+       LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES.
+       THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A.
+       ------------------------------------------------------------*/
+    /* Build pointers to ind_tosend[]. */
+    ptr_ind_tosend[0] = 0;
+    for (p = 0, TotalIndSend = 0; p < procs; ++p) {
+        TotalIndSend += SendCounts[p]; /* Total to send. */
+	ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p];
+    }
+#if 0
+    ptr_ind_tosend[iam] = 0; /* Local part of X */
+#endif
+    if ( TotalIndSend ) {
+        if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) )
+	    ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */
+    }
+
+    /* Build SPA to aid global to local translation. */
+    for (i = 0; i < n; ++i) spa[i] = EMPTY;
+    for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */
+        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+	    jcol = colind[j];
+	    if ( spa[jcol] == EMPTY ) { /* First time see this index */
+	        p = row_to_proc[jcol];
+		if ( p == iam ) { /* Local */
+		  /*assert(jcol>=fst_row);*/
+		  spa[jcol] = jcol - fst_row; /* Relative position in local X */
+		} else {          /* External */
+		  ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */
+		  spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */
+		  ++ptr_ind_tosend[p];
+		}
+	    }
+	}
+    }
+    
+    /* ------------------------------------------------------------
+       TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES.
+       THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A.
+       ------------------------------------------------------------*/
+    for (i = 0; i < m_loc; ++i) {
+        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+	    jcol = colind[j];
+	    colind[j] = spa[jcol]; /* Offset in local X, or slot in ind_tosend[]. */
+	}
+    }
+
+    /* ------------------------------------------------------------
+       COMMUNICATE THE EXTERNAL INDICES OF X.
+       ------------------------------------------------------------*/
+    MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT,
+		 grid->comm);
+
+    /* Build pointers to ind_torecv[]. */
+    ptr_ind_torecv[0] = 0;
+    for (p = 0, TotalValSend = 0; p < procs; ++p) {
+        TotalValSend += RecvCounts[p]; /* Total to receive. */
+	ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p];
+    }
+    if ( TotalValSend ) {
+        if ( !(ind_torecv = intMalloc_dist(TotalValSend)) )
+	    ABORT("Malloc fails for ind_torecv[]");
+    }
+
+    if ( !(send_req = (MPI_Request *)
+	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))))
+        ABORT("Malloc fails for recv_req[].");
+    recv_req = send_req + procs;
+    for (p = 0; p < procs; ++p) {
+        ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */
+        if ( SendCounts[p] ) {
+	    MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p],
+		      mpi_int_t, p, iam, grid->comm, &send_req[p]);
+	}
+	if ( RecvCounts[p] ) {
+	    MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p],
+		      mpi_int_t, p, p, grid->comm, &recv_req[p]);
+	}
+    }
+    for (p = 0; p < procs; ++p) {
+        if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status);
+	if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status);
+    }
+
+    /* Allocate storage for the X values to be transferred: one per index sent or received above. */
+    if ( TotalIndSend &&
+         !(val_torecv = doublecomplexMalloc_dist(TotalIndSend)) )
+        ABORT("Malloc fails for val_torecv[].");
+    if ( TotalValSend &&
+         !(val_tosend = doublecomplexMalloc_dist(TotalValSend)) )
+        ABORT("Malloc fails for val_tosend[].");
+
+    gsmv_comm->extern_start = extern_start;
+    gsmv_comm->ind_tosend = ind_tosend;
+    gsmv_comm->ind_torecv = ind_torecv;
+    gsmv_comm->ptr_ind_tosend = ptr_ind_tosend;
+    gsmv_comm->ptr_ind_torecv = ptr_ind_torecv;
+    gsmv_comm->SendCounts = SendCounts;
+    gsmv_comm->RecvCounts = RecvCounts;
+    gsmv_comm->val_tosend = val_tosend;
+    gsmv_comm->val_torecv = val_torecv;
+    gsmv_comm->TotalIndSend = TotalIndSend;
+    gsmv_comm->TotalValSend = TotalValSend;
+    
+    SUPERLU_FREE(spa);
+    SUPERLU_FREE(send_req);
+
+#if ( DEBUGlevel>=1 )
+    PrintInt10("pzgsmv_init::rowptr", m_loc+1, rowptr);
+    PrintInt10("pzgsmv_init::extern_start", m_loc, extern_start);
+    CHECK_MALLOC(iam, "Exit pzgsmv_init()");
+#endif
+
+} /* PZGSMV_INIT */
+
+
+/* Computes ax = A*x for the locally stored rows, or abs(A)*abs(x) when `abs`
+ * is nonzero; in abs mode the results are written to ax[] as plain doubles.
+ */
+void
+pzgsmv
+(
+ int_t  abs,               /* Input. Do abs(A)*abs(x). */
+ SuperMatrix *A_internal,  /* Input. Matrix A permuted by columns.
+			      The column indices are translated into
+			      the relative positions in the gathered x-vector.
+			      The type of A can be:
+			      Stype = NR_loc; Dtype = SLU_Z; Mtype = GE. */
+ gridinfo_t *grid,         /* Input */
+ pzgsmv_comm_t *gsmv_comm, /* Input. The data structure for communication. */
+ doublecomplex x[],       /* Input. The distributed source vector */
+ doublecomplex ax[]       /* Output. The distributed destination vector */
+)
+{
+    NRformat_loc *Astore;
+    int iam, procs;
+    int_t i, j, p, m, m_loc, n, fst_row, jcol;
+    int_t *colind, *rowptr;
+    int   *SendCounts, *RecvCounts;
+    int_t *ind_tosend, *ind_torecv, *ptr_ind_tosend, *ptr_ind_torecv;
+    int_t *extern_start, TotalValSend;
+    doublecomplex *nzval, *val_tosend, *val_torecv;
+    doublecomplex zero = {0.0, 0.0}, temp;
+    double *ax_abs = (double *) ax; /* Abs mode reuses ax's storage for real results. */
+    MPI_Request *send_req, *recv_req;
+    MPI_Status status;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pzgsmv()");
+#endif
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    iam = grid->iam;
+    procs = grid->nprow * grid->npcol;
+    Astore = (NRformat_loc *) A_internal->Store;
+    m = A_internal->nrow;
+    n = A_internal->ncol;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    colind = Astore->colind;
+    rowptr = Astore->rowptr;
+    nzval = (doublecomplex *) Astore->nzval;
+    extern_start = gsmv_comm->extern_start;
+    ind_torecv = gsmv_comm->ind_torecv;
+    ptr_ind_tosend = gsmv_comm->ptr_ind_tosend;
+    ptr_ind_torecv = gsmv_comm->ptr_ind_torecv;
+    SendCounts = gsmv_comm->SendCounts;
+    RecvCounts = gsmv_comm->RecvCounts;
+    val_tosend = (doublecomplex *) gsmv_comm->val_tosend;
+    val_torecv = (doublecomplex *) gsmv_comm->val_torecv;
+    TotalValSend = gsmv_comm->TotalValSend;
+
+    /* ------------------------------------------------------------
+       COPY THE X VALUES INTO THE SEND BUFFER.
+       ------------------------------------------------------------*/
+    for (i = 0; i < TotalValSend; ++i) {
+        j = ind_torecv[i] - fst_row; /* Relative index in x[] */
+	val_tosend[i] = x[j];
+    }
+
+    /* ------------------------------------------------------------
+       COMMUNICATE THE X VALUES.
+       ------------------------------------------------------------*/
+    if ( !(send_req = (MPI_Request *)
+	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))))
+        ABORT("Malloc fails for recv_req[].");
+    recv_req = send_req + procs;
+    for (p = 0; p < procs; ++p) {
+        if ( RecvCounts[p] ) {
+	    MPI_Isend(&val_tosend[ptr_ind_torecv[p]], RecvCounts[p],
+                      SuperLU_MPI_DOUBLE_COMPLEX, p, iam,
+                      grid->comm, &send_req[p]);
+	}
+	if ( SendCounts[p] ) {
+	    MPI_Irecv(&val_torecv[ptr_ind_tosend[p]], SendCounts[p],
+                      SuperLU_MPI_DOUBLE_COMPLEX, p, p,
+                      grid->comm, &recv_req[p]);
+	}
+    }
+    for (p = 0; p < procs; ++p) {
+        if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status);
+	if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status);
+    }
+    
+    /* ------------------------------------------------------------
+       PERFORM THE ACTUAL MULTIPLICATION.
+       ------------------------------------------------------------*/
+    if ( abs ) { /* Perform abs(A)*abs(x) */
+        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+            ax_abs[i] = 0.0;
+
+	    /* Multiply the local part. */
+	    for (j = rowptr[i]; j < extern_start[i]; ++j) {
+	        jcol = colind[j];
+		ax_abs[i] += z_abs1(&nzval[j]) * z_abs1(&x[jcol]);
+	    }
+
+	    /* External part: same z_abs1 measure, so ownership of x does not change the sum. */
+	    for (; j < rowptr[i+1]; ++j) {
+	        jcol = colind[j];
+	        ax_abs[i] += z_abs1(&nzval[j]) * z_abs1(&val_torecv[jcol]);
+	    }
+	}
+    } else {
+        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+	    ax[i] = zero;
+
+	    /* Multiply the local part. */
+	    for (j = rowptr[i]; j < extern_start[i]; ++j) {
+	        jcol = colind[j];
+                zz_mult(&temp, &nzval[j], &x[jcol]);
+                z_add(&ax[i], &ax[i], &temp);
+	    }
+
+	    /* Multiply the external part. */
+	    for (; j < rowptr[i+1]; ++j) {
+	        jcol = colind[j];
+                zz_mult(&temp, &nzval[j], &val_torecv[jcol]);
+                z_add(&ax[i], &ax[i], &temp);
+	    }
+	}
+    }
+
+    SUPERLU_FREE(send_req);
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pzgsmv()");
+#endif
+
+} /* PZGSMV */
+/* Frees the buffers recorded in gsmv_comm; optional buffers are released only when non-NULL. */
+void pzgsmv_finalize(pzgsmv_comm_t *gsmv_comm)
+{
+    /* extern_start is released unconditionally; the index lists may be unset. */
+    SUPERLU_FREE(gsmv_comm->extern_start);
+    if ( gsmv_comm->ind_tosend != NULL ) SUPERLU_FREE(gsmv_comm->ind_tosend);
+    if ( gsmv_comm->ind_torecv != NULL ) SUPERLU_FREE(gsmv_comm->ind_torecv);
+    /* ptr_ind_torecv and RecvCounts get no free of their own in this routine. */
+    SUPERLU_FREE(gsmv_comm->ptr_ind_tosend);
+    SUPERLU_FREE(gsmv_comm->SendCounts);
+    if ( gsmv_comm->val_tosend != NULL ) SUPERLU_FREE(gsmv_comm->val_tosend);
+    if ( gsmv_comm->val_torecv != NULL ) SUPERLU_FREE(gsmv_comm->val_torecv);
+}
+
diff --git a/EXAMPLE/pzgstrs_Bglobal_Bsend.c b/EXAMPLE/pzgstrs_Bglobal_Bsend.c
new file mode 100644
index 0000000..be9d45d
--- /dev/null
+++ b/EXAMPLE/pzgstrs_Bglobal_Bsend.c
@@ -0,0 +1,1031 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Solves a system of distributed linear equations
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ * </pre>
+ */
+
+#include "superlu_zdefs.h"
+
+#define ISEND_IRECV /* Enables the "#ifdef ISEND_IRECV" (MPI_Isend) branches below. */
+
+#define BSEND 1     /* The MPI_Bsend branches sit in the #else of ISEND_IRECV. */
+
+/*
+ * Function prototypes
+ */
+#ifdef _CRAY
+fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
+		   doublecomplex*, int*, doublecomplex*, int*);
+fortran void CGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, 
+		   int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*);
+_fcd ftcs1; /* Set to "L" in pzgstrs_Bglobal before the CTRSM calls. */
+_fcd ftcs2; /* Set to "N" in pzgstrs_Bglobal before the CTRSM calls. */
+_fcd ftcs3; /* Set to "U" in pzgstrs_Bglobal before the CTRSM calls. */
+#endif
+static void gather_diag_to_all(int_t, int_t, doublecomplex [], Glu_persist_t *,
+                               LocalLU_t *, gridinfo_t *, int_t, int_t [],
+                               int_t [], doublecomplex [], int_t, doublecomplex []); /* defined after pzgstrs_Bglobal */
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * pzgstrs_Bglobal solves a system of distributed linear equations
+ * A*X = B with a general N-by-N matrix A using the LU factorization
+ * computed by pzgstrf.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from pzgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_zdefs.h for the definition of 'LUstruct_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) doublecomplex*
+ *        On entry, the right-hand side matrix of the possibly equilibrated
+ *        and row permuted system.
+ *        On exit, the solution matrix of the possibly equilibrated
+ *        and row permuted system if info = 0;
+ *
+ *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all 
+ *              processes when calling this routine.
+ *
+ * ldb    (input) int (global)
+ *        Leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * </pre>
+ */
+
+void
+pzgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, 
+                doublecomplex *B, int_t ldb, int nrhs, 
+                SuperLUStat_t *stat, int *info)
+{
+    /* Forward solve L*y = b, then back solve U*x = y; B is overwritten by X. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    doublecomplex alpha = {1.0, 0.0};
+    doublecomplex zero = {0.0, 0.0};
+    doublecomplex *lsum;  /* Local running sum of the updates to B-components */
+    doublecomplex *x;     /* X component at step k. */
+    doublecomplex *lusup, *dest;
+    doublecomplex *recvbuf, *tempv;
+    doublecomplex *rtemp; /* Result of full matrix-vector multiply. */
+    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
+    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+    int_t  iam, kcol, krow, mycol, myrow;
+    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
+    int_t  nb, nlb, nub, nsupers;
+    int_t  *xsup, *lsub, *usub;
+    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
+    int_t  Pc, Pr;
+    int    knsupc, nsupr;
+    int    ldalsum;   /* Number of lsum entries locally owned. */
+    int    maxrecvsz, p, pi;
+    int_t  **Lrowind_bc_ptr;
+    doublecomplex **Lnzval_bc_ptr;
+    MPI_Status status;
+#if defined(ISEND_IRECV) || defined(BSEND)
+    MPI_Request *send_req;
+    int test_flag;
+#endif
+
+    /*-- Counts used for L-solve --*/
+    int_t  *fmod;         /* Modification count for L-solve. */
+    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int_t  *frecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int_t  nleaf = 0, nroot = 0;
+
+    /*-- Counts used for U-solve --*/
+    int_t  *bmod;         /* Modification count for U-solve. */
+    int_t  **bsendx_plist = Llu->bsendx_plist;
+    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int_t  *brecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
+    double t;
+#if ( DEBUGlevel>=2 )
+    int_t Ublocks = 0;
+#endif
+
+    t = SuperLU_timer_();
+
+    /* Test input parameters. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( nrhs < 0 ) *info = -9; /* NOTE(review): nrhs is argument 6; -9 kept for callers */
+    if ( *info ) {
+	pxerbla("PZGSTRS_BGLOBAL", grid, -*info);
+	return;
+    }
+	
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+#ifdef BSEND
+    if(!iam) {
+      printf("Using MPI_Bsend in complex triangular solve\n");
+      fflush(stdout);
+    }
+#endif
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
+    stat->ops[SOLVE] = 0.0;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pzgstrs_Bglobal()");
+#endif
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PZGSTRS_BGLOBAL. */
+    if ( !(fmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for fmod[].");
+    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
+    if ( !(frecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for frecv[].");
+    Llu->frecv = frecv;
+
+#if defined(ISEND_IRECV) || defined(BSEND)
+    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(Pr*sizeof(MPI_Request))) )
+	ABORT("Malloc fails for send_req[].");
+    for (i = 0; i < Pr; ++i) send_req[i] = MPI_REQUEST_NULL;
+#endif
+
+#ifdef _CRAY
+    ftcs1 = _cptofcd("L", strlen("L"));
+    ftcs2 = _cptofcd("N", strlen("N"));
+    ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+
+    /* Obtain ilsum[] and ldalsum for process column 0. */
+    ilsum = Llu->ilsum;
+    ldalsum = Llu->ldalsum;
+
+    /* Allocate working storage. */
+    knsupc = sp_ienv_dist(3);
+    k = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
+    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
+    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs 
+        + nlb * LSUM_H)) )
+	ABORT("Calloc fails for lsum[].");
+    if ( !(x = doublecomplexMalloc_dist(((size_t)ldalsum) * nrhs 
+        + nlb * XK_H)) )
+	ABORT("Malloc fails for x[].");
+    if ( !(recvbuf = doublecomplexMalloc_dist(k)) )
+	ABORT("Malloc fails for recvbuf[].");
+    if ( !(rtemp = doublecomplexMalloc_dist(k)) )
+	ABORT("Malloc fails for rtemp[].");
+
+    
+    /*---------------------------------------------------
+     * Forward solve Ly = b.
+     *---------------------------------------------------*/
+
+    /*
+     * Copy B into X on the diagonal processes.
+     */
+    ii = 0;
+    for (k = 0; k < nsupers; ++k) {
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    lk = LBi( k, grid );   /* Local block number. */
+	    il = LSUM_BLK( lk );
+	    lsum[il - LSUM_H].r = k;/* Block number prepended in the header. */
+	    lsum[il - LSUM_H].i = 0;
+	    kcol = PCOL( k, grid );
+	    if ( mycol == kcol ) { /* Diagonal process. */
+		jj = X_BLK( lk );
+		x[jj - XK_H].r = k; /* Block number prepended in the header. */
+		x[jj - XK_H].i = 0;
+		RHS_ITERATE(j)
+		    for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */
+			x[i + jj + j*knsupc] = B[i + ii + j*ldb];
+	    }
+	}
+	ii += knsupc;
+    }
+
+    /*
+     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && fmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
+		    assert( frecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+    }
+
+    /* ---------------------------------------------------------
+       Solve the leaf nodes first by all the diagonal processes.
+       --------------------------------------------------------- */
+#if ( DEBUGlevel>=1 )
+    printf("(%2d) nleaf %4d\n", iam, nleaf);
+#endif
+    for (k = 0; k < nsupers && nleaf; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
+		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#else
+		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
+		    + 10 * knsupc * nrhs; /* complex division */
+		--nleaf;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p)
+		    if ( fsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+#if 1
+			MPI_Test( &send_req[p], &test_flag, &status );
+#else
+			if ( send_req[p] != MPI_REQUEST_NULL ) 
+			    MPI_Wait( &send_req[p], &status );
+#endif
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				  SuperLU_MPI_DOUBLE_COMPLEX, 
+                                  pi, Xk, grid->comm, &send_req[p]);
+#else
+#ifdef BSEND
+			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				  SuperLU_MPI_DOUBLE_COMPLEX, 
+                                  pi, Xk, grid->comm );
+#else
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				  SuperLU_MPI_DOUBLE_COMPLEX, 
+                                  pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H].r, pi); /* header .r holds the block number */
+#endif
+		    }
+		
+		/*
+		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		 */
+		nb = lsub[0] - 1;
+		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		luptr = knsupc; /* Skip diagonal block L(k,k). */
+		
+		zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			   send_req,stat);
+#ifdef ISEND_IRECV
+		/* Wait for previous Isends to complete. */
+		for (p = 0; p < Pr; ++p) {
+		    if ( fsendx_plist[lk][p] != EMPTY )
+			/*MPI_Wait( &send_req[p], &status );*/
+			MPI_Test( &send_req[p], &test_flag, &status );
+		}
+#endif
+	    }
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+    /* -----------------------------------------------------------
+       Compute the internal nodes asynchronously by all processes.
+       ----------------------------------------------------------- */
+#if ( DEBUGlevel>=1 )
+    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
+	   iam, nfrecvx, nfrecvmod, nleaf);
+#endif
+
+    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+#if 1
+	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
+		 MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
+#else
+	/* -MPI- FATAL: Remote protocol queue full */
+	MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
+		  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &request );
+	MPI_Wait( &request, &status );
+#endif
+
+	k = (*recvbuf).r; /* Block number is carried in the header's real part. */
+
+
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+	
+	switch ( status.MPI_TAG ) {
+	  case Xk:
+	      --nfrecvx;
+	      lk = LBj( k, grid ); /* Local block number, column-wise. */
+	      lsub = Lrowind_bc_ptr[lk];
+	      lusup = Lnzval_bc_ptr[lk];
+	      if ( lsub ) {
+		  nb   = lsub[0];
+		  lptr = BC_HEADER;
+		  luptr = 0;
+		  knsupc = SuperSize( k );
+
+		  /*
+		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		   */
+		  zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			     send_req, stat);
+	      } /* if lsub */
+
+	      break;
+
+	  case LSUM:
+	      --nfrecvmod;
+	      lk = LBi( k, grid ); /* Local block number, row-wise. */
+	      ii = X_BLK( lk );
+	      knsupc = SuperSize( k );
+	      tempv = &recvbuf[LSUM_H];
+	      RHS_ITERATE(j)
+		  for (i = 0; i < knsupc; ++i)
+		      z_add(&x[i + ii + j*knsupc],
+			    &x[i + ii + j*knsupc],
+			    &tempv[i + j*knsupc]);
+
+	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
+		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
+		  lk = LBj( k, grid ); /* Local block number, column-wise. */
+		  lsub = Lrowind_bc_ptr[lk];
+		  lusup = Lnzval_bc_ptr[lk];
+		  nsupr = lsub[1];
+#ifdef _CRAY
+		  CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+			lusup, &nsupr, &x[ii], &knsupc);
+#else
+		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		  stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
+		      + 10 * knsupc * nrhs; /* complex division */
+
+#if ( DEBUGlevel>=2 )
+		  printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		  /*
+		   * Send Xk to process column Pc[k].
+		   */
+		  kcol = PCOL( k, grid );
+		  for (p = 0; p < Pr; ++p)
+		      if ( fsendx_plist[lk][p] != EMPTY ) {
+			  pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+#if 1
+			  MPI_Test( &send_req[p], &test_flag, &status );
+#else
+			  if ( send_req[p] != MPI_REQUEST_NULL )
+			    MPI_Wait( &send_req[p], &status );
+#endif
+			  MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H,
+				    SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, 
+				    &send_req[p]);
+#else
+#ifdef BSEND
+			  MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#else
+			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				 iam, x[ii-XK_H].r, pi);
+#endif
+		      }
+
+		  /*
+		   * Perform local block modifications.
+		   */
+		  nb = lsub[0] - 1;
+		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		  luptr = knsupc; /* Skip diagonal block L(k,k). */
+
+		  zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
+			     send_req, stat);
+#ifdef ISEND_IRECV
+		  /* Wait for the previous Isends to complete. */
+		  for (p = 0; p < Pr; ++p) {
+		      if ( fsendx_plist[lk][p] != EMPTY )
+			  MPI_Test( &send_req[p], &test_flag, &status );
+		  }
+#endif
+	      } /* if */
+
+	      break;
+
+#if ( DEBUGlevel>=1 )	      
+	    default:
+	      printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+	      break;
+#endif
+	  } /* switch */
+
+    } /* while not finished ... */
+
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+#if ( PRNTlevel==2 )
+    printf("\n(%d) .. After L-solve: y =\n", iam);
+    for (i = 0, k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    ii = X_BLK( lk );
+	    for (j = 0; j < knsupc; ++j)
+		printf("\t(%d)\t%4d\t%.10f\t%.10f\n", iam, xsup[k]+j, x[ii+j].r, x[ii+j].i);
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif
+
+    SUPERLU_FREE(fmod);
+    SUPERLU_FREE(frecv);
+    SUPERLU_FREE(rtemp);
+
+    /* MPI_Barrier( grid->comm );  Drain messages in the forward solve. */
+
+
+    /*---------------------------------------------------
+     * Back solve Ux = y.
+     *
+     * The Y components from the forward solve is already
+     * on the diagonal processes.
+     *---------------------------------------------------*/
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PZGSTRS_BGLOBAL. */
+    if ( !(bmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for bmod[].");
+    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
+    if ( !(brecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for brecv[].");
+    Llu->brecv = brecv;
+
+    /*
+     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && bmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+    }
+
+    /* Re-initialize lsum to zero. Each block header is already in place. */
+    for (k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    il = LSUM_BLK( lk );
+	    dest = &lsum[il];
+	    RHS_ITERATE(j)
+		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero;
+	}
+    }
+
+    /* Set up additional pointers for the index and value arrays of U.
+       nlb is the number of local block rows. */
+    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
+    if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) )
+	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
+					     blocks in a block column. */
+    Urbs1 = Urbs + nub;
+    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
+        ABORT("Malloc fails for Ucb_indptr[]");
+    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
+        ABORT("Malloc fails for Ucb_valptr[]");
+
+    /* Count number of row blocks in a block column. 
+       One pass of the skeleton graph of U. */
+    for (lk = 0; lk < nlb; ++lk) {
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    /* usub[0] -- number of column blocks in this block row. */
+#if ( DEBUGlevel>=2 )
+	    Ublocks += usub[0];
+#endif
+	    i = BR_HEADER; /* Pointer in index array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];            /* Global block number */
+		++Urbs[LBj(k,grid)];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+    /* Set up the vertical linked lists for the row blocks.
+       One pass of the skeleton graph of U. */
+    for (lb = 0; lb < nub; ++lb)
+	if ( Urbs[lb] ) { /* Not an empty block column. */
+	    if ( !(Ucb_indptr[lb]
+		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+		ABORT("Malloc fails for Ucb_indptr[lb][]");
+	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
+		ABORT("Malloc fails for Ucb_valptr[lb][]");
+	}
+    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    i = BR_HEADER; /* Pointer in index array. */
+	    j = 0;         /* Pointer in nzval array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];          /* Global block number, column-wise. */
+		ljb = LBj( k, grid ); /* Local block number, column-wise. */
+		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
+		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
+		Ucb_valptr[ljb][Urbs1[ljb]] = j;
+		++Urbs1[ljb];
+		j += usub[i+1];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+#if ( DEBUGlevel>=2 )
+    for (p = 0; p < Pr*Pc; ++p) {
+	if (iam == p) {
+	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("(%2d) Local col %2d: # row blocks %2d\n",
+		       iam, lb, Urbs[lb]);
+		if ( Urbs[lb] ) {
+		    for (i = 0; i < Urbs[lb]; ++i)
+			printf("(%2d) .. row blk %2d:\
+                               lbnum %d, indpos %d, valpos %d\n",
+			       iam, i, 
+			       Ucb_indptr[lb][i].lbnum,
+			       Ucb_indptr[lb][i].indpos,
+			       Ucb_valptr[lb][i]);
+		}
+	    }
+	}
+	MPI_Barrier( grid->comm );
+    }
+    for (p = 0; p < Pr*Pc; ++p) {
+	if ( iam == p ) {
+	    printf("\n(%d) bsendx_plist[][]", iam);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("\n(%d) .. local col %2d: ", iam, lb);
+		for (i = 0; i < Pr; ++i)
+		    printf("%4d", bsendx_plist[lb][i]);
+	    }
+	    printf("\n");
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif /* DEBUGlevel */
+
+
+#if ( PRNTlevel>=3 )
+    t = SuperLU_timer_() - t;
+    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+    /*
+     * Solve the roots first by all the diagonal processes.
+     */
+#if ( DEBUGlevel>=1 )
+    printf("(%2d) nroot %4d\n", iam, nroot);
+#endif
+    for (k = nsupers-1; k >= 0 && nroot; --k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid ); /* Local block number, row-wise. */
+	    if ( brecv[lk]==0 && bmod[lk]==0 ) {
+		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#else
+		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
+		    + 10 * knsupc * nrhs; /* complex division */
+		--nroot;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p)
+		    if ( bsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+#if 1
+			MPI_Test( &send_req[p], &test_flag, &status );
+#else
+			if ( send_req[p] != MPI_REQUEST_NULL )
+			  MPI_Wait( &send_req[p], &status );
+#endif
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				  SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[p]);
+#else
+#ifdef BSEND
+			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				 SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#else
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				 SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H].r, pi);
+#endif
+		    }
+		
+		/*
+		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
+		 */
+		if ( Urbs[lk] ) 
+		    zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+			       send_req, stat);
+#ifdef ISEND_IRECV
+		/* Wait for the previous Isends to complete. */
+		for (p = 0; p < Pr; ++p) {
+		    if ( bsendx_plist[lk][p] != EMPTY )
+			MPI_Test( &send_req[p], &test_flag, &status );
+		}
+#endif
+	    } /* if root ... */
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+
+    /*
+     * Compute the internal nodes asynchronously by all processes.
+     */
+    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE,
+		 MPI_ANY_TAG, grid->comm, &status );
+	
+	k = (*recvbuf).r;
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+
+	switch ( status.MPI_TAG ) {
+	    case Xk:
+	        --nbrecvx;
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		/*
+		 * Perform local block modifications:
+		 *         lsum[i] -= U_i,k * X[k]
+		 */
+		zlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
+			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
+			   send_req, stat);
+
+	        break;
+
+	    case LSUM:
+		--nbrecvmod;
+		lk = LBi( k, grid ); /* Local block number, row-wise. */
+		ii = X_BLK( lk );
+		knsupc = SuperSize( k );
+		tempv = &recvbuf[LSUM_H];
+		RHS_ITERATE(j)
+		    for (i = 0; i < knsupc; ++i)
+			z_add(&x[i + ii + j*knsupc],
+			      &x[i + ii + j*knsupc],
+			      &tempv[i + j*knsupc]);
+
+		if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
+		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
+		    lk = LBj( k, grid ); /* Local block number, column-wise. */
+		    lsub = Lrowind_bc_ptr[lk];
+		    lusup = Lnzval_bc_ptr[lk];
+		    nsupr = lsub[1];
+#ifdef _CRAY
+		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+			  lusup, &nsupr, &x[ii], &knsupc);
+#else
+		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		    stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
+			+ 10 * knsupc * nrhs; /* complex division */
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    kcol = PCOL( k, grid );
+		    for (p = 0; p < Pr; ++p)
+			if ( bsendx_plist[lk][p] != EMPTY ) {
+			    pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+#if 1
+			    MPI_Test( &send_req[p], &test_flag, &status );
+#else
+			    if ( send_req[p] != MPI_REQUEST_NULL )
+			        MPI_Wait( &send_req[p], &status );
+#endif
+			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				      SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+				      &send_req[p] );
+#else
+#ifdef BSEND
+			    MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#else
+			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				   iam, x[ii - XK_H].r, pi);
+#endif
+			}
+		
+		    /*
+		     * Perform local block modifications: 
+		     *         lsum[i] -= U_i,k * X[k]
+		     */
+		    if ( Urbs[lk] )
+			zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+				   send_req, stat);
+#ifdef ISEND_IRECV
+		    /* Wait for the previous Isends to complete. */
+		    for (p = 0; p < Pr; ++p) {
+			if ( bsendx_plist[lk][p] != EMPTY )
+			    /*MPI_Wait( &send_req[p], &status );*/
+			    MPI_Test( &send_req[p], &test_flag, &status );
+		    }
+#endif
+		} /* if becomes solvable */
+		
+		break;
+
+#if ( DEBUGlevel>=1 )
+	      default:
+		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+		break;
+#endif		
+
+	} /* switch */
+
+    } /* while not finished ... */
+
+#if ( PRNTlevel>=3 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
+#endif
+
+
+    /* Copy the solution X into B (on all processes). */
+    {
+	int_t num_diag_procs, *diag_procs, *diag_len;
+	doublecomplex *work;
+
+	get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
+		       &diag_procs, &diag_len);
+	jj = diag_len[0];
+	for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]);
+	if ( !(work = doublecomplexMalloc_dist(((size_t)jj)*nrhs)) )
+	    ABORT("Malloc fails for work[]");
+	gather_diag_to_all(n, nrhs, x, Glu_persist, Llu,
+			   grid, num_diag_procs, diag_procs, diag_len,
+			   B, ldb, work);
+	SUPERLU_FREE(diag_procs);
+	SUPERLU_FREE(diag_len);
+	SUPERLU_FREE(work);
+    }
+
+    /* Deallocate storage. */
+
+    SUPERLU_FREE(lsum);
+    SUPERLU_FREE(x);
+    SUPERLU_FREE(recvbuf);
+    for (i = 0; i < nub; ++i)
+	if ( Urbs[i] ) {
+	    SUPERLU_FREE(Ucb_indptr[i]);
+	    SUPERLU_FREE(Ucb_valptr[i]);
+	}
+    SUPERLU_FREE(Ucb_indptr);
+    SUPERLU_FREE(Ucb_valptr);
+    SUPERLU_FREE(Urbs);
+    SUPERLU_FREE(bmod);
+    SUPERLU_FREE(brecv);
+#if defined(ISEND_IRECV) || defined(BSEND) /* same guard as the send_req allocation */
+    for (p = 0; p < Pr; ++p) {
+        if ( send_req[p] != MPI_REQUEST_NULL )
+	    MPI_Wait( &send_req[p], &status );
+    }
+    SUPERLU_FREE(send_req);
+#endif
+
+    stat->utime[SOLVE] = SuperLU_timer_() - t;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pzgstrs_Bglobal()");
+#endif
+
+} /* PZGSTRS_BGLOBAL */
+
+
+/*! \brief
+ *
+ * <pre>
+ * Gather the components of x vector on the diagonal processes
+ * onto all processes, and combine them into the global vector y.
+ * </pre>
+ */
+static void
+gather_diag_to_all(int_t n, int_t nrhs, doublecomplex x[],
+		   Glu_persist_t *Glu_persist, LocalLU_t *Llu,
+		   gridinfo_t *grid, int_t num_diag_procs,
+		   int_t diag_procs[], int_t diag_len[],
+		   doublecomplex y[], int_t ldy, doublecomplex work[])
+{
+    /* The locals xsup and ilsum are never named below; presumably the
+       SuperSize/FstBlockC/X_BLK macros read them, so keep these names. */
+    int_t *xsup = Glu_persist->xsup;
+    int_t *ilsum = Llu->ilsum;
+    int_t nsupers = Glu_persist->supno[n-1] + 1;
+    int_t blk, col, lk, off, p, pos, row;
+    int me = grid->iam, root, width;
+    doublecomplex *src, *dst;
+
+    /* One round per diagonal process: diag_procs[p] broadcasts the blocks
+       p, p+num_diag_procs, p+2*num_diag_procs, ... over grid->comm. */
+    for (p = 0; p < num_diag_procs; ++p) {
+	root = diag_procs[p];
+	if ( me == root ) {
+	    /* Sender: pack its x blocks back to back into work[]. */
+	    pos = 0;
+	    for (blk = p; blk < nsupers; blk += num_diag_procs) {
+		width = SuperSize( blk );
+		lk = LBi( blk, grid );
+		off = X_BLK( lk );
+		src = x + off;
+		for (col = 0; col < nrhs; ++col, src += width)
+		    for (row = 0; row < width; ++row)
+			work[pos++] = src[row];
+	    }
+	} else {
+	    /* Receivers take the message length from diag_len[]. */
+	    pos = diag_len[p]*nrhs;
+	}
+	/* pos now holds the element count to broadcast on this process. */
+	MPI_Bcast( work, pos, SuperLU_MPI_DOUBLE_COMPLEX, root, grid->comm );
+
+	/* All processes: unpack work[] into the global vector y, whose
+	   columns are ldy entries apart. */
+	pos = 0;
+	for (blk = p; blk < nsupers; blk += num_diag_procs) {
+	    width = SuperSize( blk );
+	    dst = y + FstBlockC( blk );
+	    for (col = 0; col < nrhs; ++col, dst += ldy)
+		for (row = 0; row < width; ++row)
+		    dst[row] = work[pos++];
+	}
+    }
+} /* GATHER_DIAG_TO_ALL */
+
diff --git a/EXAMPLE/pzgstrs_lsum_Bsend.c b/EXAMPLE/pzgstrs_lsum_Bsend.c
new file mode 100644
index 0000000..5bc3042
--- /dev/null
+++ b/EXAMPLE/pzgstrs_lsum_Bsend.c
@@ -0,0 +1,423 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Performs block modifications
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ * </pre>
+ */
+
+#include "superlu_zdefs.h"
+
+#if 0
+#define ISEND_IRECV
+#else
+#define BSEND
+#endif
+
+/*
+ * Function prototypes
+ */
+#ifdef _CRAY
+fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
+		   doublecomplex*, int*, doublecomplex*, int*);
+fortran void CGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, 
+		   int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*);
+_fcd ftcs1;
+_fcd ftcs2;
+_fcd ftcs3;
+#endif
+
+/************************************************************************/
+/*! \brief Forward-solve update: lsum[i] -= L_i,k * X[k] for local blocks.
+ *
+ * <pre>
+ * When a block row's counter fmod[] reaches zero, its lsum is sent to the
+ * diagonal process or, on the diagonal process, added into x[]; a block
+ * that also awaits no messages (frecv[]==0) is solved, sent and recursed.
+ * </pre>
+ */
+
+void zlsum_fmod
+/************************************************************************/
+(
+ doublecomplex *lsum,    /* Sum of local modifications.                        */
+ doublecomplex *x,       /* X array (local)                                    */
+ doublecomplex *xk,      /* X[k].                                              */
+ doublecomplex *rtemp,   /* Result of full matrix-vector multiply.             */
+ int   nrhs,      /* Number of right-hand sides.                        */
+ int   knsupc,    /* Size of supernode k.                               */
+ int_t k,         /* The k-th component of X.                           */
+ int_t *fmod,     /* Modification count for L-solve.                    */
+ int_t nlb,       /* Number of L blocks.                                */
+ int_t lptr,      /* Starting position in lsub[*].                      */
+ int_t luptr,     /* Starting position in lusup[*].                     */
+ int_t *xsup,     /* xsup[b] is the first global row of block b.        */
+ gridinfo_t *grid,       /* Process grid; iam, nprow and comm are read. */
+ LocalLU_t *Llu,         /* Local L data and solve bookkeeping arrays.  */
+ MPI_Request send_req[], /* Request slots; used only with ISEND_IRECV.  */
+ SuperLUStat_t *stat     /* ops[SOLVE] accumulates the flop count.      */
+)
+{
+
+    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
+    doublecomplex *lusup, *lusup1;
+    doublecomplex *dest;
+    int    iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi;
+    int_t  i, ii, ik, il, ikcol, irow, j, lb, lk, rel;
+    int_t  *lsub, *lsub1, nlb1, lptr1, luptr1;
+    int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
+    int_t  *frecv = Llu->frecv;
+    int_t  **fsendx_plist = Llu->fsendx_plist;
+    MPI_Status status;
+    int test_flag;
+
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    lk = LBj( k, grid ); /* Local block number, column-wise. */
+    lsub = Llu->Lrowind_bc_ptr[lk];
+    lusup = Llu->Lnzval_bc_ptr[lk];
+    nsupr = lsub[1]; /* Passed below as the leading dimension of lusup[]. */
+
+    for (lb = 0; lb < nlb; ++lb) {
+	ik = lsub[lptr]; /* Global block number, row-wise. */
+	nbrow = lsub[lptr+1];
+#ifdef _CRAY
+	CGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc,
+	      &alpha, &lusup[luptr], &nsupr, xk,
+	      &knsupc, &beta, rtemp, &nbrow );
+#else
+	zgemm_( "N", "N", &nbrow, &nrhs, &knsupc,
+	       &alpha, &lusup[luptr], &nsupr, xk,
+	       &knsupc, &beta, rtemp, &nbrow );
+#endif
+	stat->ops[SOLVE] += 8 * nbrow * nrhs * knsupc + 2 * nbrow * nrhs;
+   
+	lk = LBi( ik, grid ); /* Local block number, row-wise. */
+	iknsupc = SuperSize( ik );
+	il = LSUM_BLK( lk );
+	dest = &lsum[il];
+	lptr += LB_DESCRIPTOR;
+	rel = xsup[ik]; /* Global row index of block ik. */
+	for (i = 0; i < nbrow; ++i) {
+	    irow = lsub[lptr++] - rel; /* Relative row. */
+	    RHS_ITERATE(j)
+		z_sub(&dest[irow + j*iknsupc],
+		      &dest[irow + j*iknsupc],
+		      &rtemp[i + j*nbrow]);
+	}
+	luptr += nbrow; /* Advance to the next block's values in lusup[]. */
+		    
+	if ( (--fmod[lk])==0 ) { /* Local accumulation done. */
+	    ikcol = PCOL( ik, grid );
+	    p = PNUM( myrow, ikcol, grid );
+	    if ( iam != p ) {
+#ifdef ISEND_IRECV
+#if 1
+	        MPI_Test( &send_req[myrow], &test_flag, &status );
+#else
+	        if ( send_req[myrow] != MPI_REQUEST_NULL ) 
+		    MPI_Wait( &send_req[myrow], &status );
+#endif
+		MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			 SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm, &send_req[myrow] );
+#else
+#ifdef BSEND
+		MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+		  	  SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm );
+#else
+		MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			 SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", /* %f needs a double: print the real part */
+		       iam, lsum[il-LSUM_H].r, iknsupc*nrhs+LSUM_H, p);
+#endif
+	    } else { /* Diagonal process: X[i] += lsum[i]. */
+		ii = X_BLK( lk );
+		RHS_ITERATE(j)
+		    for (i = 0; i < iknsupc; ++i)
+			z_add(&x[i + ii + j*iknsupc],
+			      &x[i + ii + j*iknsupc],
+			      &lsum[i + il + j*iknsupc]);
+		if ( frecv[lk]==0 ) { /* Becomes a leaf node. */
+		    fmod[lk] = -1; /* Do not solve X[k] in the future. */
+		    lk = LBj( ik, grid );/* Local block number, column-wise. */
+		    lsub1 = Llu->Lrowind_bc_ptr[lk];
+		    lusup1 = Llu->Lnzval_bc_ptr[lk];
+		    nsupr1 = lsub1[1];
+#ifdef _CRAY
+		    CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
+			  lusup1, &nsupr1, &x[ii], &iknsupc);
+#else
+		    ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+			   lusup1, &nsupr1, &x[ii], &iknsupc);
+#endif
+		    stat->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs
+			+ 10 * iknsupc * nrhs; /* complex division; the solved block has size iknsupc */
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, (int) ik); /* ik is int_t; cast to match %d */
+#endif
+		
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    for (p = 0; p < grid->nprow; ++p)
+			if ( fsendx_plist[lk][p] != EMPTY ) {
+			    pi = PNUM( p, ikcol, grid );
+#ifdef ISEND_IRECV
+#if 1	      
+			    MPI_Test( &send_req[p], &test_flag, &status );
+#else
+			    if ( send_req[p] != MPI_REQUEST_NULL ) 
+			        MPI_Wait( &send_req[p], &status );
+#endif
+			    MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				      SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+				      &send_req[p] );
+#else
+#ifdef BSEND
+			    MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				      SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#else
+			    MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n", /* real part only, see above */
+				   iam, x[ii-XK_H].r, pi);
+#endif
+			}
+
+		    /*
+		     * Perform local block modifications.
+		     */
+		    nlb1 = lsub1[0] - 1;
+		    lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc;
+		    luptr1 = iknsupc; /* Skip diagonal block L(I,I). */
+
+		    zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik,
+			       fmod, nlb1, lptr1, luptr1, xsup,
+			       grid, Llu, send_req, stat);
+#ifdef ISEND_IRECV
+		    /* Wait for previous Isends to complete. */
+		    for (p = 0; p < grid->nprow; ++p) {
+			if ( fsendx_plist[lk][p] != EMPTY )
+			    /*MPI_Wait( &send_req[p], &status );*/
+			    MPI_Test( &send_req[p], &test_flag, &status );
+		    }
+#endif
+		} /* if frecv[lk] == 0 */
+	    } /* if iam == p */
+	} /* if fmod[lk] == 0 */
+
+    } /* for lb ... */
+
+} /* zLSUM_FMOD */
+
+
+/************************************************************************/
+/*! \brief Back-solve update: lsum[i] -= U_i,k * X[k] for local blocks.
+ *
+ * <pre>
+ * When a block row's counter bmod[] reaches zero, its lsum is sent to the
+ * diagonal process or, on the diagonal process, added into x[]; a block
+ * that also awaits no messages (brecv[]==0) is solved, sent and recursed.
+ * </pre>
+ */
+void zlsum_bmod
+/************************************************************************/
+(
+ doublecomplex *lsum,        /* Sum of local modifications.                    */
+ doublecomplex *x,           /* X array (local).                               */
+ doublecomplex *xk,          /* X[k].                                          */
+ int    nrhs,	      /* Number of right-hand sides.                    */
+ int_t  k,            /* The k-th component of X.                       */
+ int_t  *bmod,        /* Modification count for U-solve.                */
+ int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
+ Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
+ int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
+ int_t  *xsup,        /* Supernode boundaries; presumably read by SuperSize(). */
+ gridinfo_t *grid,       /* Process grid; iam, nprow and comm are read. */
+ LocalLU_t *Llu,         /* Local LU data and solve bookkeeping arrays. */
+ MPI_Request send_req[], /* Request slots; used only with ISEND_IRECV.  */
+ SuperLUStat_t *stat     /* ops[SOLVE] accumulates the flop count.      */
+ )
+{
+
+    doublecomplex alpha = {1.0, 0.0};
+    int    iam, iknsupc, knsupc, myrow, nsupr, p, pi;
+    int_t  fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow,
+           j, jj, lk, lk1, nub, ub, uptr;
+    int_t  *usub;
+    doublecomplex *uval, *dest, *y;
+    doublecomplex temp;
+    int_t  *lsub;
+    doublecomplex *lusup;
+    int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
+    int_t  *brecv = Llu->brecv;
+    int_t  **bsendx_plist = Llu->bsendx_plist;
+    MPI_Status status;
+    int test_flag;
+
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    knsupc = SuperSize( k );
+    lk = LBj( k, grid ); /* Local block number, column-wise. */
+    nub = Urbs[lk];
+
+    for (ub = 0; ub < nub; ++ub) {
+	ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
+	usub = Llu->Ufstnz_br_ptr[ik];
+	uval = Llu->Unzval_br_ptr[ik];
+	i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */
+	i += UB_DESCRIPTOR;
+	il = LSUM_BLK( ik );
+	gik = ik * grid->nprow + myrow;   /* Global block number, row-wise. */
+	iknsupc = SuperSize( gik );
+	ikfrow = FstBlockC( gik );
+	iklrow = FstBlockC( gik+1 );
+
+	RHS_ITERATE(j) {
+	    dest = &lsum[il + j*iknsupc];
+	    y = &xk[j*knsupc];
+	    uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */
+	    for (jj = 0; jj < knsupc; ++jj) {
+		fnz = usub[i + jj];
+		if ( fnz < iklrow ) { /* Nonzero segment. */
+		    /* AXPY */
+		    for (irow = fnz; irow < iklrow; ++irow) {
+			zz_mult(&temp, &uval[uptr], &y[jj]);
+			z_sub(&dest[irow - ikfrow], &dest[irow - ikfrow],
+			      &temp);
+			++uptr;
+		    }
+		    stat->ops[SOLVE] += 8 * (iklrow - fnz);
+		}
+	    } /* for jj ... */
+	}
+
+	--bmod[ik];
+	if ( !(bmod[ik]) ) { /* Local accumulation done. */
+	    gikcol = PCOL( gik, grid );
+	    p = PNUM( myrow, gikcol, grid );
+	    if ( iam != p ) {
+#ifdef ISEND_IRECV
+#if 1
+	        MPI_Test( &send_req[myrow], &test_flag, &status );
+#else
+	        if ( send_req[myrow] != MPI_REQUEST_NULL ) 
+		    MPI_Wait( &send_req[myrow], &status );
+#endif
+		MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			  SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm, &send_req[myrow] );
+#else
+#ifdef BSEND
+		MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			  SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm );
+#else
+		MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			 SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", /* %f needs a double: print the real part */
+		       iam, lsum[il-LSUM_H].r, iknsupc*nrhs+LSUM_H, p);
+#endif
+	    } else { /* Diagonal process: X[i] += lsum[i]. */
+		ii = X_BLK( ik );
+		dest = &x[ii];
+		RHS_ITERATE(j)
+		    for (i = 0; i < iknsupc; ++i)
+			z_add(&dest[i + j*iknsupc], &dest[i + j*iknsupc],
+			      &lsum[i + il + j*iknsupc]);
+		if ( !brecv[ik] ) { /* Becomes a leaf node. */
+		    bmod[ik] = -1; /* Do not solve X[k] in the future. */
+		    lk1 = LBj( gik, grid ); /* Local block number. */
+		    lsub = Llu->Lrowind_bc_ptr[lk1];
+		    lusup = Llu->Lnzval_bc_ptr[lk1];
+		    nsupr = lsub[1]; /* Passed below as the leading dimension of lusup[]. */
+#ifdef _CRAY
+		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
+			  lusup, &nsupr, &x[ii], &iknsupc);
+#else
+		    ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &iknsupc);
+#endif
+		    stat->ops[SOLVE] += 4 * iknsupc * (iknsupc + 1) * nrhs
+			+ 10 * iknsupc * nrhs; /* complex division */
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, (int) gik); /* gik is int_t; cast to match %d */
+#endif
+
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    for (p = 0; p < grid->nprow; ++p)
+			if ( bsendx_plist[lk1][p] != EMPTY ) {
+			    pi = PNUM( p, gikcol, grid );
+#ifdef ISEND_IRECV
+#if 1
+			    MPI_Test( &send_req[p], &test_flag, &status );
+#else
+			    if ( send_req[p] != MPI_REQUEST_NULL ) 
+				MPI_Wait( &send_req[p], &status );
+#endif
+			    MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+				     &send_req[p] );
+#else
+#ifdef BSEND
+			    MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				      SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#else
+			    MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n", /* real part only, see above */
+				   iam, x[ii-XK_H].r, pi);
+#endif
+			}
+
+		    /*
+		     * Perform local block modifications.
+		     */
+		    if ( Urbs[lk1] )
+			zlsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs,
+				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+				   send_req, stat);
+#ifdef ISEND_IRECV
+		    /* Wait for the previous Isends to complete. */
+		    for (p = 0; p < grid->nprow; ++p) {
+			if ( bsendx_plist[lk1][p] != EMPTY )
+			    /*MPI_Wait( &send_req[p], &status );*/
+			    MPI_Test( &send_req[p], &test_flag, &status );
+		    }
+#endif
+		} /* if brecv[ik] == 0 */
+	    }
+	} /* if bmod[ik] == 0 */
+
+    } /* for ub ... */
+
+} /* zlSUM_BMOD */
+
diff --git a/EXAMPLE/pzutil.c b/EXAMPLE/pzutil.c
new file mode 100644
index 0000000..8efa328
--- /dev/null
+++ b/EXAMPLE/pzutil.c
@@ -0,0 +1,549 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Several matrix utilities
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief Gather A from the distributed compressed row format to global A in compressed column format.
+ *  Collective over grid->comm. GA->Store is allocated here; always returns 0. */
+int pzCompRow_loc_to_CompCol_global
+(
+ int_t need_value, /* Input. Whether need to gather numerical values */
+ SuperMatrix *A,   /* Input. Distributed matrix in NRformat_loc format. */
+ gridinfo_t *grid, /* Input */
+ SuperMatrix *GA   /* Output */
+)
+{
+    NRformat_loc *Astore;
+    NCformat *GAstore;
+    doublecomplex *a, *a_loc;
+    int_t *colind, *rowptr;
+    int_t *colptr_loc, *rowind_loc;
+    int_t m_loc, n, i, j, k, l;
+    int_t colnnz, fst_row, nnz_loc, nnz;
+    doublecomplex *a_recv = NULL;  /* Buffer to receive the blocks of values. */
+    doublecomplex *a_buf = NULL;   /* Buffer to merge blocks into block columns. */
+    int_t *itemp;       /* Scratch alias of colptr_send[], reused below. */
+    int_t *colptr_send; /* Buffer to redistribute the column pointers of the 
+			   local block rows.
+			   Use n_loc+1 pointers for each block. */
+    int_t *colptr_blk;  /* The column pointers for each block, after
+			   redistribution to the local block columns. 
+			   Use n_loc+1 pointers for each block. */
+    int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */
+    int_t *rowind_buf;  /* Buffer to merge blocks into block columns. */
+    int_t *fst_rows, *n_locs;
+    int   *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32;
+    int   it, n_loc, procs;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pzCompRow_loc_to_CompCol_global");
+#endif
+
+    /* Initialization. */
+    n = A->ncol;
+    Astore = (NRformat_loc *) A->Store;
+    nnz_loc = Astore->nnz_loc;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    a = Astore->nzval;
+    rowptr = Astore->rowptr;
+    colind = Astore->colind;
+    n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */
+
+    /* ------------------------------------------------------------
+       FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN.
+       ------------------------------------------------------------*/
+    zCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc,
+                             &rowind_loc, &colptr_loc);
+    /* Change local row index numbers to global numbers. */
+    for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row;
+
+#if ( DEBUGlevel>=2 )
+    printf("Proc %d\n", grid->iam);
+    PrintInt10("rowind_loc", nnz_loc, rowind_loc);
+    PrintInt10("colptr_loc", n+1, colptr_loc);
+#endif
+
+    procs = grid->nprow * grid->npcol;
+    if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) )
+	  ABORT("Malloc fails for fst_rows[]");
+    n_locs = fst_rows + procs; /* Second half of the same allocation. */
+    MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t,
+		  grid->comm);
+    for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i];
+    n_locs[procs-1] = n - fst_rows[procs-1];
+    if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) )
+	  ABORT("Malloc fails for recvcnts[]");
+    sendcnts = recvcnts + procs; /* Five int arrays share one allocation. */
+    rdispls = sendcnts + procs;
+    sdispls = rdispls + procs;
+    itemp_32 = sdispls + procs;
+
+    /* All-to-all transfer column pointers of each block.
+       Now the matrix view is P-by-P block-partition. */
+    /* n column starts for each column, and procs column ends for each block */
+    if ( !(colptr_send = intMalloc_dist(n + procs)) )
+	   ABORT("Malloc fails for colptr_send[]");
+    if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) )
+	   ABORT("Malloc fails for colptr_blk[]");
+    for (i = 0, j = 0; i < procs; ++i) {
+        for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k];
+	colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */
+	sendcnts[i] = n_locs[i] + 1;
+#if ( DEBUGlevel>=1 )
+	assert(j == fst_rows[i]);
+#endif
+	sdispls[i] = j + i;
+	recvcnts[i] = n_loc + 1;
+	rdispls[i] = i * (n_loc + 1);
+	j += n_locs[i]; /* First column of next block in colptr_loc[] */
+    }
+    MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t,
+		  colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm);
+
+    /* Adjust colptr_blk[] so that they contain the local indices of the
+       column pointers in the receive buffer. */
+    nnz = 0; /* The running sum of the nonzeros counted by far */
+    k = 0;
+    for (i = 0; i < procs; ++i) {
+	for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) {
+	    colnnz = colptr_blk[j+1] - colptr_blk[j];
+	    /*assert(k<=j);*/
+	    colptr_blk[k] = nnz;
+	    nnz += colnnz; /* Start of the next column */
+	    ++k;
+	}
+	colptr_blk[k++] = nnz; /* Add an END marker for each block */
+    }
+    /*assert(k == (n_loc+1)*procs);*/
+
+    /* Now prepare to transfer row indices and values. */
+    sdispls[0] = 0;
+    for (i = 0; i < procs-1; ++i) {
+        sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]];
+	sdispls[i+1] = sdispls[i] + sendcnts[i];
+    }
+    sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]];
+    for (i = 0; i < procs; ++i) {
+        j = rdispls[i]; /* Point to this block in colptr_blk[]. */
+	recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j];
+    }
+    rdispls[0] = 0; /* Recompute rdispls[] for row indices. */
+    for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i];
+
+    k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */
+    if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) )
+        ABORT("Malloc fails for rowind_recv[]");
+    rowind_buf = rowind_recv + k; /* Second half of the same allocation. */
+    MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t,
+		  rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm);
+    if ( need_value ) {
+        if ( !(a_recv = (doublecomplex *) doublecomplexMalloc_dist(2*k)) )
+	    ABORT("Malloc fails for a_recv[]");
+	a_buf = a_recv + k; /* Second half of the same allocation. */
+	MPI_Alltoallv(a_loc, sendcnts, sdispls, SuperLU_MPI_DOUBLE_COMPLEX,
+                      a_recv, recvcnts, rdispls, SuperLU_MPI_DOUBLE_COMPLEX,
+                      grid->comm);
+    }
+      
+    /* Reset colptr_loc[] to point to the n_loc global columns. */
+    colptr_loc[0] = 0;
+    itemp = colptr_send; /* colptr_send[] has been sent; reuse it as scratch. */
+    for (j = 0; j < n_loc; ++j) {
+        colnnz = 0;
+	for (i = 0; i < procs; ++i) {
+	    k = i * (n_loc + 1) + j; /* j-th column in i-th block */
+	    colnnz += colptr_blk[k+1] - colptr_blk[k];
+	}
+	colptr_loc[j+1] = colptr_loc[j] + colnnz;
+	itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */
+    }
+    itemp[n_loc] = colptr_loc[n_loc];
+      
+    /* Merge blocks of row indices into columns of row indices. */
+    for (i = 0; i < procs; ++i) {
+        k = i * (n_loc + 1);
+	for (j = 0; j < n_loc; ++j) { /* i-th block */
+	    for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) {
+	        rowind_buf[itemp[j]] = rowind_recv[l];
+		++itemp[j];
+	    }
+	}
+    }
+
+    if ( need_value ) {
+        for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j]; /* rewind the cursors */
+        for (i = 0; i < procs; ++i) {
+	    k = i * (n_loc + 1);
+	    for (j = 0; j < n_loc; ++j) { /* i-th block */
+	        for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) {
+		    a_buf[itemp[j]] = a_recv[l];
+		    ++itemp[j];
+		}
+	    }
+	}
+    }
+
+    /* ------------------------------------------------------------
+       SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT.
+       ------------------------------------------------------------*/
+    GA->nrow  = A->nrow;
+    GA->ncol  = A->ncol;
+    GA->Stype = SLU_NC;
+    GA->Dtype = A->Dtype;
+    GA->Mtype = A->Mtype;
+    GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) );
+    if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore");
+
+    /* First gather the size of each piece. */
+    nnz_loc = colptr_loc[n_loc];
+    MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm);
+    for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i];
+    GAstore->nnz = nnz;
+    
+    if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) )
+        ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]");
+    if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) )
+        ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]");
+      
+    /* Allgatherv for row indices. */
+    rdispls[0] = 0;
+    for (i = 0; i < procs-1; ++i) {
+        rdispls[i+1] = rdispls[i] + itemp[i];
+        itemp_32[i] = itemp[i]; /* MPI takes int counts; itemp[] is int_t. */
+    }
+    itemp_32[procs-1] = itemp[procs-1];
+    it = nnz_loc;
+    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, 
+		   itemp_32, rdispls, mpi_int_t, grid->comm);
+    if ( need_value ) {
+      if ( !(GAstore->nzval = (doublecomplex *) doublecomplexMalloc_dist (nnz)) )
+          ABORT ("SUPERLU_MALLOC fails for GAstore->nzval[]");
+      MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval, 
+		     itemp_32, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm);
+    } else GAstore->nzval = NULL;
+
+    /* Now gather the column pointers. */
+    rdispls[0] = 0;
+    for (i = 0; i < procs-1; ++i) {
+        rdispls[i+1] = rdispls[i] + n_locs[i];
+        itemp_32[i] = n_locs[i];
+    }
+    itemp_32[procs-1] = n_locs[procs-1];
+    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, 
+		   itemp_32, rdispls, mpi_int_t, grid->comm);
+
+    /* Recompute column pointers. */
+    for (i = 1; i < procs; ++i) {
+        k = rdispls[i];
+	for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1];
+	itemp[i] += itemp[i-1]; /* prefix sum */
+    }
+    GAstore->colptr[n] = nnz;
+
+#if ( DEBUGlevel>=2 )
+    if ( !grid->iam ) {
+        printf("After pzCompRow_loc_to_CompCol_global()\n");
+	zPrint_CompCol_Matrix_dist(GA);
+    }
+#endif
+
+    SUPERLU_FREE(a_loc);
+    SUPERLU_FREE(rowind_loc);
+    SUPERLU_FREE(colptr_loc);
+    SUPERLU_FREE(fst_rows);
+    SUPERLU_FREE(recvcnts);
+    SUPERLU_FREE(colptr_send);
+    SUPERLU_FREE(colptr_blk);
+    SUPERLU_FREE(rowind_recv);
+    if ( need_value) SUPERLU_FREE(a_recv);
+#if ( DEBUGlevel>=1 )
+    if ( !grid->iam ) printf("sizeof(NCformat) %d\n", (int) sizeof(NCformat)); /* size_t does not match %d */
+    CHECK_MALLOC(grid->iam, "Exit pzCompRow_loc_to_CompCol_global");
+#endif
+    return 0;
+} /* pzCompRow_loc_to_CompCol_global */
+
+
+/*! \brief Permute the distributed dense matrix: B <= perm(X). perm[i] = j means the i-th row of X is in the j-th row of B.
+ *  Rows travel as (target row, nrhs values) through two MPI_Alltoallv calls; returns 0. */
+int pzPermute_Dense_Matrix
+(
+ int_t fst_row,
+ int_t m_loc,
+ int_t row_to_proc[],
+ int_t perm[],
+ doublecomplex X[], int ldx,
+ doublecomplex B[], int ldb,
+ int nrhs,
+ gridinfo_t *grid
+)
+{
+    int_t row, tgt, rhs, pos, nsend, nrecv;
+    int q, dest, procs, send_total, recv_total;
+    int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs;
+    int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
+    int *ibuf_pos, *dbuf_pos;
+    int_t *send_ibuf, *recv_ibuf;
+    doublecomplex *send_dbuf, *recv_dbuf;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pzPermute_Dense_Matrix()");
+#endif
+
+    /* A single allocation is carved into ten int arrays of length procs. */
+    procs = grid->nprow * grid->npcol;
+    sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int));
+    if ( !sendcnts ) ABORT("Malloc fails for sendcnts[].");
+    sendcnts_nrhs = sendcnts + procs;
+    recvcnts      = sendcnts + 2*procs;
+    recvcnts_nrhs = sendcnts + 3*procs;
+    sdispls       = sendcnts + 4*procs;
+    sdispls_nrhs  = sendcnts + 5*procs;
+    rdispls       = sendcnts + 6*procs;
+    rdispls_nrhs  = sendcnts + 7*procs;
+    ibuf_pos      = sendcnts + 8*procs;
+    dbuf_pos      = sendcnts + 9*procs;
+    /* Count the local rows of X destined for each process. */
+    for (q = 0; q < procs; ++q) sendcnts[q] = 0;
+    for (row = 0; row < m_loc; ++row) {
+	dest = row_to_proc[perm[fst_row + row]];
+	++sendcnts[dest];
+    }
+    MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm);
+
+    /* Displacements are running sums; the *_nrhs arrays are scaled by nrhs. */
+    send_total = recv_total = 0;
+    for (q = 0; q < procs; ++q) {
+	sdispls[q] = send_total;
+	rdispls[q] = recv_total;
+	sdispls_nrhs[q] = sdispls[q] * nrhs;
+	rdispls_nrhs[q] = rdispls[q] * nrhs;
+	sendcnts_nrhs[q] = sendcnts[q] * nrhs;
+	recvcnts_nrhs[q] = recvcnts[q] * nrhs;
+	send_total += sendcnts[q];
+	recv_total += recvcnts[q];
+    }
+    nsend = send_total; /* Total number of sends */
+    nrecv = recv_total; /* Total number of recvs */
+
+    send_ibuf = intMalloc_dist(nsend + nrecv);
+    if ( !send_ibuf ) ABORT("Malloc fails for send_ibuf[].");
+    recv_ibuf = send_ibuf + nsend;
+    send_dbuf = doublecomplexMalloc_dist((nsend + nrecv)*nrhs);
+    if ( !send_dbuf ) ABORT("Malloc fails for send_dbuf[].");
+    recv_dbuf = send_dbuf + nsend * nrhs;
+
+    for (q = 0; q < procs; ++q) { /* next free slot per destination */
+	ibuf_pos[q] = sdispls[q];
+	dbuf_pos[q] = sdispls_nrhs[q];
+    }
+
+    /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */
+    for (row = 0; row < m_loc; ++row) {
+	tgt = perm[fst_row + row];
+	dest = row_to_proc[tgt];
+	send_ibuf[ibuf_pos[dest]] = tgt;
+	++ibuf_pos[dest];
+	pos = dbuf_pos[dest];
+	RHS_ITERATE(rhs) { /* RHS stored in row major in the buffer */
+	    send_dbuf[pos++] = X[row + rhs*ldx];
+	}
+	dbuf_pos[dest] += nrhs;
+    }
+
+    /* Transfer the (permuted) row indices and numerical values. */
+    MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t,
+		  recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm);
+    MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
+		  recv_dbuf, recvcnts_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
+		  grid->comm);
+
+    /* Copy the received rows into B at their local row offsets. */
+    pos = 0;
+    for (row = 0; row < m_loc; ++row) {
+	tgt = recv_ibuf[row] - fst_row; /* Relative row number */
+	RHS_ITERATE(rhs) { /* RHS stored in row major in the buffer */
+	    B[tgt + rhs*ldb] = recv_dbuf[pos++];
+	}
+    }
+
+    SUPERLU_FREE(sendcnts);
+    SUPERLU_FREE(send_ibuf);
+    SUPERLU_FREE(send_dbuf);
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Exit pzPermute_Dense_Matrix()");
+#endif
+    return 0;
+} /* pzPermute_Dense_Matrix */
+
+
+/*! \brief Initialize the data structure for the solution phase.
+ *  Collective over grid->comm; sets options->SolveInitialized and returns 0. */
+int zSolveInit(superlu_options_t *options, SuperMatrix *A, 
+	       int_t perm_r[], int_t perm_c[], int_t nrhs,
+	       LUstruct_t *LUstruct, gridinfo_t *grid,
+	       SOLVEstruct_t *SOLVEstruct)
+{
+    int_t *row_to_proc, *inv_perm_c, *itemp;
+    NRformat_loc *Astore;
+    int_t        i, fst_row, m_loc, p;
+    int          procs;
+
+    Astore = (NRformat_loc *) A->Store;
+    fst_row = Astore->fst_row;
+    m_loc = Astore->m_loc;
+    procs = grid->nprow * grid->npcol;
+    
+    if ( !grid->iam ) printf("@@@ enter zSolveInit, A->nrow %d\n", (int) A->nrow); /* cast to match %d */
+
+    if ( !(row_to_proc = intMalloc_dist(A->nrow)) )
+	ABORT("Malloc fails for row_to_proc[]");
+    if ( !grid->iam ) { printf("@@@ malloc(1) zSolveInit\n"); fflush(stdout); }
+    SOLVEstruct->row_to_proc = row_to_proc;
+
+    if ( !(inv_perm_c = intMalloc_dist(A->ncol)) )
+        ABORT("Malloc fails for inv_perm_c[].");
+    if ( !grid->iam ) { printf("@@@ malloc(2) zSolveInit\n"); fflush(stdout); }
+
+    for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i;
+    SOLVEstruct->inv_perm_c = inv_perm_c;
+
+    if ( !grid->iam ) printf("@@@ after malloc zSolveInit\n");
+
+    /* ------------------------------------------------------------
+       EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION.
+       SET UP THE MAPPING BETWEEN ROWS AND PROCESSES.
+       
+       NOTE: For those processes that do not own any row, fst_row
+             must be set so that fst_row == A->nrow. 
+       ------------------------------------------------------------*/
+    if ( !(itemp = intMalloc_dist(procs+1)) )
+        ABORT("Malloc fails for itemp[]");
+    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
+		  grid->comm);
+    itemp[procs] = A->nrow; /* Sentinel: end of the last process's rows. */
+    for (p = 0; p < procs; ++p) {
+        for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p;
+    }
+
+    if ( !grid->iam ) printf("@@@ after allgather zSolveInit\n");
+
+    /* The dump below follows the build's DEBUGlevel; it is not forced on here. */
+
+#if ( DEBUGlevel>=2 )
+    if ( !grid->iam ) {
+      printf("fst_row = %d\n", (int) fst_row); /* int_t may be wider than int */
+      PrintInt10("row_to_proc", A->nrow, row_to_proc);
+      PrintInt10("inv_perm_c", A->ncol, inv_perm_c);
+    }
+#endif
+    SUPERLU_FREE(itemp);
+
+#if 0
+    /* Compute the mapping between rows and processes. */
+    /* XSL NOTE: What happens if # of mapped processes is smaller
+       than total Procs?  For the processes without any row, let
+       fst_row be EMPTY (-1). Make sure this case works! */
+    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
+		  grid->comm);
+    itemp[procs] = n;
+    for (p = 0; p < procs; ++p) {
+        j = itemp[p];
+	if ( j != EMPTY ) {
+	    k = itemp[p+1];
+	    if ( k == EMPTY ) k = n;
+	    for (i = j ; i < k; ++i) row_to_proc[i] = p;
+	}
+    }
+#endif    
+
+    get_diag_procs(A->ncol, LUstruct->Glu_persist, grid,
+		   &SOLVEstruct->num_diag_procs,
+		   &SOLVEstruct->diag_procs,
+		   &SOLVEstruct->diag_len);
+
+    if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *)
+	   SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
+        ABORT("Malloc fails for gstrs_comm[]");
+    pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, 
+		 LUstruct->Glu_persist, SOLVEstruct);
+
+    if ( !(SOLVEstruct->gsmv_comm = (pzgsmv_comm_t *)
+           SUPERLU_MALLOC(sizeof(pzgsmv_comm_t))) )
+        ABORT("Malloc fails for gsmv_comm[]");
+    SOLVEstruct->A_colind_gsmv = NULL; /* Nothing allocated for it yet. */
+    
+    options->SolveInitialized = YES;
+    return 0;
+} /* zSolveInit */
+
+/*! \brief Tear down the solve-phase data held in SOLVEstruct and clear
+ *         the SolveInitialized (and, if set, RefineInitialized) flags. */
+void zSolveFinalize(superlu_options_t *options, SOLVEstruct_t *SOLVEstruct)
+{
+    int_t *colind; /* optional array; released only when it is non-NULL */
+    pxgstrs_finalize(SOLVEstruct->gstrs_comm);
+    if ( options->RefineInitialized ) { /* refinement communication was set up */
+	pzgsmv_finalize(SOLVEstruct->gsmv_comm);
+	options->RefineInitialized = NO;
+    }
+    SUPERLU_FREE(SOLVEstruct->gsmv_comm);
+    SUPERLU_FREE(SOLVEstruct->row_to_proc);
+    SUPERLU_FREE(SOLVEstruct->inv_perm_c);
+    SUPERLU_FREE(SOLVEstruct->diag_procs);
+    SUPERLU_FREE(SOLVEstruct->diag_len);
+    if ( (colind = SOLVEstruct->A_colind_gsmv) != NULL ) SUPERLU_FREE(colind);
+    options->SolveInitialized = NO;
+} /* zSolveFinalize */
+
+/*! \brief Check the inf-norm of the error vector 
+ *  Prints ||X-Xtrue||/||X|| per right-hand side on the process with iam == 0. */
+void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx,
+		      doublecomplex xtrue[], int_t ldxtrue, gridinfo_t *grid) 
+{
+    double err, xnorm, temperr, tempxnorm;
+    doublecomplex *x_work, *xtrue_work;
+    doublecomplex temp;
+    int_t i, j; /* same width as the int_t bounds n and nrhs */
+
+    for (j = 0; j < nrhs; j++) {
+      x_work = &x[j*ldx];
+      xtrue_work = &xtrue[j*ldxtrue];
+      err = xnorm = 0.0;
+      for (i = 0; i < n; i++) {
+        z_sub(&temp, &x_work[i], &xtrue_work[i]);
+	err = SUPERLU_MAX(err, slud_z_abs(&temp));
+	xnorm = SUPERLU_MAX(xnorm, slud_z_abs(&x_work[i]));
+      }
+
+      /* get the global max err & xnorm */
+      temperr = err;
+      tempxnorm = xnorm;
+      MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+      MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+
+      err = err / xnorm; /* NOTE(review): not finite when the global xnorm is 0 */
+      if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", (int) j, err);
+    }
+}
+
diff --git a/EXAMPLE/sp_ienv.c b/EXAMPLE/sp_ienv.c
new file mode 100644
index 0000000..1bcbe77
--- /dev/null
+++ b/EXAMPLE/sp_ienv.c
@@ -0,0 +1,119 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Chooses machine-dependent parameters for the local environment
+ */
+/*
+ * File name:		sp_ienv.c
+ * History:             Modified from lapack routine ILAENV
+ */
+#include "superlu_ddefs.h"
+#include "machines.h"
+
+/*! \brief
+
+<pre>
+    Purpose   
+    =======   
+
+    sp_ienv_dist() is inquired to choose machine-dependent parameters for the
+    local environment. See ISPEC for a description of the parameters.   
+
+    This version provides a set of parameters which should give good,   
+    but not optimal, performance on many of the currently available   
+    computers.  Users are encouraged to modify this subroutine to set   
+    the tuning parameters for their particular machine using the option   
+    and problem size information in the arguments.   
+
+    Arguments   
+    =========   
+
+    ISPEC   (input) int
+            Specifies the parameter to be returned as the value of SP_IENV_DIST.   
+            = 1: the panel size w; a panel consists of w consecutive
+	         columns of matrix A in the process of Gaussian elimination.
+		 The best value depends on machine's cache characteristics.
+            = 2: the relaxation parameter relax; if the number of
+	         nodes (columns) in a subtree of the elimination tree is less
+		 than relax, this subtree is considered as one supernode,
+		 regardless of their row structures.
+            = 3: the maximum size for a supernode, which must be greater
+                 than or equal to relaxation parameter (see case 2);
+	    = 4: the minimum row dimension for 2-D blocking to be used;
+	    = 5: the minimum column dimension for 2-D blocking to be used;
+	    = 6: the estimated fills factor for the adjacency structures 
+	         of L and U, compared with A;
+	    = 7: the minimum value of the product M*N*K for a GEMM call
+	         to be off-loaded to accelerator (e.g., GPU, Xeon Phi).
+	    
+   (SP_IENV_DIST) (output) int
+            >= 0: the value of the parameter specified by ISPEC   
+            < 0:  if SP_IENV_DIST = -k, the k-th argument had an illegal value.
+  
+    ===================================================================== 
+</pre>
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+/* Return the tuning parameter selected by ispec.  Only ispec = 2, 3, 6 and 7
+ * are handled here; any other value is reported through xerr_dist() and the
+ * function then returns 0.  Unless MACH is CRAY_T3E or IBM, the values for
+ * ispec 2 and 3 are taken from NREL / NSUP when set; ispec 7 from N_GEMM. */
+int_t
+sp_ienv_dist(int_t ispec)
+{
+    char *env_str;   /* value of the overriding environment variable, if set */
+    int   bad_arg;   /* argument index handed to xerr_dist() */
+
+    switch (ispec) {
+#if ( MACH==CRAY_T3E )
+	case 2: return (6);
+	case 3: return (30);
+
+#elif ( MACH==IBM )
+	case 2: return (20);
+	case 3: return (100);
+#else
+	case 2:
+	    /* relaxation parameter, default 2 */
+	    env_str = getenv("NREL");
+	    if ( env_str ) return atoi(env_str);
+	    return 2;
+
+	case 3:
+	    /* maximum supernode size, default 128 */
+	    env_str = getenv("NSUP");
+	    if ( env_str ) return atoi(env_str);
+	    return 128;
+
+#endif
+	case 6:
+	    /* estimated fill factor */
+	    return 5;
+
+	case 7:
+	    /* minimum M*N*K for off-loading a GEMM, default 10000 */
+	    env_str = getenv("N_GEMM");
+	    if ( env_str ) return atoi(env_str);
+	    return 10000;
+
+	default:
+	    break;
+    }
+
+    /* Invalid value for ISPEC */
+    bad_arg = 1;
+    xerr_dist("sp_ienv", &bad_arg);
+    return 0;
+} /* sp_ienv_dist */
+
diff --git a/EXAMPLE/zcreate_matrix.c b/EXAMPLE/zcreate_matrix.c
new file mode 100644
index 0000000..87774cf
--- /dev/null
+++ b/EXAMPLE/zcreate_matrix.c
@@ -0,0 +1,229 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Read the matrix from data file
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ * 
+ * ZCREATE_MATRIX read the matrix from data file in Harwell-Boeing format,
+ * and distribute it to processors in a distributed compressed row format.
+ * It also generate the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ *
+ * Arguments   
+ * =========      
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format. 
+ *
+ * NRHS  (input) int
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) doublecomplex**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) doublecomplex**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID  (input) gridinfo_t*
+ *       The 2D process mesh.
+ * </pre>
+ */
+
+int zcreate_matrix(SuperMatrix *A, int nrhs, doublecomplex **rhs,
+                   int *ldb, doublecomplex **x, int *ldx,
+                   FILE *fp, gridinfo_t *grid)
+{
+    SuperMatrix GA;              /* global A */
+    doublecomplex   *b_global, *xtrue_global;  /* replicated on all processes */
+    int_t    *rowind, *colptr;	 /* global */
+    doublecomplex   *nzval;             /* global */
+    doublecomplex   *nzval_loc;         /* local */
+    int_t    *colind, *rowptr;	 /* local */
+    int_t    m, n, nnz;
+    int_t    m_loc, fst_row, nnz_loc;
+    int_t    m_loc_fst; /* Record m_loc of the first p-1 processors,
+			   when mod(m, p) is not zero. */ 
+    int_t    row, i, j, relpos;
+    int      iam;
+    char     trans[1];
+    int_t      *marker; /* per-row nonzero counts, reused as insertion cursors */
+
+    iam = grid->iam;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter zcreate_matrix()");
+#endif
+
+    if ( !iam ) {
+        /* Read the matrix stored on disk in Harwell-Boeing format. */
+        zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+
+	/* Broadcast matrix A to the other PEs. */
+	MPI_Bcast( &m,     1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &n,     1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &nnz,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( nzval,  nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm );
+	MPI_Bcast( rowind, nnz, mpi_int_t,  0, grid->comm );
+	MPI_Bcast( colptr, n+1, mpi_int_t,  0, grid->comm );
+    } else {
+	/* Receive matrix A from PE 0. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid->comm );
+
+	/* Allocate storage for compressed column representation. */
+	zallocateA_dist(n, nnz, &nzval, &rowind, &colptr);
+
+	MPI_Bcast( nzval,   nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm );
+	MPI_Bcast( rowind,  nnz, mpi_int_t,  0, grid->comm );
+	MPI_Bcast( colptr,  n+1, mpi_int_t,  0, grid->comm );
+    }
+
+#if 0
+    nzval[0].r = 0.1; nzval[0].i = 0.0;
+#endif
+
+    /* Compute the number of rows to be distributed to local process */
+    m_loc = m / (grid->nprow * grid->npcol); 
+    m_loc_fst = m_loc;
+    /* When m / procs is not an integer */
+    if ((m_loc * grid->nprow * grid->npcol) != m) {
+        /*m_loc = m_loc+1;
+          m_loc_fst = m_loc;*/
+      if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/
+	  m_loc = m - m_loc * (grid->nprow * grid->npcol - 1);
+    }
+
+    /* Create compressed column matrix for GA. */
+    zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
+				SLU_NC, SLU_Z, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if ( !(b_global = doublecomplexMalloc_dist(m*nrhs)) )
+        ABORT("Malloc fails for b[]");
+    if ( !(xtrue_global = doublecomplexMalloc_dist(n*nrhs)) )
+        ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+
+    zGenXtrue_dist(n, nrhs, xtrue_global, n);
+    zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m);
+
+    /*************************************************
+     * Change GA to a local A with NR_loc format     *
+     *************************************************/
+
+    if ( !(rowptr = (int_t *) intMalloc_dist(m_loc+1)) )
+        ABORT("Malloc fails for rowptr[]");
+    if ( !(marker = (int_t *) intCalloc_dist(m)) ) /* indexed by global row: m entries */
+        ABORT("Calloc fails for marker[]");
+
+    /* Get counts of each row of GA */
+    for (i = 0; i < n; ++i)
+      for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]];
+    /* Set up row pointers */
+    rowptr[0] = 0;
+    fst_row = iam * m_loc_fst;
+    nnz_loc = 0;
+    for (j = 0; j < m_loc; ++j) {
+      row = fst_row + j;
+      rowptr[j+1] = rowptr[j] + marker[row];
+      marker[j] = rowptr[j]; /* safe in place: slot j is never read again as a count */
+    }
+    nnz_loc = rowptr[m_loc];
+
+    if ( !(nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc)) )
+        ABORT("Malloc fails for nzval_loc[]");
+    if ( !(colind = (int_t *) intMalloc_dist(nnz_loc)) )
+        ABORT("Malloc fails for colind[]");
+
+    /* Transfer the matrix into the compressed row storage */
+    for (i = 0; i < n; ++i) {
+      for (j = colptr[i]; j < colptr[i+1]; ++j) {
+	row = rowind[j];
+	if ( (row>=fst_row) && (row<fst_row+m_loc) ) {
+	  row = row - fst_row;
+	  relpos = marker[row];
+	  colind[relpos] = i;
+	  nzval_loc[relpos] = nzval[j];
+	  ++marker[row];
+	}
+      }
+    }
+
+#if ( DEBUGlevel>=2 )
+    if ( !iam ) zPrint_CompCol_Matrix_dist(&GA);
+#endif   
+
+    /* Destroy GA */
+    Destroy_CompCol_Matrix_dist(&GA);
+
+    /* Set up the local A in NR_loc format */
+    zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
+				   nzval_loc, colind, rowptr,
+				   SLU_NR_loc, SLU_Z, SLU_GE);
+    
+    /* Get the local B */
+    if ( !((*rhs) = doublecomplexMalloc_dist(m_loc*nrhs)) )
+        ABORT("Malloc fails for rhs[]");
+    for (j =0; j < nrhs; ++j) {
+	for (i = 0; i < m_loc; ++i) {
+	    row = fst_row + i;
+	    (*rhs)[j*m_loc+i] = b_global[j*m+row]; /* b_global was filled with ldb = m */
+	}
+    }
+    *ldb = m_loc;
+
+    /* Set the true X */    
+    *ldx = m_loc;
+    if ( !((*x) = doublecomplexMalloc_dist(*ldx * nrhs)) )
+        ABORT("Malloc fails for x_loc[]");
+
+    /* Get the local part of xtrue_global (NOTE(review): reuses A's row split; assumes m == n) */
+    for (j = 0; j < nrhs; ++j) {
+      for (i = 0; i < m_loc; ++i)
+	(*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n];
+    }
+
+    SUPERLU_FREE(b_global);
+    SUPERLU_FREE(xtrue_global);
+    SUPERLU_FREE(marker);
+
+#if ( DEBUGlevel>=1 )
+    printf("sizeof(NRformat_loc) %lu\n", (unsigned long) sizeof(NRformat_loc));
+    CHECK_MALLOC(iam, "Exit zcreate_matrix()");
+#endif
+    return 0;
+}
diff --git a/EXAMPLE/zcreate_matrix_perturbed.c b/EXAMPLE/zcreate_matrix_perturbed.c
new file mode 100644
index 0000000..92b7963
--- /dev/null
+++ b/EXAMPLE/zcreate_matrix_perturbed.c
@@ -0,0 +1,229 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Read the matrix from data file
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.1.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * December 31, 2016
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ * 
+ * ZCREATE_MATRIX_PERTURBED reads the matrix from data file in
+ * Harwell-Boeing format, scales down its first and last stored nonzeros,
+ * and distributes it to processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand side RHS.
+ *
+ * Arguments   
+ * =========      
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format. 
+ *
+ * NRHS  (input) int
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) doublecomplex**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) doublecomplex**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID  (input) gridinfo_t*
+ *       The 2D process mesh.
+ * </pre>
+ */
+
+int zcreate_matrix_perturbed(SuperMatrix *A, int nrhs, doublecomplex **rhs,
+                   int *ldb, doublecomplex **x, int *ldx,
+                   FILE *fp, gridinfo_t *grid)
+{
+    SuperMatrix GA;              /* global A */
+    doublecomplex   *b_global, *xtrue_global;  /* replicated on all processes */
+    int_t    *rowind, *colptr;	 /* global */
+    doublecomplex   *nzval;             /* global */
+    doublecomplex   *nzval_loc;         /* local */
+    int_t    *colind, *rowptr;	 /* local */
+    int_t    m, n, nnz;
+    int_t    m_loc, fst_row, nnz_loc;
+    int_t    m_loc_fst; /* Record m_loc of the first p-1 processors,
+			   when mod(m, p) is not zero. */ 
+    int_t    row, i, j, relpos;
+    int      iam;
+    char     trans[1];
+    int_t      *marker; /* per-row nonzero counts, reused as insertion cursors */
+
+    iam = grid->iam;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter zcreate_matrix_perturbed()");
+#endif
+
+    if ( !iam ) {
+        /* Read the matrix stored on disk in Harwell-Boeing format. */
+        zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+
+	/* Broadcast matrix A to the other PEs. */
+	MPI_Bcast( &m,     1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &n,     1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &nnz,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( nzval,  nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm );
+	MPI_Bcast( rowind, nnz, mpi_int_t,  0, grid->comm );
+	MPI_Bcast( colptr, n+1, mpi_int_t,  0, grid->comm );
+    } else {
+	/* Receive matrix A from PE 0. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid->comm );
+
+	/* Allocate storage for compressed column representation. */
+	zallocateA_dist(n, nnz, &nzval, &rowind, &colptr);
+
+	MPI_Bcast( nzval,   nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm );
+	MPI_Bcast( rowind,  nnz, mpi_int_t,  0, grid->comm );
+	MPI_Bcast( colptr,  n+1, mpi_int_t,  0, grid->comm );
+    }
+
+    /* Perturb the first and last stored nonzeros (intended to be the 1st and last
+       diagonal) to lower values. Intention is to change perm_r[].   */
+    if ( nnz > 0 ) { nzval[0].r *= 0.01; nzval[0].i *= 0.01; }
+    if ( nnz > 0 ) { nzval[nnz-1].r *= 0.0001; nzval[nnz-1].i *= 0.0001; }
+
+    /* Compute the number of rows to be distributed to local process */
+    m_loc = m / (grid->nprow * grid->npcol); 
+    m_loc_fst = m_loc;
+    /* When m / procs is not an integer */
+    if ((m_loc * grid->nprow * grid->npcol) != m) {
+        /*m_loc = m_loc+1;
+          m_loc_fst = m_loc;*/
+      if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/
+	  m_loc = m - m_loc * (grid->nprow * grid->npcol - 1);
+    }
+
+    /* Create compressed column matrix for GA. */
+    zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
+				SLU_NC, SLU_Z, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if ( !(b_global = doublecomplexMalloc_dist(m*nrhs)) )
+        ABORT("Malloc fails for b[]");
+    if ( !(xtrue_global = doublecomplexMalloc_dist(n*nrhs)) )
+        ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+
+    zGenXtrue_dist(n, nrhs, xtrue_global, n);
+    zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m);
+
+    /*************************************************
+     * Change GA to a local A with NR_loc format     *
+     *************************************************/
+
+    if ( !(rowptr = (int_t *) intMalloc_dist(m_loc+1)) )
+        ABORT("Malloc fails for rowptr[]");
+    if ( !(marker = (int_t *) intCalloc_dist(m)) ) /* indexed by global row: m entries */
+        ABORT("Calloc fails for marker[]");
+
+    /* Get counts of each row of GA */
+    for (i = 0; i < n; ++i)
+      for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]];
+    /* Set up row pointers */
+    rowptr[0] = 0;
+    fst_row = iam * m_loc_fst;
+    nnz_loc = 0;
+    for (j = 0; j < m_loc; ++j) {
+      row = fst_row + j;
+      rowptr[j+1] = rowptr[j] + marker[row];
+      marker[j] = rowptr[j]; /* safe in place: slot j is never read again as a count */
+    }
+    nnz_loc = rowptr[m_loc];
+
+    if ( !(nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc)) )
+        ABORT("Malloc fails for nzval_loc[]");
+    if ( !(colind = (int_t *) intMalloc_dist(nnz_loc)) )
+        ABORT("Malloc fails for colind[]");
+
+    /* Transfer the matrix into the compressed row storage */
+    for (i = 0; i < n; ++i) {
+      for (j = colptr[i]; j < colptr[i+1]; ++j) {
+	row = rowind[j];
+	if ( (row>=fst_row) && (row<fst_row+m_loc) ) {
+	  row = row - fst_row;
+	  relpos = marker[row];
+	  colind[relpos] = i;
+	  nzval_loc[relpos] = nzval[j];
+	  ++marker[row];
+	}
+      }
+    }
+
+#if ( DEBUGlevel>=2 )
+    if ( !iam ) zPrint_CompCol_Matrix_dist(&GA);
+#endif   
+
+    /* Destroy GA */
+    Destroy_CompCol_Matrix_dist(&GA);
+
+    /* Set up the local A in NR_loc format */
+    zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
+				   nzval_loc, colind, rowptr,
+				   SLU_NR_loc, SLU_Z, SLU_GE);
+    
+    /* Get the local B */
+    if ( !((*rhs) = doublecomplexMalloc_dist(m_loc*nrhs)) )
+        ABORT("Malloc fails for rhs[]");
+    for (j =0; j < nrhs; ++j) {
+	for (i = 0; i < m_loc; ++i) {
+	    row = fst_row + i;
+	    (*rhs)[j*m_loc+i] = b_global[j*m+row]; /* b_global was filled with ldb = m */
+	}
+    }
+    *ldb = m_loc;
+
+    /* Set the true X */    
+    *ldx = m_loc;
+    if ( !((*x) = doublecomplexMalloc_dist(*ldx * nrhs)) )
+        ABORT("Malloc fails for x_loc[]");
+
+    /* Get the local part of xtrue_global (NOTE(review): reuses A's row split; assumes m == n) */
+    for (j = 0; j < nrhs; ++j) {
+      for (i = 0; i < m_loc; ++i)
+	(*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n];
+    }
+
+    SUPERLU_FREE(b_global);
+    SUPERLU_FREE(xtrue_global);
+    SUPERLU_FREE(marker);
+
+#if ( DEBUGlevel>=1 )
+    printf("sizeof(NRformat_loc) %lu\n", (unsigned long) sizeof(NRformat_loc));
+    CHECK_MALLOC(iam, "Exit zcreate_matrix_perturbed()");
+#endif
+    return 0;
+}
diff --git a/EXAMPLE/zlook_ahead_update.c b/EXAMPLE/zlook_ahead_update.c
new file mode 100644
index 0000000..05c3fcd
--- /dev/null
+++ b/EXAMPLE/zlook_ahead_update.c
@@ -0,0 +1,230 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/************************************************************************/
+/*! @file 
+ * \brief Look-ahead update of the Schur complement.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ * </pre>
+ */
+#ifdef ISORT
+while (j < nub && iperm_u[j] <= k0 + num_look_aheads)
+#else
+while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
+#endif
+{   /* per U block: gather U(k,j) into bigU, GEMM with each L row block, scatter, then factor/send if ready */
+    doublecomplex zero = {0.0, 0.0}; /* fill value for the leading-zero padding below */
+
+    arrive_at_ublock
+        (j, &iukp, &rukp, &jb, &ljb, &nsupc,
+         iukp0, rukp0, usub, perm_u, xsup, grid);
+    j++;
+    jj0++;
+    jj = iukp;
+    lptr = lptr0;
+    luptr = luptr0;
+
+    while (usub[jj] == klst) ++jj; /* skip leading empty segments (klst - usub[jj] == 0) */
+
+    ldu = klst - usub[jj++]; /* length of the first nonempty segment */
+    ncols = 1;
+    full = 1;
+    for (; jj < iukp + nsupc; ++jj) {
+        segsize = klst - usub[jj];
+        if (segsize) {
+            ++ncols;
+            if (segsize != ldu)
+                full = 0;
+            if (segsize > ldu)
+                ldu = segsize;
+        }
+    }
+#if ( DEBUGlevel>=3 )
+    ++num_update;
+#endif
+    if (0) { /* disabled branch: the copy path below is always taken */
+        tempu = &uval[rukp];
+    }
+    else  {                    /* Copy block U(k,j) into tempU2d. */
+#if ( DEBUGlevel>=3 )
+        printf ("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+                iam, full, k, jb, ldu, ncols, nsupc);
+        ++num_copy;
+#endif
+        tempu = bigU;
+        for (jj = iukp; jj < iukp + nsupc; ++jj) {
+            segsize = klst - usub[jj];
+            if (segsize) {
+                lead_zero = ldu - segsize; /* pad each column to ldu entries */
+                for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+                tempu += lead_zero;
+                for (i = 0; i < segsize; ++i) {
+                    tempu[i] = uval[rukp + i];
+                }
+                rukp += segsize;
+                tempu += segsize;
+            }
+        }
+        tempu = bigU;
+        rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+    }                           /* if full ... */
+
+    nbrow = lsub[1];
+    if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows */
+// double ttx =SuperLU_timer_();
+    
+#pragma omp parallel for \
+                    private(lb,lptr,luptr,ib,tempv ) \
+                    default(shared) schedule(dynamic)
+    for (lb = 0; lb < nlb; lb++) {
+        
+        int_t temp_nbrow;
+        int_t lptr = lptr0;
+        int_t luptr = luptr0;
+        for (int i = 0; i < lb; ++i) { /* advance lptr/luptr past the first lb row blocks */
+            ib = lsub[lptr];    /* Row block L(i,k). */
+            temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;
+            luptr += temp_nbrow;
+            
+        }
+        
+        int_t thread_id = omp_get_thread_num ();
+        doublecomplex * tempv = bigV + ldt*ldt*thread_id; /* this thread's ldt*ldt slice of bigV */
+
+        int *indirect_thread = indirect + ldt * thread_id;
+        int *indirect2_thread   = indirect2 + ldt*thread_id;        
+        ib = lsub[lptr];        /* Row block L(i,k). */
+        temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
+        assert (temp_nbrow <= nbrow);
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+
+        /* calling gemm */
+#if defined (USE_VENDOR_BLAS)
+        zgemm("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                   &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
+                   tempu, &ldu, &beta, tempv, &temp_nbrow, 1, 1);
+#else
+        zgemm("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                   &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
+                   tempu, &ldu, &beta, tempv, &temp_nbrow );
+#endif
+
+        /* Now scattering the output*/
+        if (ib < jb) {    /* A(i,j) is in U. */
+            zscatter_u (ib, jb,
+                       nsupc, iukp, xsup,
+                       klst, temp_nbrow,
+                       lptr, temp_nbrow, lsub,
+                       usub, tempv, Ufstnz_br_ptr, Unzval_br_ptr, grid);
+        } else {          /* A(i,j) is in L. */
+            zscatter_l (ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+                       temp_nbrow, usub, lsub, tempv,
+                       indirect_thread, indirect2_thread, 
+                       Lrowind_bc_ptr, Lnzval_bc_ptr, grid);
+        }
+
+    } /* parallel for lb = 0, ... */
+
+    rukp += usub[iukp - 1];     /* Move to block U(k,j+1) */
+    iukp += nsupc;
+
+    /* ==================================== *
+     * == factorize and send if possible == *
+     * ==================================== */
+    kk = jb;
+    kcol = PCOL (kk, grid);
+#ifdef ISORT
+    kk0 = iperm_u[j - 1];
+#else
+    kk0 = perm_u[2 * (j - 1)];
+#endif
+    look_id = kk0 % (1 + num_look_aheads); /* the same value is recomputed inside the branch below */
+
+    if (look_ahead[kk] == k0 && kcol == mycol) {
+    /* current column is the last dependency */
+        look_id = kk0 % (1 + num_look_aheads);
+
+        /* Factor diagonal and subdiagonal blocks and test for exact
+           singularity.  */
+        factored[kk] = 0;
+        /* double ttt1 = SuperLU_timer_(); */
+#if ( VAMPIR>=1 )
+        VT_begin (5);
+#endif
+
+        PZGSTRF2(options, nsupers, kk0, kk, thresh, Glu_persist, grid, Llu,
+                  U_diag_blk_send_req, tag_ub, stat, info);
+
+#if ( VAMPIR>=1 )
+        VT_end (5);
+#endif
+        /* stat->time7 += SuperLU_timer_() - ttt1; */
+
+        /* Process column *kcol+1* multicasts numeric values of L(:,k+1)
+           to process rows. */
+        send_req = send_reqs[look_id];
+        msgcnt = msgcnts[look_id];
+
+        lk = LBj (kk, grid);    /* Local block number. */
+        lsub1 = Lrowind_bc_ptr[lk];
+        lusup1 = Lnzval_bc_ptr[lk];
+        if (lsub1) {
+            msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR;
+            msgcnt[1] = lsub1[1] * SuperSize (kk);
+        } else {
+            msgcnt[0] = 0;
+            msgcnt[1] = 0;
+        }
+
+        scp = &grid->rscp;      /* The scope of process row. */
+        for (pj = 0; pj < Pc; ++pj) {
+            if (ToSendR[lk][pj] != EMPTY) {
+#if ( PROFlevel>=1 )
+                TIC (t1);
+#endif
+#if ( VAMPIR>=1 )
+                VT_begin (1);
+#endif
+                MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
+                           SLU_MPI_TAG (0, kk0) /* (4*kk0)%tag_ub */ ,
+                           scp->comm, &send_req[pj]);
+                MPI_Isend (lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
+                           SLU_MPI_TAG (1, kk0) /* (4*kk0+1)%tag_ub */ ,
+                           scp->comm, &send_req[pj + Pc]);
+#if ( VAMPIR>=1 )
+                VT_end (1);
+#endif
+#if ( PROFlevel>=1 )
+                TOC (t2, t1);
+                stat->utime[COMM] += t2;
+                msg_cnt += 2;
+                msg_vol += msgcnt[0] * iword + msgcnt[1] * dword;
+#endif
+#if ( DEBUGlevel>=2 )
+                printf ("[%d] -2- Send L(:,%4d): #lsub %4d, #lusup %4d to Pj %2d\n",
+                        iam, kk, msgcnt[0], msgcnt[1], pj);
+		if (kk==3) {
+		    PrintInt10("..send lsub", msgcnt[0], lsub1);
+ 		    PrintDoublecomplex("..send lusup", msgcnt[1], lusup1);
+		}
+#endif
+            }   /*if ( ToSendR[lk][pj] != EMPTY ) */
+        }       /* for pj ... */
+    }           /*if( look_ahead[kk] == k0 && kcol == mycol ) */
+}               /* while j < nub and perm_u[j] <k0+NUM_LOOK_AHEAD */
+
diff --git a/EXAMPLE/zreadhb.c b/EXAMPLE/zreadhb.c
new file mode 100644
index 0000000..3930623
--- /dev/null
+++ b/EXAMPLE/zreadhb.c
@@ -0,0 +1,292 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Read a DOUBLE COMPLEX PRECISION matrix stored in Harwell-Boeing format
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+#include "dcomplex.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "superlu_zdefs.h"
+
+/*
+ * Prototypes
+ */
+static void ReadVector(FILE *, int_t, int_t *, int_t, int_t);
+static void zReadValues(FILE *, int_t, doublecomplex *, int_t, int_t);
+static int DumpLine(FILE *);
+static int ParseIntFormat(char *, int_t *, int_t *);
+static int ParseFloatFormat(char *, int_t *, int_t *);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ * 
+ * Read a DOUBLE COMPLEX PRECISION matrix stored in Harwell-Boeing format 
+ * as described below.
+ * 
+ * Line 1 (A72,A8) 
+ *  	Col. 1 - 72   Title (TITLE) 
+ *	Col. 73 - 80  Key (KEY) 
+ * 
+ * Line 2 (5I14) 
+ * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
+ * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
+ * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
+ * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
+ *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
+ *                    (including starting guesses and solution vectors 
+ *		       if present) 
+ *           	      (zero indicates no right-hand side data is present) 
+ *
+ * Line 3 (A3, 11X, 4I14) 
+ *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
+ * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
+ * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
+ *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
+ *	              (equal to number of entries for assembled matrices) 
+ * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
+ *	              (zero in the case of assembled matrices) 
+ * Line 4 (2A16, 2A20) 
+ * 	Col. 1 - 16   Format for pointers (PTRFMT) 
+ *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
+ *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
+ * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
+ *
+ * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
+ *    	Col. 1 	      Right-hand side type: 
+ *	         	  F for full storage or M for same format as matrix 
+ *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
+ *    	Col. 3        X if an exact solution vector(s) is supplied. 
+ *	Col. 15 - 28  Number of right-hand sides (NRHS) 
+ *	Col. 29 - 42  Number of row indices (NRHSIX) 
+ *          	      (ignored in case of unassembled matrices) 
+ *
+ * The three character type field on line 3 describes the matrix type. 
+ * The following table lists the permitted values for each of the three 
+ * characters. As an example of the type field, RSA denotes that the matrix 
+ * is real, symmetric, and assembled. 
+ *
+ * First Character: 
+ *	R Real matrix 
+ *	C Complex matrix 
+ *	P Pattern only (no numerical values supplied) 
+ *
+ * Second Character: 
+ *	S Symmetric 
+ *	U Unsymmetric 
+ *	H Hermitian 
+ *	Z Skew symmetric 
+ *	R Rectangular 
+ *
+ * Third Character: 
+ *	A Assembled 
+ *	E Elemental matrices (unassembled) 
+ * </pre>
+ */
+
+void
+zreadhb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz,
+	     doublecomplex **nzval, int_t **rowind, int_t **colptr)
+{
+
+    register int_t i, numer_lines = 0, rhscrd = 0;
+    int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
+    char buf[100], type[4];
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(0, "Enter zreadhb_dist()");
+#endif
+
+    /* Line 1 */
+    fgets(buf, 100, fp);
+
+    /* Line 2 */
+    for (i=0; i<5; i++) {
+	fscanf(fp, "%14c", buf); buf[14] = 0;
+	tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/
+	if (i == 3) numer_lines = tmp;
+	if (i == 4 && tmp) rhscrd = tmp;
+    }
+    DumpLine(fp);
+
+    /* Line 3 */
+    fscanf(fp, "%3c", type);
+    fscanf(fp, "%11c", buf); /* pad */
+    type[3] = 0;
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) printf("Matrix type %s\n", type);
+#endif
+    /* "%Nc" stores no terminator: NUL-terminate buf explicitly before each atoi(). */
+    fscanf(fp, "%14c", buf); buf[14] = 0; *nrow = atoi(buf); 
+    fscanf(fp, "%14c", buf); buf[14] = 0; *ncol = atoi(buf); 
+    fscanf(fp, "%14c", buf); buf[14] = 0; *nonz = atoi(buf); 
+    fscanf(fp, "%14c", buf); buf[14] = 0; tmp = atoi(buf);   
+    
+    if (tmp != 0)
+	if ( !iam ) printf("This is not an assembled matrix!\n");
+    if (*nrow != *ncol)
+	if ( !iam ) printf("Matrix is not square.\n");
+    DumpLine(fp);
+
+    /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
+    zallocateA_dist(*ncol, *nonz, nzval, rowind, colptr);
+
+    /* Line 4: format statement (each field terminated so the parsers stay inside it) */
+    fscanf(fp, "%16c", buf); buf[16] = 0;
+    ParseIntFormat(buf, &colnum, &colsize);
+    fscanf(fp, "%16c", buf); buf[16] = 0;
+    ParseIntFormat(buf, &rownum, &rowsize);
+    fscanf(fp, "%20c", buf); buf[20] = 0;
+    ParseFloatFormat(buf, &valnum, &valsize);
+    fscanf(fp, "%20c", buf);
+    DumpLine(fp);
+
+    /* Line 5: right-hand side */    
+    if ( rhscrd ) DumpLine(fp); /* skip RHSFMT */
+
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) {
+	printf("%d rows, %d nonzeros\n", *nrow, *nonz);
+	printf("colnum %d, colsize %d\n", colnum, colsize);
+	printf("rownum %d, rowsize %d\n", rownum, rowsize);
+	printf("valnum %d, valsize %d\n", valnum, valsize);
+    }
+#endif
+    
+    ReadVector(fp, *ncol+1, *colptr, colnum, colsize);
+#if ( DEBUGlevel>=1 )
+    if ( !iam )	printf("read colptr[%d] = %d\n", *ncol, (*colptr)[*ncol]);
+#endif
+    ReadVector(fp, *nonz, *rowind, rownum, rowsize);
+#if ( DEBUGlevel>=1 )
+    if ( !iam )	printf("read rowind[%d] = %d\n", *nonz-1, (*rowind)[*nonz-1]);
+#endif
+    if ( numer_lines ) { /* values are read only when line 2 reported value lines */
+        zReadValues(fp, *nonz, *nzval, valnum, valsize);
+    }
+
+    fclose(fp); /* NOTE: the reader closes the caller's stream */
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(0, "Exit zreadhb_dist()");
+#endif
+}
+
+/* Consume the rest of the current line (through '\n'); also stops at EOF. Returns 0. */
+static int DumpLine(FILE *fp)
+{
+    register int c;
+    while ((c = fgetc(fp)) != '\n' && c != EOF) ; /* EOF test: no endless spin on a truncated file */
+    return 0;
+}
+
+/* Parse a "(10I8)"-style format: *num = 10, *size = 8; returns -1 if '(' or 'I' is missing. */
+static int ParseIntFormat(char *buf, int_t *num, int_t *size)
+{
+    char *tmp = buf;
+    *num = *size = 0;
+    while (*tmp && *tmp != '(') ++tmp;   /* stop at NUL instead of overrunning */
+    if ( *tmp ) *num = atoi(++tmp);
+    while (*tmp && *tmp != 'I' && *tmp != 'i') ++tmp;
+    if ( !*tmp ) return -1;
+    *size = atoi(tmp + 1);
+    return 0;
+}
+
+/* Parse a Fortran real format such as "(3E26.18)" or "(1P6F13.6)":
+ * *num = repeat count, *size = field width (the digits before '.' or ')').
+ * Overwrites the '.' or ')' that follows the width with NUL.  Returns -1,
+ * with both outputs 0, if the string ends before '(' or the E/D/F letter. */
+static int ParseFloatFormat(char *buf, int_t *num, int_t *size)
+{
+    char *tmp = buf, *period;
+
+    *num = *size = 0;
+    while (*tmp && *tmp != '(') ++tmp;   /* never scan past the terminator */
+    if ( !*tmp ) return -1;
+    *num = atoi(++tmp);
+    while (*tmp && *tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
+	   && *tmp != 'F' && *tmp != 'f') {
+       /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
+           num picked up refers to P, which should be skipped. */
+        ++tmp;
+        if (tmp[-1]=='p' || tmp[-1]=='P') *num = atoi(tmp);
+    }
+    if ( !*tmp ) { *num = 0; return -1; }
+    period = ++tmp;
+    while (*period && *period != '.' && *period != ')') ++period;
+    *period = '\0';
+    *size = atoi(tmp);
+    return 0;
+}
+
+/* Read n integers laid out as perline fields of persize chars per line; store value-1. */
+static void
+ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize)
+{
+    register int_t i = 0, j;
+    char tmp, buf[100];
+    if ( n > 0 && (perline <= 0 || persize <= 0 || perline*persize >= 100) )
+	ABORT("Unsupported integer format"); /* would never advance i, or overrun buf[] */
+    while (i < n) {
+	if ( !fgets(buf, 100, fp) ) ABORT("Unexpected end of file"); /* a line at a time */
+	for (j=0; j<perline && i<n; j++) {
+	    tmp = buf[(j+1)*persize];     /* save the char at that place */
+	    buf[(j+1)*persize] = 0;       /* null terminate */
+	    where[i++] = atoi(&buf[j*persize]) - 1;
+	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
+	}
+    }
+}
+
+/* Read n complex numbers stored as consecutive (real, imaginary) fields,
+ * perline fields of persize chars per line; aborts on bad format or EOF. */
+static void
+zReadValues(FILE *fp, int_t n, doublecomplex *destination, 
+             int_t perline, int_t persize)
+{
+    register int_t i = 0, j, k, s, pair = 0;
+    register double realpart = 0.0;
+    char tmp, buf[100];
+    /* perline <= 0 would never advance i; a wider line would overrun buf[]. */
+    if ( n > 0 && (perline <= 0 || persize <= 0 || perline*persize >= 100) )
+	ABORT("Unsupported value format");
+    while (i < n) {
+	if ( !fgets(buf, 100, fp) ) ABORT("Unexpected end of file");
+	for (j=0; j<perline && i<n; j++) {
+	    tmp = buf[(j+1)*persize];     /* save the char at that place */
+	    buf[(j+1)*persize] = 0;       /* null terminate */
+	    s = j*persize;
+	    for (k = 0; k < persize; ++k) /* No D_ format in C */
+		if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E';
+	    if ( pair == 0 ) {
+	  	/* The value is real part */
+		realpart = atof(&buf[s]);
+		pair = 1;
+	    } else {
+		/* The value is imaginary part */
+	        destination[i].r = realpart;
+		destination[i++].i = atof(&buf[s]);
+		pair = 0;
+	    }
+	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
+	}
+    }
+}
+
diff --git a/EXAMPLE/zreadtriple.c b/EXAMPLE/zreadtriple.c
new file mode 100644
index 0000000..73671af
--- /dev/null
+++ b/EXAMPLE/zreadtriple.c
@@ -0,0 +1,177 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief 
+ *
+ */
+#include <stdio.h>
+#include "superlu_zdefs.h"
+
+#undef EXPAND_SYM
+
+/*! \brief Read a complex sparse matrix given in triplet form and convert
+ *         it to compressed column storage.
+ *
+ * <pre>
+ * Input
+ * =====
+ *   fp: open text stream.  The first line holds "n nonz"; each of the
+ *       following nonz lines holds one entry "row col real imag".
+ *       Indices are taken as zero-based when the first entry has a row or
+ *       column index of 0, and as one-based otherwise.
+ *
+ * Output parameters
+ * =================
+ *   m, n, nonz: matrix dimensions (m is set equal to n) and the number of
+ *      entries actually stored.
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *      column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ *
+ * The routine aborts when the header or an entry cannot be parsed and
+ * exits when an index lies outside the matrix.
+ * </pre>
+ */
+
+void
+zreadtriple(FILE *fp, int_t *m, int_t *n, int_t *nonz,
+	    doublecomplex **nzval, int_t **rowind, int_t **colptr)
+{
+    int_t    j, k, jsize, nnz, nz, new_nonz;
+#ifdef CHK_INPUT
+    int_t    i;
+#endif
+    doublecomplex *a, *val;
+    int_t    *asub, *xa, *row, *col;
+    int_t    zero_base = 0;
+    int      nread;
+    
+    /* 	File format:
+     *    First line:  #rows    #non-zero
+     *    Triplet in the rest of lines:
+     *                 row    col    real    imag
+     */
+
+    /*fscanf(fp, "%d%d%d", m, n, nonz);*/
+#ifdef _LONGINT
+    nread = fscanf(fp, "%ld%ld", n, nonz);
+#else
+    nread = fscanf(fp, "%d%d", n, nonz);
+#endif
+    if ( nread != 2 )
+        ABORT("zreadtriple: cannot read the header line");
+
+#ifdef EXPAND_SYM
+    new_nonz = 2 * *nonz - *n;
+#else
+    new_nonz = *nonz;
+#endif
+    *m = *n;
+    /* Cast explicitly: the width of int_t depends on the build. */
+    printf("m %lld, n %lld, nonz %lld\n",
+	   (long long) *m, (long long) *n, (long long) *nonz);
+    zallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
+    a    = *nzval;
+    asub = *rowind;
+    xa   = *colptr;
+
+    if ( !(val = (doublecomplex *) SUPERLU_MALLOC(new_nonz * sizeof(doublecomplex))) )
+        ABORT("Malloc fails for val[]");
+    if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for row[]");
+    if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for col[]");
+
+    /* xa[] first serves as a per-column entry counter. */
+    for (j = 0; j < *n; ++j) xa[j] = 0;
+
+    /* Read into the triplet array from a file */
+    for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
+#ifdef _LONGINT
+	nread = fscanf(fp, "%ld%ld%lf%lf\n",
+		       &row[nz], &col[nz], &val[nz].r, &val[nz].i);
+#else
+	nread = fscanf(fp, "%d%d%lf%lf\n",
+		       &row[nz], &col[nz], &val[nz].r, &val[nz].i);
+#endif
+	if ( nread != 4 )
+	    ABORT("zreadtriple: cannot read a (row, col, real, imag) entry");
+
+	if ( nnz == 0 ) { /* first nonzero decides the index base */
+	    if ( row[0] == 0 || col[0] == 0 ) {
+		zero_base = 1;
+		printf("triplet file: row/col indices are zero-based.\n");
+	    } else {
+		printf("triplet file: row/col indices are one-based.\n");
+	    }
+	}
+
+	if ( !zero_base ) {
+	    /* Change to 0-based indexing. */
+	    --row[nz];
+	    --col[nz];
+	}
+
+	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
+	    /*|| val[nz] == 0.*/) {
+	    /* Print both parts of the complex value; a struct cannot be
+	       passed to a single %e conversion. */
+	    fprintf(stderr, "nz %lld, (%lld, %lld) = (%e, %e) out of bound\n",
+		    (long long) nz, (long long) row[nz], (long long) col[nz],
+		    val[nz].r, val[nz].i);
+	    exit(-1);
+	} else {
+	    ++xa[col[nz]];
+#ifdef EXPAND_SYM
+	    if ( row[nz] != col[nz] ) { /* Excluding diagonal */
+	      ++nz;
+	      row[nz] = col[nz-1];
+	      col[nz] = row[nz-1];
+	      val[nz] = val[nz-1];
+	      ++xa[col[nz]];
+	    }
+#endif	
+	    ++nz;
+	}
+    }
+
+    *nonz = nz;
+#ifdef EXPAND_SYM
+    printf("new_nonz after symmetric expansion:\t%lld\n", (long long) *nonz);
+#endif
+    
+
+    /* Initialize the array of column pointers: turn the per-column counts
+       into the starting offset of each column. */
+    k = 0;
+    jsize = xa[0];
+    xa[0] = 0;
+    for (j = 1; j < *n; ++j) {
+	k += jsize;
+	jsize = xa[j];
+	xa[j] = k;
+    }
+    
+    /* Copy the triplets into the column oriented storage; xa[j] advances
+       past every entry placed in column j. */
+    for (nz = 0; nz < *nonz; ++nz) {
+	j = col[nz];
+	k = xa[j];
+	asub[k] = row[nz];
+	a[k] = val[nz];
+	++xa[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column: after the
+       copy xa[j] holds the end of column j, i.e. the start of column j+1. */
+    for (j = *n; j > 0; --j)
+	xa[j] = xa[j-1];
+    xa[0] = 0;
+
+    SUPERLU_FREE(val);
+    SUPERLU_FREE(row);
+    SUPERLU_FREE(col);
+
+#ifdef CHK_INPUT
+    for (i = 0; i < *n; i++) {
+	printf("Col %lld, xa %lld\n", (long long) i, (long long) xa[i]);
+	for (k = xa[i]; k < xa[i+1]; k++)
+	    printf("%lld\t%16.10f\t%16.10f\n",
+		   (long long) asub[k], a[k].r, a[k].i);
+    }
+#endif
+
+}
+
+
+/*! \brief Read m complex right-hand-side entries from the file "b.dat"
+ *         in the current directory.
+ *
+ * Each entry is stored as "real imag".  The routine exits with an error
+ * message when the file cannot be opened or holds fewer than m entries.
+ *
+ * m (input)  number of entries to read
+ * b (output) array of at least m elements receiving the values
+ */
+void zreadrhs(int m, doublecomplex *b)
+{
+    FILE *fp;
+    int i;
+
+    if ( !(fp = fopen("b.dat", "r")) ) {
+        fprintf(stderr, "zreadrhs: file does not exist\n");
+	exit(-1);
+    }
+    for (i = 0; i < m; ++i) {
+	/* Both parts must convert, otherwise b[i] would stay uninitialized. */
+	if ( fscanf(fp, "%lf%lf\n", &(b[i].r), &(b[i].i)) != 2 ) {
+	    fprintf(stderr, "zreadrhs: cannot read entry %d of b.dat\n", i);
+	    fclose(fp);
+	    exit(-1);
+	}
+    }
+
+    fclose(fp);
+}
+
+
diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile
new file mode 100644
index 0000000..8d8952f
--- /dev/null
+++ b/FORTRAN/Makefile
@@ -0,0 +1,48 @@
+#######################################################################
+#
+#  This makefile creates the Fortran example programs for the
+#  linear equation routines in SuperLU_DIST.
+#
+#  Creation date:   July 29, 2003   version 2.0
+#  Modified:        Oct. 22, 2012   version 3.2
+#
+#######################################################################
+# Clear the built-in suffix list, then register every suffix that has a
+# suffix rule below (.f must be listed or the .f.o rule is never used).
+.SUFFIXES: 
+.SUFFIXES: .f90 .f .c .o
+# 'all' and 'clean' name actions, not files.
+.PHONY: all clean
+include ../make.inc
+INCLUDEDIR = -I../SRC
+
+#F90FLAGS	= $(FFLAGS) -qfree -qsuffix=f=f90  -qflag=w:w
+
+# Fortran modules shared by all examples, and the C wrapper objects for
+# the real (D) and complex (Z) versions.
+F_MOD	= superlupara.o superlu_mod.o
+C_DWRAP	= dcreate_dist_matrix.o superlu_c2f_dwrap.o
+C_ZWRAP	= zcreate_dist_matrix.o superlu_c2f_zwrap.o
+
+# Object lists of the three example programs.
+F_DEXM	= $(F_MOD) dhbcode1.o f_pddrive.o
+F_ZEXM	= $(F_MOD) zhbcode1.o f_pzdrive.o
+F_5x5 	= $(F_MOD) f_5x5.o sp_ienv.o 
+
+all: f_pddrive f_pzdrive f_5x5
+
+f_pddrive: $(F_DEXM) $(C_DWRAP) $(DSUPERLULIB)
+	$(FORTRAN) $(LOADOPTS) $(F_DEXM) $(C_DWRAP) $(LIBS) -o $@
+
+f_5x5: $(F_5x5) $(C_DWRAP) $(DSUPERLULIB)
+	$(FORTRAN) $(LOADOPTS) $(F_5x5) $(C_DWRAP) $(LIBS) -o $@
+
+f_pzdrive: $(F_ZEXM) $(C_ZWRAP) $(DSUPERLULIB)
+	$(FORTRAN) $(LOADOPTS) $(F_ZEXM) $(C_ZWRAP) $(LIBS) -o $@
+
+.c.o:
+	$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c $< $(VERBOSE)
+
+.f90.o:
+	$(FORTRAN) $(F90FLAGS) -c $< $(VERBOSE)
+
+.f.o:
+	$(FORTRAN) $(FFLAGS) -c $< $(VERBOSE)
+
+clean:	
+	rm -f *.o *.mod f_*drive f_5x5
+
+
diff --git a/FORTRAN/README b/FORTRAN/README
new file mode 100644
index 0000000..18f0a7d
--- /dev/null
+++ b/FORTRAN/README
@@ -0,0 +1,28 @@
+		Fortran 90 Interface
+
+This directory contains Fortran-90 wrapper routines for SuperLU_DIST.
+The directory contains the following files:
+    superlu_mod.f90    -  Fortran 90 module that defines the wrapper functions
+                             to access SuperLU_DIST's data structures.
+    superlupara.f90    -  It contains parameters that correspond to
+                             SuperLU_DIST's enumerated constants.
+    superlu_c2f_dwrap.c, superlu_c2f_zwrap.c
+                       -  The C wrapper functions (real and complex
+                             versions), callable from Fortran.
+    dhbcode1.f90, zhbcode1.f90
+                       -  Fortran routines to read a real or complex
+                             Harwell-Boeing matrix.
+
+To compile the code, type 'make'
+
+There are two examples in the directory.
+
+1. f_5x5.f90:
+   A small 5x5 example that appears in the SuperLU Users' Guide, Section 2.2.
+   To run the code on a Cray XT or XE, type 'aprun -n 2 f_5x5'
+   (The example is set up to use 2 processors.)
+
+2. f_pddrive.f90 / f_pzdrive.f90:
+   An example Fortran driver routine that reads a matrix from a file
+   'g20.rua' in Harwell-Boeing format.
+   To run the code on a Cray XT or XE, type 'aprun -n 4 f_pddrive'
+   (The example is set up to use 4 processors.)
+
+   The complex version:
+   % aprun -n 4 f_pzdrive
diff --git a/FORTRAN/c_fortran_pdgssvx_ABglobal.c b/FORTRAN/c_fortran_pdgssvx_ABglobal.c
new file mode 100644
index 0000000..9e3f439
--- /dev/null
+++ b/FORTRAN/c_fortran_pdgssvx_ABglobal.c
@@ -0,0 +1,215 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * July 10, 2003
+ */
+
+#include "superlu_ddefs.h"
+
+#define HANDLE_SIZE  8
+
+typedef struct {
+    ScalePermstruct_t *ScalePermstruct;
+    LUstruct_t *LUstruct;
+} factors_dist_t;
+
+/* Store the pointer `ptr` in the integer handle array, byte by byte.
+ * A pointer can be wider than one int, so a plain (int) cast would drop
+ * its upper bits; HANDLE_SIZE ints give enough room for the whole value. */
+static void
+factors_handle_store(int handle[HANDLE_SIZE], void *ptr)
+{
+    unsigned char *dst = (unsigned char *) handle;
+    const unsigned char *src = (const unsigned char *) &ptr;
+    unsigned int ib;
+
+    for (ib = 0; ib < sizeof(ptr); ++ib) dst[ib] = src[ib];
+}
+
+/* Recover a pointer previously saved with factors_handle_store(). */
+static void *
+factors_handle_load(const int handle[HANDLE_SIZE])
+{
+    void *ptr;
+    unsigned char *dst = (unsigned char *) &ptr;
+    const unsigned char *src = (const unsigned char *) handle;
+    unsigned int ib;
+
+    for (ib = 0; ib < sizeof(ptr); ++ib) dst[ib] = src[ib];
+    return ptr;
+}
+
+int
+c_fortran_pdgssvx_ABglobal_(int *iopt, int_t *n, int_t *nnz, int *nrhs,
+			    double *values, int_t *rowind, int_t *colptr,
+			    double *b, int *ldb, int grid_handle[HANDLE_SIZE],
+			    double *berr, int factors[HANDLE_SIZE], int *info)
+
+{
+/* 
+ * Purpose
+ * =======
+ *
+ * This is a Fortran wrapper to use pdgssvx_ABglobal().
+ *
+ * Arguments
+ * =========
+ *
+ * iopt (input) int
+ *      Specifies the operation to be performed:
+ *      = 1, performs LU decomposition for the first time
+ *      = 2, performs a subsequent LU decomposition for a new matrix
+ *           with the same sparsity pattern
+ *      = 3, performs triangular solve
+ *      = 4, frees all the storage in the end
+ *      Any other value prints an error message and exits.
+ *
+ * n    (input) int, order of the matrix A
+ *
+ * nnz  (input) int, number of nonzeros in matrix A
+ *
+ * nrhs (input) int, number of right-hand sides in the system AX = B
+ *
+ * values/rowind/colptr (input) column compressed data structure for A
+ *
+ * b    (input/output) double
+ *      On input, the right-hand side matrix of dimension (ldb, nrhs)
+ *      On output, the solution matrix
+ * 
+ * ldb  (input) int, leading dimension of the matrix B
+ *
+ * grid_handle (input) int array of size 8, holds a pointer to the process
+ *      grid structure, which is created and freed separately.
+ *
+ * berr  (output) double, the backward error of each right-hand side
+ *
+ * factors (input/output) int array of size 8
+ *      If iopt == 1, it is an output and contains the pointer pointing to
+ *                    the structure of the factored matrices.
+ *      Otherwise, it is an input.
+ *      The array is opaque to the caller: the pointer occupies as many
+ *      leading bytes of the array as a pointer needs.
+ *
+ * info (output) int
+ *
+ * Return value
+ * ============
+ * Always 0; errors of the solver are reported through info.
+ *
+ */
+    superlu_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    ScalePermstruct_t *ScalePermstruct;
+    LUstruct_t *LUstruct;
+    int_t    nprow, npcol;
+    int      iam;
+    int      report;
+    gridinfo_t *grid;
+    factors_dist_t *LUfactors;
+
+    /*
+     * Set option for printing statistics.
+     * report = 0: no reporting
+     * report = 1: reporting
+     */    	
+    report = 0;
+
+    /* Locate the process grid.
+     * NOTE(review): the grid pointer is decoded from a single int, which
+     * loses bits wherever a pointer is wider than int.  The encoding is
+     * chosen by the routine that creates the grid handle, so both sides
+     * have to be changed together -- TODO confirm and fix there. */
+    grid = (gridinfo_t *) grid_handle[0];
+    iam = (*grid).iam;
+    nprow = (int_t) grid->nprow;
+    npcol = (int_t) grid->npcol;
+
+    if ( *iopt == 1 ) { /* LU decomposition */
+
+        if ( !iam ) printf(".. Process grid: %d X %d\n",
+			   (int) nprow, (int) npcol);
+
+	/* Initialize the statistics variables. */
+	PStatInit(&stat);
+
+	dCreate_CompCol_Matrix_dist(&A, *n, *n, *nnz, values, rowind, colptr,
+			            SLU_NC, SLU_D, SLU_GE);
+
+	/* Set options. */
+	set_default_options(&options);
+
+	/* Initialize ScalePermstruct and LUstruct. */
+        ScalePermstruct =
+            (ScalePermstruct_t *) SUPERLU_MALLOC(sizeof(ScalePermstruct_t));
+        ScalePermstructInit(*n, *n, ScalePermstruct);
+        LUstruct = (LUstruct_t *) SUPERLU_MALLOC(sizeof(LUstruct_t));
+        LUstructInit(*n, *n, LUstruct);
+
+	/* Call global routine with nrhs=0 to perform the factorization. */
+	pdgssvx_ABglobal(&options, &A, ScalePermstruct, NULL, *ldb, 0, 
+	                 grid, LUstruct, berr, &stat, info);
+
+	if ( *info == 0 ) {
+          if ( report == 1 ) PStatPrint(&options, &stat, grid);
+	} else {
+	    printf("pdgssvx_ABglobal() error returns INFO= %d\n", *info);
+	}
+	
+	/* Save the LU factors in the factors handle */
+	LUfactors = (factors_dist_t*) SUPERLU_MALLOC(sizeof(factors_dist_t));
+	LUfactors->ScalePermstruct = ScalePermstruct;
+	LUfactors->LUstruct = LUstruct;
+	factors_handle_store(factors, LUfactors);
+
+	/* Free un-wanted storage */
+	Destroy_SuperMatrix_Store_dist(&A);
+        PStatFree(&stat);
+
+    } else if ( *iopt == 2 ) {
+        /* Factor a modified matrix with the same sparsity pattern using
+	   existing permutations and L U storage */
+
+	/* Extract the LU factors in the factors handle */
+	LUfactors = (factors_dist_t*) factors_handle_load(factors);
+	ScalePermstruct = LUfactors->ScalePermstruct;
+	LUstruct = LUfactors->LUstruct;
+
+	PStatInit(&stat);
+
+	/* Reset SuperMatrix pointers. */
+	dCreate_CompCol_Matrix_dist(&A, *n, *n, *nnz, values, rowind, colptr,
+			            SLU_NC, SLU_D, SLU_GE);
+
+	/* Set options. */
+	set_default_options(&options);
+        options.Fact = SamePattern_SameRowPerm;
+
+	/* Call the routine with nrhs=0 to perform the factorization. */
+	pdgssvx_ABglobal(&options, &A, ScalePermstruct, NULL, *ldb, 0, 
+	                 grid, LUstruct, berr, &stat, info);
+
+	if ( *info == 0 ) {
+          if ( report == 1 ) PStatPrint(&options, &stat, grid);
+	} else {
+	    printf("pdgssvx_ABglobal() error returns INFO= %d\n", *info);
+	}
+	
+	/* Free un-wanted storage */
+	Destroy_SuperMatrix_Store_dist(&A);
+        PStatFree(&stat);
+
+    } else if ( *iopt == 3 ) { /* Triangular solve */
+
+	/* Extract the LU factors in the factors handle */
+	LUfactors = (factors_dist_t*) factors_handle_load(factors);
+	ScalePermstruct = LUfactors->ScalePermstruct;
+	LUstruct = LUfactors->LUstruct;
+
+	PStatInit(&stat);
+
+	/* Reset SuperMatrix pointers. */
+	dCreate_CompCol_Matrix_dist(&A, *n, *n, *nnz, values, rowind, colptr,
+			            SLU_NC, SLU_D, SLU_GE);
+
+	/* Set options. */
+	set_default_options(&options);
+        options.Fact = FACTORED;
+
+        /* Solve the system A*X=B, overwriting B with X. */
+	pdgssvx_ABglobal(&options, &A, ScalePermstruct, b, *ldb, *nrhs, 
+	                 grid, LUstruct, berr, &stat, info);
+
+	/* Free un-wanted storage */
+	Destroy_SuperMatrix_Store_dist(&A);
+        PStatFree(&stat);
+
+    } else if ( *iopt == 4 ) { /* Free storage */
+
+	/* Free the LU factors in the factors handle */
+	LUfactors = (factors_dist_t*) factors_handle_load(factors);
+	Destroy_LU(*n, grid, LUfactors->LUstruct);
+        LUstructFree(LUfactors->LUstruct);
+	ScalePermstructFree(LUfactors->ScalePermstruct);
+	SUPERLU_FREE(LUfactors->ScalePermstruct);
+	SUPERLU_FREE(LUfactors->LUstruct);
+        SUPERLU_FREE(LUfactors);
+
+    } else {
+	fprintf(stderr, "Invalid iopt=%d passed to c_fortran_pdgssvx_ABglobal()\n", *iopt);
+	exit(-1);
+    }
+
+    /* The function is declared int, so it must not fall off the end. */
+    return 0;
+}
diff --git a/FORTRAN/c_fortran_slugrid.c b/FORTRAN/c_fortran_slugrid.c
new file mode 100644
index 0000000..85b11b9
--- /dev/null
+++ b/FORTRAN/c_fortran_slugrid.c
@@ -0,0 +1,56 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+#include "superlu_ddefs.h"
+
+#define HANDLE_SIZE 8
+
+void
+c_fortran_slugrid_(int *iopt, MPI_Comm *slu_comm, int *nprow, int *npcol,
+		   int grid_handle[HANDLE_SIZE])
+/*
+ * Fortran-callable entry point that creates or releases the SuperLU_DIST
+ * process grid.  The address of the grid structure travels through
+ * grid_handle[0].
+ *
+ * iopt selects the action:
+ *   iopt=1:  allocate a grid structure and initialize a new process grid
+ *   iopt=2:  shut down and free an existing process grid
+ *   any other value prints an error message and exits
+ *
+ * slu_comm is the base communication handle
+ * nprow is the number of processors per process grid row
+ * npcol is the number of processors per process grid column
+ *
+ * NOTE(review): the pointer is squeezed into one int; wherever a pointer
+ * is wider than int this drops bits -- TODO confirm on 64-bit targets.
+ */
+
+{
+    gridinfo_t *grid;
+
+    switch ( *iopt ) {
+      case 1:
+	/* Create the structure, set up the grid, and hand the address
+	 * back to Fortran so that the grid can be reused. */
+	grid = (gridinfo_t *) SUPERLU_MALLOC(sizeof(gridinfo_t));
+	superlu_gridinit(*slu_comm, *nprow, *npcol, grid);
+	grid_handle[0] = (int) grid;
+	break;
+
+      case 2:
+	/* Recover the structure from the handle and release it. */
+	grid = (gridinfo_t *) grid_handle[0];
+	superlu_gridexit(grid);
+	SUPERLU_FREE(grid);
+	break;
+
+      default:
+	fprintf(stderr, "Invalid iopt=%d passed to c_fortran_slugrid()\n", *iopt);
+	exit(-1);
+    }
+}
diff --git a/FORTRAN/dcreate_dist_matrix.c b/FORTRAN/dcreate_dist_matrix.c
new file mode 100644
index 0000000..3a6dde9
--- /dev/null
+++ b/FORTRAN/dcreate_dist_matrix.c
@@ -0,0 +1,206 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Distribute the input matrix in a distributed compressed row format.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 3.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 2012
+ *
+ *
+ * Purpose
+ * =======
+ * 
+ * DCREATE_DIST_MATRIX reads the global matrix from three input arrays
+ * and distribute it to the processes in a distributed compressed row format.
+ *
+ * Arguments   
+ * =========      
+ *
+ * A             (output) SuperMatrix*
+ *               Local matrix A in NR_loc format. 
+ *
+ * M             (input) int_t
+ *               The row number of the global matrix. 
+ *
+ * N             (input) int_t
+ *               The col number of the global matrix. 
+ *
+ * NNZ           (input) int_t
+ *               The number nonzeros in the global matrix. 
+ *
+ * NZVAL_G       (input) double*
+ *               Nonzero values of the global matrix. 
+ *
+ * ROWIND_G      (input) int_t*
+ *               Row indices of the global matrix. 
+ *
+ * COLPTR_G      (input) int_t*
+ *               Columns pointers of the global matrix. 
+ *
+ * GRID          (input) gridinfo_t*
+ *               The 2D process mesh.
+ *
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/* Build the local block of rows of a global matrix on every process.
+ *
+ * Process 0 copies the three global arrays (compressed column form) and
+ * broadcasts them; every process then extracts its own contiguous block
+ * of rows into compressed row form and wraps it in A (NR_loc format).
+ * Only the arrays passed on process 0 are read.  Returns 0.
+ */
+int dcreate_dist_matrix(SuperMatrix *A, int_t m, int_t n, int_t nnz,
+			double *nzval_g, int_t *rowind_g, int_t *colptr_g,
+			gridinfo_t *grid)
+{
+    SuperMatrix GA;              /* global A */
+    int_t    *rowind, *colptr;	 /* global */
+    double   *nzval;             /* global */
+    double   *nzval_loc;         /* local */
+    int_t    *colind, *rowptr;	 /* local */
+    int_t    m_loc, fst_row, nnz_loc;
+    int_t    m_loc_fst; /* Record m_loc of the first p-1 processors,
+			   when mod(m, p) is not zero. */ 
+    int_t    iam, row, i, j, relpos;
+    int_t    *marker;
+
+    iam = grid->iam;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter dcreate_dist_matrix()");
+#endif
+ 
+    if ( !iam ) {
+
+        /* Allocate storage for compressed column representation. */
+        dallocateA_dist(n, nnz, &nzval, &rowind, &colptr);
+
+	/* Copy the global matrix. */
+#if 0
+	/* and ADJUST to 0-based indexing 
+           which is required by the C routines.*/
+#endif
+        for(i=0; i<nnz; i++){
+	  nzval[i]=nzval_g[i];
+	  rowind[i]=rowind_g[i]; /* - 1;*/
+        }
+        for(i=0; i<n+1; i++)
+	  colptr[i]=colptr_g[i]; /* - 1;*/
+
+
+	/* Broadcast matrix A to the other PEs. */
+	MPI_Bcast( &m,     1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &n,     1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &nnz,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( nzval,  nnz, MPI_DOUBLE, 0, grid->comm );
+	MPI_Bcast( rowind, nnz, mpi_int_t,  0, grid->comm );
+	MPI_Bcast( colptr, n+1, mpi_int_t,  0, grid->comm );
+    } else {
+	/* Receive matrix A from PE 0. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid->comm );
+
+	/* Allocate storage for compressed column representation. */
+	dallocateA_dist(n, nnz, &nzval, &rowind, &colptr);
+
+	MPI_Bcast( nzval,   nnz, MPI_DOUBLE, 0, grid->comm );
+	MPI_Bcast( rowind,  nnz, mpi_int_t,  0, grid->comm );
+	MPI_Bcast( colptr,  n+1, mpi_int_t,  0, grid->comm );
+    }
+
+#if 0
+    nzval[0]=0.1;
+#endif
+
+    /* Compute the number of rows to be distributed to local process */
+    m_loc = m / (grid->nprow * grid->npcol); 
+    m_loc_fst = m_loc;
+    /* When m / procs is not an integer */
+    if ((m_loc * grid->nprow * grid->npcol) != m) {
+      m_loc = m_loc+1;
+      m_loc_fst = m_loc;
+      /* The last process takes whatever is left.
+       * NOTE(review): the remainder can become negative when m is small
+       * compared with the number of processes (e.g. m=5 on 4 processes)
+       * -- TODO confirm that callers never use such a layout. */
+      if (iam == (grid->nprow * grid->npcol - 1)) 
+	m_loc = m - m_loc_fst * (grid->nprow * grid->npcol - 1);
+    }
+
+    /* Create compressed column matrix for GA. */
+    dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
+				SLU_NC, SLU_D, SLU_GE);
+
+
+    /*************************************************
+     * Change GA to a local A with NR_loc format     *
+     *************************************************/
+
+    rowptr = (int_t *) intMalloc_dist(m_loc+1);
+    /* marker[] is indexed by row number, so it needs one slot per row. */
+    marker = (int_t *) intCalloc_dist(m);
+
+    /* Get counts of each row of GA */
+    for (i = 0; i < n; ++i)
+      for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]];
+    /* Set up row pointers; marker[0..m_loc-1] is then reused as the next
+       free position inside each local row. */
+    rowptr[0] = 0;
+    fst_row = iam * m_loc_fst;
+    nnz_loc = 0;
+    for (j = 0; j < m_loc; ++j) {
+      row = fst_row + j;
+      rowptr[j+1] = rowptr[j] + marker[row];
+      marker[j] = rowptr[j];
+    }
+    nnz_loc = rowptr[m_loc];
+
+    nzval_loc = (double *) doubleMalloc_dist(nnz_loc);
+    colind = (int_t *) intMalloc_dist(nnz_loc);
+
+    /* Transfer the matrix into the compressed row storage */
+    for (i = 0; i < n; ++i) {
+      for (j = colptr[i]; j < colptr[i+1]; ++j) {
+	row = rowind[j];
+	if ( (row>=fst_row) && (row<fst_row+m_loc) ) {
+	  row = row - fst_row;
+	  relpos = marker[row];
+	  colind[relpos] = i;
+	  nzval_loc[relpos] = nzval[j];
+	  ++marker[row];
+	}
+      }
+    }
+
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) dPrint_CompCol_Matrix_dist(&GA);
+#endif   
+
+
+    /* Destroy GA */
+    Destroy_CompCol_Matrix_dist(&GA);
+
+
+    /******************************************************/
+    /* Wrap the local arrays in A                         */
+    /******************************************************/
+
+    /* Set up the local A in NR_loc format */
+    dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
+				   nzval_loc, colind, rowptr,
+				   SLU_NR_loc, SLU_D, SLU_GE);
+    
+    SUPERLU_FREE(marker);
+
+#if ( DEBUGlevel>=1 )
+    printf("sizeof(NRformat_loc) %d\n", (int) sizeof(NRformat_loc));
+    CHECK_MALLOC(iam, "Exit dcreate_dist_matrix()");
+#endif
+    return 0;
+}
+
+
diff --git a/FORTRAN/dhbcode1.f90 b/FORTRAN/dhbcode1.f90
new file mode 100644
index 0000000..6278ef3
--- /dev/null
+++ b/FORTRAN/dhbcode1.f90
@@ -0,0 +1,50 @@
+!> @file
+!! \brief Fortran code for reading a sparse matrix in Harwell-Boeing format.
+!!
+!
+      subroutine dhbcode1(nrow, ncol, nnzero, values, rowind, colptr)
+
+!     ================================================================
+!     Read a sparse matrix in the standard (Harwell-Boeing) layout from
+!     unit 5: header block, column pointers, row indices and, when the
+!     header announces value cards, the numerical values.
+!     ================================================================
+
+!     ... arguments
+      integer        nrow, ncol, nnzero
+      integer        colptr (*), rowind (*)
+      real*8         values (*)
+
+!     ... header fields
+      character*72   title
+      character*8    key
+      character*3    mxtype
+      character*16   ptrfmt, indfmt
+      character*20   valfmt, rhsfmt
+      integer        totcrd, ptrcrd, indcrd, valcrd, rhscrd, neltvl
+
+!     ... header block
+      read ( 5, '( A72, A8 / 5I14 / A3, 11X, 4I14 / 2A16, 2A20 )' ) &
+                       title , key , totcrd, ptrcrd, indcrd, valcrd, &
+                       rhscrd, mxtype, nrow  , ncol  , nnzero, neltvl, &
+                       ptrfmt, indfmt, valfmt, rhsfmt
+
+!     ... matrix structure, using the formats given in the header
+      read ( 5, ptrfmt ) ( colptr (k), k = 1, ncol+1 )
+
+      read ( 5, indfmt ) ( rowind (k), k = 1, nnzero )
+
+!     ... matrix values, present only when there are value cards
+      if ( valcrd > 0 ) then
+          read ( 5, valfmt ) ( values (k), k = 1, nnzero )
+      endif
+
+      return
+      end
+
diff --git a/FORTRAN/f_5x5.f90 b/FORTRAN/f_5x5.f90
new file mode 100644
index 0000000..fec77ad
--- /dev/null
+++ b/FORTRAN/f_5x5.f90
@@ -0,0 +1,226 @@
+
+! -- Distributed SuperLU routine (version 2.0) --
+! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+! July 20, 2004
+!
+!
+      program f_5x5
+! 
+! Purpose
+! =======
+!
+! This example illustrates how to use F_PDGSSVX with the full
+! (default) options to solve a linear system.
+! The input matrix is the small 5x5 example that appears in the SuperLU
+! Users' Guide, Section 2.2:
+!
+!   [ s     u  u    ]     [ 19      21  21    ]
+!   [ l  u          ]     [ 12  21            ]
+!   [    l  p       ]  =  [     12  16        ]
+!   [          e  u ]     [             5  21 ]
+!   [ l  l        r ]     [ 12  12         18 ]
+!
+! It is set up to use 2 processes:
+!    the process with iam == 0 holds the first 2 rows
+!    the other process holds the last 3 rows
+!
+! Seven basic steps are required:
+!   1. Create C structures used in SuperLU_DIST
+!   2. Initialize the MPI environment and the SuperLU process grid
+!   3. Set up the input matrix and the right-hand side
+!   4. Set the options argument
+!   5. Call f_pdgssvx
+!   6. Release the process grid and terminate the MPI environment
+!   7. Release all structures
+!
+      use superlu_mod
+!      implicit none
+      include 'mpif.h'
+! Capacity of the local arrays; the 5x5 example needs far less.
+      integer maxn, maxnz, maxnrhs
+      parameter ( maxn = 10, maxnz = 100, maxnrhs = 10 )
+! Local part of the matrix in compressed row form, right-hand side and
+! backward errors.
+      integer colind(maxnz), rowptr(maxn+1)
+      real*8  nzval(maxnz), b(maxn), berr(maxnrhs)
+      integer n, m, nnz, nrhs, ldb, nprow, npcol, init
+      integer*4 iam, info, i, ierr, ldb4
+      integer nnz_loc, m_loc, fst_row
+      real*8  s, u, p, e, r, l
+
+! Handles through which the C structures of SuperLU_DIST are accessed
+      integer(superlu_ptr) :: grid
+      integer(superlu_ptr) :: options
+      integer(superlu_ptr) :: ScalePermstruct
+      integer(superlu_ptr) :: LUstruct
+      integer(superlu_ptr) :: SOLVEstruct
+      integer(superlu_ptr) :: A
+      integer(superlu_ptr) :: stat
+
+! Initialize MPI environment 
+      call mpi_init(ierr)
+
+! Check malloc
+!      call f_check_malloc(iam)
+
+! Create Fortran handles for the C structures used in SuperLU_DIST
+      call f_create_gridinfo_handle(grid)
+      call f_create_options_handle(options)
+      call f_create_ScalePerm_handle(ScalePermstruct)
+      call f_create_LUstruct_handle(LUstruct)
+      call f_create_SOLVEstruct_handle(SOLVEstruct)
+      call f_create_SuperMatrix_handle(A)
+      call f_create_SuperLUStat_handle(stat)
+
+! Initialize the SuperLU_DIST process grid (1 x 2 processes)
+      nprow = 1
+      npcol = 2
+      call f_superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, grid)
+
+! Bail out if I do not belong in the grid: skip the whole computation
+! and continue at label 100.
+      call get_GridInfo(grid, iam=iam)
+      if ( iam >= nprow * npcol ) then 
+         go to 100
+      endif
+      if ( iam == 0 ) then 
+         write(*,*) ' Process grid ', nprow, ' X ', npcol
+         write(*,*) ' default integer size ', kind(0) 
+      endif
+!
+!*************************************************************************
+! Set up the input matrix A
+!*************************************************************************
+! The input matrix is the small 5x5 example from the SuperLU Users' Guide:
+!
+!   [ s     u  u    ]     [ 19      21  21    ]
+!   [ l  u          ]     [ 12  21            ]
+!   [    l  p       ]  =  [     12  16        ]
+!   [          e  u ]     [             5  21 ]
+!   [ l  l        r ]     [ 12  12         18 ]
+!
+! It is set up to use 2 processes:
+!    the process with iam == 0 holds the first 2 rows
+!    the other process holds the last 3 rows
+!
+      m = 5
+      n = 5
+      nnz = 12
+      s = 19.0
+      u = 21.0
+      p = 16.0
+      e = 5.0
+      r = 18.0
+      l = 12.0
+!      
+      if ( iam == 0 ) then
+! Processor 0 owns the first 2 rows of the matrix
+! NOTE: 0-based indexing must be used for the C routines.
+         nnz_loc   = 5
+         m_loc     = 2
+         fst_row   = 0         ! 0-based indexing
+! Row 0: s u u in columns 0, 2, 3
+         nzval(1)  = s
+         colind(1) = 0         ! 0-based indexing
+         nzval(2)  = u
+         colind(2) = 2
+         nzval(3)  = u
+         colind(3) = 3
+! Row 1: l u in columns 0, 1
+         nzval(4)  = l
+         colind(4) = 0
+         nzval(5)  = u
+         colind(5) = 1
+! Offsets of the local rows inside nzval/colind
+         rowptr(1) = 0         ! 0-based indexing
+         rowptr(2) = 3
+         rowptr(3) = 5
+      else
+! Processor 1 owns the last 3 rows of the matrix
+         nnz_loc   = 7
+         m_loc     = 3
+         fst_row   = 2         ! 0-based indexing
+! Row 2: l p in columns 1, 2
+         nzval(1)  = l
+         colind(1) = 1
+         nzval(2)  = p
+         colind(2) = 2
+! Row 3: e u in columns 3, 4
+         nzval(3)  = e
+         colind(3) = 3
+         nzval(4)  = u
+         colind(4) = 4
+! Row 4: l l r in columns 0, 1, 4
+         nzval(5)  = l
+         colind(5) = 0
+         nzval(6)  = l
+         colind(6) = 1
+         nzval(7)  = r
+         colind(7) = 4
+! Offsets of the local rows inside nzval/colind
+         rowptr(1) = 0         ! 0-based indexing
+         rowptr(2) = 2
+         rowptr(3) = 4
+         rowptr(4) = 7
+      endif
+
+      if ( iam == 0 ) then 
+         write(*,*) ' Matrix A was set up'
+      endif
+
+! Create the distributed compressed row matrix pointed to by the F90 handle A
+      call f_dCreate_CompRowLoc_Mat_dist(A, m, n, nnz_loc, m_loc, fst_row, &
+           nzval, colind, rowptr, SLU_NR_loc, SLU_D, SLU_GE)
+
+! Setup the right hand side: all ones, one entry per local row
+      call get_CompRowLoc_Matrix(A, nrow_loc=ldb)
+      do i = 1, ldb
+         b(i) = 1.0
+      enddo
+      nrhs = 1
+! f_pdgssvx below is given the leading dimension as an integer*4 variable
+      ldb4 = ldb
+
+! Set the default input options
+      call f_set_default_options(options)
+
+! Modify one or more options
+      call set_superlu_options(options,ColPerm=NATURAL)
+      call set_superlu_options(options,RowPerm=NOROWPERM)
+
+! Initialize ScalePermstruct and LUstruct
+      call get_SuperMatrix(A,nrow=m,ncol=n)
+      call f_ScalePermstructInit(m, n, ScalePermstruct)
+      call f_LUstructInit(m, n, LUstruct)
+
+! Initialize the statistics variables
+      call f_PStatInit(stat)
+
+! Call the linear equation solver
+      call f_pdgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, &
+                     grid, LUstruct, SOLVEstruct, berr, stat, info)
+
+! Only the process with iam == 1 prints the backward error; every other
+! process, and any process with info /= 0, prints the value of info.
+      if (info == 0 .and. iam == 1) then
+         write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs)
+      else
+         write(*,*) 'INFO from f_pdgssvx = ', info
+      endif
+
+! Deallocate the storage allocated by SuperLU_DIST
+      call f_PStatFree(stat)
+      call f_Destroy_SuperMat_Store_dist(A)
+      call f_ScalePermstructFree(ScalePermstruct)
+      call f_Destroy_LU(n, grid, LUstruct)
+      call f_LUstructFree(LUstruct)
+! The solve structure is finalized only if the options report it as
+! initialized.
+      call get_superlu_options(options, SolveInitialized=init)
+      if (init == YES) then
+         call f_dSolveFinalize(options, SOLVEstruct)
+      endif
+
+! Release the SuperLU process grid
+! (processes outside the grid jump directly to this label)
+100   call f_superlu_gridexit(grid)
+
+! Deallocate the C structures pointed to by the Fortran handles
+      call f_destroy_gridinfo_handle(grid)
+      call f_destroy_options_handle(options)
+      call f_destroy_ScalePerm_handle(ScalePermstruct)
+      call f_destroy_LUstruct_handle(LUstruct)
+      call f_destroy_SOLVEstruct_handle(SOLVEstruct)
+      call f_destroy_SuperMatrix_handle(A)
+      call f_destroy_SuperLUStat_handle(stat)
+
+! Check malloc
+!      call f_check_malloc(iam)
+
+! Terminate the MPI execution environment
+      call mpi_finalize(ierr)
+
+      stop
+      end
diff --git a/FORTRAN/f_pddrive.f90 b/FORTRAN/f_pddrive.f90
new file mode 100644
index 0000000..33803d9
--- /dev/null
+++ b/FORTRAN/f_pddrive.f90
@@ -0,0 +1,161 @@
+
+
+!> @file
+!! \brief The driver program to solve a linear system with default options.
+!!
+!! <pre>
+!! -- Distributed SuperLU routine (version 3.2) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! October, 2012
+!! </pre>
+!
+      program f_pddrive
+!
+! Purpose
+! =======
+!
+! Example driver F_PDDRIVE: solve one sparse linear system with
+! F_PDGSSVX using the default options.
+!
+! Outline of the program:
+!   1. Start MPI and create the Fortran handles for the C structures
+!   2. Set up a 2 x 2 SuperLU_DIST process grid
+!   3. Read the matrix on process 0, distribute it and build the
+!      right-hand side
+!   4. Set the options argument
+!   5. Call f_pdgssvx and print the backward error (or INFO)
+!   6. Free the SuperLU_DIST storage and release the process grid
+!   7. Destroy the handles and terminate MPI
+!
+      use superlu_mod
+!      implicit none
+      include 'mpif.h'
+      integer, parameter :: maxn = 10000, maxnz = 100000, maxnrhs = 10
+      integer rowind(maxnz), colptr(maxn)
+      real*8  values(maxnz), b(maxn), berr(maxnrhs)
+      integer n, m, nnz, nprow, npcol, ldb, init
+      integer*4 iam, info, ierr, ldb4, nrhs
+
+      integer(superlu_ptr) :: grid
+      integer(superlu_ptr) :: options
+      integer(superlu_ptr) :: ScalePermstruct
+      integer(superlu_ptr) :: LUstruct
+      integer(superlu_ptr) :: SOLVEstruct
+      integer(superlu_ptr) :: A
+      integer(superlu_ptr) :: stat
+
+! Start MPI
+      call mpi_init(ierr)
+
+! Check malloc
+!      call f_check_malloc(iam)
+
+! One Fortran handle per C structure used by SuperLU_DIST
+      call f_create_gridinfo_handle(grid)
+      call f_create_options_handle(options)
+      call f_create_ScalePerm_handle(ScalePermstruct)
+      call f_create_LUstruct_handle(LUstruct)
+      call f_create_SOLVEstruct_handle(SOLVEstruct)
+      call f_create_SuperMatrix_handle(A)
+      call f_create_SuperLUStat_handle(stat)
+
+! Build the nprow x npcol SuperLU_DIST process grid
+      nprow = 2
+      npcol = 2
+      call f_superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, grid)
+
+! Only processes that belong to the grid take part in the solve;
+! all others fall through to the grid release further down.
+      call get_GridInfo(grid, iam=iam)
+      if ( iam < nprow * npcol ) then
+
+         if ( iam == 0 ) then
+            write(*,*) ' Process grid ', nprow, ' X ', npcol
+
+! Process 0 reads the Harwell-Boeing matrix and shifts pointers and
+! indices to the 0-based numbering required by the C routines.
+            open(file = "../EXAMPLE/g20.rua", status = "old", unit = 5)
+            call dhbcode1(m, n, nnz, values, rowind, colptr)
+            close(unit = 5)
+!
+            colptr(1:n+1) = colptr(1:n+1) - 1
+            rowind(1:nnz) = rowind(1:nnz) - 1
+         endif
+
+! Distribute the matrix to the process grid
+         call  f_dcreate_dist_matrix(A, m, n, nnz, values, rowind, &
+                                     colptr, grid)
+
+! Right-hand side: all ones on the locally owned rows
+         call get_CompRowLoc_Matrix(A, nrow_loc=ldb)
+         b(1:ldb) = 1.0
+         nrhs = 1
+         ldb4 = ldb
+
+! Start from the default input options
+         call f_set_default_options(options)
+
+! Change one or more options
+!      call set_superlu_options(options,Fact=FACTORED)
+!      call set_superlu_options(options,ParSymbFact=YES)
+
+! Initialize ScalePermstruct and LUstruct with the global dimensions
+         call get_SuperMatrix(A, nrow=m, ncol=n)
+         call f_ScalePermstructInit(m, n, ScalePermstruct)
+         call f_LUstructInit(m, n, LUstruct)
+
+! Initialize the statistics variables
+         call f_PStatInit(stat)
+
+! Solve the linear system
+         call f_pdgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, &
+                        grid, LUstruct, SOLVEstruct, berr, stat, info)
+
+         if (info /= 0) then
+            write(*,*) 'INFO from f_pdgssvx = ', info
+         else
+            write (*,*) 'Backward error: ', berr(1:nrhs)
+         endif
+
+! Release the storage allocated by SuperLU_DIST
+         call f_PStatFree(stat)
+         call f_Destroy_CompRowLoc_Mat_dist(A)
+         call f_ScalePermstructFree(ScalePermstruct)
+         call f_Destroy_LU(n, grid, LUstruct)
+         call f_LUstructFree(LUstruct)
+         call get_superlu_options(options, SolveInitialized=init)
+         if (init == YES) then
+            call f_dSolveFinalize(options, SOLVEstruct)
+         endif
+
+      endif
+
+! Release the SuperLU process grid
+      call f_superlu_gridexit(grid)
+
+! Free the C structures behind the Fortran handles
+      call f_destroy_gridinfo_handle(grid)
+      call f_destroy_options_handle(options)
+      call f_destroy_ScalePerm_handle(ScalePermstruct)
+      call f_destroy_LUstruct_handle(LUstruct)
+      call f_destroy_SOLVEstruct_handle(SOLVEstruct)
+      call f_destroy_SuperMatrix_handle(A)
+      call f_destroy_SuperLUStat_handle(stat)
+
+! Check malloc
+!      call f_check_malloc(iam)
+
+
+! Shut down MPI
+      call mpi_finalize(ierr)
+
+      stop
+      end
diff --git a/FORTRAN/f_pddrive_ABglobal.f b/FORTRAN/f_pddrive_ABglobal.f
new file mode 100644
index 0000000..5e15361
--- /dev/null
+++ b/FORTRAN/f_pddrive_ABglobal.f
@@ -0,0 +1,76 @@
+!
+! -- Distributed SuperLU routine (version 2.0) --
+! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+! July 10, 2003
+!
+!
+      program f_pddrive_ABglobal
+!
+!     Example driver for the global (replicated A and B) interface:
+!     read a Harwell-Boeing matrix, factorize it on a 2 x 2 process
+!     grid, solve with the existing factors, report the backward
+!     error and release all storage.
+!
+!     implicit none must precede every declaration, including the
+!     ones brought in by mpif.h.
+      implicit none
+      include 'mpif.h'
+      integer maxn, maxnz, maxnrhs
+      parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 )
+      integer rowind(maxnz), colptr(maxn)
+      real*8  values(maxnz), b(maxn), berr(maxnrhs)
+      integer n, nnz, nrhs, ldb, i, ierr, info, iopt
+      integer nprow, npcol
+!     Opaque storage filled in and interpreted by the C routines
+      integer factors_handle(8), grid_handle(8)
+!
+      call mpi_init(ierr)
+!
+!     Read Harwell-Boeing matrix
+      call hbcode1(n, n, nnz, values, rowind, colptr)
+!
+!     Adjust to 0-based indexing which is required by the C routines.
+      do i = 1, n+1
+         colptr(i) = colptr(i) - 1
+      end do
+      do i = 1, nnz
+         rowind(i) = rowind(i) - 1
+      end do
+
+!     Right-hand side: a single vector of all ones
+      nrhs = 1
+      ldb = n
+      do i = 1, n
+         b(i) = 1.0
+      enddo
+!
+!     iopt = 1 : create the process grid
+      iopt = 1
+      nprow = 2
+      npcol = 2
+      call c_fortran_slugrid(iopt, MPI_COMM_WORLD, nprow, npcol,
+     $     grid_handle)
+!
+! Only performs LU factorization
+!
+      iopt = 1
+      call c_fortran_pdgssvx_ABglobal(iopt, n, nnz, nrhs,
+     $     values, rowind, colptr, b, ldb, grid_handle, berr,
+     $     factors_handle, info)
+!
+! Now performs triangular solve with the existing factors
+!
+      iopt = 3
+      call c_fortran_pdgssvx_ABglobal(iopt, n, nnz, nrhs,
+     $     values, rowind, colptr, b, ldb, grid_handle, berr,
+     $     factors_handle, info)
+!
+      if (info .eq. 0) then
+         write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs)
+      else
+         write(*,*) 'INFO from c_fortran_pdgssvx_ABglobal = ', info
+      endif
+!
+! Now free the storage associated with the handles
+!
+      iopt = 4
+      call c_fortran_pdgssvx_ABglobal(iopt, n, nnz, nrhs,
+     $     values, rowind, colptr, b, ldb, grid_handle, berr,
+     $     factors_handle, info)
+!     iopt = 2 : release the process grid
+      iopt = 2
+      call c_fortran_slugrid(iopt, MPI_COMM_WORLD, nprow, npcol,
+     $     grid_handle)
+!
+      call mpi_finalize(ierr)
+!
+      stop
+      end
diff --git a/FORTRAN/f_pddrive_old.f90 b/FORTRAN/f_pddrive_old.f90
new file mode 100644
index 0000000..9a3dd48
--- /dev/null
+++ b/FORTRAN/f_pddrive_old.f90
@@ -0,0 +1,159 @@
+!
+! -- Distributed SuperLU routine (version 2.0) --
+! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+! July 29, 2003
+!
+!
+      program f_pddrive
+!
+! Legacy example driver: solve one sparse linear system with
+! f_pdgssvx on a 2 x 2 process grid, starting from the default
+! options.
+!
+      use superlu_mod
+! implicit none must precede every declaration, including the ones
+! brought in by mpif.h.
+      implicit none
+      include 'mpif.h'
+      integer maxn, maxnz, maxnrhs
+      parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 )
+      integer rowind(maxnz), colptr(maxn)
+      real*8  values(maxnz), b(maxn), berr(maxnrhs)
+      integer n, m, nnz, nrhs, ldb, i, ierr, info, iam, m_loc
+      integer nprow, npcol
+      integer init
+
+      integer(superlu_ptr) :: grid
+      integer(superlu_ptr) :: options
+      integer(superlu_ptr) :: ScalePermstruct
+      integer(superlu_ptr) :: LUstruct
+      integer(superlu_ptr) :: SOLVEstruct
+      integer(superlu_ptr) :: A
+
+      integer(superlu_ptr) :: stat
+
+
+! INITIALIZE MPI ENVIRONMENT 
+      call mpi_init(ierr)
+
+! Give iam a defined value (the rank in MPI_COMM_WORLD) before it is
+! handed to f_check_malloc; it is replaced by the grid rank below.
+      call mpi_comm_rank(MPI_COMM_WORLD, iam, ierr)
+
+! Check Malloc
+      call f_check_malloc(iam)
+
+! create C structures used in superlu
+      call f_create_gridinfo(grid)
+      call f_create_options(options)
+      call f_create_ScalePermstruct(ScalePermstruct)
+      call f_create_LUstruct(LUstruct)
+      call f_create_SOLVEstruct(SOLVEstruct)
+      call f_create_SuperMatrix(A)
+
+! initialize the SuperLU process grid
+      nprow = 2
+      npcol = 2
+      call f_superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, grid)
+
+! Bail out if I do not belong in the grid. 
+      call get_GridInfo(grid, iam=iam)
+      if ( iam >= nprow * npcol ) then 
+         go to 100
+      endif
+      if ( iam == 0 ) then 
+         write(*,*) ' Process grid ', nprow, ' X ', npcol
+      endif
+
+
+! Read Harwell-Boeing matrix (process 0 only)
+      if ( iam == 0 ) then 
+         call hbcode1(m, n, nnz, values, rowind, colptr)
+      endif
+
+
+! Distribute the matrix to the grid
+      call  f_dcreate_matrix_dis(A, m, n, nnz, values, rowind, colptr, grid)
+
+! Get m_loc
+      call  get_CompRowLoc_Matrix(A, nrow_loc=m_loc)
+
+! Setup the right hand side: one vector of all ones on the local rows
+      nrhs = 1
+      ldb = m_loc
+      do i = 1, ldb
+         b(i) = 1.0
+      enddo
+
+
+! set the default input options
+      call f_set_default_options(options)
+
+! set one or more option
+!      call set_superlu_options(options,Fact=FACTORED)
+
+
+! initialize ScalePermstruct and LUstruct
+
+! get the m and n 
+      call get_SuperMatrix(A,nrow=m,ncol=n)
+      call f_ScalePermstructInit(m, n, ScalePermstruct)
+      call f_LUstructInit(m, n, LUstruct)
+
+! initialize the statistics variables
+      call f_create_SuperLUStat(stat)
+      call f_PStatInit(stat)
+
+
+! call the linear equation solver
+      call f_pdgssvx(options, A, &
+               ScalePermstruct, b, &
+               ldb, nrhs, grid, &
+               LUstruct, SOLVEstruct, berr, &
+               stat, info)
+
+      if (info == 0) then
+         write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs)
+      else
+         write(*,*) 'INFO from f_pdgssvx = ', info
+      endif
+
+
+! free memory
+      call f_PStatFree(stat)
+      call f_destroy_SuperLUStat(stat)
+
+! deallocate SuperLU allocated storage
+
+      call f_Destroy_CompRowLoc_Matrix_dis(A)
+      call f_ScalePermstructFree(ScalePermstruct)
+!      call get_SuperMatrix(A,ncol=n)
+      call f_Destroy_LU(n, grid, LUstruct)
+      call f_LUstructFree(LUstruct)
+      call get_superlu_options(options, SolveInitialized=init)
+      if (init == YES) then
+         call f_dSolveFinalize(options, SOLVEstruct)
+      endif
+
+
+! release the SuperLU process grid
+100   call f_superlu_gridexit(grid)
+
+! destroy C structures in superlu_matrix_type
+      call f_destroy_gridinfo(grid)
+      call f_destroy_options(options)
+      call f_destroy_ScalePermstruct(ScalePermstruct)
+      call f_destroy_LUstruct(LUstruct)
+      call f_destroy_SOLVEstruct(SOLVEstruct)
+      call f_destroy_SuperMatrix(A)
+
+! TERMINATES THE MPI EXECUTION ENVIRONMENT
+      call mpi_finalize(ierr)
+!
+
+! Check Malloc
+      call f_check_malloc(iam)
+
+
+      stop
+      end
+
+
+
+
diff --git a/FORTRAN/f_pzdrive.f90 b/FORTRAN/f_pzdrive.f90
new file mode 100644
index 0000000..9c9db5b
--- /dev/null
+++ b/FORTRAN/f_pzdrive.f90
@@ -0,0 +1,160 @@
+
+!> @file
+!! \brief The driver program to solve a linear system with default options.
+!!
+!! <pre>
+!! -- Distributed SuperLU routine (version 3.2) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! October, 2012
+!! </pre>
+!
+      program f_pzdrive
+! 
+! Purpose
+! =======
+!
+! The driver program F_PZDRIVE.
+!
+! This example illustrates how to use F_PZGSSVX with the full
+! (default) options to solve a double-complex linear system.
+! 
+! Seven basic steps are required:
+!   1. Create C structures used in SuperLU_DIST
+!   2. Initialize the MPI environment and the SuperLU process grid
+!   3. Set up the input matrix and the right-hand side
+!   4. Set the options argument
+!   5. Call f_pzgssvx
+!   6. Release the process grid and terminate the MPI environment
+!   7. Release all structures
+!
+!
+      use superlu_mod
+!      implicit none
+      include 'mpif.h'
+      integer maxn, maxnz, maxnrhs
+      parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 )
+      integer rowind(maxnz), colptr(maxn)
+      double complex  values(maxnz), b(maxn)
+! The backward errors are real numbers even for a complex system,
+! so berr must not share the double complex type of values and b.
+      real*8  berr(maxnrhs)
+      integer n, m, nnz, nprow, npcol, ldb, init
+      integer*4 iam, info, i, ierr, ldb4, nrhs
+
+      integer(superlu_ptr) :: grid
+      integer(superlu_ptr) :: options
+      integer(superlu_ptr) :: ScalePermstruct
+      integer(superlu_ptr) :: LUstruct
+      integer(superlu_ptr) :: SOLVEstruct
+      integer(superlu_ptr) :: A
+      integer(superlu_ptr) :: stat
+
+! Initialize MPI environment 
+      call mpi_init(ierr)
+
+! Check malloc
+!      call f_check_malloc(iam)
+
+! Create Fortran handles for the C structures used in SuperLU_DIST
+      call f_create_gridinfo_handle(grid)
+      call f_create_options_handle(options)
+      call f_create_ScalePerm_handle(ScalePermstruct)
+      call f_create_LUstruct_handle(LUstruct)
+      call f_create_SOLVEstruct_handle(SOLVEstruct)
+      call f_create_SuperMatrix_handle(A)
+      call f_create_SuperLUStat_handle(stat)
+
+! Initialize the SuperLU_DIST process grid
+      nprow = 2
+      npcol = 2
+      call f_superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, grid)
+
+! Bail out if I do not belong in the grid. 
+      call get_GridInfo(grid, iam=iam)
+      if ( iam >= nprow * npcol ) then 
+         go to 100
+      endif
+      if ( iam == 0 ) then 
+         write(*,*) ' Process grid ', nprow, ' X ', npcol
+      endif
+
+! Read Harwell-Boeing matrix, and adjust the pointers and indices
+! to 0-based indexing, as required by C routines.
+      if ( iam == 0 ) then 
+         open(file = "../EXAMPLE/cg20.cua", status = "old", unit = 5)
+         call zhbcode1(m, n, nnz, values, rowind, colptr)
+         close(unit = 5)
+!
+         do i = 1, n+1
+            colptr(i) = colptr(i) - 1
+         enddo
+         do i = 1, nnz
+            rowind(i) = rowind(i) - 1
+         enddo
+      endif
+
+! Distribute the matrix to the process grid
+      call  f_zcreate_dist_matrix(A, m, n, nnz, values, rowind, colptr, grid)
+
+! Setup the right hand side: all ones on the locally owned rows
+      call get_CompRowLoc_Matrix(A, nrow_loc=ldb)
+      do i = 1, ldb
+         b(i) = 1.0
+      enddo
+      nrhs = 1
+      ldb4 = ldb
+
+! Set the default input options
+      call f_set_default_options(options)
+
+! Change one or more options
+!      call set_superlu_options(options,Fact=FACTORED)
+!      call set_superlu_options(options,ParSymbFact=YES)
+
+! Initialize ScalePermstruct and LUstruct
+      call get_SuperMatrix(A, nrow=m, ncol=n)
+      call f_ScalePermstructInit(m, n, ScalePermstruct)
+      call f_LUstructInit(m, n, LUstruct)
+
+! Initialize the statistics variables
+      call f_PStatInit(stat)
+
+! Call the linear equation solver
+      call f_pzgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, &
+                     grid, LUstruct, SOLVEstruct, berr, stat, info)
+
+      if (info == 0) then
+         write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs)
+      else
+         write(*,*) 'INFO from f_pzgssvx = ', info
+      endif
+
+! Deallocate the storage allocated by SuperLU_DIST
+      call f_PStatFree(stat)
+      call f_Destroy_CompRowLoc_Mat_dist(A)
+      call f_ScalePermstructFree(ScalePermstruct)
+      call f_Destroy_LU(n, grid, LUstruct)
+      call f_LUstructFree(LUstruct)
+      call get_superlu_options(options, SolveInitialized=init)
+      if (init == YES) then
+         call f_zSolveFinalize(options, SOLVEstruct)
+      endif
+
+! Release the SuperLU process grid
+100   call f_superlu_gridexit(grid)
+
+! Deallocate the C structures pointed to by the Fortran handles
+      call f_destroy_gridinfo_handle(grid)
+      call f_destroy_options_handle(options)
+      call f_destroy_ScalePerm_handle(ScalePermstruct)
+      call f_destroy_LUstruct_handle(LUstruct)
+      call f_destroy_SOLVEstruct_handle(SOLVEstruct)
+      call f_destroy_SuperMatrix_handle(A)
+      call f_destroy_SuperLUStat_handle(stat)
+
+! Check malloc
+!      call f_check_malloc(iam)
+
+
+! Terminate the MPI execution environment
+      call mpi_finalize(ierr)
+
+      stop
+      end
diff --git a/FORTRAN/hbcode1.f.bak b/FORTRAN/hbcode1.f.bak
new file mode 100644
index 0000000..df63c6c
--- /dev/null
+++ b/FORTRAN/hbcode1.f.bak
@@ -0,0 +1,46 @@
+      subroutine hbcode1(nrow, ncol, nnzero, values, rowind, colptr)
+
+C     ================================================================
+C     Read a sparse matrix in the standard (Harwell-Boeing) format
+C     from the default input unit.
+C
+C     On return:
+C       nrow, ncol, nnzero  dimensions and entry count from the header
+C       colptr(1:ncol+1)    column pointers
+C       rowind(1:nnzero)    row indices
+C       values(1:nnzero)    numerical values; only read when the
+C                           header announces value cards (valcrd > 0)
+C     ================================================================
+
+      integer        nrow, ncol, nnzero
+      integer        colptr (*), rowind (*)
+      real*8         values (*)
+
+C     ... header fields that are only needed locally
+      character      title*72, key*8, mxtype*3
+      character      ptrfmt*16, indfmt*16, valfmt*20, rhsfmt*20
+      integer        totcrd, ptrcrd, indcrd, valcrd, rhscrd, neltvl
+      integer        i
+
+C     ... header block: four records
+      read ( *, 1000 ) title , key   ,
+     1                 totcrd, ptrcrd, indcrd, valcrd, rhscrd,
+     2                 mxtype, nrow  , ncol  , nnzero, neltvl,
+     3                 ptrfmt, indfmt, valfmt, rhsfmt
+ 1000 FORMAT ( A72, A8 / 5I14 / A3, 11X, 4I14 / 2A16, 2A20 )
+
+C     ... matrix structure, using the formats named in the header
+      read ( *, ptrfmt ) ( colptr (i), i = 1, ncol+1 )
+      read ( *, indfmt ) ( rowind (i), i = 1, nnzero )
+
+C     ... matrix values, when present
+      if  ( valcrd .gt. 0 )
+     1    read ( *, valfmt ) ( values (i), i = 1, nnzero )
+
+      return
+      end
diff --git a/FORTRAN/sp_ienv.c b/FORTRAN/sp_ienv.c
new file mode 100644
index 0000000..3366671
--- /dev/null
+++ b/FORTRAN/sp_ienv.c
@@ -0,0 +1,121 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Chooses machine-dependent parameters for the local environment
+ */
+/*
+ * File name:		sp_ienv.c
+ * History:             Modified from lapack routine ILAENV
+ */
+#include "superlu_ddefs.h"
+#include "machines.h"
+
+/*! \brief
+
+<pre>
+    Purpose   
+    =======   
+
+    sp_ienv_dist() is inquired to choose machine-dependent parameters for the
+    local environment. See ISPEC for a description of the parameters.   
+
+    This version provides a set of parameters which should give good,   
+    but not optimal, performance on many of the currently available   
+    computers.  Users are encouraged to modify this subroutine to set   
+    the tuning parameters for their particular machine using the option   
+    and problem size information in the arguments.   
+
+    Arguments   
+    =========   
+
+    ISPEC   (input) int
+            Specifies the parameter to be returned as the value of SP_IENV_DIST.   
+            = 1: the panel size w; a panel consists of w consecutive
+	         columns of matrix A in the process of Gaussian elimination.
+		 The best value depends on machine's cache characters.
+            = 2: the relaxation parameter relax; if the number of
+	         nodes (columns) in a subtree of the elimination tree is less
+		 than relax, this subtree is considered as one supernode,
+		 regardless of their row structures.
+            = 3: the maximum size for a supernode, which must be greater
+                 than or equal to relaxation parameter (see case 2);
+	    = 4: the minimum row dimension for 2-D blocking to be used;
+	    = 5: the minimum column dimension for 2-D blocking to be used;
+	    = 6: the estimated fills factor for the adjacency structures 
+	         of L and U, compared with A;
+	    = 7: the minimum value of the product M*N*K for a GEMM call
+	         to be off-loaded to accelerator (e.g., GPU, Xeon Phi).
+	    
+   (SP_IENV_DIST) (output) int
+            >= 0: the value of the parameter specified by ISPEC   
+            < 0:  if SP_IENV_DIST = -k, the k-th argument had an illegal value.
+  
+    ===================================================================== 
+</pre>
+*/
+
+
+#include <stdlib.h>
+#include <stdio.h>
+
+
+/*
+ * Return the tuning parameter selected by ispec.
+ *
+ * Handled values: 2 and 3 (fixed numbers on CRAY_T3E / IBM builds,
+ * otherwise taken from the environment variables NREL / NSUP with
+ * fallbacks 1 / 128), 6 (always 5) and 7 (environment variable N_GEMM,
+ * fallback 10000).  Any other ispec is reported through xerr_dist()
+ * as an illegal first argument and 0 is returned.
+ */
+int_t
+sp_ienv_dist(int_t ispec)
+{
+    int bad_arg = 1;      /* position of the offending argument */
+    const char *env;      /* value of the environment override, if any */
+
+    switch (ispec) {
+#if ( MACH==CRAY_T3E )
+	case 2: return 6;
+	case 3: return 30;
+
+#elif ( MACH==IBM )
+	case 2: return 20;
+	case 3: return 100;
+#else
+	case 2:
+	    env = getenv("NREL");
+	    return env ? atoi(env) : 1;
+
+	case 3:
+	    env = getenv("NSUP");
+	    return env ? atoi(env) : 128;
+
+#endif
+	case 6:
+	    return 5;
+
+	case 7:
+	    env = getenv("N_GEMM");
+	    return env ? atoi(env) : 10000;
+
+	default:
+	    break;
+    }
+
+    /* Invalid value for ISPEC */
+    xerr_dist("sp_ienv", &bad_arg);
+    return 0;
+
+} /* sp_ienv_dist */
+
diff --git a/FORTRAN/superlu_c2f_dwrap.c b/FORTRAN/superlu_c2f_dwrap.c
new file mode 100644
index 0000000..5853089
--- /dev/null
+++ b/FORTRAN/superlu_c2f_dwrap.c
@@ -0,0 +1,332 @@
+
+
+/*! @file 
+ * \brief C interface functions for the Fortran90 wrapper.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 2012
+ * April 5, 2015
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+#include "Cnames.h"
+
+/* kind of integer to hold a pointer.  A 64-bit integer (long long int)
+   is selected below; switch the #if to use a 32-bit int instead.
+   If changed, be sure to change it in superlupara.f90 too */
+
+#if 0
+typedef int fptr;  /* 32-bit */
+#else
+typedef long long int fptr;  /* 64-bit */
+#endif
+
+
+/* some MPI implementations may require conversion between a Fortran
+   communicator and a C communicator.  This routine is used to perform the
+   conversion.  It may need different forms for different MPI libraries. */
+
+/* NO_MPI2 should be defined on the compiler command line if the MPI
+   library does not provide MPI_Comm_f2c */
+
+/* Convert the Fortran communicator stored in *f_comm to a C MPI_Comm. */
+MPI_Comm f2c_comm(int *f_comm)
+{
+#ifdef NO_MPI2
+
+/* No MPI_Comm_f2c available: special cases may be needed here; when in
+   doubt, the input value is simply cast to MPI_Comm. */
+   return (MPI_Comm)(*f_comm);
+#else
+
+/* Standard MPI-2 conversion */
+   MPI_Fint fortran_comm = (MPI_Fint)(*f_comm);
+
+   return MPI_Comm_f2c(fortran_comm);
+#endif
+}
+
+
+/* functions that create memory for a struct and return a handle */
+
+/* Allocate a gridinfo_t with SUPERLU_MALLOC and store its address,
+   converted to fptr, in *handle. */
+void f_create_gridinfo_handle(fptr *handle)
+{
+   gridinfo_t *mem = (gridinfo_t *) SUPERLU_MALLOC(sizeof(gridinfo_t));
+
+   *handle = (fptr) mem;
+}
+
+/* Allocate a superlu_dist_options_t with SUPERLU_MALLOC and store its
+   address, converted to fptr, in *handle. */
+void f_create_options_handle(fptr *handle)
+{
+   superlu_dist_options_t *mem =
+      (superlu_dist_options_t *) SUPERLU_MALLOC(sizeof(superlu_dist_options_t));
+
+   *handle = (fptr) mem;
+}
+
+/* Allocate a ScalePermstruct_t with SUPERLU_MALLOC and store its
+   address, converted to fptr, in *handle. */
+void f_create_ScalePerm_handle(fptr *handle)
+{
+   ScalePermstruct_t *mem =
+      (ScalePermstruct_t *) SUPERLU_MALLOC(sizeof(ScalePermstruct_t));
+
+   *handle = (fptr) mem;
+}
+
+/* Allocate a LUstruct_t with SUPERLU_MALLOC and store its address,
+   converted to fptr, in *handle. */
+void f_create_LUstruct_handle(fptr *handle)
+{
+   LUstruct_t *mem = (LUstruct_t *) SUPERLU_MALLOC(sizeof(LUstruct_t));
+
+   *handle = (fptr) mem;
+}
+
+/* Allocate a SOLVEstruct_t with SUPERLU_MALLOC and store its address,
+   converted to fptr, in *handle. */
+void f_create_SOLVEstruct_handle(fptr *handle)
+{
+   SOLVEstruct_t *mem =
+      (SOLVEstruct_t *) SUPERLU_MALLOC(sizeof(SOLVEstruct_t));
+
+   *handle = (fptr) mem;
+}
+
+/* Allocate a SuperMatrix with SUPERLU_MALLOC and store its address,
+   converted to fptr, in *handle. */
+void f_create_SuperMatrix_handle(fptr *handle)
+{
+   SuperMatrix *mem = (SuperMatrix *) SUPERLU_MALLOC(sizeof(SuperMatrix));
+
+   *handle = (fptr) mem;
+}
+
+/* Allocate a SuperLUStat_t with SUPERLU_MALLOC and store its address,
+   converted to fptr, in *handle. */
+void f_create_SuperLUStat_handle(fptr *handle)
+{
+   SuperLUStat_t *mem =
+      (SuperLUStat_t *) SUPERLU_MALLOC(sizeof(SuperLUStat_t));
+
+   *handle = (fptr) mem;
+}
+
+/* functions that free the memory allocated by the above functions */
+
+/* Free the memory whose address is stored in the gridinfo handle.
+   *handle itself is left unchanged. */
+void f_destroy_gridinfo_handle(fptr *handle)
+{
+   void *mem = (void *) *handle;
+
+   SUPERLU_FREE(mem);
+}
+
+/* Free the memory whose address is stored in the options handle.
+   *handle itself is left unchanged. */
+void f_destroy_options_handle(fptr *handle)
+{
+   void *mem = (void *) *handle;
+
+   SUPERLU_FREE(mem);
+}
+
+/* Free the memory whose address is stored in the ScalePermstruct
+   handle.  *handle itself is left unchanged. */
+void f_destroy_ScalePerm_handle(fptr *handle)
+{
+   void *mem = (void *) *handle;
+
+   SUPERLU_FREE(mem);
+}
+
+/* Free the memory whose address is stored in the LUstruct handle.
+   *handle itself is left unchanged. */
+void f_destroy_LUstruct_handle(fptr *handle)
+{
+   void *mem = (void *) *handle;
+
+   SUPERLU_FREE(mem);
+}
+
+/* Free the memory whose address is stored in the SOLVEstruct handle.
+   *handle itself is left unchanged. */
+void f_destroy_SOLVEstruct_handle(fptr *handle)
+{
+   void *mem = (void *) *handle;
+
+   SUPERLU_FREE(mem);
+}
+
+/* Free the memory whose address is stored in the SuperMatrix handle.
+   *handle itself is left unchanged. */
+void f_destroy_SuperMatrix_handle(fptr *handle)
+{
+   void *mem = (void *) *handle;
+
+   SUPERLU_FREE(mem);
+}
+
+/* Free the memory whose address is stored in the SuperLUStat handle.
+   *handle itself is left unchanged. */
+void f_destroy_SuperLUStat_handle(fptr *handle)
+{
+   void *mem = (void *) *handle;
+
+   SUPERLU_FREE(mem);
+}
+
+/* functions that get or set values in a C struct.
+   This is not the complete set of structs for which a user might want
+   to get/set a component, and there may be missing components. */
+
+/* Copy the iam, nprow and npcol fields of the grid behind the handle
+   into the caller's variables. */
+void f_get_gridinfo(fptr *grid, int *iam, int_t *nprow, int_t *npcol)
+{
+  const gridinfo_t *g = (gridinfo_t *) *grid;
+
+  *iam   = g->iam;
+  *npcol = g->npcol;
+  *nprow = g->nprow;
+}
+
+/* Return the nrow and ncol fields of the SuperMatrix behind the handle. */
+void f_get_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
+{
+   const SuperMatrix *mat = (SuperMatrix *) *A;
+
+   *nrow = mat->nrow;
+   *ncol = mat->ncol;
+}
+
+/* Overwrite the nrow and ncol fields of the SuperMatrix behind the
+   handle with *nrow and *ncol. */
+void f_set_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
+{
+   SuperMatrix *mat = (SuperMatrix *) *A;
+
+   mat->nrow = *nrow;
+   mat->ncol = *ncol;
+}
+
+/* Read nrow/ncol from the SuperMatrix behind the handle and m_loc,
+   nnz_loc and fst_row from its Store, which is accessed as an
+   NRformat_loc. */
+void f_get_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
+			     int_t *m_loc, int_t *fst_row)
+{
+  const SuperMatrix *mat = (SuperMatrix *) *A;
+  const NRformat_loc *store = (NRformat_loc *) mat->Store;
+
+  *m = mat->nrow;
+  *n = mat->ncol;
+  *m_loc = store->m_loc;
+  *nnz_loc = store->nnz_loc;
+  *fst_row = store->fst_row;
+}
+
+/* Write nrow/ncol into the SuperMatrix behind the handle and m_loc,
+   nnz_loc and fst_row into its Store, which is accessed as an
+   NRformat_loc. */
+void f_set_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
+			     int_t *m_loc, int_t *fst_row)
+{
+  SuperMatrix *mat = (SuperMatrix *) *A;
+  NRformat_loc *store;
+
+  mat->nrow = *m;
+  mat->ncol = *n;
+
+  store = (NRformat_loc *) mat->Store;
+  store->m_loc = *m_loc;
+  store->nnz_loc = *nnz_loc;
+  store->fst_row = *fst_row;
+}
+
+/* Copy eleven fields of the options structure behind the handle into
+   the caller's integers; each field is converted with an (int) cast. */
+void f_get_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
+                           int *ColPerm, int *RowPerm, int *IterRefine,
+			   int *Trans, int *ReplaceTinyPivot,
+			   int *SolveInitialized, int *RefineInitialized,
+			   int *PrintStat)
+{
+   const superlu_dist_options_t *o = (superlu_dist_options_t *) *opt;
+
+   *Fact              = (int) o->Fact;
+   *Equil             = (int) o->Equil;
+   *ParSymbFact       = (int) o->ParSymbFact;
+   *ColPerm           = (int) o->ColPerm;
+   *RowPerm           = (int) o->RowPerm;
+   *IterRefine        = (int) o->IterRefine;
+   *Trans             = (int) o->Trans;
+   *ReplaceTinyPivot  = (int) o->ReplaceTinyPivot;
+   *SolveInitialized  = (int) o->SolveInitialized;
+   *RefineInitialized = (int) o->RefineInitialized;
+   *PrintStat         = (int) o->PrintStat;
+}
+
+/* Store the eleven integer arguments into the options structure behind
+   the handle, casting each one to the type of its field. */
+void f_set_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
+                           int *ColPerm, int *RowPerm, int *IterRefine,
+			   int *Trans, int *ReplaceTinyPivot,
+			   int *SolveInitialized, int *RefineInitialized,
+			   int *PrintStat)
+{
+   superlu_dist_options_t *o = (superlu_dist_options_t *) *opt;
+
+   o->Fact              = (fact_t) *Fact;
+   o->Equil             = (yes_no_t) *Equil;
+   o->ParSymbFact       = (yes_no_t) *ParSymbFact;
+   o->ColPerm           = (colperm_t) *ColPerm;
+   o->RowPerm           = (rowperm_t) *RowPerm;
+   o->IterRefine        = (IterRefine_t) *IterRefine;
+   o->Trans             = (trans_t) *Trans;
+   o->ReplaceTinyPivot  = (yes_no_t) *ReplaceTinyPivot;
+   o->SolveInitialized  = (yes_no_t) *SolveInitialized;
+   o->RefineInitialized = (yes_no_t) *RefineInitialized;
+   o->PrintStat         = (yes_no_t) *PrintStat;
+}
+
+/* wrappers for SuperLU functions */
+
+/* Forward to set_default_options_dist() for the options structure
+   behind the handle. */
+void f_set_default_options(fptr *options)
+{
+   superlu_dist_options_t *opts = (superlu_dist_options_t *) *options;
+
+   set_default_options_dist(opts);
+}
+
+/* Convert the Fortran communicator with f2c_comm() and forward to
+   superlu_gridinit() for the grid behind the handle. */
+void f_superlu_gridinit(int *Bcomm, int_t *nprow, int_t *npcol, fptr *grid)
+{
+   gridinfo_t *g = (gridinfo_t *) *grid;
+
+   superlu_gridinit(f2c_comm(Bcomm), *nprow, *npcol, g);
+}
+
+/* Convert the Fortran communicator with f2c_comm() and forward to
+   superlu_gridmap(); usermap is passed through as a pointer, the other
+   integer arguments by value. */
+void f_superlu_gridmap(int *Bcomm, int_t *nprow, int_t *npcol, 
+                       int_t *usermap, int_t *ldumap,
+	 fptr *grid)
+{
+   gridinfo_t *g = (gridinfo_t *) *grid;
+
+   superlu_gridmap(f2c_comm(Bcomm), *nprow, *npcol, usermap, *ldumap, g);
+}
+
+/* Forward to superlu_gridexit() for the grid behind the handle. */
+void f_superlu_gridexit(fptr *grid)
+{
+   gridinfo_t *g = (gridinfo_t *) *grid;
+
+   superlu_gridexit(g);
+}
+
+/* Forward to ScalePermstructInit() with the dimensions *m and *n and
+   the structure behind the handle. */
+void f_ScalePermstructInit(int_t *m, int_t *n, fptr *ScalePermstruct)
+{
+   ScalePermstruct_t *sp = (ScalePermstruct_t *) *ScalePermstruct;
+
+   ScalePermstructInit(*m, *n, sp);
+}
+
+/* Forward to ScalePermstructFree() for the structure behind the handle. */
+void f_ScalePermstructFree(fptr *ScalePermstruct)
+{
+   ScalePermstruct_t *sp = (ScalePermstruct_t *) *ScalePermstruct;
+
+   ScalePermstructFree(sp);
+}
+
+/* Forward to PStatInit() for the statistics structure behind the handle. */
+void f_PStatInit(fptr *stat)
+{
+   SuperLUStat_t *st = (SuperLUStat_t *) *stat;
+
+   PStatInit(st);
+}
+
+/* Forward to PStatFree() for the statistics structure behind the handle. */
+void f_PStatFree(fptr *stat)
+{
+   SuperLUStat_t *st = (SuperLUStat_t *) *stat;
+
+   PStatFree(st);
+}
+
+/* Forward to LUstructInit() for the structure behind the handle.
+   NOTE(review): only *m is passed on and the parameter n is unused;
+   confirm which dimension LUstructInit() expects for non-square input. */
+void f_LUstructInit(int_t *m, int_t *n, fptr *LUstruct)
+{
+   extern void LUstructInit(const int_t, LUstruct_t *);
+   LUstruct_t *lu = (LUstruct_t *) *LUstruct;
+
+   LUstructInit(*m, lu);
+}
+
+/* Forward to LUstructFree() for the structure behind the handle. */
+void f_LUstructFree(fptr *LUstruct)
+{
+   extern void LUstructFree(LUstruct_t *);
+   LUstruct_t *lu = (LUstruct_t *) *LUstruct;
+
+   LUstructFree(lu);
+}
+
+/* Forward to Destroy_LU() with *n and the grid and LU structures
+   behind the two handles. */
+void f_Destroy_LU(int_t *n, fptr *grid, fptr *LUstruct)
+{
+   gridinfo_t *g = (gridinfo_t *) *grid;
+   LUstruct_t *lu = (LUstruct_t *) *LUstruct;
+
+   Destroy_LU(*n, g, lu);
+}
+
+/* Forward to dCreate_CompRowLoc_Matrix_dist() for the SuperMatrix behind
+   the handle.  Scalar arguments are dereferenced, the three arrays are
+   passed through as pointers, and stype/dtype/mtype are cast to their
+   enumeration types. */
+void f_dCreate_CompRowLoc_Mat_dist(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
+				   int_t *m_loc, int_t *fst_row, double *nzval,
+				   int_t *colind, int_t *rowptr, int *stype,
+				   int *dtype, int *mtype)
+{
+   SuperMatrix *mat = (SuperMatrix *) *A;
+   Stype_t storage = (Stype_t) *stype;
+   Dtype_t data = (Dtype_t) *dtype;
+   Mtype_t math = (Mtype_t) *mtype;
+
+   dCreate_CompRowLoc_Matrix_dist(mat, *m, *n, *nnz_loc, *m_loc, *fst_row,
+                                  (double *) nzval, colind, rowptr,
+                                  storage, data, math);
+}
+
+/* Forward to Destroy_CompRowLoc_Matrix_dist() for the SuperMatrix
+   behind the handle. */
+void f_Destroy_CompRowLoc_Mat_dist(fptr *A)
+{
+   SuperMatrix *mat = (SuperMatrix *) *A;
+
+   Destroy_CompRowLoc_Matrix_dist(mat);
+}
+
+/* Forward to Destroy_SuperMatrix_Store_dist() for the SuperMatrix
+   behind the handle. */
+void f_Destroy_SuperMat_Store_dist(fptr *A)
+{
+   SuperMatrix *mat = (SuperMatrix *) *A;
+
+   Destroy_SuperMatrix_Store_dist(mat);
+}
+
+/* Forward to dSolveFinalize() for the options and SOLVEstruct
+   structures behind the two handles. */
+void f_dSolveFinalize(fptr *options, fptr *SOLVEstruct)
+{
+   superlu_dist_options_t *opts = (superlu_dist_options_t *) *options;
+   SOLVEstruct_t *solve = (SOLVEstruct_t *) *SOLVEstruct;
+
+   dSolveFinalize(opts, solve);
+}
+
+/* Call pdgssvx() on the structures behind the handles, then call
+   PStatPrint() with the same options, statistics and grid.  B, berr and
+   info are passed through as pointers; ldb and nrhs by value. */
+void f_pdgssvx(fptr *options, fptr *A, fptr *ScalePermstruct, double *B,
+               int *ldb, int *nrhs, fptr *grid, fptr *LUstruct,
+               fptr *SOLVEstruct, double *berr, fptr *stat, int *info)
+{
+    superlu_dist_options_t *opts = (superlu_dist_options_t *) *options;
+    SuperMatrix *mat = (SuperMatrix *) *A;
+    ScalePermstruct_t *sp = (ScalePermstruct_t *) *ScalePermstruct;
+    gridinfo_t *g = (gridinfo_t *) *grid;
+    LUstruct_t *lu = (LUstruct_t *) *LUstruct;
+    SOLVEstruct_t *solve = (SOLVEstruct_t *) *SOLVEstruct;
+    SuperLUStat_t *st = (SuperLUStat_t *) *stat;
+
+    pdgssvx(opts, mat, sp, B, *ldb, *nrhs, g, lu, solve, berr, st, info);
+
+    PStatPrint(opts, st, g);
+}
+
+/* Create the distributed matrix */
+
+/* Forward to dcreate_dist_matrix() for the SuperMatrix and grid behind
+   the handles.  The int returned by dcreate_dist_matrix() is discarded. */
+void f_dcreate_dist_matrix(fptr *A, int_t *m, int_t *n, int_t *nnz,
+			   double *nzval, int_t *rowind, int_t *colptr,
+			   fptr *grid)
+{
+   int dcreate_dist_matrix(SuperMatrix *, int_t, int_t, int_t, double *,
+			   int_t * , int_t *, gridinfo_t *);
+   SuperMatrix *mat = (SuperMatrix *) *A;
+   gridinfo_t *g = (gridinfo_t *) *grid;
+
+   dcreate_dist_matrix(mat, (int_t) *m, *n, *nnz, (double *) nzval,
+		       (int_t *) rowind, (int_t *) colptr, g);
+}
+
+/* Check malloc */
+
+/* Invoke CHECK_MALLOC for process *iam when DEBUGlevel >= 1; otherwise
+   do nothing. */
+void f_check_malloc(int *iam)
+{
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC((int_t) *iam, "Check Malloc");
+#else
+    (void) iam;   /* argument unused in non-debug builds */
+#endif
+}
diff --git a/FORTRAN/superlu_c2f_zwrap.c b/FORTRAN/superlu_c2f_zwrap.c
new file mode 100644
index 0000000..290b590
--- /dev/null
+++ b/FORTRAN/superlu_c2f_zwrap.c
@@ -0,0 +1,331 @@
+
+/*! @file 
+ * \brief C interface functions for the Fortran90 wrapper.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 2012
+ * April 5, 2015
+ * </pre>
+ */
+
+#include "superlu_zdefs.h"
+#include "Cnames.h"
+
+/* Kind of integer used to hold a pointer (the "handle" type passed to and
+   from Fortran).  The active branch below selects a 64-bit long long int;
+   change the "#if 0" to "#if 1" to fall back to a 32-bit int.
+   If changed, be sure to change superlu_ptr in superlupara.f90 too. */
+
+#if 0
+typedef int fptr;  /* 32-bit */
+#else
+typedef long long int fptr;  /* 64-bit */
+#endif
+
+
+/* some MPI implementations may require conversion between a Fortran
+   communicator and a C communicator.  This routine is used to perform the
+   conversion.  It may need different forms for different MPI libraries. */
+
+/* NO_MPI2 should be defined on the compiler command line if the MPI
+   library does not provide MPI_Comm_f2c */
+
+/* Convert the Fortran communicator value pointed to by f_comm into a C
+   MPI_Comm. */
+MPI_Comm f2c_comm(int *f_comm)
+{
+#ifdef NO_MPI2
+   /* No MPI_Comm_f2c available: when in doubt, just return the input.
+      Special cases for particular MPI libraries would go here. */
+   return (MPI_Comm)(*f_comm);
+#else
+   /* MPI 2 provides a standard way of doing this */
+   MPI_Fint fortran_comm = (MPI_Fint)(*f_comm);
+
+   return MPI_Comm_f2c(fortran_comm);
+#endif
+}
+
+
+/* functions that create memory for a struct and return a handle */
+
+/* Allocate room for one gridinfo_t with SUPERLU_MALLOC and store its
+   address, converted to fptr, in *handle. */
+void f_create_gridinfo_handle(fptr *handle)
+{
+   void *storage = SUPERLU_MALLOC(sizeof(gridinfo_t));
+
+   *handle = (fptr) storage;
+}
+
+/* Allocate room for one superlu_dist_options_t with SUPERLU_MALLOC and
+   store its address, converted to fptr, in *handle. */
+void f_create_options_handle(fptr *handle)
+{
+   void *storage = SUPERLU_MALLOC(sizeof(superlu_dist_options_t));
+
+   *handle = (fptr) storage;
+}
+
+/* Allocate room for one ScalePermstruct_t with SUPERLU_MALLOC and store
+   its address, converted to fptr, in *handle. */
+void f_create_ScalePerm_handle(fptr *handle)
+{
+   void *storage = SUPERLU_MALLOC(sizeof(ScalePermstruct_t));
+
+   *handle = (fptr) storage;
+}
+
+/* Allocate room for one LUstruct_t with SUPERLU_MALLOC and store its
+   address, converted to fptr, in *handle. */
+void f_create_LUstruct_handle(fptr *handle)
+{
+   void *storage = SUPERLU_MALLOC(sizeof(LUstruct_t));
+
+   *handle = (fptr) storage;
+}
+
+/* Allocate room for one SOLVEstruct_t with SUPERLU_MALLOC and store its
+   address, converted to fptr, in *handle. */
+void f_create_SOLVEstruct_handle(fptr *handle)
+{
+   void *storage = SUPERLU_MALLOC(sizeof(SOLVEstruct_t));
+
+   *handle = (fptr) storage;
+}
+
+/* Allocate room for one SuperMatrix with SUPERLU_MALLOC and store its
+   address, converted to fptr, in *handle. */
+void f_create_SuperMatrix_handle(fptr *handle)
+{
+   void *storage = SUPERLU_MALLOC(sizeof(SuperMatrix));
+
+   *handle = (fptr) storage;
+}
+
+/* Allocate room for one SuperLUStat_t with SUPERLU_MALLOC and store its
+   address, converted to fptr, in *handle. */
+void f_create_SuperLUStat_handle(fptr *handle)
+{
+   void *storage = SUPERLU_MALLOC(sizeof(SuperLUStat_t));
+
+   *handle = (fptr) storage;
+}
+
+/* functions that free the memory allocated by the above functions */
+
+/* Release the gridinfo handle: converts *handle back to a pointer and
+   passes it to SUPERLU_FREE.  *handle itself is left unchanged. */
+void f_destroy_gridinfo_handle(fptr *handle)
+{
+   void *storage = (void *)*handle;
+
+   SUPERLU_FREE(storage);
+}
+
+/* Release the options handle: converts *handle back to a pointer and
+   passes it to SUPERLU_FREE.  *handle itself is left unchanged. */
+void f_destroy_options_handle(fptr *handle)
+{
+   void *storage = (void *)*handle;
+
+   SUPERLU_FREE(storage);
+}
+
+/* Release the ScalePerm handle: converts *handle back to a pointer and
+   passes it to SUPERLU_FREE.  *handle itself is left unchanged. */
+void f_destroy_ScalePerm_handle(fptr *handle)
+{
+   void *storage = (void *)*handle;
+
+   SUPERLU_FREE(storage);
+}
+
+/* Release the LUstruct handle: converts *handle back to a pointer and
+   passes it to SUPERLU_FREE.  *handle itself is left unchanged. */
+void f_destroy_LUstruct_handle(fptr *handle)
+{
+   void *storage = (void *)*handle;
+
+   SUPERLU_FREE(storage);
+}
+
+/* Release the SOLVEstruct handle: converts *handle back to a pointer and
+   passes it to SUPERLU_FREE.  *handle itself is left unchanged. */
+void f_destroy_SOLVEstruct_handle(fptr *handle)
+{
+   void *storage = (void *)*handle;
+
+   SUPERLU_FREE(storage);
+}
+
+/* Release the SuperMatrix handle: converts *handle back to a pointer and
+   passes it to SUPERLU_FREE.  *handle itself is left unchanged. */
+void f_destroy_SuperMatrix_handle(fptr *handle)
+{
+   void *storage = (void *)*handle;
+
+   SUPERLU_FREE(storage);
+}
+
+/* Release the SuperLUStat handle: converts *handle back to a pointer and
+   passes it to SUPERLU_FREE.  *handle itself is left unchanged. */
+void f_destroy_SuperLUStat_handle(fptr *handle)
+{
+   void *storage = (void *)*handle;
+
+   SUPERLU_FREE(storage);
+}
+
+/* functions that get or set values in a C struct.
+   This is not the complete set of structs for which a user might want
+   to get/set a component, and there may be missing components. */
+
+/* Copy the iam, npcol and nprow members of the gridinfo_t behind the
+   handle into the caller's variables. */
+void f_get_gridinfo(fptr *grid, int *iam, int_t *nprow, int_t *npcol)
+{
+  gridinfo_t *g = (gridinfo_t *) *grid;
+
+  *iam   = g->iam;
+  *npcol = g->npcol;
+  *nprow = g->nprow;
+}
+
+/* Read the nrow and ncol members of the SuperMatrix behind handle A. */
+void f_get_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
+{
+   SuperMatrix *mat = (SuperMatrix *) *A;
+
+   *nrow = mat->nrow;
+   *ncol = mat->ncol;
+}
+
+/* Overwrite the nrow and ncol members of the SuperMatrix behind handle A. */
+void f_set_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
+{
+   SuperMatrix *mat = (SuperMatrix *) *A;
+
+   mat->nrow = *nrow;
+   mat->ncol = *ncol;
+}
+
+/* Read the global dimensions from the SuperMatrix behind handle A, and
+   m_loc, nnz_loc and fst_row from its Store viewed as NRformat_loc. */
+void f_get_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
+                             int_t *m_loc, int_t *fst_row)
+{
+  SuperMatrix *mat = (SuperMatrix *) *A;
+  NRformat_loc *store;
+
+  *m = mat->nrow;
+  *n = mat->ncol;
+
+  store = (NRformat_loc *) mat->Store;
+  *m_loc   = store->m_loc;
+  *nnz_loc = store->nnz_loc;
+  *fst_row = store->fst_row;
+}
+
+/* Overwrite the global dimensions of the SuperMatrix behind handle A, and
+   m_loc, nnz_loc and fst_row in its Store viewed as NRformat_loc. */
+void f_set_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
+                             int_t *m_loc, int_t *fst_row)
+{
+  SuperMatrix *mat = (SuperMatrix *) *A;
+  NRformat_loc *store;
+
+  mat->nrow = *m;
+  mat->ncol = *n;
+
+  store = (NRformat_loc *) mat->Store;
+  store->m_loc   = *m_loc;
+  store->nnz_loc = *nnz_loc;
+  store->fst_row = *fst_row;
+}
+
+/* Copy eleven members of the superlu_dist_options_t behind handle opt into
+   the caller's int variables (each value is converted with an int cast). */
+void f_get_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
+                           int *ColPerm, int *RowPerm, int *IterRefine,
+                           int *Trans, int *ReplaceTinyPivot,
+                           int *SolveInitialized, int *RefineInitialized,
+                           int *PrintStat)
+{
+   superlu_dist_options_t *o = (superlu_dist_options_t *) *opt;
+
+   *Fact              = (int) o->Fact;
+   *Equil             = (int) o->Equil;
+   *ParSymbFact       = (int) o->ParSymbFact;
+   *ColPerm           = (int) o->ColPerm;
+   *RowPerm           = (int) o->RowPerm;
+   *IterRefine        = (int) o->IterRefine;
+   *Trans             = (int) o->Trans;
+   *ReplaceTinyPivot  = (int) o->ReplaceTinyPivot;
+   *SolveInitialized  = (int) o->SolveInitialized;
+   *RefineInitialized = (int) o->RefineInitialized;
+   *PrintStat         = (int) o->PrintStat;
+}
+
+/* Store eleven int values into the superlu_dist_options_t behind handle
+   opt, casting each one to the enum type of the member it is assigned to. */
+void f_set_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
+                           int *ColPerm, int *RowPerm, int *IterRefine,
+                           int *Trans, int *ReplaceTinyPivot,
+                           int *SolveInitialized, int *RefineInitialized,
+                           int *PrintStat)
+{
+   superlu_dist_options_t *o = (superlu_dist_options_t *) *opt;
+
+   o->Fact              = (fact_t) *Fact;
+   o->Equil             = (yes_no_t) *Equil;
+   o->ParSymbFact       = (yes_no_t) *ParSymbFact;
+   o->ColPerm           = (colperm_t) *ColPerm;
+   o->RowPerm           = (rowperm_t) *RowPerm;
+   o->IterRefine        = (IterRefine_t) *IterRefine;
+   o->Trans             = (trans_t) *Trans;
+   o->ReplaceTinyPivot  = (yes_no_t) *ReplaceTinyPivot;
+   o->SolveInitialized  = (yes_no_t) *SolveInitialized;
+   o->RefineInitialized = (yes_no_t) *RefineInitialized;
+   o->PrintStat         = (yes_no_t) *PrintStat;
+}
+
+/* wrappers for SuperLU functions */
+
+/* Pass the options struct behind the handle to set_default_options_dist(). */
+void f_set_default_options(fptr *options)
+{
+   superlu_dist_options_t *opts = (superlu_dist_options_t *) *options;
+
+   set_default_options_dist(opts);
+}
+
+/* Convert the Fortran communicator with f2c_comm() and call
+   superlu_gridinit() on the gridinfo_t behind the handle. */
+void f_superlu_gridinit(int *Bcomm, int_t *nprow, int_t *npcol, fptr *grid)
+{
+   MPI_Comm    base_comm = f2c_comm(Bcomm);
+   gridinfo_t *proc_grid = (gridinfo_t *) *grid;
+
+   superlu_gridinit(base_comm, *nprow, *npcol, proc_grid);
+}
+
+/* Convert the Fortran communicator with f2c_comm() and call
+   superlu_gridmap() with the caller's usermap array and *ldumap on the
+   gridinfo_t behind the handle. */
+void f_superlu_gridmap(int *Bcomm, int_t *nprow, int_t *npcol,
+                       int_t *usermap, int_t *ldumap,
+                       fptr *grid)
+{
+   MPI_Comm    base_comm = f2c_comm(Bcomm);
+   gridinfo_t *proc_grid = (gridinfo_t *) *grid;
+
+   superlu_gridmap(base_comm, *nprow, *npcol, usermap, *ldumap, proc_grid);
+}
+
+/* Pass the gridinfo_t behind the handle to superlu_gridexit(). */
+void f_superlu_gridexit(fptr *grid)
+{
+   gridinfo_t *proc_grid = (gridinfo_t *) *grid;
+
+   superlu_gridexit(proc_grid);
+}
+
+/* Call ScalePermstructInit() with *m, *n and the struct behind the handle. */
+void f_ScalePermstructInit(int_t *m, int_t *n, fptr *ScalePermstruct)
+{
+   ScalePermstruct_t *scale_perm = (ScalePermstruct_t *) *ScalePermstruct;
+
+   ScalePermstructInit(*m, *n, scale_perm);
+}
+
+/* Pass the ScalePermstruct_t behind the handle to ScalePermstructFree(). */
+void f_ScalePermstructFree(fptr *ScalePermstruct)
+{
+   ScalePermstruct_t *scale_perm = (ScalePermstruct_t *) *ScalePermstruct;
+
+   ScalePermstructFree(scale_perm);
+}
+
+/* Pass the SuperLUStat_t behind the handle to PStatInit(). */
+void f_PStatInit(fptr *stat)
+{
+   SuperLUStat_t *stats = (SuperLUStat_t *) *stat;
+
+   PStatInit(stats);
+}
+
+/* Pass the SuperLUStat_t behind the handle to PStatFree(). */
+void f_PStatFree(fptr *stat)
+{
+   SuperLUStat_t *stats = (SuperLUStat_t *) *stat;
+
+   PStatFree(stats);
+}
+
+/* Call LUstructInit() with *m and the LUstruct_t behind the handle.
+   The n argument is not used by this wrapper. */
+void f_LUstructInit(int_t *m, int_t *n, fptr *LUstruct)
+{
+   extern void LUstructInit(const int_t, LUstruct_t *);
+   LUstruct_t *lu = (LUstruct_t *) *LUstruct;
+
+   LUstructInit(*m, lu);
+}
+
+/* Pass the LUstruct_t behind the handle to LUstructFree(). */
+void f_LUstructFree(fptr *LUstruct)
+{
+   extern void LUstructFree(LUstruct_t *);
+   LUstruct_t *lu = (LUstruct_t *) *LUstruct;
+
+   LUstructFree(lu);
+}
+
+/* Call Destroy_LU() with *n and the grid and LU structs behind the two
+   handles. */
+void f_Destroy_LU(int_t *n, fptr *grid, fptr *LUstruct)
+{
+   gridinfo_t *proc_grid = (gridinfo_t *) *grid;
+   LUstruct_t *lu        = (LUstruct_t *) *LUstruct;
+
+   Destroy_LU(*n, proc_grid, lu);
+}
+
+/* Fortran-callable wrapper around zCreate_CompRowLoc_Matrix_dist():
+   dereferences the scalar arguments, converts the three int type codes to
+   Stype_t / Dtype_t / Mtype_t, and passes the arrays through unchanged. */
+void f_zCreate_CompRowLoc_Mat_dist(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
+                                   int_t *m_loc, int_t *fst_row, doublecomplex *nzval,
+                                   int_t *colind, int_t *rowptr, int *stype,
+                                   int *dtype, int *mtype)
+{
+   SuperMatrix *mat          = (SuperMatrix *) *A;
+   Stype_t      storage_type = (Stype_t) *stype;
+   Dtype_t      data_type    = (Dtype_t) *dtype;
+   Mtype_t      math_type    = (Mtype_t) *mtype;
+
+   zCreate_CompRowLoc_Matrix_dist(mat, *m, *n, *nnz_loc, *m_loc, *fst_row,
+                                  nzval, colind, rowptr,
+                                  storage_type, data_type, math_type);
+}
+
+/* Fortran-callable wrapper: resolves handle A to a SuperMatrix and passes
+   it to Destroy_CompRowLoc_Matrix_dist(). */
+void f_Destroy_CompRowLoc_Mat_dist(fptr *A)
+{
+   SuperMatrix *mat = (SuperMatrix *) *A;
+
+   Destroy_CompRowLoc_Matrix_dist(mat);
+}
+
+/* Fortran-callable wrapper: resolves handle A to a SuperMatrix and passes
+   it to Destroy_SuperMatrix_Store_dist(). */
+void f_Destroy_SuperMat_Store_dist(fptr *A)
+{
+   SuperMatrix *mat = (SuperMatrix *) *A;
+
+   Destroy_SuperMatrix_Store_dist(mat);
+}
+
+/* Fortran-callable wrapper: resolves both handles and forwards them to
+   zSolveFinalize(). */
+void f_zSolveFinalize(fptr *options, fptr *SOLVEstruct)
+{
+   superlu_dist_options_t *opts  = (superlu_dist_options_t *) *options;
+   SOLVEstruct_t          *solve = (SOLVEstruct_t *) *SOLVEstruct;
+
+   zSolveFinalize(opts, solve);
+}
+
+/* Fortran-callable driver: resolves every handle to its C struct, calls
+   pzgssvx(), then calls PStatPrint() with the same options, statistics and
+   grid. */
+void f_pzgssvx(fptr *options, fptr *A, fptr *ScalePermstruct, doublecomplex *B,
+               int *ldb, int *nrhs, fptr *grid, fptr *LUstruct,
+               fptr *SOLVEstruct, double *berr, fptr *stat, int *info)
+{
+    superlu_dist_options_t *opts       = (superlu_dist_options_t *) *options;
+    SuperMatrix            *mat        = (SuperMatrix *) *A;
+    ScalePermstruct_t      *scale_perm = (ScalePermstruct_t *) *ScalePermstruct;
+    gridinfo_t             *proc_grid  = (gridinfo_t *) *grid;
+    LUstruct_t             *lu         = (LUstruct_t *) *LUstruct;
+    SOLVEstruct_t          *solve      = (SOLVEstruct_t *) *SOLVEstruct;
+    SuperLUStat_t          *stats      = (SuperLUStat_t *) *stat;
+
+    pzgssvx(opts, mat, scale_perm, B, *ldb, *nrhs,
+            proc_grid, lu, solve, berr, stats, info);
+
+    PStatPrint(opts, stats, proc_grid);
+}
+
+/* Create the distributed matrix */
+
+/* Fortran-callable wrapper: dereferences the scalar arguments and the two
+   handles, then calls zcreate_dist_matrix().  The int result of that call
+   is discarded. */
+void f_zcreate_dist_matrix(fptr *A, int_t *m, int_t *n, int_t *nnz,
+                           doublecomplex *nzval, int_t *rowind, int_t *colptr,
+                           fptr *grid)
+{
+   int zcreate_dist_matrix(SuperMatrix *, int_t, int_t, int_t, doublecomplex *,
+                           int_t * , int_t *, gridinfo_t *);
+   SuperMatrix *local_mat = (SuperMatrix *) *A;
+   gridinfo_t  *proc_grid = (gridinfo_t *) *grid;
+
+   zcreate_dist_matrix(local_mat, *m, *n, *nnz,
+                       nzval, rowind, colptr, proc_grid);
+}
+
+/* Check malloc */
+
+/* Fortran-callable hook around the CHECK_MALLOC macro.
+   When compiled with DEBUGlevel >= 1 it invokes CHECK_MALLOC with *iam
+   (presumably the caller's process number -- confirm) and the label
+   "Check Malloc"; otherwise the body is empty. */
+void f_check_malloc(int *iam)
+{
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC((int_t) *iam, "Check Malloc");
+#endif
+}
diff --git a/FORTRAN/superlu_mod.f90 b/FORTRAN/superlu_mod.f90
new file mode 100644
index 0000000..bdfa819
--- /dev/null
+++ b/FORTRAN/superlu_mod.f90
@@ -0,0 +1,163 @@
+!> @file
+!! \brief This module contains Fortran-side wrappers for the SuperLU
+!! get/set functions.
+!
+
+module superlu_mod
+
+!----------------------------------------------------
+! This module contains Fortran-side wrappers for the SuperLU get/set
+! functions, with optional arguments so the user doesn't have to provide
+! the full set of components.
+!
+! Every set_* routine follows the same three steps: read the current
+! values through the matching f_get_* routine, override only the
+! components the caller supplied, then write the complete set back
+! through f_set_*.
+!----------------------------------------------------
+
+use superlupara_mod
+
+implicit none
+contains
+
+! Return any of iam, nprow, npcol of the grid behind the handle.
+subroutine get_GridInfo(grid, iam, nprow, npcol)
+  integer(superlu_ptr) :: grid
+  integer*4, optional :: iam
+  integer, optional :: nprow, npcol
+  integer :: l_iam, l_nprow, l_npcol
+
+  call  f_get_gridinfo(grid, l_iam, l_nprow, l_npcol)
+
+  if (present(iam)) iam = l_iam
+  if (present(nprow)) nprow = l_nprow
+  if (present(npcol)) npcol = l_npcol
+
+end subroutine get_GridInfo
+
+! Return the row and/or column count of the SuperMatrix behind the handle.
+subroutine get_SuperMatrix(A, nrow, ncol)
+  integer(superlu_ptr) :: A
+  integer, optional :: nrow, ncol
+  integer :: l_nrow, l_ncol
+
+  call f_get_SuperMatrix(A, l_nrow, l_ncol)
+
+  if (present(nrow)) nrow = l_nrow
+  if (present(ncol)) ncol = l_ncol
+
+end subroutine get_SuperMatrix
+
+! Change the row and/or column count of the SuperMatrix behind the handle;
+! components that are not supplied keep their current value.
+subroutine set_SuperMatrix(A, nrow, ncol)
+  integer(superlu_ptr) :: A
+  integer, optional :: nrow, ncol
+  integer :: l_nrow, l_ncol
+
+  call f_get_SuperMatrix(A, l_nrow, l_ncol)
+  
+  if (present(nrow)) l_nrow = nrow
+  if (present(ncol)) l_ncol = ncol
+
+  call f_set_SuperMatrix(A, l_nrow, l_ncol)
+
+end subroutine set_SuperMatrix
+
+! Return any of the five size components of a matrix stored in the
+! distributed compressed-row (NR_loc) format.
+subroutine get_CompRowLoc_Matrix(A, nrow, ncol, nnz_loc, nrow_loc, fst_row)
+  integer(superlu_ptr) :: A
+  integer, optional :: nrow, ncol, nnz_loc, nrow_loc, fst_row
+  integer :: l_nrow, l_ncol, l_nnz_loc, l_nrow_loc, l_fst_row
+
+  call f_get_CompRowLoc_Matrix(A, l_nrow, l_ncol, l_nnz_loc, l_nrow_loc, &
+                               l_fst_row)
+
+  if (present(nrow)) nrow = l_nrow
+  if (present(ncol)) ncol = l_ncol
+  if (present(nnz_loc)) nnz_loc = l_nnz_loc
+  if (present(nrow_loc)) nrow_loc = l_nrow_loc
+  if (present(fst_row)) fst_row = l_fst_row
+
+end subroutine get_CompRowLoc_Matrix
+
+! Change any of the five size components of a matrix stored in the
+! distributed compressed-row (NR_loc) format; components that are not
+! supplied keep their current value.
+subroutine set_CompRowLoc_Matrix(A, nrow, ncol, nnz_loc, nrow_loc, fst_row)
+  integer(superlu_ptr) :: A
+  integer, optional :: nrow, ncol, nnz_loc, nrow_loc, fst_row
+  integer :: l_nrow, l_ncol, l_nnz_loc, l_nrow_loc, l_fst_row
+
+  ! Start from the values currently stored in the matrix so that omitted
+  ! arguments are written back unchanged.
+  call f_get_CompRowLoc_Matrix(A, l_nrow, l_ncol, l_nnz_loc, l_nrow_loc, &
+                               l_fst_row)
+
+  if (present(nrow)) l_nrow = nrow
+  if (present(ncol)) l_ncol = ncol
+  if (present(nnz_loc)) l_nnz_loc = nnz_loc
+  if (present(nrow_loc)) l_nrow_loc = nrow_loc
+  if (present(fst_row)) l_fst_row = fst_row
+
+  ! Store the merged values; this must come after the overrides above.
+  call f_set_CompRowLoc_Matrix(A, l_nrow, l_ncol, l_nnz_loc, l_nrow_loc, &
+                               l_fst_row)
+
+end subroutine set_CompRowLoc_Matrix
+
+
+! Return any subset of the option values held in the options structure
+! behind the handle.
+subroutine get_superlu_options(opt, Fact, Equil, ParSymbFact, ColPerm, &
+     RowPerm, IterRefine, Trans, ReplaceTinyPivot, SolveInitialized, &
+     RefineInitialized, PrintStat)
+  integer(superlu_ptr) :: opt
+  integer, optional :: Fact, Equil, ParSymbFact, ColPerm, RowPerm, &
+       IterRefine, Trans, ReplaceTinyPivot, SolveInitialized, &
+       RefineInitialized, PrintStat
+!
+  integer :: l_Fact, l_Equil, l_ParSymbFact, l_ColPerm, l_RowPerm, &
+             l_IterRefine, l_Trans, l_ReplaceTinyPivot, l_SolveInitialized, &
+             l_RefineInitialized, l_PrintStat
+
+  call f_get_superlu_options(opt, l_Fact, l_Equil, l_ParSymbFact, l_ColPerm, &
+                             l_RowPerm, l_IterRefine, l_Trans,  &
+                             l_ReplaceTinyPivot, l_SolveInitialized, &
+                             l_RefineInitialized, l_PrintStat)
+
+  if (present(Fact)) Fact = l_Fact
+  if (present(Equil)) Equil = l_Equil
+  if (present(ParSymbFact)) ParSymbFact = l_ParSymbFact
+  if (present(ColPerm)) ColPerm = l_ColPerm
+  if (present(RowPerm)) RowPerm = l_RowPerm
+  if (present(IterRefine)) IterRefine = l_IterRefine
+  if (present(Trans)) Trans = l_Trans
+  if (present(ReplaceTinyPivot)) ReplaceTinyPivot = l_ReplaceTinyPivot
+  if (present(SolveInitialized)) SolveInitialized = l_SolveInitialized
+  if (present(RefineInitialized)) RefineInitialized = l_RefineInitialized
+  if (present(PrintStat)) PrintStat = l_PrintStat
+
+end subroutine get_superlu_options
+
+
+! Change any subset of the option values held in the options structure
+! behind the handle; options that are not supplied keep their current value.
+subroutine set_superlu_options(opt, Fact, Equil, ParSymbFact, ColPerm, &
+     RowPerm, IterRefine, Trans, ReplaceTinyPivot, SolveInitialized, &
+     RefineInitialized, PrintStat)
+  integer(superlu_ptr) :: opt
+  integer, optional :: Fact, Equil, ParSymbFact, ColPerm, RowPerm, &
+       IterRefine, Trans, ReplaceTinyPivot, SolveInitialized, &
+       RefineInitialized, PrintStat
+!
+  integer :: l_Fact, l_Equil, l_ParSymbFact, l_ColPerm, l_RowPerm, &
+             l_IterRefine, l_Trans, l_ReplaceTinyPivot, l_SolveInitialized, &
+             l_RefineInitialized, l_PrintStat
+
+  call f_get_superlu_options(opt, l_Fact, l_Equil, l_ParSymbFact, l_ColPerm, &
+                             l_RowPerm, l_IterRefine, l_Trans,  &
+                             l_ReplaceTinyPivot, l_SolveInitialized, &
+                             l_RefineInitialized, l_PrintStat)
+
+  if (present(Fact)) l_Fact = Fact
+  if (present(Equil)) l_Equil = Equil
+  if (present(ParSymbFact)) l_ParSymbFact = ParSymbFact
+  if (present(ColPerm)) l_ColPerm = ColPerm
+  if (present(RowPerm)) l_RowPerm = RowPerm
+  if (present(IterRefine)) l_IterRefine = IterRefine
+  if (present(Trans)) l_Trans = Trans
+  if (present(ReplaceTinyPivot)) l_ReplaceTinyPivot = ReplaceTinyPivot
+  if (present(SolveInitialized)) l_SolveInitialized = SolveInitialized
+  if (present(RefineInitialized)) l_RefineInitialized = RefineInitialized
+  if (present(PrintStat)) l_PrintStat = PrintStat
+
+  call f_set_superlu_options(opt, l_Fact, l_Equil, l_ParSymbFact, &
+                             l_ColPerm, l_RowPerm, l_IterRefine, l_Trans, &
+                             l_ReplaceTinyPivot, l_SolveInitialized, &
+                             l_RefineInitialized, l_PrintStat)
+
+end subroutine set_superlu_options
+
+end module superlu_mod
+
+
+
diff --git a/FORTRAN/superlupara.f90 b/FORTRAN/superlupara.f90
new file mode 100644
index 0000000..d246ae8
--- /dev/null
+++ b/FORTRAN/superlupara.f90
@@ -0,0 +1,91 @@
+!> @file
+!! \brief This module contains some parameter used in SuperLU for
+!! Fortran90 user.
+!
+
+module superlupara_mod
+
+!----------------------------------------------------
+! This module contains some parameters used in SuperLU by Fortran90 users:
+! the integer kind used for handles and integer constants mirroring the
+! C enumeration values.
+!----------------------------------------------------
+
+
+implicit none
+public superlu_ptr
+
+!----------------------------------------------------
+! Kind of integer used to hold a SuperLU pointer (handle).  The active
+! definition below is an 8-byte integer; the commented-out alternative
+! uses the default integer kind (32-bit).
+! If changed, be sure to change the matching fptr typedef in the C wrapper
+! sources (the superlu_c2f_*wrap.c files, e.g. superlu_c2f_zwrap.c) too.
+!
+! integer, parameter :: superlu_ptr = kind(0) ! default integer size: 32-bit
+integer, parameter :: superlu_ptr = 8 ! 64-bit
+
+!----------------------------------------------------
+! The following parameters are defined:
+
+! These values come from superlu_defs.h.  If the values in there change with
+! the version of SuperLU, then they need to be changed here, too.
+! The trailing comment on the first constant of each group names the C
+! type the group corresponds to.
+
+integer, parameter, public :: &
+                      NO                      = 0, & ! yes_no_t
+                      YES                     = 1, &
+                      DOFACT                  = 0, & ! fact_t
+                      SamePattern             = 1, &
+                      SamePattern_SameRowPerm = 2, &
+                      FACTORED                = 3, &
+                      NOROWPERM               = 0, & ! rowperm_t
+                      LargeDiag               = 1, &
+                      MY_PERMR                = 2, &
+                      NATURAL                 = 0, & ! colperm_t
+                      MMD_ATA                 = 1, &
+                      MMD_AT_PLUS_A           = 2, &
+                      COLAMD                  = 3, &
+                      METIS_AT_PLUS_A         = 4, &
+                      PARMETIS                = 5, &
+                      ZOLTAN                  = 6, &
+                      MY_PERMC                = 7, &
+                      NOTRANS                 = 0, & ! trans_t
+                      TRANS                   = 1, &
+                      CONJ                    = 2, &
+                      NOEQUIL                 = 0, & ! DiagScale_t  Need?
+                      ROW                     = 1, &
+                      COL                     = 2, &
+                      BOTH                    = 3, &
+                      NOREFINE                = 0, & ! IterRefine_t
+                      SINGLE                  = 1, &
+                      DOUBLE                  = 2, &
+                      EXTRA                   = 3, &
+                      LUSUP                   = 0, & ! MemType  Need?
+                      UCOL                    = 1, &
+                      LSUB                    = 2, &
+                      USUB                    = 3, &
+                      SYSTEM                  = 0, & ! LU_space_t  Need?
+                      USER                    = 1
+integer, parameter, public :: &
+                      SLU_NC                  = 0, & ! Stype_t
+                      SLU_NCP                 = 1, &
+                      SLU_NR                  = 2, &
+                      SLU_SC                  = 3, &
+                      SLU_SCP                 = 4, &
+                      SLU_SR                  = 5, &
+                      SLU_DN                  = 6, &
+                      SLU_NR_loc              = 7, &
+                      SLU_S                   = 0, & ! Dtype_t
+                      SLU_D                   = 1, &
+                      SLU_C                   = 2, &
+                      SLU_Z                   = 3, &
+                      SLU_GE                  = 0, & ! Mtype_t
+                      SLU_TRLU                = 1, &
+                      SLU_TRUU                = 2, &
+                      SLU_TRL                 = 3, &
+                      SLU_TRU                 = 4, &
+                      SLU_SYL                 = 5, &
+                      SLU_SYU                 = 6, &
+                      SLU_HEL                 = 7, &
+                      SLU_HEU                 = 8
+
+
+!----------------------------------------------------
+
+end module superlupara_mod
diff --git a/FORTRAN/zcreate_dist_matrix.c b/FORTRAN/zcreate_dist_matrix.c
new file mode 100644
index 0000000..617c85b
--- /dev/null
+++ b/FORTRAN/zcreate_dist_matrix.c
@@ -0,0 +1,205 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Distribute the input matrix in a distributed compressed row format.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 3.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 2012
+ *
+ *
+ * Purpose
+ * =======
+ * 
+ * ZCREATE_DIST_MATRIX reads the global matrix from three input arrays
+ * and distributes it to the processes in a distributed compressed row format.
+ *
+ * Arguments   
+ * =========      
+ *
+ * A             (output) SuperMatrix*
+ *               Local matrix A in NR_loc format. 
+ *
+ * M             (input) int_t
+ *               The row number of the global matrix. 
+ *
+ * N             (input) int_t
+ *               The col number of the global matrix. 
+ *
+ * NNZ           (input) int_t
+ *               The number of nonzeros in the global matrix. 
+ *
+ * NZVAL_G       (input) doublecomplex*
+ *               Nonzero values of the global matrix. 
+ *
+ * ROWIND_G      (input) int_t*
+ *               Row indices of the global matrix. 
+ *
+ * COLPTR_G      (input) int_t*
+ *               Column pointers of the global matrix. 
+ *
+ * GRID          (input) gridinfo_t*
+ *               The 2D process mesh.
+ *
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*
+ * Broadcast the global compressed-column matrix held by process 0 to every
+ * process of the grid, then build this process's block of consecutive rows
+ * in NR_loc (distributed compressed row) format and store it in A.
+ *
+ * Only process 0 reads m, n, nnz, nzval_g, rowind_g and colptr_g; the other
+ * processes receive the sizes and a private copy of the arrays through
+ * MPI_Bcast.  The indices are used as given (no 1-based to 0-based shift).
+ *
+ * Rows are dealt out in blocks of ceil(m / nprocs): process iam owns rows
+ * [fst_row, fst_row + m_loc).  When there are fewer rows than that scheme
+ * needs, the trailing processes get a shorter block, possibly empty.
+ *
+ * Always returns 0.
+ */
+int zcreate_dist_matrix(SuperMatrix *A, int_t m, int_t n, int_t nnz,
+			doublecomplex *nzval_g, int_t *rowind_g, int_t *colptr_g,
+			gridinfo_t *grid)
+{
+    SuperMatrix GA;              /* global A */
+    int_t    *rowind, *colptr;	 /* global */
+    doublecomplex   *nzval;             /* global */
+    doublecomplex   *nzval_loc;         /* local */
+    int_t    *colind, *rowptr;	 /* local */
+    int_t    m_loc, fst_row, nnz_loc;
+    int_t    m_loc_fst; /* Block size ceil(m / nprocs) used by every
+			   process except possibly the trailing ones. */
+    int_t    nprocs;    /* number of processes in the grid */
+    int_t    iam, row, i, j, relpos;
+    int_t    *marker;
+
+    iam = grid->iam;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter zcreate_dist_matrix()");
+#endif
+ 
+    if ( !iam ) {
+
+        /* Allocate storage for compressed column representation. */
+        zallocateA_dist(n, nnz, &nzval, &rowind, &colptr);
+
+	/* Copy the global matrix. */
+#if 0
+	/* and ADJUST to 0-based indexing 
+           which is required by the C routines.*/
+#endif
+        for(i=0; i<nnz; i++){
+	  nzval[i]=nzval_g[i];
+	  rowind[i]=rowind_g[i]; /* - 1;*/
+        }
+        for(i=0; i<n+1; i++)
+	  colptr[i]=colptr_g[i]; /* - 1;*/
+
+
+	/* Broadcast matrix A to the other PEs. */
+	MPI_Bcast( &m,     1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &n,     1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &nnz,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( nzval,  nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm );
+	MPI_Bcast( rowind, nnz, mpi_int_t,  0, grid->comm );
+	MPI_Bcast( colptr, n+1, mpi_int_t,  0, grid->comm );
+    } else {
+	/* Receive matrix A from PE 0. */
+	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid->comm );
+	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid->comm );
+
+	/* Allocate storage for compressed column representation. */
+	zallocateA_dist(n, nnz, &nzval, &rowind, &colptr);
+
+	MPI_Bcast( nzval,   nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm );
+	MPI_Bcast( rowind,  nnz, mpi_int_t,  0, grid->comm );
+	MPI_Bcast( colptr,  n+1, mpi_int_t,  0, grid->comm );
+    }
+
+#if 0
+    nzval[0]=0.1;
+#endif
+
+    /* Compute the rows owned by the local process: blocks of
+       ceil(m / nprocs) consecutive rows, clamped to the matrix so that no
+       process is handed rows past m or a negative row count. */
+    nprocs = grid->nprow * grid->npcol;
+    m_loc_fst = m / nprocs;
+    if ( (m_loc_fst * nprocs) != m ) ++m_loc_fst; /* m / procs not an integer */
+    fst_row = iam * m_loc_fst;
+    if ( fst_row > m ) fst_row = m;
+    m_loc = m - fst_row;
+    if ( m_loc > m_loc_fst ) m_loc = m_loc_fst;
+
+    /* Create compressed column matrix for GA. */
+    zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
+				SLU_NC, SLU_Z, SLU_GE);
+
+
+    /*************************************************
+     * Change GA to a local A with NR_loc format     *
+     *************************************************/
+
+    rowptr = (int_t *) intMalloc_dist(m_loc+1);
+    /* marker is indexed by global row number, so it needs m entries
+       (not n): the two differ for a rectangular matrix. */
+    marker = (int_t *) intCalloc_dist(m);
+
+    /* Get counts of each row of GA */
+    for (i = 0; i < n; ++i)
+      for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]];
+    /* Set up row pointers.  marker[0..m_loc-1] is reused to hold the next
+       free position of each local row; this is safe because the counts
+       still to be read live at index fst_row+j >= j. */
+    rowptr[0] = 0;
+    for (j = 0; j < m_loc; ++j) {
+      row = fst_row + j;
+      rowptr[j+1] = rowptr[j] + marker[row];
+      marker[j] = rowptr[j];
+    }
+    nnz_loc = rowptr[m_loc];
+
+    nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc);
+    colind = (int_t *) intMalloc_dist(nnz_loc);
+
+    /* Transfer the matrix into the compressed row storage */
+    for (i = 0; i < n; ++i) {
+      for (j = colptr[i]; j < colptr[i+1]; ++j) {
+	row = rowind[j];
+	if ( (row>=fst_row) && (row<fst_row+m_loc) ) {
+	  row = row - fst_row;
+	  relpos = marker[row];
+	  colind[relpos] = i;
+	  nzval_loc[relpos] = nzval[j];
+	  ++marker[row];
+	}
+      }
+    }
+
+#if ( DEBUGlevel>=1 )
+    /* GA holds doublecomplex values, so use the z-variant of the printer. */
+    if ( !iam ) zPrint_CompCol_Matrix_dist(&GA);
+#endif   
+
+
+    /* Destroy GA */
+    Destroy_CompCol_Matrix_dist(&GA);
+
+
+    /******************************************************/
+    /* Change GA to a local A with NR_loc format */
+    /******************************************************/
+
+    /* Set up the local A in NR_loc format */
+    zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
+				   nzval_loc, colind, rowptr,
+				   SLU_NR_loc, SLU_Z, SLU_GE);
+    
+    SUPERLU_FREE(marker);
+
+#if ( DEBUGlevel>=1 )
+    /* sizeof yields size_t; convert explicitly to match the %d format. */
+    printf("sizeof(NRformat_loc) %d\n", (int) sizeof(NRformat_loc));
+    CHECK_MALLOC(iam, "Exit zcreate_dist_matrix()");
+#endif
+    return 0;
+}
+
+
diff --git a/FORTRAN/zhbcode1.f90 b/FORTRAN/zhbcode1.f90
new file mode 100644
index 0000000..c7029df
--- /dev/null
+++ b/FORTRAN/zhbcode1.f90
@@ -0,0 +1,50 @@
+!> @file
+!! \brief Fortran code for reading a sparse matrix in Harwell-Boeing format.
+!!
+!
+!     Reads a complex sparse matrix from Fortran unit 5 (conventionally
+!     standard input).  The header block sets NROW, NCOL and NNZERO and
+!     supplies the formats used for the remaining records; COLPTR receives
+!     NCOL+1 column pointers, ROWIND receives NNZERO row indices, and
+!     VALUES receives NNZERO values when the header announces value cards
+!     (VALCRD > 0).  The arrays are assumed-size, so the caller must make
+!     them large enough.
+!     There is no IMPLICIT NONE: the loop index I is implicitly typed.
+      subroutine zhbcode1(nrow, ncol, nnzero, values, rowind, colptr)
+
+!     ================================================================
+!     ... SAMPLE CODE FOR READING A SPARSE MATRIX IN STANDARD FORMAT
+!     ================================================================
+
+      CHARACTER      TITLE*72, KEY*8, MXTYPE*3, PTRFMT*16, &
+                     INDFMT*16, VALFMT*20, RHSFMT*20
+
+      INTEGER        TOTCRD, PTRCRD, INDCRD, VALCRD, RHSCRD,  NROW,  &
+                     NCOL  , NNZERO, NELTVL
+
+      INTEGER        COLPTR (*), ROWIND (*)
+
+      double complex         VALUES (*)
+
+!     ------------------------
+!     ... READ IN HEADER BLOCK
+!     ------------------------
+
+      READ ( 5, 1000 ) TITLE , KEY , TOTCRD, PTRCRD, INDCRD, VALCRD, &
+                       RHSCRD, MXTYPE, NROW  , NCOL  , NNZERO, NELTVL, &
+                       PTRFMT, INDFMT, VALFMT, RHSFMT
+ 1000 FORMAT ( A72, A8 / 5I14 / A3, 11X, 4I14 / 2A16, 2A20 )
+
+!     -------------------------
+!     ... READ MATRIX STRUCTURE
+!     -------------------------
+
+!     PTRFMT and INDFMT are the run-time formats read from the header.
+      READ ( 5, PTRFMT ) ( COLPTR (I), I = 1, NCOL+1 )
+
+      READ ( 5, INDFMT ) ( ROWIND (I), I = 1, NNZERO )
+
+      IF  ( VALCRD .GT. 0 )  THEN
+
+!         ----------------------
+!         ... READ MATRIX VALUES
+!         ----------------------
+
+          READ ( 5, VALFMT ) ( VALUES (I), I = 1, NNZERO )
+
+      ENDIF
+
+      return
+      end
+
diff --git a/INSTALL/Makefile b/INSTALL/Makefile
new file mode 100644
index 0000000..cf56e06
--- /dev/null
+++ b/INSTALL/Makefile
@@ -0,0 +1,26 @@
+# Builds three small test programs (machine parameters in single and double
+# precision, and the timer).  CC, CFLAGS, LOADER and LOADOPTS are expected
+# to come from ../make.inc.
+include ../make.inc
+
+# "all" and "clean" never create files with those names; declaring them
+# phony keeps them working even if such a file shows up in this directory.
+.PHONY: all clean
+
+all:  testdmach testsmach testtimer #install.out
+
+testdmach: dmach_dist.o dmachtst.o
+	$(LOADER) $(LOADOPTS) -o testdmach dmach_dist.o dmachtst.o
+
+testsmach: smach_dist.o smachtst.o
+	$(LOADER) $(LOADOPTS) -o testsmach smach_dist.o smachtst.o
+
+testtimer: superlu_timer.o timertst.o
+	$(LOADER) $(LOADOPTS) -o testtimer superlu_timer.o timertst.o
+
+# Runs the three programs and collects their output; not part of "all".
+install.out:
+	@echo Testing machines parameters and timer
+	csh install.csh
+
+# These objects are compiled from ../SRC without $(CFLAGS).
+# NOTE(review): presumably deliberate, so the probes are not built with the
+# library's optimisation flags -- confirm before adding $(CFLAGS) here.
+smach_dist.o: ../SRC/smach_dist.c ; $(CC) -c $<
+dmach_dist.o: ../SRC/dmach_dist.c ; $(CC) -c $<
+superlu_timer.o: ../SRC/superlu_timer.c ; $(CC) -c $<
+
+.c.o:
+	$(CC) $(CFLAGS) -c $<
+
+clean:
+	rm -f *.o test* *.out
diff --git a/INSTALL/dmachtst.c b/INSTALL/dmachtst.c
new file mode 100644
index 0000000..f72e742
--- /dev/null
+++ b/INSTALL/dmachtst.c
@@ -0,0 +1,34 @@
+#include <stdio.h>
+
+/* Query dmach_dist() for each machine parameter, in a fixed order, and
+   print the results as a table. */
+int main()
+{
+    extern double dmach_dist(char *);
+
+    /* The queries are issued in this order. */
+    double eps   = dmach_dist("Epsilon");
+    double sfmin = dmach_dist("Safe minimum");
+    double base  = dmach_dist("Base");
+    double prec  = dmach_dist("Precision");
+    double t     = dmach_dist("Number of digits in mantissa");
+    double rnd   = dmach_dist("Rounding mode");
+    double emin  = dmach_dist("Minnimum exponent");
+    double rmin  = dmach_dist("Underflow threshold");
+    double emax  = dmach_dist("Largest exponent");
+    double rmax  = dmach_dist("Overflow threshold");
+
+    printf(" Epsilon                      = %e\n", eps);
+    printf(" Safe minimum                 = %e\n", sfmin);
+    printf(" Base                         = %.0f\n", base);
+    printf(" Precision                    = %e\n", prec);
+    printf(" Number of digits in mantissa = %.0f\n", t);
+    printf(" Rounding mode                = %.0f\n", rnd);
+    printf(" Minimum exponent             = %.0f\n", emin);
+    printf(" Underflow threshold          = %e\n", rmin);
+    printf(" Largest exponent             = %.0f\n", emax);
+    printf(" Overflow threshold           = %e\n", rmax);
+    printf(" Reciprocal of safe minimum   = %e\n", 1./sfmin);
+
+    return 0;
+}
diff --git a/INSTALL/install.csh b/INSTALL/install.csh
new file mode 100644
index 0000000..a7b1f01
--- /dev/null
+++ b/INSTALL/install.csh
@@ -0,0 +1,14 @@
+#! /bin/csh
+
+# Run the three test programs and collect their output, one titled section
+# per program, in a single report file.
+set ofile = install.out			# output file
+
+# ">!" starts the report afresh even when the noclobber variable is set;
+# every later command appends.
+echo '---- SINGLE PRECISION' >! "$ofile"
+./testsmach >> "$ofile"
+echo '' >> "$ofile"
+echo '---- DOUBLE PRECISION' >> "$ofile"
+./testdmach >> "$ofile"
+echo '' >> "$ofile"
+echo '---- TIMER' >> "$ofile"
+./testtimer >> "$ofile"
+
+
diff --git a/INSTALL/smachtst.c b/INSTALL/smachtst.c
new file mode 100644
index 0000000..38e1183
--- /dev/null
+++ b/INSTALL/smachtst.c
@@ -0,0 +1,34 @@
+#include <stdio.h>
+
+int main()
+{
+    /* Query each machine parameter from smach_dist(), then print them all. */
+    extern float smach_dist(char *);
+    float epsilon, safe_min, radix, precision, digits;
+    float rounding, min_exp, underflow, max_exp, overflow;
+
+    epsilon   = smach_dist("Epsilon");
+    safe_min  = smach_dist("Safe minimum");
+    radix     = smach_dist("Base");
+    precision = smach_dist("Precision");
+    digits    = smach_dist("Number of digits in mantissa");
+    rounding  = smach_dist("Rounding mode");
+    min_exp   = smach_dist("Minnimum exponent");   /* key spelling kept (sic) */
+    underflow = smach_dist("Underflow threshold");
+    max_exp   = smach_dist("Largest exponent");
+    overflow  = smach_dist("Overflow threshold");
+
+    printf(" Epsilon                      = %e\n", epsilon);
+    printf(" Safe minimum                 = %e\n", safe_min);
+    printf(" Base                         = %.0f\n", radix);
+    printf(" Precision                    = %e\n", precision);
+    printf(" Number of digits in mantissa = %.0f\n", digits);
+    printf(" Rounding mode                = %.0f\n", rounding);
+    printf(" Minimum exponent             = %.0f\n", min_exp);
+    printf(" Underflow threshold          = %e\n", underflow);
+    printf(" Largest exponent             = %.0f\n", max_exp);
+    printf(" Overflow threshold           = %e\n", overflow);
+    printf(" Reciprocal of safe minimum   = %e\n", 1./safe_min);
+
+    return 0;
+}
diff --git a/INSTALL/superlu_timer.c b/INSTALL/superlu_timer.c
new file mode 100644
index 0000000..3a2ffcc
--- /dev/null
+++ b/INSTALL/superlu_timer.c
@@ -0,0 +1,54 @@
+/* 
+ * Purpose
+ * ======= 
+ *	Returns the time in seconds used by the process.
+ *
+ * Note: the timer function call is machine dependent. Use conditional
+ *       compilation to choose the appropriate function.
+ *
+ */
+
+
+#ifdef SUN
+/*
+ * gethrtime(3C) reports time in nanoseconds; divide by 1e9 to get seconds.
+ */
+#include <sys/time.h>
+
+double SuperLU_timer_() {
+    return ( (double)gethrtime() / 1e9 );
+}
+
+#elif defined ( UNIX_TIMER )
+/*
+ * times(2) gives the user + system CPU time of the process in clock ticks.
+ * The tick rate is read at run time with sysconf(_SC_CLK_TCK), because a
+ * fixed compile-time guess of 60 ticks/second may not match the real rate.
+ */
+#include <sys/types.h>
+#include <sys/times.h>
+#include <time.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+double SuperLU_timer_()
+{
+    struct tms use;
+    long ticks_per_sec = sysconf(_SC_CLK_TCK);
+    if (ticks_per_sec <= 0) ticks_per_sec = 60;  /* keep the old fallback */
+    times(&use);
+    return ((double)use.tms_utime + (double)use.tms_stime)
+	   / (double)ticks_per_sec;
+}
+
+#else
+/* Default: return the value of MPI_Wtime() (seconds, per the MPI standard). */
+#include <mpi.h>
+
+double SuperLU_timer_()
+{
+    return MPI_Wtime();
+}
+
+#endif
+
diff --git a/INSTALL/timertst.c b/INSTALL/timertst.c
new file mode 100644
index 0000000..c509574
--- /dev/null
+++ b/INSTALL/timertst.c
@@ -0,0 +1,72 @@
+#include <stdio.h>
+#include <mpi.h>
+
+/* Intentionally empty: accepts the arrays and does nothing with them. */
+void mysub(int n, double *x, double *y)
+{
+}
+
+int main(int argc, char *argv[])
+{
+    /* Timing test for SuperLU_timer_dist_(). NMAX*ITS = 1,000,000 DAXPYs. */
+#define NMAX    100
+#define ITS     10000
+    /* tnotim: loop time without inner timer calls; avg: ms per timer call */
+    int      i, j;
+    double   alpha, avg, t1, t2, tnotim;
+    double   x[NMAX], y[NMAX];
+    extern double   SuperLU_timer_dist_();
+
+    MPI_Init( &argc, &argv );
+
+    /* Initialize X and Y */
+    for (i = 0; i < NMAX; ++i) {
+	x[i] = 1.0 / (double)(i+1);
+	y[i] = (double)(NMAX - i) / (double)NMAX;
+    }
+    alpha = 0.315;
+
+    /* Time 1,000,000 DAXPY operations (baseline: no timer calls inside) */
+    t1 = SuperLU_timer_dist_();
+    for (j = 0; j < ITS; ++j) {
+	for (i = 0; i < NMAX; ++i)
+	    y[i] += alpha * x[i];
+	alpha = -alpha;
+    }
+    t2 = SuperLU_timer_dist_();
+    printf("Time for 1,000,000 DAXPY ops  = %10.3g seconds\n", t2-t1);
+    if ( t2-t1 > 0. )
+	printf("DAXPY performance rate        = %10.3g mflops\n", 2./(t2-t1));
+    else
+	printf("*** Error:  Time for operations was zero\n");
+
+    tnotim = t2 - t1;
+
+    /* Time 1,000,000 DAXPY operations with SuperLU_timer_dist_()
+       called once per outer iteration (ITS extra timer calls) */
+    t1 = SuperLU_timer_dist_();
+    for (j = 0; j < ITS; ++j) {
+	for (i = 0; i < NMAX; ++i)
+	    y[i] += alpha * x[i];
+	alpha = -alpha;
+	t2 = SuperLU_timer_dist_();
+    }
+
+    /* Compute the time in milliseconds used by an average call to
+       SuperLU_timer_dist_(). */
+    printf("Including DSECND, time        = %10.3g seconds\n", t2-t1);
+    avg = ( (t2 - t1) - tnotim )*1000. / (double)ITS;
+    printf("Average time for DSECND       = %10.3g milliseconds\n", avg);
+
+    /* Equivalent flops per timer call: avg ms at the baseline rate of
+       2e6 flops per tnotim seconds, i.e. (avg/1000) * 2e6 / tnotim. */
+    if ( tnotim > 0. )
+	printf("Equivalent floating point ops = %10.3g ops\n",
+	       2000.*avg / tnotim);
+    /* NOTE(review): presumably keeps x/y live so the loops are not elided */
+    mysub(NMAX, x, y);
+
+    MPI_Finalize();
+    return 0;
+}
+
diff --git a/License.txt b/License.txt
new file mode 100644
index 0000000..e003503
--- /dev/null
+++ b/License.txt
@@ -0,0 +1,29 @@
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met: 
+
+(1) Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer. 
+(2) Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution. 
+(3) Neither the name of Lawrence Berkeley National Laboratory, U.S. Dept. of
+Energy nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/MAKE_INC/make.altix b/MAKE_INC/make.altix
new file mode 100644
index 0000000..f742af4
--- /dev/null
+++ b/MAKE_INC/make.altix
@@ -0,0 +1,77 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   April 10, 2006  version 2.0
+#
+#  Modified: 	    November 1, 2007   version 2.1
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT            = _altix
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.2
+DSUPERLULIB     = $(DSuperLUroot)/lib/libsuperlu_dist_4.2.a
+
+MKLHOME		= /usr/common/intel/mkl/8.1.014
+BLASDEF         = -DUSE_VENDOR_BLAS
+BLASLIB         = -L${MKLHOME}/lib/64 -lmkl_ipf -lguide
+
+############################################################################
+## parmetis 4.x.x, 32-bit integer
+PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS		=
+
+# Define all the libraries
+LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) \
+	          -lmpi -lm -L/usr/common/intel/fc/8.1.029/lib -lifcore
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH            = ar
+ARCHFLAGS       = crv
+RANLIB          = ranlib
+
+#######################################################################
+# C compiler setup
+CC              = icc
+ISA             = -ftz -mp
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = $(ISA) $(I_PARMETIS) -O3 -DDEBUGlevel=0 -DPRNTlevel=0
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+NOOPTS          = $(ISA) -O0
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = ifort
+FFLAGS          = $(CFLAGS)
+F90FLAGS	= -r8 -check all -save -Dmpi -ftz
+############################################################################
+LOADER          = icc
+LOADOPTS        = $(CFLAGS)
+#
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need to follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DAdd_
diff --git a/MAKE_INC/make.carver b/MAKE_INC/make.carver
new file mode 100644
index 0000000..2e8b8a8
--- /dev/null
+++ b/MAKE_INC/make.carver
@@ -0,0 +1,91 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#		    November 1, 2007   version 2.1
+#		    September 1, 2011   version 3.0
+#		    October 1, 2014   version 4.0
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+# Carver: Intel compiler
+PLAT		= _sp
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.2
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_4.2.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB	        = ${MKL}
+#
+################# parmetis 4.x.x, 32-bit integer ###########################
+PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	= 
+
+# Define all the libraries
+LIBS   	= $(DSUPERLULIB) $(PARMETISLIB) $(METISLIB) $(BLASLIB) $(FLIBS)
+
+# Include directories for header files
+INCS	= ${I_PARMETIS}
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+
+############################################################################
+# C compiler setup
+CC           	= mpicc
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = ${CUDA_FLAGS} ${INCS} -std=c99 -O3 -Wall -w2 -openmp -mkl \
+		-DDEBUGlevel=0 -DPRNTlevel=1 -DPROFlevel=0 \
+#		-D_LONGINT 
+#	-Wunused-variable 
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+NOOPTS		= -O0
+
+# Add more flags to use GPU
+ifeq "${ACC}" "GPU"
+CFLAGS	      += -DGPU_ACC
+INCS 	      += -I/usr/common/usg/cuda/5.5/include
+LIBS 	      += -L/usr/common/usg/cuda/5.5/lib64 -lcublas -lcudart 
+endif
+
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = mpif90
+F90FLAGS	= -fast -Mnomain
+############################################################################
+LOADER	        = mpicc
+LOADOPTS	= -openmp #-Mnomain
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need to follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DNoChange
diff --git a/MAKE_INC/make.cuda_gpu b/MAKE_INC/make.cuda_gpu
new file mode 100644
index 0000000..2e8b8a8
--- /dev/null
+++ b/MAKE_INC/make.cuda_gpu
@@ -0,0 +1,91 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#		    November 1, 2007   version 2.1
+#		    September 1, 2011   version 3.0
+#		    October 1, 2014   version 4.0
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+# Carver: Intel compiler
+PLAT		= _sp
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.2
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_4.2.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB	        = ${MKL}
+#
+################# parmetis 4.x.x, 32-bit integer ###########################
+PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	= 
+
+# Define all the libraries
+LIBS   	= $(DSUPERLULIB) $(PARMETISLIB) $(METISLIB) $(BLASLIB) $(FLIBS)
+
+# Include directories for header files
+INCS	= ${I_PARMETIS}
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+
+############################################################################
+# C compiler setup
+CC           	= mpicc
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = ${CUDA_FLAGS} ${INCS} -std=c99 -O3 -Wall -w2 -openmp -mkl \
+		-DDEBUGlevel=0 -DPRNTlevel=1 -DPROFlevel=0 \
+#		-D_LONGINT 
+#	-Wunused-variable 
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+NOOPTS		= -O0
+
+# Add more flags to use GPU
+ifeq "${ACC}" "GPU"
+CFLAGS	      += -DGPU_ACC
+INCS 	      += -I/usr/common/usg/cuda/5.5/include
+LIBS 	      += -L/usr/common/usg/cuda/5.5/lib64 -lcublas -lcudart 
+endif
+
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = mpif90
+F90FLAGS	= -fast -Mnomain
+############################################################################
+LOADER	        = mpicc
+LOADOPTS	= -openmp #-Mnomain
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need to follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DNoChange
diff --git a/MAKE_INC/make.i386_linux b/MAKE_INC/make.i386_linux
new file mode 100644
index 0000000..c630112
--- /dev/null
+++ b/MAKE_INC/make.i386_linux
@@ -0,0 +1,78 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#
+# 		    January 18, 2006   Sam Adams
+#                                      General Dynamics - Network Systems
+#                       works for i386 Linux, with LAM-MPI 7.1.1 and GCC 4.
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT		= _i386
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_5.1.3
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB      	= /usr/lib/libblas.so.3
+
+############################################################################
+## parmetis 4.x.x, 32-bit integer
+PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	= 
+
+# Define all the libraries
+LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB)
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+############################################################################
+# C compiler setup
+CC           	= mpicc
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = -pipe -O2 ${I_PARMETIS}
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+NOOPTS		= 
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = mpif77
+F90FLAGS	= 
+############################################################################
+LOADER	        = mpif77
+LOADOPTS	= 
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need to follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DAdd__ 
diff --git a/MAKE_INC/make.mpich b/MAKE_INC/make.mpich
new file mode 100644
index 0000000..559a086
--- /dev/null
+++ b/MAKE_INC/make.mpich
@@ -0,0 +1,48 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   March 1, 2016	version 5.0.0
+#
+#  Modified:	    
+#		    
+#
+############################################################################
+#
+#  The name of the libraries to be created/linked to
+#
+VERSION		= 5.1.3
+SuperLUroot	= /home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_${VERSION}
+DSUPERLULIB   	= $(SuperLUroot)/lib/libsuperlu_dist.a
+
+# BLASDEF 	= -DUSE_VENDOR_BLAS
+
+PARMETIS_DIR	:= ${HOME}/lib/static/parmetis-4.0.3
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+
+LIBS		= $(DSUPERLULIB) /usr/lib/libf77blas.so /usr/lib/libatlas.so \
+		${PARMETISLIB} ${METISLIB}
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         = /usr/bin/ar
+ARCHFLAGS    = cr
+RANLIB       = /usr/bin/ranlib
+
+CC           = /home/xiaoye/mpich-install/bin/mpicc
+CFLAGS 	     = -DNDEBUG -DUSE_VENDOR_BLAS -DAdd_ -DDEBUGlevel=0 -DPRNTlevel=0 -std=c99 -fPIC -g ${I_PARMETIS}
+# CFLAGS       += -D_LONGINT
+# CFLAGS       +=  
+NOOPTS       = -O0
+FORTRAN	     = /usr/bin/gfortran
+
+LOADER       = $(CC)
+LOADOPTS     = -Wl,-rpath=/home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_${VERSION}/lib -g # -Wl,-Bdynamic
diff --git a/MAKE_INC/make.opteron b/MAKE_INC/make.opteron
new file mode 100755
index 0000000..a9170ef
--- /dev/null
+++ b/MAKE_INC/make.opteron
@@ -0,0 +1,78 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#		    November 1, 2007   version 2.1
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT		= _jacquard
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.2
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_4.2.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB      	= -L/usr/common/usg/acml/2.6.0/pathscale64/lib -lacml -lacml_mv
+#MPILIB		= -L/usr/lpp/ppe.poe/lib -lmpi_r
+
+############################################################################
+## parmetis 4.x.x, 32-bit integer
+PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	=
+
+# Define all the libraries
+LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB)
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+############################################################################
+# C compiler setup
+CC           	= mpicc
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = -O3 -DDEBUGlevel=0 -DPRNTlevel=1 ${I_PARMETIS}
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+NOOPTS		= -O0
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = mpif90
+F90FLAGS	= -O3
+############################################################################
+LOADER	        = mpif90
+
+# 32-bit:
+LOADOPTS	= 
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need to follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DAdd_
diff --git a/MAKE_INC/make.origin b/MAKE_INC/make.origin
new file mode 100644
index 0000000..af4a750
--- /dev/null
+++ b/MAKE_INC/make.origin
@@ -0,0 +1,80 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1997  version 1.0
+#
+#  Modified:        November 11, 2002 (by Tom Oppe)
+#		    November 1, 2007   version 2.1
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT            = _sgi
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.2
+DSUPERLULIB     = $(DSuperLUroot)/lib/libsuperlu_dist_4.2.a
+#
+BLASDEF         = -DUSE_VENDOR_BLAS
+BLASLIB         = -lscs
+MPILIB		= -lmpi
+
+############################################################################
+## parmetis 4.x.x, 32-bit integer
+PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	= -lfortran
+
+# Define all the libraries
+#
+LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) \
+	          $(MPILIB) $(FLIBS)
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH            = ar
+ARCHFLAGS       = crv
+RANLIB          = touch
+
+#######################################################################
+# C compiler setup
+CC              = cc
+ISA             = -64 -mips4 -TARG:platform=ip35
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = $(ISA) $(I_PARMETIS) -O2
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+# This must be enforced to compile the two routines: slamch.c and dlamch.c.
+NOOPTS          = $(ISA) -O0
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = f90
+F90FLAGS        = $(CFLAGS)
+############################################################################
+LOADER          = cc
+LOADOPTS        = $(CFLAGS)
+#
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need to follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DAdd_ -DORIGIN
diff --git a/MAKE_INC/make.sp b/MAKE_INC/make.sp
new file mode 100644
index 0000000..078c0cb
--- /dev/null
+++ b/MAKE_INC/make.sp
@@ -0,0 +1,80 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#		    November 1, 2007   version 2.1
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT		= _sp
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.2
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_4.2.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB      	= -lessl
+#MPILIB		= -L/usr/lpp/ppe.poe/lib -lmpi
+#PERFLIB     	= -L/vol1/VAMPIR/lib -lVT
+
+############################################################################
+## parmetis 4.x.x, 32-bit integer
+PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	=
+
+# Define all the libraries
+LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) $(FLIBS)
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+############################################################################
+CC           	= mpcc
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = -D_SP -O3 -qarch=PWR3 -qalias=allptrs \
+		  -DDEBUGlevel=0 -DPRNTlevel=0 $(I_PARMETIS)
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+# This must be enforced to compile the two routines: slamch.c and dlamch.c.
+NOOPTS		=
+############################################################################
+FORTRAN         = mpxlf90
+F90FLAGS        = -WF,-Dsp -O3 -Q -qstrict -qfixed -qinit=f90ptr -qarch=pwr3
+############################################################################
+LOADER	        = mpxlf90
+#LOADOPTS	= -bmaxdata:0x80000000
+LOADOPTS	= -bmaxdata:0x70000000
+#
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need to follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DNoChange
+
diff --git a/MAKE_INC/make.sp.64bit b/MAKE_INC/make.sp.64bit
new file mode 100644
index 0000000..79083e5
--- /dev/null
+++ b/MAKE_INC/make.sp.64bit
@@ -0,0 +1,85 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#		    November 1, 2007   version 2.1
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT		= _power5
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.2
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_4.2.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB      	= -lessl
+
+############################################################################
+## parmetis 4.x.x, 32-bit integer
+# PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	=
+
+# Define all the libraries
+LIBS	     	= $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) $(FLIBS)
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+# 64-bit:
+ARCHFLAGS    	= -X64 cr
+RANLIB       	= ranlib
+
+############################################################################
+# C compiler setup
+# 64-bit
+CC           	= mpcc_r
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = -D_SP -qarch=pwr5 -qalias=allptrs -q64 \
+		  -DDEBUGlevel=0 -DPRNTlevel=0 -O3 $(I_PARMETIS)
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+# 64-bit
+NOOPTS		= -q64
+
+############################################################################
+# FORTRAN compiler setup
+# 64-bit
+FORTRAN         = mpxlf90_r
+F90FLAGS	= -WF,-Dsp -O3 -Q -qstrict -qfixed -qinit=f90ptr -qarch=pwr5\
+                  -q64 #-qintsize=8
+############################################################################
+# 64-bit
+LOADER	= mpxlf90_r
+
+# 64-bit:
+LOADOPTS	= -q64
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need to follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DNoChange
diff --git a/MAKE_INC/make.t3e b/MAKE_INC/make.t3e
new file mode 100644
index 0000000..333b038
--- /dev/null
+++ b/MAKE_INC/make.t3e
@@ -0,0 +1,73 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1997  version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#		    November 1, 2007   version 2.1
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT		= _t3e
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.2
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_4.2.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+#
+#PERFLIB	= -l pat pat.cld
+#PERFLIB	= -lapp
+METISLIB	=
+PARMETISLIB	=
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	=
+
+# Define all the libraries
+LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) $(FLIBS)
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+############################################################################
+# C compiler setup
+CC           	= cc
+# CFLAGS should be set to be the C flags that include optimization
+#CFLAGS       	= -D_CRAY -DPRNTlevel=1 -O3 -h aggress,split,unroll
+CFLAGS       	= -O3 -D_CRAY -DPRNTlevel=0 -DDEBUGlevel=0 -DPROFlevel=0
+#		  -happrentice,inline0
+PTROPT	     	= -h restrict=a
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+# This must be enforced to compile the two routines: slamch.c and dlamch.c.
+NOOPTS		=
+############################################################################
+# FORTRAN compiler setup
+FORTRAN		= f90
+F90FLAGS	= -O3 -dp -i 32
+############################################################################
+LOADER       	= cc
+LOADOPTS   = 
+#
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DUpCase
+
diff --git a/MAKE_INC/make.xc30 b/MAKE_INC/make.xc30
new file mode 100644
index 0000000..dba42bb
--- /dev/null
+++ b/MAKE_INC/make.xc30
@@ -0,0 +1,83 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#		    November 1, 2007   version 2.1
+#		    September 1, 2011   version 3.0
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+## edison at NERSC
+PLAT		= _xc30
+VERSION		= 5.0.0
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_${VERSION}
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_${VERSION}.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB	        =
+#
+############################################################################
+## parmetis 4.x.x, 32-bit integer
+ PARMETIS_DIR	:= ${HOME}/Edison/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Edison/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+# FLIBS	 	= -lpgf90 -lpgf90_rpm1  ## for PGI compiler
+#FLIBS	 	= -lifport -lifcore     ## for Intel compiler
+
+# Define all the libraries
+LIBS	     	= $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) $(FLIBS)
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+############################################################################
+# C compiler setup
+CC           	= cc
+# CFLAGS should be set to the C flags that include optimization
+CFLAGS          = -fast -m64 -std=c99 -Wall -openmp \
+		$(I_PARMETIS) -DDEBUGlevel=0 -DPRNTlevel=0 -DPROFlevel=0
+# uncomment the following to use 64-bit integer
+# CFLAGS 		+= -D_LONGINT
+
+# NOOPTS should be set to be the C flags that turn off any optimization
+NOOPTS		= -O0 -std=c99
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = ftn
+F90FLAGS	= -fast #-Mipa=fast,safe
+# uncomment the following to use 64-bit integer
+# F90FLAGS	+= -i8
+############################################################################
+LOADER	        = $(CC)
+LOADOPTS	= -openmp
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DAdd_
diff --git a/MAKE_INC/make.xe6 b/MAKE_INC/make.xe6
new file mode 100644
index 0000000..089849f
--- /dev/null
+++ b/MAKE_INC/make.xe6
@@ -0,0 +1,79 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#		    November 1, 2007   version 2.1
+#		    September 1, 2011   version 3.0
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT		= _xe6
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.3
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_4.3.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB	        =
+#
+############################################################################
+## parmetis 4.x.x, 32-bit integer
+PARMETIS_DIR	:= ${HOME}/Edison/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Edison/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+ FLIBS	 	= -lpgf90 -lpgf90_rpm1  ## for PGI compiler
+# FLIBS	 	= -lifport -lifcore     ## for Intel compiler
+
+# Define all the libraries
+LIBS	     	= $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) $(FLIBS)
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+############################################################################
+# C compiler setup
+CC           	= cc
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = -fast -Mipa=fast,safe -m64 $(I_PARMETIS) \
+		-DDEBUGlevel=0 -DPRNTlevel=1 -DPROFlevel=0 \
+#		-D_LONGINT
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+NOOPTS		= -O0
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = ftn
+F90FLAGS	= -fast -Mipa=fast,safe
+############################################################################
+LOADER	        = $(CC)
+LOADOPTS	= -fast
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DAdd_
diff --git a/MAKE_INC/make.xt4 b/MAKE_INC/make.xt4
new file mode 100644
index 0000000..dc56907
--- /dev/null
+++ b/MAKE_INC/make.xt4
@@ -0,0 +1,66 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#		    November 1, 2007   version 2.1
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT		= _xt4
+VERSION		= 5.0.0
+#
+#  The name of the libraries to be created/linked to (libsuperlu_dist_*)
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_${VERSION}
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_${VERSION}.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB	        =
+METISLIB    	= -L/usr/common/usg/parmetis/3.1 -lmetis
+PARMETISLIB   	= -L/usr/common/usg/parmetis/3.1 -lparmetis
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	= -lpgf90 -lpgf90_rpm1 -lpgf902 -lpgf90rtl -lpgftnrtl
+
+# Define all the libraries
+LIBS	     	= $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) $(FLIBS)
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+############################################################################
+# C compiler setup
+CC           	= cc
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = -fastsse -DDEBUGlevel=0 -DPRNTlevel=1
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+NOOPTS		= -O0
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = ftn
+F90FLAGS	= -fastsse
+############################################################################
+LOADER	        = cc
+LOADOPTS	= 
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DAdd_
diff --git a/MAKE_INC/make.xt4.64bit b/MAKE_INC/make.xt4.64bit
new file mode 100644
index 0000000..f8472b9
--- /dev/null
+++ b/MAKE_INC/make.xt4.64bit
@@ -0,0 +1,75 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#		    November 1, 2007   version 2.1
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT		= _xt4
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.2
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_4.2.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB	        =
+
+############################################################################
+## parmetis 4.x.x, 32-bit integer
+PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	= -lpgf90 -lpgf90_rpm1 -lpgf902 -lpgf90rtl -lpgftnrtl
+
+# Define all the libraries
+LIBS	     	= $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) $(FLIBS)
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+############################################################################
+# C compiler setup
+CC           	= cc
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = -fastsse $(I_PARMETIS) -DDEBUGlevel=0 -DPRNTlevel=1 -D_LONGINT
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+NOOPTS		= -O0
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = ftn
+F90FLAGS	= -fastsse -i8
+############################################################################
+LOADER	        = cc
+LOADOPTS	= 
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DAdd_
diff --git a/MAKE_INC/make.xt4_pathscale b/MAKE_INC/make.xt4_pathscale
new file mode 100644
index 0000000..723f773
--- /dev/null
+++ b/MAKE_INC/make.xt4_pathscale
@@ -0,0 +1,75 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#		    November 1, 2007   version 2.1
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT		= _xt4
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.2
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_4.2.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB	        =
+
+############################################################################
+## parmetis 4.x.x, 32-bit integer
+PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	= -lpathfortran
+
+# Define all the libraries
+LIBS	     	= $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) $(FLIBS)
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+############################################################################
+# C compiler setup
+CC           	= cc
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = -Ofast $(I_PARMETIS) -DDEBUGlevel=0 -DPRNTlevel=1 -DPROFlevel=0
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+NOOPTS		= -O0 -ipa
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = ftn
+F90FLAGS	= -Ofast
+############################################################################
+LOADER	        = cc
+LOADOPTS	= -ipa
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DAdd_
diff --git a/MAKE_INC/make.xt4_pgi b/MAKE_INC/make.xt4_pgi
new file mode 100644
index 0000000..996bcaa
--- /dev/null
+++ b/MAKE_INC/make.xt4_pgi
@@ -0,0 +1,75 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#		    November 1, 2007   version 2.1
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT		= _xt4
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.2
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_4.2.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB	        =
+
+############################################################################
+## parmetis 4.x.x, 32-bit integer
+PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	= -lpgf90 -lpgf90_rpm1 -lpgf902 -lpgf90rtl -lpgftnrtl
+
+# Define all the libraries
+LIBS	     	= $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) $(FLIBS)
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+############################################################################
+# C compiler setup
+CC           	= cc
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = -fastsse $(I_PARMETIS) -DDEBUGlevel=0 -DPRNTlevel=1 -DPROFlevel=0
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+NOOPTS		= -O0
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = ftn
+F90FLAGS	= -fastsse
+############################################################################
+LOADER	        = cc
+LOADOPTS	= 
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DAdd_
diff --git a/MAKE_INC/make.xt5 b/MAKE_INC/make.xt5
new file mode 100644
index 0000000..926d28e
--- /dev/null
+++ b/MAKE_INC/make.xt5
@@ -0,0 +1,78 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   February 4, 1999   version alpha
+#
+#  Modified:	    September 1, 1999  version 1.0
+#                   March 15, 2003     version 2.0
+#		    November 1, 2007   version 2.1
+#
+############################################################################
+#
+#  The machine (platform) identifier to append to the library names
+#
+PLAT		= _xt5
+
+#
+#  The name of the libraries to be created/linked to
+#
+DSuperLUroot 	= ${HOME}/Release_Codes/SuperLU_DIST_4.3
+DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_4.3.a
+#
+BLASDEF	     	= -DUSE_VENDOR_BLAS
+BLASLIB	        =
+
+############################################################################
+## parmetis 4.x.x, 32-bit integer
+PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3
+## parmetis 4.x.x, 64-bit integer
+# PARMETIS_DIR	:= ${HOME}/Carver/lib/parmetis-4.0.3_64
+
+METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
+############################################################################
+
+# Define the required Fortran libraries, if you use C compiler to link
+FLIBS	 	= -lpgf90 -lpgf90_rpm1 -lpgf902 -lpgf90rtl -lpgftnrtl
+
+# Define all the libraries
+LIBS	     	= $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) $(FLIBS)
+
+# Include directories for header files
+INCS	= ${I_PARMETIS}
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         	= ar
+ARCHFLAGS    	= cr
+RANLIB       	= ranlib
+
+############################################################################
+# C compiler setup
+CC      = cc
+INCS	= $(I_PARMETIS)
+# CFLAGS should be set to be the C flags that include optimization
+CFLAGS          = ${INCS} -c99 -fastsse -DDEBUGlevel=0 -DPRNTlevel=1 #-D_LONGINT
+#
+# NOOPTS should be set to be the C flags that turn off any optimization
+NOOPTS		= -O0
+############################################################################
+# FORTRAN compiler setup
+FORTRAN         = ftn
+F90FLAGS	= -fastsse #-i8
+############################################################################
+LOADER	        = cc
+LOADOPTS	= 
+############################################################################
+#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
+#
+#  Need follow the convention of how C calls a Fortran routine.
+#
+CDEFS        = -DAdd_
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7717442
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,52 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          Makefile
+#
+#  Purpose:         Top-level Makefile
+#
+#  Creation date:   September 1, 1999  version 1.0
+#
+#  Modified:        
+#
+############################################################################
+
+include make.inc
+
+# Every target below is a command, not a file. Declaring them phony keeps
+# the INSTALL/ and lib/ directories from being mistaken for up-to-date
+# "install" and "lib" targets (e.g. on case-insensitive filesystems).
+.PHONY: all lib example clean install blaslib superlulib cleanlib cleantesting
+
+all: install lib example
+
+lib: superlulib
+
+# Sub-directory recipes use "cd DIR && ..." so that a failed cd aborts the
+# recipe instead of running $(MAKE) or rm in this top-level directory.
+example:
+	( cd EXAMPLE && $(MAKE) )
+
+clean: cleanlib cleantesting
+
+install:
+	( cd INSTALL && $(MAKE) )
+#	( cd INSTALL; cp lsame.c ../SRC/; \
+#	  cp dlamch.c ../SRC/; cp slamch.c ../SRC/ )
+
+blaslib:
+	( cd CBLAS && $(MAKE) )
+
+superlulib:
+	( cd SRC && $(MAKE) )
+
+cleanlib:
+	( cd SRC && $(MAKE) clean )
+	( cd CBLAS && $(MAKE) clean )
+	( cd lib && rm -f *.a )
+
+cleantesting:
+	( cd INSTALL && $(MAKE) clean )
+	( cd EXAMPLE && $(MAKE) clean )
+	( cd FORTRAN && $(MAKE) clean )
diff --git a/README b/README
new file mode 100644
index 0000000..2cfbba7
--- /dev/null
+++ b/README
@@ -0,0 +1,251 @@
+		SuperLU_DIST (version 5.1)
+		============================
+
+SuperLU_DIST contains a set of subroutines to solve a sparse linear system 
+A*X=B. It uses Gaussian elimination with static pivoting (GESP). 
+Static pivoting is a technique that combines the numerical stability of
+partial pivoting with the scalability of Cholesky (no pivoting),
+to run accurately and efficiently on large numbers of processors. 
+
+SuperLU_DIST is a parallel extension to the serial SuperLU library.
+It is targeted for the distributed memory parallel machines.
+SuperLU_DIST is implemented in ANSI C, and MPI for communications.
+Currently, the LU factorization and triangular solution routines,
+which are the most time-consuming part of the solution process,
+are parallelized. The other routines, such as static pivoting and 
+column preordering for sparsity are performed sequentially. 
+This "alpha" release contains double-precision real and double-precision
+complex data types.
+
+The distribution contains the following directory structure:
+
+  SuperLU_DIST/README    instructions on installation
+  SuperLU_DIST/CBLAS/    needed BLAS routines in C, not necessarily fast
+  SuperLU_DIST/DOC/  	 the Users' Guide
+  SuperLU_DIST/EXAMPLE/  example programs
+  SuperLU_DIST/INSTALL/  test machine dependent parameters
+  SuperLU_DIST/SRC/      C source code, to be compiled into libsuperlu_dist.a
+  SuperLU_DIST/lib/      contains library archive libsuperlu_dist.a
+  SuperLU_DIST/Makefile  top level Makefile that does installation and testing
+  SuperLU_DIST/make.inc  compiler, compiler flags, library definitions and C
+                         preprocessor definitions, included in all Makefiles.
+                         (You may need to edit it to suit your system
+                          before compiling the whole package.)
+  SuperLU_DIST/MAKE_INC/ sample machine-specific make.inc files
+
+
+----------------
+| INSTALLATION |
+----------------
+
+There are two ways to install the package. One requires users to
+edit the makefile manually; the other uses the CMake build system.
+The procedures are described below.
+
+1. Manual installation with makefile.
+   Before installing the package, please examine the following items, which
+   depend on your system setup:
+
+   1.1 Edit the make.inc include file.
+
+       This make include file is referenced inside each of the Makefiles
+       in the various subdirectories. As a result, there is no need to 
+       edit the Makefiles in the subdirectories. All information that is
+       machine specific has been defined in this include file. 
+
+       Sample machine-specific make.inc are provided in the MAKE_INC/
+       directory for several platforms, such as Cray XT5 and IBM SP.
+       When you have selected the machine to which you wish to install
+       SuperLU_DIST, copy the appropriate sample include file 
+       (if one is present) into make.inc.
+       For example, if you wish to run SuperLU_DIST on a Cray XC30, you can do
+
+       	   cp MAKE_INC/make.xc30  make.inc
+   
+	For systems other than those listed above, some porting effort is needed
+   	for parallel factorization routines. Please refer to the Users' Guide 
+   	for detailed instructions on porting.
+
+   	The following CPP definitions can be set in CFLAGS.
+      	  o -D_LONGINT
+          use 64-bit integers for indexing sparse matrices. (default 32 bit)
+
+      	  o -DPRNTlevel=[0,1,2,...]
+          printing level to show solver's execution details. (default 0)
+
+      	  o -DDEBUGlevel=[0,1,2,...]
+          diagnostic printing level for debugging purpose. (default 0)
+      
+   
+   1.2. The BLAS library.
+
+   	The parallel routines in SuperLU_DIST use some sequential BLAS routines
+   	on each process. If there is a BLAS library available on your machine,
+   	you may define the following in the file make.inc:
+            BLASDEF = -DUSE_VENDOR_BLAS
+            BLASLIB = <BLAS library you wish to link with>
+
+   	    The CBLAS/ subdirectory contains the part of the C BLAS needed by 
+   	    SuperLU_DIST package. However, these codes are intended for use
+	    only if there is no faster implementation of the BLAS already
+	    available on your machine. In this case, you should go to the
+	    top-level SuperLU_DIST/ directory and do the following:
+
+	    1) In make.inc, undefine (comment out) BLASDEF, and define:
+               BLASLIB = ../lib/libblas$(PLAT).a
+
+    	    2) Type: make blaslib
+       	       to make the BLAS library from the routines in the
+	       CBLAS/ subdirectory.
+
+
+   1.3. External libraries: Metis and ParMetis.
+
+      If you will use Metis or ParMetis ordering, you will
+      need to install them yourself. Since ParMetis package already
+      contains the source code for the Metis library, you can just
+      download and compile ParMetis from:
+      http://glaros.dtc.umn.edu/gkhome/metis/parmetis/download
+
+      After you have installed it, you should define the following in make.inc:
+        METISLIB = -L<metis directory> -lmetis
+        PARMETISLIB = -L<parmetis directory> -lparmetis
+        I_PARMETIS = -I<parmetis directory>/include -I<parmetis directory>/metis/include
+
+   1.4. C preprocessor definition CDEFS.
+
+   	In the header file SRC/Cnames.h, we use macros to determine how
+   	C routines should be named so that they are callable by Fortran.
+   	(Some vendor-supplied BLAS libraries do not have C interfaces. So the 
+    	re-naming is needed in order for the SuperLU BLAS calls (in C) to 
+    	interface with the Fortran-style BLAS.)
+   	The possible options for CDEFS are:
+
+       	o -DAdd_: Fortran expects a C routine to have an underscore
+		  postfixed to the name;
+		  (This is set as the default)
+        o -DNoChange: Fortran expects a C routine name to be identical to
+		      that compiled by C;
+        o -DUpCase: Fortran expects a C routine name to be all uppercase.
+   
+   1.5. Multicore and GPU (optional).
+   
+	To use OpenMP parallelism, you need to compile the code with the
+	following CPP definition:
+
+	     -D_OPENMP
+
+        and set the number of threads to be used as follows:
+
+ 	     setenv OMP_NUM_THREADS <##>
+
+   	To enable Nvidia GPU access, take the following 2 steps:
+      	  1) set the following Linux environment variable:
+
+	     setenv ACC GPU
+
+      	  2) Add the CUDA library location in make.inc:
+
+    	  ifeq "${ACC}" "GPU"
+      	       CFLAGS += -DGPU_ACC
+               INCS += -I<CUDA directory>/include
+      	       LIBS += -L<CUDA directory>/lib64 -lcublas -lcudart 
+    	  endif
+
+   A Makefile is provided in each subdirectory. The installation can be done
+   completely automatically by simply typing "make" at the top level.
+
+2. Using CMake build system. 
+   You will need to create a build tree from which to invoke CMake.
+   
+   First, in order to use parallel symbolic factorization function, you
+   need to install ParMETIS parallel ordering package, and define the
+   two environment variables: PARMETIS_ROOT and PARMETIS_BUILD_DIR
+
+     setenv PARMETIS_ROOT <Prefix directory of the ParMETIS installation>
+     setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/build/Linux-x86_64
+
+   Then, the installation procedure is the following.
+
+   From the top level directory, do:
+
+     	mkdir build ; cd build
+   	cmake .. \
+	  -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
+          -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include"
+
+  ( example:
+  setenv PARMETIS_ROOT ~/lib/dynamic/parmetis-4.0.3 
+  setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/build/Linux-x86_64 
+  cmake .. \
+    -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
+    -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
+    -DCMAKE_C_FLAGS="-std=c99 -g" \
+    -Denable_blaslib=OFF \
+    -DBUILD_SHARED_LIBS=OFF \
+    -DCMAKE_C_COMPILER=mpicc \
+    -DCMAKE_INSTALL_PREFIX=..
+  )
+
+   To actually build, type:
+   	make
+
+   To install the libraries, type:
+        make install
+
+   To run the installation test, type:
+        make test
+        (The outputs are in file: build/Testing/Temporary/LastTest.log)
+
+
+   ++++++++
+   Note on the C-Fortran name mangling handled by C preprocessor definition:
+   ++++++++
+   In the default setting, we assume that Fortran expects a C routine
+   to have an underscore postfixed to the name. Depending on the
+   compiler, you may need to define one of the following flags
+   during the cmake build to override the default setting:
+
+   cmake .. -DCMAKE_C_FLAGS="-DNoChange"
+
+   cmake .. -DCMAKE_C_FLAGS="-DUpCase"
+
+
+--------------
+| REFERENCES |
+--------------
+
+[1] SuperLU_DIST: A Scalable Distributed-Memory Sparse Direct Solver for
+    Unsymmetric Linear Systems.  Xiaoye S. Li and James W. Demmel.
+    ACM Trans. on Math. Software, Vol. 29, No. 2, June 2003, pp. 110-140.
+[2] Parallel Symbolic Factorization for Sparse LU with Static Pivoting.
+    L. Grigori, J. Demmel and X.S. Li. SIAM J. Sci. Comp., Vol. 29, Issue 3,
+    1289-1314, 2007.
+[3] A distributed CPU-GPU sparse direct solver. P. Sao, R. Vuduc and X.S. Li,
+    Proc. of EuroPar-2014 Parallel Processing, August 25-29, 2014.
+    Porto, Portugal.
+
+Xiaoye S. Li         Lawrence Berkeley National Lab, xsli at lbl.gov
+Laura Grigori        INRIA, France, Laura.Grigori at inria.fr
+Piyush Sao           Georgia Institute of Technology, piyush.feynman at gmail.com
+Ichitaro Yamazaki    Univ. of Tennessee, ic.yamazaki at gmail.com
+
+--------------------
+| RELEASE VERSIONS |
+--------------------
+
+  October 15, 2003   Version 2.0
+  October 1,  2007   Version 2.1
+  February 20, 2008  Version 2.2
+  October 15, 2008   Version 2.3
+  June 9, 2010       Version 2.4 
+  November 23, 2010  Version 2.5
+  March 31, 2013     Version 3.3
+  October 1, 2014    Version 4.0
+  July 15, 2014      Version 4.1
+  September 25, 2015 Version 4.2
+  December 31, 2015  Version 4.3
+  April 8, 2016      Version 5.0.0
+  May 15, 2016       Version 5.1.0
+  October 4, 2016    Version 5.1.1
+  December 31, 2016  Version 5.1.3
diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
new file mode 100644
index 0000000..b8341c9
--- /dev/null
+++ b/SRC/CMakeLists.txt
@@ -0,0 +1,127 @@
+set(headers
+    Cnames.h
+    cublas_utils.h
+    dcomplex.h
+    machines.h
+    psymbfact.h
+    superlu_defs.h
+    superlu_enum_consts.h
+    supermatrix.h
+    util_dist.h
+)
+
+# first: precision-independent files
+set(sources
+  sp_ienv.c
+  etree.c 
+  sp_colorder.c
+  get_perm_c.c
+  mmd.c
+  comm.c
+  memory.c
+  util.c
+  superlu_grid.c
+  pxerr_dist.c
+  superlu_timer.c
+  symbfact.c
+  psymbfact.c
+  psymbfact_util.c
+  get_perm_c_parmetis.c
+  mc64ad_dist.c
+  static_schedule.c
+  xerr_dist.c
+  smach_dist.c
+  dmach_dist.c
+)
+set_source_files_properties(superlu_timer.c PROPERTIES COMPILE_FLAGS -O0)
+
+if(enable_double)
+  list(APPEND headers superlu_ddefs.h)
+
+  list(APPEND sources
+    dlangs_dist.c
+    dgsequ_dist.c
+    dlaqgs_dist.c
+    dutil_dist.c
+    dmemory_dist.c
+    dmyblas2_dist.c
+    dsp_blas2_dist.c
+    dsp_blas3_dist.c
+    pdgssvx.c
+    pdgssvx_ABglobal.c
+    dreadhb.c
+    dreadrb.c
+    dreadtriple.c
+    dreadMM.c
+    pdgsequ.c
+    pdlaqgs.c
+    dldperm_dist.c
+    pdlangs.c
+    pdutil.c
+    pdsymbfact_distdata.c
+    ddistribute.c
+    pddistribute.c
+    pdgstrf.c
+    pdgstrf2.c
+    pdgstrs.c
+    pdgstrs1.c
+    pdgstrs_lsum.c
+    pdgstrs_Bglobal.c
+    pdgsrfs.c
+    pdgsmv.c
+    pdgsrfs_ABXglobal.c
+    pdgsmv_AXglobal.c
+    pdGetDiagU.c
+  )
+endif()
+
+if(enable_complex16)
+  list(APPEND headers superlu_zdefs.h)
+
+  list(APPEND sources
+    dcomplex_dist.c
+    zlangs_dist.c
+    zgsequ_dist.c
+    zlaqgs_dist.c
+    zutil_dist.c
+    zmemory_dist.c
+    zmyblas2_dist.c
+    zsp_blas2_dist.c
+    zsp_blas3_dist.c
+    pzgssvx.c
+    pzgssvx_ABglobal.c
+    zreadhb.c
+    zreadrb.c
+    zreadtriple.c
+    zreadMM.c
+    pzgsequ.c
+    pzlaqgs.c
+    zldperm_dist.c
+    pzlangs.c
+    pzutil.c
+    pzsymbfact_distdata.c
+    zdistribute.c
+    pzdistribute.c
+    pzgstrf.c
+    pzgstrf2.c
+    pzgstrs.c
+    pzgstrs1.c
+    pzgstrs_lsum.c
+    pzgstrs_Bglobal.c
+    pzgsrfs.c
+    pzgsmv.c
+    pzgsrfs_ABXglobal.c
+    pzgsmv_AXglobal.c
+    pzGetDiagU.c
+  )
+endif()
+
+# Build the library from the source list accumulated above.  The header list
+# is attached to the target as well; CMake variable names are case-sensitive,
+# so it must be referenced as ${headers} (the name used by set()/list() above),
+# not ${HEADERS}.
+add_library(superlu_dist ${sources} ${headers})
+# Link against MPI, BLAS, ParMETIS and the C math library.
+target_link_libraries(superlu_dist
+                      ${MPI_C_LIBRARIES} ${BLAS_LIB} ${PARMETIS_LIB} m)
+# Version the library file with the project version and major number.
+set_target_properties(superlu_dist PROPERTIES
+                      VERSION ${PROJECT_VERSION} SOVERSION ${VERSION_MAJOR}
+)
+
+# Install the library and the public headers under the install prefix.
+install(TARGETS superlu_dist DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
+install(FILES ${headers} DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
diff --git a/SRC/Cnames.h b/SRC/Cnames.h
new file mode 100644
index 0000000..b446c46
--- /dev/null
+++ b/SRC/Cnames.h
@@ -0,0 +1,365 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Macro definitions
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */
+#define __SUPERLU_CNAMES
+
+/*
+ * These macros define how C routines will be called.  ADD_ assumes that
+ * they will be called by fortran, which expects C routines to have an
+ * underscore postfixed to the name (Suns, and the Intel expect this).
+ * NOCHANGE indicates that fortran will be calling, and that it expects
+ * the name called by fortran to be identical to that compiled by the C
+ * (RS6K's do this).  UPCASE says it expects C routines called by fortran
+ * to be in all upcase (CRAY wants this). 
+ */
+
+/* Numeric tags for the naming conventions; F77_CALL_C is set to one of
+ * them below and tested by the #if blocks that follow. */
+#define ADD_       0
+#define NOCHANGE   1
+#define UPCASE     2
+#define C_CALL     3
+
+/* Select the convention from a user-supplied flag (-DUpCase, -DNoChange or
+ * -DAdd_).  NOTE(review): the three tests are independent, so defining more
+ * than one flag makes a later #define redefine F77_CALL_C. */
+#ifdef UpCase
+#define F77_CALL_C UPCASE
+#endif
+
+#ifdef NoChange
+#define F77_CALL_C NOCHANGE
+#endif
+
+#ifdef Add_
+#define F77_CALL_C ADD_
+#endif
+
+/* Nothing selected so far: default to the trailing-underscore convention. */
+#ifndef F77_CALL_C
+#define F77_CALL_C ADD_
+#endif
+
+#if (F77_CALL_C == ADD_)
+/*
+ * These defines set up the naming scheme required to have a fortran 77
+ * routine call a C routine
+ * No redefinition necessary to have following Fortran to C interface:
+ *           FORTRAN CALL               C DECLARATION
+ *           call dgemm(...)           void dgemm_(...)
+ *
+ * This is the default.
+ */
+/* These are the functions defined in F90 wrapper */
+#define f_create_gridinfo_handle         f_create_gridinfo_handle_
+#define f_create_options_handle          f_create_options_handle_
+#define f_create_ScalePerm_handle        f_create_scaleperm_handle_
+#define f_create_LUstruct_handle         f_create_lustruct_handle_
+#define f_create_SOLVEstruct_handle      f_create_solvestruct_handle_
+#define f_create_SuperMatrix_handle      f_create_supermatrix_handle_
+#define f_destroy_gridinfo_handle        f_destroy_gridinfo_handle_
+#define f_destroy_options_handle         f_destroy_options_handle_
+#define f_destroy_ScalePerm_handle       f_destroy_scaleperm_handle_
+#define f_destroy_LUstruct_handle        f_destroy_lustruct_handle_
+#define f_destroy_SOLVEstruct_handle     f_destroy_solvestruct_handle_
+#define f_destroy_SuperMatrix_handle     f_destroy_supermatrix_handle_
+#define f_create_SuperLUStat_handle      f_create_superlustat_handle_
+#define f_destroy_SuperLUStat_handle     f_destroy_superlustat_handle_
+#define f_get_gridinfo                   f_get_gridinfo_
+#define f_get_SuperMatrix                f_get_supermatrix_
+#define f_set_SuperMatrix                f_set_supermatrix_
+#define f_get_CompRowLoc_Matrix          f_get_comprowloc_matrix_ 
+#define f_set_CompRowLoc_Matrix          f_set_comprowloc_matrix_
+#define f_get_superlu_options            f_get_superlu_options_
+#define f_set_superlu_options            f_set_superlu_options_
+#define f_set_default_options            f_set_default_options_
+#define f_superlu_gridinit               f_superlu_gridinit_
+#define f_superlu_gridmap                f_superlu_gridmap_
+#define f_superlu_gridexit               f_superlu_gridexit_
+#define f_ScalePermstructInit            f_scalepermstructinit_
+#define f_ScalePermstructFree            f_scalepermstructfree_
+#define f_PStatInit                      f_pstatinit_
+#define f_PStatFree                      f_pstatfree_
+#define f_LUstructInit                   f_lustructinit_
+#define f_LUstructFree                   f_lustructfree_
+#define f_Destroy_LU                     f_destroy_lu_
+#define f_dCreate_CompRowLoc_Mat_dist    f_dcreate_comprowloc_mat_dist_
+#define f_zCreate_CompRowLoc_Mat_dist    f_zcreate_comprowloc_mat_dist_
+#define f_Destroy_CompRowLoc_Mat_dist    f_destroy_comprowloc_mat_dist_
+#define f_Destroy_SuperMat_Store_dist    f_destroy_supermat_store_dist_
+#define f_dSolveFinalize                 f_dsolvefinalize_
+#define f_zSolveFinalize                 f_zsolvefinalize_
+#define f_pdgssvx                        f_pdgssvx_
+#define f_pzgssvx                        f_pzgssvx_
+#define f_dcreate_dist_matrix            f_dcreate_dist_matrix_
+#define f_zcreate_dist_matrix            f_zcreate_dist_matrix_
+#define f_check_malloc                   f_check_malloc_
+#endif
+
+#if (F77_CALL_C == UPCASE)
+/*
+ * These defines set up the naming scheme required to have a fortran 77
+ * routine call a C routine 
+ * following Fortran to C interface:
+ *           FORTRAN CALL               C DECLARATION
+ *           call dgemm(...)           void DGEMM(...)
+ */
+/* BLAS */
+#define sasum_    SASUM
+#define isamax_   ISAMAX
+#define scopy_    SCOPY
+#define sscal_    SSCAL
+#define sger_     SGER
+#define snrm2_    SNRM2
+#define ssymv_    SSYMV
+#define sdot_     SDOT
+#define saxpy_    SAXPY
+#define ssyr2_    SSYR2
+#define srot_     SROT
+#define sgemv_    SGEMV
+#define strsv_    STRSV
+#define sgemm_    SGEMM
+#define strsm_    STRSM
+
+#define dasum_    DASUM
+#define idamax_   IDAMAX
+#define dcopy_    DCOPY
+#define dscal_    DSCAL
+#define dger_     DGER
+#define dnrm2_    DNRM2
+#define dsymv_    DSYMV
+#define ddot_     DDOT
+#define daxpy_    DAXPY
+#define dsyr2_    DSYR2
+#define drot_     DROT
+#define dgemv_    DGEMV
+#define dtrsv_    DTRSV
+#define dgemm_    DGEMM
+#define dtrsm_    DTRSM
+
+#define scasum_   SCASUM
+#define icamax_   ICAMAX
+#define ccopy_    CCOPY
+#define cscal_    CSCAL
+#define scnrm2_   SCNRM2
+#define caxpy_    CAXPY
+#define cgemv_    CGEMV
+#define ctrsv_    CTRSV
+#define cgemm_    CGEMM
+#define ctrsm_    CTRSM
+#define cgerc_    CGERC
+#define chemv_    CHEMV
+#define cher2_    CHER2
+
+#define dzasum_   DZASUM
+#define izamax_   IZAMAX
+#define zcopy_    ZCOPY
+#define zscal_    ZSCAL
+#define dznrm2_   DZNRM2
+#define zaxpy_    ZAXPY
+#define zgemv_    ZGEMV
+#define ztrsv_    ZTRSV
+#define zgemm_    ZGEMM
+#define ztrsm_    ZTRSM
+#define zgerc_    ZGERC
+#define zhemv_    ZHEMV
+#define zher2_    ZHER2
+#define zgeru_    ZGERU
+
+/*
+#define mc64id_dist     MC64ID_DIST
+#define mc64ad_dist     MC64AD_DIST
+*/
+#define c_bridge_dgssv_               C_BRIDGE_DGSSV
+#define c_fortran_slugrid_            C_FORTRAN_SLUGRID
+#define c_fortran_pdgssvx_            C_FORTRAN_PDGSSVX
+#define c_fortran_pdgssvx_ABglobal_   C_FORTRAN_PDGSSVX_ABGLOBAL
+#define c_fortran_pzgssvx_            C_FORTRAN_PZGSSVX
+#define c_fortran_pzgssvx_ABglobal_   C_FORTRAN_PZGSSVX_ABGLOBAL
+
+/* These are the functions defined in F90 wrapper */
+#define f_create_gridinfo_handle         F_CREATE_GRIDINFO_HANDLE
+#define f_create_options_handle          F_CREATE_OPTIONS_HANDLE
+#define f_create_ScalePerm_handle        F_CREATE_SCALEPERM_HANDLE
+#define f_create_LUstruct_handle         F_CREATE_LUSTRUCT_HANDLE
+#define f_create_SOLVEstruct_handle      F_CREATE_SOLVESTRUCT_HANDLE
+#define f_create_SuperMatrix_handle      F_CREATE_SUPERMATRIX_HANDLE
+#define f_destroy_gridinfo_handle        F_DESTROY_GRIDINFO_HANDLE
+#define f_destroy_options_handle         F_DESTROY_OPTIONS_HANDLE
+#define f_destroy_ScalePerm_handle       F_DESTROY_SCALEPERM_HANDLE
+#define f_destroy_LUstruct_handle        F_DESTROY_LUSTRUCT_HANDLE
+#define f_destroy_SOLVEstruct_handle     F_DESTROY_SOLVESTRUCT_HANDLE
+#define f_destroy_SuperMatrix_handle     F_DESTROY_SUPERMATRIX_HANDLE
+#define f_create_SuperLUStat_handle      F_CREATE_SUPERLUSTAT_HANDLE
+#define f_destroy_SuperLUStat_handle     F_DESTROY_SUPERLUSTAT_HANDLE
+#define f_get_gridinfo                   F_GET_GRIDINFO
+#define f_get_SuperMatrix                F_GET_SUPERMATRIX
+#define f_set_SuperMatrix                F_SET_SUPERMATRIX
+#define f_get_CompRowLoc_Matrix          F_GET_COMPROWLOC_MATRIX
+#define f_set_CompRowLoc_Matrix          F_SET_COMPROWLOC_MATRIX
+#define f_get_superlu_options            F_GET_SUPERLU_OPTIONS
+#define f_set_superlu_options            F_SET_SUPERLU_OPTIONS
+#define f_set_default_options            F_SET_DEFAULT_OPTIONS
+#define f_superlu_gridinit               F_SUPERLU_GRIDINIT
+#define f_superlu_gridmap                F_SUPERLU_GRIDMAP
+#define f_superlu_gridexit               F_SUPERLU_GRIDEXIT
+#define f_ScalePermstructInit            F_SCALEPERMSTRUCTINIT
+#define f_ScalePermstructFree            F_SCALEPERMSTRUCTFREE
+#define f_PStatInit                      F_PSTATINIT
+#define f_PStatFree                      F_PSTATFREE
+#define f_LUstructInit                   F_LUSTRUCTINIT
+#define f_LUstructFree                   F_LUSTRUCTFREE
+#define f_Destroy_LU                     F_DESTROY_LU
+#define f_dCreate_CompRowLoc_Mat_dist    F_DCREATE_COMPROWLOC_MAT_DIST
+#define f_zCreate_CompRowLoc_Mat_dist    F_ZCREATE_COMPROWLOC_MAT_DIST
+#define f_Destroy_CompRowLoc_Mat_dist    F_DESTROY_COMPROWLOC_MAT_DIST
+#define f_Destroy_SuperMat_Store_dist    F_DESTROY_SUPERMAT_STORE_DIST
+#define f_dSolveFinalize                 F_DSOLVEFINALIZE
+#define f_zSolveFinalize                 F_ZSOLVEFINALIZE
+#define f_pdgssvx                        F_PDGSSVX
+#define f_pzgssvx                        F_PZGSSVX
+#define f_dcreate_dist_matrix            F_DCREATE_DIST_MATRIX
+#define f_zcreate_dist_matrix            F_ZCREATE_DIST_MATRIX
+#define f_check_malloc                   F_CHECK_MALLOC
+#endif
+
+#if (F77_CALL_C == NOCHANGE)
+/*
+ * These defines set up the naming scheme required to have a fortran 77
+ * routine call a C routine 
+ * for following Fortran to C interface:
+ *           FORTRAN CALL               C DECLARATION
+ *           call dgemm(...)           void dgemm(...)
+ */
+/* BLAS */
+#define sasum_    sasum
+#define isamax_   isamax
+#define scopy_    scopy
+#define sscal_    sscal
+#define sger_     sger
+#define snrm2_    snrm2
+#define ssymv_    ssymv
+#define sdot_     sdot
+#define saxpy_    saxpy
+#define ssyr2_    ssyr2
+#define srot_     srot
+#define sgemv_    sgemv
+#define strsv_    strsv
+#define sgemm_    sgemm
+#define strsm_    strsm
+
+#define dasum_    dasum
+#define idamax_   idamax
+#define dcopy_    dcopy
+#define dscal_    dscal
+#define dger_     dger
+#define dnrm2_    dnrm2
+#define dsymv_    dsymv
+#define ddot_     ddot
+#define daxpy_    daxpy
+#define dsyr2_    dsyr2
+#define drot_     drot
+#define dgemv_    dgemv
+#define dtrsv_    dtrsv
+#define dgemm_    dgemm
+#define dtrsm_    dtrsm
+
+#define scasum_   scasum
+#define icamax_   icamax
+#define ccopy_    ccopy
+#define cscal_    cscal
+#define scnrm2_   scnrm2
+#define caxpy_    caxpy
+#define cgemv_    cgemv
+#define ctrsv_    ctrsv
+#define cgemm_    cgemm
+#define ctrsm_    ctrsm
+#define cgerc_    cgerc
+#define chemv_    chemv
+#define cher2_    cher2
+
+#define dzasum_   dzasum
+#define izamax_   izamax
+#define zcopy_    zcopy
+#define zscal_    zscal
+#define dznrm2_   dznrm2
+#define zaxpy_    zaxpy
+#define zgemv_    zgemv
+#define ztrsv_    ztrsv
+#define zgemm_    zgemm
+#define ztrsm_    ztrsm
+#define zgerc_    zgerc
+#define zhemv_    zhemv
+#define zher2_    zher2
+#define zgeru_    zgeru
+
+/*
+#define mc64id_dist         mc64id_dist
+#define mc64ad_dist         mc64ad_dist
+*/
+
+#define c_bridge_dgssv_               c_bridge_dgssv
+#define c_fortran_slugrid_            c_fortran_slugrid
+#define c_fortran_pdgssvx_            c_fortran_pdgssvx
+#define c_fortran_pdgssvx_ABglobal_   c_fortran_pdgssvx_abglobal
+#define c_fortran_pzgssvx_            c_fortran_pzgssvx
+#define c_fortran_pzgssvx_ABglobal_   c_fortran_pzgssvx_abglobal
+
+/* These are the functions defined in F90 wrapper.
+ * With the NOCHANGE convention each C name maps to the all-lowercase
+ * spelling of the same name, with no trailing underscore.  The table lists
+ * the same routines as the ADD_ and UPCASE tables, including
+ * f_zCreate_CompRowLoc_Mat_dist. */
+#define f_create_gridinfo_handle         f_create_gridinfo_handle
+#define f_create_options_handle          f_create_options_handle
+#define f_create_ScalePerm_handle        f_create_scaleperm_handle
+#define f_create_LUstruct_handle         f_create_lustruct_handle
+#define f_create_SOLVEstruct_handle      f_create_solvestruct_handle
+#define f_create_SuperMatrix_handle      f_create_supermatrix_handle
+#define f_destroy_gridinfo_handle        f_destroy_gridinfo_handle
+#define f_destroy_options_handle         f_destroy_options_handle
+#define f_destroy_ScalePerm_handle       f_destroy_scaleperm_handle
+#define f_destroy_LUstruct_handle        f_destroy_lustruct_handle
+#define f_destroy_SOLVEstruct_handle     f_destroy_solvestruct_handle
+#define f_destroy_SuperMatrix_handle     f_destroy_supermatrix_handle
+#define f_create_SuperLUStat_handle      f_create_superlustat_handle
+#define f_destroy_SuperLUStat_handle     f_destroy_superlustat_handle
+#define f_get_gridinfo                   f_get_gridinfo
+#define f_get_SuperMatrix                f_get_supermatrix
+#define f_set_SuperMatrix                f_set_supermatrix
+#define f_get_CompRowLoc_Matrix          f_get_comprowloc_matrix
+#define f_set_CompRowLoc_Matrix          f_set_comprowloc_matrix
+#define f_get_superlu_options            f_get_superlu_options
+#define f_set_superlu_options            f_set_superlu_options
+#define f_set_default_options            f_set_default_options
+#define f_superlu_gridinit               f_superlu_gridinit
+#define f_superlu_gridmap                f_superlu_gridmap
+#define f_superlu_gridexit               f_superlu_gridexit
+#define f_ScalePermstructInit            f_scalepermstructinit
+#define f_ScalePermstructFree            f_scalepermstructfree
+#define f_PStatInit                      f_pstatinit
+#define f_PStatFree                      f_pstatfree
+#define f_LUstructInit                   f_lustructinit
+#define f_LUstructFree                   f_lustructfree
+#define f_Destroy_LU                     f_destroy_lu
+#define f_dCreate_CompRowLoc_Mat_dist    f_dcreate_comprowloc_mat_dist
+#define f_zCreate_CompRowLoc_Mat_dist    f_zcreate_comprowloc_mat_dist
+#define f_Destroy_CompRowLoc_Mat_dist    f_destroy_comprowloc_mat_dist
+#define f_Destroy_SuperMat_Store_dist    f_destroy_supermat_store_dist
+#define f_dSolveFinalize                 f_dsolvefinalize
+#define f_zSolveFinalize                 f_zsolvefinalize
+#define f_pdgssvx                        f_pdgssvx
+#define f_pzgssvx                        f_pzgssvx
+#define f_dcreate_dist_matrix            f_dcreate_dist_matrix
+#define f_zcreate_dist_matrix            f_zcreate_dist_matrix
+#define f_check_malloc                   f_check_malloc
+#endif
+
+#endif /* __SUPERLU_CNAMES */
diff --git a/SRC/Makefile b/SRC/Makefile
new file mode 100644
index 0000000..c78083d
--- /dev/null
+++ b/SRC/Makefile
@@ -0,0 +1,91 @@
+#######################################################################
+#
+#  This makefile creates a library for distributed SuperLU.
+#  The files are organized as follows:
+#
+#       ALLAUX  -- Auxiliary routines called from all precisions
+#       DSLUSRC -- Double precision real serial SuperLU routines
+#       DPLUSRC -- Double precision real parallel SuperLU routines
+#       ZSLUSRC -- Double precision complex serial SuperLU routines
+#       ZPLUSRC -- Double precision complex parallel SuperLU routines
+#
+#  The library can be set up to include routines for any combination
+#  of the two precisions.  To create or add to the library, enter make
+#  followed by one or more of the precisions desired.  Some examples:
+#       make double
+#       make double complex16
+#  Alternatively, the command
+#       make
+#  without any arguments creates a library of both precisions.
+#  The library is called
+#       superlu.a
+#  and is created at the next higher directory level.
+#
+#  To remove the object files after the library is created, enter
+#       make clean
+#
+#######################################################################
+include ../make.inc
+#
+# Precision independent routines
+#
+ALLAUX 	= sp_ienv.o etree.o sp_colorder.o get_perm_c.o \
+	  mmd.o comm.o memory.o util.o superlu_grid.o \
+	  pxerr_dist.o superlu_timer.o symbfact.o \
+	  psymbfact.o psymbfact_util.o get_perm_c_parmetis.o mc64ad_dist.o \
+	  static_schedule.o xerr_dist.o smach_dist.o dmach_dist.o
+
+ifeq "${ACC}" "GPU"
+ALLAUX += cublas_utils.o
+endif
+
+#
+# Routines literally taken from SuperLU, but renamed with suffix _dist
+#
+DSLUSRC	= dlangs_dist.o dgsequ_dist.o dlaqgs_dist.o dutil_dist.o \
+	  dmemory_dist.o dmyblas2_dist.o dsp_blas2_dist.o dsp_blas3_dist.o
+ZSLUSRC	= dcomplex_dist.o zlangs_dist.o zgsequ_dist.o zlaqgs_dist.o \
+	  zutil_dist.o zmemory_dist.o zmyblas2_dist.o \
+	  zsp_blas2_dist.o zsp_blas3_dist.o
+
+#
+# Routines for double precision parallel SuperLU
+DPLUSRC = pdgssvx.o pdgssvx_ABglobal.o \
+	  dreadhb.o dreadrb.o dreadtriple.o dreadMM.o \
+	  pdgsequ.o pdlaqgs.o dldperm_dist.o pdlangs.o pdutil.o \
+	  pdsymbfact_distdata.o ddistribute.o pddistribute.o \
+	  pdgstrf.o pdgstrf2.o pdGetDiagU.o \
+	  pdgstrs.o pdgstrs1.o pdgstrs_lsum.o pdgstrs_Bglobal.o \
+	  pdgsrfs.o pdgsmv.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o 
+
+#
+# Routines for double complex parallel SuperLU
+ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \
+	  zreadhb.o zreadrb.o zreadtriple.o zreadMM.o \
+	  pzgsequ.o pzlaqgs.o zldperm_dist.o pzlangs.o pzutil.o \
+	  pzsymbfact_distdata.o zdistribute.o pzdistribute.o \
+	  pzgstrf.o pzgstrf2.o pzGetDiagU.o \
+	  pzgstrs.o pzgstrs1.o pzgstrs_lsum.o pzgstrs_Bglobal.o \
+	  pzgsrfs.o pzgsmv.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o 
+
+# None of these targets produces a file named after the target; declare them
+# phony so that a stray file called e.g. "clean" cannot mask the rule.
+.PHONY: all double complex16 clean
+
+all:  double complex16
+
+# Archive the double precision real objects into $(DSUPERLULIB).
+double: $(DSLUSRC) $(DPLUSRC) $(ALLAUX)
+	$(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \
+		$(DSLUSRC) $(DPLUSRC) $(ALLAUX)
+	$(RANLIB) $(DSUPERLULIB)
+
+# Archive the double precision complex objects into the same $(DSUPERLULIB).
+complex16: $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX)
+	$(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \
+		$(ZSLUSRC) $(ZPLUSRC) $(ALLAUX)
+	$(RANLIB) $(DSUPERLULIB)
+
+
+# Suffix rules: compile one C or Fortran source file into its object file.
+.c.o:
+	$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c $< $(VERBOSE)
+
+.f.o:
+	$(FORTRAN) $(FFLAGS) -c $< $(VERBOSE)
+
+# Remove the object files and the library archive.
+clean:
+	rm -f *.o $(DSUPERLULIB)
diff --git a/SRC/comm.c b/SRC/comm.c
new file mode 100644
index 0000000..31cae51
--- /dev/null
+++ b/SRC/comm.c
@@ -0,0 +1,124 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Broadcast an array of *dtype* numbers
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Broadcast an array of *dtype* numbers. The communication pattern
+ *   is a tree with number of branches equal to NBRANCHES.
+ *   The process ranks are between 0 and Np-1.
+ * 
+ *   The following two pairs of graphs give different ways of viewing the same
+ *   algorithm.  The first pair shows the trees as they should be visualized
+ *   when examining the algorithm.  The second pair are isomorphic graphs
+ *   of the first, which show the actual pattern of data movement.
+ *   Note that a tree broadcast with NBRANCHES = 2 is isomorphic with a
+ *   hypercube broadcast (however, it does not require the nodes be a
+ *   power of two to work).
+ *
+ *    TREE BROADCAST, NBRANCHES = 2     *    TREE BROADCAST, NBRANCHES = 3
+ *       
+ *     root=2
+ * i=4   &______________                *
+ *       |              \               *       root=2
+ * i=2   &______         &______        * i=3     &______________________
+ *       |      \        |      \       *         |          \           \
+ * i=1   &__     &__     &__     &__    * i=1     &______     &______     &__
+ *       |  \    |  \    |  \    |  \   *         |  \   \    |  \   \    |  \
+ *       2   3   4   5   6   7   0   1  *         2   3   4   5   6   7   0   1
+ *
+ *
+ *          ISOMORPHIC GRAPHS OF ABOVE, SHOWN IN MORE FAMILIAR TERMS:
+ *
+ *                2                                           2
+ *       _________|_________                       ___________|____________
+ *      /         |         \                     /           |      |     \
+ *     6          4          3                   5            0      3      4
+ *    / \         |                             / \           |
+ *   0   7        5                            6   7          1
+ *   |
+ *   1
+ *
+ *
+ * Arguments
+ * =========
+ * 
+ * scope
+ * </pre>
+ */
+
+/*
+ * Broadcast buf from process `root` to the other processes of the selected
+ * scope along a tree with `nbranches` branches per level.
+ *
+ * scope selects the communicator: COMM_COLUMN uses grid->cscp and ROW uses
+ * grid->rscp.  For any other value, or when the scope holds fewer than two
+ * processes, the routine returns without communicating.
+ * On non-root processes *recvcnt receives the number of dtype items that
+ * arrived; the root does not write *recvcnt.
+ *
+ * NOTE(review): the column case is spelled COMM_COLUMN but the row case is
+ * spelled ROW; confirm ROW is the intended scope constant.
+ */
+void
+bcast_tree(void *buf, int count, MPI_Datatype dtype, int root, int tag,
+	   gridinfo_t *grid, int scope, int *recvcnt)
+
+{
+    int Iam, i, j, Np, nbranches = 2;
+    int destdist; /* The distance of the destination node. */
+    int mydist;   /* My distance from root. */
+    superlu_scope_t *scp;
+    MPI_Status status;
+
+    if ( scope == COMM_COLUMN ) scp = &grid->cscp;
+    else if ( scope == ROW ) scp = &grid->rscp;
+    else return; /* Unknown scope: never dereference an unset scp. */
+    Np = scp->Np;
+    if ( Np < 2 ) return;
+    Iam = scp->Iam;
+    
+    if ( Iam == root ) {
+	/* Find the smallest power of nbranches that is >= Np ... */
+	for (i = nbranches; i < Np; i *= nbranches);
+	/* ... then walk down the levels, sending to distance i*j at each. */
+	for (i /= nbranches; i > 0; i /= nbranches) {
+	    for (j = 1; j < nbranches; ++j) {
+		destdist = i*j;
+		if ( destdist < Np )
+		    MPI_Send( buf, count, dtype, (Iam+destdist)%Np, 
+			     tag, scp->comm );
+	    }
+	}
+    } else {
+	mydist = (Np + Iam - root) % Np;
+	for (i = nbranches; i < Np; i *= nbranches);
+	/* Largest power of nbranches below Np that divides mydist. */
+	for (i /= nbranches; (mydist%i); i /= nbranches);
+/*	MPI_Probe( MPI_ANY_SOURCE, tag, scp->comm, &status );*/
+	MPI_Recv( buf, count, dtype, MPI_ANY_SOURCE, tag, scp->comm, &status );
+	MPI_Get_count( &status, dtype, recvcnt );
+
+	/* I need to send data to others. */
+	while ( (i > 1) && !(mydist%i) ) {
+	    i /= nbranches;
+	    for (j = 1; j < nbranches; ++j) {
+		destdist = mydist + j*i;
+		if ( destdist < Np )
+		    MPI_Send( buf, *recvcnt, dtype, (root+destdist)%Np, 
+			     tag, scp->comm );
+	    }
+	}
+    }
+} /* BCAST_TREE */
+
+
+
+
+
+
+
diff --git a/SRC/cublas_utils.c b/SRC/cublas_utils.c
new file mode 100644
index 0000000..7157e6d
--- /dev/null
+++ b/SRC/cublas_utils.c
@@ -0,0 +1,109 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+#include <stdio.h>
+#include "cublas_utils.h"
+
+/* Print the CUDA runtime version and, for every device reported by
+ * cudaGetDeviceCount(), its name, compute capability, memory sizes and
+ * registers per block.  Nothing is returned; all output goes to stdout. */
+ void DisplayHeader()
+{
+    const int kb = 1024;
+    const int mb = kb * kb;
+
+    printf("CUDA version:   v %d\n",CUDART_VERSION);
+
+    /* List no devices at all if the count cannot be queried. */
+    int devCount = 0;
+    if (cudaGetDeviceCount(&devCount) != cudaSuccess)
+        devCount = 0;
+    printf( "CUDA Devices: \n \n"); 
+
+    for(int i = 0; i < devCount; ++i)
+    {
+        struct cudaDeviceProp props;
+        /* Skip a device whose properties cannot be read rather than
+         * printing an uninitialized structure. */
+        if (cudaGetDeviceProperties(&props, i) != cudaSuccess)
+            continue;
+        printf("%d : %s %d %d\n",i, props.name,props.major,props.minor );
+        /* The memory sizes are size_t fields of cudaDeviceProp, so they
+         * are printed with %zu. */
+        printf("  Global memory:   %zu mb \n", props.totalGlobalMem / mb);
+        printf("  Shared memory:   %zu kb \n", props.sharedMemPerBlock / kb );
+        printf("  Constant memory: %zu kb \n", props.totalConstMem / kb );
+        printf("  Block registers: %d \n\n", props.regsPerBlock );
+
+        /* TODO: also report warp size, threads per block, and the maximum
+         * block and grid dimensions. */
+    }
+}
+
+
+/* Translate a cuBLAS status code into the name of its enumerator.  Codes
+ * that are not listed below yield "unknown error". */
+const char* cublasGetErrorString(cublasStatus_t status)
+{
+    const char *name = "unknown error";
+
+    switch (status) {
+    case CUBLAS_STATUS_SUCCESS:
+        name = "CUBLAS_STATUS_SUCCESS";
+        break;
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+        name = "CUBLAS_STATUS_NOT_INITIALIZED";
+        break;
+    case CUBLAS_STATUS_ALLOC_FAILED:
+        name = "CUBLAS_STATUS_ALLOC_FAILED";
+        break;
+    case CUBLAS_STATUS_INVALID_VALUE:
+        name = "CUBLAS_STATUS_INVALID_VALUE";
+        break;
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+        name = "CUBLAS_STATUS_ARCH_MISMATCH";
+        break;
+    case CUBLAS_STATUS_MAPPING_ERROR:
+        name = "CUBLAS_STATUS_MAPPING_ERROR";
+        break;
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+        name = "CUBLAS_STATUS_EXECUTION_FAILED";
+        break;
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+        name = "CUBLAS_STATUS_INTERNAL_ERROR";
+        break;
+    }
+    return name;
+}
+
+/* Pass a CUDA runtime status through unchanged.  When DEBUG or _DEBUG is
+ * defined, a status other than cudaSuccess is first reported on stderr and
+ * then asserted on.
+ * NOTE(review): assert() is used here while this file does not include
+ * <assert.h> directly; confirm it is provided by the CUDA headers. */
+inline
+cudaError_t checkCuda(cudaError_t result)
+{
+#if defined(DEBUG) || defined(_DEBUG)
+    if (result == cudaSuccess)
+        return result;
+    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
+    assert(result == cudaSuccess);
+#endif
+    return result;
+}
+
+/* Pass a cuBLAS status through unchanged.  When DEBUG or _DEBUG is defined,
+ * a status other than CUBLAS_STATUS_SUCCESS is first reported on stderr and
+ * then asserted on. */
+cublasStatus_t checkCublas(cublasStatus_t result)
+{
+#if defined(DEBUG) || defined(_DEBUG)
+    if (result == CUBLAS_STATUS_SUCCESS)
+        return result;
+    fprintf(stderr, "CUDA Blas Runtime Error: %s\n", cublasGetErrorString(result));
+    assert(result == CUBLAS_STATUS_SUCCESS);
+#endif
+    return result;
+}
+
+
+/* Create a cuBLAS handle and return it.  The status of cublasCreate() is
+ * passed through checkCublas().  The handle starts out as NULL so that the
+ * caller receives a well-defined value even if creation fails. */
+cublasHandle_t create_handle ()
+{
+       cublasHandle_t handle = NULL;
+       checkCublas(cublasCreate(&handle));
+       return handle;
+ }
+
+/* Destroy the given cuBLAS handle; the status returned by cublasDestroy()
+ * is passed through checkCublas(). */
+ void destroy_handle (cublasHandle_t handle)
+ {
+      cublasStatus_t status = cublasDestroy(handle);
+      checkCublas(status);
+ }
+
diff --git a/SRC/cublas_utils.h b/SRC/cublas_utils.h
new file mode 100644
index 0000000..9c457ab
--- /dev/null
+++ b/SRC/cublas_utils.h
@@ -0,0 +1,34 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+
+#ifndef CUBLAS_UTILS_H
+#define CUBLAS_UTILS_H
+
+#include <cublas_v2.h>
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+#include "cuda_runtime.h"
+
+/* Print the CUDA runtime version and a summary of each CUDA device. */
+extern void DisplayHeader();
+/* Return the enumerator name of a cuBLAS status code as a string. */
+extern const char* cublasGetErrorString(cublasStatus_t status);
+/* Return the given CUDA status unchanged; in DEBUG/_DEBUG builds a failure
+ * is reported on stderr and asserted on. */
+extern cudaError_t checkCuda(cudaError_t);
+/* Same as checkCuda(), for cuBLAS status codes. */
+extern cublasStatus_t checkCublas(cublasStatus_t);
+/* Create a cuBLAS handle with cublasCreate() and return it. */
+extern cublasHandle_t create_handle ();
+/* Destroy a cuBLAS handle with cublasDestroy(). */
+extern void destroy_handle (cublasHandle_t handle);
+
+#endif 
diff --git a/SRC/dSchCompUdt-2Ddynamic.c b/SRC/dSchCompUdt-2Ddynamic.c
new file mode 100644
index 0000000..360861f
--- /dev/null
+++ b/SRC/dSchCompUdt-2Ddynamic.c
@@ -0,0 +1,525 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief This file contains the main loop of pdgstrf which involves rank k
+ *        update of the Schur complement.
+ *        Uses 2D partitioning for the scatter phase.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+
+#define SCHEDULE_STRATEGY guided 
+double tt_start;
+double tt_end;
+
+if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+    int cum_nrow = 0; /* cumulative number of nonzero rows in L(:,k) */
+    int temp_nbrow;   /* nonzero rows in current block L(i,k) */
+    lptr  = lptr0;
+    luptr = luptr0;
+    /**
+     * Separating L blocks into the top part within look-ahead window
+     * and the remaining ones.
+     */
+     int lookAheadBlk=0, RemainBlk=0;
+
+     tt_start = SuperLU_timer_();
+
+     /* Loop through all blocks in L(:,k) to set up pointers to the start 
+      * of each block in the data arrays.
+      *   - lookAheadFullRow[i] := number of nonzero rows from block 0 to i
+      *   - lookAheadStRow[i] := number of nonzero rows before block i
+      *   - lookAhead_lptr[i] := point to the start of block i in L's index[] 
+      *   - (ditto Remain_Info[i])
+      */
+     for (int i = 0; i < nlb; ++i) {
+	 ib = lsub[lptr];            /* block number of L(i,k). */
+	 temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+        
+	 int look_up_flag = 1; /* assume ib is outside look-up window */
+	 for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers ); ++j)
+	     {
+		 if(ib == perm_c_supno[j]) {
+		     look_up_flag=0; /* flag ib is within look-up window */
+                     break; /* Sherry -- can exit the loop?? */
+                 }
+	     }
+	 
+	 if( look_up_flag == 0 ) { /* ib is within look up window */
+	     if (lookAheadBlk==0) {
+		 lookAheadFullRow[lookAheadBlk] = temp_nbrow;
+	     } else {
+		 lookAheadFullRow[lookAheadBlk] = temp_nbrow+lookAheadFullRow[lookAheadBlk-1];   
+	     }
+	     lookAheadStRow[lookAheadBlk] = cum_nrow;
+	     lookAhead_lptr[lookAheadBlk] = lptr;
+	     lookAhead_ib[lookAheadBlk] = ib; 
+	     lookAheadBlk++;
+	 } else { /* ib is not in look up window */
+
+	     if (RemainBlk==0) {
+		 Remain_info[RemainBlk].FullRow = temp_nbrow;
+	     } else {
+		 Remain_info[RemainBlk].FullRow = temp_nbrow+Remain_info[RemainBlk-1].FullRow;   
+	     }
+
+             RemainStRow[RemainBlk] = cum_nrow;
+             // Remain_lptr[RemainBlk] = lptr;
+	     Remain_info[RemainBlk].lptr = lptr;
+	     // Remain_ib[RemainBlk] = ib; 
+	     Remain_info[RemainBlk].ib = ib; 
+	     RemainBlk++;
+	 }
+	 
+         cum_nrow +=temp_nbrow;
+	 
+	 lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+	 lptr += temp_nbrow;     /* Move to next block */
+	 luptr += temp_nbrow;
+     }  /* for i ... all blocks in L(:,k) */
+
+     lptr = lptr0;
+     luptr = luptr0;
+
+     /* leading dimension of L buffer */
+#if 0
+     int LDlookAhead_LBuff = lookAheadFullRow[lookAheadBlk-1]; /* may go negative.*/
+#else /* Piyush fix */
+     int LDlookAhead_LBuff = lookAheadBlk==0? 0 :lookAheadFullRow[lookAheadBlk-1];
+#endif
+
+     /* Loop through the look-ahead blocks to copy Lval into the buffer */
+#ifdef _OPENMP  /* _OPENMP is the macro OpenMP compilers define; this guard wraps only a disabled pragma */
+     /* #pragma omp parallel for -- why not?? Sherry */
+#endif
+     for (int i = 0; i < lookAheadBlk; ++i) {
+	 int StRowDest  = 0;
+	 int temp_nbrow;
+	 if (i==0) {
+	     temp_nbrow = lookAheadFullRow[0];
+	 } else {
+	     StRowDest   = lookAheadFullRow[i-1];
+	     temp_nbrow  = lookAheadFullRow[i]-lookAheadFullRow[i-1];
+	 }
+	 
+	 int StRowSource=lookAheadStRow[i];
+	 
+	 /* Now copying the matrix*/
+	 // #pragma omp parallel for (gives slow down)
+	 for (int j = 0; j < knsupc; ++j) {
+	     memcpy(&lookAhead_L_buff[StRowDest+j*LDlookAhead_LBuff],
+		    &lusup[luptr+j*nsupr+StRowSource],
+		    temp_nbrow * sizeof(double) );
+	 }
+     }
+
+     int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+
+    /* Loop through the remaining blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for 
+#endif
+     for (int i = 0; i < RemainBlk; ++i) {
+	 int StRowDest  = 0;
+	 int temp_nbrow;
+         if (i==0)  {
+	     temp_nbrow = Remain_info[0].FullRow;
+	 } else  {
+	     StRowDest   = Remain_info[i-1].FullRow;
+	     temp_nbrow  = Remain_info[i].FullRow-Remain_info[i-1].FullRow;
+	 }
+
+	 int StRowSource=RemainStRow[i];
+
+	 /* Now copying the matrix*/
+	 // #pragma omp parallel for (gives slow down)
+	 for (int j = 0; j < knsupc; ++j) {
+	     // printf("StRowDest %d LDRemain_LBuff %d StRowSource %d \n", StRowDest ,LDRemain_LBuff ,StRowSource );
+	     memcpy(&Remain_L_buff[StRowDest+j*LDRemain_LBuff],
+		    &lusup[luptr+j*nsupr+StRowSource],
+                    temp_nbrow * sizeof(double) );
+	 }
+     } /* parallel for i ... */
+
+#if ( PRNTlevel>=1 )
+     tt_end = SuperLU_timer_();
+     GatherLTimer += tt_end - tt_start;
+#endif
+#if 0
+     LookAheadRowSepMOP  +=  2*knsupc*(lookAheadFullRow[lookAheadBlk-1]+Remain_info[RemainBlk-1].FullRow );
+#else
+     int_t lnbrow, rnbrow; /* number of nonzero rows in look-ahead window
+                              or remaining part.  */
+     lnbrow = lookAheadBlk==0 ? 0  : lookAheadFullRow[lookAheadBlk-1];
+     rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+     nbrow = lnbrow + rnbrow; /* total number of rows in L */
+     LookAheadRowSepMOP += 2*knsupc*(nbrow);
+#endif     
+     
+     /**********************
+      * Gather U blocks *
+      **********************/
+
+     tt_start = SuperLU_timer_();
+#if 0     
+     nbrow = lookAheadFullRow[lookAheadBlk-1]+Remain_info[RemainBlk-1].FullRow;
+#endif
+
+     if ( nbrow > 0 ) { /* L(:,k) is not empty */
+	 /*
+	  * Counting U blocks
+	  */
+	 ncols = 0; /* total number of nonzero columns in U(k,:) */
+	 ldu   = 0;
+	 full  = 1; /* flag the U block is indeed 'full', containing segments
+	               of same length. No need padding 0 */
+	 int temp_ncols=0;
+
+         /* Loop through all blocks in U(k,:) to set up pointers to the start
+          * of each block in the data arrays, store them in Ublock_info[j]
+          * for block U(k,j).
+  	  */
+	 for (j = jj0; j < nub; ++j) { /* jj0 was set to 0 */
+	     temp_ncols = 0;
+	     arrive_at_ublock(
+			      j, &iukp, &rukp, &jb, &ljb, &nsupc,
+			      iukp0, rukp0, usub, perm_u, xsup, grid
+			      );
+	     Ublock_info[j].iukp = iukp;
+	     Ublock_info[j].rukp = rukp;
+	     Ublock_info[j].jb = jb;
+	     
+	     /* Prepare to call GEMM. */
+	     jj = iukp;
+	     
+	     for (; jj < iukp+nsupc; ++jj) {
+		 segsize = klst - usub[jj];
+		 if ( segsize ) {
+                    ++temp_ncols;
+                    if ( segsize != ldu ) full = 0; /* need padding 0 */
+                    if ( segsize > ldu ) ldu = segsize;
+		 }
+	     }
+
+	     Ublock_info[j].full_u_cols = temp_ncols;
+	     ncols += temp_ncols;
+	 }
+
+	 /* Now doing prefix sum on full_u_cols.
+	  * After this, full_u_cols is the number of nonzero columns
+          * from block 0 to block j.
+          */
+	 for ( j = jj0+1; j < nub; ++j) {
+	     Ublock_info[j].full_u_cols += Ublock_info[j-1].full_u_cols;
+	 }
+            
+	 tempu = bigU; /* buffer the entire row block U(k,:) */
+
+         /* Gather U(k,:) into buffer bigU[] to prepare for GEMM */
+#ifdef _OPENMP        
+#pragma omp parallel for private(j,iukp,rukp,tempu, jb, nsupc,ljb,segsize,\
+	lead_zero, jj, i) \
+        default (shared) schedule(SCHEDULE_STRATEGY)
+#endif
+        for (j = jj0; j < nub; ++j) { /* jj0 was set to 0 */
+
+            if(j==jj0) tempu = bigU;
+            else tempu = bigU + ldu*Ublock_info[j-1].full_u_cols;
+
+            /* == processing each of the remaining columns == */
+            arrive_at_ublock(j, &iukp, &rukp, &jb, &ljb, &nsupc,
+			     iukp0, rukp0, usub,perm_u, xsup, grid);
+
+            /* Copy from U(k,:) to tempu[], padding zeros.  */            
+            for (jj = iukp; jj < iukp+nsupc; ++jj) {
+                segsize = klst - usub[jj];
+                if ( segsize ) {
+                    lead_zero = ldu - segsize;
+                    for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+                    tempu += lead_zero;
+                    for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i];
+                    rukp += segsize;
+                    tempu += segsize;
+                }
+            }
+
+            rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+
+        }   /* parallel for j:jjj_st..jjj */
+
+        tempu = bigU;  /* setting to the start of padded U(k,:) */
+
+    }  /* end if (nbrow>0) */
+
+#if ( PRNTlevel>=1 )
+    GatherUTimer += SuperLU_timer_() - tt_start;
+#endif
+    GatherMOP += 2*ldu*ncols;
+
+    int Lnbrow   = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
+    int Rnbrow   = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+    int jj_cpu=nub;       /*limit between CPU and GPU */
+    int thread_id;
+    tempv = bigV;
+
+    /**************************************
+     * Perform GEMM followed by Scatter *
+     **************************************/
+
+    if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
+        /* Perform a large GEMM call */
+        ncols = Ublock_info[nub-1].full_u_cols;
+        schur_flop_counter += 2 * (double)Lnbrow * (double)ldu * (double)ncols;
+        stat->ops[FACT]    += 2 * (double)Lnbrow * (double)ldu * (double)ncols;
+
+        /***************************************************************
+         * Updating look-ahead blocks in both L and U look-ahead windows.
+         ***************************************************************/
+#ifdef _OPENMP
+#pragma omp parallel default (shared) private(thread_id,tt_start,tt_end)
+     {
+ 	thread_id = omp_get_thread_num();
+ 
+ 	/* Ideally, should organize the loop as:
+                for (j = 0; j < nub; ++j) {
+                    for (lb = 0; lb < lookAheadBlk; ++lb) {
+ 	               L(lb,k) X U(k,j) -> tempv[]
+                    }
+                }
+ 	   But now, we use collapsed loop to achieve more parallelism.
+ 	   Total number of block updates is:
+ 	      (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
+ 	*/
+#pragma omp for \
+    private (j,i,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    schedule(dynamic)
+#else /* not use _OPENMP */
+ 	thread_id = 0;
+#endif
+ 	/* Each thread is assigned one loop index ij, responsible for 
+ 	   block update L(lb,k) * U(k,j) -> tempv[]. */
+        for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
+	    if ( thread_id == 0 ) tt_start = SuperLU_timer_();
+
+            int j   = ij/lookAheadBlk + jj0; /* jj0 was set to 0 */
+            int lb  = ij%lookAheadBlk;
+
+            int* indirect_thread    = indirect + ldt*thread_id;
+            int* indirect2_thread   = indirect2 + ldt*thread_id;
+            double* tempv1 = bigV + thread_id*ldt*ldt; 
+
+            /* Getting U block U(k,j) information */
+            /* unsigned long long ut_start, ut_end; */
+            int_t rukp =  Ublock_info[j].rukp;
+            int_t iukp =  Ublock_info[j].iukp;
+            int jb   =  Ublock_info[j].jb;
+            int nsupc = SuperSize(jb);
+            int ljb = LBj (jb, grid);  /* destination column block */
+            int st_col;
+            int ncols;
+            if ( j>jj0 ) { /* jj0 was set to 0 */
+                ncols  = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
+                st_col = Ublock_info[j-1].full_u_cols;
+            } else {
+                ncols  = Ublock_info[j].full_u_cols;
+                st_col = 0;   
+            }
+
+            /* Getting L block L(i,k) information */
+            int_t lptr = lookAhead_lptr[lb];
+            int ib   = lookAhead_ib[lb];
+            int temp_nbrow = lsub[lptr+1];
+            lptr += LB_DESCRIPTOR;
+            int cum_nrow = (lb==0 ? 0 : lookAheadFullRow[lb-1]);
+
+#if ( PRNTlevel>= 1)
+	    gemm_max_m = SUPERLU_MAX(gemm_max_m, temp_nbrow);
+	    gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
+	    gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
+#endif
+
+#if defined (USE_VENDOR_BLAS)            
+            dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                  &lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+                  &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+#else
+            dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                  &lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+                  &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#endif
+#if ( PRNTlevel>=1 )
+	    if (thread_id == 0) {
+		tt_end = SuperLU_timer_();
+		LookAheadGEMMTimer += tt_end - tt_start;
+		tt_start = tt_end;
+	    }
+#endif
+            if ( ib < jb ) {
+                dscatter_u (
+				 ib, jb,
+				 nsupc, iukp, xsup,
+				 klst, temp_nbrow,
+				 lptr, temp_nbrow, lsub,
+				 usub, tempv1,
+				 Ufstnz_br_ptr, Unzval_br_ptr,
+				 grid
+			        );
+            } else {
+                dscatter_l (
+				 ib, ljb, 
+				 nsupc, iukp, xsup,
+ 				 klst, temp_nbrow,
+				 lptr, temp_nbrow,
+				 usub, lsub, tempv1,
+				 indirect_thread, indirect2_thread,
+				 Lrowind_bc_ptr, Lnzval_bc_ptr,
+				 grid
+				);
+            }
+
+#if ( PRNTlevel>=1 )
+	    if (thread_id == 0)
+		LookAheadScatterTimer += SuperLU_timer_() - tt_start;
+#endif
+        } /* end omp for ij = ... */
+#ifdef _OPENMP
+    } /* end omp parallel */
+#endif
+        LookAheadGEMMFlOp  += 2*(double)Lnbrow * (double)ldu * (double)ncols;
+        /* stat->ops[FACT] was already credited with these GEMM flops before the loop; adding them again would double-count. */
+        LookAheadScatterMOP += 3*Lnbrow*ncols;
+    } /* end if Lnbrow>0 && ldu>0 && ncols>0 */
+    
+    /***************************************************************
+     * Updating remaining rows and columns on CPU.
+     ***************************************************************/
+    Rnbrow  = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+    ncols   = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+
+    schur_flop_counter  += 2 * (double)Rnbrow * (double)ldu * (double)ncols;
+    stat->ops[FACT]     += 2 * (double)Rnbrow * (double)ldu * (double)ncols;
+
+#ifdef _OPENMP
+#pragma omp parallel default(shared) private(thread_id,tt_start,tt_end)
+    {
+	thread_id = omp_get_thread_num();
+ 
+	/* Ideally, should organize the loop as:
+               for (j = 0; j < jj_cpu; ++j) {
+                   for (lb = 0; lb < RemainBlk; ++lb) {
+	               L(lb,k) X U(k,j) -> tempv[]
+                   }
+               }
+	   But now, we use collapsed loop to achieve more parallelism.
+	   Total number of block updates is:
+	      (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
+	*/
+#pragma omp for \
+    private (j,i,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    schedule(dynamic)
+#else /* not use _OPENMP */
+    thread_id = 0;
+#endif
+	/* Each thread is assigned one loop index ij, responsible for 
+	   block update L(lb,k) * U(k,j) -> tempv[]. */
+    for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) { /* jj_cpu := nub */
+	int j   = ij / RemainBlk + jj0; 
+	int lb  = ij % RemainBlk;
+
+	int* indirect_thread = indirect + ldt*thread_id;
+	int* indirect2_thread = indirect2 + ldt*thread_id;
+	double* tempv1 = bigV + thread_id*ldt*ldt; 
+
+	/* Getting U block U(k,j) information */
+	/* unsigned long long ut_start, ut_end; */
+	int_t rukp =  Ublock_info[j].rukp;
+	int_t iukp =  Ublock_info[j].iukp;
+	int jb   =  Ublock_info[j].jb;
+	int nsupc = SuperSize(jb);
+	int ljb = LBj (jb, grid);
+	int st_col;
+	int ncols;
+	if ( j>jj0 ) {
+	    ncols  = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
+	    st_col = Ublock_info[j-1].full_u_cols;
+	} else {
+	    ncols  = Ublock_info[j].full_u_cols;
+	    st_col = 0;   
+	}
+
+	/* Getting L block L(i,k) information */
+	int_t lptr = Remain_info[lb].lptr;
+	int ib   = Remain_info[lb].ib;
+	int temp_nbrow = lsub[lptr+1];
+	lptr += LB_DESCRIPTOR;
+	int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
+
+#if ( PRNTlevel>=1 )
+	if ( thread_id==0 ) tt_start = SuperLU_timer_();
+#endif
+
+	/* calling GEMM */
+#if defined (USE_VENDOR_BLAS)
+	dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+	      &Remain_L_buff[(knsupc-ldu)*Rnbrow+cum_nrow], &Rnbrow,
+	      &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+#else
+	dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+	      &Remain_L_buff[(knsupc-ldu)*Rnbrow+cum_nrow], &Rnbrow,
+	      &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#endif
+
+#if ( PRNTlevel>=1 )
+	if (thread_id==0) {
+	    tt_end = SuperLU_timer_();
+	    RemainGEMMTimer += tt_end - tt_start;
+	    tt_start = tt_end;
+	}
+#endif
+
+	/* Now scattering the block */
+	if ( ib<jb ) {
+	    dscatter_u(
+			    ib, jb,
+			    nsupc, iukp, xsup,
+			    klst, temp_nbrow,
+			    lptr, temp_nbrow,lsub,
+			    usub, tempv1,
+			    Ufstnz_br_ptr, Unzval_br_ptr,
+			    grid
+		           );
+	} else {
+	    dscatter_l(
+			    ib, ljb,
+			    nsupc, iukp, xsup,
+			    klst, temp_nbrow,
+			    lptr, temp_nbrow,
+			    usub, lsub, tempv1,
+			    indirect_thread, indirect2_thread,
+			    Lrowind_bc_ptr,Lnzval_bc_ptr,
+			    grid
+			   );
+	}
+
+#if ( PRNTlevel>=1 )
+	if (thread_id==0) RemainScatterTimer += SuperLU_timer_() - tt_start;
+#endif
+    } /* end omp for (int ij =...) */
+#ifdef _OPENMP
+    } /* end omp parallel region */
+#endif
+}  /* end if L(:,k) and U(k,:) are not empty */
diff --git a/SRC/dSchCompUdt-cuda.c b/SRC/dSchCompUdt-cuda.c
new file mode 100644
index 0000000..3b782aa
--- /dev/null
+++ b/SRC/dSchCompUdt-cuda.c
@@ -0,0 +1,550 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief This file contains the main loop of pdgstrf which involves
+ *        rank k update of the Schur complement.
+ *        Uses CUDA GPU.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+
+#define SCHEDULE_STRATEGY dynamic
+
+#define cublasCheckErrors(fn) \
+    do { /* evaluate fn once; on any non-success cuBLAS status, report to stderr and exit(1) */ \
+        cublasStatus_t cublas_err_ = (fn); \
+        if (cublas_err_ != CUBLAS_STATUS_SUCCESS) { \
+            fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
+                (int)(cublas_err_), \
+                __FILE__, __LINE__); \
+            fprintf(stderr, "*** FAILED - ABORTING\n"); \
+            exit(1); \
+        } \
+    } while(0) /* no trailing ';' here: the caller's own semicolon terminates the statement */
+
+
+if ( msg0 && msg2 ) {  /* L(:,k) and U(k,:) are not empty. */
+    ldu   =0;
+    full  =1;
+    int cum_nrow;
+    int temp_nbrow;
+
+    lptr = lptr0;
+    luptr = luptr0;
+    
+    nbrow= lsub[1];
+    if (myrow==krow) nbrow = lsub[1]-lsub[3];
+
+    if (nbrow>0) {
+        
+        int ncol_max = SUPERLU_MIN(buffer_size/nbrow,bigu_size/ldt);
+        int num_streams_used,        /*number of streams that will be used*/
+        ncpu_blks;                     /*Number of CPU dgemm blks*/
+
+        int jjj, jjj_st,jjj_global;        
+        for (j = jj0; j < nub; ++j) {
+            arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+	    		      iukp0,rukp0,usub,perm_u,xsup,grid );
+
+            ncols =0 ;  //initialize at 0 
+            jj = iukp;
+            int temp_ldu=0; 
+            for (; jj < iukp+nsupc; ++jj) {
+                segsize = klst - usub[jj];
+                if ( segsize ) {
+		    ++ncols;
+		}
+                temp_ldu = SUPERLU_MAX(temp_ldu, segsize);
+            }
+
+            full_u_cols[j] = ncols;
+            blk_ldu[j] = temp_ldu;
+        } /* end for j = jj0..nub */
+
+        jjj = jj0; /* initialization */
+            
+        // #pragma omp barrier 
+        while ( jjj < nub ) {
+            jjj_st=jjj;
+#ifdef _OPENMP
+#pragma omp single
+#endif
+            {
+                ldu = blk_ldu[jjj_st];
+                for (j = jjj_st; j < nub ; ++j) {
+                    
+                    /* prefix sum */
+                    if (j != jjj_st) full_u_cols[j] += full_u_cols[j-1];
+
+                    ldu = SUPERLU_MAX(ldu, blk_ldu[j]);   
+
+                    /* break condition */
+                    /* the number of columns that can be processed is limited by buffer size*/
+                    if (full_u_cols[j]+((j+1==nub)?0:full_u_cols[j+1]) > ncol_max) {
+                        break;
+                    }
+                } /* end for j=jjj_st to nub */  
+
+                jjj_global = SUPERLU_MIN(nub, j+1); /* Maximum value of jjj will be nub */
+                
+                // TAU_STATIC_TIMER_START("work_divison");
+                /* Divide CPU-GPU gemm here */
+                gemm_division_cpu_gpu(
+		       &num_streams_used, /*number of streams that will be used*/
+		       stream_end_col,    /*array holding last column blk for each partition*/
+		       &ncpu_blks,        /*Number of CPU gemm blks*/
+		       			  /*input*/
+		       nbrow,             /*number of row in A matrix*/
+		       ldu,               /*number of k in dgemm*/
+		       nstreams,
+		       full_u_cols + jjj_st, /*array containing prefix sum of work load*/
+		       jjj_global-jjj_st     /*Number of work load */
+                );
+                // TAU_STATIC_TIMER_STOP("work_divison");
+
+            } /* pragma omp single */
+
+            jjj = jjj_global;
+            // printf("thread_id %d, jjj %d \n",thread_id,jjj );
+            if (jjj == jjj_st+1 && full_u_cols[jjj_st] > ncol_max) {
+                printf("allocate more memory for buffer !!!!\n");
+                if(nbrow * full_u_cols[jjj_st] > buffer_size)
+                    printf("%d buffer_size %d\n",nbrow*full_u_cols[jjj_st],buffer_size );
+            }
+            
+            // #pragma omp barrier 
+            /* gathering circuit */
+            assert(jjj_st<nub);
+            assert(jjj-1<nub);
+            // TAU_STATIC_TIMER_START("GATHER_U");
+#ifdef _OPENMP
+#pragma omp for schedule( SCHEDULE_STRATEGY )
+#endif
+            for (j = jjj_st; j < jjj; ++j) {
+                if (j==jjj_st) tempu = bigU;
+                else tempu = bigU + ldu*full_u_cols[j-1];
+
+                /* == processing each of the remaining columns == */
+                arrive_at_ublock(j,&iukp,&rukp,&jb,&ljb,&nsupc,
+				 iukp0,rukp0,usub,perm_u,xsup,grid);
+
+                // tempu = tempU2d;
+                for (jj = iukp; jj < iukp+nsupc; ++jj) {
+                    segsize = klst - usub[jj];
+                    if ( segsize ) {
+                        lead_zero = ldu - segsize;
+                        for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+                        tempu += lead_zero;
+                        for (i = 0; i < segsize; ++i)
+                            tempu[i] = uval[rukp+i];
+                        rukp += segsize;
+                        tempu += segsize;
+                    }
+                }
+
+                rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+
+            } /* end for j=jjj_st to jjj */  
+
+	    if ( num_streams_used > 0 ) {
+#ifdef PI_DEBUG
+		printf("nbrow %d *ldu %d  =%d < ldt %d * max_row_size %d =%d \n",nbrow,ldu,nbrow*ldu,ldt,max_row_size,ldt*max_row_size );
+		assert(nbrow*ldu<=ldt*max_row_size);
+#endif 
+		cudaMemcpy2DAsync(dA, nbrow*sizeof(double),
+				  &lusup[luptr+(knsupc-ldu)*nsupr],
+				  nsupr*sizeof(double), nbrow*sizeof(double),
+				  ldu, cudaMemcpyHostToDevice, streams[0]);
+	    }
+                
+	    for (int i = 0; i < num_streams_used; ++i) {
+		int st = (i==0) ? ncpu_blks+jjj_st : jjj_st+stream_end_col[i-1]; 
+		int st_col = full_u_cols[st-1];
+		int num_col_stream = full_u_cols[jjj_st+stream_end_col[i]-1]-full_u_cols[st-1];
+		tempu = bigU;
+                    
+		double *tempv1 = bigV + full_u_cols[st-1]*nbrow;
+
+		/* Following is for testing purpose */
+#ifdef GPU_ACC
+		int stream_id = i;
+		int b_offset  = ldu * st_col;
+		int c_offset  = st_col * nbrow;
+		size_t B_stream_size = ldu * num_col_stream * sizeof(double);
+		size_t C_stream_size = nbrow * num_col_stream * sizeof(double);
+		
+		assert(ldu*(st_col+num_col_stream) < bigu_size);
+		assert(nbrow*(st_col+num_col_stream) < buffer_size);
+		
+		cudaMemcpyAsync(dB+b_offset, tempu+b_offset, B_stream_size,
+				cudaMemcpyHostToDevice, streams[stream_id]);
+		
+		cublasCheckErrors(
+				  cublasSetStream(handle[stream_id],
+						  streams[stream_id])
+				  );
+		
+		cublasCheckErrors(
+				  cublasDgemm(handle[stream_id],
+					      CUBLAS_OP_N, CUBLAS_OP_N,
+					      nbrow, num_col_stream, ldu,
+                                              &alpha, dA, nbrow,
+					      &dB[b_offset], ldu, 
+					      &beta, &dC[c_offset],
+                                              nbrow)
+				  );
+		
+		checkCuda( cudaMemcpyAsync(tempv1, dC+c_offset,
+					   C_stream_size,
+					   cudaMemcpyDeviceToHost,
+					   streams[stream_id]) );
+#else 
+		if ( num_col_stream > 0 ) {   
+		    my_dgemm_("N", "N", &nbrow, &num_col_stream, &ldu,
+			      &alpha, &lusup[luptr+(knsupc-ldu)*nsupr],
+			      &nsupr, tempu+ldu*st_col, &ldu, &beta,
+			      tempv1, &nbrow, 1, 1);
+		}
+		
+#endif 
+		
+	    } /* end for i = 1 to num_streams used */
+	    
+	    int num_col = full_u_cols[jjj_st+ncpu_blks-1];
+	    int st_col = 0;        /*special case for cpu */
+	    tempv = bigV + nbrow * st_col;
+	    tempu = bigU;
+	    
+	    double tstart = SuperLU_timer_();
+#if defined (USE_VENDOR_BLAS)            
+	    dgemm_("N", "N", &nbrow, &num_col, &ldu, &alpha,
+		  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+		  tempu+ldu*st_col, &ldu, &beta, tempv, &nbrow, 1, 1);
+#else
+	    dgemm_("N", "N", &nbrow, &num_col, &ldu, &alpha,
+		  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+		  tempu+ldu*st_col, &ldu, &beta, tempv, &nbrow);
+#endif
+	    gemm_timer += SuperLU_timer_() -tstart;
+	    stat->ops[FACT] += 2 * nbrow * ldu * full_u_cols[jjj-1];
+	    
+	    // printf("after dgemm \n");
+	    
+            /* Now scattering blocks handled by cpu */
+            int temp_ncol;
+	    
+            /* scatter first blocks which cpu has computed */
+            tstart = SuperLU_timer_();
+
+#ifdef _OPENMP
+#pragma omp parallel  \
+    private(j,iukp,rukp, tempu, tempv, cum_nrow, jb, nsupc,ljb,	\
+	    segsize,lead_zero,					\
+	    ib, temp_nbrow,ilst,lib,index,			\
+	    ijb,fnz,ucol,rel,ldv,lptrj,luptrj,			\
+	    nzval,     lb ,                     jj, i)		\
+    firstprivate(luptr,lptr) default (shared)
+#endif
+            {
+                int thread_id = omp_get_thread_num();
+        
+                int* indirect_thread = indirect + ldt*thread_id;
+                int* indirect2_thread = indirect2 + ldt*thread_id;
+                double* tempv1;
+                
+                if (ncpu_blks< omp_get_num_threads()) {
+                    // TAU_STATIC_TIMER_START("SPECIAL_CPU_SCATTER");
+                    
+                    for (j = jjj_st; j < jjj_st+ncpu_blks; ++j) {
+                        /* code */
+                        #ifdef PI_DEBUG
+                            printf("scattering %d  block column\n",j);
+                        #endif
+
+                        /* == processing each of the remaining columns == */
+
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+
+#ifdef _OPENMP
+#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait
+#endif
+                        for (lb = 0; lb < nlb; lb++ ) {
+                            int cum_nrow = 0;
+                            int temp_nbrow;
+                            lptr = lptr0;
+                            luptr = luptr0;
+                            for (int i = 0; i < lb; ++i) {
+                                ib = lsub[lptr];        /* Row block L(i,k). */
+                                temp_nbrow = lsub[lptr+1];   /* Number of full rows. */
+                                lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+                                lptr += temp_nbrow;
+                                luptr += temp_nbrow;
+                                cum_nrow +=temp_nbrow;
+                            }
+
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+
+                            /* Now gather the result into the destination block. */
+                            if ( ib < jb ) {  /* A(i,j) is in U. */
+                                #ifdef PI_DEBUG
+                                    printf("cpu scatter \n");
+                                    printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+                                #endif
+
+                                tempv = tempv1+cum_nrow;
+                                dscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+                            } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("cpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+#endif
+                                
+                                tempv = tempv1+cum_nrow;
+
+                                dscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+                            } /* if ib < jb ... */
+
+                            lptr += temp_nbrow;
+                            luptr += temp_nbrow;
+                            cum_nrow += temp_nbrow;
+
+                        } /* for lb ... */
+
+                        luptr=luptr0;
+                    } /* for j = jjj_st ... */
+
+                    // TAU_STATIC_TIMER_STOP("SPECIAL_CPU_SCATTER");
+                } else {
+#ifdef _OPENMP
+#pragma omp for schedule(SCHEDULE_STRATEGY) nowait
+#endif
+                    for (j = jjj_st; j < jjj_st+ncpu_blks; ++j) {
+                        /* code */
+                        #ifdef PI_DEBUG
+                            printf("scattering %d  block column\n",j);
+                        #endif 
+
+                        /* == processing each of the remaining columns == */
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+
+                        for (lb = 0; lb < nlb; lb++ ) {
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+#ifdef DGEMM_STAT
+			    if(j==jjj_st) {
+				temp_ncol = full_u_cols[j];
+			    } else {
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];  
+			    }
+			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
+#endif
+
+			    /* Now gather the result into the destination block. */
+			    if ( ib < jb ) {  /* A(i,j) is in U. */
+#ifdef PI_DEBUG
+				printf("cpu scatter \n");
+				printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+#endif
+
+				tempv = tempv1+cum_nrow;
+                                dscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+			    } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("cpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+#endif
+                                tempv = tempv1+cum_nrow;
+
+                                dscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+			    } /* if ib < jb ... */
+
+			    lptr += temp_nbrow;
+			    luptr += temp_nbrow;
+			    cum_nrow += temp_nbrow;
+			
+			} /* for lb ... */
+
+			luptr=luptr0;
+		    } /* for j = jjj_st ... */
+		}     /* else if (ncpu_blks >= omp_get_num_threads()) */
+	    }         /* parallel region */
+
+	    scatter_timer += SuperLU_timer_() - tstart; 
+#ifdef _OPENMP
+#pragma omp parallel							\
+    private(j,iukp,rukp, tempu, tempv, cum_nrow, jb, nsupc,ljb,		\
+	    segsize,lead_zero,						\
+	    ib, temp_nbrow,ilst,lib,index,				\
+	    ijb,fnz,ucol,rel,ldv,lptrj,luptrj,				\
+	    nzval,     lb ,                     jj, i)			\
+    firstprivate(luptr,lptr) default (shared)
+#endif
+            {
+                int thread_id = omp_get_thread_num();
+        
+                int* indirect_thread = indirect + ldt*thread_id;
+                int* indirect2_thread = indirect2 + ldt*thread_id;
+                double* tempv1;
+                for(i = 0; i < num_streams_used; i++) { /* i is private variable */
+                    checkCuda(cudaStreamSynchronize (streams[i]));
+                    int jjj_st1 = (i==0) ? jjj_st + ncpu_blks : jjj_st + stream_end_col[i-1];
+                    int jjj_end = jjj_st + stream_end_col[i];
+                    assert(jjj_end-1<nub);
+                    assert(jjj_st1>jjj_st) ;
+
+                    /* now scatter it */
+#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait 
+                    for (j = jjj_st1; j < jjj_end; ++j) {
+                        /* code */
+#ifdef PI_DEBUG
+			printf("scattering %d  block column\n",j);
+#endif 
+                        /* == processing each of the remaining columns == */
+
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+                        for (lb = 0; lb < nlb; lb++) {
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+#ifdef DGEMM_STAT
+			    if(j==jjj_st) {
+				temp_ncol = full_u_cols[j];
+			    } else {
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];  
+			    }
+			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
+#endif
+
+                            /* Now gather the result into the destination block. */
+                            if ( ib < jb ) { /* A(i,j) is in U. */
+#ifdef PI_DEBUG
+				printf("gpu scatter \n");
+				printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+#endif
+                                tempv = tempv1+cum_nrow;
+                                dscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+                            } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("gpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+#endif
+                                tempv = tempv1+cum_nrow;
+
+                                dscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+                            } /* if ib < jb ... */
+
+                            lptr += temp_nbrow;
+                            luptr += temp_nbrow;
+                            cum_nrow += temp_nbrow;
+			    
+                        } /* for lb ... */
+
+                        luptr=luptr0;
+                    } /* for j = jjj_st ... */
+                    
+                } /* end for i = 0 to nstreams */
+                // TAU_STATIC_TIMER_STOP("GPU_SCATTER");
+                // TAU_STATIC_TIMER_STOP("INSIDE_OMP");
+            } /* end pragma omp parallel */
+            // TAU_STATIC_TIMER_STOP("OUTSIDE_OMP");
+        }  /* end while(jjj<nub) */
+ 
+    } /* if nbrow>0 */
+
+ }   /* if msg1 and msg 2 */
+
+
+
diff --git a/SRC/dcomplex.h b/SRC/dcomplex.h
new file mode 100644
index 0000000..7c69b0f
--- /dev/null
+++ b/SRC/dcomplex.h
@@ -0,0 +1,81 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Header for dcomplex_dist.c
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+/* 
+ * This header file is to be included in source files z*.c
+ */
+#ifndef __SUPERLU_DCOMPLEX /* allow multiple inclusions */
+#define __SUPERLU_DCOMPLEX
+/* Declares the doublecomplex type, its arithmetic macros and helper prototypes. */
+#include <mpi.h>
+/* Double-precision complex value: r is the real part, i the imaginary part. */
+typedef struct { double r, i; } doublecomplex;
+
+/*
+ * This variable will be defined (outside this header) to be the MPI
+ * datatype for double complex.  It is declared extern here once, so
+ * that each file that needs it does not have to repeat the declaration.
+ */
+extern MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX;
+
+
+/* Macro definitions (each is a braced statement block, except z_eq, an expression) */
+
+/*! \brief Complex Addition c = a + b (c, a, b are pointers) */
+#define z_add(c, a, b) { (c)->r = (a)->r + (b)->r; \
+			 (c)->i = (a)->i + (b)->i; }
+
+/*! \brief Complex Subtraction c = a - b (c, a, b are pointers) */
+#define z_sub(c, a, b) { (c)->r = (a)->r - (b)->r; \
+			 (c)->i = (a)->i - (b)->i; }
+
+/*! \brief Complex-Double Multiplication c = a * b (c, a are pointers; b is a value) */
+#define zd_mult(c, a, b) { (c)->r = (a)->r * (b); \
+                           (c)->i = (a)->i * (b); }
+
+/*! \brief Complex-Complex Multiplication c = a * b; temporaries let c alias a or b */
+#define zz_mult(c, a, b) { \
+	double cr, ci; \
+    	cr = (a)->r * (b)->r - (a)->i * (b)->i; \
+    	ci = (a)->i * (b)->r + (a)->r * (b)->i; \
+    	(c)->r = cr; \
+    	(c)->i = ci; \
+    }
+
+/*! \brief Complex equality testing: exact == on both parts (a, b are pointers) */
+#define z_eq(a, b)  ( (a)->r == (b)->r && (a)->i == (b)->i )
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Prototypes for functions in dcomplex_dist.c */
+void   slud_z_div(doublecomplex *, doublecomplex *, doublecomplex *);
+double slud_z_abs(doublecomplex *);     /* exact */
+double slud_z_abs1(doublecomplex *);    /* approximate */
+
+
+#ifdef __cplusplus
+  }
+#endif
+
+
+#endif  /* __SUPERLU_DCOMPLEX */
diff --git a/SRC/dcomplex_dist.c b/SRC/dcomplex_dist.c
new file mode 100644
index 0000000..d850580
--- /dev/null
+++ b/SRC/dcomplex_dist.c
@@ -0,0 +1,94 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Defines common arithmetic operations for complex type
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "dcomplex.h"
+
+
+/* Complex division c = a/b.
+ * The quotient is formed with the ratio of the smaller to the larger part
+ * of b; prints a message and calls exit(-1) when both parts of b are zero. */
+void slud_z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b)
+{
+    /* Results go through locals first, so c may alias a or b. */
+    double ratio, den, cr, ci;
+    double abr = (b->r < 0.) ? -b->r : b->r;   /* magnitude of b's real part */
+    double abi = (b->i < 0.) ? -b->i : b->i;   /* magnitude of b's imag part */
+
+    if ( abr <= abi ) {
+	if (abi == 0) {  /* abr <= abi == 0: both parts of b are zero */
+	    fprintf(stderr, "slud_z_div: division by zero\n");
+	    exit(-1);
+	}
+	ratio = b->r / b->i;
+	den = b->i * (1 + ratio*ratio);
+	cr = (a->r*ratio + a->i) / den;
+	ci = (a->i*ratio - a->r) / den;
+    } else {
+	ratio = b->i / b->r;
+	den = b->r * (1 + ratio*ratio);
+	cr = (a->r + a->i*ratio) / den;
+	ci = (a->i - a->r*ratio) / den;
+    }
+    c->r = cr;
+    c->i = ci;
+}
+
+
+/* Returns sqrt(z.r^2 + z.i^2), scaling by the larger of |z.r| and |z.i|. */
+double slud_z_abs(doublecomplex *z)
+{
+    double hi = (z->r < 0) ? -z->r : z->r;
+    double lo = (z->i < 0) ? -z->i : z->i;
+    double q;
+
+    /* Keep the larger magnitude in hi. */
+    if (lo > hi) {
+	q = hi;
+	hi = lo;
+	lo = q;
+    }
+    /* lo is lost when added to hi (this also covers z == 0): |z| is hi. */
+    if ((hi + lo) == hi) return hi;
+
+    q = lo / hi;
+    q = hi * sqrt(1.0 + q*q);  /* the product may still overflow */
+    return q;
+}
+
+
+/*
+ * Approximates the modulus of z without taking a square root:
+ * returns abs(z.r) + abs(z.i).
+ */
+double slud_z_abs1(doublecomplex *z)
+{
+    /* Fold the sign away from each part as it is loaded. */
+    const double re_mag = (z->r < 0) ? -z->r : z->r;
+    const double im_mag = (z->i < 0) ? -z->i : z->i;
+
+    return re_mag + im_mag;
+}
+
+
+
diff --git a/SRC/ddistribute.c b/SRC/ddistribute.c
new file mode 100644
index 0000000..39fb23a
--- /dev/null
+++ b/SRC/ddistribute.c
@@ -0,0 +1,750 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Distribute the matrix onto the 2D process mesh.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * </pre>
+ */
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Distribute the matrix onto the 2D process mesh.
+ * 
+ * Arguments
+ * =========
+ * 
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (input) int_t
+ *        Dimension of the matrix.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, permuted by columns, of dimension
+ *        (A->nrow, A->ncol). The type of A can be:
+ *        Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        Data structures for L and U factors.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   > 0, working storage required (in bytes).
+ * </pre>
+ */
+
+float
+ddistribute(fact_t fact, int_t n, SuperMatrix *A, 
+            Glu_freeable_t *Glu_freeable,
+	    LUstruct_t *LUstruct, gridinfo_t *grid)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, 
+          len, len1, nsupc;
+    int_t ljb;  /* local block column number */
+    int_t nrbl; /* number of L blocks in current block column */
+    int_t nrbu; /* number of U blocks in current block column */
+    int_t gb;   /* global block number; 0 < gb <= nsuper */
+    int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
+    int iam, jbrow, kcol, mycol, myrow, pc, pr;
+    int_t mybufmax[NBUFFERS];
+    NCPformat *Astore;
+    double *a;
+    int_t *asub;
+    int_t *xa_begin, *xa_end;
+    int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
+    int_t *supno = Glu_persist->supno;   
+    int_t *lsub, *xlsub, *usub, *xusub;
+    int_t nsupers;
+    int_t next_lind;      /* next available position in index[*] */
+    int_t next_lval;      /* next available position in nzval[*] */
+    int_t *index;         /* indices consist of headers and row subscripts */
+    int   *index1;        /* temporary pointer to array of int */
+    double *lusup, *uval; /* nonzero values in L and U */
+    double **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
+    int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+    double **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
+    int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
+
+    /*-- Counts to be used in factorization. --*/
+    int  *ToRecv, *ToSendD, **ToSendR;
+
+    /*-- Counts to be used in lower triangular solve. --*/
+    int_t  *fmod;          /* Modification count for L-solve.        */
+    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
+    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
+    int_t  nfsendx = 0;    /* Number of Xk I will send               */
+    int_t  kseen;
+
+    /*-- Counts to be used in upper triangular solve. --*/
+    int_t  *bmod;          /* Modification count for U-solve.        */
+    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
+    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
+    int_t  nbsendx = 0;    /* Number of Xk I will send               */
+    int_t  *ilsum;         /* starting position of each supernode in 
+			      the full array (local)                 */
+
+    /*-- Auxiliary arrays; freed on return --*/
+    int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
+    int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr)             */
+    int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr)      */
+    int_t *Urb_fstnz;  /* # of fstnz in a block row; size ceil(NSUPERS/Pr)  */
+    int_t *Ucbs;       /* number of column blocks in a block row            */
+    int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr)             */
+    int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr)        */
+    int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr)      */
+    int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr)      */
+    double *dense, *dense_col; /* SPA */
+    double zero = 0.0;
+    int_t  ldaspa;     /* LDA of SPA */
+    int_t iword, dword;
+    float mem_use = 0.0;
+
+#if ( PRNTlevel>=1 )
+    int_t nLblocks = 0, nUblocks = 0;
+#endif
+#if ( PROFlevel>=1 ) 
+    double t, t_u, t_l;
+    int_t u_blks;
+#endif
+
+    /* Initialization. */
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
+    nsupers  = supno[n-1] + 1;
+    Astore   = A->Store;
+    a        = Astore->nzval;
+    asub     = Astore->rowind;
+    xa_begin = Astore->colbeg;
+    xa_end   = Astore->colend;
+#if ( PRNTlevel>=1 )
+    iword = sizeof(int_t);
+    dword = sizeof(double);
+#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter ddistribute()");
+#endif
+
+    if ( fact == SamePattern_SameRowPerm ) {
+        /* ---------------------------------------------------------------
+         * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION.
+         * --------------------------------------------------------------- */
+
+#if ( PROFlevel>=1 )
+	t_l = t_u = 0; u_blks = 0;
+#endif
+	/* We can propagate the new values of A into the existing
+	   L and U data structures.            */
+	ilsum = Llu->ilsum;
+	ldaspa = Llu->ldalsum;
+	if ( !(dense = doubleCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) )
+	    ABORT("Calloc fails for SPA dense[].");
+	nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */
+	if ( !(Urb_length = intCalloc_dist(nrbu)) )
+	    ABORT("Calloc fails for Urb_length[].");
+	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
+	    ABORT("Malloc fails for Urb_indptr[].");
+	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	Unzval_br_ptr = Llu->Unzval_br_ptr;
+#if ( PRNTlevel>=1 )
+	mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
+#endif
+#if ( PROFlevel>=1 )
+	t = SuperLU_timer_();
+#endif
+
+	/* Initialize Uval to zero. */
+	for (lb = 0; lb < nrbu; ++lb) {
+	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
+	    index = Ufstnz_br_ptr[lb];
+	    if ( index ) {
+		uval = Unzval_br_ptr[lb];
+		len = index[1];
+		for (i = 0; i < len; ++i) uval[i] = zero;
+	    } /* if index != NULL */
+	} /* for lb ... */
+
+	for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
+	    pc = PCOL( jb, grid );
+	    if ( mycol == pc ) { /* Block column jb in my process column */
+		fsupc = FstBlockC( jb );
+		nsupc = SuperSize( jb );
+
+ 		/* Scatter A into SPA (for L), or into U directly. */
+		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
+		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
+			irow = asub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+ 			    if ( gb < jb ) { /* in U */
+ 				index = Ufstnz_br_ptr[lb];
+ 				uval = Unzval_br_ptr[lb];
+ 				while (  (k = index[Urb_indptr[lb]]) < jb ) {
+ 				    /* Skip nonzero values in this block */
+ 				    Urb_length[lb] += index[Urb_indptr[lb]+1];
+ 				    /* Move pointer to the next block */
+ 				    Urb_indptr[lb] += UB_DESCRIPTOR
+ 					+ SuperSize( k );
+ 				}
+ 				/*assert(k == jb);*/
+ 				/* start fstnz */
+ 				istart = Urb_indptr[lb] + UB_DESCRIPTOR;
+ 				len = Urb_length[lb];
+ 				fsupc1 = FstBlockC( gb+1 );
+ 				k = j - fsupc;
+ 				/* Sum the lengths of the leading columns */
+ 				for (jj = 0; jj < k; ++jj)
+				    len += fsupc1 - index[istart++];
+				/*assert(irow>=index[istart]);*/
+				uval[len + irow - index[istart]] = a[i];
+			    } else { /* in L; put in SPA first */
+  				irow = ilsum[lb] + irow - FstBlockC( gb );
+  				dense_col[irow] = a[i];
+  			    }
+  			}
+		    } /* for i ... */
+  		    dense_col += ldaspa;
+		} /* for j ... */
+
+#if ( PROFlevel>=1 )
+		t_u += SuperLU_timer_() - t;
+		t = SuperLU_timer_();
+#endif
+
+		/* Gather the values of A from SPA into Lnzval[]. */
+		ljb = LBj( jb, grid ); /* Local block number */
+		index = Lrowind_bc_ptr[ljb];
+		if ( index ) {
+		    nrbl = index[0];   /* Number of row blocks. */
+		    len = index[1];    /* LDA of lusup[]. */
+		    lusup = Lnzval_bc_ptr[ljb];
+		    next_lind = BC_HEADER;
+		    next_lval = 0;
+		    for (jj = 0; jj < nrbl; ++jj) {
+			gb = index[next_lind++];
+			len1 = index[next_lind++]; /* Rows in the block. */
+			lb = LBi( gb, grid );
+			for (bnnz = 0; bnnz < len1; ++bnnz) {
+			    irow = index[next_lind++]; /* Global index. */
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    k = next_lval++;
+			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
+				lusup[k] = dense_col[irow];
+				dense_col[irow] = zero;
+				k += len;
+				dense_col += ldaspa;
+			    }
+			} /* for bnnz ... */
+		    } /* for jj ... */
+		} /* if index ... */
+#if ( PROFlevel>=1 )
+		t_l += SuperLU_timer_() - t;
+#endif
+	    } /* if mycol == pc */
+	} /* for jb ... */
+
+	SUPERLU_FREE(dense);
+	SUPERLU_FREE(Urb_length);
+	SUPERLU_FREE(Urb_indptr);
+#if ( PROFlevel>=1 )
+	if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n",
+			   t_l, t_u, u_blks, nrbu);
+#endif
+
+    } else { 
+        /* --------------------------------------------------
+         * FIRST TIME CREATING THE L AND U DATA STRUCTURE. 
+         * -------------------------------------------------- */
+
+#if ( PROFlevel>=1 )
+	t_l = t_u = 0; u_blks = 0;
+#endif
+	/* No L and U data structures are available yet.
+	   We need to set up the L and U data structures and propagate
+	   the values of A into them.          */
+	lsub = Glu_freeable->lsub;    /* compressed L subscripts */
+	xlsub = Glu_freeable->xlsub;
+	usub = Glu_freeable->usub;    /* compressed U subscripts */
+	xusub = Glu_freeable->xusub;
+    
+	if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) )
+	    ABORT("Malloc fails for ToRecv[].");
+	for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+
+	k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
+	if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
+	    ABORT("Malloc fails for ToSendR[].");
+	j = k * grid->npcol;
+	if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) )
+	    ABORT("Malloc fails for index[].");
+#if ( PRNTlevel>=1 )
+	mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword;
+#endif
+	for (i = 0; i < j; ++i) index1[i] = EMPTY;
+	for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j];
+	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+
+	/* Pointers to the beginning of each block row of U. */
+	if ( !(Unzval_br_ptr = 
+               (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
+	    ABORT("Malloc fails for Unzval_br_ptr[].");
+	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
+	
+	if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
+	    ABORT("Malloc fails for ToSendD[].");
+	for (i = 0; i < k; ++i) ToSendD[i] = NO;
+	if ( !(ilsum = intMalloc_dist(k+1)) )
+	    ABORT("Malloc fails for ilsum[].");
+
+	/* Auxiliary arrays used to set up U block data structures.
+	   They are freed on return. */
+	if ( !(rb_marker = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for rb_marker[].");
+	if ( !(Urb_length = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Urb_length[].");
+	if ( !(Urb_indptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Urb_indptr[].");
+	if ( !(Urb_fstnz = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Urb_fstnz[].");
+	if ( !(Ucbs = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Ucbs[].");
+#if ( PRNTlevel>=1 )	
+	mem_use += 2.0*k*sizeof(int_t*) + (7.0*k+1)*iword;
+#endif
+	/* Compute ldaspa and ilsum[]. */
+	ldaspa = 0;
+	ilsum[0] = 0;
+	for (gb = 0; gb < nsupers; ++gb) {
+	    if ( myrow == PROW( gb, grid ) ) {
+		i = SuperSize( gb );
+		ldaspa += i;
+		lb = LBi( gb, grid );
+		ilsum[lb + 1] = ilsum[lb] + i;
+	    }
+	}
+	
+            
+	/* ------------------------------------------------------------
+	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
+	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
+	   ------------------------------------------------------------*/
+	
+	/* Loop through each supernode column. */
+	for (jb = 0; jb < nsupers; ++jb) {
+	    pc = PCOL( jb, grid );
+	    fsupc = FstBlockC( jb );
+	    nsupc = SuperSize( jb );
+	    /* Loop through each column in the block. */
+	    for (j = fsupc; j < fsupc + nsupc; ++j) {
+		/* usub[*] contains only "first nonzero" in each segment. */
+		for (i = xusub[j]; i < xusub[j+1]; ++i) {
+		    irow = usub[i]; /* First nonzero of the segment. */
+		    gb = BlockNum( irow );
+		    kcol = PCOL( gb, grid );
+		    ljb = LBj( gb, grid );
+		    if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
+		    pr = PROW( gb, grid );
+		    lb = LBi( gb, grid );
+		    if ( mycol == pc ) {
+			if  ( myrow == pr ) {
+			    ToSendD[lb] = YES;
+			    /* Count nonzeros in entire block row. */
+			    Urb_length[lb] += FstBlockC( gb+1 ) - irow;
+			    if (rb_marker[lb] <= jb) {/* First see the block */
+				rb_marker[lb] = jb + 1;
+				Urb_fstnz[lb] += nsupc;
+				++Ucbs[lb]; /* Number of column blocks
+					       in block row lb. */
+#if ( PRNTlevel>=1 )
+				++nUblocks;
+#endif
+			    }
+			    ToRecv[gb] = 1;
+			} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
+		    }
+		} /* for i ... */
+	    } /* for j ... */
+	} /* for jb ... */
+	
+	/* Set up the initial pointers for each block row in U. */
+	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+	for (lb = 0; lb < nrbu; ++lb) {
+	    len = Urb_length[lb];
+	    rb_marker[lb] = 0; /* Reset block marker. */
+	    if ( len ) {
+		/* Add room for descriptors */
+		len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
+		if ( !(index = intMalloc_dist(len1+1)) )
+		    ABORT("Malloc fails for Uindex[].");
+		Ufstnz_br_ptr[lb] = index;
+		if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) )
+		    ABORT("Malloc fails for Unzval_br_ptr[*][].");
+		mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
+		mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
+		index[0] = Ucbs[lb]; /* Number of column blocks */
+		index[1] = len;      /* Total length of nzval[] */
+		index[2] = len1;     /* Total length of index[] */
+		index[len1] = -1;    /* End marker */
+	    } else {
+		Ufstnz_br_ptr[lb] = NULL;
+		Unzval_br_ptr[lb] = NULL;
+	    }
+	    Urb_length[lb] = 0; /* Reset block length. */
+	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
+ 	    Urb_fstnz[lb] = BR_HEADER;
+	} /* for lb ... */
+
+	SUPERLU_FREE(Ucbs);
+
+#if ( PROFlevel>=1 )
+	t = SuperLU_timer_() - t;
+	if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t);
+#endif
+#if ( PRNTlevel>=1 )
+        mem_use -= 2.0*k * iword;
+#endif
+	/* Auxiliary arrays used to set up L block data structures.
+	   They are freed on return.
+	   k is the number of local row blocks.   */
+	if ( !(Lrb_length = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Lrb_length[].");
+	if ( !(Lrb_number = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_number[].");
+	if ( !(Lrb_indptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_indptr[].");
+	if ( !(Lrb_valptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_valptr[].");
+	if (!(dense=doubleCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa)
+              *sp_ienv_dist(3)))))
+	    ABORT("Calloc fails for SPA dense[].");
+
+	/* These counts will be used for triangular solves. */
+	if ( !(fmod = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for fmod[].");
+	if ( !(bmod = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for bmod[].");
+#if ( PRNTlevel>=1 )	
+	mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword;
+#endif
+	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+
+	/* Pointers to the beginning of each block column of L. */
+	if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
+	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
+	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
+	Lrowind_bc_ptr[k-1] = NULL;
+
+	/* These lists of processes will be used for triangular solves. */
+	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    ABORT("Malloc fails for fsendx_plist[].");
+	len = k * grid->nprow;
+	if ( !(index = intMalloc_dist(len)) )
+	    ABORT("Malloc fails for fsendx_plist[0]");
+	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
+	    fsendx_plist[i] = &index[j];
+	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    ABORT("Malloc fails for bsendx_plist[].");
+	if ( !(index = intMalloc_dist(len)) )
+	    ABORT("Malloc fails for bsendx_plist[0]");
+	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
+	    bsendx_plist[i] = &index[j];
+#if ( PRNTlevel>=1 )
+	mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
+#endif
+	/*------------------------------------------------------------
+	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
+	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
+	  ------------------------------------------------------------*/
+
+	for (jb = 0; jb < nsupers; ++jb) {
+	    pc = PCOL( jb, grid );
+	    if ( mycol == pc ) { /* Block column jb in my process column */
+		fsupc = FstBlockC( jb );
+		nsupc = SuperSize( jb );
+		ljb = LBj( jb, grid ); /* Local block number */
+		
+		/* Scatter A into SPA. */
+		for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){
+		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
+			irow = asub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    dense_col[irow] = a[i];
+			}
+		    }
+		    dense_col += ldaspa;
+		}
+
+		jbrow = PROW( jb, grid );
+
+#if ( PROFlevel>=1 )
+		t = SuperLU_timer_();
+#endif
+		/*------------------------------------------------
+		 * SET UP U BLOCKS.
+		 *------------------------------------------------*/
+		kseen = 0;
+		dense_col = dense;
+		/* Loop through each column in the block column. */
+		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
+		    istart = xusub[j];
+		    /* NOTE: Only the first nonzero index of the segment
+		       is stored in usub[]. */
+		    for (i = istart; i < xusub[j+1]; ++i) {
+			irow = usub[i]; /* First nonzero in the segment. */
+			gb = BlockNum( irow );
+			pr = PROW( gb, grid );
+			if ( pr != jbrow &&
+			     myrow == jbrow &&  /* diag. proc. owning jb */
+			     bsendx_plist[ljb][pr] == EMPTY ) {
+			    bsendx_plist[ljb][pr] = YES;
+			    ++nbsendx;
+                        }
+			if ( myrow == pr ) {
+			    lb = LBi( gb, grid ); /* Local block number */
+			    index = Ufstnz_br_ptr[lb];
+			    uval = Unzval_br_ptr[lb];
+			    fsupc1 = FstBlockC( gb+1 );
+			    if (rb_marker[lb] <= jb) { /* First time see 
+							  the block       */
+				rb_marker[lb] = jb + 1;
+				Urb_indptr[lb] = Urb_fstnz[lb];;
+				index[Urb_indptr[lb]] = jb; /* Descriptor */
+				Urb_indptr[lb] += UB_DESCRIPTOR;
+				/* Record the first location in index[] of the
+				   next block */
+				Urb_fstnz[lb] = Urb_indptr[lb] + nsupc;
+				len = Urb_indptr[lb];/* Start fstnz in index */
+				index[len-1] = 0;
+				for (k = 0; k < nsupc; ++k)
+				    index[len+k] = fsupc1;
+				if ( gb != jb )/* Exclude diagonal block. */
+				    ++bmod[lb];/* Mod. count for back solve */
+				if ( kseen == 0 && myrow != jbrow ) {
+				    ++nbrecvx;
+				    kseen = 1;
+				}
+			    } else { /* Already saw the block */
+				len = Urb_indptr[lb];/* Start fstnz in index */
+			    }
+			    jj = j - fsupc;
+			    index[len+jj] = irow;
+			    /* Load the numerical values */
+			    k = fsupc1 - irow; /* No. of nonzeros in segment */
+			    index[len-1] += k; /* Increment block length in
+						  Descriptor */
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    for (ii = 0; ii < k; ++ii) {
+				uval[Urb_length[lb]++] = dense_col[irow + ii];
+				dense_col[irow + ii] = zero;
+			    }
+			} /* if myrow == pr ... */
+		    } /* for i ... */
+                    dense_col += ldaspa;
+		} /* for j ... */
+
+#if ( PROFlevel>=1 )
+		t_u += SuperLU_timer_() - t;
+		t = SuperLU_timer_();
+#endif
+
+		/*------------------------------------------------
+		 * SET UP L BLOCKS.
+		 *------------------------------------------------*/
+
+		/* Count number of blocks and length of each block. */
+		nrbl = 0;
+		len = 0; /* Number of row subscripts I own. */
+		kseen = 0;
+		istart = xlsub[fsupc];
+		for (i = istart; i < xlsub[fsupc+1]; ++i) {
+		    irow = lsub[i];
+		    gb = BlockNum( irow ); /* Global block number */
+		    pr = PROW( gb, grid ); /* Process row owning this block */
+		    if ( pr != jbrow &&
+			 myrow == jbrow &&  /* diag. proc. owning jb */
+			 fsendx_plist[ljb][pr] == EMPTY /* first time */ ) {
+			fsendx_plist[ljb][pr] = YES;
+			++nfsendx;
+                    }
+		    if ( myrow == pr ) {
+			lb = LBi( gb, grid );  /* Local block number */
+			if (rb_marker[lb] <= jb) { /* First see this block */
+			    rb_marker[lb] = jb + 1;
+			    Lrb_length[lb] = 1;
+			    Lrb_number[nrbl++] = gb;
+			    if ( gb != jb ) /* Exclude diagonal block. */
+				++fmod[lb]; /* Mod. count for forward solve */
+			    if ( kseen == 0 && myrow != jbrow ) {
+				++nfrecvx;
+				kseen = 1;
+			    }
+#if ( PRNTlevel>=1 )
+			    ++nLblocks;
+#endif
+			} else {
+			    ++Lrb_length[lb];
+			}
+			++len;
+		    }
+		} /* for i ... */
+
+		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
+		    /* Set up the initial pointers for each block in 
+		       index[] and nzval[]. */
+		    /* Add room for descriptors */
+		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+		    if ( !(index = intMalloc_dist(len1)) ) 
+			ABORT("Malloc fails for index[]");
+		    Lrowind_bc_ptr[ljb] = index;
+		    if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(((size_t)len)*nsupc))) {
+			fprintf(stderr, "col block " IFMT " ", jb);
+			ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
+		    }
+		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
+		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
+		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
+		    index[0] = nrbl;  /* Number of row blocks */
+		    index[1] = len;   /* LDA of the nzval[] */
+		    next_lind = BC_HEADER;
+		    next_lval = 0;
+		    for (k = 0; k < nrbl; ++k) {
+			gb = Lrb_number[k];
+			lb = LBi( gb, grid );
+			len = Lrb_length[lb];
+			Lrb_length[lb] = 0;  /* Reset vector of block length */
+			index[next_lind++] = gb; /* Descriptor */
+			index[next_lind++] = len; 
+			Lrb_indptr[lb] = next_lind;
+			Lrb_valptr[lb] = next_lval;
+			next_lind += len;
+			next_lval += len;
+		    }
+		    /* Propagate the compressed row subscripts to Lindex[], and
+		       the initial values of A from SPA into Lnzval[]. */
+		    lusup = Lnzval_bc_ptr[ljb];
+		    len = index[1];  /* LDA of lusup[] */
+		    for (i = istart; i < xlsub[fsupc+1]; ++i) {
+			irow = lsub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+			    k = Lrb_indptr[lb]++; /* Random access a block */
+			    index[k] = irow;
+			    k = Lrb_valptr[lb]++;
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
+				lusup[k] = dense_col[irow];
+				dense_col[irow] = 0.0;
+				k += len;
+				dense_col += ldaspa;
+			    }
+			}
+		    } /* for i ... */
+		} else {
+		    Lrowind_bc_ptr[ljb] = NULL;
+		    Lnzval_bc_ptr[ljb] = NULL;
+		} /* if nrbl ... */
+#if ( PROFlevel>=1 )
+		t_l += SuperLU_timer_() - t;
+#endif
+	    } /* if mycol == pc */
+
+	} /* for jb ... */
+
+	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+	Llu->Unzval_br_ptr = Unzval_br_ptr;
+	Llu->ToRecv = ToRecv;
+	Llu->ToSendD = ToSendD;
+	Llu->ToSendR = ToSendR;
+	Llu->fmod = fmod;
+	Llu->fsendx_plist = fsendx_plist;
+	Llu->nfrecvx = nfrecvx;
+	Llu->nfsendx = nfsendx;
+	Llu->bmod = bmod;
+	Llu->bsendx_plist = bsendx_plist;
+	Llu->nbrecvx = nbrecvx;
+	Llu->nbsendx = nbsendx;
+	Llu->ilsum = ilsum;
+	Llu->ldalsum = ldaspa;
+	
+#if ( PRNTlevel>=1 )
+	if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
+			   nLblocks, nUblocks);
+#endif
+
+	SUPERLU_FREE(rb_marker);
+	SUPERLU_FREE(Urb_fstnz);
+	SUPERLU_FREE(Urb_length);
+	SUPERLU_FREE(Urb_indptr);
+	SUPERLU_FREE(Lrb_length);
+	SUPERLU_FREE(Lrb_number);
+	SUPERLU_FREE(Lrb_indptr);
+	SUPERLU_FREE(Lrb_valptr);
+	SUPERLU_FREE(dense);
+
+	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+	if ( !(Llu->mod_bit = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for mod_bit[].");
+
+	/* Find the maximum buffer size. */
+	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
+		      MPI_MAX, grid->comm);
+
+#if ( PROFlevel>=1 )
+	if ( !iam ) printf(".. 1st distribute time:\n "
+			   "\tL\t%.2f\n\tU\t%.2f\n"
+			   "\tu_blks %d\tnrbu %d\n--------\n",
+  			   t_l, t_u, u_blks, nrbu);
+#endif
+
+    } /* else fact != SamePattern_SameRowPerm */
+
+#if ( DEBUGlevel>=1 )
+    /* Memory allocated but not freed:
+       ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
+    CHECK_MALLOC(iam, "Exit ddistribute()");
+#endif
+
+    return (mem_use);
+} /* DDISTRIBUTE */
+
diff --git a/SRC/dgsequ_dist.c b/SRC/dgsequ_dist.c
new file mode 100644
index 0000000..809e5be
--- /dev/null
+++ b/SRC/dgsequ_dist.c
@@ -0,0 +1,193 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Computes row and column scalings 
+ */
+
+/*
+ * File name:	dgsequ.c
+ * History:     Modified from LAPACK routine DGEEQU
+ */
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+
+<pre>    
+    Purpose   
+    =======   
+
+    DGSEQU_dist computes row and column scalings intended to equilibrate an   
+    M-by-N sparse matrix A and reduce its condition number. R returns the row
+    scale factors and C the column scale factors, chosen to try to make   
+    the largest element in each row and column of the matrix B with   
+    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
+
+    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
+    number and BIGNUM = largest safe number.  Use of these scaling   
+    factors is not guaranteed to reduce the condition number of A but   
+    works well in practice.   
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+ 
+    Arguments   
+    =========   
+
+    A       (input) SuperMatrix*
+            The matrix of dimension (A->nrow, A->ncol) whose equilibration
+            factors are to be computed. The type of A can be:
+            Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
+	    
+    R       (output) double*, size A->nrow
+            If INFO = 0 or INFO > M, R contains the row scale factors   
+            for A.
+	    
+    C       (output) double*, size A->ncol
+            If INFO = 0,  C contains the column scale factors for A.
+	    
+    ROWCND  (output) double*
+            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
+            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
+            AMAX is neither too large nor too small, it is not worth   
+            scaling by R.
+	    
+    COLCND  (output) double*
+            If INFO = 0, COLCND contains the ratio of the smallest   
+            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
+            worth scaling by C.
+	    
+    AMAX    (output) double*
+            Absolute value of largest matrix element.  If AMAX is very   
+            close to overflow or very close to underflow, the matrix   
+            should be scaled.
+	    
+    INFO    (output) int*
+            = 0:  successful exit   
+            < 0:  if INFO = -i, the i-th argument had an illegal value   
+            > 0:  if INFO = i,  and i is   
+                  <= M:  the i-th row of A is exactly zero   
+                  >  M:  the (i-M)-th column of A is exactly zero   
+
+    ===================================================================== 
+</pre>
+*/
+
+void
+dgsequ_dist(SuperMatrix *A, double *r, double *c, double *rowcnd,
+	    double *colcnd, double *amax, int_t *info)
+{
+    /* Computes r[] from the row maxima, then c[] from the row-scaled column maxima. */
+    /* Local variables */
+    NCformat *Astore;
+    double   *Aval;
+    int_t i, j, irow; /* int_t, as in dlangs_dist(): they hold colptr[]/rowind[] entries */
+    double rcmin, rcmax;
+    double bignum, smlnum;
+    
+    /* Test the input parameters: only SLU_NC / SLU_D / SLU_GE is accepted. */
+    *info = 0;
+    if ( A->nrow < 0 || A->ncol < 0 ||
+	 A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE )
+	*info = -1;
+    if (*info != 0) {
+	int iinfo = -(int)(*info); /* xerr_dist() is still handed an int, as before */
+	xerr_dist("dgsequ_dist", &iinfo);
+	return;
+    }
+
+    /* Quick return for an empty matrix; r[] and c[] are not written. */
+    if ( A->nrow == 0 || A->ncol == 0 ) {
+	*rowcnd = 1.;
+	*colcnd = 1.;
+	*amax = 0.;
+	return;
+    }
+
+    Astore = (NCformat *) A->Store;
+    Aval = (double *) Astore->nzval;
+    
+    /* Get machine constants; scale factors are clamped to [smlnum, bignum]. */
+    smlnum = dmach_dist("S");
+    bignum = 1. / smlnum;
+
+    /* Compute row scale factors. */
+    for (i = 0; i < A->nrow; ++i) r[i] = 0.;
+
+    /* Find the largest absolute value in each row. */
+    for (j = 0; j < A->ncol; ++j)
+	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+	    irow = Astore->rowind[i];
+	    r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) );
+	}
+
+    /* Find the maximum and minimum of the row maxima. */
+    rcmin = bignum;
+    rcmax = 0.;
+    for (i = 0; i < A->nrow; ++i) {
+	rcmax = SUPERLU_MAX(rcmax, r[i]);
+	rcmin = SUPERLU_MIN(rcmin, r[i]);
+    }
+    *amax = rcmax; /* largest row maximum == largest |A(i,j)| */
+
+    if (rcmin == 0.) {
+	/* A row is exactly zero: return its 1-based index; c[], *rowcnd, *colcnd stay unset. */
+	for (i = 0; i < A->nrow; ++i)
+	    if (r[i] == 0.) {
+		*info = i + 1;
+		return;
+	    }
+    } else {
+	/* Invert the (clamped) row maxima to obtain the scale factors. */
+	for (i = 0; i < A->nrow; ++i)
+	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
+	/* Compute ROWCND = min(R(I)) / max(R(I)) */
+	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
+    }
+
+    /* Compute column scale factors */
+    for (j = 0; j < A->ncol; ++j) c[j] = 0.;
+
+    /* Find the maximum element in each column, assuming the row
+       scalings computed above. */
+    for (j = 0; j < A->ncol; ++j)
+	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+	    irow = Astore->rowind[i];
+	    c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] );
+	}
+
+    /* Find the maximum and minimum of the row-scaled column maxima. */
+    rcmin = bignum;
+    rcmax = 0.;
+    for (j = 0; j < A->ncol; ++j) {
+	rcmax = SUPERLU_MAX(rcmax, c[j]);
+	rcmin = SUPERLU_MIN(rcmin, c[j]);
+    }
+
+    if (rcmin == 0.) {
+	/* A column is exactly zero: return nrow + its 1-based index; *colcnd stays unset. */
+	for (j = 0; j < A->ncol; ++j)
+	    if ( c[j] == 0. ) {
+		*info = A->nrow + j + 1;
+		return;
+	    }
+    } else {
+	/* Invert the (clamped) column maxima to obtain the scale factors. */
+	for (j = 0; j < A->ncol; ++j)
+	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
+	/* Compute COLCND = min(C(J)) / max(C(J)) */
+	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
+    }
+
+    return;
+
+} /* dgsequ_dist */
+
+
diff --git a/SRC/dlangs_dist.c b/SRC/dlangs_dist.c
new file mode 100644
index 0000000..5888a9b
--- /dev/null
+++ b/SRC/dlangs_dist.c
@@ -0,0 +1,121 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Returns the value of the one norm, the infinity norm, or the element of largest value 
+ */
+ 
+
+/*
+ * File name:	dlangs.c
+ * History:     Modified from lapack routine DLANGE
+ */
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+
+<pre> 
+    Purpose   
+    =======   
+
+    DLANGS_dist returns the value of the one norm, or the infinity norm,
+    or the element of largest absolute value of a real sparse matrix A.
+    The Frobenius norm is accepted as an option but is not implemented.
+
+    Description   
+    ===========   
+
+    DLANGE returns the value   
+
+       DLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
+                (   
+                ( norm1(A),         NORM = '1', 'O' or 'o'   
+                (   
+                ( normI(A),         NORM = 'I' or 'i'   
+                (   
+                ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
+
+    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
+    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
+    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
+    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
+
+    Arguments   
+    =========   
+
+    NORM    (input) CHARACTER*1   
+            Specifies the value to be returned in DLANGE as described above.   
+    A       (input) SuperMatrix*
+            The M by N sparse matrix A. 
+
+   ===================================================================== 
+</pre>
+*/
+double dlangs_dist(char *norm, SuperMatrix *A)
+{
+    /* Only the first character of norm is examined; upper and lower case
+       letters are both accepted, as the argument description states. */
+    /* Local variables */
+    NCformat *Astore;
+    double   *Aval;
+    int_t    i, j, irow;
+    double   value=0., sum;
+    double   *rwork; /* per-row sums, used by the infinity norm only */
+
+    Astore = (NCformat *) A->Store;
+    Aval   = (double *) Astore->nzval;
+    
+    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
+	value = 0.; /* empty matrix: every norm is zero */
+	
+    } else if ( *norm == 'M' || *norm == 'm' ) {
+	/* Find max(abs(A(i,j))). */
+	value = 0.;
+	for (j = 0; j < A->ncol; ++j)
+	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
+		value = SUPERLU_MAX( value, fabs( Aval[i]) );
+	
+    } else if ( *norm == 'O' || *norm == 'o' || *norm == '1' ) {
+	/* Find norm1(A): the largest column sum of absolute values. */
+	value = 0.;
+	for (j = 0; j < A->ncol; ++j) {
+	    sum = 0.;
+	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
+		sum += fabs(Aval[i]);
+	    value = SUPERLU_MAX(value, sum);
+	}
+	
+    } else if ( *norm == 'I' || *norm == 'i' ) {
+	/* Find normI(A): accumulate row sums in rwork[], then take the largest. */
+	if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) )
+	    ABORT("SUPERLU_MALLOC fails for rwork.");
+	for (i = 0; i < A->nrow; ++i) rwork[i] = 0.;
+	for (j = 0; j < A->ncol; ++j)
+	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) {
+		irow = Astore->rowind[i];
+		rwork[irow] += fabs(Aval[i]);
+	    }
+	value = 0.;
+	for (i = 0; i < A->nrow; ++i)
+	    value = SUPERLU_MAX(value, rwork[i]);
+	
+	SUPERLU_FREE (rwork);
+	
+    } else if ( *norm == 'F' || *norm == 'f' || *norm == 'E' || *norm == 'e' ) {
+	/* Find normF(A): recognized, but there is no implementation yet. */
+	ABORT("Not implemented.");
+    } else
+	ABORT("Illegal norm specified.");
+
+    return (value);
+
+} /* dlangs_dist */
+
diff --git a/SRC/dlaqgs_dist.c b/SRC/dlaqgs_dist.c
new file mode 100644
index 0000000..6db27f9
--- /dev/null
+++ b/SRC/dlaqgs_dist.c
@@ -0,0 +1,143 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Equilibrates a general sparse M by N matrix A 
+*/
+/*
+ * File name:	dlaqgs.c
+ * History:     Modified from LAPACK routine DLAQGE
+ */
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+
+<pre>
+    Purpose   
+    =======   
+
+    DLAQGS_dist equilibrates a general sparse M by N matrix A using the row
+    and column scaling factors in the vectors R and C.   
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+
+    Arguments   
+    =========   
+
+    A       (input/output) SuperMatrix*
+            On exit, the equilibrated matrix.  See EQUED for the form of 
+            the equilibrated matrix. The type of A can be:
+	    Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
+	    
+    R       (input) double*, dimension (A->nrow)
+            The row scale factors for A.
+	    
+    C       (input) double*, dimension (A->ncol)
+            The column scale factors for A.
+	    
+    ROWCND  (input) double
+            Ratio of the smallest R(i) to the largest R(i).
+	    
+    COLCND  (input) double
+            Ratio of the smallest C(i) to the largest C(i).
+	    
+    AMAX    (input) double
+            Absolute value of largest matrix entry.
+	    
+    EQUED   (output) char*
+            Specifies the form of equilibration that was done.   
+            = 'N':  No equilibration   
+            = 'R':  Row equilibration, i.e., A has been premultiplied by  
+                    diag(R).   
+            = 'C':  Column equilibration, i.e., A has been postmultiplied  
+                    by diag(C).   
+            = 'B':  Both row and column equilibration, i.e., A has been
+                    replaced by diag(R) * A * diag(C).   
+
+    Internal Parameters   
+    ===================   
+
+    THRESH is a threshold value used to decide if row or column scaling   
+    should be done based on the ratio of the row or column scaling   
+    factors.  If ROWCND < THRESH, row scaling is done, and if   
+    COLCND < THRESH, column scaling is done.   
+
+    LARGE and SMALL are threshold values used to decide if row scaling   
+    should be done based on the absolute size of the largest matrix   
+    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
+
+    ===================================================================== 
+</pre>
+*/
+void
+dlaqgs_dist(SuperMatrix *A, double *r, double *c, 
+	    double rowcnd, double colcnd, double amax, char *equed)
+{
+
+#define THRESH    (0.1)
+    
+    NCformat *store;
+    double   *val;
+    int_t    col, k;
+    double   lo_thresh, hi_thresh, cscale;
+    int      scale_rows, scale_cols;
+
+    /* Nothing to scale in an empty matrix. */
+    if (A->nrow <= 0 || A->ncol <= 0) {
+	*(unsigned char *)equed = 'N';
+	return;
+    }
+
+    store = (NCformat *) A->Store;
+    val = (double *) store->nzval;
+
+    /* Bounds on amax: a value outside [lo_thresh, hi_thresh] forces
+       row scaling regardless of rowcnd. */
+    lo_thresh = dmach_dist("Safe minimum") / dmach_dist("Precision");
+    hi_thresh = 1. / lo_thresh;
+
+    /* Decide up front which of the two scalings are applied; the
+       result is reported through *equed as 'N', 'C', 'R' or 'B'. */
+    scale_rows = !(rowcnd >= THRESH && amax >= lo_thresh && amax <= hi_thresh);
+    scale_cols = !(colcnd >= THRESH);
+
+    if (!scale_rows && !scale_cols) {
+	/* Neither test asks for scaling: leave A untouched. */
+	*(unsigned char *)equed = 'N';
+    } else if (!scale_rows) {
+	/* Columns only: A := A * diag(C) */
+	for (col = 0; col < A->ncol; ++col) {
+	    cscale = c[col];
+	    for (k = store->colptr[col]; k < store->colptr[col+1]; ++k)
+		val[k] *= cscale;
+	}
+	*(unsigned char *)equed = 'C';
+    } else if (!scale_cols) {
+	/* Rows only: A := diag(R) * A */
+	for (col = 0; col < A->ncol; ++col) {
+	    for (k = store->colptr[col]; k < store->colptr[col+1]; ++k)
+		val[k] *= r[store->rowind[k]];
+	}
+	*(unsigned char *)equed = 'R';
+    } else {
+	/* Both: A := diag(R) * A * diag(C) */
+	for (col = 0; col < A->ncol; ++col) {
+	    cscale = c[col];
+	    for (k = store->colptr[col]; k < store->colptr[col+1]; ++k)
+		val[k] *= cscale * r[store->rowind[k]];
+	}
+	*(unsigned char *)equed = 'B';
+    }
+
+    return;
+
+} /* dlaqgs_dist */
+
diff --git a/SRC/dldperm_dist.c b/SRC/dldperm_dist.c
new file mode 100644
index 0000000..7dcb9c9
--- /dev/null
+++ b/SRC/dldperm_dist.c
@@ -0,0 +1,172 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Finds a row permutation so that the matrix has large entries on the diagonal
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [],
+		    int_t*, int_t [], int_t*, int_t[], int_t*, double [],
+		    int_t [], int_t []);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ *   DLDPERM finds a row permutation so that the matrix has large
+ *   entries on the diagonal.
+ *
+ * Arguments
+ * =========
+ *
+ * job    (input) int
+ *        Control the action. Possible values for JOB are:
+ *        = 1 : Compute a row permutation of the matrix so that the
+ *              permuted matrix has as many entries on its diagonal as
+ *              possible. The values on the diagonal are of arbitrary size.
+ *              HSL subroutine MC21A/AD is used for this.
+ *        = 2 : Compute a row permutation of the matrix so that the smallest 
+ *              value on the diagonal of the permuted matrix is maximized.
+ *        = 3 : Compute a row permutation of the matrix so that the smallest
+ *              value on the diagonal of the permuted matrix is maximized.
+ *              The algorithm differs from the one used for JOB = 2 and may
+ *              have quite a different performance.
+ *        = 4 : Compute a row permutation of the matrix so that the sum
+ *              of the diagonal entries of the permuted matrix is maximized.
+ *        = 5 : Compute a row permutation of the matrix so that the product
+ *              of the diagonal entries of the permuted matrix is maximized
+ *              and vectors to scale the matrix so that the nonzero diagonal 
+ *              entries of the permuted matrix are one in absolute value and 
+ *              all the off-diagonal entries are less than or equal to one in 
+ *              absolute value.
+ *        Restriction: 1 <= JOB <= 5.
+ *
+ * n      (input) int
+ *        The order of the matrix.
+ *
+ * nnz    (input) int
+ *        The number of nonzeros in the matrix.
+ *
+ * adjncy (input) int*, of size nnz
+ *        The adjacency structure of the matrix, which contains the row
+ *        indices of the nonzeros.
+ *
+ * colptr (input) int*, of size n+1
+ *        The pointers to the beginning of each column in ADJNCY.
+ *
+ * nzval  (input) double*, of size nnz
+ *        The nonzero values of the matrix. nzval[k] is the value of
+ *        the entry corresponding to adjncy[k].
+ *        It is not used if job = 1.
+ *
+ * perm   (output) int*, of size n
+ *        The permutation vector. perm[i] = j means row i in the
+ *        original matrix is in row j of the permuted matrix.
+ *
+ * u      (output) double*, of size n
+ *        If job = 5, the natural logarithms of the row scaling factors. 
+ *
+ * v      (output) double*, of size n
+ *        If job = 5, the natural logarithms of the column scaling factors. 
+ *        The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j).
+ * </pre>
+ */
+
+int
+dldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[],
+	double nzval[], int_t *perm, double u[], double v[])
+{ 
+    int_t i, liw, ldw, num;
+    int_t *iw, icntl[10], info[10];
+    double *dw;
+    /* iw[]/dw[] are work arrays handed to mc64ad_dist(); both are freed before returning. */
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(0, "Enter dldperm_dist()");
+#endif
+    liw = 5*n;
+    if ( job == 3 ) liw = 10*n + nnz; /* job 3 is given a larger integer work array */
+    if ( !(iw = intMalloc_dist(liw)) ) ABORT("Malloc fails for iw[]");
+    ldw = 3*n + nnz;
+    if ( !(dw = doubleMalloc_dist(ldw)) ) ABORT("Malloc fails for dw[]");
+	    
+    /* Shift the caller's colptr[]/adjncy[] to 1-based indexing in place (undone below). */
+    for (i = 0; i <= n; ++i) ++colptr[i];
+    for (i = 0; i < nnz; ++i) ++adjncy[i];
+#if ( DEBUGlevel>=2 )
+    printf("LDPERM(): n " IFMT ", nnz " IFMT "\n", n, nnz); /* IFMT: arguments are int_t */
+    PrintInt10("colptr", n+1, colptr);
+    PrintInt10("adjncy", nnz, adjncy);
+#endif
+	
+    /* 
+     * NOTE:
+     * =====
+     *
+     * MC64AD assumes that column permutation vector is defined as:
+     * perm(i) = j means column i of permuted A is in column j of original A.
+     *
+     * Since a symmetric permutation preserves the diagonal entries. Then
+     * by the following relation:
+     *     P'(A*P')P = P'A
+     * we can apply inverse(perm) to rows of A to get large diagonal entries.
+     * But, since 'perm' defined in MC64AD happens to be the reverse of
+     * SuperLU's definition of permutation vector, therefore, it is already
+     * an inverse for our purpose. We will thus use it directly.
+     *
+     */
+    mc64id_dist(icntl);
+    /* Suppress error and warning messages. */
+    icntl[0] = -1;
+    icntl[1] = -1;
+
+    mc64ad_dist(&job, &n, &nnz, colptr, adjncy, nzval, &num, perm,
+	        &liw, iw, &ldw, dw, icntl, info);
+
+#if ( DEBUGlevel>=2 )
+    PrintInt10("perm", n, perm);
+    printf(".. After MC64AD info " IFMT "\tsize of matching " IFMT "\n", info[0], num);
+#endif
+    if ( info[0] == 1 ) { /* Structurally singular */
+        printf(".. The last " IFMT " permutations:\n", n-num);
+	PrintInt10("perm", n-num, &perm[num]);
+    }
+
+    /* Restore colptr[]/adjncy[] to 0-based indexing and shift perm[] down to match. */
+    for (i = 0; i <= n; ++i) --colptr[i];
+    for (i = 0; i < nnz; ++i) --adjncy[i];
+    for (i = 0; i < n; ++i) --perm[i];
+
+    if ( job == 5 ) /* copy dw[0..n-1] into u[] and dw[n..2n-1] into v[] */
+        for (i = 0; i < n; ++i) {
+	    u[i] = dw[i];
+	    v[i] = dw[n+i];
+	}
+
+    SUPERLU_FREE(iw);
+    SUPERLU_FREE(dw);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(0, "Exit dldperm_dist()");
+#endif
+   return (info[0]); /* first status entry filled in by mc64ad_dist() */
+}
+
diff --git a/SRC/dlook_ahead_update.c b/SRC/dlook_ahead_update.c
new file mode 100644
index 0000000..7521506
--- /dev/null
+++ b/SRC/dlook_ahead_update.c
@@ -0,0 +1,251 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/************************************************************************/
+/*! @file 
+ * \brief Look-ahead update of the Schur complement.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+#ifdef ISORT
+while (j < nub && iperm_u[j] <= k0 + num_look_aheads)
+#else
+while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
+#endif
+{
+    double zero = 0.0;
+    /* NOTE(review): j, nub, k0, usub, lsub, ... are not declared in this file -- presumably it is #included inside a factorization routine; confirm. */
+    /* Search along the row for the pointers {iukp, rukp} pointing to
+     * block U(k,j).
+     * j    -- current block in look-ahead window, initialized to 0 on entry
+     * iukp -- points to the start of index[] metadata
+     * rukp -- points to the start of nzval[] array
+     * jb   -- block number of block U(k,j), update destination column
+     */
+    arrive_at_ublock(
+		     j, &iukp, &rukp, &jb, &ljb, &nsupc,
+         	     iukp0, rukp0, usub, perm_u, xsup, grid
+		    );
+    j++;
+    jj0++;
+    jj = iukp;
+
+    while (usub[jj] == klst) ++jj; /* Skip zero segments */
+
+    ldu = klst - usub[jj++]; /* length of the first nonzero segment; grows to the longest below */
+    ncols = 1; /* counts the columns of U(k,j) that have a nonzero segment */
+    full = 1; /* flag the U block is indeed 'full', containing segments
+                 of same length. No need padding 0.  */
+    for (; jj < iukp + nsupc; ++jj) { /* for each column jj in block U(k,j) */
+        segsize = klst - usub[jj];
+        if (segsize) {
+            ++ncols;
+            if (segsize != ldu) full = 0; /* need padding 0 */
+            if (segsize > ldu)  ldu = segsize;
+        }
+    }
+#if ( DEBUGlevel>=3 )
+    ++num_update;
+#endif
+    if (0) { /* in-place use of uval[] is disabled: the copy in the else branch always runs */
+        tempu = &uval[rukp];
+    }
+    else { /* Copy block U(k,j) into bigU, padding leading zeros up to ldu. */
+#if ( DEBUGlevel>=3 )
+        printf ("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+                iam, full, k, jb, ldu, ncols, nsupc);
+        ++num_copy;
+#endif
+        tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
+        for (jj = iukp; jj < iukp + nsupc; ++jj) {
+            segsize = klst - usub[jj];
+            if (segsize) {
+                lead_zero = ldu - segsize;
+                for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+                tempu += lead_zero;
+                for (i = 0; i < segsize; ++i) {
+                    tempu[i] = uval[rukp + i];
+                }
+                rukp += segsize;
+                tempu += segsize;
+            }
+        }
+        tempu = bigU; /* rewind to the start of the packed block */
+        rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+    } /* end if (0) ... else copy */
+
+    nbrow = lsub[1]; /* number of row subscripts in L(:,k) */
+    if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows */
+    // double ttx =SuperLU_timer_();
+
+    int current_b = 0; /* Each thread starts searching from first block.
+                          This records the moving search target.           */
+    lptr = lptr0; /* point to the start of index[] in supernode L(:,k) */
+    luptr = luptr0;
+
+#ifdef _OPENMP
+    /* Sherry -- examine all the shared variables ??
+       'firstprivate' ensures that the private variables are initialized
+       to the values before entering the loop  */
+#pragma omp parallel for \
+    firstprivate(lptr,luptr,ib,tempv,current_b) private(lb) \
+    default(shared) schedule(dynamic)
+#endif
+    for (lb = 0; lb < nlb; lb++) { /* Loop through each block in L(:,k) */
+        int temp_nbrow; /* automatic variable is private */
+
+        /* Search for the L block that my thread will work on.
+           No need to search from 0, can continue at the point where
+           it is left from last iteration.
+           Note: Blocks may not be sorted in L. Different thread picks up
+	   different lb.   */
+        for (; current_b < lb; ++current_b) {
+            temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;   /* move to next block */
+            luptr += temp_nbrow;  /* move to next block */
+        }
+
+#ifdef _OPENMP        
+        int_t thread_id = omp_get_thread_num ();
+#else
+        int_t thread_id = 0;
+#endif
+        double * tempv = bigV + ldt*ldt*thread_id; /* this thread's ldt*ldt slice of bigV; shadows the tempv named in firstprivate */
+
+        int *indirect_thread  = indirect + ldt * thread_id; /* per-thread slices of length ldt */
+        int *indirect2_thread = indirect2 + ldt * thread_id;        
+        ib = lsub[lptr];        /* block number of L(i,k) */
+        temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
+	/* assert (temp_nbrow <= nbrow); */
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+
+        /* GEMM: tempv (temp_nbrow x ncols) = alpha * L block (temp_nbrow x ldu) * tempu (ldu x ncols) + beta * tempv */
+#if defined (USE_VENDOR_BLAS)
+        dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                   &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
+                   tempu, &ldu, &beta, tempv, &temp_nbrow, 1, 1);
+#else
+        dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                   &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
+                   tempu, &ldu, &beta, tempv, &temp_nbrow );
+#endif
+
+        /* Now scatter the GEMM result held in tempv */
+        if (ib < jb) {    /* A(i,j) is in U. */
+            dscatter_u (ib, jb,
+                       nsupc, iukp, xsup,
+                       klst, temp_nbrow,
+                       lptr, temp_nbrow, lsub,
+                       usub, tempv, Ufstnz_br_ptr, Unzval_br_ptr, grid);
+        } else {          /* A(i,j) is in L. */
+            dscatter_l (ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+                       temp_nbrow, usub, lsub, tempv,
+                       indirect_thread, indirect2_thread, 
+                       Lrowind_bc_ptr, Lnzval_bc_ptr, grid);
+        }
+
+        ++current_b;         /* move to next block */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+
+    } /* end parallel for lb = 0, nlb ... all blocks in L(:,k) */
+
+    rukp += usub[iukp - 1]; /* Move to next U block, U(k,j+1) */
+    iukp += nsupc;
+
+    /* =========================================== *
+     * == factorize L(:,j) and send if possible == *
+     * =========================================== */
+    kk = jb; /* destination column that is just updated */
+    kcol = PCOL (kk, grid);
+#ifdef ISORT
+    kk0 = iperm_u[j - 1]; /* j was already incremented above, hence j - 1 */
+#else
+    kk0 = perm_u[2 * (j - 1)]; /* j was already incremented above, hence j - 1 */
+#endif
+    look_id = kk0 % (1 + num_look_aheads);
+
+    if (look_ahead[kk] == k0 && kcol == mycol) {
+        /* current column is the last dependency */
+        look_id = kk0 % (1 + num_look_aheads); /* same value as computed just above */
+
+        /* Factor diagonal and subdiagonal blocks and test for exact
+           singularity.  */
+        factored[kk] = 0;
+        /* double ttt1 = SuperLU_timer_(); */
+#if ( VAMPIR>=1 )
+        VT_begin (5);
+#endif
+
+        PDGSTRF2(options, kk0, kk, thresh, Glu_persist, grid, Llu,
+                  U_diag_blk_send_req, tag_ub, stat, info);
+
+#if ( VAMPIR>=1 )
+        VT_end (5);
+#endif
+        /* stat->time7 += SuperLU_timer_() - ttt1; */
+
+        /* Multicasts numeric values of L(:,kk) to process rows. */
+        send_req = send_reqs[look_id];
+        msgcnt = msgcnts[look_id];
+
+        lk = LBj (kk, grid);    /* Local block number. */
+        lsub1 = Lrowind_bc_ptr[lk];
+        lusup1 = Lnzval_bc_ptr[lk];
+        if (lsub1) { /* message sizes: index entries incl. header/descriptors, then numerical values */
+            msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR;
+            msgcnt[1] = lsub1[1] * SuperSize (kk);
+        } else { /* no local L(:,kk) data: zero-length messages */
+            msgcnt[0] = 0;
+            msgcnt[1] = 0;
+        }
+
+        scp = &grid->rscp;      /* The scope of process row. */
+        for (pj = 0; pj < Pc; ++pj) {
+            if (ToSendR[lk][pj] != EMPTY) {
+#if ( PROFlevel>=1 )
+                TIC (t1);
+#endif
+#if ( VAMPIR>=1 )
+                VT_begin (1);
+#endif
+                MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
+                           SLU_MPI_TAG (0, kk0) /* (4*kk0)%tag_ub */ ,
+                           scp->comm, &send_req[pj]);
+                MPI_Isend (lusup1, msgcnt[1], MPI_DOUBLE, pj,
+                           SLU_MPI_TAG (1, kk0) /* (4*kk0+1)%tag_ub */ ,
+                           scp->comm, &send_req[pj + Pc]);
+#if ( VAMPIR>=1 )
+                VT_end (1);
+#endif
+#if ( PROFlevel>=1 )
+                TOC (t2, t1);
+                stat->utime[COMM] += t2;
+                msg_cnt += 2;
+                msg_vol += msgcnt[0] * iword + msgcnt[1] * dword;
+#endif
+#if ( DEBUGlevel>=2 )
+                printf ("[%d] -2- Send L(:,%4d): #lsub %4d, #lusup %4d to Pj %2d, tags %d:%d \n",
+                        iam, kk, msgcnt[0], msgcnt[1], pj,
+			SLU_MPI_TAG(0,kk0), SLU_MPI_TAG(1,kk0));
+#endif
+            }  /* end if ( ToSendR[lk][pj] != EMPTY ) */
+        } /* end for pj ... */
+    } /* end if( look_ahead[kk] == k0 && kcol == mycol ) */
+} /* end while j < nub and perm_u[2*j] (or iperm_u[j]) <= k0 + num_look_aheads */
+
diff --git a/SRC/dmach_dist.c b/SRC/dmach_dist.c
new file mode 100644
index 0000000..a481485
--- /dev/null
+++ b/SRC/dmach_dist.c
@@ -0,0 +1,94 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+double dmach_dist(char *cmach)
+{
+/*  -- SuperLU auxiliary routine (version 5.0) --
+    This uses C99 standard constants, and is thread safe.
+
+    Must be compiled with -std=c99 flag.
+
+
+    Purpose
+    =======
+
+    DMACH returns double precision machine parameters.
+
+    Arguments
+    =========
+
+    CMACH   (input) CHARACTER*1
+            Specifies the value to be returned by DMACH:
+            = 'E' or 'e',   DMACH := eps
+            = 'S' or 's',   DMACH := sfmin
+            = 'B' or 'b',   DMACH := base
+            = 'P' or 'p',   DMACH := eps*base
+            = 'N' or 'n',   DMACH := t
+            = 'R' or 'r',   DMACH := rnd
+            = 'M' or 'm',   DMACH := emin
+            = 'U' or 'u',   DMACH := rmin
+            = 'L' or 'l',   DMACH := emax
+            = 'O' or 'o',   DMACH := rmax
+            (any other character: DMACH := 0.0)
+            where
+
+            eps   = relative machine precision
+            sfmin = safe minimum, such that 1/sfmin does not overflow
+            base  = base of the machine
+            prec  = eps*base
+            t     = number of (base) digits in the mantissa
+            rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise
+            emin  = minimum exponent before (gradual) underflow
+            rmin  = underflow threshold - base**(emin-1)
+            emax  = largest exponent before overflow
+            rmax  = overflow threshold  - (base**emax)*(1-eps)
+
+   =====================================================================
+*/
+
+    double sfmin, small, rmach = 0.0;	/* 0.0 is what an unknown code yields */
+    const char c = *cmach;		/* only the first character is examined */
+    if ( c == 'E' || c == 'e' ) {
+	rmach = DBL_EPSILON * 0.5;
+    } else if ( c == 'S' || c == 's' ) {
+	sfmin = DBL_MIN;
+	small = 1. / DBL_MAX;
+	if (small >= sfmin) {
+	    /* Use SMALL plus a bit, to avoid the possibility of rounding
+	       causing overflow when computing  1/sfmin. */
+	    sfmin = small * (DBL_EPSILON*0.5 + 1.);
+	}
+	rmach = sfmin;
+    } else if ( c == 'B' || c == 'b' ) {
+	rmach = FLT_RADIX;
+    } else if ( c == 'P' || c == 'p' ) {
+	rmach = DBL_EPSILON * 0.5 * FLT_RADIX;
+    } else if ( c == 'N' || c == 'n' ) {
+	rmach = DBL_MANT_DIG;
+    } else if ( c == 'R' || c == 'r' ) {
+	rmach = FLT_ROUNDS;	/* the <float.h> rounding-mode value, passed through */
+    } else if ( c == 'M' || c == 'm' ) {
+	rmach = DBL_MIN_EXP;
+    } else if ( c == 'U' || c == 'u' ) {
+	rmach = DBL_MIN;
+    } else if ( c == 'L' || c == 'l' ) {
+	rmach = DBL_MAX_EXP;
+    } else if ( c == 'O' || c == 'o' ) {
+	rmach = DBL_MAX;
+    }
+
+    return rmach;
+
+} /* end dmach_dist */
diff --git a/SRC/dmemory.patch b/SRC/dmemory.patch
new file mode 100644
index 0000000..8e323ee
--- /dev/null
+++ b/SRC/dmemory.patch
@@ -0,0 +1,8 @@
+132c132
+<     buf = (double *) SUPERLU_MALLOC(n * sizeof(double)); 
+---
+>     buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double) ); 
+141c141
+<     buf = (double *) SUPERLU_MALLOC(n * sizeof(double));
+---
+>     buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double));
diff --git a/SRC/dmemory_dist.c b/SRC/dmemory_dist.c
new file mode 100644
index 0000000..8f9e7a2
--- /dev/null
+++ b/SRC/dmemory_dist.c
@@ -0,0 +1,169 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Memory utilities
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+
+/* Variables external to this file */
+extern LU_stack_t stack;
+
+
+void *duser_malloc_dist(int_t bytes, int_t which_end)
+{
+    /* Hands out `bytes` from the external `stack`; NULL when StackFull(bytes). */
+    char *base = (char *) stack.array;
+    void *chunk = NULL;
+    if ( !StackFull(bytes) ) {
+	if ( which_end != HEAD ) {	/* other end: lower top2, chunk starts there */
+	    stack.top2 -= bytes;
+	    chunk = base + stack.top2;
+	} else {			/* HEAD end: chunk starts at top1, then raise it */
+	    chunk = base + stack.top1;
+	    stack.top1 += bytes;
+	}
+	stack.used += bytes;
+    }
+    return chunk;
+}
+
+
+void duser_free_dist(int_t bytes, int_t which_end)
+{
+    /* Reverses the bookkeeping of duser_malloc_dist() for `bytes` at one end. */
+    stack.used -= bytes;
+    if ( which_end != HEAD )
+	stack.top2 += bytes;
+    else
+	stack.top1 -= bytes;
+}
+
+
+
+/*! \brief Tally the bytes held by the local L and U blocks; always returns 0.
+ *
+ * <pre>
+ * mem_usage consists of the following fields:
+ *    - for_lu (float)
+ *      The amount of space used in bytes for the L\U data structures.
+ *    - total (float)
+ *      for_lu plus stat->peak_buffer (see the end of the routine).
+ *    - expansions (int)
+ *      Number of memory expansions during the LU factorization (not set here).
+ * </pre> Sizes are formed in double so that an int_t product cannot overflow.
+ */
+int_t dQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
+		       SuperLUStat_t *stat, superlu_dist_mem_usage_t *mem_usage)
+{
+    register int_t dword, gb, iword, k, nb, nsupers;
+    int_t *index, *xsup;
+    int iam, mycol, myrow;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    iword = sizeof(int_t);
+    dword = sizeof(double);
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    mem_usage->for_lu = 0.;
+
+    /* For L factor */
+    nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
+    for (k = 0; k < nb; ++k) {
+	gb = k * grid->npcol + mycol; /* Global block number. */
+	if ( gb < nsupers ) {
+	    index = Llu->Lrowind_bc_ptr[k];
+	    if ( index ) {
+		mem_usage->for_lu += (float)
+		    (((double) BC_HEADER + (double) index[0]*LB_DESCRIPTOR + index[1]) * iword);
+		mem_usage->for_lu += (float)((double) index[1]*SuperSize( gb )*dword);
+	    }
+	}
+    }
+
+    /* For U factor */
+    nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */
+    for (k = 0; k < nb; ++k) {
+	gb = k * grid->nprow + myrow; /* Global block number. */
+	if ( gb < nsupers ) {
+	    index = Llu->Ufstnz_br_ptr[k];
+	    if ( index ) {
+		mem_usage->for_lu += (float)((double) index[2] * iword);
+		mem_usage->for_lu += (float)((double) index[1] * dword);
+	    }
+	}
+    }
+
+    /* Working storage to support factorization */
+    mem_usage->total = mem_usage->for_lu;
+#if 0
+    mem_usage->total +=
+	(float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword +
+		( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword );
+    /**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/
+    mem_usage->total +=
+	(float)( Llu->bufmax[0] * iword +  Llu->bufmax[1] * dword );
+    mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword;
+    k = CEILING( nsupers, grid->nprow );
+    mem_usage->total += (float)(2 * k * iword);
+#else
+    /*mem_usage->total += stat->current_buffer;*/
+    printf(".. dQuery_Space: peak_buffer %.2f (MB)\n", stat->peak_buffer * 1.0e-6);
+    mem_usage->total += stat->peak_buffer;
+#endif
+
+    return 0;
+} /* dQuerySpace_dist */
+
+
+/*
+ * Allocate A's storage: nnz values, nnz indices, n+1 pointers (no NULL checks).
+ */
+void dallocateA_dist(int_t n, int_t nnz, double **a, int_t **asub, int_t **xa)
+{
+    double *values  = (double *) doubleMalloc_dist(nnz);
+    int_t  *indices = (int_t *) intMalloc_dist(nnz);
+    int_t  *ptrs    = (int_t *) intMalloc_dist(n+1);
+    *a = values;  *asub = indices;  *xa = ptrs;
+}
+
+
+double *doubleMalloc_dist(int_t n)
+{
+    /* Room for n doubles; at least one element is always requested. */
+    const int_t count = SUPERLU_MAX(1, n);
+    return (double *) SUPERLU_MALLOC( count * sizeof(double) );
+}
+
+double *doubleCalloc_dist(int_t n)
+{
+    /* Same request size as doubleMalloc_dist(); entries 0..n-1 are set to 0.0.
+       A failed allocation is passed back to the caller as NULL. */
+    double *vals = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double));
+    int_t left = n;
+    if ( vals != NULL )
+	while ( left-- > 0 ) vals[left] = 0.0;
+    return vals;
+}
+
diff --git a/SRC/dmyblas2_dist.c b/SRC/dmyblas2_dist.c
new file mode 100644
index 0000000..a7bec6d
--- /dev/null
+++ b/SRC/dmyblas2_dist.c
@@ -0,0 +1,248 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Level 2 BLAS operations: solves and matvec, written in C
+ *
+ * <pre>
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * </pre>
+ */
+/*
+ * File name:		dmyblas2_dist.c
+ * Purpose:
+ *     Level 2 BLAS operations: solves and matvec, written in C.
+ * Note:
+ *     This is only used when the system lacks an efficient BLAS library.
+ */
+
+/*! \brief Forward substitution with a dense unit lower triangular matrix.
+ *
+ * <pre>
+ * Solves a dense UNIT lower triangular system. The unit lower 
+ * triangular matrix is stored column by column in M, ldm entries apart;
+ * the diagonal is never read. rhs[0..ncol-1] is overwritten by the solution.
+ * </pre>
+ */
+void dlsolve ( int ldm, int ncol, double *M, double *rhs )
+{
+    int k;
+    double x0, x1, x2, x3, x4, x5, x6, x7;
+    double *M0;
+    register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
+    register int firstcol = 0;
+
+    M0 = &M[0];	/* M0 tracks the diagonal entry of column firstcol */
+
+    while ( firstcol < ncol - 7 ) { /* Do 8 columns */
+      Mki0 = M0 + 1;	/* first entry below the diagonal of this column */
+      Mki1 = Mki0 + ldm + 1;
+      Mki2 = Mki1 + ldm + 1;
+      Mki3 = Mki2 + ldm + 1;
+      Mki4 = Mki3 + ldm + 1;
+      Mki5 = Mki4 + ldm + 1;
+      Mki6 = Mki5 + ldm + 1;
+      Mki7 = Mki6 + ldm + 1;
+
+      x0 = rhs[firstcol];
+      x1 = rhs[firstcol+1] - x0 * *Mki0++;
+      x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
+      x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
+      x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
+	                   - x3 * *Mki3++;
+      x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
+	                   - x3 * *Mki3++ - x4 * *Mki4++;
+      x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
+	                   - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++;
+      x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
+	                   - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++
+			   - x6 * *Mki6++;
+
+      rhs[++firstcol] = x1;	/* rhs[firstcol] already holds x0 */
+      rhs[++firstcol] = x2;
+      rhs[++firstcol] = x3;
+      rhs[++firstcol] = x4;
+      rhs[++firstcol] = x5;
+      rhs[++firstcol] = x6;
+      rhs[++firstcol] = x7;
+      ++firstcol;
+    
+      for (k = firstcol; k < ncol; k++)	/* update the rows below this panel */
+	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
+	                - x2 * *Mki2++ - x3 * *Mki3++
+                        - x4 * *Mki4++ - x5 * *Mki5++
+			- x6 * *Mki6++ - x7 * *Mki7++;
+ 
+      M0 += 8 * ldm + 8;	/* eight steps along the diagonal */
+    }
+
+    while ( firstcol < ncol - 3 ) { /* Do 4 columns */
+      Mki0 = M0 + 1;
+      Mki1 = Mki0 + ldm + 1;
+      Mki2 = Mki1 + ldm + 1;
+      Mki3 = Mki2 + ldm + 1;
+
+      x0 = rhs[firstcol];
+      x1 = rhs[firstcol+1] - x0 * *Mki0++;
+      x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
+      x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
+
+      rhs[++firstcol] = x1;
+      rhs[++firstcol] = x2;
+      rhs[++firstcol] = x3;
+      ++firstcol;
+    
+      for (k = firstcol; k < ncol; k++)
+	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
+	                - x2 * *Mki2++ - x3 * *Mki3++;
+ 
+      M0 += 4 * ldm + 4;	/* four steps along the diagonal */
+    }
+
+    if ( firstcol < ncol - 1 ) { /* Do 2 columns */
+      Mki0 = M0 + 1;
+      Mki1 = Mki0 + ldm + 1;
+
+      x0 = rhs[firstcol];
+      x1 = rhs[firstcol+1] - x0 * *Mki0++;
+
+      rhs[++firstcol] = x1;
+      ++firstcol;
+    
+      for (k = firstcol; k < ncol; k++)
+	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++;
+ 
+    }
+    return;    /* a single leftover column needs no work: its diagonal is 1 */
+}
+
+/*! \brief Back substitution with a dense upper triangular matrix.
+ *
+ * <pre>
+ * Solves a dense upper triangular system. M is addressed column by column
+ * with leading dimension ldm, i.e. M(i,j) is M[i + j*ldm]. Unknowns are
+ * resolved from the last to the first and the solution overwrites rhs.
+ * Diagonal entries are used as divisors without any check.
+ * </pre>
+ */
+void
+dusolve (
+	int ldm,	/* in -- leading dimension of M */
+	int ncol,	/* in -- order of the triangular system */
+	double *M,	/* in */
+	double *rhs	/* in: right-hand side, out: solution */
+)
+{
+    int col = ncol;
+
+    /* Walk the columns backwards: ncol-1, ncol-2, ..., 0. */
+    while ( col-- > 0 ) {
+	/* Dividing by the diagonal M(col,col) finalizes unknown `col`. */
+	const double x = rhs[col] / M[col + col*ldm];
+	int row;
+
+	rhs[col] = x;
+
+	/* Subtract its contribution from all equations above it. */
+	for (row = 0; row != col; ++row)
+	    rhs[row] -= x * M[row + col*ldm];	/* M(row, col) */
+    }
+
+    return;
+}
+
+
+/*! \brief Accumulate a dense matrix-vector product.
+ * <pre>
+ * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
+ * M is nrow x ncol, stored by columns ldm apart; columns are consumed in
+ * panels of 8, then 4, then one at a time. The result is left in Mxvec[].
+ * </pre>
+ */
+void dmatvec (
+	int ldm,	/* in -- leading dimension of M */
+	int nrow,	/* in -- rows used from every column */ 
+	int ncol,	/* in -- columns of M, entries read from vec */
+	double *M,	/* in */
+	double *vec,	/* in */
+	double *Mxvec	/* in/out */
+)
+{
+    const double *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; /* columns of a panel */
+    double w0, w1, w2, w3, w4, w5, w6, w7;	/* matching entries of vec */
+    double *panel = M;		/* first column not yet processed */
+    int col = 0;		/* index of that column */
+    int r;
+
+    /* Eight columns at a time: each pass adds w0*c0 + ... + w7*c7 to Mxvec,
+       summed left to right for every row. */
+    for ( ; col < ncol - 7; col += 8, panel += 8 * ldm) {
+
+	c0 = panel;
+	c1 = c0 + ldm;
+	c2 = c1 + ldm;
+	c3 = c2 + ldm;
+	c4 = c3 + ldm;
+	c5 = c4 + ldm;
+	c6 = c5 + ldm;
+	c7 = c6 + ldm;
+
+	w0 = vec[col];
+	w1 = vec[col+1];
+	w2 = vec[col+2];
+	w3 = vec[col+3];
+	w4 = vec[col+4];
+	w5 = vec[col+5];
+	w6 = vec[col+6];
+	w7 = vec[col+7];
+
+	for (r = 0; r < nrow; ++r)
+	    Mxvec[r] += w0 * c0[r] + w1 * c1[r]
+		      + w2 * c2[r] + w3 * c3[r]
+		      + w4 * c4[r] + w5 * c5[r]
+		      + w6 * c6[r] + w7 * c7[r];
+    }
+
+    /* Then four columns at a time,
+       for as long as at least four remain. */
+    for ( ; col < ncol - 3; col += 4, panel += 4 * ldm) {
+
+	c0 = panel;
+	c1 = c0 + ldm;
+	c2 = c1 + ldm;
+	c3 = c2 + ldm;
+
+	w0 = vec[col];
+	w1 = vec[col+1];
+	w2 = vec[col+2];
+	w3 = vec[col+3];
+
+	for (r = 0; r < nrow; ++r)
+	    Mxvec[r] += w0 * c0[r] + w1 * c1[r]
+		      + w2 * c2[r] + w3 * c3[r];
+    }
+
+    /* Whatever is left (at most three columns) is handled singly. */
+    for ( ; col < ncol; ++col, panel += ldm) {
+
+	c0 = panel;
+	w0 = vec[col];
+	for (r = 0; r < nrow; ++r)
+	    Mxvec[r] += w0 * c0[r];
+    }
+
+    return;
+}
+
diff --git a/SRC/dreadMM.c b/SRC/dreadMM.c
new file mode 100644
index 0000000..9ddc538
--- /dev/null
+++ b/SRC/dreadMM.c
@@ -0,0 +1,243 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+
+/*! @file 
+ * \brief 
+ * Contributed by Francois-Henry Rouet.
+ *
+ */
+#include <ctype.h>
+#include "superlu_ddefs.h"
+
+#undef EXPAND_SYM
+
+/*! \brief Read a real, square Matrix Market coordinate file into column storage.
+ *
+ * <pre>
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *	column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.  Malformed input ends the program with exit(-1).
+ * </pre>
+ */
+
+void
+dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
+	    double **nzval, int_t **rowind, int_t **colptr)
+{
+    int_t    j, k, jsize, nnz, nz, new_nonz;
+    double *a, *val;
+    int_t    *asub, *xa, *row, *col;
+    int_t    zero_base = 0;
+    char *p, line[512], banner[64], mtx[64], crd[64], arith[64], sym[64];
+    int expand;
+    long long lm, ln, lnz, lrow, lcol; /* scan targets, narrowed to int_t afterwards */
+    /* 	File format:
+     *    %%MatrixMarket matrix coordinate real general/symmetric/...
+     *    % ...
+     *    % (optional comments)
+     *    % ...
+     *    #rows    #cols    #non-zero
+     *    Triplet in the rest of lines: row    col    value
+     */
+
+     /* 1/ read header */ 
+     if (fgets(line,512,fp) == NULL) line[0] = '\0'; /* rejected by the token count below */
+     for (p=line; *p!='\0'; p++) *p = (char) tolower((unsigned char) *p);
+
+     if (sscanf(line, "%63s %63s %63s %63s %63s", banner, mtx, crd, arith, sym) != 5) {
+       printf("Invalid header (first line does not contain 5 tokens)\n");
+       exit(-1);
+     }
+ 
+     if(strcmp(banner,"%%matrixmarket")) {
+       printf("Invalid header (first token is not \"%%%%MatrixMarket\")\n");
+       exit(-1);
+     }
+
+     if(strcmp(mtx,"matrix")) {
+       printf("Not a matrix; this driver cannot handle that.\n");
+       exit(-1);
+     }
+
+     if(strcmp(crd,"coordinate")) {
+       printf("Not in coordinate format; this driver cannot handle that.\n");
+       exit(-1);
+     }
+
+     if(strcmp(arith,"real")) {
+       if(!strcmp(arith,"complex")) {
+         printf("Complex matrix; use zreadMM instead!\n");
+         exit(-1);
+       }
+       else if(!strcmp(arith, "pattern")) {
+         printf("Pattern matrix; values are needed!\n");
+         exit(-1);
+       }
+       else {
+         printf("Unknown arithmetic\n");
+         exit(-1);
+       }
+     }
+
+     if(strcmp(sym,"general")) {
+       printf("Symmetric matrix: will be expanded\n");
+       expand=1;
+     } else
+       expand=0;
+
+     /* 2/ Skip comments (the width limit keeps long comment tokens inside banner[]) */
+     while(banner[0]=='%') {
+       if (fgets(line,512,fp) == NULL) {
+         printf("Unexpected end of file while skipping comments\n");
+         exit(-1);
+       }
+       sscanf(line,"%63s",banner);
+     }
+
+     /* 3/ Read m, n and nnz */
+    if (sscanf(line, "%lld%lld%lld", &lm, &ln, &lnz) != 3) {
+      printf("Invalid size line (expected: #rows #cols #non-zero)\n");
+      exit(-1);
+    }
+    *m = (int_t) lm; *n = (int_t) ln; *nonz = (int_t) lnz;
+
+    if(*m!=*n) {
+      printf("Rectangular matrix!. Abort\n");
+      exit(-1);
+   }
+
+    /* Expansion mirrors each off-diagonal entry once, so 2*nonz always suffices. */
+    new_nonz = expand ? 2 * *nonz : *nonz;
+
+    printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz);
+    dallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
+    a    = *nzval;
+    asub = *rowind;
+    xa   = *colptr;
+
+    if ( !(val = (double *) SUPERLU_MALLOC(new_nonz * sizeof(double))) )
+        ABORT("Malloc fails for val[]");
+    if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for row[]");
+    if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for col[]");
+
+    for (j = 0; j < *n; ++j) xa[j] = 0;
+
+    /* 4/ Read triplets of values */
+    for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
+	if (fscanf(fp, "%lld%lld%lf\n", &lrow, &lcol, &val[nz]) != 3) {
+	    fprintf(stderr, "triplet " IFMT ": cannot read row, column and value\n", nnz);
+	    exit(-1);
+	}
+	row[nz] = (int_t) lrow; col[nz] = (int_t) lcol;
+
+	if ( nnz == 0 ) { /* first nonzero */
+	    if ( row[0] == 0 || col[0] == 0 ) {
+		zero_base = 1;
+		printf("triplet file: row/col indices are zero-based.\n");
+	    } else
+		printf("triplet file: row/col indices are one-based.\n");
+	}
+
+	if ( !zero_base ) {
+	    /* Change to 0-based indexing. */
+	    --row[nz];
+	    --col[nz];
+	}
+
+	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
+	    /*|| val[nz] == 0.*/) {
+	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n", 
+		    nz, row[nz], col[nz], val[nz]);
+	    exit(-1);
+	} else {
+	    ++xa[col[nz]];
+            if(expand) {
+	        if ( row[nz] != col[nz] ) { /* Excluding diagonal */
+	          ++nz;
+	          row[nz] = col[nz-1];
+	          col[nz] = row[nz-1];
+	          val[nz] = val[nz-1];
+	          ++xa[col[nz]];
+	        }
+            }	
+	    ++nz;
+	}
+    }
+
+    *nonz = nz;
+    if(expand) {
+      printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz);
+    }
+
+    /* Initialize the array of column pointers */
+    k = 0;
+    jsize = xa[0];
+    xa[0] = 0;
+    for (j = 1; j < *n; ++j) {
+	k += jsize;
+	jsize = xa[j];
+	xa[j] = k;
+    }
+    
+    /* Copy the triplets into the column oriented storage */
+    for (nz = 0; nz < *nonz; ++nz) {
+	j = col[nz];
+	k = xa[j];
+	asub[k] = row[nz];
+	a[k] = val[nz];
+	++xa[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column */
+    for (j = *n; j > 0; --j)
+	xa[j] = xa[j-1];
+    xa[0] = 0;
+
+    SUPERLU_FREE(val);
+    SUPERLU_FREE(row);
+    SUPERLU_FREE(col);
+
+#ifdef CHK_INPUT
+    int i;
+    for (i = 0; i < *n; i++) {
+	printf("Col %d, xa %d\n", i, xa[i]);
+	for (k = xa[i]; k < xa[i+1]; k++)
+	    printf("%d\t%16.10f\n", asub[k], a[k]);
+    }
+#endif
+
+}
+
+
+static void dreadrhs(int m, double *b)
+{
+    FILE *fp = fopen("b.dat", "r");	/* b[0..m-1] is read from this file */
+    int i;
+    if ( !fp ) {
+        fprintf(stderr, "dreadrhs: file does not exist\n");
+	exit(-1);
+    }
+    for (i = 0; i < m; ++i)
+      if ( fscanf(fp, "%lf\n", &b[i]) != 1 ) {	/* stop instead of leaving b[i] unset */
+        fprintf(stderr, "dreadrhs: cannot read entry %d\n", i);
+        exit(-1);
+      }
+    fclose(fp);
+}
+
+
diff --git a/SRC/dreadhb.c b/SRC/dreadhb.c
new file mode 100644
index 0000000..2d4d475
--- /dev/null
+++ b/SRC/dreadhb.c
@@ -0,0 +1,389 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Read a DOUBLE PRECISION matrix stored in Harwell-Boeing format
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "superlu_ddefs.h"
+
+/*
+ * Prototypes
+ */
+static void ReadVector(FILE *, int_t, int_t *, int_t, int_t);
+static void dReadValues(FILE *, int_t, double *, int_t, int_t);
+extern void FormFullA(int_t, int_t *, double **, int_t **, int_t **);
+static int DumpLine(FILE *);
+static int ParseIntFormat(char *, int_t *, int_t *);
+static int ParseFloatFormat(char *, int_t *, int_t *);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ * 
+ * Read a DOUBLE PRECISION matrix stored in Harwell-Boeing format 
+ * as described below.
+ * 
+ * Line 1 (A72,A8) 
+ *  	Col. 1 - 72   Title (TITLE) 
+ *	Col. 73 - 80  Key (KEY) 
+ * 
+ * Line 2 (5I14) 
+ * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
+ * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
+ * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
+ * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
+ *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
+ *                    (including starting guesses and solution vectors 
+ *		       if present) 
+ *           	      (zero indicates no right-hand side data is present) 
+ *
+ * Line 3 (A3, 11X, 4I14) 
+ *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
+ * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
+ * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
+ *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
+ *	              (equal to number of entries for assembled matrices) 
+ * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
+ *	              (zero in the case of assembled matrices) 
+ * Line 4 (2A16, 2A20) 
+ * 	Col. 1 - 16   Format for pointers (PTRFMT) 
+ *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
+ *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
+ * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
+ *
+ * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
+ *    	Col. 1 	      Right-hand side type: 
+ *	         	  F for full storage or M for same format as matrix 
+ *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
+ *    	Col. 3        X if an exact solution vector(s) is supplied. 
+ *	Col. 15 - 28  Number of right-hand sides (NRHS) 
+ *	Col. 29 - 42  Number of row indices (NRHSIX) 
+ *          	      (ignored in case of unassembled matrices) 
+ *
+ * The three character type field on line 3 describes the matrix type. 
+ * The following table lists the permitted values for each of the three 
+ * characters. As an example of the type field, RSA denotes that the matrix 
+ * is real, symmetric, and assembled. 
+ *
+ * First Character: 
+ *	R Real matrix 
+ *	C Complex matrix 
+ *	P Pattern only (no numerical values supplied) 
+ *
+ * Second Character: 
+ *	S Symmetric 
+ *	U Unsymmetric 
+ *	H Hermitian 
+ *	Z Skew symmetric 
+ *	R Rectangular 
+ *
+ * Third Character: 
+ *	A Assembled 
+ *	E Elemental matrices (unassembled) 
+ * </pre>
+ */
+
+void
+dreadhb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz,
+	     double **nzval, int_t **rowind, int_t **colptr)
+{
+    /* Parses the header lines in order, then colptr, rowind and, if present, nzval. */
+    register int_t i, numer_lines, rhscrd = 0;
+    int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
+    char buf[100], type[4];
+    int_t sym;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(0, "Enter dreadhb_dist()");
+#endif
+
+    /* Line 1 */
+    fgets(buf, 100, fp);
+
+    /* Line 2 */
+    for (i=0; i<5; i++) {
+	fscanf(fp, "%14c", buf); buf[14] = 0;
+	tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/
+	if (i == 3) numer_lines = tmp;		/* fourth field: VALCRD */
+	if (i == 4 && tmp) rhscrd = tmp;	/* fifth field: RHSCRD */
+    }
+    DumpLine(fp);
+
+    /* Line 3 */
+    fscanf(fp, "%3c", type);
+    fscanf(fp, "%11c", buf); /* pad */
+    type[3] = 0;
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) printf("Matrix type %s\n", type);
+#endif
+    
+    fscanf(fp, "%14c", buf); *nrow = atoi(buf); 
+    fscanf(fp, "%14c", buf); *ncol = atoi(buf); 
+    fscanf(fp, "%14c", buf); *nonz = atoi(buf); 
+    fscanf(fp, "%14c", buf); tmp = atoi(buf);   
+    
+    if (tmp != 0)
+	if ( !iam ) printf("This is not an assembled matrix!\n");
+    if (*nrow != *ncol)
+	if ( !iam ) printf("Matrix is not square.\n");
+    DumpLine(fp);
+
+    /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
+    dallocateA_dist(*ncol, *nonz, nzval, rowind, colptr);
+
+    /* Line 4: format statement */
+    fscanf(fp, "%16c", buf);
+    ParseIntFormat(buf, &colnum, &colsize);
+    fscanf(fp, "%16c", buf);
+    ParseIntFormat(buf, &rownum, &rowsize);
+    fscanf(fp, "%20c", buf);
+    ParseFloatFormat(buf, &valnum, &valsize);
+    fscanf(fp, "%20c", buf);	/* fourth format field is read but not parsed */
+    DumpLine(fp);
+
+    /* Line 5: right-hand side */    
+    if ( rhscrd ) DumpLine(fp); /* skip RHSFMT */
+
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) {
+	printf(IFMT " rows, " IFMT " nonzeros\n", *nrow, *nonz);
+	printf("colnum " IFMT ", colsize " IFMT "\n", colnum, colsize);
+	printf("rownum " IFMT ", rowsize " IFMT "\n", rownum, rowsize);
+	printf("valnum " IFMT ", valsize " IFMT "\n", valnum, valsize);
+    }
+#endif
+    
+    ReadVector(fp, *ncol+1, *colptr, colnum, colsize);
+#if ( DEBUGlevel>=1 )
+    if ( !iam )	printf("read colptr[" IFMT "] = " IFMT "\n", *ncol, (*colptr)[*ncol]);
+#endif
+    ReadVector(fp, *nonz, *rowind, rownum, rowsize);
+#if ( DEBUGlevel>=1 )
+    if ( !iam )	printf("read rowind[" IFMT "] = " IFMT "\n", *nonz-1, (*rowind)[*nonz-1]);
+#endif
+    if ( numer_lines ) {	/* values are read only when the header announces them */
+        dReadValues(fp, *nonz, *nzval, valnum, valsize);
+#if ( DEBUGlevel>=1 )
+	if ( !iam ) printf("read nzval[" IFMT "] = %e\n", *nonz-1, (*nzval)[*nonz-1]);
+#endif
+    }
+
+    sym = (type[1] == 'S' || type[1] == 's');
+    if ( sym ) {	/* only one triangle is stored; FormFullA builds the full matrix */
+	FormFullA(*ncol, nonz, nzval, rowind, colptr);
+    }
+    fclose(fp);	/* note: the caller's stream is closed here */
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(0, "Exit dreadhb_dist()");
+#endif
+}
+
+/* Eat up the rest of the current line; also stops at end of file. Returns 0. */
+static int DumpLine(FILE *fp)
+{
+    register int c;
+    while ((c = fgetc(fp)) != '\n' && c != EOF) ;
+    return 0;
+}
+
+static int ParseIntFormat(char *buf, int_t *num, int_t *size)
+{
+    /* Picks the two numbers out of an integer format; e.g. "(16I5)"
+       gives *num = 16 and *size = 5.  Always returns 0. */
+    const char *cur = buf;
+
+    while ( *cur != '(' ) ++cur;
+    *num = atoi(++cur);		/* number right after the '(' */
+    while ( !(*cur == 'I' || *cur == 'i') ) ++cur;
+    *size = atoi(cur + 1);	/* number right after the I */
+    return 0;
+}
+
+static int ParseFloatFormat(char *buf, int_t *num, int_t *size)
+{
+    /* Picks the count and the field width out of a real-number format.
+       buf is modified: a '\0' is written where the width ends.  Returns 0. */
+    char *cur = buf, *stop;
+
+    while ( *cur != '(' ) ++cur;
+    ++cur;
+    *num = atoi(cur);
+
+    /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
+       num picked up refers to P, so it is re-read right after the P. */
+    for (;;) {
+	const char ch = *cur;
+	if ( ch=='E' || ch=='e' || ch=='D' || ch=='d' || ch=='F' || ch=='f' ) break;
+	++cur;
+	if ( ch=='p' || ch=='P' ) *num = atoi(cur);
+    }
+    ++cur;	/* step over the E/D/F letter */
+
+    /* The width runs up to the '.' or the ')'; cut the string there. */
+    for (stop = cur; *stop != '.' && *stop != ')'; ++stop) ;
+    *stop = '\0';
+    *size = atoi(cur);
+    return 0;
+}
+
+static void
+ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize)
+{
+    /* n integers, perline fields of persize chars per line; stores value-1. */
+    char line[100];
+    int_t filled = 0, slot;
+    while (filled < n) {
+	fgets(line, 100, fp);
+	for (slot = 0; slot < perline && filled < n; ++slot, ++filled) {
+	    char *field = &line[slot*persize];
+	    char *end = field + persize;
+	    const char saved = *end;	/* cut the line right after this field */
+	    *end = 0;
+	    where[filled] = (int_t) atoi(field) - 1;
+	    *end = saved;		/* and undo the cut for the next field */
+	}
+    }
+}
+
+void
+dReadValues(FILE *fp, int_t n, double *destination, 
+             int_t perline, int_t persize)
+{
+    /* n reals, perline fields of persize chars per line; D exponents become E. */
+    char line[100];
+    int_t got = 0, slot, c;
+
+    while (got < n) {
+	fgets(line, 100, fp);
+	for (slot = 0; slot < perline && got < n; ++slot) {
+	    char *field = &line[slot*persize];
+	    const char saved = field[persize];	/* cut the line after this field */
+	    field[persize] = 0;
+	    for (c = 0; c < persize; ++c)	/* No D_ format in C */
+		if ( field[c] == 'D' || field[c] == 'd' ) field[c] = 'E';
+	    destination[got++] = atof(field);
+	    field[persize] = saved;		/* undo the cut */
+	}
+    }
+}
+
+/*! \brief Expand one stored triangle of a symmetric matrix to the full matrix.
+ * <pre>
+ * On input, nonz/nzval/rowind/colptr represents lower part of a symmetric
+ * matrix. On exit, it represents the full matrix with lower and upper parts:
+ * the three input arrays are freed and replaced, *nonz is the entry count.
+ * </pre>
+ */
+extern void
+FormFullA(int_t n, int_t *nonz, double **nzval, int_t **rowind, int_t **colptr)
+{
+    register int_t i, j, k, col, new_nnz, max_nnz;
+    int_t *t_rowind, *t_colptr, *al_rowind, *al_colptr, *a_rowind, *a_colptr;
+    int_t *marker;
+    double *t_val, *al_val, *a_val;
+
+    al_rowind = *rowind;
+    al_colptr = *colptr;
+    al_val = *nzval;
+
+    if ( !(marker =(int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for marker[]");
+    if ( !(t_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC t_colptr[]");
+    if ( !(t_rowind = (int_t *) SUPERLU_MALLOC( *nonz * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for t_rowind[]");
+    if ( !(t_val = (double*) SUPERLU_MALLOC( *nonz * sizeof(double)) ) )
+	ABORT("SUPERLU_MALLOC fails for t_val[]");
+
+    /* Get counts of each column of T, and set up column pointers */
+    for (i = 0; i < n; ++i) marker[i] = 0;
+    for (j = 0; j < n; ++j) {
+	for (i = al_colptr[j]; i < al_colptr[j+1]; ++i)
+	    ++marker[al_rowind[i]];
+    }
+    t_colptr[0] = 0;
+    for (i = 0; i < n; ++i) {
+	t_colptr[i+1] = t_colptr[i] + marker[i];
+	marker[i] = t_colptr[i];
+    }
+
+    /* Transpose matrix A to T */
+    for (j = 0; j < n; ++j)
+	for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) {
+	    col = al_rowind[i];
+	    t_rowind[marker[col]] = j;
+	    t_val[marker[col]] = al_val[i];
+	    ++marker[col];
+	}
+
+    new_nnz = *nonz * 2 - n;	/* the count when all n diagonal entries are stored */
+    if ( !(a_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC a_colptr[]");
+    max_nnz = *nonz * 2;	/* bound that also holds when diagonal entries are absent */
+    if ( !(a_rowind = (int_t *) SUPERLU_MALLOC( max_nnz * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for a_rowind[]");
+    if ( !(a_val = (double*) SUPERLU_MALLOC( max_nnz * sizeof(double)) ) )
+	ABORT("SUPERLU_MALLOC fails for a_val[]");
+    a_colptr[0] = 0;
+    k = 0;
+    for (j = 0; j < n; ++j) {
+      for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
+	if ( t_rowind[i] != j ) { /* not diagonal */
+	  a_rowind[k] = t_rowind[i];
+	  a_val[k] = t_val[i];
+#if (DEBUGlevel >= 2)
+	  if ( fabs(a_val[k]) < 4.047e-300 )
+	      printf("%5d: %e\n", k, a_val[k]);
+#endif
+	  ++k;
+	}
+      }
+
+      for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) {
+	a_rowind[k] = al_rowind[i];
+	a_val[k] = al_val[i];
+#if (DEBUGlevel >= 2)
+	if ( fabs(a_val[k]) < 4.047e-300 )
+	    printf("%5d: %e\n", k, a_val[k]);
+#endif
+	++k;
+      }
+      
+      a_colptr[j+1] = k;
+    }
+
+    printf("FormFullA: new_nnz = " IFMT ", k = " IFMT "\n", new_nnz, k);
+
+    SUPERLU_FREE(al_val);
+    SUPERLU_FREE(al_rowind);
+    SUPERLU_FREE(al_colptr);
+    SUPERLU_FREE(marker);
+    SUPERLU_FREE(t_val);
+    SUPERLU_FREE(t_rowind);
+    SUPERLU_FREE(t_colptr);
+
+    *nzval = a_val;
+    *rowind = a_rowind;
+    *colptr = a_colptr;
+    *nonz = k;	/* entries actually written; equals new_nnz for a full diagonal */
+}
diff --git a/SRC/dreadrb.c b/SRC/dreadrb.c
new file mode 100644
index 0000000..d62fb7b
--- /dev/null
+++ b/SRC/dreadrb.c
@@ -0,0 +1,347 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file dreadrb.c
+ * \brief Read a matrix stored in Rutherford-Boeing format
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ * </pre>
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * Read a DOUBLE PRECISION matrix stored in Rutherford-Boeing format 
+ * as described below.
+ *
+ * Line 1 (A72, A8)
+ *      Col. 1 - 72   Title (TITLE)
+ *      Col. 73 - 80  Matrix name / identifier (MTRXID)
+ *
+ * Line 2 (I14, 3(1X, I13))
+ *      Col. 1 - 14   Total number of lines excluding header (TOTCRD)
+ *      Col. 16 - 28  Number of lines for pointers (PTRCRD)
+ *      Col. 30 - 42  Number of lines for row (or variable) indices (INDCRD)
+ *      Col. 44 - 56  Number of lines for numerical values (VALCRD)
+ *
+ * Line 3 (A3, 11X, 4(1X, I13))
+ *      Col. 1 - 3    Matrix type (see below) (MXTYPE)
+ *      Col. 15 - 28  Compressed Column: Number of rows (NROW)
+ *                    Elemental: Largest integer used to index variable (MVAR)
+ *      Col. 30 - 42  Compressed Column: Number of columns (NCOL)
+ *                    Elemental: Number of element matrices (NELT)
+ *      Col. 44 - 56  Compressed Column: Number of entries (NNZERO)
+ *                    Elemental: Number of variable indices (NVARIX)
+ *      Col. 58 - 70  Compressed Column: Unused, explicitly zero
+ *                    Elemental: Number of elemental matrix entries (NELTVL)
+ *
+ * Line 4 (2A16, A20)
+ *      Col. 1 - 16   Fortran format for pointers (PTRFMT)
+ *      Col. 17 - 32  Fortran format for row (or variable) indices (INDFMT)
+ *      Col. 33 - 52  Fortran format for numerical values of coefficient matrix
+ *                    (VALFMT)
+ *                    (blank in the case of matrix patterns)
+ *
+ * The three character type field on line 3 describes the matrix type.
+ * The following table lists the permitted values for each of the three
+ * characters. As an example of the type field, RSA denotes that the matrix
+ * is real, symmetric, and assembled.
+ *
+ * First Character:
+ *      R Real matrix
+ *      C Complex matrix
+ *      I integer matrix
+ *      P Pattern only (no numerical values supplied)
+ *      Q Pattern only (numerical values supplied in associated auxiliary value
+ *        file)
+ *
+ * Second Character:
+ *      S Symmetric
+ *      U Unsymmetric
+ *      H Hermitian
+ *      Z Skew symmetric
+ *      R Rectangular
+ *
+ * Third Character:
+ *      A Compressed column form
+ *      E Elemental form
+ *
+ * </pre>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "superlu_ddefs.h"
+
+/*! \brief Eat up the rest of the current line.
+ *
+ * Consumes characters from fp up to and including the next newline.
+ * Reading also stops at end-of-file or on a read error, so a file whose
+ * last line has no trailing newline cannot make this loop forever.
+ *
+ * \param fp  stream to read from
+ * \return    always 0
+ */
+static int DumpLine(FILE *fp)
+{
+    register int c;
+    while ((c = fgetc(fp)) != '\n' && c != EOF) ;
+    return 0;
+}
+
+/*! \brief Parse a Fortran integer edit descriptor such as "(26I3)".
+ *
+ * \param buf   NUL-terminated text holding the format
+ * \param num   out: the integer that follows '(' (items per line)
+ * \param size  out: the integer that follows 'I' or 'i' (field width)
+ * \return 0 on success; -1 if '(' or 'I' is not found before the end of
+ *         buf, in which case any output not yet parsed is left at 0.
+ */
+static int ParseIntFormat(char *buf, int_t *num, int_t *size)
+{
+    char *tmp;
+
+    *num = 0;
+    *size = 0;
+
+    /* Every scan stops at the terminator instead of running past the buffer. */
+    tmp = buf;
+    while (*tmp != '\0' && *tmp != '(') ++tmp;
+    if (*tmp == '\0') return -1;
+    ++tmp;
+    *num = atoi(tmp);
+    while (*tmp != '\0' && *tmp != 'I' && *tmp != 'i') ++tmp;
+    if (*tmp == '\0') return -1;
+    ++tmp;
+    *size = atoi(tmp);
+    return 0;
+}
+
+/*! \brief Parse a Fortran real edit descriptor such as "(4E20.12)" or
+ *         "(1P,6F13.6)".
+ *
+ * The text in buf is modified: the first '.' or ')' after the E/D/F
+ * letter is overwritten with a NUL so that the width can be read by atoi.
+ *
+ * \param buf   NUL-terminated text holding the format
+ * \param num   out: repeat count (items per line)
+ * \param size  out: field width
+ * \return 0 on success; -1 if '(' or the E/D/F letter is not found before
+ *         the end of buf, in which case *size is left at 0.
+ */
+static int ParseFloatFormat(char *buf, int_t *num, int_t *size)
+{
+    char *tmp, *period;
+
+    *num = 0;
+    *size = 0;
+
+    tmp = buf;
+    while (*tmp != '\0' && *tmp != '(') ++tmp;
+    if (*tmp == '\0') return -1;
+    ++tmp;
+    *num = atoi(tmp);
+    while (*tmp != '\0'
+           && *tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
+           && *tmp != 'F' && *tmp != 'f') {
+        /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
+           num picked up refers to P, which should be skipped. */
+        if (*tmp=='p' || *tmp=='P') {
+           ++tmp;
+           /* The scale factor may be followed by a separator, as in
+              (1P,6F13.6); atoi would return 0 if it saw the comma. */
+           while (*tmp == ',' || *tmp == ' ') ++tmp;
+           *num = atoi(tmp);
+        } else {
+           ++tmp;
+        }
+    }
+    if (*tmp == '\0') return -1;
+    ++tmp;
+    period = tmp;
+    while (*period != '\0' && *period != '.' && *period != ')') ++period ;
+    *period = '\0';
+    *size = atoi(tmp);
+
+    return 0;
+}
+
+/*! \brief Read n fixed-width integers and store them 0-based.
+ *
+ * Each input line holds up to perline fields of persize characters.
+ * Every value has 1 subtracted before it is stored in where[].
+ *
+ * \param fp       input stream
+ * \param n        number of integers to read
+ * \param where    out: array of at least n entries
+ * \param perline  fields per line
+ * \param persize  characters per field
+ * \return 0 on success; -1 if the layout is unusable (non-positive, or a
+ *         line would not fit the local buffer) or if the input ends before
+ *         n values were read.
+ */
+static int ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize)
+{
+    register int_t i, j, item;
+    char tmp, buf[100];
+
+    /* (j+1)*persize is used as an index into buf below, so a whole line of
+       fields must fit; a zero field count would never make progress. */
+    if ( perline <= 0 || persize <= 0
+         || perline * persize >= (int_t) sizeof(buf) )
+        return -1;
+
+    i = 0;
+    while (i < n) {
+        /* read a line at a time; stop rather than spin at end of file */
+        if ( fgets(buf, sizeof(buf), fp) == NULL ) return -1;
+        for (j=0; j<perline && i<n; j++) {
+            tmp = buf[(j+1)*persize];     /* save the char at that place */
+            buf[(j+1)*persize] = 0;       /* null terminate */
+            item = atoi(&buf[j*persize]);
+            buf[(j+1)*persize] = tmp;     /* recover the char at that place */
+            where[i++] = item - 1;
+        }
+    }
+
+    return 0;
+}
+
+/*! \brief Read n fixed-width floating-point values.
+ *
+ * Each input line holds up to perline fields of persize characters.
+ * A Fortran 'D'/'d' exponent letter inside a field is rewritten to 'E'
+ * before conversion, because atof does not understand the D form.
+ *
+ * \param fp           input stream
+ * \param n            number of values to read
+ * \param destination  out: array of at least n entries
+ * \param perline      fields per line
+ * \param persize      characters per field
+ * \return 0 on success; -1 if the layout is unusable (non-positive, or a
+ *         line would not fit the local buffer) or if the input ends before
+ *         n values were read.
+ */
+static int dReadValues(FILE *fp, int_t n, double *destination,
+        int_t perline, int_t persize)
+{
+    register int_t i, j, k, s;
+    char tmp, buf[100];
+
+    /* (j+1)*persize is used as an index into buf below, so a whole line of
+       fields must fit; a zero field count would never make progress. */
+    if ( perline <= 0 || persize <= 0
+         || perline * persize >= (int_t) sizeof(buf) )
+        return -1;
+
+    i = 0;
+    while (i < n) {
+        /* read a line at a time; stop rather than spin at end of file */
+        if ( fgets(buf, sizeof(buf), fp) == NULL ) return -1;
+        for (j=0; j<perline && i<n; j++) {
+            tmp = buf[(j+1)*persize];     /* save the char at that place */
+            buf[(j+1)*persize] = 0;       /* null terminate */
+            s = j*persize;
+            for (k = 0; k < persize; ++k) /* No D_ format in C */
+                if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E';
+            destination[i++] = atof(&buf[s]);
+            buf[(j+1)*persize] = tmp;     /* recover the char at that place */
+        }
+    }
+
+    return 0;
+}
+
+
+
+/*! \brief Expand the stored lower triangle of a symmetric matrix to the
+ *         full matrix.
+ *
+ * <pre>
+ * On input, nonz/nzval/rowind/colptr represents lower part of a symmetric
+ * matrix. On exit, it represents the full matrix with lower and upper parts.
+ *
+ * The three input arrays are freed and replaced by newly allocated ones.
+ * In every output column the mirrored (transposed) off-diagonal entries
+ * come first, followed by the entries of the original column.
+ *
+ * The size of the result is 2*nonz minus the number of diagonal entries
+ * actually stored, so an input with missing diagonal entries does not
+ * overrun the output arrays.
+ * </pre>
+ */
+static void
+FormFullA(int_t n, int_t *nonz, double **nzval, int_t **rowind, int_t **colptr)
+{
+    register int_t i, j, k, col, new_nnz, ndiag;
+    int_t *t_rowind, *t_colptr, *al_rowind, *al_colptr, *a_rowind, *a_colptr;
+    int_t *marker;
+    double *t_val, *al_val, *a_val;
+
+    al_rowind = *rowind;
+    al_colptr = *colptr;
+    al_val = *nzval;
+
+    if ( !(marker = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+        ABORT("SUPERLU_MALLOC fails for marker[]");
+    if ( !(t_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+        ABORT("SUPERLU_MALLOC t_colptr[]");
+    if ( !(t_rowind = (int_t *) SUPERLU_MALLOC( *nonz * sizeof(int_t)) ) )
+        ABORT("SUPERLU_MALLOC fails for t_rowind[]");
+    if ( !(t_val = (double*) SUPERLU_MALLOC( *nonz * sizeof(double)) ) )
+        ABORT("SUPERLU_MALLOC fails for t_val[]");
+
+    /* Get counts of each column of T, and set up column pointers.
+       Count the stored diagonal entries on the way: they are the only
+       entries that are not duplicated in the full matrix. */
+    ndiag = 0;
+    for (i = 0; i < n; ++i) marker[i] = 0;
+    for (j = 0; j < n; ++j) {
+        for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) {
+            ++marker[al_rowind[i]];
+            if ( al_rowind[i] == j ) ++ndiag;
+        }
+    }
+    t_colptr[0] = 0;
+    for (i = 0; i < n; ++i) {
+        t_colptr[i+1] = t_colptr[i] + marker[i];
+        marker[i] = t_colptr[i];    /* next free slot in column i of T */
+    }
+
+    /* Transpose matrix A to T */
+    for (j = 0; j < n; ++j)
+        for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) {
+            col = al_rowind[i];
+            t_rowind[marker[col]] = j;
+            t_val[marker[col]] = al_val[i];
+            ++marker[col];
+        }
+
+    /* Equals the former "*nonz * 2 - n" whenever all n diagonal entries
+       are stored exactly once. */
+    new_nnz = *nonz * 2 - ndiag;
+    if ( !(a_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+        ABORT("SUPERLU_MALLOC a_colptr[]");
+    if ( !(a_rowind = (int_t *) SUPERLU_MALLOC( new_nnz * sizeof(int_t)) ) )
+        ABORT("SUPERLU_MALLOC fails for a_rowind[]");
+    if ( !(a_val = (double*) SUPERLU_MALLOC( new_nnz * sizeof(double)) ) )
+        ABORT("SUPERLU_MALLOC fails for a_val[]");
+
+    a_colptr[0] = 0;
+    k = 0;
+    for (j = 0; j < n; ++j) {
+        /* entries mirrored from the transpose, diagonal excluded */
+        for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
+            if ( t_rowind[i] != j ) { /* not diagonal */
+                a_rowind[k] = t_rowind[i];
+                a_val[k] = t_val[i];
+                ++k;
+            }
+        }
+
+        /* entries of the original column, diagonal included */
+        for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) {
+            a_rowind[k] = al_rowind[i];
+            a_val[k] = al_val[i];
+            ++k;
+        }
+
+        a_colptr[j+1] = k;
+    }
+
+    printf("FormFullA: new_nnz = " IFMT ", k = " IFMT "\n", new_nnz, k);
+
+    SUPERLU_FREE(al_val);
+    SUPERLU_FREE(al_rowind);
+    SUPERLU_FREE(al_colptr);
+    SUPERLU_FREE(marker);
+    SUPERLU_FREE(t_val);
+    SUPERLU_FREE(t_rowind);
+    SUPERLU_FREE(t_colptr);
+
+    *nzval = a_val;
+    *rowind = a_rowind;
+    *colptr = a_colptr;
+    *nonz = new_nnz;
+}
+
+/*! \brief Read a double-precision matrix in Rutherford-Boeing format.
+ *
+ * Reads the four header lines, allocates the compressed-column arrays
+ * with dallocateA_dist, reads column pointers, row indices and (when the
+ * header announces value lines) the numerical values.  Indices are stored
+ * 0-based.  If the second character of the type field is 'S' or 's' the
+ * matrix is expanded to full storage with FormFullA.
+ *
+ * The stream fp is closed before returning.
+ *
+ * \param iam     caller's rank; only rank 0 prints diagnostics
+ * \param fp      open input stream
+ * \param nrow    out: number of rows
+ * \param ncol    out: number of columns
+ * \param nonz    out: number of stored entries
+ * \param nzval   out: allocated array of values
+ * \param rowind  out: allocated array of row indices
+ * \param colptr  out: allocated array of column pointers
+ */
+void
+dreadrb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz,
+        double **nzval, int_t **rowind, int_t **colptr)
+{
+    register int_t i, numer_lines = 0;
+    int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
+    char buf[100], type[4];
+    int sym;
+
+    /* Line 1 */
+    fgets(buf, 100, fp);
+    fputs(buf, stdout);
+
+    /* Line 2 */
+    for (i=0; i<4; i++) {
+        fscanf(fp, "%14c", buf); buf[14] = 0;
+        tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/
+        if (i == 3) numer_lines = tmp;
+    }
+    DumpLine(fp);
+
+    /* Line 3 */
+    fscanf(fp, "%3c", type);
+    fscanf(fp, "%11c", buf); /* pad */
+    type[3] = 0;
+#if (DEBUGlevel >= 1)
+    if ( !iam ) printf("Matrix type %s\n", type);
+#endif
+
+    /* "%Nc" stores exactly N characters and no terminator, so each field
+       is terminated explicitly before it is handed to a parser. */
+    fscanf(fp, "%14c", buf); buf[14] = 0; *nrow = atoi(buf);
+    fscanf(fp, "%14c", buf); buf[14] = 0; *ncol = atoi(buf);
+    fscanf(fp, "%14c", buf); buf[14] = 0; *nonz = atoi(buf);
+    fscanf(fp, "%14c", buf); buf[14] = 0; tmp = atoi(buf);
+
+    if (tmp != 0)
+        if ( !iam ) printf("This is not an assembled matrix!\n");
+    if (*nrow != *ncol)
+        if ( !iam ) printf("Matrix is not square.\n");
+    DumpLine(fp);
+
+    /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
+    dallocateA_dist(*ncol, *nonz, nzval, rowind, colptr);
+
+    /* Line 4: format statement */
+    fscanf(fp, "%16c", buf); buf[16] = 0;
+    ParseIntFormat(buf, &colnum, &colsize);
+    fscanf(fp, "%16c", buf); buf[16] = 0;
+    ParseIntFormat(buf, &rownum, &rowsize);
+    fscanf(fp, "%20c", buf); buf[20] = 0;
+    ParseFloatFormat(buf, &valnum, &valsize);
+    DumpLine(fp);
+
+#if (DEBUGlevel >= 1)
+    if ( !iam ) {
+        printf(IFMT " rows, " IFMT " nonzeros\n", *nrow, *nonz);
+        printf("colnum " IFMT ", colsize " IFMT "\n", colnum, colsize);
+        printf("rownum " IFMT ", rowsize " IFMT "\n", rownum, rowsize);
+        printf("valnum " IFMT ", valsize " IFMT "\n", valnum, valsize);
+    }
+#endif
+
+    ReadVector(fp, *ncol+1, *colptr, colnum, colsize);
+    ReadVector(fp, *nonz, *rowind, rownum, rowsize);
+    if ( numer_lines ) {
+        dReadValues(fp, *nonz, *nzval, valnum, valsize);
+    }
+
+    /* Only the lower triangle of a symmetric matrix is stored in the file. */
+    sym = (type[1] == 'S' || type[1] == 's');
+    if ( sym ) {
+        FormFullA(*ncol, nonz, nzval, rowind, colptr);
+    }
+
+    fclose(fp);
+}
diff --git a/SRC/dreadtriple.c b/SRC/dreadtriple.c
new file mode 100644
index 0000000..563f1f9
--- /dev/null
+++ b/SRC/dreadtriple.c
@@ -0,0 +1,180 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Read a sparse matrix in triplet (row, column, value) form preceded by a size header line.
+ *
+ */
+#include <stdio.h>
+#include "superlu_ddefs.h"
+
+#undef EXPAND_SYM
+
+/*! \brief Read a matrix given as triplets, with a header line, into
+ *         compressed-column storage.
+ *
+ * <pre>
+ * File format
+ * ===========
+ *   First line:       #rows   #cols   #non-zero
+ *   Remaining lines:  row     col     value
+ *
+ * Indices are taken to be zero-based if the first triplet contains a zero
+ * row or column index, and one-based otherwise.  The row count read from
+ * the file is overwritten with the column count (*m = *n).
+ *
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *      column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ *
+ * The routine aborts if the header or a triplet cannot be read, and exits
+ * if an index is outside the matrix.
+ * </pre>
+ */
+
+void
+dreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
+	    double **nzval, int_t **rowind, int_t **colptr)
+{
+    int_t    j, k, jsize, nnz, nz, new_nonz;
+    double *a, *val;
+    int_t    *asub, *xa, *row, *col;
+    int_t    zero_base = 0;
+    int      nread;
+
+#ifdef _LONGINT
+    nread = fscanf(fp, "%ld%ld%ld", m, n, nonz);
+#else
+    nread = fscanf(fp, "%d%d%d", m, n, nonz);
+#endif
+    if ( nread != 3 )
+        ABORT("dreadtriple_dist: cannot read header line (m n nonz)");
+
+#ifdef EXPAND_SYM
+    new_nonz = 2 * *nonz - *n;
+#else
+    new_nonz = *nonz;
+#endif
+    *m = *n;
+    printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz);
+    dallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
+    a    = *nzval;
+    asub = *rowind;
+    xa   = *colptr;
+
+    if ( !(val = (double *) SUPERLU_MALLOC(new_nonz * sizeof(double))) )
+        ABORT("Malloc fails for val[]");
+    if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for row[]");
+    if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for col[]");
+
+    /* xa[] first serves as a per-column entry counter */
+    for (j = 0; j < *n; ++j) xa[j] = 0;
+
+    /* Read into the triplet array from a file */
+    for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
+#ifdef _LONGINT
+	nread = fscanf(fp, "%ld%ld%lf\n", &row[nz], &col[nz], &val[nz]);
+#else
+	nread = fscanf(fp, "%d%d%lf\n", &row[nz], &col[nz], &val[nz]);
+#endif
+	/* A short or malformed file would otherwise leave this triplet
+	   uninitialized. */
+	if ( nread != 3 )
+	    ABORT("dreadtriple_dist: cannot read a (row col value) triplet");
+
+	if ( nnz == 0 ) { /* first nonzero decides the index base */
+	    if ( row[0] == 0 || col[0] == 0 ) {
+		zero_base = 1;
+		printf("triplet file: row/col indices are zero-based.\n");
+	    } else {
+		printf("triplet file: row/col indices are one-based.\n");
+	    }
+	}
+
+	if ( !zero_base ) {
+	    /* Change to 0-based indexing. */
+	    --row[nz];
+	    --col[nz];
+	}
+
+	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
+	    /*|| val[nz] == 0.*/) {
+	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n", 
+		    nz, row[nz], col[nz], val[nz]);
+	    exit(-1);
+	} else {
+	    ++xa[col[nz]];
+#ifdef EXPAND_SYM
+	    if ( row[nz] != col[nz] ) { /* Excluding diagonal */
+	      ++nz;
+	      row[nz] = col[nz-1];
+	      col[nz] = row[nz-1];
+	      val[nz] = val[nz-1];
+	      ++xa[col[nz]];
+	    }
+#endif	
+	    ++nz;
+	}
+    }
+
+    *nonz = nz;
+#ifdef EXPAND_SYM
+    printf("new_nonz after symmetric expansion:\t%lld\n", (long long) *nonz);
+#endif
+    
+
+    /* Initialize the array of column pointers: turn the per-column counts
+       into starting offsets */
+    k = 0;
+    jsize = xa[0];
+    xa[0] = 0;
+    for (j = 1; j < *n; ++j) {
+	k += jsize;
+	jsize = xa[j];
+	xa[j] = k;
+    }
+    
+    /* Copy the triplets into the column oriented storage; xa[j] advances
+       to the end of column j while doing so */
+    for (nz = 0; nz < *nonz; ++nz) {
+	j = col[nz];
+	k = xa[j];
+	asub[k] = row[nz];
+	a[k] = val[nz];
+	++xa[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column */
+    for (j = *n; j > 0; --j)
+	xa[j] = xa[j-1];
+    xa[0] = 0;
+
+    SUPERLU_FREE(val);
+    SUPERLU_FREE(row);
+    SUPERLU_FREE(col);
+
+#ifdef CHK_INPUT
+    {
+	int_t i;
+	for (i = 0; i < *n; i++) {
+	    printf("Col %lld, xa %lld\n", (long long) i, (long long) xa[i]);
+	    for (k = xa[i]; k < xa[i+1]; k++)
+		printf("%lld\t%16.10f\n", (long long) asub[k], a[k]);
+	}
+    }
+#endif
+
+}
+
+
+/*! \brief Read m right-hand-side values from the file "b.dat".
+ *
+ * The file is opened relative to the current working directory and must
+ * hold at least m floating-point numbers.  The process exits with status
+ * -1 if the file cannot be opened or holds fewer than m values.
+ *
+ * \param m  number of values to read
+ * \param b  out: array of at least m entries
+ */
+void dreadrhs(int m, double *b)
+{
+    FILE *fp;
+    int i;
+
+    if ( !(fp = fopen("b.dat", "r")) ) {
+        fprintf(stderr, "dreadrhs: file does not exist\n");
+	exit(-1);
+    }
+    for (i = 0; i < m; ++i) {
+	/* Without this check a short file leaves the tail of b[] unset. */
+	if ( fscanf(fp, "%lf\n", &b[i]) != 1 ) {
+	    fprintf(stderr, "dreadrhs: b.dat holds fewer than %d values\n", m);
+	    fclose(fp);
+	    exit(-1);
+	}
+    }
+
+    fclose(fp);
+}
+
+
diff --git a/SRC/dreadtriple_noheader.c b/SRC/dreadtriple_noheader.c
new file mode 100644
index 0000000..bc6e7a5
--- /dev/null
+++ b/SRC/dreadtriple_noheader.c
@@ -0,0 +1,199 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
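+ * \brief Read a sparse matrix in triplet (row, column, value) form from a file that has no size header line.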
+ *
+ */
+#include <stdio.h>
+#include "superlu_ddefs.h"
+
+#undef EXPAND_SYM
+
+/*! \brief Read a matrix given as triplets, without a header line, into
+ *         compressed-column storage.
+ *
+ * <pre>
+ * File format: one triplet per line for each nonzero entry:
+ *                 row    col    value
+ *
+ * The file is read twice.  The first pass counts the triplets and finds
+ * the largest and smallest index; it stops at end of file or at the first
+ * line that does not parse as a triplet.  The matrix is square with
+ * dimension max index (one-based) or max index + 1 (zero-based); indices
+ * are zero-based exactly when the smallest index seen is 0.  The stream
+ * is then rewound and the entries are read.
+ *
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *      column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ * </pre>
+ */
+
+void
+dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
+	    double **nzval, int_t **rowind, int_t **colptr)
+{
+    int_t    i, j, k, jsize, nnz, nz, new_nonz, minn = 100;
+    double *a, *val, vali;
+    int_t    *asub, *xa, *row, *col;
+    int      zero_base = 0, ret_val = 0;
+
+    /* First pass: determine N and NNZ */
+    nz = *n = 0;
+
+#ifdef _LONGINT
+    ret_val = fscanf(fp, "%ld%ld%lf\n", &i, &j, &vali);
+#else
+    ret_val = fscanf(fp, "%d%d%lf\n", &i, &j, &vali);
+#endif
+
+    /* Require a complete triplet: on a matching failure fscanf returns a
+       count below 3 without consuming input, so testing only for EOF
+       would loop forever. */
+    while (ret_val == 3) {
+	*n = SUPERLU_MAX(*n, i);
+	*n = SUPERLU_MAX(*n, j);
+	minn = SUPERLU_MIN(minn, i);
+	minn = SUPERLU_MIN(minn, j);
+	++nz;
+
+#ifdef _LONGINT
+        ret_val = fscanf(fp, "%ld%ld%lf\n", &i, &j, &vali);
+#else
+        ret_val = fscanf(fp, "%d%d%lf\n", &i, &j, &vali);
+#endif
+    }
+    
+    if ( minn == 0 ) { /* zero-based indexing */
+	zero_base = 1;
+	++(*n);
+	printf("triplet file: row/col indices are zero-based.\n");
+    } else {
+	printf("triplet file: row/col indices are one-based.\n");
+    }
+
+    *m = *n;
+    *nonz = nz;
+    rewind(fp);
+
+#ifdef EXPAND_SYM
+    new_nonz = 2 * *nonz - *n;
+#else
+    new_nonz = *nonz;
+#endif
+
+    /* Second pass: read the actual matrix values */
+    printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz);
+    dallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
+    a    = *nzval;
+    asub = *rowind;
+    xa   = *colptr;
+
+    if ( !(val = (double *) SUPERLU_MALLOC(new_nonz * sizeof(double))) )
+        ABORT("Malloc fails for val[]");
+    if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for row[]");
+    if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for col[]");
+
+    /* xa[] first serves as a per-column entry counter */
+    for (j = 0; j < *n; ++j) xa[j] = 0;
+
+    /* Read into the triplet array from a file */
+    for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
+#ifdef _LONGINT
+	fscanf(fp, "%ld%ld%lf\n", &row[nz], &col[nz], &val[nz]);
+#else
+	fscanf(fp, "%d%d%lf\n", &row[nz], &col[nz], &val[nz]);
+#endif
+
+	if ( !zero_base ) {
+	    /* Change to 0-based indexing. */
+	    --row[nz];
+	    --col[nz];
+	}
+
+	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
+	    /*|| val[nz] == 0.*/) {
+	    fprintf(stderr, "nz %lld, (%lld, %lld) = %e out of bound, removed\n", 
+		    (long long) nz, (long long) row[nz], (long long) col[nz],
+		    val[nz]);
+	    exit(-1);
+	} else {
+	    ++xa[col[nz]];
+#ifdef EXPAND_SYM
+	    if ( row[nz] != col[nz] ) { /* Excluding diagonal */
+	      ++nz;
+	      row[nz] = col[nz-1];
+	      col[nz] = row[nz-1];
+	      val[nz] = val[nz-1];
+	      ++xa[col[nz]];
+	    }
+#endif	
+	    ++nz;
+	}
+    }
+
+    *nonz = nz;
+#ifdef EXPAND_SYM
+    printf("new_nonz after symmetric expansion:\t%lld\n", (long long) *nonz);
+#endif
+    
+
+    /* Initialize the array of column pointers: turn the per-column counts
+       into starting offsets */
+    k = 0;
+    jsize = xa[0];
+    xa[0] = 0;
+    for (j = 1; j < *n; ++j) {
+	k += jsize;
+	jsize = xa[j];
+	xa[j] = k;
+    }
+    
+    /* Copy the triplets into the column oriented storage; xa[j] advances
+       to the end of column j while doing so */
+    for (nz = 0; nz < *nonz; ++nz) {
+	j = col[nz];
+	k = xa[j];
+	asub[k] = row[nz];
+	a[k] = val[nz];
+	++xa[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column */
+    for (j = *n; j > 0; --j)
+	xa[j] = xa[j-1];
+    xa[0] = 0;
+
+    SUPERLU_FREE(val);
+    SUPERLU_FREE(row);
+    SUPERLU_FREE(col);
+
+#ifdef CHK_INPUT
+    for (i = 0; i < *n; i++) {
+	printf("Col %lld, xa %lld\n", (long long) i, (long long) xa[i]);
+	for (k = xa[i]; k < xa[i+1]; k++)
+	    printf("%lld\t%16.10f\n", (long long) asub[k], a[k]);
+    }
+#endif
+
+}
+
+/* Compiled out by the "#if 0" guard below: reads m values from "b.dat"
+ * into b[].  Note that the error message names zreadrhs rather than
+ * dreadrhs. */
+#if 0
+void dreadrhs(int m, double *b)
+{
+    FILE *fp, *fopen();
+    int i, j;
+
+    if ( !(fp = fopen("b.dat", "r")) ) {
+        fprintf(stderr, "zreadrhs: file does not exist\n");
+	exit(-1);
+    }
+    for (i = 0; i < m; ++i)
+      fscanf(fp, "%lf\n", &b[i]);
+
+    fclose(fp);
+}
+#endif
+
diff --git a/SRC/dscatter.c b/SRC/dscatter.c
new file mode 100644
index 0000000..af18ea8
--- /dev/null
+++ b/SRC/dscatter.c
@@ -0,0 +1,516 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Scatter the computed blocks into LU destination.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*
+ * Subtract the update held in tempv from the destination L block whose
+ * block number is ib, inside local block column ljb.
+ *
+ * The destination block is found by a linear walk over the index array
+ * Lrowind_bc_ptr[ljb].  An indirection table (indirect_thread) is rebuilt
+ * on every call: it maps a row number, taken relative to FstBlockC(ib), to
+ * that row's position inside the destination block.  The table is looked
+ * up once per element in the inner loop.
+ *
+ * For each of the nsupc columns jj, the column is updated only when
+ * klst - usub[iukp + jj] is non-zero; tempv then advances by nbrow.  The
+ * destination pointer advances by ldv = index[1] for every jj.
+ *
+ * xsup and grid are not referenced directly in this body; FstBlockC is
+ * defined elsewhere and may use them -- TODO confirm.
+ *
+ * NOTE(review): usub and lsub are declared int* here, while the sibling
+ * dscatter_l declares them int_t* -- confirm which is intended when int_t
+ * is wider than int.
+ * NOTE(review): the search loop has no end-of-list test; it presumably
+ * relies on the caller guaranteeing that block ib exists -- confirm.
+ */
+static void
+dscatter_l_1 (int ib,
+           int ljb,
+           int nsupc,
+           int_t iukp,
+           int_t* xsup,
+           int klst,
+           int nbrow,
+           int_t lptr,
+           int temp_nbrow,
+           int * usub,
+           int * lsub,
+           double *tempv,
+           int * indirect_thread,
+           int_t ** Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+	   gridinfo_t * grid)
+{
+    // TAU_STATIC_TIMER_START("SCATTER_LB");
+    // printf("hello\n");
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;    /* position of the current block descriptor in index[] */
+    int_t luptrj = 0;           /* running sum of the row counts of the blocks skipped */
+    int_t ijb = index[lptrj];
+    while (ijb != ib)
+    {
+        /* Search for dest block --
+           blocks are not ordered! */
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+
+        ijb = index[lptrj];
+    }
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    lptrj += LB_DESCRIPTOR;     /* step over the descriptor to the row indices */
+    for (i = 0; i < index[lptrj - 1]; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    // tempv =bigV + (cum_nrow + cum_ncol*nbrow);
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        // printf("segsize %d \n",segsize);
+        if (segsize) {
+            /*#pragma _CRI cache_bypass nzval,tempv */
+            for (i = 0; i < temp_nbrow; ++i) {
+                rel = lsub[lptr + i] - fnz;
+                nzval[indirect_thread[rel]] -= tempv[i];
+                // printf("i (src) %d, perm (dest) %d  \n",i,indirect_thread[rel]);
+#ifdef PI_DEBUG
+                double zz = 0.0;
+                // if(!(*(long*)&zz == *(long*)&tempv[i]) )
+                printf ("(%d %d, %0.3e, %0.3e, %3e ) ", ljb,
+                        nzval - Lnzval_bc_ptr[ljb] + indirect_thread[rel],
+                        nzval[indirect_thread[rel]] + tempv[i],
+                        nzval[indirect_thread[rel]],tempv[i]);
+                /* prints (ljb, offset, value before update, value after update, tempv[i]) */
+#endif
+            }
+            // printf("\n");
+            tempv += nbrow;     /* next source column */
+#ifdef PI_DEBUG
+            // printf("\n");
+#endif
+        }
+        nzval += ldv;           /* next destination column, even if this one was skipped */
+        // printf("%d\n",nzval );
+    }
+    // TAU_STATIC_TIMER_STOP("SCATTER_LB");
+} /* dscatter_l_1 */
+
+/*
+ * Subtract the update held in tempv from the destination block L(i,j),
+ * i = ib, stored in local block column ljb.
+ *
+ * Two tables are filled on every call:
+ *   indirect_thread[r] - position inside the destination block of the row
+ *                        whose number relative to FstBlockC(ib) is r;
+ *   indirect2[i]       - destination position of source row i, i.e.
+ *                        indirect_thread[lsub[lptr + i] - fnz].
+ * The inner update loop then uses indirect2 alone.
+ *
+ * For each of the nsupc columns jj, the column is updated only when
+ * klst - usub[iukp + jj] is non-zero; tempv then advances by nbrow.  The
+ * destination pointer advances by ldv = index[1] for every jj.
+ *
+ * xsup and grid are not referenced directly in this body; FstBlockC is
+ * defined elsewhere and may use them -- TODO confirm.
+ * NOTE(review): the search loop has no end-of-list test; it presumably
+ * relies on the caller guaranteeing that block ib exists -- confirm.
+ */
+static void
+dscatter_l (
+           int ib,    /* row block number of source block L(i,k) */
+           int ljb,   /* local column block number of dest. block L(i,j) */
+           int nsupc, /* number of columns in destination supernode */
+           int_t iukp, /* point to destination supernode's index[] */
+           int_t* xsup,
+           int klst,
+           int nbrow,
+           int_t lptr, /* Input, point to index[] location of block L(i,k) */
+	   int temp_nbrow, /* number of rows in block L(i,k) */
+           int_t* usub,
+           int_t* lsub,
+           double *tempv,
+           int* indirect_thread,int* indirect2,
+           int_t ** Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+           gridinfo_t * grid)
+{
+    
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;    /* position of the current block descriptor in index[] */
+    int_t luptrj = 0;           /* running sum of the row counts of the blocks skipped */
+    int_t ijb = index[lptrj];
+    
+    while (ijb != ib)  /* Search for destination block L(i,j) */
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+    
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow; 
+    lptrj += LB_DESCRIPTOR;     /* step over the descriptor to the row indices */
+    dest_nbrow=index[lptrj - 1];
+    
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] =indirect_thread[rel]; 
+    }
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Dest. block L(i,j) */
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;     /* next source column */
+        }
+        nzval += ldv;           /* next destination column, even if this one was skipped */
+    }
+    
+} /* dscatter_l */
+
+
+/*
+ * Subtract the update held in tempv from the destination block U(i,j),
+ * i = ib, j = jb, stored in the local block row LBi(ib, grid).
+ *
+ * The block row's index array is walked until a block number >= jb is
+ * reached.  For each of the nsupc columns jj, fnz is read from the index
+ * array; the column is updated only when klst - usub[iukp + jj] is
+ * non-zero, and tempv then advances by nbrow.  Destination entries are
+ * addressed relative to fnz (ucol[lsub[lptr + i] - fnz]), and the value
+ * offset advances by ilst - fnz for every jj, where
+ * ilst = FstBlockC(ib + 1).
+ *
+ * xsup is not referenced directly in this body; FstBlockC and SuperSize
+ * are defined elsewhere and may use it -- TODO confirm.
+ * NOTE(review): the search stops at the first block number >= jb and does
+ * not test for equality; it presumably relies on the caller guaranteeing
+ * that block jb exists in this row -- confirm.
+ */
+static void
+dscatter_u (int ib,
+           int jb,
+           int nsupc,
+           int_t iukp,
+           int_t * xsup,
+           int klst,
+           int nbrow,
+           int_t lptr,
+           int temp_nbrow,
+           int_t* lsub,
+           int_t* usub,
+           double* tempv,
+           int_t ** Ufstnz_br_ptr, double **Unzval_br_ptr,
+           gridinfo_t * grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    // TAU_STATIC_TIMER_START("SCATTER_U");
+    // TAU_STATIC_TIMER_START("SCATTER_UB");
+
+    int_t jj, i, fnz, rel;
+    int segsize;
+    double *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* Reinitialize the pointers to the beginning of the
+     * k-th column/row of L/U factors.
+     * usub[] - index array for panel U(k,:)
+     */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;       /* position in index[] */
+    ruip_lib = 0;               /* position in Unzval_br_ptr[lib] */
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+        // printf("supersize[%ld] \t:%ld \n",ijb,SuperSize( ijb ) );
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point to fstnz index of
+       block U(i,j). */
+    iuip_lib += UB_DESCRIPTOR;
+
+    // tempv = bigV + (cum_nrow + cum_ncol*nbrow);
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        if (segsize)            /* Nonzero segment in U(k.j). */
+        {
+            ucol = &Unzval_br_ptr[lib][ruip_lib];
+
+            // printf("========Entering loop=========\n");
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+
+                rel = lsub[lptr + i] - fnz;
+                // printf("%d %d %d %d %d \n",lptr,i,fnz,temp_nbrow,nbrow );
+                // printf("hello   ucol[%d] %d %d : \n",rel,lsub[lptr + i],fnz);
+
+                ucol[rel] -= tempv[i];
+
+                // printf("hello\n");
+
+#ifdef PI_DEBUG
+                double zz = 0.0;
+                /* print only when tempv[i] is not bit-identical to 0.0 */
+                if (!(*(long *) &zz == *(long *) &tempv[i]))
+                    printf ("(%d, %0.3e, %0.3e ) ", rel, ucol[rel] + tempv[i],
+                            ucol[rel]);
+                /* prints (offset, value before update, value after update) */
+#endif
+            }                   /* for i=0..temp_nbrow */
+            tempv += nbrow;     /* next source column */
+#ifdef PI_DEBUG
+            // printf("\n");
+#endif
+        }                       /* if segsize */
+        ruip_lib += ilst - fnz; /* advance past this column's stored segment */
+
+    }                           /*for jj=0:nsupc */
+#ifdef PI_DEBUG
+    // printf("\n");
+#endif
+    // TAU_STATIC_TIMER_STOP("SCATTER_UB");
+} /* dscatter_u */
+
+
+/* Divide CPU-GPU dgemm work here.
+ *
+ * The two file-scope variables below exist only when PI_DEBUG is defined.
+ * gemm_division_cpu_gpu declares locals named Ngem and min_gpu_col, which
+ * take precedence over these inside that function.
+ */
+#ifdef PI_DEBUG
+int Ngem = 2;
+// int_t Ngem = 0;
+int min_gpu_col = 6;
+#else
+
+    // int_t Ngem = 0;
+
+#endif
+
+
+#ifdef GPU_ACC
+
+/*
+ * Decide how many leading column blocks are processed on the CPU and how
+ * the remaining blocks are split among GPU streams.
+ *
+ * threshold = Ngem / (nbrow * ldu), with Ngem = sp_ienv(7), using integer
+ * division.
+ *
+ * Outputs:
+ *   *ncpu_blks         number of leading blocks left to the CPU
+ *   *num_streams_used  number of streams that receive work
+ *   stream_end_col[i]  end block of stream i; entries not assigned by the
+ *                      partitioning loop keep the value num_blks
+ *
+ * Everything is left to the CPU (*num_streams_used = 0,
+ * *ncpu_blks = num_blks) when the total column count
+ * full_u_cols[num_blks - 1] is below the threshold, when num_blks == 1,
+ * or when nstreams == 0.
+ *
+ * NOTE(review): nbrow * ldu is used as a divisor and full_u_cols is
+ * indexed with num_blks - 1 without checks; presumably callers guarantee
+ * nbrow, ldu and num_blks are positive -- confirm.
+ * NOTE(review): when cols_remain <= 0 after the CPU share has been chosen,
+ * *num_streams_used is not written by this function.
+ */
+void
+gemm_division_cpu_gpu(
+    int* num_streams_used,  /*number of streams that will be used */
+    int* stream_end_col,    /*array holding last column blk for each partition */
+    int * ncpu_blks,        /*Number of CPU dgemm blks */
+    /*input */
+    int nbrow,              /*number of row in A matrix */
+    int ldu,                /*number of k in dgemm */
+    int nstreams, 
+    int* full_u_cols,       /*array containing prefix sum of work load */
+    int num_blks            /*Number of work load */
+)
+{
+    int Ngem = sp_ienv(7);  /*get_mnk_dgemm ();*/
+    int min_gpu_col = get_cublas_nb ();
+
+    // Ngem = 1000000000;
+    /*
+       cpu is to gpu dgemm should be ideally 0:1 ratios to hide the total cost
+       However since there is gpu latency of around 20,000 ns implying about
+       200000 floating point calculation be done in that time so ~200,000/(2*nbrow*ldu)
+       should be done in cpu to hide the latency; we Ngem =200,000/2 
+     */
+    int i, j;
+
+    // {
+    //     *num_streams_used=0;
+    //     *ncpu_blks = num_blks;
+    //     return;
+    // }
+
+    /* default: every stream ends at the last block (this i shadows the
+       function-level i) */
+    for (int i = 0; i < nstreams; ++i)
+    {
+        stream_end_col[i] = num_blks;
+    }
+
+    *ncpu_blks = 0;
+    /*easy returns -1 when number of column are less than threshold */
+    if (full_u_cols[num_blks - 1] < (Ngem / (nbrow * ldu)) || num_blks == 1 )
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+#ifdef PI_DEBUG
+        printf ("full_u_cols[num_blks-1] %d  %d \n",
+                full_u_cols[num_blks - 1], (Ngem / (nbrow * ldu)));
+        printf ("Early return \n");
+#endif
+        return;
+
+    }
+
+    /* Easy return -2 when number of streams =0 */
+    if (nstreams == 0)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+        return;
+        /* code */
+    }
+    /*find first block where count > Ngem */
+
+
+    for (i = 0; i < num_blks - 1; ++i)  /*I can use binary search here */
+    {
+        if (full_u_cols[i + 1] > Ngem / (nbrow * ldu))
+            break;
+    }
+    *ncpu_blks = i + 1;
+
+    /* columns in the blocks that follow the CPU share */
+    int_t cols_remain =
+        full_u_cols[num_blks - 1] - full_u_cols[*ncpu_blks - 1];
+
+#ifdef PI_DEBUG
+    printf ("Remaining cols %d num_blks %d cpu_blks %d \n", cols_remain,
+            num_blks, *ncpu_blks);
+#endif
+    if (cols_remain > 0)
+    {
+        *num_streams_used = 1;  /* now atleast one stream would be used */
+
+#ifdef PI_DEBUG
+        printf ("%d %d  %d %d \n", full_u_cols[num_blks - 1],
+                full_u_cols[*ncpu_blks], *ncpu_blks, nstreams);
+#endif
+        /* per-stream share: the largest of min_gpu_col, an even split of
+           the remaining columns, and 200000 / (nbrow * ldu) */
+        int_t FP_MIN = 200000 / (nbrow * ldu);
+        int_t cols_per_stream = SUPERLU_MAX (min_gpu_col, cols_remain / nstreams);
+        cols_per_stream = SUPERLU_MAX (cols_per_stream, FP_MIN);
+#ifdef PI_DEBUG
+        printf ("cols_per_stream :\t%d\n", cols_per_stream);
+#endif
+
+        int_t cutoff = cols_per_stream + full_u_cols[*ncpu_blks - 1];
+        for (int_t i = 0; i < nstreams; ++i)
+        {
+            stream_end_col[i] = num_blks;
+        }
+        j = *ncpu_blks;
+        /* the last stream is not visited here and keeps its default end,
+           num_blks */
+        for (i = 0; i < nstreams - 1; ++i)
+        {
+            int_t st = (i == 0) ? (*ncpu_blks) : stream_end_col[i - 1];
+
+            for (j = st; j < num_blks - 1; ++j)
+            {
+#ifdef PI_DEBUG
+                printf ("i %d, j %d, %d  %d ", i, j, full_u_cols[j + 1],
+                        cutoff);
+#endif
+                if (full_u_cols[j + 1] > cutoff)
+                {
+#ifdef PI_DEBUG
+                    printf ("cutoff met \n");
+#endif
+                    cutoff = cols_per_stream + full_u_cols[j];
+                    stream_end_col[i] = j + 1;
+                    *num_streams_used += 1;
+                    j++;        /* no effect: j is reset by the next inner loop and not read afterwards */
+                    break;
+                }
+#ifdef PI_DEBUG
+                printf ("\n");
+#endif
+            }
+
+        }
+
+    }
+}
+
+void
+gemm_division_new (int * num_streams_used,   /*number of streams that will be used */
+                   int * stream_end_col, /*array holding last column blk for each partition */
+                   int * ncpu_blks,  /*Number of CPU dgemm blks */
+                        /*input */
+                   int nbrow,    /*number of row in A matrix */
+                   int ldu,  /*number of k in dgemm */
+                   int nstreams,
+                   Ublock_info_t *Ublock_info,    /*array containing prefix sum of work load */
+                   int num_blks  /*Number of work load */
+    )
+{
+    /*
+     * Split the num_blks U-blocks between the CPU and up to nstreams GPU
+     * streams.  Ublock_info[k].full_u_cols is read as a running (prefix)
+     * column count.  On return:
+     *   *ncpu_blks        - number of leading blocks kept on the CPU;
+     *   *num_streams_used - number of GPU streams that received work;
+     *   stream_end_col[s] - block index at which stream s's share ends (the
+     *                       next stream starts there); num_blks by default.
+     *
+     * The CPU keeps blocks until the prefix column count exceeds
+     * Ngem / (nbrow * ldu); Ngem comes from sp_ienv(7).
+     */
+
+    int Ngem = sp_ienv(7); /*get_mnk_dgemm ();*/
+    int min_gpu_col = get_cublas_nb ();
+    int_t i, j;
+    int_t work_per_col, cpu_cols;
+
+    /* Default outputs: everything on the CPU, no stream used.  Setting
+       *num_streams_used here guarantees it is defined on every path. */
+    *num_streams_used = 0;
+    *ncpu_blks = num_blks;
+    for (i = 0; i < nstreams; ++i)
+    {
+        stream_end_col[i] = num_blks;
+    }
+
+    /* Easy return 0: no blocks, a single block, or a degenerate GEMM
+       shape (this also avoids dividing by zero below). */
+    work_per_col = (int_t) nbrow * ldu;
+    if (num_blks <= 1 || work_per_col <= 0)
+        return;
+
+    cpu_cols = Ngem / work_per_col;
+
+    /* Easy return 1: the total column count is below the CPU threshold. */
+    if (Ublock_info[num_blks - 1].full_u_cols < cpu_cols)
+        return;
+
+    /* Easy return 2: no GPU stream is available. */
+    if (nstreams <= 0)
+        return;
+
+    /* Find the first block whose prefix count exceeds the threshold;
+       a binary search would also work here. */
+    for (i = 0; i < num_blks - 1; ++i)
+    {
+        if (Ublock_info[i + 1].full_u_cols > cpu_cols)
+            break;
+    }
+    *ncpu_blks = i + 1;
+
+    int_t cols_remain =
+       Ublock_info [num_blks - 1].full_u_cols - Ublock_info[*ncpu_blks - 1].full_u_cols;
+    if (cols_remain <= 0)
+        return;             /* nothing is left for the GPU */
+
+    *num_streams_used = 1;  /* now at least one stream will be used */
+
+    /* Columns per stream: an even share, but never fewer than the cuBLAS
+       block size nor the 200000 / (nbrow * ldu) columns the original
+       author budgets for hiding GPU latency. */
+    int_t FP_MIN = 200000 / work_per_col;
+    int_t cols_per_stream = SUPERLU_MAX (min_gpu_col, cols_remain / nstreams);
+    cols_per_stream = SUPERLU_MAX (cols_per_stream, FP_MIN);
+
+    /* Every stream but the last stops once the prefix count passes its
+       cutoff; streams that never do keep the default end, num_blks. */
+    int_t cutoff = cols_per_stream + Ublock_info[*ncpu_blks - 1].full_u_cols;
+    for (i = 0; i < nstreams - 1; ++i)
+    {
+        /* Stream i starts where the previous one ended. */
+        int_t st = (i == 0) ? (*ncpu_blks) : stream_end_col[i - 1];
+
+        for (j = st; j < num_blks - 1; ++j)
+        {
+            if (Ublock_info[j + 1].full_u_cols > cutoff)
+            {
+                cutoff = cols_per_stream + Ublock_info[j].full_u_cols;
+                stream_end_col[i] = j + 1;
+                *num_streams_used += 1;
+                break;
+            }
+        }
+    }
+}
+
+#endif  /* defined GPU_ACC */
diff --git a/SRC/dsp_blas2_dist.c b/SRC/dsp_blas2_dist.c
new file mode 100644
index 0000000..fef56c0
--- /dev/null
+++ b/SRC/dsp_blas2_dist.c
@@ -0,0 +1,502 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Sparse BLAS 2, using some dense BLAS 2 operations
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+/*
+ * File name:		dsp_blas2_dist.c
+ * Purpose:		Sparse BLAS 2, using some dense BLAS 2 operations.
+ */
+
+#include "superlu_ddefs.h"
+
+
+/* 
+ * Function prototypes 
+ */
+#ifndef USE_VENDOR_BLAS
+extern void dusolve(int, int, double*, double*);
+extern void dlsolve(int, int, double*, double*);
+extern void dmatvec(int, int, int, double*, double*, double*);
+#endif
+
+/*! \brief
+ *
+ * <pre>
+ *   Purpose
+ *   =======
+ *
+ *   sp_dtrsv_dist() solves one of the systems of equations   
+ *       A*x = b,   or   A'*x = b,
+ *   where b and x are n element vectors and A is a sparse unit , or   
+ *   non-unit, upper or lower triangular matrix.   
+ *   No test for singularity or near-singularity is included in this   
+ *   routine. Such tests must be performed before calling this routine.   
+ *
+ *   Parameters   
+ *   ==========   
+ *
+ *   uplo   - (input) char*
+ *            On entry, uplo specifies whether the matrix is an upper or   
+ *             lower triangular matrix as follows:   
+ *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
+ *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
+ *
+ *   trans  - (input) char*
+ *             On entry, trans specifies the equations to be solved as   
+ *             follows:   
+ *                trans = 'N' or 'n'   A*x = b.   
+ *                trans = 'T' or 't'   A'*x = b.   
+ *                trans = 'C' or 'c'   A'*x = b.   
+ *
+ *   diag   - (input) char*
+ *             On entry, diag specifies whether or not A is unit   
+ *             triangular as follows:   
+ *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
+ *                diag = 'N' or 'n'   A is not assumed to be unit   
+ *                                    triangular.   
+ *	     
+ *   L       - (input) SuperMatrix*
+ *	       The factor L from the factorization Pr*A*Pc=L*U. Use
+ *             compressed row subscripts storage for supernodes, i.e.,
+ *             L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
+ *
+ *   U       - (input) SuperMatrix*
+ *	        The factor U from the factorization Pr*A*Pc=L*U.
+ *	        U has types: Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
+ *    
+ *   x       - (input/output) double*
+ *             Before entry, the incremented array X must contain the n   
+ *             element right-hand side vector b. On exit, X is overwritten 
+ *             with the solution vector x.
+ *
+ *   info    - (output) int*
+ *             If *info = -i, the i-th argument had an illegal value.
+ * </pre>
+ */
+int
+sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, 
+              SuperMatrix *U, double *x, int *info)
+{
+    /* Solves in place: x holds b on entry and the solution on exit.
+       Always returns 0; argument errors are reported through *info. */
+#ifdef _CRAY
+    _fcd ftcs1, ftcs2, ftcs3;
+#endif
+    SCformat *Lstore;
+    NCformat *Ustore;
+    double   *Lval, *Uval;
+    int incx = 1, incy = 1;
+    double alpha = 1.0, beta = 1.0;
+    int nrow;
+    int fsupc, nsupr, nsupc, luptr, istart, irow;
+    int i, k, iptr, jcol;
+    double *work;
+    flops_t solve_ops;
+    /*extern SuperLUStat_t SuperLUStat;*/
+
+    /* Test the input parameters */
+    *info = 0;
+    if ( strncmp(uplo,"L",1) != 0 && strncmp(uplo, "U",1) !=0 ) *info = -1;
+    else if ( strncmp(trans, "N",1) !=0 && strncmp(trans, "T", 1) !=0
+              && strncmp(trans, "C", 1) != 0 ) *info = -2; /* 'C' acts as 'T': real data */
+    else if ( strncmp(diag, "U", 1) !=0 && strncmp(diag, "N", 1) != 0 )
+        *info = -3;
+    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
+    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
+    if ( *info ) {
+        i = -(*info);
+        xerr_dist("sp_dtrsv_dist", &i);
+        return 0;
+    }
+
+    /* Quick return before allocating: an empty system must neither leak
+       'work' nor trip the allocation check below on a zero-length request. */
+    if ( (strncmp(uplo, "L", 1)==0 ? L->nrow : U->nrow) == 0 ) return 0;
+
+    Lstore = (SCformat *) L->Store;
+    Lval = (double *) Lstore->nzval;
+    Ustore = (NCformat *) U->Store;
+    Uval = (double *) Ustore->nzval;
+    solve_ops = 0;
+
+    if ( !(work = doubleCalloc_dist(L->nrow)) )
+        ABORT("Malloc fails for work in sp_dtrsv_dist().");
+
+    if ( strncmp(trans, "N", 1)==0 ) {  /* Form x := inv(A)*x. */
+
+        if ( strncmp(uplo, "L", 1)==0 ) {
+            /* Form x := inv(L)*x */
+
+            for (k = 0; k <= Lstore->nsuper; k++) {
+                fsupc = L_FST_SUPC(k);
+                istart = L_SUB_START(fsupc);
+                nsupr = L_SUB_START(fsupc+1) - istart;
+                nsupc = L_FST_SUPC(k+1) - fsupc;
+                luptr = L_NZ_START(fsupc);
+                nrow = nsupr - nsupc;
+
+                solve_ops += nsupc * (nsupc - 1);
+                solve_ops += 2 * nrow * nsupc;
+
+                if ( nsupc == 1 ) {
+                    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
+                        irow = L_SUB(iptr);
+                        ++luptr;
+                        x[irow] -= x[fsupc] * Lval[luptr];
+                    }
+                } else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("L", strlen("L"));
+                    ftcs2 = _cptofcd("N", strlen("N"));
+                    ftcs3 = _cptofcd("U", strlen("U"));
+                    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+                        &x[fsupc], &incx);
+
+                    SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
+                        &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
+#else
+                    dtrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
+                        &x[fsupc], &incx, 1, 1, 1);
+
+                    dgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
+                        &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy, 1);
+#endif /* _CRAY */
+#else
+                    dlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
+
+                    dmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
+                        &x[fsupc], &work[0] );
+#endif
+
+                    iptr = istart + nsupc;
+                    for (i = 0; i < nrow; ++i, ++iptr) {
+                        irow = L_SUB(iptr);
+                        x[irow] -= work[i];     /* Scatter */
+                        work[i] = 0.0;          /* leave work zeroed for the next supernode */
+
+                    }
+                }
+            } /* for k ... */
+
+        } else {
+            /* Form x := inv(U)*x */
+
+            for (k = Lstore->nsuper; k >= 0; k--) {
+                fsupc = L_FST_SUPC(k);
+                nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
+                nsupc = L_FST_SUPC(k+1) - fsupc;
+                luptr = L_NZ_START(fsupc);
+
+                solve_ops += nsupc * (nsupc + 1);
+
+                if ( nsupc == 1 ) {
+                    x[fsupc] /= Lval[luptr];
+                    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
+                        irow = U_SUB(i);
+                        x[irow] -= x[fsupc] * Uval[i];
+                    }
+                } else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("U", strlen("U"));
+                    ftcs2 = _cptofcd("N", strlen("N"));
+                    STRSV(ftcs1, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
+                       &x[fsupc], &incx);
+#else
+                    dtrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
+                       &x[fsupc], &incx, 1, 1, 1);
+#endif
+#else
+                    dusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
+#endif
+
+                    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
+                        solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
+                        for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); 
+                                i++) {
+                            irow = U_SUB(i);
+                            x[irow] -= x[jcol] * Uval[i];
+                        }
+                    }
+                }
+            } /* for k ... */
+
+        }
+    } else { /* Form x := inv(A')*x */
+
+        if ( strncmp(uplo, "L", 1)==0 ) {
+            /* Form x := inv(L')*x */
+
+            for (k = Lstore->nsuper; k >= 0; --k) {
+                fsupc = L_FST_SUPC(k);
+                istart = L_SUB_START(fsupc);
+                nsupr = L_SUB_START(fsupc+1) - istart;
+                nsupc = L_FST_SUPC(k+1) - fsupc;
+                luptr = L_NZ_START(fsupc);
+
+                solve_ops += 2 * (nsupr - nsupc) * nsupc;
+
+                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
+                    iptr = istart + nsupc;
+                    for (i = L_NZ_START(jcol) + nsupc; 
+                                i < L_NZ_START(jcol+1); i++) {
+                        irow = L_SUB(iptr);
+                        x[jcol] -= x[irow] * Lval[i];
+                        iptr++;
+                    }
+                }
+
+                if ( nsupc > 1 ) {
+                    solve_ops += nsupc * (nsupc - 1);
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("L", strlen("L"));
+                    ftcs2 = _cptofcd("T", strlen("T"));
+                    ftcs3 = _cptofcd("U", strlen("U"));
+                    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+                        &x[fsupc], &incx);
+#else
+                    dtrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
+                        &x[fsupc], &incx, 1, 1, 1);
+#endif
+#else
+                    dtrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
+                        &x[fsupc], &incx);
+#endif
+                }
+            }
+        } else {
+            /* Form x := inv(U')*x */
+
+            for (k = 0; k <= Lstore->nsuper; k++) {
+                fsupc = L_FST_SUPC(k);
+                nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
+                nsupc = L_FST_SUPC(k+1) - fsupc;
+                luptr = L_NZ_START(fsupc);
+
+                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
+                    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
+                    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
+                        irow = U_SUB(i);
+                        x[jcol] -= x[irow] * Uval[i];
+                    }
+                }
+
+                solve_ops += nsupc * (nsupc + 1);
+
+                if ( nsupc == 1 ) {
+                    x[fsupc] /= Lval[luptr];
+                } else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("U", strlen("U"));
+                    ftcs2 = _cptofcd("T", strlen("T"));
+                    ftcs3 = _cptofcd("N", strlen("N"));
+                    STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+                            &x[fsupc], &incx);
+#else
+                    dtrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
+                            &x[fsupc], &incx, 1, 1, 1);
+#endif
+#else
+                    dtrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
+                            &x[fsupc], &incx);
+#endif
+                }
+            } /* for k ... */
+        }
+    }
+
+    /*SuperLUStat.ops[SOLVE] += solve_ops;*/
+    SUPERLU_FREE(work);
+    return 0;
+}
+
+
+/*! \brief
+
+<pre>
+  Purpose   
+    =======   
+
+    sp_dgemv_dist()  performs one of the matrix-vector operations   
+       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
+    where alpha and beta are scalars, x and y are vectors and A is a
+    sparse A->nrow by A->ncol matrix.   
+
+    Parameters   
+    ==========   
+
+    TRANS  - (input) char*
+             On entry, TRANS specifies the operation to be performed as   
+             follows:   
+                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
+                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
+                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
+
+    ALPHA  - (input) double
+             On entry, ALPHA specifies the scalar alpha.   
+
+    A      - (input) SuperMatrix*
+             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+             Currently, the type of A can be:
+                 Stype = SLU_NC or SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. 
+             In the future, more general A can be handled.
+
+    X      - (input) double*, array of DIMENSION at least   
+             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
+             Before entry, the incremented array X must contain the   
+             vector x.   
+
+    INCX   - (input) int
+             On entry, INCX specifies the increment for the elements of   
+             X. INCX must not be zero.   
+
+    BETA   - (input) double
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then Y need not be set on input.   
+
+    Y      - (output) double*,  array of DIMENSION at least   
+             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
+             Before entry with BETA non-zero, the incremented array Y   
+             must contain the vector y. On exit, Y is overwritten by the 
+             updated vector y.
+	     
+    INCY   - (input) int
+             On entry, INCY specifies the increment for the elements of   
+             Y. INCY must not be zero.   
+
+    ==== Sparse Level 2 Blas routine.   
+</pre>
+*/
+int
+sp_dgemv_dist(char *trans, double alpha, SuperMatrix *A, double *x, 
+              int incx, double beta, double *y, int incy)
+{
+    /* Returns 0 on every returning path; bad arguments go to xerr_dist(). */
+    /* Local variables */
+    NCformat *Astore;
+    double   *Aval;
+    int info;
+    double temp;
+    int_t lenx, leny, i, j, irow;   /* int_t: these hold matrix indices */
+    int_t iy, jx, jy, kx, ky;
+    int notran;
+
+    notran = (strncmp(trans, "N", 1)==0);
+    Astore = (NCformat *) A->Store;
+    Aval = (double *) Astore->nzval;
+
+    /* Test the input parameters */
+    info = 0;
+    if ( !notran && strncmp(trans, "T", 1) !=0 && strncmp(trans, "C", 1) != 0)
+        info = 1;
+    else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
+    else if (incx == 0) info = 5;
+    else if (incy == 0) info = 8;
+    if (info != 0) {
+        xerr_dist("sp_dgemv_dist ", &info);
+        return 0;
+    }
+
+    /* Quick return if possible. */
+    if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.))
+        return 0;
+
+    /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
+       up the start points in  X  and  Y. */
+    if ( notran ) {
+        lenx = A->ncol;
+        leny = A->nrow;
+    } else {
+        lenx = A->nrow;
+        leny = A->ncol;
+    }
+    if (incx > 0) kx = 0;
+    else kx =  - (lenx - 1) * incx;
+    if (incy > 0) ky = 0;
+    else ky =  - (leny - 1) * incy;
+
+    /* Start the operations. In this version the elements of A are   
+       accessed sequentially with one pass through A. */
+    /* First form  y := beta*y. */
+    if (beta != 1.) {
+        if (incy == 1) {
+            if (beta == 0.)
+                for (i = 0; i < leny; ++i) y[i] = 0.;
+            else
+                for (i = 0; i < leny; ++i) y[i] = beta * y[i];
+        } else {
+            iy = ky;
+            if (beta == 0.)
+                for (i = 0; i < leny; ++i) {
+                    y[iy] = 0.;
+                    iy += incy;
+                }
+            else
+                for (i = 0; i < leny; ++i) {
+                    y[iy] = beta * y[iy];
+                    iy += incy;
+                }
+        }
+    }
+
+    if (alpha == 0.) return 0;
+
+    if ( notran ) {
+        /* Form  y := alpha*A*x + y.  Only unit incy is implemented. */
+        jx = kx;
+        if (incy == 1) {
+            for (j = 0; j < A->ncol; ++j) {
+                if (x[jx] != 0.) {
+                    temp = alpha * x[jx];
+                    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+                        irow = Astore->rowind[i];
+                        y[irow] += temp * Aval[i];
+                    }
+                }
+                jx += incx;
+            }
+        } else {
+            ABORT("Not implemented.");
+        }
+    } else {
+        /* Form  y := alpha*A'*x + y.  Only unit incx is implemented. */
+        jy = ky;
+        if (incx == 1) {
+            for (j = 0; j < A->ncol; ++j) {
+                temp = 0.;
+                for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+                    irow = Astore->rowind[i];
+                    temp += Aval[i] * x[irow];
+                }
+                y[jy] += alpha * temp;
+                jy += incy;
+            }
+        } else {
+            ABORT("Not implemented.");
+        }
+    }
+    return 0;
+} /* sp_dgemv_dist */
diff --git a/SRC/dsp_blas3_dist.c b/SRC/dsp_blas3_dist.c
new file mode 100644
index 0000000..e12efa5
--- /dev/null
+++ b/SRC/dsp_blas3_dist.c
@@ -0,0 +1,135 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Sparse BLAS3, using some dense BLAS3 operations
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+/*
+ * File name:		dsp_blas3_dist.c
+ * Purpose:		Sparse BLAS3, using some dense BLAS3 operations.
+ */
+
+#include "superlu_ddefs.h"
+
+/*! \brief
+
+<pre>
+  Purpose   
+    =======   
+
+    sp_dgemm_dist() performs one of the matrix-matrix operations   
+
+       C := alpha*op( A )*op( B ) + beta*C,   
+
+    where  op( X ) is one of 
+
+       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
+
+    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
+    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
+  
+
+    Parameters   
+    ==========   
+
+    TRANSA - (input) char*
+             On entry, TRANSA specifies the form of op( A ) to be used in 
+             the matrix multiplication as follows:   
+                TRANSA = 'N' or 'n',  op( A ) = A.   
+                TRANSA = 'T' or 't',  op( A ) = A'.   
+                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
+             Unchanged on exit.   
+
+    TRANSB - (input) char*
+             On entry, TRANSB specifies the form of op( B ) to be used in 
+             the matrix multiplication as follows:   
+                TRANSB = 'N' or 'n',  op( B ) = B.   
+                TRANSB = 'T' or 't',  op( B ) = B'.   
+                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
+             Unchanged on exit.   
+
+    M      - (input) int   
+             On entry,  M  specifies  the number of rows of the matrix 
+	     op( A ) and of the matrix C.  M must be at least zero. 
+	     Unchanged on exit.   
+
+    N      - (input) int
+             On entry,  N specifies the number of columns of the matrix 
+	     op( B ) and the number of columns of the matrix C. N must be 
+	     at least zero.
+	     Unchanged on exit.   
+
+    K      - (input) int
+             On entry, K specifies the number of columns of the matrix 
+	     op( A ) and the number of rows of the matrix op( B ). K must 
+	     be at least  zero.   
+             Unchanged on exit.
+	     
+    ALPHA  - (input) double
+             On entry, ALPHA specifies the scalar alpha.   
+
+    A      - (input) SuperMatrix*
+             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+             Currently, the type of A can be:
+                 Stype = SLU_NC or SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. 
+             In the future, more general A can be handled.
+
+    B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is 
+             n when TRANSB = 'N' or 'n',  and is  k otherwise.   
+             Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
+             part of the array B must contain the matrix B, otherwise 
+             the leading n by k part of the array B must contain the 
+             matrix B.   
+             Unchanged on exit.   
+
+    LDB    - (input) int
+             On entry, LDB specifies the first dimension of B as declared 
+             in the calling (sub) program. LDB must be at least max( 1, n ).  
+             Unchanged on exit.   
+
+    BETA   - (input) double
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then C need not be set on input.   
+
+    C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).   
+             Before entry, the leading m by n part of the array C must 
+             contain the matrix C,  except when beta is zero, in which 
+             case C need not be set on entry.   
+             On exit, the array C is overwritten by the m by n matrix 
+	     ( alpha*op( A )*B + beta*C ).   
+
+    LDC    - (input) int
+             On entry, LDC specifies the first dimension of C as declared 
+             in the calling (sub)program. LDC must be at least max(1,m).   
+             Unchanged on exit.   
+
+    ==== Sparse Level 3 Blas routine.   
+</pre>
+*/
+int
+sp_dgemm_dist(char *transa, int n, double alpha, SuperMatrix *A,
+              double *b, int ldb, double beta, double *c, int ldc)
+{
+    /* One sp_dgemv_dist() call per column of b/c, unit stride within a column. */
+    const int unit = 1;
+    int    col = 0;
+    while (col < n) {
+        sp_dgemv_dist(transa, alpha, A, b + ldb*col, unit, beta, c + ldc*col, unit);
+        ++col;
+    }
+    return 0;
+}
diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c
new file mode 100644
index 0000000..752edb6
--- /dev/null
+++ b/SRC/dutil_dist.c
@@ -0,0 +1,614 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Several matrix utilities
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+void
+dCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz, 
+                            double *nzval, int_t *rowind, int_t *colptr,
+                            Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    NCformat *hdr;
+
+    /* Describe the matrix: storage/data/math type and dimensions. */
+    A->Stype = stype;  A->Dtype = dtype;  A->Mtype = mtype;
+    A->nrow  = m;      A->ncol  = n;
+
+    /* Attach a fresh NCformat header; the caller's arrays are referenced, not copied. */
+    hdr = (NCformat *) SUPERLU_MALLOC( sizeof(NCformat) );
+    A->Store = (void *) hdr;
+    if ( !hdr ) ABORT("SUPERLU_MALLOC fails for A->Store");
+    hdr->nnz    = nnz;
+    hdr->nzval  = nzval;
+    hdr->rowind = rowind;
+    hdr->colptr = colptr;
+}
+
+void
+dCreate_CompRowLoc_Matrix_dist(SuperMatrix *A, int_t m, int_t n,
+                               int_t nnz_loc, int_t m_loc, int_t fst_row,
+                               double *nzval, int_t *colind, int_t *rowptr,
+                               Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    NRformat_loc *loc;
+
+    A->Stype = stype;  A->Dtype = dtype;  A->Mtype = mtype;
+    A->nrow  = m;      A->ncol  = n;
+
+    /* Attach a fresh NRformat_loc header; the arrays are referenced, not copied. */
+    loc = (NRformat_loc *) SUPERLU_MALLOC( sizeof(NRformat_loc) );
+    A->Store = (void *) loc;
+    if ( !loc ) ABORT("SUPERLU_MALLOC fails for A->Store");
+    /* Record the local counts and first-row value given by the caller. */
+    loc->nnz_loc = nnz_loc;
+    loc->fst_row = fst_row;
+    loc->m_loc   = m_loc;
+    loc->nzval   = nzval;
+    loc->colind  = colind;
+    loc->rowptr  = rowptr;
+}
+
+/*! \brief Convert a row compressed storage into a column compressed storage.
+ */
+void
+dCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz, 
+                         double *a, int_t *colind, int_t *rowptr,
+                         double **at, int_t **rowind, int_t **colptr)
+{
+    int_t i, j, col, relpos;    /* int_t, not int: they hold rowptr/colptr values */
+    int_t *marker;              /* per-column counter, then next free slot */
+    /* Allocate storage for another copy of the matrix. */
+    *at = (double *) doubleMalloc_dist(nnz);
+    *rowind = intMalloc_dist(nnz);
+    *colptr = intMalloc_dist(n+1);
+    marker = intCalloc_dist(n);
+    if ( !*colptr || (n > 0 && !marker) || (nnz > 0 && !(*at && *rowind)) )
+        ABORT("Malloc fails in dCompRow_to_CompCol_dist().");
+
+    /* Get counts of each column of A, and set up column pointers */
+    for (i = 0; i < m; ++i)
+        for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
+    (*colptr)[0] = 0;
+    for (j = 0; j < n; ++j) {
+        (*colptr)[j+1] = (*colptr)[j] + marker[j];
+        marker[j] = (*colptr)[j];
+    }
+
+    /* Transfer the matrix into the compressed column storage. */
+    for (i = 0; i < m; ++i) {
+        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+            col = colind[j];
+            relpos = marker[col];
+            (*rowind)[relpos] = i;
+            (*at)[relpos] = a[j];
+            ++marker[col];
+        }
+    }
+    SUPERLU_FREE(marker);
+}
+
+/*! \brief Copy matrix A into matrix B. */
+void
+dCopy_CompCol_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{
+    NCformat *Astore, *Bstore;
+    int_t    ncol, nnz, i;      /* int_t: nnz and ncol are int_t in the stores */
+    /* B->Store and its nzval/rowind/colptr arrays are written, never allocated, here. */
+    B->Stype = A->Stype;
+    B->Dtype = A->Dtype;
+    B->Mtype = A->Mtype;
+    B->nrow  = A->nrow;
+    B->ncol  = ncol = A->ncol;
+    Astore   = (NCformat *) A->Store;
+    Bstore   = (NCformat *) B->Store;
+    Bstore->nnz = nnz = Astore->nnz;
+    for (i = 0; i < nnz; ++i)
+        ((double *)Bstore->nzval)[i] = ((double *)Astore->nzval)[i];
+    for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i];
+    for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i];
+}
+
+
+void dPrint_CompCol_Matrix_dist(SuperMatrix *A)
+{
+    /* Dump the header, then nzval (when present), rowind and colptr to stdout. */
+    NCformat *store = (NCformat *) A->Store;
+    double   *vals;
+    int      k;
+    printf("\nCompCol matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    printf("nrow %lld, ncol %lld, nnz %lld\n", (long long) A->nrow,
+            (long long) A->ncol, (long long) store->nnz);
+    vals = (double *) store->nzval;
+    if ( vals != NULL ) {
+        printf("nzval:\n");
+        for (k = 0; k < store->nnz; ++k) printf("%f  ", vals[k]);
+    }
+    printf("\nrowind:\n");
+    for (k = 0; k < store->nnz; ++k) 
+        printf("%lld  ", (long long) store->rowind[k]);
+    printf("\ncolptr:\n");
+    for (k = 0; k <= A->ncol; ++k) 
+        printf("%lld  ", (long long) store->colptr[k]);
+    printf("\nend CompCol matrix.\n");
+}
+
+void dPrint_Dense_Matrix_dist(SuperMatrix *A)
+{
+    /* Print the header and every entry: all ncol columns, column-major with
+       leading dimension lda.  A one-column matrix prints exactly as before. */
+    DNformat     *Astore = (DNformat *) A->Store;
+    double       *dp = (double *) Astore->nzval;
+    int_t        i, j;
+    printf("\nDense matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    printf("nrow %lld, ncol %lld, lda %lld\n", 
+        (long long) A->nrow, (long long) A->ncol, (long long) Astore->lda);
+    printf("\nnzval: ");
+    for (j = 0; j < A->ncol; ++j)
+        for (i = 0; i < A->nrow; ++i) printf("%f  ", dp[i + j * Astore->lda]);
+    printf("\nend Dense matrix.\n");
+}
+
+int dPrint_CompRowLoc_Matrix_dist(SuperMatrix *A)
+{
+    /* Dump the local row-compressed piece to stdout; always returns 0. */
+    NRformat_loc *loc = (NRformat_loc *) A->Store;
+    int_t  rows, nnz;
+    double *vals;
+
+    printf("\n==== CompRowLoc matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    printf("nrow %ld, ncol %ld\n", (long int) A->nrow, (long int) A->ncol);
+    nnz = loc->nnz_loc;
+    rows = loc->m_loc;
+    printf("nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) nnz, 
+            (long int) rows, (long int) loc->fst_row);
+    PrintInt10("rowptr", rows+1, loc->rowptr);
+    PrintInt10("colind", nnz, loc->colind);
+    vals = (double *) loc->nzval;
+    if ( vals != NULL ) PrintDouble5("nzval", nnz, vals);
+    printf("==== end CompRowLoc matrix\n");
+    return 0;
+}
+
+int file_dPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A)
+{
+    /* Same dump as dPrint_CompRowLoc_Matrix_dist(), written to fp; returns 0. */
+    NRformat_loc *loc = (NRformat_loc *) A->Store;
+    int_t  rows, nnz;
+    double *vals;
+    fprintf(fp, "\n==== CompRowLoc matrix: ");
+    fprintf(fp, "Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    fprintf(fp, "nrow %ld, ncol %ld\n", (long int) A->nrow, (long int) A->ncol);
+    nnz = loc->nnz_loc;
+    rows = loc->m_loc;
+    fprintf(fp, "nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) nnz,
+            (long int) rows, (long int) loc->fst_row);
+    file_PrintInt10(fp, "rowptr", rows+1, loc->rowptr);
+    file_PrintInt10(fp, "colind", nnz, loc->colind);
+    vals = (double *) loc->nzval;
+    if ( vals != NULL ) file_PrintDouble5(fp, "nzval", nnz, vals);
+    fprintf(fp, "==== end CompRowLoc matrix\n");
+    return 0;
+}
+
+void
+dCreate_Dense_Matrix_dist(SuperMatrix *X, int_t m, int_t n, double *x,
+                          int_t ldx, Stype_t stype, Dtype_t dtype,
+                          Mtype_t mtype)
+{
+    DNformat    *dense;
+
+    X->Stype = stype;  X->Dtype = dtype;  X->Mtype = mtype;
+    X->nrow  = m;      X->ncol  = n;
+
+    /* Attach a fresh DNformat header that points at the caller's array x. */
+    dense = (DNformat *) SUPERLU_MALLOC( sizeof(DNformat) );
+    X->Store = (void *) dense;
+    if ( !dense ) ABORT("SUPERLU_MALLOC fails for X->Store");
+
+    dense->lda = ldx;
+    dense->nzval = (double *) x;
+}
+
+void
+dCopy_Dense_Matrix_dist(int_t M, int_t N, double *X, int_t ldx,
+                        double *Y, int_t ldy)
+{
+/*! \brief Copies a two-dimensional matrix X to another matrix Y.
+ *
+ *  Only the leading M-by-N part is copied; ldx and ldy are the
+ *  column strides of X and Y.
+ */
+
+    int    row, col;
+
+    /* Walk one column at a time through both arrays. */
+    for (col = 0; col < N; ++col) {
+        const double *src = X + col*ldx;
+        double       *dst = Y + col*ldy;
+        for (row = 0; row < M; ++row) dst[row] = src[row];
+    }
+}
+
+/* Fill L with a supernodal (SCformat) descriptor that references the
+   caller-supplied arrays; only the descriptor itself is allocated. */
+void
+dCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz,
+			      double *nzval, int_t *nzval_colptr,
+			      int_t *rowind, int_t *rowind_colptr,
+			      int_t *col_to_sup, int_t *sup_to_col,
+			      Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    SCformat *sn;
+
+    L->Stype = stype;  L->Dtype = dtype;  L->Mtype = mtype;
+    L->nrow = m;
+    L->ncol = n;
+    sn = (SCformat *) SUPERLU_MALLOC( sizeof(SCformat) );
+    L->Store = (void *) sn;
+    if ( sn == NULL ) ABORT("SUPERLU_MALLOC fails for L->Store");
+
+    sn->nnz           = nnz;
+    sn->nsuper        = col_to_sup[n];   /* taken from entry n of col_to_sup */
+    sn->nzval         = nzval;
+    sn->nzval_colptr  = nzval_colptr;
+    sn->rowind        = rowind;
+    sn->rowind_colptr = rowind_colptr;
+    sn->col_to_sup    = col_to_sup;
+    sn->sup_to_col    = sup_to_col;
+}
+
+/* Set the "true" solution: every entry of the n-by-nrhs matrix x becomes 1.0. */
+void
+dGenXtrue_dist(int_t n, int_t nrhs, double *x, int_t ldx)
+{
+    int_t i, j;  /* int_t to match n/nrhs; plain int could overflow */
+
+    for (j = 0; j < nrhs; ++j)
+	for (i = 0; i < n; ++i)
+	    x[i + j*ldx] = 1.0; /* odd/even split dropped: both sides stored 1.0 */
+}
+
+/*! \brief Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's
+ *  (the product is delegated to sp_dgemm_dist with scalars 1.0 and 0.0).
+ */
+void
+dFillRHS_dist(char *trans, int_t nrhs, double *x, int_t ldx,
+	      SuperMatrix *A, double *rhs, int_t ldb)
+{
+    const double alpha = 1.0, beta = 0.0;
+
+    /* presumably rhs = alpha*op(A)*x + beta*rhs -- confirm in sp_dgemm_dist */
+    sp_dgemm_dist(trans, nrhs, alpha, A, x, ldx, beta, rhs, ldb);
+}
+
+/*! \brief Store dval into each of the first alen entries of a.
+ */
+void
+dfill_dist(double *a, int_t alen, double dval)
+{
+    int_t k = 0;
+    while (k < alen) { a[k] = dval; ++k; }
+}
+
+
+
+/*! \brief Print the relative inf-norm error ||X-Xtrue||/||X|| of each RHS column.
+ */
+void dinf_norm_error_dist(int_t n, int_t nrhs, double *x, int_t ldx,
+			  double *xtrue, int_t ldxtrue,
+                          gridinfo_t *grid)
+{
+    double err, xnorm;
+    double *x_work, *xtrue_work;
+    int_t i, j;   /* int_t, like n and nrhs, so wide counts do not overflow */
+
+    for (j = 0; j < nrhs; j++) {
+      x_work = &x[j*ldx];             /* column j of the computed solution */
+      xtrue_work = &xtrue[j*ldxtrue]; /* column j of the reference solution */
+      err = xnorm = 0.0;
+      for (i = 0; i < n; i++) {
+	err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i]));
+	xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i]));
+      }
+      err = err / xnorm;  /* NOTE: inf or NaN when column j of x is all zeros */
+      printf("\tRHS %2d: ||X-Xtrue||/||X|| = %e\n", (int) j, err);
+    }
+}
+
+/* Print x[0..len-1] to stdout, five per row, each row led by an index range. */
+void PrintDouble5(char *name, int_t len, double *x)
+{
+    int_t pos = 0;
+    printf("%10s:", name);
+    while (pos < len) {
+	if ( pos % 5 == 0 ) printf("\n[%ld-%ld] ", (long int) pos, (long int) pos + 4);
+	printf("%14e", x[pos++]);
+    }
+    printf("\n");
+}
+
+/* Write x[0..len-1] to fp, five per row with an index-range prefix; returns 0. */
+int file_PrintDouble5(FILE *fp, char *name, int_t len, double *x)
+{
+    int_t pos = 0;
+    fprintf(fp, "%10s:", name);
+    while (pos < len) {
+	if ( pos % 5 == 0 ) fprintf(fp, "\n[%ld-%ld] ", (long int) pos, (long int) pos + 4);
+	fprintf(fp, "%14e", x[pos++]);
+    }
+    fprintf(fp, "\n");
+    return 0;
+}
+
+/*! \brief Print the blocks in the factored matrix L.
+ *  Walks this process's local block columns and dumps each row block to stdout. */
+void dPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid,
+		  Glu_persist_t *Glu_persist, LocalLU_t *Llu)
+{
+    register int c, extra, gb, j, lb, nsupc, nsupr, len, nb, ncb;
+    register int_t k, mycol, r;
+    int_t *xsup = Glu_persist->xsup; /* presumably read by the SuperSize() macro */
+    int_t *index;
+    double *nzval;
+
+    printf("\n[%d] L BLOCKS IN COLUMN-MAJOR ORDER -->\n", iam);
+    ncb = nsupers / grid->npcol; /* local block columns, before the remainder */
+    extra = nsupers % grid->npcol;
+    mycol = MYCOL( iam, grid );
+    if ( mycol < extra ) ++ncb; /* the first `extra` process columns get one more */
+    for (lb = 0; lb < ncb; ++lb) {
+	index = Llu->Lrowind_bc_ptr[lb];
+	if ( index ) { /* Not an empty column */
+	    nzval = Llu->Lnzval_bc_ptr[lb];
+	    nb = index[0]; /* header word 0: number of row blocks */
+	    nsupr = index[1]; /* header word 1: used below as the column stride of nzval */
+	    gb = lb * grid->npcol + mycol; /* global block column number */
+	    nsupc = SuperSize( gb );
+	    printf("[%d] block column %d (local # %d), nsupc %d, # row blocks %d\n",
+		   iam, gb, lb, nsupc, nb);
+	    for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
+		len = index[k+1]; /* descriptor word 1: rows in this block */
+		printf("[%d] row-block %d: block # " IFMT "\tlength %d\n", 
+		       iam, c, index[k], len);
+		PrintInt10("lsub", len, &index[k+LB_DESCRIPTOR]);
+		for (j = 0; j < nsupc; ++j) { /* one slice of len values per column */
+		    PrintDouble5("nzval", len, &nzval[r + j*nsupr]);
+		}
+		k += LB_DESCRIPTOR + len; /* skip the descriptor and its len row indices */
+		r += len; /* row offset of the next block inside nzval */
+	    }
+	}
+	printf("(%d)", iam);
+ 	PrintInt32("ToSendR[]", grid->npcol, Llu->ToSendR[lb]);
+	PrintInt10("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]);
+    }
+    printf("nfrecvx " IFMT "\n", Llu->nfrecvx);
+    k = CEILING( nsupers, grid->nprow ); /* k is reused here as the length of fmod */
+    PrintInt10("fmod", k, Llu->fmod);
+    
+} /* DPRINTLBLOCKS */
+
+
+/*! \brief Print the blocks in the factored matrix U.
+ *  Walks this process's local block rows and dumps each column block to stdout. */
+void dPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid, 
+		  Glu_persist_t *Glu_persist, LocalLU_t *Llu)
+{
+    register int c, extra, jb, k, lb, len, nb, nrb, nsupc;
+    register int_t myrow, r;
+    int_t *xsup = Glu_persist->xsup; /* presumably read by the SuperSize() macro */
+    int_t *index;
+    double *nzval;
+
+    printf("\n[%d] U BLOCKS IN ROW-MAJOR ORDER -->\n", iam);
+    nrb = nsupers / grid->nprow; /* local block rows, before the remainder */
+    extra = nsupers % grid->nprow;
+    myrow = MYROW( iam, grid );
+    if ( myrow < extra ) ++nrb; /* the first `extra` process rows get one more */
+    for (lb = 0; lb < nrb; ++lb) {
+	index = Llu->Ufstnz_br_ptr[lb];
+	if ( index ) { /* Not an empty row */
+	    nzval = Llu->Unzval_br_ptr[lb];
+	    nb = index[0]; /* header word 0: number of column blocks */
+	    printf("[%d] block row " IFMT " (local # %d), # column blocks %d\n",
+		   iam, lb*grid->nprow+myrow, lb, nb);
+	    r  = 0; /* running offset into nzval */
+	    for (c = 0, k = BR_HEADER; c < nb; ++c) {
+		jb = index[k]; /* descriptor word 0: block number */
+		len = index[k+1]; /* descriptor word 1: values stored for this block */
+		printf("[%d] col-block %d: block # %d\tlength " IFMT "\n", 
+		       iam, c, jb, index[k+1]);
+		nsupc = SuperSize( jb );
+		PrintInt10("fstnz", nsupc, &index[k+UB_DESCRIPTOR]);
+		PrintDouble5("nzval", len, &nzval[r]);
+		k += UB_DESCRIPTOR + nsupc; /* skip the descriptor and nsupc fstnz entries */
+		r += len;
+	    }
+
+	    printf("[%d] ToSendD[] %d\n", iam, Llu->ToSendD[lb]);
+	}
+    }
+} /* DPRINTUBLOCKS */
+
+/* Dump the fields of the pdgsmv communication descriptor to fp; returns 0. */
+int dprint_gsmv_comm(FILE *fp, int_t m_loc, pdgsmv_comm_t *gsmv_comm,
+                     gridinfo_t *grid)
+{
+  pdgsmv_comm_t *c = gsmv_comm;
+  int_t nproc = grid->nprow*grid->npcol;   /* number of processes in the grid */
+  fprintf(fp, "TotalIndSend " IFMT "\tTotalValSend " IFMT "\n", c->TotalIndSend, c->TotalValSend);
+  file_PrintInt10(fp, "extern_start", m_loc, c->extern_start);
+  file_PrintInt10(fp, "ind_tosend", c->TotalIndSend, c->ind_tosend);
+  file_PrintInt10(fp, "ind_torecv", c->TotalValSend, c->ind_torecv);
+  file_PrintInt10(fp, "ptr_ind_tosend", nproc+1, c->ptr_ind_tosend);
+  file_PrintInt10(fp, "ptr_ind_torecv", nproc+1, c->ptr_ind_torecv);
+  file_PrintInt32(fp, "SendCounts", nproc, c->SendCounts);
+  file_PrintInt32(fp, "RecvCounts", nproc, c->RecvCounts);
+  return 0;
+}
+
+
+void
+GenXtrueRHS(int nrhs, SuperMatrix *A, Glu_persist_t *Glu_persist,
+	    gridinfo_t *grid, double **xact, int *ldx, double **b, int *ldb)
+{ /* Allocates *xact (all ones, ld *ldx = n) and *b (local rows of A*x, ld *ldb). */
+    int_t gb, gbrow, i, iam, irow, j, lb, lsup, myrow, n, nlrows,
+          nsupr, nsupers, rel;
+    int_t *supno, *xsup, *lxsup;
+    double *x, *bb;
+    NCformat *Astore;
+    double   *Aval;
+
+    n = A->ncol;
+    *ldb = 0;
+    supno = Glu_persist->supno;
+    xsup = Glu_persist->xsup;
+    nsupers = supno[n-1] + 1; /* supernode number of the last column, plus one */
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    Astore = (NCformat *) A->Store;
+    Aval = (double *) Astore->nzval;
+    lb = CEILING( nsupers, grid->nprow ) + 1; /* size of lxsup[]; lb is reused below */
+    if ( !(lxsup = intMalloc_dist(lb)) )
+	ABORT("Malloc fails for lxsup[].");
+
+    lsup = 0;
+    nlrows = 0;
+    for (j = 0; j < nsupers; ++j) { /* collect supernodes mapped to my process row */
+	i = PROW( j, grid );
+	if ( myrow == i ) {
+	    nsupr = SuperSize( j );
+	    *ldb += nsupr; /* *ldb ends up as the number of locally held rows */
+	    lxsup[lsup++] = nlrows; /* first local row of this local supernode */
+	    nlrows += nsupr;
+	}
+    }
+    *ldx = n;
+    if ( !(x = doubleMalloc_dist(((size_t)*ldx) * nrhs)) )
+	ABORT("Malloc fails for x[].");
+    if ( !(bb = doubleCalloc_dist(*ldb * nrhs)) )
+	ABORT("Calloc fails for bb[].");
+    for (j = 0; j < nrhs; ++j)
+	for (i = 0; i < n; ++i) x[i + j*(*ldx)] = 1.0;
+
+    /* Form b = A*x.  NOTE(review): only column 0 of x and bb is touched here,
+       so for nrhs > 1 the other columns of bb keep their calloc'ed zeros -- confirm. */
+    for (j = 0; j < n; ++j)
+	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+	    irow = Astore->rowind[i];
+	    gb = supno[irow]; /* supernode containing global row irow */
+	    gbrow = PROW( gb, grid );
+	    if ( myrow == gbrow ) {
+		rel = irow - xsup[gb]; /* row offset inside the supernode */
+		lb = LBi( gb, grid );
+		bb[lxsup[lb] + rel] += Aval[i] * x[j];
+	    }
+	}
+    /* Memory allocated but not freed: xact, b */
+    *xact = x;
+    *b = bb;
+
+    SUPERLU_FREE(lxsup);
+
+#if ( PRNTlevel>=2 )
+    for (i = 0; i < grid->nprow*grid->npcol; ++i) { /* one process prints per round */
+	if ( iam == i ) {
+	    printf("\n(%d)\n", iam);
+	    PrintDouble5("rhs", *ldb, *b);
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif
+
+} /* GENXTRUERHS */
+
+/* g5.rua
+          b = A*x    y = L\b
+   0      1	     1.0000
+   1      0	     0.2500
+   2      1	     1.0000
+   3      2	     2.0000
+   4      1	     1.7500
+   5      1	     1.8917
+   6      0	     1.1879
+   7      2	     2.0000
+   8      2	     2.0000
+   9      1	     1.0000
+   10     1	     1.7500
+   11     0	          0
+   12     1	     1.8750
+   13     2	     2.0000
+   14     1	     1.0000
+   15     0	     0.2500
+   16     1	     1.7667
+   17     0	     0.6419
+   18     1	     2.2504
+   19     0	     1.1563
+   20     0	     0.9069
+   21     0	     1.4269
+   22     1	     2.7510
+   23     1	     2.2289
+   24     0	     2.4332
+
+   g6.rua
+       b=A*x  y=L\b
+    0    0         0
+    1    1    1.0000
+    2    1    1.0000
+    3    2    2.5000
+    4    0         0
+    5    2    2.0000
+    6    1    1.0000
+    7    1    1.7500
+    8    1    1.0000
+    9    0    0.2500
+   10    0    0.5667
+   11    1    2.0787
+   12    0    0.8011
+   13    1    1.9838
+   14    1    1.0000
+   15    1    1.0000
+   16    2    2.5000
+   17    0    0.8571
+   18    0         0
+   19    1    1.0000
+   20    0    0.2500
+   21    1    1.0000
+   22    2    2.0000
+   23    1    1.7500
+   24    1    1.8917
+   25    0    1.1879
+   26    0    0.8011
+   27    1    1.9861
+   28    1    2.0199
+   29    0    1.3620
+   30    0    0.6136
+   31    1    2.3677
+   32    0    1.1011
+   33    0    1.5258
+   34    0    1.7628
+   35    0    2.1658
+*/
diff --git a/SRC/etree.c b/SRC/etree.c
new file mode 100644
index 0000000..25a5c59
--- /dev/null
+++ b/SRC/etree.c
@@ -0,0 +1,431 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Elimination tree computation and layout routines
+ *
+ * <pre>
+ *  Implementation of disjoint set union routines.
+ *  Elements are integers in 0..n-1, and the 
+ *  names of the sets themselves are of type int.
+ *  
+ *  Calls are:
+ *  initialize_disjoint_sets (n) initial call.
+ *  s = make_set (i)             returns a set containing only i.
+ *  s = link (t, u)		 returns s = t union u, destroying t and u.
+ *  s = find (i)		 return name of set containing i.
+ *  finalize_disjoint_sets 	 final call.
+ *
+ *  This implementation uses path compression but not weighted union.
+ *  See Tarjan's book for details.
+ *  John Gilbert, CMI, 1987.
+ *
+ *  Implemented path-halving by XL 7/5/95.
+ * </pre>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "superlu_ddefs.h"
+
+
+/* Allocate n int_t's and zero them; returns NULL when the allocation fails. */
+static int_t *mxCallocInt(int_t n)
+{
+    int_t *zeros = (int_t *) SUPERLU_MALLOC( n * sizeof(int_t) );
+    int_t k;
+
+    if ( zeros == NULL ) return NULL;
+    for (k = 0; k < n; ++k)
+	zeros[k] = 0;
+    return zeros;
+}
+      
+/* Allocate the zero-filled parent array for n disjoint-set elements and
+   store it in *pp.  Aborts when the allocation fails; the message now names
+   the allocator that is actually called (mxCallocInt). */
+static void
+initialize_disjoint_sets (int_t n, int_t **pp /* out: parent array for sets */)
+{
+	if ( !( (*pp) = mxCallocInt(n)) )
+	    ABORT("mxCallocInt fails for pp[]");
+}
+
+
+/* Turn element i into a singleton set whose representative is i itself,
+   and return that set's name. */
+static int_t
+make_set (int_t i, int_t *pp /* parent array for sets */)
+{
+	int_t *slot = &pp[i];
+	*slot = i;
+	return *slot;
+}
+
+
+/* Plain linking: make t the parent of s and return t, the name of the
+   merged set.  No rank or size heuristic is applied. */
+static int_t
+link (int_t s, int_t t, int_t *pp)
+{
+	int_t merged = t;
+
+	pp[s] = merged;
+	return merged;
+}
+
+
+/* PATH HALVING */
+/* Return the root of the set containing i.  While climbing, each visited
+   node is re-pointed at its grandparent, and the walk then continues from
+   that grandparent. */
+static int_t
+find (int_t i, int_t *pp)
+{
+    int_t up  = pp[i];     /* parent of i */
+    int_t up2 = pp[up];    /* grandparent of i */
+
+    /* A root is its own parent, so up2 == up means up is the root. */
+    for ( ; up2 != up; up = pp[i], up2 = pp[up]) {
+	pp[i] = up2;
+	i = up2;
+    }
+
+    /* Here pp[up] == up. */
+    return up;
+}
+
+#if 0
+/* PATH COMPRESSION -- disabled by the #if 0; note pp is not a parameter here */
+static
+int_t find (
+	    int_t i
+	    )
+{
+	if (pp[i] != i) 
+		pp[i] = find (pp[i]);
+	return pp[i];
+}
+#endif
+
+/* Final call of the disjoint-set routines: release the parent array pp
+   with SUPERLU_FREE. */
+static void
+finalize_disjoint_sets (int_t *pp)
+{
+	SUPERLU_FREE(pp);
+}
+
+/*! \brief Symmetric elimination tree
+ *
+ * <pre>
+ *      p = spsymetree (A);
+ *
+ *      Find the elimination tree for symmetric matrix A.
+ *      This uses Liu's algorithm, and runs in time O(nz*log n).
+ *
+ *      Input:
+ *        Square sparse matrix A.  No check is made for symmetry;
+ *        elements below and on the diagonal are ignored.
+ *        Numeric values are ignored, so any explicit zeros are 
+ *        treated as nonzero.
+ *      Output:
+ *        Integer array of parents representing the etree, with n
+ *        meaning a root of the elimination forest.
+ *      Note:  
+ *        This routine uses only the upper triangle, while sparse
+ *        Cholesky (as in spchol.c) uses only the lower.  Matlab's
+ *        dense Cholesky uses only the upper.  This routine could
+ *        be modified to use the lower triangle either by transposing
+ *        the matrix or by traversing it by rows with auxiliary
+ *        pointer and link arrays.
+ *
+ *      John R. Gilbert, Xerox, 10 Dec 1990
+ *      Based on code by JRG dated 1987, 1988, and 1990.
+ *      Modified by X.S. Li, November 1999.
+ * </pre>
+ */
+int
+sp_symetree_dist(
+	    int_t *acolst, int_t *acolend, /* column starts and ends past 1 */
+	    int_t *arow,            /* row indices of A */
+	    int_t n,                /* dimension of A */
+	    int_t *parent	    /* out: parent in elim tree; n marks a root */
+	    )
+{
+	int_t	*root;		    /* root of subtree of etree, per set name */
+	int_t	rset, cset;
+	int_t	row, col;
+	int_t	rroot;
+	int_t	p;
+	int_t   *pp;                /* disjoint-set parent array */
+
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC(0, "Enter sp_symetree()");
+#endif
+	/* mxCallocInt returns NULL on failure, and root[] is indexed below. */
+	if ( !(root = mxCallocInt (n)) )
+	    ABORT("mxCallocInt fails for root[]");
+	initialize_disjoint_sets (n, &pp);
+	for (col = 0; col < n; col++) {
+		cset = make_set (col, pp);
+		root[cset] = col;
+		parent[col] = n; /* Matlab: n means "no parent" until one is found */
+		for (p = acolst[col]; p < acolend[col]; p++) {
+			row = arow[p];
+			if (row >= col) continue; /* only entries above the diagonal */
+			rset = find (row, pp);
+			rroot = root[rset];
+			if (rroot != col) { /* row's subtree is not yet under col */
+				parent[rroot] = col;
+				cset = link (cset, rset, pp);
+				root[cset] = col;
+			}
+		}
+	}
+	SUPERLU_FREE (root);
+	finalize_disjoint_sets (pp);
+
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC(0, "Exit sp_symetree()");
+#endif
+	return 0;
+} /* SP_SYMETREE_DIST */
+
+
+/*! \brief Nonsymmetric elimination tree
+ *
+ * <pre>
+ *      Find the elimination tree for A'*A.
+ *      This uses something similar to Liu's algorithm. 
+ *      It runs in time O(nz(A)*log n) and does not form A'*A.
+ *
+ *      Input:
+ *        Sparse matrix A.  Numeric values are ignored, so any
+ *        explicit zeros are treated as nonzero.
+ *      Output:
+ *        Integer array of parents representing the elimination
+ *        tree of the symbolic product A'*A.  Each vertex is a
+ *        column of A, and nc means a root of the elimination forest.
+ *
+ *      John R. Gilbert, Xerox, 10 Dec 1990
+ *      Based on code by JRG dated 1987, 1988, and 1990.
+ * </pre>
+ */
+int
+sp_coletree_dist(
+	    int_t *acolst, int_t *acolend, /* column start and end past 1 */
+	    int_t *arow,                   /* row indices of A */
+	    int_t nr, int_t nc,            /* dimension of A */
+	    int_t *parent	           /* out: parent in elim tree; nc marks a root */
+	    )
+{
+	int_t	*root;			/* root of subtree of etree 	*/
+	int_t   *firstcol;		/* first nonzero col in each row*/
+	int_t	rset, cset;
+	int_t	row, col;
+	int_t	rroot;
+	int_t	p;
+	int_t   *pp;			/* disjoint-set parent array	*/
+
+#if ( DEBUGlevel>=1 )
+	int iam = 0;
+	CHECK_MALLOC(iam, "Enter sp_coletree()");
+#endif
+
+	if ( !(root = mxCallocInt (nc)) )  /* NULL would be dereferenced below */
+	    ABORT("mxCallocInt fails for root[]");
+	initialize_disjoint_sets (nc, &pp);
+	/* Compute firstcol[row] = first nonzero column in row */
+	if ( !(firstcol = mxCallocInt (nr)) )
+	    ABORT("mxCallocInt fails for firstcol[]");
+	for (row = 0; row < nr; firstcol[row++] = nc);
+	for (col = 0; col < nc; col++) 
+		for (p = acolst[col]; p < acolend[col]; p++) {
+			row = arow[p];
+			firstcol[row] = SUPERLU_MIN(firstcol[row], col);
+		}
+
+	/* Compute etree by Liu's algorithm for symmetric matrices,
+           except use (firstcol[r],c) in place of an edge (r,c) of A.
+	   Thus each row clique in A'*A is replaced by a star
+	   centered at its first vertex, which has the same fill. */
+
+	for (col = 0; col < nc; col++) {
+		cset = make_set (col, pp);
+		root[cset] = col;
+		parent[col] = nc; /* Matlab: nc means "no parent" until one is found */
+		for (p = acolst[col]; p < acolend[col]; p++) {
+			row = firstcol[arow[p]]; /* a column index, despite the name */
+			if (row >= col) continue;
+			rset = find (row, pp);
+			rroot = root[rset];
+			if (rroot != col) {
+				parent[rroot] = col;
+				cset = link (cset, rset, pp);
+				root[cset] = col;
+			}
+		}
+	}
+
+	SUPERLU_FREE (root);
+	SUPERLU_FREE (firstcol);
+	finalize_disjoint_sets (pp);
+
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC(iam, "Exit sp_coletree()");
+#endif
+	return 0;
+} /* SP_COLETREE_DIST */
+
+/*! \brief Depth-first search from vertex
+ *
+ * <pre>
+ *  q = TreePostorder_dist (n, p);
+ *
+ *	Postorder a tree.
+ *	Input:
+ *	  p is a vector of parent pointers for a forest whose
+ *        vertices are the integers 0 to n-1; p[root]==n.
+ *	Output:
+ *	  q is a vector indexed by 0..n-1 such that q[i] is the
+ *	  i-th vertex in a postorder numbering of the tree.
+ *
+ *        ( 2/7/95 modified by X.Li:
+ *          q is a vector indexed by 0:n-1 such that vertex i is the
+ *          q[i]-th vertex in a postorder numbering of the tree.
+ *          That is, this is the inverse of the previous q. )
+ *
+ *	In the child structure, lower-numbered children are represented
+ *	first, so that a tree which is already numbered in postorder
+ *	will not have its order changed.
+ *    
+ *  Written by John Gilbert, Xerox, 10 Dec 1990.
+ *  Based on code written by John Gilbert at CMI in 1987.
+ * </pre>
+ */
+
+static int_t	*first_kid, *next_kid;	/* Linked list of children.	*/
+static int_t	*post, postnum;
+
+static
+/*
+ * Recursive depth-first search from vertex v: number every child subtree
+ * first, then v itself.  *postnum is the next free postorder number.
+ */
+void etdfs (
+	    int_t	  v,
+	    int_t   first_kid[],
+	    int_t   next_kid[],
+	    int_t   post[], 
+	    int_t   *postnum
+	    )
+{
+	int_t	w;	/* int_t: child ids are read from int_t arrays */
+	for (w = first_kid[v]; w != -1; w = next_kid[w]) {
+		etdfs (w, first_kid, next_kid, post, postnum);
+	}
+	/* post[postnum++] = v; in Matlab */
+	post[v] = (*postnum)++;    /* Modified by X. Li on 08/10/07 */
+}
+
+
+static
+/*
+ * Depth-first search from vertex n, assigning postorder numbers in post[].
+ * No recursion.  postnum is passed by value: the caller's copy is unchanged.
+ */
+void nr_etdfs (int_t n, int_t *parent,
+	       int_t *first_kid, int_t *next_kid,
+	       int_t *post, int_t postnum)
+{
+    int_t current = n, first, next;
+
+    while (postnum != n){
+     
+        /* first kid of the current node (-1 if it has none) */
+        first = first_kid[current];
+
+        /* the current node is a leaf */
+        if (first == -1){
+
+            /* numbering this node because it has no kid */
+            post[current] = postnum++;
+
+            /* next sibling of the current node (-1 if it is the last kid) */
+            next = next_kid[current];
+
+            while (next == -1){
+
+                /* no more kids : back to the parent node */
+                current = parent[current];
+
+                /* numbering the parent node: all of its kids are done */
+                post[current] = postnum++;
+
+                /* NOTE(review): once current == n, exit needs next_kid[n] != -1 */
+                next = next_kid[current];
+	    }
+            
+            /* stopping criterion: n+1 numbers handed out, vertex n included */
+            if (postnum==n+1) return;
+
+            /* continue with the sibling */
+            current = next;
+        }
+        /* descend to the first kid */
+        else {
+            current = first;
+	}
+    }
+}
+
+/*
+ * Postorder the forest given by parent[] (roots have parent n).  The
+ * returned array is not freed here; entry i is the postorder number of i.
+ */
+int_t *TreePostorder_dist(int_t n, int_t *parent)
+{
+	int_t	node, up;
+	int_t   *kid_head, *kid_next, *order, counter = 0;
+
+	/* Three zero-filled work arrays of n+1 entries (slot n = dummy root) */
+	kid_head = mxCallocInt (n+1);
+	if ( kid_head == NULL )
+	    ABORT("mxCallocInt fails for first_kid[]");
+	kid_next = mxCallocInt (n+1);
+	if ( kid_next == NULL )
+	    ABORT("mxCallocInt fails for next_kid[]");
+	order = mxCallocInt (n+1);
+	if ( order == NULL )
+	    ABORT("mxCallocInt fails for post[]");
+
+	/* Empty child lists, then push vertices in decreasing order so each
+	   list ends up with its lower-numbered children first. */
+	for (node = 0; node <= n; ++node) kid_head[node] = -1;
+	for (node = n; node-- > 0; ) {
+		up = parent[node];
+		kid_next[node] = kid_head[up];
+		kid_head[up] = node;
+	}
+
+	/* Depth-first search from dummy root vertex #n */
+#if 0
+	etdfs (n, kid_head, kid_next, order, &counter);      /* recursion */
+#else
+	nr_etdfs(n, parent, kid_head, kid_next, order, counter); /* no recursion */
+#endif
+
+	SUPERLU_FREE(kid_head);
+	SUPERLU_FREE(kid_next);
+	return order;
+}
+
diff --git a/SRC/get_perm_c.c b/SRC/get_perm_c.c
new file mode 100644
index 0000000..14b208d
--- /dev/null
+++ b/SRC/get_perm_c.c
@@ -0,0 +1,544 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Gets matrix permutation
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * November 1, 2007
+ * February 20, 2008
+ * </pre>
+ *
+ * Last update: 7/27/2011  fix a bug with metis ordering on empty graph.
+ *
+ */
+
+#include "superlu_ddefs.h"
+
+
+void
+get_metis(
+	  int_t n,         /* dimension of matrix B */
+	  int_t bnz,       /* number of nonzeros in matrix B. */
+	  int_t *b_colptr, /* column pointer of size n+1 for matrix B; freed here. */
+	  int_t *b_rowind, /* row indices of size bnz for matrix B; freed here. */
+	  int_t *perm_c    /* out - the column permutation vector. */
+	  )
+{
+    /*#define METISOPTIONS 8*/
+#define METISOPTIONS 40
+    int_t metis_options[METISOPTIONS];
+    int_t i, nm, numflag = 0; /* C-Style ordering */
+    int_t *perm, *iperm;
+    int_t *b_colptr_int, *b_rowind_int;
+    extern int check_perm_dist(char *what, int_t n, int_t *perm);
+
+    extern int METIS_NodeND(int_t*, int_t*, int_t*, int_t*, int_t*,
+			    int_t*, int_t*);
+
+    metis_options[0] = 0; /* Use Defaults for now (the active call below passes NULL) */
+
+    perm = (int_t*) SUPERLU_MALLOC(2*n * sizeof(int_t)); /* perm and iperm share one buffer */
+    if (!perm) ABORT("SUPERLU_MALLOC fails for perm.");
+    iperm = perm + n;
+    nm = n;
+
+#if 0
+#if defined(_LONGINT)
+    /* Metis can only take 32-bit integers */
+
+    if ( !(b_colptr_int = (int*) SUPERLU_MALLOC((n+1) * sizeof(int))) )
+	 ABORT("SUPERLU_MALLOC fails for b_colptr_int.");
+    for (i = 0; i < n+1; ++i) b_colptr_int[i] = b_colptr[i];
+    SUPERLU_FREE(b_colptr);
+    
+    if ( !(b_rowind_int = (int*) SUPERLU_MALLOC(bnz * sizeof(int))) )
+	ABORT("SUPERLU_MALLOC fails for b_rowind_int.");
+
+    for (i = 0; i < bnz; ++i) b_rowind_int[i] = b_rowind[i];
+    SUPERLU_FREE(b_rowind);
+#else
+    b_colptr_int = b_colptr;
+    b_rowind_int = b_rowind;
+#endif
+#endif
+
+    /* Call metis */
+#undef USEEND
+#ifdef USEEND
+    METIS_EdgeND(&nm, b_colptr_int, b_rowind_int, &numflag, metis_options,
+		 perm, iperm);
+#else
+
+    /* Earlier version 3.x.x */
+    /* METIS_NodeND(&nm, b_colptr, b_rowind, &numflag, metis_options,
+       perm, iperm);*/
+
+    /* Latest version 4.x.x -- NOTE(review): NULL,NULL looks like the 5.x API; confirm */
+    METIS_NodeND(&nm, b_colptr, b_rowind, NULL, NULL, perm, iperm);
+
+    /*check_perm_dist("metis perm",  n, perm);*/
+#endif
+
+    /* Copy the permutation vector (iperm, not perm) into SuperLU data structure. */
+    for (i = 0; i < n; ++i) perm_c[i] = iperm[i];
+
+#if 0
+    SUPERLU_FREE(b_colptr_int);
+    SUPERLU_FREE(b_rowind_int);
+#else
+    SUPERLU_FREE(b_colptr);
+    SUPERLU_FREE(b_rowind);
+#endif
+    SUPERLU_FREE(perm);
+}
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * Form the structure of A'*A. A is an m-by-n matrix in column oriented
+ * format represented by (colptr, rowind). The output A'*A is in column
+ * oriented format (symmetrically, also row oriented), represented by
+ * (ata_colptr, ata_rowind).
+ *
+ * This routine is modified from GETATA routine by Tim Davis.
+ * The complexity of this algorithm is: SUM_{i=1,m} r(i)^2,
+ * i.e., the sum of the square of the row counts.
+ *
+ * Questions
+ * =========
+ *     o  Do I need to withhold the *dense* rows?
+ *     o  How do I know the number of nonzeros in A'*A?
+ * </pre>
+ */
+void
+getata_dist(
+	    const int_t m,    /* number of rows in matrix A. */
+	    const int_t n,    /* number of columns in matrix A. */
+	    const int_t nz,   /* number of nonzeros in matrix A */
+	    int_t *colptr,    /* column pointer of size n+1 for matrix A. */
+	    int_t *rowind,    /* row indices of size nz for matrix A. */
+	    int_t *atanz,     /* out - on exit, returns the actual number of
+				 nonzeros in matrix A'*A. */
+	    int_t **ata_colptr, /* out - size n+1 */
+	    int_t **ata_rowind  /* out - size *atanz */
+	    )
+{
+
+    register int_t i, j, k, col, num_nz, ti, trow;
+    int_t *marker, *b_colptr, *b_rowind;
+    int_t *t_colptr, *t_rowind; /* a column oriented form of T = A' */
+
+    if ( !(marker = (int_t*) SUPERLU_MALLOC( (SUPERLU_MAX(m,n)+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for marker[]");
+    if ( !(t_colptr = (int_t*) SUPERLU_MALLOC( (m+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC t_colptr[]");
+    if ( !(t_rowind = (int_t*) SUPERLU_MALLOC( nz * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for t_rowind[]");
+
+    
+    /* Get counts of each column of T, and set up column pointers */
+    for (i = 0; i < m; ++i) marker[i] = 0;
+    for (j = 0; j < n; ++j) {
+	for (i = colptr[j]; i < colptr[j+1]; ++i)
+	    ++marker[rowind[i]];
+    }
+    t_colptr[0] = 0;
+    for (i = 0; i < m; ++i) {
+	t_colptr[i+1] = t_colptr[i] + marker[i];
+	marker[i] = t_colptr[i]; /* marker[i] becomes the next free slot of column i of T */
+    }
+
+    /* Transpose the matrix from A to T */
+    for (j = 0; j < n; ++j)
+	for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	    col = rowind[i];
+	    t_rowind[marker[col]] = j;
+	    ++marker[col];
+	}
+
+    
+    /* ----------------------------------------------------------------
+       compute B = T * A, where column j of B is:
+
+       Struct (B_*j) =    UNION   ( Struct (T_*k) )
+                        A_kj != 0
+
+       do not include the diagonal entry
+   
+       ( Partition A as: A = (A_*1, ..., A_*n)
+         Then B = T * A = (T * A_*1, ..., T * A_*n), where
+         T * A_*j = (T_*1, ..., T_*m) * A_*j.  )
+       ---------------------------------------------------------------- */
+
+    /* Reset the markers: marker[r] == j will mean "row r already in column j" */
+    for (i = 0; i < n; ++i) marker[i] = -1;
+
+    /* First pass determines number of nonzeros in B */
+    num_nz = 0;
+    for (j = 0; j < n; ++j) {
+	/* Flag the diagonal so it's not included in the B matrix */
+	marker[j] = j;
+
+	for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	    /* A_kj is nonzero, add pattern of column T_*k to B_*j */
+	    k = rowind[i];
+	    for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) {
+		trow = t_rowind[ti];
+		if ( marker[trow] != j ) {
+		    marker[trow] = j;
+		    num_nz++;
+		}
+	    }
+	}
+    }
+    *atanz = num_nz;
+    
+    /* Allocate storage for A'*A */
+    if ( !(*ata_colptr = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for ata_colptr[]");
+    if ( *atanz ) {
+	if ( !(*ata_rowind = (int_t*)SUPERLU_MALLOC(*atanz*sizeof(int_t)) ) ) {
+	    fprintf(stderr, ".. atanz = %lld\n", (long long) *atanz);
+	    ABORT("SUPERLU_MALLOC fails for ata_rowind[]");
+	}
+    }
+    b_colptr = *ata_colptr; /* aliasing */
+    b_rowind = *ata_rowind; /* NOTE: *ata_rowind is not assigned above when *atanz == 0 */
+    
+    /* Reset the markers before the second pass */
+    for (i = 0; i < n; ++i) marker[i] = -1;
+    
+    /* Compute each column of B, one at a time (same traversal as the first pass) */
+    num_nz = 0;
+    for (j = 0; j < n; ++j) {
+	b_colptr[j] = num_nz;
+	
+	/* Flag the diagonal so it's not included in the B matrix */
+	marker[j] = j;
+
+	for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	    /* A_kj is nonzero, add pattern of column T_*k to B_*j */
+	    k = rowind[i];
+	    for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) {
+		trow = t_rowind[ti];
+		if ( marker[trow] != j ) {
+		    marker[trow] = j;
+		    b_rowind[num_nz++] = trow;
+		}
+	    }
+	}
+    }
+    b_colptr[n] = num_nz;
+       
+    SUPERLU_FREE(marker);
+    SUPERLU_FREE(t_colptr);
+    SUPERLU_FREE(t_rowind);
+}
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * Form the structure of A'+A. A is an n-by-n matrix in column oriented
+ * format represented by (colptr, rowind). The output A'+A is in column
+ * oriented format (symmetrically, also row oriented), represented by
+ * (b_colptr, b_rowind).
+ * </pre>
+ */
+void
+at_plus_a_dist(
+	       const int_t n,    /* number of columns in matrix A. */
+	       const int_t nz,   /* number of nonzeros in matrix A */
+	       int_t *colptr,    /* column pointer of size n+1 for matrix A. */
+	       int_t *rowind,    /* row indices of size nz for matrix A. */
+	       int_t *bnz,       /* out - on exit, returns the actual number of
+				    nonzeros in matrix A'+A. */
+	       int_t **b_colptr, /* out - size n+1 */
+	       int_t **b_rowind  /* out - size *bnz; not assigned when *bnz == 0 */
+	       )
+{
+
+    register int_t i, j, k, col, num_nz;
+    int_t *t_colptr, *t_rowind; /* a column oriented form of T = A' */
+    int_t *marker;
+
+    if ( !(marker = (int_t*) SUPERLU_MALLOC( n * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for marker[]");
+    if ( !(t_colptr = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for t_colptr[]");
+    if ( !(t_rowind = (int_t*) SUPERLU_MALLOC( nz * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails t_rowind[]");
+
+    /* Get counts of each column of T, and set up column pointers */
+    for (i = 0; i < n; ++i) marker[i] = 0;
+    for (j = 0; j < n; ++j) {
+	for (i = colptr[j]; i < colptr[j+1]; ++i)
+	    ++marker[rowind[i]];
+    }
+
+    t_colptr[0] = 0;
+    for (i = 0; i < n; ++i) {
+	t_colptr[i+1] = t_colptr[i] + marker[i];
+	marker[i] = t_colptr[i]; /* marker[i] becomes the next free slot of column i of T */
+    }
+
+    /* Transpose the matrix from A to T */
+    for (j = 0; j < n; ++j) {
+	for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	    col = rowind[i];
+	    t_rowind[marker[col]] = j;
+	    ++marker[col];
+	}
+    }
+
+
+    /* ----------------------------------------------------------------
+       compute B = A + T, where column j of B is:
+
+       Struct (B_*j) = Struct (A_*j) UNION Struct (T_*j)
+
+       do not include the diagonal entry
+       ---------------------------------------------------------------- */
+
+    /* Reset the markers: marker[r] == j will mean "row r already in column j" */
+    for (i = 0; i < n; ++i) marker[i] = -1;
+
+    /* First pass determines number of nonzeros in B */
+    num_nz = 0;
+    for (j = 0; j < n; ++j) {
+	/* Flag the diagonal so it's not included in the B matrix */
+	marker[j] = j;
+
+	/* Add pattern of column A_*j to B_*j */
+	for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	    k = rowind[i];
+	    if ( marker[k] != j ) {
+		marker[k] = j;
+		++num_nz;
+	    }
+	}
+
+	/* Add pattern of column T_*j to B_*j */
+	for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
+	    k = t_rowind[i];
+	    if ( marker[k] != j ) {
+		marker[k] = j;
+		++num_nz;
+	    }
+	}
+    }
+    *bnz = num_nz;
+
+
+    /* Allocate storage for A+A' */
+    if ( !(*b_colptr = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for b_colptr[]");
+    if ( *bnz ) {
+	if ( !(*b_rowind = (int_t*) SUPERLU_MALLOC( *bnz * sizeof(int_t)) ) )
+	    ABORT("SUPERLU_MALLOC fails for b_rowind[]");
+    }
+    
+    /* Reset the markers before the second pass */
+    for (i = 0; i < n; ++i) marker[i] = -1;
+    
+    /* Compute each column of B, one at a time (same traversal as the first pass) */
+    num_nz = 0;
+    for (j = 0; j < n; ++j) {
+	(*b_colptr)[j] = num_nz;
+	
+	/* Flag the diagonal so it's not included in the B matrix */
+	marker[j] = j;
+
+	/* Add pattern of column A_*j to B_*j */
+	for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	    k = rowind[i];
+	    if ( marker[k] != j ) {
+		marker[k] = j;
+		(*b_rowind)[num_nz++] = k;
+	    }
+	}
+
+	/* Add pattern of column T_*j to B_*j */
+	for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
+	    k = t_rowind[i];
+	    if ( marker[k] != j ) {
+		marker[k] = j;
+		(*b_rowind)[num_nz++] = k;
+	    }
+	}
+    }
+    (*b_colptr)[n] = num_nz;
+       
+    SUPERLU_FREE(marker);
+    SUPERLU_FREE(t_colptr);
+    SUPERLU_FREE(t_rowind);
+} /* at_plus_a_dist */
+
+/*! \brief Compute a fill-reducing column permutation Pc for A.
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * GET_PERM_C_DIST obtains a permutation matrix Pc, by applying the multiple
+ * minimum degree ordering code by Joseph Liu to matrix A'*A or A+A',
+ * or by calling get_metis() on A+A'.  The LU factorization of A*Pc tends
+ * to have less fill than the LU factorization of A.
+ *
+ * Arguments
+ * =========
+ *
+ * pnum    (input) int_t
+ *         Process number; passed to CHECK_MALLOC, and only pnum == 0 prints.
+ *
+ * ispec   (input) int_t, holding a colperm_t value
+ *         Specifies what type of column permutation to use to reduce fill.
+ *         = NATURAL: natural ordering (i.e., Pc = I)
+ *         = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A
+ *         = MMD_ATA: minimum degree ordering on structure of A'*A
+ *         = METIS_AT_PLUS_A: MeTis on A'+A
+ * 
+ * A       (input) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *         of the linear equations is A->nrow. Currently, the type of A can
+ *         be: Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE (may be extended).
+ *
+ * perm_c  (output) int_t*, of size A->ncol
+ *         Column permutation vector, which defines the permutation matrix Pc;
+ *         perm_c[i] = j means column i of A is in position j in A*Pc.
+ * </pre>
+ */
+void
+get_perm_c_dist(int_t pnum, int_t ispec, SuperMatrix *A, int_t *perm_c)
+
+{
+    NCformat *Astore = A->Store; /* supplies nnz, colptr[] and rowind[] of A */
+    int_t m, n, bnz = 0, *b_colptr, *b_rowind, i; /* bnz, b_*: structure to order */
+    int_t delta, maxint, nofsub, *invp;
+    int_t *dhead, *qsize, *llist, *marker;
+    double t, SuperLU_timer_(); /* t is only read by the commented-out printf's */
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC((int)pnum, "Enter get_perm_c_dist()");
+#endif
+
+    m = A->nrow;
+    n = A->ncol;
+
+    t = SuperLU_timer_();
+
+    switch ( ispec ) {
+
+        case NATURAL: /* Natural ordering */
+	      for (i = 0; i < n; ++i) perm_c[i] = i;
+#if ( PRNTlevel>=1 )
+	      if ( !pnum ) printf(".. Use natural column ordering\n");
+#endif
+	      return; /* nothing has been allocated on this path */
+
+        case MMD_AT_PLUS_A: /* Minimum degree ordering on A'+A */
+	      if ( m != n ) ABORT("Matrix is not square");
+	      at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind,
+			     &bnz, &b_colptr, &b_rowind);
+	      t = SuperLU_timer_() - t;
+	      /*printf("Form A'+A time = %8.3f\n", t);*/
+#if ( PRNTlevel>=1 )
+	      if ( !pnum ) printf(".. Use minimum degree ordering on A'+A.\n");
+#endif
+	      break; /* the ordering itself is computed after the switch */
+
+        case MMD_ATA: /* Minimum degree ordering on A'*A */
+	      getata_dist(m, n, Astore->nnz, Astore->colptr, Astore->rowind,
+			  &bnz, &b_colptr, &b_rowind);
+	      t = SuperLU_timer_() - t;
+	      /*printf("Form A'*A time = %8.3f\n", t);*/
+#if ( PRNTlevel>=1 )
+	      if ( !pnum ) printf(".. Use minimum degree ordering on A'*A\n");
+#endif
+	      break; /* the ordering itself is computed after the switch */
+
+        case METIS_AT_PLUS_A: /* METIS ordering on A'+A */
+	      if ( m != n ) ABORT("Matrix is not square");
+	      at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind,
+			     &bnz, &b_colptr, &b_rowind);
+
+	      if ( bnz ) { /* non-empty adjacency structure */
+		  get_metis(n, bnz, b_colptr, b_rowind, perm_c);
+	      } else { /* e.g., diagonal matrix */
+		  for (i = 0; i < n; ++i) perm_c[i] = i;
+		  SUPERLU_FREE(b_colptr);
+		  /* b_rowind is not allocated in this case */
+	      }
+	      /* NOTE(review): b_* are not freed here if bnz != 0; presumably get_metis() does -- confirm */
+#if ( PRNTlevel>=1 )
+	      if ( !pnum ) printf(".. Use METIS ordering on A'+A\n");
+#endif
+	      return; /* GENMMD below is not used for this ordering */
+
+        default:
+	      ABORT("Invalid ISPEC");
+    }
+
+    if ( bnz ) { /* MMD_AT_PLUS_A or MMD_ATA with a non-empty structure */
+	t = SuperLU_timer_();
+
+	/* Initialize and allocate storage for GENMMD. */
+	delta = 0; /* DELTA is a parameter to allow the choice of nodes
+		      whose degree <= min-degree + DELTA. */
+	maxint = 2147483647; /* 2**31 - 1 */
+	invp = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t));
+	if ( !invp ) ABORT("SUPERLU_MALLOC fails for invp.");
+	dhead = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t));
+	if ( !dhead ) ABORT("SUPERLU_MALLOC fails for dhead.");
+	qsize = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t));
+	if ( !qsize ) ABORT("SUPERLU_MALLOC fails for qsize.");
+	llist = (int_t *) SUPERLU_MALLOC(n*sizeof(int_t));
+	if ( !llist ) ABORT("SUPERLU_MALLOC fails for llist.");
+	marker = (int_t *) SUPERLU_MALLOC(n*sizeof(int_t));
+	if ( !marker ) ABORT("SUPERLU_MALLOC fails for marker.");
+	
+	/* Transform adjacency list into 1-based indexing required by GENMMD.*/
+	for (i = 0; i <= n; ++i) ++b_colptr[i]; /* all n+1 column pointers */
+	for (i = 0; i < bnz; ++i) ++b_rowind[i]; /* all bnz row indices */
+	
+	genmmd_dist_(&n, b_colptr, b_rowind, perm_c, invp, &delta, dhead, 
+		     qsize, llist, marker, &maxint, &nofsub);
+
+	/* Transform perm_c into 0-based indexing. */
+	for (i = 0; i < n; ++i) --perm_c[i];
+
+	SUPERLU_FREE(invp);
+	SUPERLU_FREE(dhead);
+	SUPERLU_FREE(qsize);
+	SUPERLU_FREE(llist);
+	SUPERLU_FREE(marker);
+	SUPERLU_FREE(b_rowind); /* released only in this bnz != 0 branch */
+
+	t = SuperLU_timer_() - t;
+	/*    printf("call GENMMD time = %8.3f\n", t);*/
+
+    } else { /* Empty adjacency structure */
+	for (i = 0; i < n; ++i) perm_c[i] = i;
+    }
+
+    SUPERLU_FREE(b_colptr); /* released whether or not bnz is zero */
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC((int) pnum, "Exit get_perm_c_dist()");
+#endif
+} /* get_perm_c_dist */
diff --git a/SRC/get_perm_c_parmetis.c b/SRC/get_perm_c_parmetis.c
new file mode 100644
index 0000000..92e7724
--- /dev/null
+++ b/SRC/get_perm_c_parmetis.c
@@ -0,0 +1,920 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Gets matrix permutation
+ *
+ * <pre>
+ * -- Distributed symbolic factorization auxiliary routine  (version 2.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003
+ * INRIA France - January 2004
+ * Laura Grigori
+ *
+ * November 1, 2007
+ * </pre>
+ */
+
+/* limits.h:  the largest positive integer (INT_MAX) */
+#include <limits.h>
+#include <math.h>
+#include "parmetis.h"
+#include "superlu_ddefs.h"
+
+/*
+ * Internal prototypes
+ */
+
+static float
+a_plus_at_CompRow_loc
+(int, int_t *, int, int_t *, int_t , int_t *, int_t *,  
+ int, int_t *, int_t *, int_t **,  int_t **, gridinfo_t *);
+
+/*! \brief Compute a column permutation of A with ParMETIS nested dissection.
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * GET_PERM_C_PARMETIS obtains a permutation matrix Pc, by applying a
+ * graph partitioning algorithm to the symmetrized graph A+A'.  The
+ * multilevel graph partitioning algorithm used is the ParMETIS_V3_NodeND
+ * routine available in the parallel graph partitioning package parMETIS.
+ *
+ * The number of independent sub-domains noDomains computed by this
+ * algorithm has to be a power of 2.  Hence noDomains is the largest
+ * power of 2 that is not larger than nprocs_i, where nprocs_i = nprow
+ * * npcol is the number of processors used in SuperLU_DIST.
+ *
+ * Arguments
+ * =========
+ *
+ * A       (input) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *         of the linear equations is A->nrow.  Matrix A is distributed
+ *         in NRformat_loc format.
+ *
+ * perm_r  (input) int_t*, of size A->nrow
+ *         Row permutation vector, which defines the permutation matrix Pr;
+ *         perm_r[i] = j means row i of A is in position j in Pr*A.
+ *
+ * perm_c  (output) int_t*, of size A->ncol
+ *         Column permutation vector, which defines the permutation matrix Pc;
+ *         perm_c[i] = j means column i of A is in position j in A*Pc.
+ *
+ * nprocs_i (input) int
+ *         Number of processors the input matrix is distributed on in a block
+ *         row format; the number of processors used in SuperLU_DIST.
+ *
+ * noDomains (input) int, must be power of 2
+ *         Number of independent domains to be computed by the graph
+ *         partitioning algorithm.  ( noDomains <= nprocs_i )
+ *
+ * sizes   (output) int_t**, of size 2 * noDomains
+ *         Returns pointer to an array containing the number of nodes
+ *         for each sub-domain and each separator.  Separators are stored 
+ *         from left to right.  The array is allocated in this routine.
+ *
+ * fstVtxSep (output) int_t**, of size 2 * noDomains
+ *         Returns pointer to an array containing first node for each
+ *         sub-domain and each separator.  Allocated in this routine.
+ *
+ * grid    (input) gridinfo_t*
+ *         Process grid; grid->comm is used for all collective exchanges.
+ *
+ * metis_comm (input) MPI_Comm*
+ *         Communicator handed to ParMETIS_V3_NodeND by ranks < noDomains.
+ *
+ * Return value
+ * ============
+ *   <= 0, minus the number of bytes used while forming the graph of A+A'.
+ *   > 0, value reported by a_plus_at_CompRow_loc when out of memory.
+ * </pre>
+ */
+float
+get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
+		     int nprocs_i, int noDomains, 
+		     int_t **sizes, int_t **fstVtxSep,
+		     gridinfo_t *grid, MPI_Comm *metis_comm)
+
+{
+  NRformat_loc *Astore;
+  int   iam, p;
+#if 0
+  int   *b_rowptr_int, *b_colind_int, *l_sizes_int, *dist_order_int, *vtxdist_o_int;
+  int   *options, numflag;
+#else /* 64-bit integers */
+  int_t options[4]={0,0,0,1}, numflag;
+#endif
+  int_t m_loc, fst_row, m, n, bnz, i, j;
+  int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes;
+  int_t *b_rowptr = NULL, *b_colind = NULL; /* graph of A+A', set by the callee */
+  int_t *dist_order = NULL; /* stays NULL on ranks that send 0 entries below */
+  int  *recvcnts, *displs;
+  /* first row index on each processor when the matrix is distributed
+     on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */
+  int_t  *vtxdist_i, *vtxdist_o; 
+  int_t szSep, k, noNodes;
+  float apat_mem_l; /* memory used during the computation of the graph of A+A' */
+  float mem;  /* Memory used during this routine */
+  MPI_Status status;
+
+  /* Initialization. */
+  MPI_Comm_rank (grid->comm, &iam);
+  n = A->ncol;
+  m = A->nrow;
+  if ( m != n ) ABORT("Matrix is not square");
+  mem = 0.;
+
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Enter get_perm_c_parmetis()");
+#endif
+
+  Astore = (NRformat_loc *) A->Store;
+  m_loc = Astore->m_loc;     /* number of rows local to this processor */
+  fst_row = Astore->fst_row; /* global index of the first row */
+  rowptr = Astore->rowptr;   /* pointer to rows and column indices */
+  colind = Astore->colind;
+  
+#if ( PRNTlevel>=1 )
+  if ( !iam ) printf(".. Use parMETIS ordering on A'+A with %d sub-domains.\n",
+		     noDomains);
+#endif
+
+  numflag = 0;
+  /* determine first row on each processor */
+  vtxdist_i = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
+  if ( !vtxdist_i ) ABORT("SUPERLU_MALLOC fails for vtxdist_i.");
+  vtxdist_o = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
+  if ( !vtxdist_o ) ABORT("SUPERLU_MALLOC fails for vtxdist_o.");
+
+  MPI_Allgather (&fst_row, 1, mpi_int_t, vtxdist_i, 1, mpi_int_t,
+		 grid->comm);
+  vtxdist_i[nprocs_i] = m;
+
+  if (noDomains == nprocs_i) {
+    /* keep the same distribution of A */
+    for (p = 0; p <= nprocs_i; p++)
+      vtxdist_o[p] = vtxdist_i[p];
+  }
+  else {
+    /* spread the n rows evenly over the first noDomains processors;
+       the first n % noDomains of them get one extra row */
+    i = n / noDomains;
+    j = n % noDomains;
+    for (k = 0, p = 0; p < noDomains; p++) {
+      vtxdist_o[p] = k;
+      k += i;
+      if (p < j)  k++;
+    }
+    /* The remaining non-participating processors get the same 
+       first-row-number as the last processor.   */
+    for (p = noDomains; p <= nprocs_i; p++)
+      vtxdist_o[p] = k;
+  }
+
+#if ( DEBUGlevel>=2 )
+  if (!iam)
+    PrintInt10 ("vtxdist_o", nprocs_i + 1, vtxdist_o);
+#endif  
+
+  /* Compute distributed A + A' */
+  if ((apat_mem_l = 
+       a_plus_at_CompRow_loc(iam, perm_r, nprocs_i, vtxdist_i,
+			     n, rowptr, colind, noDomains, vtxdist_o,
+			     &bnz, &b_rowptr, &b_colind, grid)) > 0) {
+    SUPERLU_FREE (vtxdist_i); SUPERLU_FREE (vtxdist_o); /* do not leak them */
+    return (apat_mem_l);
+  }
+  mem += -apat_mem_l;
+  /* Initialize and allocate storage for parMetis. */    
+  (*sizes) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
+  if (!(*sizes)) ABORT("SUPERLU_MALLOC fails for sizes.");
+  l_sizes = *sizes;
+  (*fstVtxSep) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
+  if (!(*fstVtxSep)) ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
+  l_fstVtxSep = *fstVtxSep;
+  m_loc = vtxdist_o[iam+1] - vtxdist_o[iam]; /* rows owned in the output layout */
+  
+  if ( iam < noDomains) 
+    /* dist_order is the perm returned by parMetis, distributed */
+    if (! (dist_order = (int_t *) SUPERLU_MALLOC(m_loc * sizeof(int_t))))
+      ABORT("SUPERLU_MALLOC fails for dist_order.");
+
+#if 0  /* Obsolete -- now ParMETIS has 64 bit integer support. */
+
+  /* ParMETIS represents the column pointers and row indices of *
+   * the input matrix using integers. When SuperLU_DIST uses    *
+   * long int for the int_t type, then several supplementary    *
+   * copies need to be performed in order to call ParMETIS.     */
+#if defined (_LONGINT)
+  l_sizes_int = (int *) SUPERLU_MALLOC(2 * noDomains * sizeof(int));
+  if (!(l_sizes_int)) ABORT("SUPERLU_MALLOC fails for l_sizes_int.");
+  
+  /* Allocate storage */
+  if ( !(b_rowptr_int = (int*) SUPERLU_MALLOC((m_loc+1) * sizeof(int))))
+    ABORT("SUPERLU_MALLOC fails for b_rowptr_int[]");
+  for (i = 0; i <= m_loc; i++)
+    b_rowptr_int[i] = b_rowptr[i];
+  SUPERLU_FREE (b_rowptr);
+  
+  if ( bnz ) {
+    if ( !(b_colind_int = (int *) SUPERLU_MALLOC( bnz * sizeof(int))))
+      ABORT("SUPERLU_MALLOC fails for b_colind_int[]");
+    for (i = 0; i < bnz; i++)
+      b_colind_int[i] = b_colind[i];
+    SUPERLU_FREE (b_colind);
+  }
+  
+  if ( !(vtxdist_o_int = 
+	 (int *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int))))
+    ABORT("SUPERLU_MALLOC fails for vtxdist_o_int.");
+  for (i = 0; i <= nprocs_i; i++)
+    vtxdist_o_int[i] = vtxdist_o[i];
+  SUPERLU_FREE (vtxdist_o);
+
+#else  /* Default */
+
+  vtxdist_o_int = vtxdist_o;
+  b_rowptr_int = b_rowptr; b_colind_int = b_colind;
+  l_sizes_int = l_sizes;
+
+#endif
+#endif
+    
+  if ( iam < noDomains) {
+
+    ParMETIS_V3_NodeND(vtxdist_o, b_rowptr, b_colind, 
+		       &numflag, options,
+		       dist_order, l_sizes, metis_comm);
+  }
+
+  if (bnz) SUPERLU_FREE (b_colind); /* b_colind exists only when bnz != 0 */
+  SUPERLU_FREE (b_rowptr);
+
+#if 0  
+  if ( iam < noDomains) {
+    SUPERLU_FREE (options);
+  }
+
+#if defined (_LONGINT)
+  /* Copy data from dist_order_int to dist_order */
+  if ( iam < noDomains) {
+    /* dist_order is the perm returned by parMetis, distributed */
+    if (!(dist_order = (int_t *) SUPERLU_MALLOC(m_loc * sizeof(int_t))))
+      ABORT("SUPERLU_MALLOC fails for dist_order.");
+    for (i = 0; i < m_loc; i++)
+      dist_order[i] = dist_order_int[i];
+    SUPERLU_FREE(dist_order_int);
+    
+    for (i = 0; i < 2*noDomains; i++)
+      l_sizes[i] = l_sizes_int[i];
+    SUPERLU_FREE(l_sizes_int);
+  }
+#else 
+  dist_order = dist_order_int;
+#endif
+
+#endif
+  
+  /* Allgatherv dist_order to get perm_c */
+  if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
+    ABORT ("SUPERLU_MALLOC fails for displs.");
+  if ( !(recvcnts = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
+    ABORT ("SUPERLU_MALLOC fails for recvcnts.");
+  for (i = 0; i < nprocs_i; i++)
+    recvcnts[i] = vtxdist_o[i+1] - vtxdist_o[i];
+  displs[0]=0;
+  for(i=1; i < nprocs_i; i++) 
+    displs[i] = displs[i-1] + recvcnts[i-1];
+  
+  MPI_Allgatherv (dist_order, m_loc, mpi_int_t, perm_c, recvcnts, displs, 
+		  mpi_int_t, grid->comm);
+
+  if ( iam < noDomains) {
+    SUPERLU_FREE (dist_order);
+  }
+  SUPERLU_FREE (vtxdist_i);
+  SUPERLU_FREE (vtxdist_o);
+  SUPERLU_FREE (recvcnts);
+  SUPERLU_FREE (displs);
+  
+  /* send l_sizes to every processor p >= noDomains */
+  if (!iam)
+    for (p = noDomains; p < nprocs_i; p++)
+      MPI_Send (l_sizes, 2*noDomains, mpi_int_t, p, 0, grid->comm);
+  if (noDomains <= iam && iam < nprocs_i)
+    MPI_Recv (l_sizes, 2*noDomains, mpi_int_t, 0, 0, grid->comm,
+	      &status);
+  
+  /* Determine the first node in each separator, store it in l_fstVtxSep */  
+  for (j = 0; j < 2 * noDomains; j++)
+    l_fstVtxSep[j] = 0;
+  l_fstVtxSep[2*noDomains - 2] = l_sizes[2*noDomains - 2];
+  /* upward sweep: level by level, add each node's size to its own entry
+     and accumulate every pair of entries into the entry one level up */
+  szSep = noDomains;
+  i = 0;
+  while (szSep != 1) {
+    for (j = i; j < i + szSep; j++) {
+      l_fstVtxSep[j] += l_sizes[j]; 	      
+    }
+    for (j = i; j < i + szSep; j++) {
+      k = i + szSep + (j-i) / 2;
+      l_fstVtxSep[k] += l_fstVtxSep[j]; 
+    }
+    i += szSep;
+    szSep = szSep / 2;
+  }
+  
+  /* downward sweep: turn the accumulated counts into first-vertex offsets */
+  l_fstVtxSep[2 * noDomains - 2] -= l_sizes[2 * noDomains - 2];
+  i = 2 * noDomains - 2;
+  szSep = 1;
+  while (i > 0) {
+    for (j = i; j < i + szSep; j++) {
+      k = (i - 2 * szSep) + (j-i) * 2 + 1;
+      noNodes = l_fstVtxSep[k];
+      l_fstVtxSep[k] = l_fstVtxSep[j] - l_sizes[k];
+      l_fstVtxSep[k-1] = l_fstVtxSep[k] + l_sizes[k] - 
+	noNodes - l_sizes[k-1];
+    }
+    szSep *= 2;
+    i -= szSep;
+  }
+
+#if ( PRNTlevel>=2 )
+  if (!iam ) {
+    PrintInt10 ("Sizes of separators", 2 * noDomains-1, l_sizes);
+    PrintInt10 ("First Vertex Separator", 2 * noDomains-1, l_fstVtxSep);
+  }
+#endif
+
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Exit get_perm_c_parmetis()");
+#endif
+  return (-mem);
+} /* get_perm_c_parmetis */
+
+/*! \brief Form the structure of Pr*A + A'*Pr' for a distributed matrix A.
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * Form the structure of Pr*A +A'Pr'. A is an n-by-n matrix in
+ * NRformat_loc format, represented by (rowptr, colind). The output
+ * B=Pr*A +A'Pr' is in NRformat_loc format (symmetrically, also row
+ * oriented), represented by (b_rowptr, b_colind).
+ *
+ * The input matrix A is distributed in block row format on nprocs_i
+ * processors.  The output matrix B is distributed in block row format
+ * on nprocs_o processors, where nprocs_o <= nprocs_i.  On output, the
+ * matrix B has its rows permuted according to perm_r.
+ *
+ * Sketch of the algorithm
+ * =======================
+ *
+ * Let iam be my process number.  Let fst_row, lst_row = m_loc +
+ * fst_row be the first/last row stored on iam.
+ * 
+ * Compute Pr' - the inverse row permutation, stored in iperm_r.
+ *
+ * Compute the transpose  of the block row of Pr*A that iam owns:
+ *    T[:,Pr(fst_row:lst_row)] = Pr' * A[:,fst_row:lst_row] * Pr'
+ *
+ * All to all communication such that every processor iam receives all
+ * the blocks of the transpose matrix that it needs, that is
+ *           T[fst_row:lst_row, :]
+ *
+ * Compute B = A[fst_row:lst_row, :] + T[fst_row:lst_row, :]
+ *
+ * If Pr != I or nprocs_i != nprocs_o then permute the rows of B (that
+ * is compute Pr*B) and redistribute from nprocs_i to nprocs_o
+ * according to the block row distribution in vtxdist_i, vtxdist_o.
+ * Return value: minus the peak size, in bytes, of the int_t storage counted.
+ * </pre>
+ */
+  
+static float
+a_plus_at_CompRow_loc
+(
+ int   iam,         /* Input - my processor number */
+ int_t *perm_r,     /* Input - row permutation vector Pr */
+ int   nprocs_i,    /* Input - number of processors the input matrix
+		       is distributed on */
+ int_t *vtxdist_i,  /* Input - index of first row on each processor of the input matrix */
+ int_t n,           /* Input - number of columns in matrix A. */
+ int_t *rowptr,     /* Input - row pointers of size m_loc+1 for matrix A. */
+ int_t *colind,     /* Input - column indices of size nnz_loc for matrix A. */
+ int   nprocs_o,    /* Input - number of processors the output matrix
+		       is distributed on */
+ int_t *vtxdist_o,  /* Input - index of first row on each processor of the output matrix */
+ int_t *p_bnz,      /* Output - on exit, returns the actual number of
+		       local nonzeros in matrix A'+A. */
+ int_t **p_b_rowptr, /* Output - output matrix, row pointers of size m_loc+1 */
+ int_t **p_b_colind, /* Output - output matrix, column indices of size *p_bnz */
+ gridinfo_t *grid    /* Input - grid of processors information */
+ )
+{
+
+  int_t i, j, k, col, num_nz, nprocs;
+  int_t *tcolind_recv = NULL; /* temporary receive buffer */
+  int_t *tcolind_send = NULL; /* temporary send buffer */
+  int_t sz_tcolind_send, sz_tcolind_recv;
+  int_t ind, ind_rcv;
+  int redist_pra; /* TRUE if Pr != I or nprocs_i != nprocs_o */
+  int_t *marker, *iperm_r;
+  int_t *sendCnts, *recvCnts;
+  int_t *sdispls, *rdispls;
+  int_t *b_rowptr, *b_colind = NULL, bnz_t, *b_rowptr_t, *b_colind_t = NULL;
+  int_t p, t_ind, nelts, ipcol;
+  int_t m_loc, m_loc_o;      /* number of local rows */ 
+  int_t fst_row, fst_row_o;  /* index of first local row */
+  int_t nnz_loc;    /* number of local nonzeros in matrix A */
+  float apat_mem, apat_mem_max; /* current / peak number of int_t's counted */
+  int   *intBuf1, *intBuf2, *intBuf3, *intBuf4;  
+
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Enter a_plus_at_CompRow_loc()");
+#endif
+  
+  fst_row    = vtxdist_i[iam];
+  m_loc      = vtxdist_i[iam+1] - vtxdist_i[iam];
+  nnz_loc    = rowptr[m_loc];
+  redist_pra = FALSE;  
+  nprocs     = SUPERLU_MAX(nprocs_i, nprocs_o);
+  apat_mem_max = 0.;
+  
+  if (!(marker = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t))))
+    ABORT("SUPERLU_MALLOC fails for marker[]");
+  if (!(iperm_r = (int_t*) SUPERLU_MALLOC( n * sizeof(int_t))))
+    ABORT("SUPERLU_MALLOC fails for iperm_r[]");
+  if (!(sendCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t))))
+    ABORT("SUPERLU_MALLOC fails for sendCnts[]");
+  if (!(recvCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t))))
+    ABORT("SUPERLU_MALLOC fails for recvCnts[]");
+  if (!(sdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t))))
+    ABORT("SUPERLU_MALLOC fails for sdispls[]");
+  if (!(rdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t))))
+    ABORT("SUPERLU_MALLOC fails for rdispls[]");
+  apat_mem = 2 * n + 4 * nprocs + 3;
+
+#if defined (_LONGINT)
+  if (!(intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs * sizeof(int))))
+    ABORT("SUPERLU_MALLOC fails for intBuf1[]");
+  intBuf2 = intBuf1 + nprocs;
+  intBuf3 = intBuf1 + 2 * nprocs;
+  intBuf4 = intBuf1 + 3 * nprocs;
+  apat_mem += 4*nprocs*sizeof(int) / sizeof(int_t);
+#endif  
+
+  /* compute the inverse row permutation vector */
+  for (i = 0; i < n; i++) {
+    marker[i] = 1; /* one slot per row for the element count stored in front */
+    if (perm_r[i] != i)
+      redist_pra = TRUE;
+    iperm_r[perm_r[i]] = i;
+  }
+
+  /* TRANSPOSE LOCAL ROWS ON MY PROCESSOR iam.         */
+  /* THE RESULT IS STORED IN TCOLIND_SEND.             */
+  /* THIS COUNTS FOR TWO PASSES OF THE LOCAL MATRIX.   */
+
+  /* First pass to get counts of each row of T, and set up column pointers */
+  for (j = 0; j < m_loc; j++) {
+    for (i = rowptr[j]; i < rowptr[j+1]; i++){
+      marker[iperm_r[colind[i]]]++;
+    }
+  }
+  /* determine number of elements to be sent to each processor */
+  for (p = 0; p < nprocs_i; p++) {
+    sendCnts[p] = 0;
+    for (i = vtxdist_i[p]; i < vtxdist_i[p+1]; i++) 
+      sendCnts[p] += marker[i];
+  }
+  /* exchange send/receive counts information in between all processors */
+  MPI_Alltoall (sendCnts, 1, mpi_int_t,
+		recvCnts, 1, mpi_int_t, grid->comm);
+  sendCnts[iam] = 0; /* the local block is written directly, never sent */
+  
+  for (i = 0, j = 0, p = 0; p < nprocs_i; p++) {
+    rdispls[p] = j;
+    j += recvCnts[p];
+    sdispls[p] = i;  
+    i += sendCnts[p];
+  }
+  recvCnts[iam] = 0; /* cleared only now, so the local block keeps its space */
+  sz_tcolind_recv = j;
+  sz_tcolind_send = i;
+  
+  /* allocate memory to receive necessary blocks of transpose matrix T */
+  if (sz_tcolind_recv) {
+    if ( !(tcolind_recv = (int_t*) SUPERLU_MALLOC( sz_tcolind_recv 
+						   * sizeof(int_t) )))
+      ABORT("SUPERLU_MALLOC fails tcolind_recv[]");
+    apat_mem += sz_tcolind_recv;
+  }
+  /* allocate memory to send blocks of local transpose matrix T to other processors */
+  if (sz_tcolind_send) {
+    if (!(tcolind_send = (int_t*) SUPERLU_MALLOC( (sz_tcolind_send) 
+						  * sizeof(int_t))))
+      ABORT("SUPERLU_MALLOC fails for tcolind_send[]");
+    apat_mem += sz_tcolind_send;
+  }
+
+  /* Set up marker[] to point at the beginning of each row in the
+     send/receive buffer.  For each row, we store first its number of
+     elements, and then the elements. */  
+  ind_rcv = rdispls[iam];
+  for (p = 0; p < nprocs_i; p++) {
+    for (i = vtxdist_i[p]; i < vtxdist_i[p+1]; i++) {
+      nelts = marker[i] - 1;
+      if (p == iam) {
+	tcolind_recv[ind_rcv] = nelts;
+	marker[i] = ind_rcv + 1;
+	ind_rcv += nelts + 1;
+      }
+      else {
+	tcolind_send[sdispls[p]] = nelts;
+	marker[i] = sdispls[p] + 1;
+	sdispls[p] += nelts + 1;
+      }
+    }
+  }
+  /* reset sdispls vector */
+  for (i = 0, p = 0; p < nprocs_i; p++) {
+    sdispls[p] = i;  
+    i += sendCnts[p];
+  }
+  /* Second pass of the local matrix A to copy data to be sent */
+  for (j = 0; j < m_loc; j++)
+    for (i = rowptr[j]; i < rowptr[j+1]; i++) {
+      col = colind[i];
+      ipcol = iperm_r[col];      
+      if (ipcol >= fst_row && ipcol < fst_row + m_loc)  /* local data */
+	tcolind_recv[marker[ipcol]] = perm_r[j + fst_row];      
+      else /* remote */ 
+	tcolind_send[marker[ipcol]] = perm_r[j + fst_row];
+      marker[ipcol] ++;
+    }
+  sendCnts[iam] = 0;
+  recvCnts[iam] = 0;
+
+#if defined (_LONGINT)
+  for (p=0; p<nprocs; p++) {
+    if (sendCnts[p] > INT_MAX || sdispls[p] > INT_MAX ||
+	recvCnts[p] > INT_MAX || rdispls[p] > INT_MAX)
+      ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
+    intBuf1[p] = (int) sendCnts[p];
+    intBuf2[p] = (int) sdispls[p];
+    intBuf3[p] = (int) recvCnts[p];
+    intBuf4[p] = (int) rdispls[p];
+  }
+#else  /* Default */
+  intBuf1 = sendCnts;  intBuf2 = sdispls;
+  intBuf3 = recvCnts;  intBuf4 = rdispls;
+#endif
+  
+  /* send/receive transpose matrix T */
+  MPI_Alltoallv (tcolind_send, intBuf1, intBuf2, mpi_int_t,
+		 tcolind_recv, intBuf3, intBuf4, mpi_int_t,
+		 grid->comm);
+  /* ------------------------------------------------------------
+     DEALLOCATE SEND COMMUNICATION STORAGE
+     ------------------------------------------------------------*/
+  if (sz_tcolind_send) {
+    SUPERLU_FREE( tcolind_send );
+    apat_mem_max = apat_mem;
+    apat_mem -= sz_tcolind_send;
+  }
+
+  /* ----------------------------------------------------------------
+     FOR LOCAL ROWS:
+       compute B = A + T, where row j of B is:
+       Struct (B(j,:)) = Struct (A(j,:)) UNION Struct (T(j,:))
+       do not include the diagonal entry
+     THIS COUNTS FOR TWO PASSES OF THE LOCAL ROWS OF A AND T.   
+     ------------------------------------------------------------------ */
+  
+  /* Reset marker to EMPTY */
+  for (i = 0; i < n; ++i) marker[i] = EMPTY;
+  /* save rdispls information */
+  for (p = 0; p < nprocs_i; p++)
+    sdispls[p] = rdispls[p];
+
+  /* First pass determines number of nonzeros in B */
+  num_nz = 0;
+  for (j = 0; j < m_loc; j++) {
+    /* Flag the diagonal so it's not included in the B matrix */
+    marker[perm_r[j + fst_row]] = j;
+    
+    /* Add pattern of row A(j,:) to B(j,:) */
+    for (i = rowptr[j]; i < rowptr[j+1]; i++) {
+      k = colind[i];
+      if ( marker[k] != j ) {
+	marker[k] = j;
+	++num_nz;
+      }
+    }
+    
+    /* Add pattern of row T(j,:) to B(j,:) */
+    for (p = 0; p < nprocs_i; p++) {
+      t_ind = rdispls[p];
+      nelts = tcolind_recv[t_ind]; t_ind ++;
+      for (i = t_ind; i < t_ind + nelts; i++) {
+	k = tcolind_recv[i];
+	if ( marker[k] != j ) {
+	  marker[k] = j;
+	  ++num_nz;
+	}
+      }
+      t_ind += nelts;
+      rdispls[p] = t_ind; /* advance to the next row inside block p */
+    }
+  }
+  bnz_t = num_nz;
+
+  /* Allocate storage for B=Pr*A+A'*Pr' */
+  if ( !(b_rowptr_t = (int_t*) SUPERLU_MALLOC((m_loc+1) * sizeof(int_t))))
+    ABORT("SUPERLU_MALLOC fails for b_rowptr_t[]");
+  if ( bnz_t ) {
+    if ( !(b_colind_t = (int_t*) SUPERLU_MALLOC( bnz_t * sizeof(int_t))))
+      ABORT("SUPERLU_MALLOC fails for b_colind_t[]");
+  }
+  apat_mem += m_loc + 1 + bnz_t;
+  if (apat_mem > apat_mem_max)
+    apat_mem_max = apat_mem;
+  
+  /* Reset marker to EMPTY */
+  for (i = 0; i < n; i++) marker[i] = EMPTY;
+  /* restore rdispls information */
+  for (p = 0; p < nprocs_i; p++)
+    rdispls[p] = sdispls[p];
+  
+  /* Second pass, compute each row of B, one at a time */
+  num_nz = 0;
+  t_ind = 0;
+  for (j = 0; j < m_loc; j++) {
+    b_rowptr_t[j] = num_nz;
+    
+    /* Flag the diagonal so it's not included in the B matrix */
+    marker[perm_r[j + fst_row]] = j;
+
+    /* Add pattern of row A(j,:) to B(j,:) */
+    for (i = rowptr[j]; i < rowptr[j+1]; i++) {
+      k = colind[i];
+      if ( marker[k] != j ) {
+	marker[k] = j;
+	b_colind_t[num_nz] = k; num_nz ++;
+      }
+    }
+
+    /* Add pattern of row T(j,:) to B(j,:) */
+    for (p = 0; p < nprocs_i; p++) {
+      t_ind = rdispls[p];
+      nelts = tcolind_recv[t_ind]; t_ind++;
+      for (i = t_ind; i < t_ind + nelts; i++) {
+	k = tcolind_recv[i];
+	if ( marker[k] != j ) {
+	  marker[k] = j;
+	  b_colind_t[num_nz] = k; num_nz++;
+	}
+      }
+      t_ind += nelts;
+      rdispls[p] = t_ind;
+    }
+  }
+  b_rowptr_t[m_loc] = num_nz;
+
+  /* a different row distribution on output also forces a redistribution */
+  for (p = 0; p <= SUPERLU_MIN(nprocs_i, nprocs_o); p++) 
+    if (vtxdist_i[p] != vtxdist_o[p])
+      redist_pra = TRUE;
+  if (sz_tcolind_recv) {
+    SUPERLU_FREE (tcolind_recv);
+    apat_mem -= sz_tcolind_recv;
+  }
+  SUPERLU_FREE (marker);
+  SUPERLU_FREE (iperm_r);
+  apat_mem -= 2 * n + 1;
+  
+  /* redistribute permuted matrix (by rows) from nproc_i processors
+     to nproc_o processors */
+  if (redist_pra) {
+    m_loc_o = vtxdist_o[iam+1] - vtxdist_o[iam];
+    fst_row_o = vtxdist_o[iam];
+    nnz_loc = 0;
+    
+    if ( !(b_rowptr = intMalloc_dist(m_loc_o + 1)) )
+      ABORT("Malloc fails for *b_rowptr[].");
+    apat_mem += m_loc_o + 1;
+    if (apat_mem > apat_mem_max)
+      apat_mem_max = apat_mem;
+
+    for (p = 0; p < nprocs_i; p++) {
+      sendCnts[p] = 0;
+      recvCnts[p] = 0;
+    }
+
+    for (i = 0; i < m_loc; i++) {
+      k = perm_r[i+fst_row];
+      /* find the processor to which row k belongs */
+      j = FALSE; p = 0;
+      while (!j) {
+	if (vtxdist_o[p] <= k && k < vtxdist_o[p+1])
+	  j = TRUE;
+	else 
+	  p ++;
+      }
+      if (p == iam) {
+	b_rowptr[k-fst_row_o] = b_rowptr_t[i + 1] - b_rowptr_t[i];
+	nnz_loc += b_rowptr[k-fst_row_o];
+      }
+      else /* each remote row is sent as: row number, length, entries */
+	sendCnts[p] += b_rowptr_t[i + 1] - b_rowptr_t[i] + 2;
+    }
+    /* exchange send/receive counts information in between all processors */
+    MPI_Alltoall (sendCnts, 1, mpi_int_t,
+		  recvCnts, 1, mpi_int_t, grid->comm);
+    
+    for (i = 0, j = 0, p = 0; p < nprocs_i; p++) {
+      rdispls[p] = j;
+      j += recvCnts[p];
+      sdispls[p] = i;  
+      i += sendCnts[p];
+    }
+    rdispls[p] = j; /* end markers, read as rdispls[p+1] further down */
+    sdispls[p] = i;
+    sz_tcolind_recv = j;
+    sz_tcolind_send = i;
+
+    /* allocate memory for local data */
+    tcolind_recv = NULL;
+    tcolind_send = NULL;
+    if (sz_tcolind_recv) {
+      if ( !(tcolind_recv = (int_t*) SUPERLU_MALLOC( sz_tcolind_recv 
+						     * sizeof(int_t) )))
+	ABORT("SUPERLU_MALLOC fails tcolind_recv[]");
+      apat_mem += sz_tcolind_recv;
+    }
+    /* allocate memory to send the rows owned by other processors */
+    if (sz_tcolind_send) {
+      if (!(tcolind_send = (int_t*) SUPERLU_MALLOC( (sz_tcolind_send) 
+						    * sizeof(int_t))))
+	ABORT("SUPERLU_MALLOC fails for tcolind_send[]");
+      apat_mem += sz_tcolind_send;
+    }
+    if (apat_mem > apat_mem_max)
+      apat_mem_max = apat_mem;
+
+    /* Copy data to be sent */
+    ind_rcv = rdispls[iam];
+    for (i = 0; i < m_loc; i++) {
+      k = perm_r[i+fst_row];
+      /* find the processor to which row k belongs */
+      j = FALSE; p = 0;
+      while (!j) {
+	if (vtxdist_o[p] <= k && k < vtxdist_o[p+1])
+	  j = TRUE;
+	else 
+	  p ++;
+      }
+      if (p != iam) { /* remote */ 
+	tcolind_send[sdispls[p]] = k;
+	tcolind_send[sdispls[p]+1] = b_rowptr_t[i+1] - b_rowptr_t[i];
+	sdispls[p] += 2;
+	for (j = b_rowptr_t[i]; j < b_rowptr_t[i+1]; j++) {
+	  tcolind_send[sdispls[p]] = b_colind_t[j]; sdispls[p] ++;
+	}
+      }
+    }
+  
+    /* reset sdispls vector */
+    for (i = 0, p = 0; p < nprocs_i; p++) {
+      sdispls[p] = i;  
+      i += sendCnts[p];
+    }
+    sendCnts[iam] = 0;
+    recvCnts[iam] = 0;
+    
+#if defined (_LONGINT)
+    for (p=0; p<nprocs; p++) {
+      if (sendCnts[p] > INT_MAX || sdispls[p] > INT_MAX ||
+	  recvCnts[p] > INT_MAX || rdispls[p] > INT_MAX)
+	ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
+      intBuf1[p] = (int) sendCnts[p];
+      intBuf2[p] = (int) sdispls[p];
+      intBuf3[p] = (int) recvCnts[p];
+      intBuf4[p] = (int) rdispls[p];
+    }
+#else  /* Default */
+    intBuf1 = sendCnts;  intBuf2 = sdispls;
+    intBuf3 = recvCnts;  intBuf4 = rdispls;
+#endif
+
+    /* send/receive permuted matrix T by rows */
+    MPI_Alltoallv (tcolind_send, intBuf1, intBuf2, mpi_int_t,
+		   tcolind_recv, intBuf3, intBuf4, mpi_int_t,
+		   grid->comm);
+    /* ------------------------------------------------------------
+       DEALLOCATE COMMUNICATION STORAGE
+       ------------------------------------------------------------*/
+    if (sz_tcolind_send) {
+      SUPERLU_FREE( tcolind_send );
+      apat_mem -= sz_tcolind_send;
+    }
+    
+    /* ------------------------------------------------------------
+       STORE ROWS IN ASCENDING ORDER OF THEIR NUMBER
+       ------------------------------------------------------------*/
+    for (p = 0; p < nprocs; p++) {
+      if (p != iam) {
+	i = rdispls[p];
+	while (i < rdispls[p+1]) {
+	  j = tcolind_recv[i];
+	  nelts = tcolind_recv[i+1];
+	  i += 2 + nelts;
+	  b_rowptr[j-fst_row_o] = nelts;
+	  nnz_loc += nelts;
+	}
+      }
+    }
+
+    if (nnz_loc) {
+      if ( !(b_colind = intMalloc_dist(nnz_loc)) )
+	ABORT("Malloc fails for bcolind[].");
+      apat_mem += nnz_loc; /* count b_colind[] once it is really allocated */
+      if (apat_mem > apat_mem_max)
+	apat_mem_max = apat_mem;
+    }
+
+    /* Initialize the array of row pointers */
+    k = 0;
+    for (j = 0; j < m_loc_o; j++) {
+      i = b_rowptr[j]; /* row length stored above becomes a running offset */
+      b_rowptr[j] = k;
+      k += i;
+    }
+    if (m_loc_o) b_rowptr[j] = k;
+    
+    /* Copy the data into the row oriented storage */
+    for (p = 0; p < nprocs; p++) {
+      if (p != iam) {
+	i = rdispls[p];
+	while (i < rdispls[p+1]) {
+	  j = tcolind_recv[i];
+	  nelts = tcolind_recv[i+1];
+	  for (i += 2, k = b_rowptr[j-fst_row_o]; 
+	       k < b_rowptr[j-fst_row_o+1]; i++, k++) 
+	    b_colind[k] = tcolind_recv[i];
+	}
+      }
+    }
+    /* rows that stay on this processor are copied straight from B_t */
+    for (i = 0; i < m_loc; i++) {
+      k = perm_r[i+fst_row];
+      if (k >= vtxdist_o[iam] && k < vtxdist_o[iam+1]) {
+	ind = b_rowptr[k-fst_row_o];
+	for (j = b_rowptr_t[i]; j < b_rowptr_t[i+1]; j++, ind++)
+	  b_colind[ind] = b_colind_t[j];
+      }
+    }
+    SUPERLU_FREE(b_rowptr_t);
+    if ( bnz_t )
+      SUPERLU_FREE(b_colind_t);
+    if (sz_tcolind_recv)
+      SUPERLU_FREE(tcolind_recv);
+    apat_mem -= bnz_t + m_loc + 1 + sz_tcolind_recv; /* b_rowptr_t had m_loc+1 */
+    
+    *p_bnz = nnz_loc;
+    *p_b_rowptr = b_rowptr;
+    *p_b_colind = b_colind; /* NULL when nnz_loc == 0 */
+  }
+  else { /* no need for redistribution */
+    *p_bnz = bnz_t;
+    *p_b_rowptr = b_rowptr_t;
+    *p_b_colind = b_colind_t; /* NULL when bnz_t == 0 */
+  }
+  
+  SUPERLU_FREE (rdispls);
+  SUPERLU_FREE (sdispls);
+  SUPERLU_FREE (sendCnts);
+  SUPERLU_FREE (recvCnts);
+  apat_mem -= 4 * nprocs + 2;
+#if defined (_LONGINT)
+  SUPERLU_FREE (intBuf1);
+  apat_mem -= 4*nprocs*sizeof(int) / sizeof(int_t);
+#endif
+  
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Exit a_plus_at_CompRow_loc()");
+#endif
+  
+  return (- apat_mem_max * sizeof(int_t)); /* peak int_t count, as -bytes */
+} /* a_plus_at_CompRow_loc */
+
+
diff --git a/SRC/html_mainpage.h b/SRC/html_mainpage.h
new file mode 100644
index 0000000..2a81f7b
--- /dev/null
+++ b/SRC/html_mainpage.h
@@ -0,0 +1,20 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! \mainpage SuperLU_DIST Documentation
+ 
+  SuperLU_DIST is a parallel library for the direct solution of large,
+  sparse, nonsymmetric systems of linear equations for distributed
+  memory machines. The library is written in C and MPI, and is callable
+  from either C or Fortran. The library routines perform an LU
+  decomposition with static pivoting and triangular system solutions
+  through forward and back substitution. 
+ 
+ */
diff --git a/SRC/machines.h b/SRC/machines.h
new file mode 100644
index 0000000..d4c1528
--- /dev/null
+++ b/SRC/machines.h
@@ -0,0 +1,63 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief These macros define which machine will be used
+ *
+ * <pre>
+ * -- SuperLU MT routine (version 1.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * August 15, 1997
+ *
+ * These macros define which machine will be used.
+ * </pre>
+ */
+
+#ifndef __SUPERLU_MACHINES /* allow multiple inclusions */
+#define __SUPERLU_MACHINES
+/* Integer codes for the machine types; MACH is set to one of these below. */
+#define SGI	        0
+#define ORIGIN	        1
+#define DEC	        2
+#define CRAY_T3E	3
+#define SUN             4
+#define PTHREAD         5
+#define IBM             6
+/* MACH is defined only if one of the platform macros tested below is defined. */
+#ifdef _SGI
+#define MACH SGI 
+#endif /* _SGI */
+
+#ifdef _ORIGIN
+#define MACH ORIGIN 
+#endif /* _ORIGIN */
+
+#ifdef _DEC
+#define MACH DEC 
+#endif /* _DEC */
+
+#ifdef _CRAY
+#define MACH CRAY_T3E 
+#endif /* _CRAY */
+
+#ifdef _SOLARIS
+#define MACH SUN 
+#endif /* _SOLARIS */
+
+#ifdef _PTHREAD
+#define MACH PTHREAD
+#endif /* _PTHREAD */
+
+#if ( defined(_SP2) || defined(_SP) )
+#define MACH IBM
+#endif /* _SP2 || _SP */
+
+#endif /* __SUPERLU_MACHINES */
diff --git a/SRC/mc64ad_dist.c b/SRC/mc64ad_dist.c
new file mode 100644
index 0000000..bf722fd
--- /dev/null
+++ b/SRC/mc64ad_dist.c
@@ -0,0 +1,2654 @@
+/* mc64ad.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "superlu_ddefs.h"
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+/* Both macros evaluate their arguments more than once; pass side-effect-free expressions only. */
+/* Table of constant values */
+/* (f2c passes constants by address, e.g. &c__1 in the mc64dd_dist/mc64ed_dist/mc64fd_dist calls) */
+static int_t c__1 = 1;
+static int_t c__2 = 2;
+
+/*! @file 
+ * \brief Permute large entries to the main diagonal
+ */
+/* CCCC COPYRIGHT (c) 1999  Council for the Central Laboratory of the */
+/* CCCC Research Councils.    All rights reserved. */
+/* CCCC PACKAGE MC64A/AD */
+/* CCCC AUTHORS Iain Duff (i.duff at rl.ac.uk) and Jacko Koster (jak at ii.uib.no) */
+/* CCCC LAST UPDATE 20/09/99 */
+/* CCCC */
+/* *** Conditions on external use *** */
+
+/* The user shall acknowledge the contribution of this */
+/* package in any publication of material dependent upon the use of */
+/* the package. The user shall use reasonable endeavours to notify */
+/* the authors of the package of this publication. */
+
+/* The user can modify this code but, at no time */
+/* shall the right or title to all or any part of this package pass */
+/* to the user. The user shall make available free of charge */
+/* to the authors for any purpose all information relating to any */
+/* alteration or addition made to this package for the purposes of */
+/* extending the capabilities or enhancing the performance of this */
+/* package. */
+
+/* The user shall not pass this code directly to a third party without the */
+/* express prior consent of the authors.  Users wanting to licence their */
+/* own copy of these routines should send email to hsl at aeat.co.uk */
+
+/* None of the comments from the Copyright notice up to and including this */
+/* one shall be removed or altered in any way. */
+/* ********************************************************************** */
+/* </pre>
+ */
+
+/* Subroutine */ int_t mc64id_dist(int_t *icntl)
+{
+    int_t slot;
+
+
+/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
+/*     Research Councils                                             *** */
+/* *** Although every effort has been made to ensure robustness and  *** */
+/* *** reliability of the subroutines in this MC64 suite, we         *** */
+/* *** disclaim any liability arising through the use or misuse of   *** */
+/* *** any of the subroutines.                                       *** */
+/* *** Any problems?   Contact ... */
+/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
+
+/*  Purpose */
+/*  ======= */
+
+/*  The components of the array ICNTL control the action of MC64A/AD. */
+/*  Default values for these are set in this subroutine. */
+
+/*  Parameters */
+/*  ========== */
+
+/*  ICNTL: array whose first 10 entries are overwritten with the defaults */
+/*  listed below.  The function always returns 0. */
+
+/*    ICNTL(1) has default value 6. */
+/*     It is the output stream for error messages. If it */
+/*     is negative, these messages will be suppressed. */
+
+/*    ICNTL(2) has default value 6. */
+/*     It is the output stream for warning messages. */
+/*     If it is negative, these messages are suppressed. */
+
+/*    ICNTL(3) has default value -1. */
+/*     It is the output stream for monitoring printing. */
+/*     If it is negative, these messages are suppressed. */
+
+/*    ICNTL(4) has default value 0. */
+/*     If left at the default value, the incoming data is checked for */
+/*     out-of-range indices and duplicates.  Setting ICNTL(4) to any */
+/*     other will avoid the checks but is likely to cause problems */
+/*     later if out-of-range indices or duplicates are present. */
+/*     The user should only set ICNTL(4) non-zero, if the data is */
+/*     known to avoid these problems. */
+
+/*    ICNTL(5) to ICNTL(10) are not used by MC64A/AD but are set to */
+/*     zero in this routine. */
+/* Initialization of the ICNTL array. */
+    /* Fortran ICNTL(k) is addressed directly as icntl[k-1]. */
+    icntl[0] = 6;     /* ICNTL(1) */
+    icntl[1] = 6;     /* ICNTL(2) */
+    icntl[2] = -1;    /* ICNTL(3) */
+
+    /* ICNTL(4) through ICNTL(10) all default to zero. */
+    slot = 3;
+    while (slot < 10) {
+	icntl[slot] = 0;
+	++slot;
+    }
+    return 0;
+} /* mc64id_ */
+
+/* ********************************************************************** */
+/* Subroutine */ int_t mc64ad_dist(int_t *job, int_t *n, int_t *ne, int_t *
+	ip, int_t *irn, double *a, int_t *num, int_t *cperm, 
+	int_t *liw, int_t *iw, int_t *ldw, double *dw, int_t *
+	icntl, int_t *info)
+{
+    /* System generated locals */
+    int_t i__1, i__2;
+    double d__1, d__2;
+
+    /* Builtin functions */
+    double log(double);
+
+    /* Local variables */
+    int_t i__, j, k;
+    double fact, rinf;
+
+    extern /* Subroutine */ int_t mc21ad_dist(int_t *, int_t *, int_t *, 
+	    int_t *, int_t *, int_t *, int_t *, int_t *),
+	    mc64bd_dist(int_t *, int_t *, int_t *, int_t *, double *, int_t 
+	    *, int_t *, int_t *, int_t *, int_t *, int_t *, double *),
+	    mc64rd_dist(int_t *, int_t *, int_t *, int_t *, double *),
+	    mc64sd_dist(int_t *, int_t *, int_t *, int_t *
+	    , double *, int_t *, int_t *, int_t *, int_t *, 
+	    int_t *, int_t *, int_t *, int_t *, int_t *),
+	    mc64wd_dist(int_t *, int_t *, int_t *, int_t *, double *, int_t 
+	    *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t 
+	    *, double *, double *);
+
+/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
+/*     Research Councils                                             *** */
+/* *** Although every effort has been made to ensure robustness and  *** */
+/* *** reliability of the subroutines in this MC64 suite, we         *** */
+/* *** disclaim any liability arising through the use or misuse of   *** */
+/* *** any of the subroutines.                                       *** */
+/* *** Any problems?   Contact ... */
+/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
+
+/*  Purpose */
+/*  ======= */
+
+/*! \brief
+ * <pre>
+ * This subroutine attempts to find a column permutation for an NxN 
+ * sparse matrix A = {a_ij} that makes the permuted matrix have N 
+ * entries on its diagonal. 
+ * If the matrix is structurally nonsingular, the subroutine optionally 
+ * returns a column permutation that maximizes the smallest element 
+ * on the diagonal, maximizes the sum of the diagonal entries, or 
+ * maximizes the product of the diagonal entries of the permuted matrix. 
+ * For the latter option, the subroutine also finds scaling factors 
+ * that may be used to scale the matrix so that the nonzero diagonal 
+ * entries of the permuted matrix are one in absolute value and all the 
+ * off-diagonal entries are less than or equal to one in absolute value. 
+ * The natural logarithms of the scaling factors u(i), i=1..N, for the 
+ * rows and v(j), j=1..N, for the columns are returned so that the 
+ * scaled matrix B = {b_ij} has entries b_ij = a_ij * EXP(u_i + v_j). 
+ * </pre>
+ */
+ 
+/*  Parameters */
+/*  ========== */
+
+
+/* JOB is an INT_T variable which must be set by the user to */
+/* control the action. It is not altered by the subroutine. */
+/* Possible values for JOB are: */
+/*   1 Compute a column permutation of the matrix so that the */
+/*     permuted matrix has as many entries on its diagonal as possible. */
+/*     The values on the diagonal are of arbitrary size. HSL subroutine */
+/*     MC21A/AD is used for this. See [1]. */
+/*   2 Compute a column permutation of the matrix so that the smallest */
+/*     value on the diagonal of the permuted matrix is maximized. */
+/*     See [3]. */
+/*   3 Compute a column permutation of the matrix so that the smallest */
+/*     value on the diagonal of the permuted matrix is maximized. */
+/*     The algorithm differs from the one used for JOB = 2 and may */
+/*     have quite a different performance. See [2]. */
+/*   4 Compute a column permutation of the matrix so that the sum */
+/*     of the diagonal entries of the permuted matrix is maximized. */
+/*     See [3]. */
+/*   5 Compute a column permutation of the matrix so that the product */
+/*     of the diagonal entries of the permuted matrix is maximized */
+/*     and vectors to scale the matrix so that the nonzero diagonal */
+/*     entries of the permuted matrix are one in absolute value and */
+/*     all the off-diagonal entries are less than or equal to one in */
+/*     absolute value. See [3]. */
+/*  Restriction: 1 <= JOB <= 5. */
+
+/* N is an INT_T variable which must be set by the user to the */
+/*   order of the matrix A. It is not altered by the subroutine. */
+/*   Restriction: N >= 1. */
+
+/* NE is an INT_T variable which must be set by the user to the */
+/*   number of entries in the matrix. It is not altered by the */
+/*   subroutine. */
+/*   Restriction: NE >= 1. */
+
+/* IP is an INT_T array of length N+1. */
+/*   IP(J), J=1..N, must be set by the user to the position in array IRN */
+/*   of the first row index of an entry in column J. IP(N+1) must be set */
+/*   to NE+1. It is not altered by the subroutine. */
+
+/* IRN is an INT_T array of length NE. */
+/*   IRN(K), K=1..NE, must be set by the user to hold the row indices of */
+/*   the entries of the matrix. Those belonging to column J must be */
+/*   stored contiguously in the positions IP(J)..IP(J+1)-1. The ordering */
+/*   of the row indices within each column is unimportant. Repeated */
+/*   entries are not allowed. The array IRN is not altered by the */
+/*   subroutine. */
+
+/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */
+/*   The user must set A(K), K=1..NE, to the numerical value of the */
+/*   entry that corresponds to IRN(K). */
+/*   It is not used by the subroutine when JOB = 1. */
+/*   It is not altered by the subroutine. */
+
+/* NUM is an INT_T variable that need not be set by the user. */
+/*   On successful exit, NUM will be the number of entries on the */
+/*   diagonal of the permuted matrix. */
+/*   If NUM < N, the matrix is structurally singular. */
+
+/* CPERM is an INT_T array of length N that need not be set by the */
+/*   user. On successful exit, CPERM contains the column permutation. */
+/*   Column CPERM(J) of the original matrix is column J in the permuted */
+/*   matrix, J=1..N. */
+
+/* LIW is an INT_T variable that must be set by the user to */
+/*   the dimension of array IW. It is not altered by the subroutine. */
+/*   Restriction: */
+/*     JOB = 1 :  LIW >= 5N */
+/*     JOB = 2 :  LIW >= 4N */
+/*     JOB = 3 :  LIW >= 10N + NE */
+/*     JOB = 4 :  LIW >= 5N */
+/*     JOB = 5 :  LIW >= 5N */
+
+/* IW is an INT_T array of length LIW that is used for workspace. */
+
+/* LDW is an INT_T variable that must be set by the user to the */
+/*   dimension of array DW. It is not altered by the subroutine. */
+/*   Restriction: */
+/*     JOB = 1 :  LDW is not used */
+/*     JOB = 2 :  LDW >= N */
+/*     JOB = 3 :  LDW >= NE */
+/*     JOB = 4 :  LDW >= 2N + NE */
+/*     JOB = 5 :  LDW >= 3N + NE */
+
+/* DW is a REAL (DOUBLE PRECISION in the D-version) array of length LDW */
+/*   that is used for workspace. If JOB = 5, on return, */
+/*   DW(i) contains u_i, i=1..N, and DW(N+j) contains v_j, j=1..N. */
+
+/* ICNTL is an INT_T array of length 10. Its components control the */
+/*   output of MC64A/AD and must be set by the user before calling */
+/*   MC64A/AD. They are not altered by the subroutine. */
+
+/*   ICNTL(1) must be set to specify the output stream for */
+/*   error messages. If ICNTL(1) < 0, messages are suppressed. */
+/*   The default value set by MC64I/ID is 6. */
+
+/*   ICNTL(2) must be set by the user to specify the output stream for */
+/*   warning messages. If ICNTL(2) < 0, messages are suppressed. */
+/*   The default value set by MC64I/ID is 6. */
+
+/*   ICNTL(3) must be set by the user to specify the output stream for */
+/*   diagnostic messages. If ICNTL(3) < 0, messages are suppressed. */
+/*   The default value set by MC64I/ID is -1. */
+
+/*   ICNTL(4) must be set by the user to a value other than 0 to avoid */
+/*   checking of the input data. */
+/*   The default value set by MC64I/ID is 0. */
+
+/* INFO is an INT_T array of length 10 which need not be set by the */
+/*   user. INFO(1) is set non-negative to indicate success. A negative */
+/*   value is returned if an error occurred, a positive value if a */
+/*   warning occurred. INFO(2) holds further information on the error. */
+/*   On exit from the subroutine, INFO(1) will take one of the */
+/*   following values: */
+/*    0 : successful entry (for structurally nonsingular matrix). */
+/*   +1 : successful entry (for structurally singular matrix). */
+/*   +2 : the returned scaling factors are large and may cause */
+/*        overflow when used to scale the matrix. */
+/*        (For JOB = 5 entry only.) */
+/*   -1 : JOB < 1 or JOB > 5.  Value of JOB held in INFO(2). */
+/*   -2 : N < 1.  Value of N held in INFO(2). */
+/*   -3 : NE < 1. Value of NE held in INFO(2). */
+/*   -4 : the defined length LIW violates the restriction on LIW. */
+/*        Value of LIW required given by INFO(2). */
+/*   -5 : the defined length LDW violates the restriction on LDW. */
+/*        Value of LDW required given by INFO(2). */
+/*   -6 : entries are found whose row indices are out of range. INFO(2) */
+/*        contains the index of a column in which such an entry is found. */
+/*   -7 : repeated entries are found. INFO(2) contains the index of a */
+/*        column in which such entries are found. */
+/*  INFO(3) to INFO(10) are not currently used and are set to zero by */
+/*        the routine. */
+
+/* References: */
+/*  [1]  I. S. Duff, (1981), */
+/*       "Algorithm 575. Permutations for a zero-free diagonal", */
+/*       ACM Trans. Math. Software 7(3), 387-390. */
+/*  [2]  I. S. Duff and J. Koster, (1998), */
+/*       "The design and use of algorithms for permuting large */
+/*       entries to the diagonal of sparse matrices", */
+/*       SIAM J. Matrix Anal. Appl., vol. 20, no. 4, pp. 889-901. */
+/*  [3]  I. S. Duff and J. Koster, (1999), */
+/*       "On algorithms for permuting large entries to the diagonal */
+/*       of sparse matrices", */
+/*       Technical Report RAL-TR-1999-030, RAL, Oxfordshire, England. */
+/* Local variables and parameters */
+/* External routines and functions */
+/*     EXTERNAL FD05AD */
+/*     DOUBLE PRECISION FD05AD */
+/* Intrinsic functions */
+/* Set RINF to largest positive real number (infinity) */
+/* XSL    RINF = FD05AD(5) */
+    /* Parameter adjustments */
+    --cperm;
+    --ip;
+    --a;
+    --irn;
+    --iw;
+    --dw;
+    --icntl;
+    --info;
+
+    /* Function Body */
+    rinf = dmach_dist("Overflow");
+/* Check value of JOB */
+    if (*job < 1 || *job > 5) {
+	info[1] = -1;
+	info[2] = *job;
+	if (icntl[1] >= 0) {
+	    printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT 
+		   " because JOB = " IFMT "\n",  info[1], *job);
+	}
+	goto L99;
+    }
+/* Check value of N */
+    if (*n < 1) {
+	info[1] = -2;
+	info[2] = *n;
+	if (icntl[1] >= 0) {
+	    printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT 
+		   " because N = " IFMT "\n", info[1], *n);
+	}
+	goto L99;
+    }
+/* Check value of NE */
+    if (*ne < 1) {
+	info[1] = -3;
+	info[2] = *ne;
+	if (icntl[1] >= 0) {
+	    printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT
+		   " because NE = " IFMT "\n", info[1], *ne);
+	}
+	goto L99;
+    }
+/* Check LIW */
+    if (*job == 1) {
+	k = *n * 5;
+    }
+    if (*job == 2) {
+	k = *n << 2;
+    }
+    if (*job == 3) {
+	k = *n * 10 + *ne;
+    }
+    if (*job == 4) {
+	k = *n * 5;
+    }
+    if (*job == 5) {
+	k = *n * 5;
+    }
+    if (*liw < k) {
+	info[1] = -4;
+	info[2] = k;
+	if (icntl[1] >= 0) {
+	    printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT 
+		   " LIW too small, must be at least " IFMT "\n", info[1], k);
+	}
+	goto L99;
+    }
+/* Check LDW */
+/* If JOB = 1, do not check */
+    if (*job > 1) {
+	if (*job == 2) {
+	    k = *n;
+	}
+	if (*job == 3) {
+	    k = *ne;
+	}
+	if (*job == 4) {
+	    k = (*n << 1) + *ne;
+	}
+	if (*job == 5) {
+	    k = *n * 3 + *ne;
+	}
+	if (*ldw < k) {
+	    info[1] = -5;
+	    info[2] = k;
+	    if (icntl[1] >= 0) {
+		printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT 
+		       " LDW too small, must be at least " IFMT "\n", info[1], k);
+	    }
+	    goto L99;
+	}
+    }
+    if (icntl[4] == 0) {
+/* Check row indices. Use IW(1:N) as workspace */
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    iw[i__] = 0;
+/* L3: */
+	}
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = ip[j + 1] - 1;
+	    for (k = ip[j]; k <= i__2; ++k) {
+		i__ = irn[k];
+/* Check for row indices that are out of range */
+		if (i__ < 1 || i__ > *n) {
+		    info[1] = -6;
+		    info[2] = j;
+		    if (icntl[1] >= 0) {
+			printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT 
+			       " Column " IFMT 
+			       " contains an entry with invalid row index " IFMT "\n",
+			       info[1], j, i__);
+		    }
+		    goto L99;
+		}
+/* Check for repeated row indices within a column */
+		if (iw[i__] == j) {
+		    info[1] = -7;
+		    info[2] = j;
+		    if (icntl[1] >= 0) {
+			printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT 
+			       "        Column " IFMT
+			       " contains two or more entries with row index " IFMT "\n",
+			       info[1], j, i__);
+		    }
+		    goto L99;
+		} else {
+		    iw[i__] = j;
+		}
+/* L4: */
+	    }
+/* L6: */
+	    }
+    }
+/* Print diagnostics on input */
+    if (icntl[3] >= 0) {
+	printf("  ****** Input parameters for MC64A/AD: JOB = " IFMT ","
+	       " N = " IFMT ", NE = " IFMT "\n", *job, *n, *ne);
+	printf(" IP(1:N+1)   = ");
+	for (j=1; j<=(*n+1); ++j) {
+	    printf(IFMT, ip[j]);
+	    if (j%8 == 0) printf("\n");
+	}
+	printf("\n IRN(1:NE) = ");
+	for (j=1; j<=(*ne); ++j) {
+	    printf(IFMT, irn[j]);
+	    if (j%8 == 0) printf("\n");
+	}
+	printf("\n");
+
+	if (*job > 1) {
+	    printf(" A(1:NE)     = ");
+	    for (j=1; j<=(*ne); ++j) {
+		printf("%14.4f", a[j]);
+		if (j%4 == 0) printf("\n");
+	    }
+	    printf("\n");
+	}
+    }
+/* Set components of INFO to zero */
+    for (i__ = 1; i__ <= 10; ++i__) {
+	info[i__] = 0;
+/* L8: */
+    }
+/* Compute maximum matching with MC21A/AD */
+    if (*job == 1) {
+/* Put length of column J in IW(J) */
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    iw[j] = ip[j + 1] - ip[j];
+/* L10: */
+	}
+/* IW(N+1:5N) is workspace. NOTE: the MC21A/AD call is compiled out, so NUM and CPERM are not set for JOB = 1. */
+#if 0
+	mc21ad_(n, &irn[1], ne, &ip[1], &iw[1], &cperm[1], num, &iw[*n+1]);
+#else
+	printf(" ****** Warning from MC64A/AD. Need to link mc21ad.\n");
+#endif
+	goto L90;
+    }
+/* Compute bottleneck matching */
+    if (*job == 2) {
+/* IW(1:4N), DW(1:N) are workspaces */
+	mc64bd_dist(n, ne, &ip[1], &irn[1], &a[1], &cperm[1], num,
+		    &iw[1], &iw[*n + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1],
+		    &dw[1]);
+	goto L90;
+    }
+/* Compute bottleneck matching */
+    if (*job == 3) {
+/* Copy IRN(K) into IW(K), ABS(A(K)) into DW(K), K=1..NE */
+	i__1 = *ne;
+	for (k = 1; k <= i__1; ++k) {
+	    iw[k] = irn[k];
+	    dw[k] = (d__1 = a[k], abs(d__1));
+/* L20: */
+	}
+/* Sort entries in each column by decreasing value. */
+	mc64rd_dist(n, ne, &ip[1], &iw[1], &dw[1]);
+/* IW(NE+1:NE+10N) is workspace */
+	mc64sd_dist(n, ne, &ip[1], &iw[1], &dw[1], &cperm[1], num,
+		    &iw[*ne + 1], &iw[*ne + *n + 1], &iw[*ne + (*n << 1) + 1],
+		    &iw[*ne + *n * 3 + 1], &iw[*ne + (*n << 2) + 1],
+		    &iw[*ne + *n * 5 + 1], &iw[*ne + *n * 6 + 1]);
+	goto L90;
+    }
+    if (*job == 4) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    fact = 0.;
+	    i__2 = ip[j + 1] - 1;
+	    for (k = ip[j]; k <= i__2; ++k) {
+		if ((d__1 = a[k], abs(d__1)) > fact) {
+		    fact = (d__2 = a[k], abs(d__2));
+		}
+/* L30: */
+	    }
+	    i__2 = ip[j + 1] - 1;
+	    for (k = ip[j]; k <= i__2; ++k) {
+		dw[(*n << 1) + k] = fact - (d__1 = a[k], abs(d__1));
+/* L40: */
+	    }
+/* L50: */
+	}
+/* B = DW(2N+1:2N+NE); IW(1:5N) and DW(1:2N) are workspaces */
+	mc64wd_dist(n, ne, &ip[1], &irn[1], &dw[(*n << 1) + 1], &cperm[1],
+		    num, &iw[1], &iw[*n + 1], &iw[(*n << 1) + 1], 
+		    &iw[*n * 3 + 1], &iw[(*n << 2) + 1], &dw[1], &dw[*n + 1]);
+	goto L90;
+    }
+    if (*job == 5) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    fact = 0.;
+	    i__2 = ip[j + 1] - 1;
+	    for (k = ip[j]; k <= i__2; ++k) {
+		dw[*n * 3 + k] = (d__1 = a[k], abs(d__1));
+		if (dw[*n * 3 + k] > fact) {
+		    fact = dw[*n * 3 + k];
+		}
+/* L60: */
+	    }
+	    dw[(*n << 1) + j] = fact;
+	    if (fact != 0.) {
+		fact = log(fact);
+	    } else {
+		fact = rinf / *n;
+	    }
+	    i__2 = ip[j + 1] - 1;
+	    for (k = ip[j]; k <= i__2; ++k) {
+		if (dw[*n * 3 + k] != 0.) {
+		    dw[*n * 3 + k] = fact - log(dw[*n * 3 + k]);
+		} else {
+		    dw[*n * 3 + k] = rinf / *n;
+		}
+/* L70: */
+	    }
+/* L75: */
+	}
+/* B = DW(3N+1:3N+NE); IW(1:5N) and DW(1:2N) are workspaces */
+	mc64wd_dist(n, ne, &ip[1], &irn[1], &dw[*n * 3 + 1], &cperm[1],
+		    num, &iw[1], &iw[*n + 1], &iw[(*n << 1) + 1],
+		    &iw[*n * 3 + 1], &iw[(*n << 2) + 1], &dw[1], &dw[*n + 1]);
+	if (*num == *n) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		if (dw[(*n << 1) + j] != 0.) {
+		    dw[*n + j] -= log(dw[(*n << 1) + j]);
+		} else {
+		    dw[*n + j] = 0.;
+		}
+/* L80: */
+	    }
+	}
+/* Check size of scaling factors */
+	fact = log(rinf) * .5f;
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    if (dw[j] < fact && dw[*n + j] < fact) {
+		goto L86;
+	    }
+	    info[1] = 2;
+	    goto L90;
+L86:
+	    ;
+	}
+/*       GO TO 90 */
+    }
+L90:
+    if (info[1] == 0 && *num < *n) {
+/* Matrix is structurally singular, return with warning */
+	info[1] = 1;
+	if (icntl[2] >= 0) {
+	    printf(" ****** Warning from MC64A/AD. INFO(1) = " IFMT
+		   " The matrix is structurally singular.\n",  info[1]);
+	}
+    }
+    if (info[1] == 2) {
+/* Scaling factors are large, return with warning */
+	if (icntl[2] >= 0) {
+	    printf(" ****** Warning from MC64A/AD. INFO(1) = " IFMT "\n"
+		   "        Some scaling factors may be too large.\n", info[1]);
+	}
+    }
+/* Print diagnostics on output */
+    if (icntl[3] >= 0) {
+	printf(" ****** Output parameters for MC64A/AD: INFO(1:2)  = " IFMT IFMT "\n",
+	       info[1], info[2]);
+	printf(" NUM        = " IFMT, *num);
+	printf(" CPERM(1:N) = ");
+	for (j=1; j<=*n; ++j) {
+	    printf(IFMT, cperm[j]);
+	    if (j%8 == 0) printf("\n");
+	}
+	if (*job == 5) {
+	    printf("\n DW(1:N)    = ");
+	    for (j=1; j<=*n; ++j) {
+		printf("%11.3f", dw[j]);
+		if (j%5 == 0) printf("\n");
+	    }
+	    printf("\n DW(N+1:2N) = ");
+	    for (j=1; j<=*n; ++j) {
+		printf("%11.3f", dw[*n+j]);
+		if (j%5 == 0) printf("\n");
+	    }
+	    printf("\n");
+	}
+    }
+/* Return from subroutine. */
+L99:
+    return 0;
+} /* mc64ad_ */
+
+/* ********************************************************************** */
+/* Subroutine */ int_t mc64bd_dist(int_t *n, int_t *ne, int_t *ip, int_t *
+	irn, double *a, int_t *iperm, int_t *num, int_t *jperm, 
+	int_t *pr, int_t *q, int_t *l, double *d__)
+{
+    /* System generated locals */
+    int_t i__1, i__2, i__3;
+    double d__1, d__2, d__3;
+
+    /* Local variables */
+    int_t i__, j, k;
+    double a0;
+    int_t i0, q0;
+    double ai, di;
+    int_t ii, jj, kk;
+    double bv;
+    int_t up;
+    double dq0;
+    int_t kk1, kk2;
+    double csp;
+    int_t isp, jsp, low;
+    double dnew;
+    int_t jord, qlen, idum, jdum;
+    double rinf;
+    extern /* Subroutine */ int_t mc64dd_dist(int_t *, int_t *, int_t *, 
+	    double *, int_t *, int_t *), mc64ed_dist(int_t *, int_t *,
+	     int_t *, double *, int_t *, int_t *), mc64fd_dist(int_t *
+	    , int_t *, int_t *, int_t *, double *, int_t *, int_t *);
+
+
+/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
+/*     Research Councils                                             *** */
+/* *** Although every effort has been made to ensure robustness and  *** */
+/* *** reliability of the subroutines in this MC64 suite, we         *** */
+/* *** disclaim any liability arising through the use or misuse of   *** */
+/* *** any of the subroutines.                                       *** */
+/* *** Any problems?   Contact ... */
+/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
+
+/* N, NE, IP, IRN are described in MC64A/AD. */
+/* A is a REAL (DOUBLE PRECISION in the D-version) array of length */
+/*   NE. A(K), K=1..NE, must be set to the value of the entry */
+/*   that corresponds to IRN(K). It is not altered. */
+/* IPERM is an INT_T array of length N. On exit, it contains the */
+/*    matching: IPERM(I) = 0 or row I is matched to column IPERM(I). */
+/* NUM is INT_T variable. On exit, it contains the cardinality of the */
+/*    matching stored in IPERM. */
+/* IW is an INT_T work array of length 4N. */
+/* DW is a REAL (DOUBLE PRECISION in D-version) work array of length N. */
+/* Local variables */
+/* Local parameters */
+/* Intrinsic functions */
+/* External subroutines and/or functions */
+/*      EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD, DMACH */
+/*      DOUBLE PRECISION FD05AD, DMACH */
+/* Set RINF to largest positive real number */
+/* XSL  RINF = FD05AD(5) */
+    /* Parameter adjustments */
+    --d__;
+    --l;
+    --q;
+    --pr;
+    --jperm;
+    --iperm;
+    --ip;
+    --a;
+    --irn;
+
+    /* Function Body */
+    rinf = dmach_dist("Overflow");
+/* Initialization */
+    *num = 0;
+    bv = rinf;
+    i__1 = *n;
+    for (k = 1; k <= i__1; ++k) {
+	iperm[k] = 0;
+	jperm[k] = 0;
+	pr[k] = ip[k];
+	d__[k] = 0.;
+/* L10: */
+    }
+/* Scan columns of matrix; */
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	a0 = -1.;
+	i__2 = ip[j + 1] - 1;
+	for (k = ip[j]; k <= i__2; ++k) {
+	    i__ = irn[k];
+	    ai = (d__1 = a[k], abs(d__1));
+	    if (ai > d__[i__]) {
+		d__[i__] = ai;
+	    }
+	    if (jperm[j] != 0) {
+		goto L30;
+	    }
+	    if (ai >= bv) {
+		a0 = bv;
+		if (iperm[i__] != 0) {
+		    goto L30;
+		}
+		jperm[j] = i__;
+		iperm[i__] = j;
+		++(*num);
+	    } else {
+		if (ai <= a0) {
+		    goto L30;
+		}
+		a0 = ai;
+		i0 = i__;
+	    }
+L30:
+	    ;
+	}
+	if (a0 != -1. && a0 < bv) {
+	    bv = a0;
+	    if (iperm[i0] != 0) {
+		goto L20;
+	    }
+	    iperm[i0] = j;
+	    jperm[j] = i0;
+	    ++(*num);
+	}
+L20:
+	;
+    }
+/* Update BV with smallest of all the largest maximum absolute values */
+/* of the rows. */
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MIN */
+	d__1 = bv, d__2 = d__[i__];
+	bv = min(d__1,d__2);
+/* L25: */
+    }
+    if (*num == *n) {
+	goto L1000;
+    }
+/* Rescan unassigned columns; improve initial assignment */
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	if (jperm[j] != 0) {
+	    goto L95;
+	}
+	i__2 = ip[j + 1] - 1;
+	for (k = ip[j]; k <= i__2; ++k) {
+	    i__ = irn[k];
+	    ai = (d__1 = a[k], abs(d__1));
+	    if (ai < bv) {
+		goto L50;
+	    }
+	    if (iperm[i__] == 0) {
+		goto L90;
+	    }
+	    jj = iperm[i__];
+	    kk1 = pr[jj];
+	    kk2 = ip[jj + 1] - 1;
+	    if (kk1 > kk2) {
+		goto L50;
+	    }
+	    i__3 = kk2;
+	    for (kk = kk1; kk <= i__3; ++kk) {
+		ii = irn[kk];
+		if (iperm[ii] != 0) {
+		    goto L70;
+		}
+		if ((d__1 = a[kk], abs(d__1)) >= bv) {
+		    goto L80;
+		}
+L70:
+		;
+	    }
+	    pr[jj] = kk2 + 1;
+L50:
+	    ;
+	}
+	goto L95;
+L80:
+	jperm[jj] = ii;
+	iperm[ii] = jj;
+	pr[jj] = kk + 1;
+L90:
+	++(*num);
+	jperm[j] = i__;
+	iperm[i__] = j;
+	pr[j] = k + 1;
+L95:
+	;
+    }
+    if (*num == *n) {
+	goto L1000;
+    }
+/* Prepare for main loop */
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	d__[i__] = -1.;
+	l[i__] = 0;
+/* L99: */
+    }
+/* Main loop ... each pass round this loop is similar to Dijkstra's */
+/* algorithm for solving the single source shortest path problem */
+    i__1 = *n;
+    for (jord = 1; jord <= i__1; ++jord) {
+	if (jperm[jord] != 0) {
+	    goto L100;
+	}
+	qlen = 0;
+	low = *n + 1;
+	up = *n + 1;
+/* CSP is cost of shortest path to any unassigned row */
+/* ISP is matrix position of unassigned row element in shortest path */
+/* JSP is column index of unassigned row element in shortest path */
+	csp = -1.;
+/* Build shortest path tree starting from unassigned column JORD */
+	j = jord;
+	pr[j] = -1;
+/* Scan column J */
+	i__2 = ip[j + 1] - 1;
+	for (k = ip[j]; k <= i__2; ++k) {
+	    i__ = irn[k];
+	    dnew = (d__1 = a[k], abs(d__1));
+	    if (csp >= dnew) {
+		goto L115;
+	    }
+	    if (iperm[i__] == 0) {
+/* Row I is unassigned; update shortest path info */
+		csp = dnew;
+		isp = i__;
+		jsp = j;
+		if (csp >= bv) {
+		    goto L160;
+		}
+	    } else {
+		d__[i__] = dnew;
+		if (dnew >= bv) {
+/* Add row I to Q2 */
+		    --low;
+		    q[low] = i__;
+		} else {
+/* Add row I to Q, and push it */
+		    ++qlen;
+		    l[i__] = qlen;
+		    mc64dd_dist(&i__, n, &q[1], &d__[1], &l[1], &c__1);
+		}
+		jj = iperm[i__];
+		pr[jj] = j;
+	    }
+L115:
+	    ;
+	}
+	i__2 = *num;
+	for (jdum = 1; jdum <= i__2; ++jdum) {
+/* If Q2 is empty, extract new rows from Q */
+	    if (low == up) {
+		if (qlen == 0) {
+		    goto L160;
+		}
+		i__ = q[1];
+		if (csp >= d__[i__]) {
+		    goto L160;
+		}
+		bv = d__[i__];
+		i__3 = *n;
+		for (idum = 1; idum <= i__3; ++idum) {
+		    mc64ed_dist(&qlen, n, &q[1], &d__[1], &l[1], &c__1);
+		    l[i__] = 0;
+		    --low;
+		    q[low] = i__;
+		    if (qlen == 0) {
+			goto L153;
+		    }
+		    i__ = q[1];
+		    if (d__[i__] != bv) {
+			goto L153;
+		    }
+/* L152: */
+		}
+/* End of dummy loop; this point is never reached */
+	    }
+/* Move row Q0 */
+L153:
+	    --up;
+	    q0 = q[up];
+	    dq0 = d__[q0];
+	    l[q0] = up;
+/* Scan column that matches with row Q0 */
+	    j = iperm[q0];
+	    i__3 = ip[j + 1] - 1;
+	    for (k = ip[j]; k <= i__3; ++k) {
+		i__ = irn[k];
+/* Update D(I) */
+		if (l[i__] >= up) {
+		    goto L155;
+		}
+/* Computing MIN */
+		d__2 = dq0, d__3 = (d__1 = a[k], abs(d__1));
+		dnew = min(d__2,d__3);
+		if (csp >= dnew) {
+		    goto L155;
+		}
+		if (iperm[i__] == 0) {
+/* Row I is unassigned; update shortest path info */
+		    csp = dnew;
+		    isp = i__;
+		    jsp = j;
+		    if (csp >= bv) {
+			goto L160;
+		    }
+		} else {
+		    di = d__[i__];
+		    if (di >= bv || di >= dnew) {
+			goto L155;
+		    }
+		    d__[i__] = dnew;
+		    if (dnew >= bv) {
+/* Delete row I from Q (if necessary); add row I to Q2 */
+			if (di != -1.) {
+			    mc64fd_dist(&l[i__], &qlen, n, &q[1], &d__[1], &l[1], 
+				    &c__1);
+			}
+			l[i__] = 0;
+			--low;
+			q[low] = i__;
+		    } else {
+/* Add row I to Q (if necessary); push row I up Q */
+			if (di == -1.) {
+			    ++qlen;
+			    l[i__] = qlen;
+			}
+			mc64dd_dist(&i__, n, &q[1], &d__[1], &l[1], &c__1);
+		    }
+/* Update tree */
+		    jj = iperm[i__];
+		    pr[jj] = j;
+		}
+L155:
+		;
+	    }
+/* L150: */
+	}
+/* If CSP = MINONE, no augmenting path is found */
+L160:
+	if (csp == -1.) {
+	    goto L190;
+	}
+/* Update bottleneck value */
+	bv = min(bv,csp);
+/* Find augmenting path by tracing backward in PR; update IPERM,JPERM */
+	++(*num);
+	i__ = isp;
+	j = jsp;
+	i__2 = *num + 1;
+	for (jdum = 1; jdum <= i__2; ++jdum) {
+	    i0 = jperm[j];
+	    jperm[j] = i__;
+	    iperm[i__] = j;
+	    j = pr[j];
+	    if (j == -1) {
+		goto L190;
+	    }
+	    i__ = i0;
+/* L170: */
+	}
+/* End of dummy loop; this point is never reached */
+L190:
+	i__2 = *n;
+	for (kk = up; kk <= i__2; ++kk) {
+	    i__ = q[kk];
+	    d__[i__] = -1.;
+	    l[i__] = 0;
+/* L191: */
+	}
+	i__2 = up - 1;
+	for (kk = low; kk <= i__2; ++kk) {
+	    i__ = q[kk];
+	    d__[i__] = -1.;
+/* L192: */
+	}
+	i__2 = qlen;
+	for (kk = 1; kk <= i__2; ++kk) {
+	    i__ = q[kk];
+	    d__[i__] = -1.;
+	    l[i__] = 0;
+/* L193: */
+	}
+L100:
+	;
+    }
+/* End of main loop */
+/* BV is bottleneck value of final matching */
+    if (*num == *n) {
+	goto L1000;
+    }
+/* Matrix is structurally singular, complete IPERM. */
+/* JPERM, PR are work arrays */
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	jperm[j] = 0;
+/* L300: */
+    }
+    k = 0;
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if (iperm[i__] == 0) {
+	    ++k;
+	    pr[k] = i__;
+	} else {
+	    j = iperm[i__];
+	    jperm[j] = i__;
+	}
+/* L310: */
+    }
+    k = 0;
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if (jperm[i__] != 0) {
+	    goto L320;
+	}
+	++k;
+	jdum = pr[k];
+	iperm[jdum] = i__;
+L320:
+	;
+    }
+L1000:
+    return 0;
+} /* mc64bd_ */
+
+/* ********************************************************************** */
+/* Subroutine */ int_t mc64dd_dist(int_t *i__, int_t *n, int_t *q,
+                                   double *d__, int_t *l, int_t *iway)
+{
+    /* Key of the node being moved, and heap bookkeeping */
+    double node_key;
+    int_t slot, parent_slot, parent_node;
+    int_t step, max_steps, root_is_max;
+
+
+/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
+/*     Research Councils                                             *** */
+/* *** Although every effort has been made to ensure robustness and  *** */
+/* *** reliability of the subroutines in this MC64 suite, we         *** */
+/* *** disclaim any liability arising through the use or misuse of   *** */
+/* *** any of the subroutines.                                       *** */
+/* *** Any problems?   Contact ... */
+/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
+
+/* Moves node *I towards the root of the binary heap stored in Q. */
+/*   Q(P)  is the node held in heap slot P; the parent of slot P is */
+/*         slot P/2. */
+/*   L(K)  is the heap slot currently occupied by node K. */
+/*   D(K)  is the key of node K. */
+/* IWAY = 1 : the node climbs while its key is strictly larger than */
+/*            the key of its parent. */
+/* IWAY != 1: the node climbs while its key is strictly smaller than */
+/*            the key of its parent. */
+/* At most N parent steps are attempted. Always returns 0. */
+
+    /* Switch to the 1-based indexing of the Fortran original */
+    --l;
+    --d__;
+    --q;
+
+    node_key = d__[*i__];
+    slot = l[*i__];
+    root_is_max = (*iway == 1);
+    max_steps = *n;
+
+    for (step = 1; step <= max_steps; ++step) {
+        if (slot <= 1) {
+            /* Already at the root */
+            break;
+        }
+        parent_slot = slot / 2;
+        parent_node = q[parent_slot];
+        if (root_is_max) {
+            if (node_key <= d__[parent_node]) {
+                break;
+            }
+        } else {
+            if (node_key >= d__[parent_node]) {
+                break;
+            }
+        }
+        /* Pull the parent down into the vacated slot */
+        q[slot] = parent_node;
+        l[parent_node] = slot;
+        slot = parent_slot;
+    }
+
+    /* Drop the node into its final slot */
+    q[slot] = *i__;
+    l[*i__] = slot;
+    return 0;
+} /* mc64dd_dist */
+
+/* ********************************************************************** */
+/* Subroutine */ int_t mc64ed_dist(int_t *qlen, int_t *n, int_t *q,
+                                   double *d__, int_t *l, int_t *iway)
+{
+    /* Node taken from the end of the heap and its key */
+    int_t moved;
+    double moved_key;
+    /* Keys of the preferred child and of its sibling */
+    double child_key, sibling_key;
+    int_t slot, child, step, max_steps, root_is_max;
+
+
+/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
+/*     Research Councils                                             *** */
+/* *** Although every effort has been made to ensure robustness and  *** */
+/* *** reliability of the subroutines in this MC64 suite, we         *** */
+/* *** disclaim any liability arising through the use or misuse of   *** */
+/* *** any of the subroutines.                                       *** */
+/* *** Any problems?   Contact ... */
+/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
+
+/* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */
+/*     MC64W/WD (IWAY = 2) */
+/* Deletes the root of the binary heap: the node in the last slot is */
+/* taken out, QLEN is decremented, and that node is sifted down from */
+/* slot 1. */
+/* IWAY = 1 : the child with the larger key is preferred and the */
+/*            descent stops once the moved key is >= that child key. */
+/* IWAY != 1: the child with the smaller key is preferred and the */
+/*            descent stops once the moved key is <= that child key. */
+/* L of the removed root is not modified here. Always returns 0. */
+
+    /* Switch to the 1-based indexing of the Fortran original */
+    --l;
+    --d__;
+    --q;
+
+    moved = q[*qlen];
+    moved_key = d__[moved];
+    --(*qlen);
+    slot = 1;
+    root_is_max = (*iway == 1);
+    max_steps = *n;
+
+    for (step = 1; step <= max_steps; ++step) {
+        child = slot << 1;
+        if (child > *qlen) {
+            /* No children left: the hole is a leaf */
+            break;
+        }
+        child_key = d__[q[child]];
+        if (child < *qlen) {
+            /* A right sibling exists; pick whichever has priority */
+            sibling_key = d__[q[child + 1]];
+            if (root_is_max ? (child_key < sibling_key)
+                            : (child_key > sibling_key)) {
+                ++child;
+                child_key = sibling_key;
+            }
+        }
+        if (root_is_max ? (moved_key >= child_key)
+                        : (moved_key <= child_key)) {
+            break;
+        }
+        /* Promote the chosen child into the hole */
+        q[slot] = q[child];
+        l[q[slot]] = slot;
+        slot = child;
+    }
+
+    /* Settle the moved node */
+    q[slot] = moved;
+    l[moved] = slot;
+    return 0;
+} /* mc64ed_dist */
+
+/* ********************************************************************** */
+/* Subroutine */ int_t mc64fd_dist(int_t *pos0, int_t *qlen, int_t *n,
+                                   int_t *q, double *d__, int_t *l,
+                                   int_t *iway)
+{
+    /* Node taken from the end of the heap and its key */
+    int_t moved;
+    double moved_key;
+    double child_key, sibling_key;
+    int_t slot, parent_slot, parent_node, child, child_node;
+    int_t step, max_steps, root_is_max;
+
+
+/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
+/*     Research Councils                                             *** */
+/* *** Although every effort has been made to ensure robustness and  *** */
+/* *** reliability of the subroutines in this MC64 suite, we         *** */
+/* *** disclaim any liability arising through the use or misuse of   *** */
+/* *** any of the subroutines.                                       *** */
+/* *** Any problems?   Contact ... */
+/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
+
+/* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */
+/*     MC64WD (IWAY = 2). */
+/* Deletes the node in heap slot POS0: the node in the last slot is */
+/* moved into POS0, QLEN is decremented, and the moved node is first */
+/* sifted towards the root and then towards the leaves. */
+/* IWAY = 1 keeps larger keys nearer the root, any other value keeps */
+/* smaller keys nearer the root. */
+/* If POS0 is the last slot, only QLEN is decremented. */
+/* Always returns 0. */
+
+    /* Switch to the 1-based indexing of the Fortran original */
+    --l;
+    --d__;
+    --q;
+
+    /* Quick return: the deleted slot is the last one */
+    if (*qlen == *pos0) {
+        --(*qlen);
+        return 0;
+    }
+
+    /* Take the last node out; POS0 is read before L is written */
+    moved = q[*qlen];
+    moved_key = d__[moved];
+    --(*qlen);
+    slot = *pos0;
+    root_is_max = (*iway == 1);
+
+    /* Phase 1: climb towards the root */
+    max_steps = *n;
+    for (step = 1; step <= max_steps; ++step) {
+        if (slot <= 1) {
+            break;
+        }
+        parent_slot = slot / 2;
+        parent_node = q[parent_slot];
+        if (root_is_max ? (moved_key <= d__[parent_node])
+                        : (moved_key >= d__[parent_node])) {
+            break;
+        }
+        q[slot] = parent_node;
+        l[parent_node] = slot;
+        slot = parent_slot;
+    }
+    q[slot] = moved;
+    l[moved] = slot;
+
+    /* Phase 2: descend towards the leaves */
+    max_steps = *n;
+    for (step = 1; step <= max_steps; ++step) {
+        child = slot << 1;
+        if (child > *qlen) {
+            break;
+        }
+        child_key = d__[q[child]];
+        if (child < *qlen) {
+            sibling_key = d__[q[child + 1]];
+            if (root_is_max ? (child_key < sibling_key)
+                            : (child_key > sibling_key)) {
+                ++child;
+                child_key = sibling_key;
+            }
+        }
+        if (root_is_max ? (moved_key >= child_key)
+                        : (moved_key <= child_key)) {
+            break;
+        }
+        child_node = q[child];
+        q[slot] = child_node;
+        l[child_node] = slot;
+        slot = child;
+    }
+
+    /* Settle the moved node */
+    q[slot] = moved;
+    l[moved] = slot;
+    return 0;
+} /* mc64fd_dist */
+
+/* ********************************************************************** */
+/* Subroutine */ int_t mc64rd_dist(int_t *n, int_t *ne, int_t *ip,
+				   int_t *irn, double *a)
+{
+    /* Column loop */
+    int_t col, ncols, start, count;
+    /* Explicit stack of half-open intervals [todo(2t), todo(2t+1)) */
+    int_t todo[50], td;
+    int_t first, last, mid, k;
+    /* Insertion-sort cursors */
+    int_t r__, s, stop;
+    /* Scratch for moving a value / row-index pair */
+    double ha, key;
+    int_t hi;
+    int_t two_values;
+
+
+/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
+/*     Research Councils                                             *** */
+/* *** Although every effort has been made to ensure robustness and  *** */
+/* *** reliability of the subroutines in this MC64 suite, we         *** */
+/* *** disclaim any liability arising through the use or misuse of   *** */
+/* *** any of the subroutines.                                       *** */
+/* *** Any problems?   Contact ... */
+/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
+
+/* Sorts the entries of every column of the sparse matrix (N,IP,IRN,A) */
+/* by decreasing numerical value, permuting IRN along with A. */
+/* Columns with at least 15 entries are first roughly ordered by a */
+/* partial quicksort that stops splitting intervals shorter than 15; */
+/* every column with more than one entry is then finished by */
+/* straightforward insertion. NE is not referenced. Always returns 0. */
+
+    /* Switch to the 1-based indexing of the Fortran original */
+    --ip;
+    --a;
+    --irn;
+
+    ncols = *n;
+    for (col = 1; col <= ncols; ++col) {
+        count = ip[col + 1] - ip[col];
+        if (count <= 1) {
+            continue;
+        }
+        start = ip[col];
+
+        if (count >= 15) {
+            /* Rough ordering with partial quicksort */
+            todo[0] = start;
+            todo[1] = start + count;
+            td = 2;
+            while (td != 0) {
+                first = todo[td - 2];
+                last = todo[td - 1];
+                if (last - first < 15) {
+                    /* Short enough for the insertion pass; drop it */
+                    td -= 2;
+                    continue;
+                }
+
+                /* KEY becomes the smaller of the first two distinct */
+                /* values met in [FIRST,LAST), starting from the middle */
+                key = a[(first + last) / 2];
+                two_values = 0;
+                for (k = first; k < last; ++k) {
+                    ha = a[k];
+                    if (ha == key) {
+                        continue;
+                    }
+                    if (!(ha > key)) {
+                        key = ha;
+                    }
+                    two_values = 1;
+                    break;
+                }
+                if (!two_values) {
+                    /* Only one value in the interval: already sorted */
+                    td -= 2;
+                    continue;
+                }
+
+                /* Move every entry that is not <= KEY in front of MID */
+                mid = first;
+                for (k = first; k < last; ++k) {
+                    if (!(a[k] <= key)) {
+                        ha = a[mid];
+                        a[mid] = a[k];
+                        a[k] = ha;
+                        hi = irn[mid];
+                        irn[mid] = irn[k];
+                        irn[k] = hi;
+                        ++mid;
+                    }
+                }
+
+                /* Replace the interval by its two parts, the longer */
+                /* one underneath so the shorter one is handled next */
+                if (mid - first >= last - mid) {
+                    /* lower entry keeps FIRST and becomes [FIRST,MID) */
+                    todo[td - 1] = mid;
+                    todo[td] = mid;
+                    todo[td + 1] = last;
+                } else {
+                    todo[td - 2] = mid;
+                    todo[td - 1] = last;
+                    todo[td] = first;
+                    todo[td + 1] = mid;
+                }
+                td += 2;
+            }
+        }
+
+        /* Complete sorting with straightforward insertion */
+        stop = start + count - 1;
+        for (r__ = start + 1; r__ <= stop; ++r__) {
+            if (!(a[r__ - 1] < a[r__])) {
+                continue;
+            }
+            ha = a[r__];
+            hi = irn[r__];
+            a[r__] = a[r__ - 1];
+            irn[r__] = irn[r__ - 1];
+            /* Shift smaller predecessors one place to the right */
+            s = r__ - 1;
+            while (s >= start + 1 && a[s - 1] < ha) {
+                a[s] = a[s - 1];
+                irn[s] = irn[s - 1];
+                --s;
+            }
+            a[s] = ha;
+            irn[s] = hi;
+        }
+    }
+    return 0;
+} /* mc64rd_ */
+
+/* ********************************************************************** */
+/* Subroutine */ int_t mc64sd_dist(int_t *n, int_t *ne, int_t *ip, int_t *
+	irn, double *a, int_t *iperm, int_t *numx, int_t *w, 
+	int_t *len, int_t *lenl, int_t *lenh, int_t *fc, int_t *iw, 
+	int_t *iw4)
+{
+/* Outline of the routine as implemented below: */
+/*   1. Build an initial matching on the full pattern with mc64ud_dist */
+/*      (matching in IW, its size in NUMX). */
+/*   2. Repeatedly pick a splitting value BVAL with mc64qd_dist, adjust */
+/*      the active column lengths LEN, and call mc64ud_dist again. */
+/*      If the matching still has size NUMX, IW is copied to IPERM and */
+/*      BMIN is raised to BVAL; otherwise BMAX is lowered to BVAL. */
+/*   3. If NUMX < N, rows with IPERM = 0 are assigned the columns that */
+/*      no row is matched to, so every row ends up with a column index. */
+/* NE bounds the outer dummy loops and is forwarded to mc64ud_dist. */
+/* The function always returns 0. */
+
+    /* System generated locals */
+    int_t i__1, i__2, i__3, i__4;
+
+    /* Local variables */
+    int_t i__, j, k, l, ii, mod, cnt, num;
+    double bval, bmin, bmax, rinf;
+    int_t nval, wlen, idum1, idum2, idum3;
+    extern /* Subroutine */ int_t mc64qd_dist(int_t *, int_t *, int_t *, 
+	    int_t *, int_t *, double *, int_t *, double *), 
+	    mc64ud_dist(int_t *, int_t *, int_t *, int_t *, int_t *, 
+	    int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, 
+	    int_t *, int_t *, int_t *, int_t *);
+
+/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
+/*     Research Councils                                             *** */
+/* *** Although every effort has been made to ensure robustness and  *** */
+/* *** reliability of the subroutines in this MC64 suite, we         *** */
+/* *** disclaim any liability arising through the use or misuse of   *** */
+/* *** any of the subroutines.                                       *** */
+/* *** Any problems?   Contact ... */
+/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
+
+/* N, NE, IP, IRN, are described in MC64A/AD. */
+/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */
+/*   A(K), K=1..NE, must be set to the value of the entry that */
+/*   corresponds to IRN(k). The entries in each column must be */
+/*   non-negative and ordered by decreasing value. */
+/* IPERM is an INT_T array of length N. On exit, it contains the */
+/*   bottleneck matching: IPERM(I) - 0 or row I is matched to column */
+/*   IPERM(I). */
+/* NUMX is an INT_T variable. On exit, it contains the cardinality */
+/*   of the matching stored in IPERM. */
+/* IW is an INT_T work array of length 10N. */
+/* FC is an int_t array of length N that contains the list of */
+/*   unmatched columns. */
+/* LEN(J), LENL(J), LENH(J) are int_t arrays of length N that point */
+/*   to entries in matrix column J. */
+/*   In the matrix defined by the column parts IP(J)+LENL(J) we know */
+/*   a matching does not exist; in the matrix defined by the column */
+/*   parts IP(J)+LENH(J) we know one exists. */
+/*   LEN(J) lies between LENL(J) and LENH(J) and determines the matrix */
+/*   that is tested for a maximum matching. */
+/* W is an int_t array of length N and contains the indices of the */
+/*   columns for which LENL ne LENH. */
+/* WLEN is number of indices stored in array W. */
+/* IW is int_t work array of length N. */
+/* NOTE(review): the two statements about the length of IW above */
+/*   disagree (10N vs N). In this routine IW is only indexed by row */
+/*   numbers taken from IRN and by counters bounded by N; the 10N */
+/*   figure presumably refers to the caller's workspace - confirm. */
+/* IW4 is int_t work array of length 4N used by MC64U/UD. */
+/*      EXTERNAL FD05AD,MC64QD,MC64UD */
+/*      DOUBLE PRECISION FD05AD */
+/* BMIN and BMAX are such that a maximum matching exists for the input */
+/*   matrix in which all entries smaller than BMIN are dropped. */
+/*   For BMAX, a maximum matching does not exist. */
+/* BVAL is a value between BMIN and BMAX. */
+/* CNT is the number of calls made to MC64U/UD so far. */
+/* NUM is the cardinality of last matching found. */
+/* Set RINF to largest positive real number */
+/* XSL      RINF = FD05AD(5) */
+    /* Parameter adjustments */
+    --iw4;
+    --iw;
+    --fc;
+    --lenh;
+    --lenl;
+    --len;
+    --w;
+    --iperm;
+    --ip;
+    --a;
+    --irn;
+
+    /* Function Body */
+    rinf = dmach_dist("Overflow");
+/* Compute a first maximum matching from scratch on whole matrix. */
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	fc[j] = j;
+	iw[j] = 0;
+	len[j] = ip[j + 1] - ip[j];
+/* L20: */
+    }
+/* The first call to MC64U/UD */
+/* NUMX is passed as the matching-size argument and N as the maximum */
+/* size; the four N-long quarters of IW4 serve as separate work arrays */
+    cnt = 1;
+    mod = 1;
+    *numx = 0;
+    mc64ud_dist(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1], &iw[1],
+		numx, n, &iw4[1], &iw4[*n + 1], &iw4[(*n << 1) + 1],
+		&iw4[*n * 3 + 1]);
+/* IW contains a maximum matching of length NUMX. */
+    num = *numx;
+    if (num != *n) {
+/* Matrix is structurally singular */
+	bmax = rinf;
+    } else {
+/* Matrix is structurally nonsingular, NUM=NUMX=N; */
+/* Set BMAX just above the smallest of all the maximum absolute */
+/* values of the columns */
+	bmax = rinf;
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    bval = 0.f;
+	    i__2 = ip[j + 1] - 1;
+	    for (k = ip[j]; k <= i__2; ++k) {
+		if (a[k] > bval) {
+		    bval = a[k];
+		}
+/* L25: */
+	    }
+	    if (bval < bmax) {
+		bmax = bval;
+	    }
+/* L30: */
+	}
+/* The factor is a single-precision literal converted to double, so it */
+/* is only approximately 1.001 */
+	bmax *= 1.001f;
+    }
+/* Initialize BVAL,BMIN */
+    bval = 0.f;
+    bmin = 0.f;
+/* Initialize LENL,LEN,LENH,W,WLEN according to BMAX. */
+/* Set LEN(J), LENH(J) just after last entry in column J. */
+/* Set LENL(J) just after last entry in column J with value ge BMAX. */
+    wlen = 0;
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	l = ip[j + 1] - ip[j];
+	lenh[j] = l;
+	len[j] = l;
+	i__2 = ip[j + 1] - 1;
+	for (k = ip[j]; k <= i__2; ++k) {
+	    if (a[k] < bmax) {
+		goto L46;
+	    }
+/* L45: */
+	}
+/* Column J is empty or all entries are ge BMAX */
+	k = ip[j + 1];
+L46:
+/* K is the position of the first entry lt BMAX (or one past the column) */
+	lenl[j] = k - ip[j];
+/* Add J to W if LENL(J) ne LENH(J) */
+	if (lenl[j] == l) {
+	    goto L48;
+	}
+	++wlen;
+	w[wlen] = j;
+L48:
+	;
+    }
+/* Main loop */
+    i__1 = *ne;
+    for (idum1 = 1; idum1 <= i__1; ++idum1) {
+	if (num == *numx) {
+/* We have a maximum matching in IW; store IW in IPERM */
+	    i__2 = *n;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		iperm[i__] = iw[i__];
+/* L50: */
+	    }
+/* Keep going round this loop until matching IW is no longer maximum. */
+	    i__2 = *ne;
+	    for (idum2 = 1; idum2 <= i__2; ++idum2) {
+		bmin = bval;
+		if (bmax == bmin) {
+		    goto L99;
+		}
+/* Find splitting value BVAL */
+		mc64qd_dist(&ip[1], &lenl[1], &len[1], &w[1], &wlen,
+			    &a[1], &nval, &bval);
+		if (nval <= 1) {
+		    goto L99;
+		}
+/* Set LEN such that all matrix entries with value lt BVAL are */
+/* discarded. Store old LEN in LENH. Do this for all columns W(K). */
+/* Each step, either K is incremented or WLEN is decremented. */
+		k = 1;
+		i__3 = *n;
+		for (idum3 = 1; idum3 <= i__3; ++idum3) {
+		    if (k > wlen) {
+			goto L71;
+		    }
+		    j = w[k];
+/* Scan column J backwards from its current end; II is used after the */
+/* loop, whether it ends by the jump to L60 or by running out */
+		    i__4 = ip[j] + lenl[j];
+		    for (ii = ip[j] + len[j] - 1; ii >= i__4; --ii) {
+			if (a[ii] >= bval) {
+			    goto L60;
+			}
+			i__ = irn[ii];
+			if (iw[i__] != j) {
+			    goto L55;
+			}
+/* Remove entry from matching */
+/* and append column J to the list FC of unmatched columns */
+			iw[i__] = 0;
+			--num;
+			fc[*n - num] = j;
+L55:
+			;
+		    }
+L60:
+		    lenh[j] = len[j];
+/* IP(J)+LEN(J)-1 is last entry in column ge BVAL */
+		    len[j] = ii - ip[j] + 1;
+/* If LENH(J) = LENL(J), remove J from W */
+		    if (lenl[j] == lenh[j]) {
+			w[k] = w[wlen];
+			--wlen;
+		    } else {
+			++k;
+		    }
+/* L70: */
+		}
+L71:
+/* Leave the inner loop as soon as a matched entry has been dropped */
+		if (num < *numx) {
+		    goto L81;
+		}
+/* L80: */
+	    }
+/* End of dummy loop; this point is never reached */
+/* Set mode for next call to MC64U/UD */
+L81:
+	    mod = 1;
+	} else {
+/* We do not have a maximum matching in IW. */
+	    bmax = bval;
+/* BMIN is the bottleneck value of a maximum matching; */
+/* for BMAX the matching is not maximum, so BMAX>BMIN */
+/*          IF (BMAX .EQ. BMIN) GO TO 99 */
+/* Find splitting value BVAL */
+	    mc64qd_dist(&ip[1], &len[1], &lenh[1], &w[1], &wlen, &a[1],
+			&nval, &bval);
+	    if (nval == 0 || bval == bmin) {
+		goto L99;
+	    }
+/* Set LEN such that all matrix entries with value ge BVAL are */
+/* inside matrix. Store old LEN in LENL. Do this for all columns W(K). */
+/* Each step, either K is incremented or WLEN is decremented. */
+	    k = 1;
+	    i__2 = *n;
+	    for (idum3 = 1; idum3 <= i__2; ++idum3) {
+		if (k > wlen) {
+		    goto L88;
+		}
+		j = w[k];
+/* Scan forward past the current end of column J; II is used after */
+/* the loop */
+		i__3 = ip[j] + lenh[j] - 1;
+		for (ii = ip[j] + len[j]; ii <= i__3; ++ii) {
+		    if (a[ii] < bval) {
+			goto L86;
+		    }
+/* L85: */
+		}
+L86:
+		lenl[j] = len[j];
+		len[j] = ii - ip[j];
+		if (lenl[j] == lenh[j]) {
+		    w[k] = w[wlen];
+		    --wlen;
+		} else {
+		    ++k;
+		}
+/* L87: */
+	    }
+/* End of dummy loop; this point is never reached */
+/* Set mode for next call to MC64U/UD */
+L88:
+	    mod = 0;
+	}
+	++cnt;
+	mc64ud_dist(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1],
+		    &iw[1], &num, numx, &iw4[1], &iw4[*n + 1],
+		    &iw4[(*n << 1) + 1], &iw4[*n * 3 + 1]);
+/* IW contains maximum matching of length NUM */
+/* L90: */
+    }
+/* End of dummy loop; this point is never reached */
+/* BMIN is bottleneck value of final matching */
+L99:
+    if (*numx == *n) {
+	goto L1000;
+    }
+/* The matrix is structurally singular, complete IPERM */
+/* W, IW are work arrays */
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	w[j] = 0;
+/* L300: */
+    }
+/* IW(1..K) collects the unmatched rows; W(J) ne 0 marks matched columns */
+    k = 0;
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if (iperm[i__] == 0) {
+	    ++k;
+	    iw[k] = i__;
+	} else {
+	    j = iperm[i__];
+	    w[j] = i__;
+	}
+/* L310: */
+    }
+/* Pair the K-th unmatched column with the K-th unmatched row */
+    k = 0;
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	if (w[j] != 0) {
+	    goto L320;
+	}
+	++k;
+	idum1 = iw[k];
+	iperm[idum1] = j;
+L320:
+	;
+    }
+L1000:
+    return 0;
+} /* mc64sd_ */
+
+/* ********************************************************************** */
+/* Subroutine */ int_t mc64qd_dist(int_t *ip, int_t *lenl, int_t *lenh,
+                                   int_t *w, int_t *wlen, double *a,
+                                   int_t *nval, double *val)
+{
+    /* Distinct values found so far, kept in decreasing order */
+    double split[10];
+    int_t found;
+    /* Column / entry cursors */
+    int_t k, ncols, j, ii, stop;
+    /* Insertion bookkeeping (SLOT is a 0-based index into SPLIT) */
+    double ha;
+    int_t slot, s, duplicate, full;
+
+
+/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
+/*     Research Councils                                             *** */
+/* *** Although every effort has been made to ensure robustness and  *** */
+/* *** reliability of the subroutines in this MC64 suite, we         *** */
+/* *** disclaim any liability arising through the use or misuse of   *** */
+/* *** any of the subroutines.                                       *** */
+/* *** Any problems?   Contact ... */
+/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
+
+/* Collects up to 10 different numerical values from the columns */
+/* W(1:WLEN). Column J is scanned from IP(J)+LENL(J) to */
+/* IP(J)+LENH(J)-1; scanning stops as soon as 10 values are held or */
+/* all listed columns have been visited. */
+/* On output NVAL is the number of different values found. If */
+/* NVAL > 0, VAL is set to entry (NVAL+1)/2 of the values sorted by */
+/* decreasing size; if NVAL = 0, VAL is left untouched. */
+/* New values are placed by straightforward insertion. */
+/* Always returns 0. */
+
+    /* Switch to the 1-based indexing of the Fortran original */
+    --a;
+    --w;
+    --lenh;
+    --lenl;
+    --ip;
+
+    *nval = 0;
+    ncols = *wlen;
+    found = 0;
+    full = 0;
+
+    for (k = 1; k <= ncols && !full; ++k) {
+        j = w[k];
+        stop = ip[j] + lenh[j] - 1;
+        for (ii = ip[j] + lenl[j]; ii <= stop; ++ii) {
+            ha = a[ii];
+
+            /* Walk from the smallest stored value upwards until an */
+            /* equal value (duplicate) or a larger value is met */
+            duplicate = 0;
+            slot = found;
+            while (slot > 0) {
+                if (split[slot - 1] == ha) {
+                    duplicate = 1;
+                    break;
+                }
+                if (split[slot - 1] > ha) {
+                    break;
+                }
+                --slot;
+            }
+            if (duplicate) {
+                continue;
+            }
+
+            /* Open a gap at SLOT and store the new value */
+            for (s = found; s > slot; --s) {
+                split[s] = split[s - 1];
+            }
+            split[slot] = ha;
+            ++found;
+
+            /* Stop everything once 10 values are held */
+            if (found == 10) {
+                full = 1;
+                break;
+            }
+        }
+    }
+
+    /* Report the count and, if any, the middle value */
+    *nval = found;
+    if (found > 0) {
+        *val = split[(found + 1) / 2 - 1];
+    }
+    return 0;
+} /* mc64qd_ */
+
+/* ********************************************************************** */
+/* Subroutine */ int_t mc64ud_dist(int_t *id, int_t *mod, int_t *n, int_t *
+	irn, int_t *lirn, int_t *ip, int_t *lenc, int_t *fc, int_t *
+	iperm, int_t *num, int_t *numx, int_t *pr, int_t *arp, 
+	int_t *cv, int_t *out)
+{
+/* Extends the matching held in IPERM, looking only at the first */
+/* LENC(J) entries of each column J. */
+/*   ID    call counter; ID = 1 initializes CV and ARP. */
+/*   MOD   on later calls, MOD = 1 resets ARP to zero. */
+/*   FC    on entry FC(1..N-NUM) lists the columns to be processed; */
+/*         it is rewritten from position 1 with the columns for which */
+/*         no augmenting path was found, followed by the columns that */
+/*         were not considered before the routine stopped. */
+/*   IPERM IPERM(I) = 0 if row I is unmatched, else its column. */
+/*   NUM   size of the matching; updated in place. */
+/*   NUMX  target size of the matching (not read when ID = 1). */
+/*   PR, ARP, CV, OUT  work arrays, described below. */
+/* LIRN is not referenced in the body. */
+/* The search stops early when NUM reaches the target size or when */
+/* more columns have failed than the target size allows. */
+/* The function always returns 0. */
+
+    /* System generated locals */
+    int_t i__1, i__2, i__3, i__4;
+
+    /* Local variables */
+    int_t i__, j, k, j1, ii, kk, id0, id1, in1, in2, nfc, num0, num1, num2, 
+	    jord, last;
+
+
+/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
+/*     Research Councils                                             *** */
+/* *** Although every effort has been made to ensure robustness and  *** */
+/* *** reliability of the subroutines in this MC64 suite, we         *** */
+/* *** disclaim any liability arising through the use or misuse of   *** */
+/* *** any of the subroutines.                                       *** */
+/* *** Any problems?   Contact ... */
+/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
+
+/* PR(J) is the previous column to J in the depth first search. */
+/*   Array PR is used as workspace in the sorting algorithm. */
+/* Elements (I,IPERM(I)) I=1,..,N are entries at the end of the */
+/*   algorithm unless N assignments have not been made in which case */
+/*   N-NUM pairs (I,IPERM(I)) will not be entries in the matrix. */
+/* CV(I) is the most recent loop number (ID+JORD) at which row I */
+/*   was visited. */
+/* ARP(J) is the number of entries in column J which have been scanned */
+/*   when looking for a cheap assignment. */
+/* OUT(J) is one less than the number of entries in column J which have */
+/*   not been scanned during one pass through the main loop. */
+/* NUMX is maximum possible size of matching. */
+    /* Parameter adjustments */
+    --out;
+    --cv;
+    --arp;
+    --pr;
+    --iperm;
+    --fc;
+    --lenc;
+    --ip;
+    --irn;
+
+    /* Function Body */
+    if (*id == 1) {
+/* The first call to MC64U/UD. */
+/* Initialize CV and ARP; parameters MOD, NUMX are not accessed */
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    cv[i__] = 0;
+	    arp[i__] = 0;
+/* L5: */
+	}
+	num1 = *n;
+	num2 = *n;
+    } else {
+/* Not the first call to MC64U/UD. */
+/* Re-initialize ARP if entries were deleted since last call to MC64U/UD */
+	if (*mod == 1) {
+	    i__1 = *n;
+	    for (i__ = 1; i__ <= i__1; ++i__) {
+		arp[i__] = 0;
+/* L8: */
+	    }
+	}
+	num1 = *numx;
+	num2 = *n - *numx;
+    }
+    num0 = *num;
+/* NUM0 is size of input matching */
+/* NUM1 is maximum possible size of matching */
+/* NUM2 is maximum allowed number of unassigned rows/columns */
+/* NUM is size of current matching */
+/* Quick return if possible */
+/*      IF (NUM.EQ.N) GO TO 199 */
+/* NFC is number of rows/columns that could not be assigned */
+    nfc = 0;
+/* Integers ID0+1 to ID0+N are unique numbers for call ID to MC64U/UD, */
+/* so 1st call uses 1..N, 2nd call uses N+1..2N, etc */
+/* NOTE(review): the product is formed in int_t; whether it can */
+/* overflow for many calls on a large matrix depends on the width of */
+/* int_t - confirm */
+    id0 = (*id - 1) * *n;
+/* Main loop. Each pass round this loop either results in a new */
+/* assignment or gives a column with no assignment */
+    i__1 = *n;
+    for (jord = num0 + 1; jord <= i__1; ++jord) {
+/* Each pass uses unique number ID1 */
+	id1 = id0 + jord;
+/* J is unmatched column */
+	j = fc[jord - num0];
+	pr[j] = -1;
+	i__2 = jord;
+	for (k = 1; k <= i__2; ++k) {
+/* Look for a cheap assignment */
+	    if (arp[j] >= lenc[j]) {
+		goto L30;
+	    }
+	    in1 = ip[j] + arp[j];
+	    in2 = ip[j] + lenc[j] - 1;
+	    i__3 = in2;
+	    for (ii = in1; ii <= i__3; ++ii) {
+		i__ = irn[ii];
+/* Row I is free: jump out with I, II and J set for the assignment */
+		if (iperm[i__] == 0) {
+		    goto L80;
+		}
+/* L20: */
+	    }
+/* No cheap assignment in row */
+	    arp[j] = lenc[j];
+/* Begin looking for assignment chain starting with row J */
+L30:
+	    out[j] = lenc[j] - 1;
+/* Inner loop.  Extends chain by one or backtracks */
+	    i__3 = jord;
+	    for (kk = 1; kk <= i__3; ++kk) {
+		in1 = out[j];
+		if (in1 < 0) {
+		    goto L50;
+		}
+		in2 = ip[j] + lenc[j] - 1;
+		in1 = in2 - in1;
+/* Forward scan */
+		i__4 = in2;
+		for (ii = in1; ii <= i__4; ++ii) {
+		    i__ = irn[ii];
+		    if (cv[i__] == id1) {
+			goto L40;
+		    }
+/* Column J has not yet been accessed during this pass */
+/* Step to the column matched with row I and remember where the scan */
+/* of the previous column J1 stopped */
+		    j1 = j;
+		    j = iperm[i__];
+		    cv[i__] = id1;
+		    pr[j] = j1;
+		    out[j1] = in2 - ii - 1;
+		    goto L70;
+L40:
+		    ;
+		}
+/* Backtracking step. */
+L50:
+		j1 = pr[j];
+		if (j1 == -1) {
+/* No augmenting path exists for column J. */
+		    ++nfc;
+		    fc[nfc] = j;
+		    if (nfc > num2) {
+/* A matching of maximum size NUM1 is not possible */
+			last = jord;
+			goto L101;
+		    }
+		    goto L100;
+		}
+		j = j1;
+/* L60: */
+	    }
+/* End of dummy loop; this point is never reached */
+L70:
+	    ;
+	}
+/* End of dummy loop; this point is never reached */
+/* New assignment is made. */
+L80:
+	iperm[i__] = j;
+	arp[j] = ii - ip[j] + 1;
+	++(*num);
+/* Walk back along PR and re-match every column on the path with the */
+/* row at which its scan was left (position recovered from OUT) */
+	i__2 = jord;
+	for (k = 1; k <= i__2; ++k) {
+	    j = pr[j];
+	    if (j == -1) {
+		goto L95;
+	    }
+	    ii = ip[j] + lenc[j] - out[j] - 2;
+	    i__ = irn[ii];
+	    iperm[i__] = j;
+/* L90: */
+	}
+/* End of dummy loop; this point is never reached */
+L95:
+	if (*num == num1) {
+/* A matching of maximum size NUM1 is found */
+	    last = jord;
+	    goto L101;
+	}
+
+L100:
+	;
+    }
+/* All unassigned columns have been considered */
+    last = *n;
+/* Now, a transversal is computed or is not possible. */
+/* Complete FC before returning. */
+L101:
+/* Append the columns that were not processed (JORD > LAST) to FC */
+    i__1 = *n;
+    for (jord = last + 1; jord <= i__1; ++jord) {
+	++nfc;
+	fc[nfc] = fc[jord - num0];
+/* L110: */
+    }
+/*  199 RETURN */
+    return 0;
+} /* mc64ud_ */
+
+/* ********************************************************************** */
+/* Subroutine */ int_t mc64wd_dist(int_t *n, int_t *ne, int_t *ip, int_t *
+	irn, double *a, int_t *iperm, int_t *num, int_t *jperm, 
+	int_t *out, int_t *pr, int_t *q, int_t *l, double *u, 
+	double *d__)
+{
+    /* System generated locals */
+    int_t i__1, i__2, i__3;
+
+    /* Local variables */
+    int_t i__, j, k, i0, k0, k1, k2, q0;
+    double di;
+    int_t ii, jj, kk;
+    double vj;
+    int_t up;
+    double dq0;
+    int_t kk1, kk2;
+    double csp;
+    int_t isp, jsp, low;
+    double dmin__, dnew;
+    int_t jord, qlen, jdum;
+    double rinf;
+    extern /* Subroutine */ int_t mc64dd_dist(int_t *, int_t *, int_t *, 
+	    double *, int_t *, int_t *), mc64ed_dist(int_t *, int_t *,
+	     int_t *, double *, int_t *, int_t *), mc64fd_dist(int_t *
+	    , int_t *, int_t *, int_t *, double *, int_t *, 
+	    int_t *);
+
+
+/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
+/*     Research Councils                                             *** */
+/* *** Although every effort has been made to ensure robustness and  *** */
+/* *** reliability of the subroutines in this MC64 suite, we         *** */
+/* *** disclaim any liability arising through the use or misuse of   *** */
+/* *** any of the subroutines.                                       *** */
+/* *** Any problems?   Contact ... */
+/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
+
+/* N, NE, IP, IRN are described in MC64A/AD. */
+/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */
+/*   A(K), K=1..NE, must be set to the value of the entry that */
+/*   corresponds to IRN(K). It is not altered. */
+/*   All values A(K) must be non-negative. */
+/* IPERM is an INT_T array of length N. On exit, it contains the */
+/*   weighted matching: IPERM(I) = 0 or row I is matched to column */
+/*   IPERM(I). */
+/* NUM is an INT_T variable. On exit, it contains the cardinality of */
+/*   the matching stored in IPERM. */
+/* JPERM, OUT, PR, Q, L: INT_T work arrays (presumably the Fortran IW(5N)). */
+/* U, D: DOUBLE PRECISION arrays (presumably the two halves of DW(2N)). */
+/*   On exit, U(1:N) contains the dual row variable and */
+/*   D(1:N) contains the dual column variable V. If the matrix */
+/*   is structurally nonsingular (NUM = N), the following holds: */
+/*      U(I)+V(J) <= A(I,J)  if IPERM(I) != J */
+/*      U(I)+V(J)  = A(I,J)  if IPERM(I)  = J */
+/*      U(I) = 0  if IPERM(I) = 0 */
+/*      V(J) = 0  if there is no I for which IPERM(I) = J */
+/* MC64WD: builds the weighted matching IPERM with dual variables U and D. */
+/* Phases: row-minimum U and greedy assignment; improvement scan over the */
+/* unassigned columns; Dijkstra-like augmenting-path search per free column. */
+/*      EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD */
+/*      DOUBLE PRECISION FD05AD */
+/* Set RINF to largest positive real number */
+/* XSL      RINF = FD05AD(5) */
+    /* Parameter adjustments: shift each pointer so the arrays are indexed from 1 */
+    --d__;
+    --u;
+    --l;
+    --q;
+    --pr;
+    --out;
+    --jperm;
+    --iperm;
+    --ip;
+    --a;
+    --irn;
+
+    /* Function Body */
+    rinf = dmach_dist("Overflow");
+/* Initialization */
+    *num = 0;
+    i__1 = *n;
+    for (k = 1; k <= i__1; ++k) {
+	u[k] = rinf;
+	d__[k] = 0.;
+	iperm[k] = 0;
+	jperm[k] = 0;
+	pr[k] = ip[k];
+	l[k] = 0;
+/* L10: */
+    }
+/* Initialize U(I) to the smallest entry of row I; IPERM(I), L(I) record its column and position */
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = ip[j + 1] - 1;
+	for (k = ip[j]; k <= i__2; ++k) {
+	    i__ = irn[k];
+	    if (a[k] > u[i__]) {
+		goto L20;
+	    }
+	    u[i__] = a[k];
+	    iperm[i__] = j;
+	    l[i__] = k;
+L20:
+	    ;
+	}
+/* L30: */
+    }
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	j = iperm[i__];
+	if (j == 0) {
+	    goto L40;
+	}
+/* Row I is not empty */
+	iperm[i__] = 0;
+	if (jperm[j] != 0) {
+	    goto L40;
+	}
+/* Assignment of column J to row I */
+	++(*num);
+	iperm[i__] = j;
+	jperm[j] = l[i__];
+L40:
+	;
+    }
+    if (*num == *n) {
+	goto L1000;
+    }
+/* Scan unassigned columns; improve assignment */
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+/* JPERM(J) ne 0 iff column J is already assigned */
+	if (jperm[j] != 0) {
+	    goto L95;
+	}
+	k1 = ip[j];
+	k2 = ip[j + 1] - 1;
+/* Continue only if column J is not empty */
+	if (k1 > k2) {
+	    goto L95;
+	}
+	vj = rinf;
+	i__2 = k2;
+	for (k = k1; k <= i__2; ++k) {
+	    i__ = irn[k];
+	    di = a[k] - u[i__];
+	    if (di > vj) {
+		goto L50;
+	    }
+	    if (di < vj || di == rinf) {
+		goto L55;
+	    }
+	    if (iperm[i__] != 0 || iperm[i0] == 0) {
+		goto L50;
+	    }
+L55:
+	    vj = di;
+	    i0 = i__;
+	    k0 = k;
+L50:
+	    ;
+	}
+	d__[j] = vj;
+	k = k0;
+	i__ = i0;
+	if (iperm[i__] == 0) {
+	    goto L90;
+	}
+	i__2 = k2;
+	for (k = k0; k <= i__2; ++k) {
+	    i__ = irn[k];
+	    if (a[k] - u[i__] > vj) {
+		goto L60;
+	    }
+	    jj = iperm[i__];
+/* Scan remaining part of assigned column JJ */
+	    kk1 = pr[jj];
+	    kk2 = ip[jj + 1] - 1;
+	    if (kk1 > kk2) {
+		goto L60;
+	    }
+	    i__3 = kk2;
+	    for (kk = kk1; kk <= i__3; ++kk) {
+		ii = irn[kk];
+		if (iperm[ii] > 0) {
+		    goto L70;
+		}
+		if (a[kk] - u[ii] <= d__[jj]) {
+		    goto L80;
+		}
+L70:
+		;
+	    }
+	    pr[jj] = kk2 + 1;
+L60:
+	    ;
+	}
+	goto L95;
+L80:
+	jperm[jj] = kk;
+	iperm[ii] = jj;
+	pr[jj] = kk + 1;
+L90:
+	++(*num);
+	jperm[j] = k;
+	iperm[i__] = j;
+	pr[j] = k + 1;
+L95:
+	;
+    }
+    if (*num == *n) {
+	goto L1000;
+    }
+/* Prepare for main loop */
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	d__[i__] = rinf;
+	l[i__] = 0;
+/* L99: */
+    }
+/* Main loop ... each pass round this loop is similar to Dijkstra's */
+/* algorithm for solving the single source shortest path problem */
+    i__1 = *n;
+    for (jord = 1; jord <= i__1; ++jord) {
+	if (jperm[jord] != 0) {
+	    goto L100;
+	}
+/* JORD is next unmatched column */
+/* DMIN is the length of shortest path in the tree */
+	dmin__ = rinf;
+	qlen = 0;
+	low = *n + 1;
+	up = *n + 1;
+/* CSP is the cost of the shortest augmenting path to unassigned row */
+/* IRN(ISP). The corresponding column index is JSP. */
+	csp = rinf;
+/* Build shortest path tree starting from unassigned column (root) JORD */
+	j = jord;
+	pr[j] = -1;
+/* Scan column J */
+	i__2 = ip[j + 1] - 1;
+	for (k = ip[j]; k <= i__2; ++k) {
+	    i__ = irn[k];
+	    dnew = a[k] - u[i__];
+	    if (dnew >= csp) {
+		goto L115;
+	    }
+	    if (iperm[i__] == 0) {
+		csp = dnew;
+		isp = k;
+		jsp = j;
+	    } else {
+		if (dnew < dmin__) {
+		    dmin__ = dnew;
+		}
+		d__[i__] = dnew;
+		++qlen;
+		q[qlen] = k;
+	    }
+L115:
+	    ;
+	}
+/* Initialize heap Q and Q2 with rows held in Q(1:QLEN) */
+	q0 = qlen;
+	qlen = 0;
+	i__2 = q0;
+	for (kk = 1; kk <= i__2; ++kk) {
+	    k = q[kk];
+	    i__ = irn[k];
+	    if (csp <= d__[i__]) {
+		d__[i__] = rinf;
+		goto L120;
+	    }
+	    if (d__[i__] <= dmin__) {
+		--low;
+		q[low] = i__;
+		l[i__] = low;
+	    } else {
+		++qlen;
+		l[i__] = qlen;
+		mc64dd_dist(&i__, n, &q[1], &d__[1], &l[1], &c__2);
+	    }
+/* Update tree */
+	    jj = iperm[i__];
+	    out[jj] = k;
+	    pr[jj] = j;
+L120:
+	    ;
+	}
+	i__2 = *num;
+	for (jdum = 1; jdum <= i__2; ++jdum) {
+/* If Q2 is empty, extract rows from Q */
+	    if (low == up) {
+		if (qlen == 0) {
+		    goto L160;
+		}
+		i__ = q[1];
+		if (d__[i__] >= csp) {
+		    goto L160;
+		}
+		dmin__ = d__[i__];
+L152:
+		mc64ed_dist(&qlen, n, &q[1], &d__[1], &l[1], &c__2);
+		--low;
+		q[low] = i__;
+		l[i__] = low;
+		if (qlen == 0) {
+		    goto L153;
+		}
+		i__ = q[1];
+		if (d__[i__] > dmin__) {
+		    goto L153;
+		}
+		goto L152;
+	    }
+/* Q0 is row whose distance D(Q0) to the root is smallest */
+L153:
+	    q0 = q[up - 1];
+	    dq0 = d__[q0];
+/* Exit loop if path to Q0 is longer than the shortest augmenting path */
+	    if (dq0 >= csp) {
+		goto L160;
+	    }
+	    --up;
+/* Scan column that matches with row Q0 */
+	    j = iperm[q0];
+	    vj = dq0 - a[jperm[j]] + u[q0];
+	    i__3 = ip[j + 1] - 1;
+	    for (k = ip[j]; k <= i__3; ++k) {
+		i__ = irn[k];
+		if (l[i__] >= up) {
+		    goto L155;
+		}
+/* DNEW is new cost */
+		dnew = vj + a[k] - u[i__];
+/* Do not update D(I) if DNEW ge cost of shortest path */
+		if (dnew >= csp) {
+		    goto L155;
+		}
+		if (iperm[i__] == 0) {
+/* Row I is unmatched; update shortest path info */
+		    csp = dnew;
+		    isp = k;
+		    jsp = j;
+		} else {
+/* Row I is matched; do not update D(I) if DNEW is larger */
+		    di = d__[i__];
+		    if (di <= dnew) {
+			goto L155;
+		    }
+		    if (l[i__] >= low) {
+			goto L155;
+		    }
+		    d__[i__] = dnew;
+		    if (dnew <= dmin__) {
+			if (l[i__] != 0) {
+			    mc64fd_dist(&l[i__], &qlen, n, &q[1], &d__[1], &l[1], 
+				    &c__2);
+			}
+			--low;
+			q[low] = i__;
+			l[i__] = low;
+		    } else {
+			if (l[i__] == 0) {
+			    ++qlen;
+			    l[i__] = qlen;
+			}
+			mc64dd_dist(&i__, n, &q[1], &d__[1], &l[1], &c__2);
+		    }
+/* Update tree */
+		    jj = iperm[i__];
+		    out[jj] = k;
+		    pr[jj] = j;
+		}
+L155:
+		;
+	    }
+/* L150: */
+	}
+/* If CSP = RINF, no augmenting path is found */
+L160:
+	if (csp == rinf) {
+	    goto L190;
+	}
+/* Find augmenting path by tracing backward in PR; update IPERM,JPERM */
+	++(*num);
+	i__ = irn[isp];
+	iperm[i__] = jsp;
+	jperm[jsp] = isp;
+	j = jsp;
+	i__2 = *num;
+	for (jdum = 1; jdum <= i__2; ++jdum) {
+	    jj = pr[j];
+	    if (jj == -1) {
+		goto L180;
+	    }
+	    k = out[j];
+	    i__ = irn[k];
+	    iperm[i__] = jj;
+	    jperm[jj] = k;
+	    j = jj;
+/* L170: */
+	}
+/* End of dummy loop; this point is never reached */
+/* Update U for rows in Q(UP:N) */
+L180:
+	i__2 = *n;
+	for (kk = up; kk <= i__2; ++kk) {
+	    i__ = q[kk];
+	    u[i__] = u[i__] + d__[i__] - csp;
+/* L185: */
+	}
+L190:
+	i__2 = *n;
+	for (kk = low; kk <= i__2; ++kk) {
+	    i__ = q[kk];
+	    d__[i__] = rinf;
+	    l[i__] = 0;
+/* L191: */
+	}
+	i__2 = qlen;
+	for (kk = 1; kk <= i__2; ++kk) {
+	    i__ = q[kk];
+	    d__[i__] = rinf;
+	    l[i__] = 0;
+/* L193: */
+	}
+L100:
+	;
+    }
+/* End of main loop */
+/* Set dual column variable in D(1:N) */
+L1000:
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	k = jperm[j];
+	if (k != 0) {
+	    d__[j] = a[k] - u[irn[k]];
+	} else {
+	    d__[j] = 0.;
+	}
+	if (iperm[j] == 0) {
+	    u[j] = 0.;
+	}
+/* L200: */
+    }
+    if (*num == *n) {
+	goto L1100;
+    }
+/* The matrix is structurally singular, complete IPERM. */
+/* JPERM, OUT are work arrays */
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	jperm[j] = 0;
+/* L300: */
+    }
+    k = 0;
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if (iperm[i__] == 0) {
+	    ++k;
+	    out[k] = i__;
+	} else {
+	    j = iperm[i__];
+	    jperm[j] = i__;
+	}
+/* L310: */
+    }
+    k = 0;
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	if (jperm[j] != 0) {
+	    goto L320;
+	}
+	++k;
+	jdum = out[k];
+	iperm[jdum] = j;
+L320:
+	;
+    }
+L1100:
+    return 0;
+} /* mc64wd_ */
+
+
diff --git a/SRC/memory.c b/SRC/memory.c
new file mode 100644
index 0000000..fd54862
--- /dev/null
+++ b/SRC/memory.c
@@ -0,0 +1,580 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Memory utilities
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+/*
+ * Global variables
+ */
+ExpHeader *expanders; /* One header (mem, size) per MemType; allocated in symbfact_SubInit() */
+LU_stack_t stack;     /* State of the user-supplied work space: array, top1, top2, used, size */
+int_t no_expand;      /* 0 during the initial allocations; afterwards bumped by each expand() call */
+
+
+/*
+ * Prototypes of the file-local helpers defined further down
+ */
+static int_t memory_usage(const int_t, const int_t, const int_t);
+static void *expand(int_t *, MemType, int_t, int_t,
+		    Glu_freeable_t *);
+
+/*
+ * Internal prototypes (external linkage, but declared here rather than in a header)
+ */
+void  SetupSpace (void *, int_t, LU_space_t *);
+
+
+/* Print msg verbatim on stdout, then terminate the process with exit(-1). */
+void superlu_abort_and_exit_dist(char *msg)
+{
+    /* msg is passed through "%s" rather than used as the format itself,
+       so any '%' characters it contains are printed literally. */
+    printf("%s", msg);
+    exit(-1);
+}
+
+long int superlu_malloc_total = 0; /* net bytes handed out by the debug-build superlu_malloc_dist() */
+
+#if ( DEBUGlevel>=1 )           /* Debug malloc/free. */
+
+#define PAD_FACTOR  2
+#define DWORD  (sizeof(double)) /* Be sure it's no smaller than double. */
+
+/* Debug malloc: a DWORD-byte header in front of the returned block records `size`. */
+void *superlu_malloc_dist(size_t size)
+{
+    char *buf;
+    int iam;
+    /* Refuse requests for which size + DWORD would wrap around size_t. */
+    buf = (size <= (size_t) -1 - DWORD) ? (char *) malloc(size + DWORD) : NULL;
+    if ( !buf ) {
+	MPI_Comm_rank(MPI_COMM_WORLD, &iam); /* the rank is only needed for this report */
+	printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %lld\n",
+	       iam, superlu_malloc_total*1e-6, (long long) size);
+	ABORT("superlu_malloc: out of memory");
+    }
+    ((size_t *) buf)[0] = size; /* read back by superlu_free_dist() */
+#if 0
+    superlu_malloc_total += size + DWORD;
+#else
+    superlu_malloc_total += size;
+#endif
+    return (void *) (buf + DWORD);
+}
+
+/* Debug free: validates the header written by superlu_malloc_dist(), updates
+   superlu_malloc_total, and releases the underlying malloc() block. */
+void superlu_free_dist(void *addr)
+{
+    char *p;
+
+    if ( !addr )
+	ABORT("superlu_free: tried to free NULL pointer");
+
+    /* Step back to the header only after addr is known to be non-NULL. */
+    p = ((char *) addr) - DWORD;
+    { 
+	/* Keep the full size_t width: int_t may be narrower than size_t. */
+	size_t n = ((size_t *) p)[0];
+	if ( !n )
+	    ABORT("superlu_free: tried to free a freed pointer");
+	*((size_t *) p) = 0; /* Set to zero to detect duplicate free's. */
+#if 0	
+	superlu_malloc_total -= (long int) (n + DWORD);
+#else
+	superlu_malloc_total -= (long int) n;
+#endif
+
+	if ( superlu_malloc_total < 0 )
+	    ABORT("superlu_malloc_total went negative");
+	
+	free (p); /* the malloc() block starts at the header, not at addr */
+    }
+
+}
+
+#else  /* The production mode. */
+
+/* Production allocator: a plain pass-through to malloc(). */
+void *superlu_malloc_dist(size_t size)
+{
+    /* No header and no accounting here; a NULL result goes back to the caller. */
+    return malloc(size);
+}
+
+void superlu_free_dist(void *addr) /* production build: plain free() */
+{
+    free(addr);
+}
+
+#endif  /* End debug malloc/free. */
+
+
+
+/* Copy the first `howmany` int_t values from `old` to `new`, in ascending order. */
+static void copy_mem_int(int_t howmany, void *old, void *new)
+{
+    const int_t *from = old;
+    int_t *to = new;
+    int_t k = 0;
+    while (k < howmany) { to[k] = from[k]; ++k; }
+}
+
+
+/* Copy `bytes` bytes from src to dest, highest address first, so the copy is
+   safe when dest overlaps src at a higher address (as in expand()). */
+static void
+user_bcopy(char *src, char *dest, int_t bytes)
+{
+    /* Index from the end instead of walking a pointer down past `dest`:
+       forming dest - 1 is undefined behaviour in C. */
+    while ( bytes > 0 ) { --bytes; dest[bytes] = src[bytes]; }
+}
+
+
+
+/* Allocate max(1, n) uninitialized int_t's; returns whatever SUPERLU_MALLOC yields. */
+int_t *intMalloc_dist(int_t n)
+{
+    size_t count = (size_t) SUPERLU_MAX(1,n); /* never request zero bytes */
+    return (int_t *) SUPERLU_MALLOC(count * sizeof(int_t));
+}
+
+/* Allocate max(1, n) int_t's and, on success, set the first n of them to 0. */
+int_t *intCalloc_dist(int_t n)
+{
+    int_t k;
+    int_t *zeros = (int_t *) SUPERLU_MALLOC((size_t) SUPERLU_MAX(1,n) * sizeof(int_t));
+    if ( zeros == NULL ) return NULL; /* allocation failed: nothing to clear */
+    for (k = 0; k < n; ++k) zeros[k] = 0;
+    return zeros;
+}
+
+
+/* Carve `bytes` bytes out of the user-supplied work array tracked by `stack`.
+   which_end == HEAD takes them at offset top1 (which then grows); any other
+   value takes them just below top2 (which shrinks).  NULL if StackFull(bytes). */
+void *user_malloc_dist(int_t bytes, int_t which_end)
+{
+    char *base = (char*) stack.array;
+    char *chunk;
+    if ( StackFull(bytes) ) return (NULL);
+    stack.used += bytes;
+    if ( which_end != HEAD ) {
+	stack.top2 -= bytes;
+	return base + stack.top2;
+    }
+    chunk = base + stack.top1;
+    stack.top1 += bytes;
+    return chunk;
+}
+
+/* Give `bytes` bytes back to the work-space stack: the top marker of the
+   chosen end moves back (top1 for HEAD, top2 otherwise) and `used` drops. */
+void user_free_dist(int_t bytes, int_t which_end)
+{
+    stack.used -= bytes;
+    if ( which_end != HEAD ) { stack.top2 += bytes; return; }
+    /* HEAD end */
+    stack.top1 -= bytes;
+}
+
+
+/*! \brief Select the memory model to be used for factorization.
+ *
+ * <pre>
+ *    lwork = 0: SYSTEM model, i.e. system malloc/free;
+ *    lwork > 0: USER model; work[] becomes the stack and its usable size is
+ *               lwork rounded down to a multiple of 4;
+ *    lwork < 0: *MemModel and the stack are left untouched.
+ * </pre>
+ */
+void SetupSpace(void *work, int_t lwork, LU_space_t *MemModel)
+{
+    if ( lwork < 0 ) return;
+    if ( lwork == 0 ) { *MemModel = SYSTEM; return; }
+    /* lwork > 0: later allocations are carved out of the caller's buffer. */
+    *MemModel   = USER;
+    stack.array = (void *) work;
+    stack.top2  = (lwork/4)*4; /* must be word addressable */
+    stack.size  = stack.top2;
+    stack.top1  = 0;
+    stack.used  = 0;
+}
+
+
+/************************************************************************/
+/*! \brief Allocate the storage shared by the symbolic factorization routines.
+ *
+ * <pre>
+ * fact == DOFACT or SamePattern: allocate xsup, supno, xlsub, xusub (n+1
+ *   entries each) and lsub/usub, whose unpredictable sizes start from a
+ *   FILL * nnz(A) guess that is halved until both allocations succeed.
+ * Otherwise (SamePattern_SameRowPerm): reuse lsub/usub from Glu_freeable.
+ * Return value: 0 on success; if lwork = -1, the estimated amount of space
+ *   required, plus n; on allocation failure, memory_usage(...) + n.
+ * </pre>
+ */
+
+int_t symbfact_SubInit
+/************************************************************************/
+(
+ fact_t fact, void *work, int_t lwork, int_t m, int_t n, int_t annz,
+ Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable
+ )
+{
+    int_t  iword;
+    int_t  *xsup, *supno;
+    int_t  *lsub, *xlsub;
+    int_t  *usub, *xusub;
+    int_t  nzlmax, nzumax;
+    int_t  FILL = sp_ienv_dist(6);
+    int iam;
+
+#if ( DEBUGlevel>=1 || PRNTlevel>=1 )
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam ); /* also printed by the PRNTlevel>=1 message */
+#endif
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter symbfact_SubInit()");
+#endif
+
+    no_expand = 0;
+    iword     = sizeof(int_t);
+    expanders = (ExpHeader *) SUPERLU_MALLOC( NO_MEMTYPE*sizeof(ExpHeader) );
+    if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders");
+    
+    if ( fact == DOFACT || fact == SamePattern ) {
+	/* Guess for L\U factors */
+	nzlmax = FILL * annz;
+	nzumax = FILL/2.0 * annz;
+	if ( lwork == -1 ) {
+	    return ( GluIntArray(n) * iword + TempSpace(m,1)
+		    + (nzlmax+nzumax)*iword + n );
+        } else {
+	    SetupSpace(work, lwork, &Glu_freeable->MemModel);
+	}
+	
+#if ( PRNTlevel>=2 )
+	printf(".. symbfact_SubInit(): annz %lld, nzlmax %lld, nzumax %lld\n", 
+		(long long) annz, (long long) nzlmax, (long long) nzumax);
+#endif	
+	
+	/* Integer pointers for L\U factors */
+	if ( Glu_freeable->MemModel == SYSTEM ) {
+	    xsup   = intMalloc_dist(n+1);
+	    supno  = intMalloc_dist(n+1);
+	    xlsub  = intMalloc_dist(n+1);
+	    xusub  = intMalloc_dist(n+1);
+	} else {
+	    xsup   = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
+	    supno  = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
+	    xlsub  = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
+	    xusub  = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
+	}
+
+	lsub  = (int_t *) expand(&nzlmax, (MemType) LSUB, 0, 0, Glu_freeable);
+	usub  = (int_t *) expand(&nzumax, (MemType) USUB, 0, 0, Glu_freeable);
+	/* Halve both guesses and retry until lsub and usub can both be allocated. */
+	while ( !lsub || !usub ) {
+	    if ( Glu_freeable->MemModel == SYSTEM ) {
+		SUPERLU_FREE(lsub); 
+		SUPERLU_FREE(usub);
+	    } else {
+		user_free_dist((nzlmax+nzumax)*iword, HEAD);
+	    }
+	    nzlmax /= 2;
+	    nzumax /= 2;
+	    if ( nzumax < annz/2 ) {
+		printf("Not enough memory to perform factorization.\n");
+		return (memory_usage(nzlmax, nzumax, n) + n);
+	    }
+#if ( PRNTlevel>=1 )
+	    printf("(%d).. symbfact_SubInit() reduce size:"
+		   "nzlmax %lld, nzumax %lld\n", iam, (long long) nzlmax, (long long) nzumax);
+	    fflush(stdout);
+#endif
+	    lsub  = (int_t *) expand( &nzlmax, (MemType) LSUB, 0, 0, Glu_freeable );
+	    usub  = (int_t *) expand( &nzumax, (MemType) USUB, 0, 1, Glu_freeable );
+	}
+
+	Glu_persist->xsup    = xsup;
+	Glu_persist->supno   = supno;
+	Glu_freeable->lsub   = lsub;
+	Glu_freeable->xlsub  = xlsub;
+	Glu_freeable->usub   = usub;
+	Glu_freeable->xusub  = xusub;
+	Glu_freeable->nzlmax = nzlmax;
+	Glu_freeable->nzumax = nzumax;
+    } else {
+	/* fact == SamePattern_SameRowPerm: reuse the sizes recorded in Glu_freeable */
+	nzlmax = Glu_freeable->nzlmax;
+	nzumax = Glu_freeable->nzumax;
+	if ( lwork == -1 ) {
+	    return ( GluIntArray(n) * iword + TempSpace(m, 1)
+		    + (nzlmax+nzumax)*iword + n );
+        } else if ( lwork == 0 ) {
+	    Glu_freeable->MemModel = SYSTEM;
+	} else {
+	    Glu_freeable->MemModel = USER;
+	    stack.top2 = (lwork/4)*4; /* must be word-addressable */
+	    stack.size = stack.top2;
+	}
+	expanders[USUB].mem = Glu_freeable->usub;
+	expanders[LSUB].mem = Glu_freeable->lsub;
+	expanders[USUB].size = nzumax;
+	expanders[LSUB].size = nzlmax;
+    }
+
+    ++no_expand;
+
+#if ( DEBUGlevel>=1 )
+    /* Memory allocated but not freed: xsup, supno */
+    CHECK_MALLOC(iam, "Exit symbfact_SubInit()");
+#endif
+
+    return 0;
+} /* SYMBFACT_SUBINIT */
+
+/************************************************************************/
+/*! \brief Expand the lsub or usub data structure during the factorization.
+ *
+ * <pre>
+ * Return value:   0 - successful return
+ *               > 0 - expansion failed: memory_usage(nzlmax, nzumax, n) + n,
+ *                     an estimate in bytes of the space in use
+ * </pre>
+ */
+
+int_t symbfact_SubXpand
+/************************************************************************/
+(
+ int_t n,           /* total number of columns */
+ int_t jcol,        /* current column */
+ int_t next,        /* number of elements currently in the factors */
+ MemType mem_type,  /* which type of memory to expand  */
+ int_t *maxlen,     /* modified - maximum length of a data structure */
+ Glu_freeable_t *Glu_freeable  /* modified - global LU data structures */
+ )
+{
+    void   *new_mem;
+    
+#if ( DEBUGlevel>=1 )
+    /* mem_type is an enum, so it is printed as int rather than with IFMT. */
+    printf("symbfact_SubXpand(): jcol " IFMT ", next " IFMT ", maxlen " IFMT
+	   ", MemType %d\n", jcol, next, *maxlen, (int) mem_type);
+#endif    
+
+    new_mem = expand(maxlen, mem_type, next, 0, Glu_freeable);
+    
+    if ( !new_mem ) {
+	int_t    nzlmax  = Glu_freeable->nzlmax;
+	int_t    nzumax  = Glu_freeable->nzumax;
+	fprintf(stderr, "Can't expand MemType %d: jcol " IFMT "\n", (int) mem_type, jcol);
+	return (memory_usage(nzlmax, nzumax, n) + n);
+    }
+
+    if ( mem_type == LSUB ) {
+	Glu_freeable->lsub   = (int_t *) new_mem;
+	Glu_freeable->nzlmax = *maxlen;
+    } else if ( mem_type == USUB ) {
+	Glu_freeable->usub   = (int_t *) new_mem;
+	Glu_freeable->nzumax = *maxlen;
+    } else ABORT("Tries to expand nonexisting memory type.\n");
+    
+    return 0;
+    
+} /* SYMBFACT_SUBXPAND */
+
+/************************************************************************/
+/*! \brief Release the storage set up for symbolic factorization.
+ *
+ * <pre>
+ * Frees the global expanders array and the lsub, xlsub, usub and xusub
+ * arrays of Glu_freeable through SUPERLU_FREE.  Always returns 0.
+ * </pre>
+ */
+
+int_t symbfact_SubFree(Glu_freeable_t *Glu_freeable)
+/************************************************************************/
+{
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter symbfact_SubFree()");
+#endif
+    /* NOTE(review): MemModel is not consulted here -- confirm this is never reached with USER-model arrays. */
+    SUPERLU_FREE(expanders);
+    SUPERLU_FREE(Glu_freeable->lsub);
+    SUPERLU_FREE(Glu_freeable->xlsub);
+    SUPERLU_FREE(Glu_freeable->usub);
+    SUPERLU_FREE(Glu_freeable->xusub);
+
+#if ( DEBUGlevel>=1 )    
+    CHECK_MALLOC(iam, "Exit symbfact_SubFree()");
+#endif
+    return 0;
+} /* SYMBFACT_SUBFREE */
+
+/************************************************************************/
+/*! \brief Expand the existing storage to accommodate more fill-ins.
+ *
+ * <pre>
+ * Returns expanders[type].mem, or NULL on failure; *prev_len gets the new length.
+ * </pre>
+ */
+
+static void *expand
+/************************************************************************/
+(
+ int_t *prev_len,   /* in: length from previous call; out: new length (int_t entries) */
+ MemType type,    /* which part of the memory to expand */
+ int_t len_to_copy, /* number of int_t entries copied to the new store (SYSTEM model) */
+ int_t keep_prev,   /* = 1: use prev_len;
+		     = 0: compute new_len to expand */
+ Glu_freeable_t *Glu_freeable  /* modified - global LU data structures */
+ )
+{
+    float    EXPAND = 1.5; /* initial growth factor applied to *prev_len */
+    float    alpha;
+    void     *new_mem;
+    int_t    new_len, tries, lword, extra, bytes_to_copy;
+
+    alpha = EXPAND;
+    lword = sizeof(int_t);
+
+    if ( no_expand == 0 || keep_prev ) /* First time allocate requested */
+        new_len = *prev_len;
+    else {
+	new_len = alpha * *prev_len;
+    }
+
+    if ( Glu_freeable->MemModel == SYSTEM ) {
+	new_mem = (void *) SUPERLU_MALLOC((size_t) new_len * lword);
+	/*new_mem = (void *) calloc(new_len, lword); */
+	if ( no_expand != 0 ) {
+	    tries = 0;
+	    if ( keep_prev ) {
+		if ( !new_mem ) return (NULL);
+	    } else {
+		while ( !new_mem ) {
+		    if ( ++tries > 10 ) return (NULL);
+		    alpha = Reduce(alpha); /* Reduce() is defined elsewhere; presumably yields a smaller factor */
+		    new_len = alpha * *prev_len;
+		    new_mem = (void*) SUPERLU_MALLOC((size_t)new_len * lword); 
+		    /* new_mem = (void *) calloc(new_len, lword); */
+		}
+	    }
+	    copy_mem_int(len_to_copy, expanders[type].mem, new_mem);
+	    SUPERLU_FREE (expanders[type].mem);
+	}
+	expanders[type].mem = (void *) new_mem;
+	
+    } else { /* MemModel == USER */
+	if ( no_expand == 0 ) {
+	    new_mem = user_malloc_dist((size_t)new_len * lword, HEAD);
+	    expanders[type].mem = (void *) new_mem;
+	}
+	else {
+	    tries = 0;
+	    extra = (new_len - *prev_len) * lword; /* additional bytes needed on the stack */
+	    if ( keep_prev ) {
+		if ( StackFull(extra) ) return (NULL);
+	    } else {
+		while ( StackFull(extra) ) {
+		    if ( ++tries > 10 ) return (NULL);
+		    alpha = Reduce(alpha);
+		    new_len = alpha * *prev_len;
+		    extra = (new_len - *prev_len) * lword;	    
+		}
+	    }
+
+	    /* NOTE(review): the branch below relies on the MemType enum order (type + 1, type < USUB) -- confirm. */
+	    if ( type != USUB ) {
+		/* Move everything from expanders[type + 1].mem up to stack top1 forward by `extra` bytes. */
+		new_mem = (void*)((char*)expanders[type + 1].mem + extra);
+		bytes_to_copy = (char*)stack.array + stack.top1
+		    - (char*)expanders[type + 1].mem;
+		user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy);
+
+		if ( type < USUB ) {
+		    Glu_freeable->usub = expanders[USUB].mem =
+			(void*)((char*)expanders[USUB].mem + extra);
+		}
+		if ( type < LSUB ) {
+		    Glu_freeable->lsub = expanders[LSUB].mem =
+			(void*)((char*)expanders[LSUB].mem + extra);
+		}
+		stack.top1 += extra;
+		stack.used += extra;
+		
+	    } /* if ... */
+
+	} /* else ... */
+    }
+
+    expanders[type].size = new_len;
+    *prev_len = new_len;
+    if ( no_expand ) ++no_expand; /* only counted once the initial allocations are done */
+    
+    return (void *) expanders[type].mem;
+    
+} /* EXPAND */
+
+/************************************************************************/
+/*! \brief Fill in *mem_usage for the symbolic factorization structures.
+ *
+ * <pre>
+ * mem_usage consists of the following fields:
+ *    - for_lu (float)
+ *      The amount of space used in bytes for the L\U data structures.
+ *    - total (float)
+ *      for_lu plus 9*n words of working storage for the factorization.
+ *    - expansions (int)
+ *      no_expand - 1; the global no_expand is decremented as a side effect.
+ * </pre>
+ */
+
+int_t QuerySpace_dist(int_t n, int_t lsub_size, Glu_freeable_t *Glu_freeable,
+		      superlu_dist_mem_usage_t *mem_usage)
+/************************************************************************/
+{
+    const int_t iword = sizeof(int_t);
+    /* First term: (4*n + 3) words plus lsub_size index words. */
+    int_t lsub_part = (4*n + 3) * iword + lsub_size * iword;
+    /* Second term: (n + 1) words plus xusub[n] index words. */
+    int_t usub_part = (n + 1) * iword + Glu_freeable->xusub[n]*iword;
+
+    /* For the adjacency graphs of L and U: two separate float conversions. */
+    mem_usage->for_lu  = (float) lsub_part;
+    mem_usage->for_lu += (float) usub_part;
+
+    /* Working storage to support factorization */
+    mem_usage->total = mem_usage->for_lu + 9*n*iword;
+
+    --no_expand;
+    mem_usage->expansions = no_expand;
+    return 0;
+} /* QUERYSPACE_DIST */
+
+/* Byte estimate: 10*n int_t words plus nzlmax + nzumax int_t index entries. */
+static int_t memory_usage(const int_t nzlmax, const int_t nzumax, const int_t n)
+{
+    const int_t iword = sizeof(int_t);
+    return (10*n*iword + (nzlmax+nzumax)*iword);
+}
+
diff --git a/SRC/memory.patch b/SRC/memory.patch
new file mode 100644
index 0000000..e4b4ecb
--- /dev/null
+++ b/SRC/memory.patch
@@ -0,0 +1,10 @@
+118d117
+< 
+144c143
+<     buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
+---
+>     buf = (int_t *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(int_t));
+152c151
+<     buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
+---
+>     buf = (int_t *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(int_t));
diff --git a/SRC/mmd.c b/SRC/mmd.c
new file mode 100644
index 0000000..a212480
--- /dev/null
+++ b/SRC/mmd.c
@@ -0,0 +1,1025 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Implements the minimum degree algorithm
+ */
+
+#include "superlu_defs.h"
+
+/* *************************************************************** */
+/* *************************************************************** */
+/* ****     GENMMD ..... MULTIPLE MINIMUM EXTERNAL DEGREE     **** */
+/* *************************************************************** */
+/* *************************************************************** */
+
+/*     AUTHOR - JOSEPH W.H. LIU */
+/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
+
+/*     PURPOSE - THIS ROUTINE IMPLEMENTS THE MINIMUM DEGREE */
+/*        ALGORITHM.  IT MAKES USE OF THE IMPLICIT REPRESENTATION */
+/*        OF ELIMINATION GRAPHS BY QUOTIENT GRAPHS, AND THE */
+/*        NOTION OF INDISTINGUISHABLE NODES.  IT ALSO IMPLEMENTS */
+/*        THE MODIFICATIONS BY MULTIPLE ELIMINATION AND MINIMUM */
+/*        EXTERNAL DEGREE. */
+/*        --------------------------------------------- */
+/*        CAUTION - THE ADJACENCY VECTOR ADJNCY WILL BE */
+/*        DESTROYED. */
+/*        --------------------------------------------- */
+
+/*     INPUT PARAMETERS - */
+/*        NEQNS  - NUMBER OF EQUATIONS. */
+/*        (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. */
+/*        DELTA  - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */
+/*        MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) INTEGER */
+/*                 (ANY SMALLER ESTIMATE WILL DO) FOR MARKING */
+/*                 NODES. */
+
+/*     OUTPUT PARAMETERS - */
+/*        PERM   - THE MINIMUM DEGREE ORDERING. */
+/*        INVP   - THE INVERSE OF PERM. */
+/*        NOFSUB - AN UPPER BOUND ON THE NUMBER OF NONZERO */
+/*                 SUBSCRIPTS FOR THE COMPRESSED STORAGE SCHEME. */
+
+/*     WORKING PARAMETERS - */
+/*        DHEAD  - VECTOR FOR HEAD OF DEGREE LISTS. */
+/*        INVP   - USED TEMPORARILY FOR DEGREE FORWARD LINK. */
+/*        PERM   - USED TEMPORARILY FOR DEGREE BACKWARD LINK. */
+/*        QSIZE  - VECTOR FOR SIZE OF SUPERNODES. */
+/*        LLIST  - VECTOR FOR TEMPORARY LINKED LISTS. */
+/*        MARKER - A TEMPORARY MARKER VECTOR. */
+
+/*     PROGRAM SUBROUTINES - */
+/*        MMDELM, MMDINT, MMDNUM, MMDUPD. */
+
+/* *************************************************************** */
+
+/* Subroutine */ int genmmd_dist_(int_t *neqns, int_t *xadj, int_t *adjncy, 
+	int_t *invp, int_t *perm, int_t *delta, int_t *dhead, 
+	int_t *qsize, int_t *llist, int_t *marker, int_t *maxint, 
+	int_t *nofsub)
+{
+    /* System generated locals */
+    int_t i__1;
+
+    /* Local variables */
+    int_t mdeg, ehead, i, mdlmt, mdnode;
+    extern /* Subroutine */ int mmdelm_dist(int_t *, int_t *, int_t *, 
+	    int_t *, int_t *, int_t *, int_t *, int_t *, 
+	    int_t *, int_t *, int_t *), mmdupd_dist(int_t *, int_t *, 
+	    int_t *, int_t *, int_t *, int_t *, int_t *, int_t 
+	    *, int_t *, int_t *, int_t *, int_t *, int_t *, 
+	    int_t *), mmdint_dist(int_t *, int_t *, int_t *, int_t *, 
+	    int_t *, int_t *, int_t *, int_t *, int_t *), 
+	    mmdnum_dist(int_t *, int_t *, int_t *, int_t *);
+    int_t nextmd, tag, num;
+
+/*        LOCALS ARE AUTOMATIC (NOT STATIC): EACH ONE IS ASSIGNED */
+/*        BEFORE IT IS READ, SO NO STATE IS KEPT BETWEEN CALLS AND */
+/*        THE ROUTINE STAYS REENTRANT. */
+/*        RETURNS 0.  WHEN NEQNS <= 0 IT RETURNS AT ONCE, LEAVING */
+/*        NOFSUB, PERM AND INVP UNTOUCHED. */
+
+    /* Parameter adjustments: shift each pointer so arrays are 1-based */
+    --marker;
+    --llist;
+    --qsize;
+    --dhead;
+    --perm;
+    --invp;
+    --adjncy;
+    --xadj;
+
+    /* Function Body */
+    if (*neqns <= 0) {
+	return 0;
+    }
+
+/*        ------------------------------------------------ */
+/*        INITIALIZATION FOR THE MINIMUM DEGREE ALGORITHM. */
+/*        ------------------------------------------------ */
+    *nofsub = 0;
+    mmdint_dist(neqns, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1],
+		&qsize[1], &llist[1], &marker[1]);
+
+/*        ---------------------------------------------- */
+/*        NUM COUNTS THE NUMBER OF ORDERED NODES PLUS 1. */
+/*        ---------------------------------------------- */
+    num = 1;
+
+/*        ----------------------------- */
+/*        ELIMINATE ALL ISOLATED NODES. */
+/*        ----------------------------- */
+    nextmd = dhead[1];
+L100:
+    if (nextmd <= 0) {
+	goto L200;
+    }
+    mdnode = nextmd;
+    nextmd = invp[mdnode];
+    marker[mdnode] = *maxint;
+    invp[mdnode] = -num;
+    ++num;
+    goto L100;
+
+L200:
+/*        ---------------------------------------- */
+/*        SEARCH FOR NODE OF THE MINIMUM DEGREE. */
+/*        MDEG IS THE CURRENT MINIMUM DEGREE; */
+/*        TAG IS USED TO FACILITATE MARKING NODES. */
+/*        ---------------------------------------- */
+    if (num > *neqns) {
+	goto L1000;
+    }
+    tag = 1;
+    dhead[1] = 0;
+    mdeg = 2;
+L300:
+    if (dhead[mdeg] > 0) {
+	goto L400;
+    }
+    ++mdeg;
+    goto L300;
+L400:
+/*            ------------------------------------------------- */
+/*            USE VALUE OF DELTA TO SET UP MDLMT, WHICH GOVERNS */
+/*            WHEN A DEGREE UPDATE IS TO BE PERFORMED. */
+/*            ------------------------------------------------- */
+    mdlmt = mdeg + *delta;
+    ehead = 0;
+
+L500:
+    mdnode = dhead[mdeg];
+    if (mdnode > 0) {
+	goto L600;
+    }
+    ++mdeg;
+    if (mdeg > mdlmt) {
+	goto L900;
+    }
+    goto L500;
+L600:
+/*                ---------------------------------------- */
+/*                REMOVE MDNODE FROM THE DEGREE STRUCTURE. */
+/*                ---------------------------------------- */
+    nextmd = invp[mdnode];
+    dhead[mdeg] = nextmd;
+    if (nextmd > 0) {
+	perm[nextmd] = -mdeg;
+    }
+    invp[mdnode] = -num;
+    *nofsub = *nofsub + mdeg + qsize[mdnode] - 2;
+    if (num + qsize[mdnode] > *neqns) {
+	goto L1000;
+    }
+/*                ---------------------------------------------- */
+/*                ELIMINATE MDNODE AND PERFORM QUOTIENT GRAPH */
+/*                TRANSFORMATION.  RESET TAG VALUE IF NECESSARY. */
+/*                ---------------------------------------------- */
+    ++tag;
+    if (tag < *maxint) {
+	goto L800;
+    }
+    tag = 1;
+    i__1 = *neqns;
+    for (i = 1; i <= i__1; ++i) {
+	if (marker[i] < *maxint) {
+	    marker[i] = 0;
+	}
+/* L700: */
+    }
+L800:
+    mmdelm_dist(&mdnode, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1],
+		&qsize[1], &llist[1], &marker[1], maxint, &tag);
+    num += qsize[mdnode];
+    llist[mdnode] = ehead;
+    ehead = mdnode;
+    if (*delta >= 0) {
+	goto L500;
+    }
+L900:
+/*            ------------------------------------------- */
+/*            UPDATE DEGREES OF THE NODES INVOLVED IN THE */
+/*            MINIMUM DEGREE NODES ELIMINATION. */
+/*            ------------------------------------------- */
+    if (num > *neqns) {
+	goto L1000;
+    }
+    mmdupd_dist(&ehead, neqns, &xadj[1], &adjncy[1], delta, &mdeg, &dhead[1],
+		&invp[1], &perm[1], &qsize[1], &llist[1], &marker[1], maxint,
+		&tag);
+    goto L300;
+
+L1000:
+    mmdnum_dist(neqns, &perm[1], &invp[1], &qsize[1]);
+    return 0;
+
+} /* genmmd_dist_ */
+
+/* *************************************************************** */
+/* *************************************************************** */
+/* ***     MMDINT ..... MULT MINIMUM DEGREE INITIALIZATION     *** */
+/* *************************************************************** */
+/* *************************************************************** */
+
+/*     AUTHOR - JOSEPH W.H. LIU */
+/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
+
+/*     PURPOSE - THIS ROUTINE PERFORMS INITIALIZATION FOR THE */
+/*        MULTIPLE ELIMINATION VERSION OF THE MINIMUM DEGREE */
+/*        ALGORITHM. */
+
+/*     INPUT PARAMETERS - */
+/*        NEQNS  - NUMBER OF EQUATIONS. */
+/*        (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */
+
+/*     OUTPUT PARAMETERS - */
+/*        (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */
+/*        QSIZE  - SIZE OF SUPERNODE (INITIALIZED TO ONE). */
+/*        LLIST  - LINKED LIST. */
+/*        MARKER - MARKER VECTOR. */
+
+/* *************************************************************** */
+
+/* Subroutine */ int mmdint_dist(int_t *neqns, int_t *xadj, int_t *adjncy, 
+	int_t *dhead, int_t *dforw, int_t *dbakw, int_t *qsize, 
+	int_t *llist, int_t *marker)
+{
+    /* System generated locals */
+    int_t i__1;
+
+    /* Local variables */
+    int_t ndeg, node, fnode;
+
+/*        LOCALS ARE AUTOMATIC (NOT STATIC): EACH ONE IS ASSIGNED */
+/*        BEFORE IT IS READ, SO NO STATE IS KEPT BETWEEN CALLS AND */
+/*        THE ROUTINE STAYS REENTRANT.  ALWAYS RETURNS 0. */
+/*        NODE IS FILED IN LIST XADJ(NODE+1) - XADJ(NODE) + 1, SO */
+/*        LIST 1 HOLDS THE NODES WITH NO ADJACENCY ENTRIES. */
+
+    /* Parameter adjustments: shift each pointer so arrays are 1-based */
+    --marker;
+    --llist;
+    --qsize;
+    --dbakw;
+    --dforw;
+    --dhead;
+    --adjncy;
+    --xadj;
+
+    /* Function Body */
+    i__1 = *neqns;
+    for (node = 1; node <= i__1; ++node) {
+	dhead[node] = 0;
+	qsize[node] = 1;
+	marker[node] = 0;
+	llist[node] = 0;
+/* L100: */
+    }
+/*        ------------------------------------------ */
+/*        INITIALIZE THE DEGREE DOUBLY LINKED LISTS. */
+/*        ------------------------------------------ */
+    i__1 = *neqns;
+    for (node = 1; node <= i__1; ++node) {
+	ndeg = xadj[node + 1] - xadj[node] + 1;
+	fnode = dhead[ndeg];
+	dforw[node] = fnode;
+	dhead[ndeg] = node;
+	if (fnode > 0) {
+	    dbakw[fnode] = node;
+	}
+	dbakw[node] = -ndeg;
+/* L200: */
+    }
+    return 0;
+
+} /* mmdint_dist */
+
+/* *************************************************************** */
+/* *************************************************************** */
+/* **     MMDELM ..... MULTIPLE MINIMUM DEGREE ELIMINATION     *** */
+/* *************************************************************** */
+/* *************************************************************** */
+
+/*     AUTHOR - JOSEPH W.H. LIU */
+/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
+
+/*     PURPOSE - THIS ROUTINE ELIMINATES THE NODE MDNODE OF */
+/*        MINIMUM DEGREE FROM THE ADJACENCY STRUCTURE, WHICH */
+/*        IS STORED IN THE QUOTIENT GRAPH FORMAT.  IT ALSO */
+/*        TRANSFORMS THE QUOTIENT GRAPH REPRESENTATION OF THE */
+/*        ELIMINATION GRAPH. */
+
+/*     INPUT PARAMETERS - */
+/*        MDNODE - NODE OF MINIMUM DEGREE. */
+/*        MAXINT - ESTIMATE OF MAXIMUM REPRESENTABLE (SHORT) */
+/*                 INT. */
+/*        TAG    - TAG VALUE. */
+
+/*     UPDATED PARAMETERS - */
+/*        (XADJ,ADJNCY) - UPDATED ADJACENCY STRUCTURE. */
+/*        (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */
+/*        QSIZE  - SIZE OF SUPERNODE. */
+/*        MARKER - MARKER VECTOR. */
+/*        LLIST  - TEMPORARY LINKED LIST OF ELIMINATED NABORS. */
+
+/* *************************************************************** */
+
+/* Subroutine */ int mmdelm_dist(int_t *mdnode, int_t *xadj, int_t *adjncy,
+	 int_t *dhead, int_t *dforw, int_t *dbakw, int_t *qsize, 
+ 	 int_t *llist, int_t *marker, int_t *maxint, int_t *tag)
+{
+    /* System generated locals */
+    int_t i__1, i__2;
+
+    /* Local variables */
+    int_t node, link, rloc, rlmt, i, j, nabor, rnode, elmnt, xqnbr, 
+	    istop, jstop, istrt, jstrt, nxnode, pvnode, nqnbrs, npv;
+
+/*        LOCALS ARE AUTOMATIC (NOT STATIC): EACH ONE IS ASSIGNED */
+/*        BEFORE IT IS READ, SO NO STATE IS KEPT BETWEEN CALLS AND */
+/*        THE ROUTINE STAYS REENTRANT.  ALWAYS RETURNS 0. */
+/*        IN ADJNCY, A NEGATIVE ENTRY -K CONTINUES THE LIST IN THE */
+/*        STORAGE OF NODE K, AND A ZERO ENTRY ENDS THE LIST. */
+
+/*        ----------------------------------------------- */
+/*        FIND REACHABLE SET AND PLACE IN DATA STRUCTURE. */
+/*        ----------------------------------------------- */
+    /* Parameter adjustments: shift each pointer so arrays are 1-based */
+    --marker;
+    --llist;
+    --qsize;
+    --dbakw;
+    --dforw;
+    --dhead;
+    --adjncy;
+    --xadj;
+
+    /* Function Body */
+    marker[*mdnode] = *tag;
+    istrt = xadj[*mdnode];
+    istop = xadj[*mdnode + 1] - 1;
+/*        ------------------------------------------------------- */
+/*        ELMNT POINTS TO THE BEGINNING OF THE LIST OF ELIMINATED */
+/*        NABORS OF MDNODE, AND RLOC GIVES THE STORAGE LOCATION */
+/*        FOR THE NEXT REACHABLE NODE. */
+/*        ------------------------------------------------------- */
+    elmnt = 0;
+    rloc = istrt;
+    rlmt = istop;
+    i__1 = istop;
+    for (i = istrt; i <= i__1; ++i) {
+	nabor = adjncy[i];
+	if (nabor == 0) {
+	    goto L300;
+	}
+	if (marker[nabor] >= *tag) {
+	    goto L200;
+	}
+	marker[nabor] = *tag;
+	if (dforw[nabor] < 0) {
+	    goto L100;
+	}
+	adjncy[rloc] = nabor;
+	++rloc;
+	goto L200;
+L100:
+	llist[nabor] = elmnt;
+	elmnt = nabor;
+L200:
+	;
+    }
+L300:
+/*            ----------------------------------------------------- */
+/*            MERGE WITH REACHABLE NODES FROM GENERALIZED ELEMENTS. */
+/*            ----------------------------------------------------- */
+    if (elmnt <= 0) {
+	goto L1000;
+    }
+    adjncy[rlmt] = -elmnt;
+    link = elmnt;
+L400:
+    jstrt = xadj[link];
+    jstop = xadj[link + 1] - 1;
+    i__1 = jstop;
+    for (j = jstrt; j <= i__1; ++j) {
+	node = adjncy[j];
+	link = -node;
+	if (node < 0) {
+	    goto L400;
+	} else if (node == 0) {
+	    goto L900;
+	} else {
+	    goto L500;
+	}
+L500:
+	if (marker[node] >= *tag || dforw[node] < 0) {
+	    goto L800;
+	}
+	marker[node] = *tag;
+/*                            --------------------------------- */
+/*                            USE STORAGE FROM ELIMINATED NODES */
+/*                            IF NECESSARY. */
+/*                            --------------------------------- */
+L600:
+	if (rloc < rlmt) {
+	    goto L700;
+	}
+	link = -adjncy[rlmt];
+	rloc = xadj[link];
+	rlmt = xadj[link + 1] - 1;
+	goto L600;
+L700:
+	adjncy[rloc] = node;
+	++rloc;
+L800:
+	;
+    }
+L900:
+    elmnt = llist[elmnt];
+    goto L300;
+L1000:
+    if (rloc <= rlmt) {
+	adjncy[rloc] = 0;
+    }
+/*        -------------------------------------------------------- */
+/*        FOR EACH NODE IN THE REACHABLE SET, DO THE FOLLOWING ... */
+/*        -------------------------------------------------------- */
+    link = *mdnode;
+L1100:
+    istrt = xadj[link];
+    istop = xadj[link + 1] - 1;
+    i__1 = istop;
+    for (i = istrt; i <= i__1; ++i) {
+	rnode = adjncy[i];
+	link = -rnode;
+	if (rnode < 0) {
+	    goto L1100;
+	} else if (rnode == 0) {
+	    goto L1800;
+	} else {
+	    goto L1200;
+	}
+L1200:
+/*                -------------------------------------------- */
+/*                IF RNODE IS IN THE DEGREE LIST STRUCTURE ... */
+/*                -------------------------------------------- */
+	pvnode = dbakw[rnode];
+	if (pvnode == 0 || pvnode == -(*maxint)) {
+	    goto L1300;
+	}
+/*                    ------------------------------------- */
+/*                    THEN REMOVE RNODE FROM THE STRUCTURE. */
+/*                    ------------------------------------- */
+	nxnode = dforw[rnode];
+	if (nxnode > 0) {
+	    dbakw[nxnode] = pvnode;
+	}
+	if (pvnode > 0) {
+	    dforw[pvnode] = nxnode;
+	}
+	npv = -pvnode;
+	if (pvnode < 0) {
+	    dhead[npv] = nxnode;
+	}
+L1300:
+/*                ---------------------------------------- */
+/*                PURGE INACTIVE QUOTIENT NABORS OF RNODE. */
+/*                ---------------------------------------- */
+	jstrt = xadj[rnode];
+	jstop = xadj[rnode + 1] - 1;
+	xqnbr = jstrt;
+	i__2 = jstop;
+	for (j = jstrt; j <= i__2; ++j) {
+	    nabor = adjncy[j];
+	    if (nabor == 0) {
+		goto L1500;
+	    }
+	    if (marker[nabor] >= *tag) {
+		goto L1400;
+	    }
+	    adjncy[xqnbr] = nabor;
+	    ++xqnbr;
+L1400:
+	    ;
+	}
+L1500:
+/*                ---------------------------------------- */
+/*                IF NO ACTIVE NABOR AFTER THE PURGING ... */
+/*                ---------------------------------------- */
+	nqnbrs = xqnbr - jstrt;
+	if (nqnbrs > 0) {
+	    goto L1600;
+	}
+/*                    ----------------------------- */
+/*                    THEN MERGE RNODE WITH MDNODE. */
+/*                    ----------------------------- */
+	qsize[*mdnode] += qsize[rnode];
+	qsize[rnode] = 0;
+	marker[rnode] = *maxint;
+	dforw[rnode] = -(*mdnode);
+	dbakw[rnode] = -(*maxint);
+	goto L1700;
+L1600:
+/*                -------------------------------------- */
+/*                ELSE FLAG RNODE FOR DEGREE UPDATE, AND */
+/*                ADD MDNODE AS A NABOR OF RNODE. */
+/*                -------------------------------------- */
+	dforw[rnode] = nqnbrs + 1;
+	dbakw[rnode] = 0;
+	adjncy[xqnbr] = *mdnode;
+	++xqnbr;
+	if (xqnbr <= jstop) {
+	    adjncy[xqnbr] = 0;
+	}
+
+L1700:
+	;
+    }
+L1800:
+    return 0;
+
+} /* mmdelm_dist */
+
+/* *************************************************************** */
+/* *************************************************************** */
+/* *****     MMDUPD ..... MULTIPLE MINIMUM DEGREE UPDATE     ***** */
+/* *************************************************************** */
+/* *************************************************************** */
+
+/*     AUTHOR - JOSEPH W.H. LIU */
+/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
+
+/*     PURPOSE - THIS ROUTINE UPDATES THE DEGREES OF NODES */
+/*        AFTER A MULTIPLE ELIMINATION STEP. */
+
+/*     INPUT PARAMETERS - */
+/*        EHEAD  - THE BEGINNING OF THE LIST OF ELIMINATED */
+/*                 NODES (I.E., NEWLY FORMED ELEMENTS). */
+/*        NEQNS  - NUMBER OF EQUATIONS. */
+/*        (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */
+/*        DELTA  - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */
+/*        MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) */
+/*                 INTEGER. */
+
+/*     UPDATED PARAMETERS - */
+/*        MDEG   - NEW MINIMUM DEGREE AFTER DEGREE UPDATE. */
+/*        (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */
+/*        QSIZE  - SIZE OF SUPERNODE. */
+/*        LLIST  - WORKING LINKED LIST. */
+/*        MARKER - MARKER VECTOR FOR DEGREE UPDATE. */
+/*        TAG    - TAG VALUE. */
+
+/* *************************************************************** */
+
+/* Subroutine */ int mmdupd_dist(int_t *ehead, int_t *neqns, int_t *xadj, 
+	int_t *adjncy, int_t *delta, int_t *mdeg, int_t *dhead, 
+	int_t *dforw, int_t *dbakw, int_t *qsize, int_t *llist, 
+	int_t *marker, int_t *maxint, int_t *tag)
+{
+    /* System generated locals */
+    int_t i__1, i__2;
+
+    /* Local variables */
+    int_t node, mtag, link, mdeg0, i, j, enode, fnode, nabor, elmnt, 
+	    istop, jstop, q2head, istrt, jstrt, qxhead, iq2, deg, deg0;
+
+/*        LOCALS ARE AUTOMATIC (NOT STATIC): EACH ONE IS ASSIGNED */
+/*        BEFORE IT IS READ, SO NO STATE IS KEPT BETWEEN CALLS AND */
+/*        THE ROUTINE STAYS REENTRANT. */
+/*        RETURNS 0 ONCE THE ELEMENT LIST THAT STARTS AT EHEAD AND */
+/*        IS CHAINED THROUGH LLIST HAS BEEN EXHAUSTED. */
+
+    /* Parameter adjustments: shift each pointer so arrays are 1-based */
+    --marker;
+    --llist;
+    --qsize;
+    --dbakw;
+    --dforw;
+    --dhead;
+    --adjncy;
+    --xadj;
+
+    /* Function Body */
+    mdeg0 = *mdeg + *delta;
+    elmnt = *ehead;
+L100:
+/*            ------------------------------------------------------- */
+/*            FOR EACH NEWLY FORMED ELEMENT, DO THE FOLLOWING. */
+/*            (RESET TAG VALUE IF NECESSARY.) */
+/*            ------------------------------------------------------- */
+    if (elmnt <= 0) {
+	return 0;
+    }
+    mtag = *tag + mdeg0;
+    if (mtag < *maxint) {
+	goto L300;
+    }
+    *tag = 1;
+    i__1 = *neqns;
+    for (i = 1; i <= i__1; ++i) {
+	if (marker[i] < *maxint) {
+	    marker[i] = 0;
+	}
+/* L200: */
+    }
+    mtag = *tag + mdeg0;
+L300:
+/*            --------------------------------------------- */
+/*            CREATE TWO LINKED LISTS FROM NODES ASSOCIATED */
+/*            WITH ELMNT: ONE WITH TWO NABORS (Q2HEAD) IN */
+/*            ADJACENCY STRUCTURE, AND THE OTHER WITH MORE */
+/*            THAN TWO NABORS (QXHEAD).  ALSO COMPUTE DEG0, */
+/*            NUMBER OF NODES IN THIS ELEMENT. */
+/*            --------------------------------------------- */
+    q2head = 0;
+    qxhead = 0;
+    deg0 = 0;
+    link = elmnt;
+L400:
+    istrt = xadj[link];
+    istop = xadj[link + 1] - 1;
+    i__1 = istop;
+    for (i = istrt; i <= i__1; ++i) {
+	enode = adjncy[i];
+	link = -enode;
+	if (enode < 0) {
+	    goto L400;
+	} else if (enode == 0) {
+	    goto L800;
+	} else {
+	    goto L500;
+	}
+
+L500:
+	if (qsize[enode] == 0) {
+	    goto L700;
+	}
+	deg0 += qsize[enode];
+	marker[enode] = mtag;
+/*                        ---------------------------------- */
+/*                        IF ENODE REQUIRES A DEGREE UPDATE, */
+/*                        THEN DO THE FOLLOWING. */
+/*                        ---------------------------------- */
+	if (dbakw[enode] != 0) {
+	    goto L700;
+	}
+/*                            --------------------------------------- */
+/*                            PLACE EITHER IN QXHEAD OR Q2HEAD LISTS: */
+/*                            Q2HEAD WHEN DFORW(ENODE) IS 2, ELSE */
+/*                            QXHEAD.  BOTH ARE CHAINED VIA LLIST. */
+/*                            --------------------------------------- */
+
+	if (dforw[enode] == 2) {
+	    goto L600;
+	}
+	llist[enode] = qxhead;
+	qxhead = enode;
+	goto L700;
+L600:
+	llist[enode] = q2head;
+	q2head = enode;
+L700:
+	;
+    }
+L800:
+/*            -------------------------------------------- */
+/*            FOR EACH ENODE IN Q2 LIST, DO THE FOLLOWING. */
+/*            -------------------------------------------- */
+    enode = q2head;
+    iq2 = 1;
+L900:
+    if (enode <= 0) {
+	goto L1500;
+    }
+    if (dbakw[enode] != 0) {
+	goto L2200;
+    }
+    ++(*tag);
+    deg = deg0;
+/*                    ------------------------------------------ */
+/*                    IDENTIFY THE OTHER ADJACENT ELEMENT NABOR. */
+/*                    ------------------------------------------ */
+    istrt = xadj[enode];
+    nabor = adjncy[istrt];
+    if (nabor == elmnt) {
+	nabor = adjncy[istrt + 1];
+    }
+/*                    ------------------------------------------------ */
+/*                    IF NABOR IS UNELIMINATED, INCREASE DEGREE COUNT. */
+/*                    ------------------------------------------------ */
+    link = nabor;
+    if (dforw[nabor] < 0) {
+	goto L1000;
+    }
+    deg += qsize[nabor];
+    goto L2100;
+L1000:
+/*                        -------------------------------------------- */
+/*                        OTHERWISE, FOR EACH NODE IN THE 2ND ELEMENT, */
+/*                        DO THE FOLLOWING. */
+/*                        -------------------------------------------- */
+    istrt = xadj[link];
+    istop = xadj[link + 1] - 1;
+    i__1 = istop;
+    for (i = istrt; i <= i__1; ++i) {
+	node = adjncy[i];
+	link = -node;
+	if (node == enode) {
+	    goto L1400;
+	}
+	if (node < 0) {
+	    goto L1000;
+	} else if (node == 0) {
+	    goto L2100;
+	} else {
+	    goto L1100;
+	}
+
+L1100:
+	if (qsize[node] == 0) {
+	    goto L1400;
+	}
+	if (marker[node] >= *tag) {
+	    goto L1200;
+	}
+/*                                ------------------------------------- */
+/*                                CASE WHEN NODE IS NOT YET CONSIDERED: */
+/*                                MARK IT WITH TAG AND ADD ITS SIZE TO */
+/*                                THE DEGREE COUNT. */
+/*                                ------------------------------------- */
+
+	marker[node] = *tag;
+	deg += qsize[node];
+	goto L1400;
+L1200:
+/*                            ---------------------------------------- */
+/*                            CASE WHEN NODE IS INDISTINGUISHABLE FROM */
+/*                            ENODE.  MERGE THEM INTO A NEW SUPERNODE. */
+/*                            THIS APPLIES ONLY WHEN NODE STILL AWAITS */
+/*                            A DEGREE UPDATE (DBAKW = 0) AND HAS */
+/*                            DFORW = 2; ITS QSIZE IS ADDED TO ENODE'S */
+/*                            AND NODE IS FLAGGED WITH MAXINT. */
+/*                            ---------------------------------------- */
+	if (dbakw[node] != 0) {
+	    goto L1400;
+	}
+	if (dforw[node] != 2) {
+	    goto L1300;
+	}
+	qsize[enode] += qsize[node];
+	qsize[node] = 0;
+	marker[node] = *maxint;
+	dforw[node] = -enode;
+	dbakw[node] = -(*maxint);
+	goto L1400;
+L1300:
+/*                            -------------------------------------- */
+/*                            CASE WHEN NODE IS OUTMATCHED BY ENODE: */
+/*                            SET DBAKW(NODE) = -MAXINT SO THAT NODE */
+/*                            IS SKIPPED BY THIS DEGREE UPDATE. */
+/*                            -------------------------------------- */
+
+	if (dbakw[node] == 0) {
+	    dbakw[node] = -(*maxint);
+	}
+L1400:
+	;
+    }
+    goto L2100;
+L1500:
+/*                ------------------------------------------------ */
+/*                FOR EACH ENODE IN THE QX LIST, DO THE FOLLOWING. */
+/*                ------------------------------------------------ */
+    enode = qxhead;
+    iq2 = 0;
+L1600:
+    if (enode <= 0) {
+	goto L2300;
+    }
+    if (dbakw[enode] != 0) {
+	goto L2200;
+    }
+    ++(*tag);
+    deg = deg0;
+/*                        --------------------------------- */
+/*                        FOR EACH UNMARKED NABOR OF ENODE, */
+/*                        DO THE FOLLOWING. */
+/*                        --------------------------------- */
+    istrt = xadj[enode];
+    istop = xadj[enode + 1] - 1;
+    i__1 = istop;
+    for (i = istrt; i <= i__1; ++i) {
+	nabor = adjncy[i];
+	if (nabor == 0) {
+	    goto L2100;
+	}
+	if (marker[nabor] >= *tag) {
+	    goto L2000;
+	}
+	marker[nabor] = *tag;
+	link = nabor;
+/*                                ------------------------------ */
+/*                                IF UNELIMINATED, INCLUDE IT IN */
+/*                                DEG COUNT. */
+/*                                ------------------------------ */
+	if (dforw[nabor] < 0) {
+	    goto L1700;
+	}
+	deg += qsize[nabor];
+	goto L2000;
+L1700:
+/*                                    ------------------------------- */
+/*                                    IF ELIMINATED, INCLUDE UNMARKED */
+/*                                    NODES IN THIS ELEMENT INTO THE */
+/*                                    DEGREE COUNT.  A NEGATIVE ENTRY */
+/*                                    CONTINUES THE ELEMENT'S LIST IN */
+/*                                    ANOTHER NODE'S STORAGE, AND A */
+/*                                    ZERO ENTRY ENDS IT. */
+/*                                    ------------------------------- */
+
+	jstrt = xadj[link];
+	jstop = xadj[link + 1] - 1;
+	i__2 = jstop;
+	for (j = jstrt; j <= i__2; ++j) {
+	    node = adjncy[j];
+	    link = -node;
+	    if (node < 0) {
+		goto L1700;
+	    } else if (node == 0) {
+		goto L2000;
+	    } else {
+		goto L1800;
+	    }
+
+L1800:
+	    if (marker[node] >= *tag) {
+		goto L1900;
+	    }
+	    marker[node] = *tag;
+	    deg += qsize[node];
+L1900:
+	    ;
+	}
+L2000:
+	;
+    }
+L2100:
+/*                    ------------------------------------------- */
+/*                    UPDATE EXTERNAL DEGREE OF ENODE IN DEGREE */
+/*                    STRUCTURE, AND MDEG (MIN DEG) IF NECESSARY. */
+/*                    ------------------------------------------- */
+    deg = deg - qsize[enode] + 1;
+    fnode = dhead[deg];
+    dforw[enode] = fnode;
+    dbakw[enode] = -deg;
+    if (fnode > 0) {
+	dbakw[fnode] = enode;
+    }
+    dhead[deg] = enode;
+    if (deg < *mdeg) {
+	*mdeg = deg;
+    }
+L2200:
+/*                    ---------------------------------- */
+/*                    GET NEXT ENODE IN CURRENT ELEMENT. */
+/*                    ---------------------------------- */
+    enode = llist[enode];
+    if (iq2 == 1) {
+	goto L900;
+    }
+    goto L1600;
+L2300:
+/*            ----------------------------- */
+/*            GET NEXT ELEMENT IN THE LIST. */
+/*            ----------------------------- */
+    *tag = mtag;
+    elmnt = llist[elmnt];
+    goto L100;
+
+} /* mmdupd_dist */
+
+/* *************************************************************** */
+/* *************************************************************** */
+/* *****     MMDNUM ..... MULTI MINIMUM DEGREE NUMBERING     ***** */
+/* *************************************************************** */
+/* *************************************************************** */
+
+/*     AUTHOR - JOSEPH W.H. LIU */
+/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
+
+/*     PURPOSE - THIS ROUTINE PERFORMS THE FINAL STEP IN */
+/*        PRODUCING THE PERMUTATION AND INVERSE PERMUTATION */
+/*        VECTORS IN THE MULTIPLE ELIMINATION VERSION OF THE */
+/*        MINIMUM DEGREE ORDERING ALGORITHM. */
+
+/*     INPUT PARAMETERS - */
+/*        NEQNS  - NUMBER OF EQUATIONS. */
+/*        QSIZE  - SIZE OF SUPERNODES AT ELIMINATION. */
+
+/*     UPDATED PARAMETERS - */
+/*        INVP   - INVERSE PERMUTATION VECTOR.  ON INPUT, */
+/*                 IF QSIZE(NODE)=0, THEN NODE HAS BEEN MERGED */
+/*                 INTO THE NODE -INVP(NODE); OTHERWISE, */
+/*                 -INVP(NODE) IS ITS INVERSE LABELLING. */
+
+/*     OUTPUT PARAMETERS - */
+/*        PERM   - THE PERMUTATION VECTOR. */
+
+/* *************************************************************** */
+
+/* Subroutine */ int mmdnum_dist(int_t *neqns, int_t *perm, int_t *invp,
+				 int_t *qsize)
+{
+    /* System generated locals */
+    int_t i__1;
+
+    /* Local variables */
+    int_t node, root, nextf, father, nqsize, num;
+
+/*        LOCALS ARE AUTOMATIC (NOT STATIC): EACH ONE IS ASSIGNED */
+/*        BEFORE IT IS READ, SO NO STATE IS KEPT BETWEEN CALLS AND */
+/*        THE ROUTINE STAYS REENTRANT.  ALWAYS RETURNS 0. */
+/*        ON RETURN INVP(NODE) IS THE POSITION OF NODE IN THE */
+/*        ORDERING AND PERM(INVP(NODE)) = NODE. */
+
+    /* Parameter adjustments: shift each pointer so arrays are 1-based */
+    --qsize;
+    --invp;
+    --perm;
+
+    /* Function Body */
+    i__1 = *neqns;
+    for (node = 1; node <= i__1; ++node) {
+	nqsize = qsize[node];
+	if (nqsize <= 0) {
+	    perm[node] = invp[node];
+	}
+	if (nqsize > 0) {
+	    perm[node] = -invp[node];
+	}
+/* L100: */
+    }
+/*        ------------------------------------------------------ */
+/*        FOR EACH NODE WHICH HAS BEEN MERGED, DO THE FOLLOWING. */
+/*        ------------------------------------------------------ */
+    i__1 = *neqns;
+    for (node = 1; node <= i__1; ++node) {
+	if (perm[node] > 0) {
+	    goto L500;
+	}
+/*                ----------------------------------------- */
+/*                TRACE THE MERGED TREE UNTIL ONE WHICH HAS */
+/*                NOT BEEN MERGED, CALL IT ROOT. */
+/*                ----------------------------------------- */
+	father = node;
+L200:
+	if (perm[father] > 0) {
+	    goto L300;
+	}
+	father = -perm[father];
+	goto L200;
+L300:
+/*                ----------------------- */
+/*                NUMBER NODE AFTER ROOT. */
+/*                ----------------------- */
+	root = father;
+	num = perm[root] + 1;
+	invp[node] = -num;
+	perm[root] = num;
+/*                ------------------------ */
+/*                SHORTEN THE MERGED TREE. */
+/*                ------------------------ */
+	father = node;
+L400:
+	nextf = -perm[father];
+	if (nextf <= 0) {
+	    goto L500;
+	}
+	perm[father] = -root;
+	father = nextf;
+	goto L400;
+L500:
+	;
+    }
+/*        ---------------------- */
+/*        READY TO COMPUTE PERM. */
+/*        ---------------------- */
+    i__1 = *neqns;
+    for (node = 1; node <= i__1; ++node) {
+	num = -invp[node];
+	invp[node] = num;
+	perm[num] = node;
+/* L600: */
+    }
+    return 0;
+
+} /* mmdnum_dist */
+
diff --git a/SRC/old_colamd.c b/SRC/old_colamd.c
new file mode 100644
index 0000000..3b2ed7d
--- /dev/null
+++ b/SRC/old_colamd.c
@@ -0,0 +1,2596 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief An approximate minimum degree column ordering algorithm
+ */
+/* ========================================================================== */
+/* === colamd - a sparse matrix column ordering algorithm =================== */
+/* ========================================================================== */
+
+/*
+    colamd:  An approximate minimum degree column ordering algorithm.
+
+    Purpose:
+
+	Colamd computes a permutation Q such that the Cholesky factorization of
+	(AQ)'(AQ) has less fill-in and requires fewer floating point operations
+	than A'A.  This also provides a good ordering for sparse partial
+	pivoting methods, P(AQ) = LU, where Q is computed prior to numerical
+	factorization, and P is computed during numerical factorization via
+	conventional partial pivoting with row interchanges.  Colamd is the
+	column ordering method used in SuperLU, part of the ScaLAPACK library.
+	It is also available as user-contributed software for Matlab 5.2,
+	available from MathWorks, Inc. (http://www.mathworks.com).  This
+	routine can be used in place of COLMMD in Matlab.  By default, the \
+	and / operators in Matlab perform a column ordering (using COLMMD)
+	prior to LU factorization using sparse partial pivoting, in the
+	built-in Matlab LU(A) routine.
+
+    Authors:
+
+	The authors of the code itself are Stefan I. Larimore and Timothy A.
+	Davis (davis at cise.ufl.edu), University of Florida.  The algorithm was
+	developed in collaboration with John Gilbert, Xerox PARC, and Esmond
+	Ng, Oak Ridge National Laboratory.
+
+    Date:
+
+	August 3, 1998.  Version 1.0.
+
+    Acknowledgements:
+
+	This work was supported by the National Science Foundation, under
+	grants DMS-9504974 and DMS-9803599.
+
+    Notice:
+
+	Copyright (c) 1998 by the University of Florida.  All Rights Reserved.
+
+	THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+	EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+
+	Permission is hereby granted to use or copy this program for any
+	purpose, provided the above notices are retained on all copies.
+	User documentation of any code that uses this code must cite the
+	Authors, the Copyright, and "Used by permission."  If this code is
+	accessible from within Matlab, then typing "help colamd" or "colamd"
+	(with no arguments) must cite the Authors.  Permission to modify the
+	code and to distribute modified code is granted, provided the above
+	notices are retained, and a notice that the code was modified is
+	included with the above copyright notice.  You must also retain the
+	Availability information below, of the original version.
+
+	This software is provided free of charge.
+
+    Availability:
+
+	This file is located at
+
+		http://www.cise.ufl.edu/~davis/colamd/colamd.c
+
+	The colamd.h file is required, located in the same directory.
+	The colamdmex.c file provides a Matlab interface for colamd.
+	The symamdmex.c file provides a Matlab interface for symamd, which is
+	a symmetric ordering based on this code, colamd.c.  All codes are
+	purely ANSI C compliant (they use no Unix-specific routines, include
+	files, etc.).
+*/
+
+/* ========================================================================== */
+/* === Description of user-callable routines ================================ */
+/* ========================================================================== */
+
+/*
+    Each user-callable routine (declared as PUBLIC) is briefly described below.
+    Refer to the comments preceding each routine for more details.
+
+    ----------------------------------------------------------------------------
+    colamd_recommended:
+    ----------------------------------------------------------------------------
+
+	Usage:
+
+	    Alen = colamd_recommended (nnz, n_row, n_col) ;
+
+	Purpose:
+
+	    Returns recommended value of Alen for use by colamd.  Returns -1
+	    if any input argument is negative.
+
+	Arguments:
+
+	    int nnz ;		Number of nonzeros in the matrix A.  This must
+				be the same value as p [n_col] in the call to
+				colamd - otherwise you will get a wrong value
+				of the recommended memory to use.
+	    int n_row ;		Number of rows in the matrix A.
+	    int n_col ;		Number of columns in the matrix A.
+
+    ----------------------------------------------------------------------------
+    colamd_set_defaults:
+    ----------------------------------------------------------------------------
+
+	Usage:
+
+	    colamd_set_defaults (knobs) ;
+
+	Purpose:
+
+	    Sets the default parameters.
+
+	Arguments:
+
+	    double knobs [COLAMD_KNOBS] ;	Output only.
+
+		Rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) entries
+		are removed prior to ordering.  Columns with more than
+		(knobs [COLAMD_DENSE_COL] * n_row) entries are removed
+		prior to ordering, and placed last in the output column
+		ordering.  Default values of these two knobs are both 0.5.
+		Currently, only knobs [0] and knobs [1] are used, but future
+		versions may use more knobs.  If so, they will be properly set
+		to their defaults by the future version of colamd_set_defaults,
+		so that the code that calls colamd will not need to change,
+		assuming that you either use colamd_set_defaults, or pass a
+		(double *) NULL pointer as the knobs array to colamd.
+
+    ----------------------------------------------------------------------------
+    colamd:
+    ----------------------------------------------------------------------------
+
+	Usage:
+
+	    colamd (n_row, n_col, Alen, A, p, knobs) ;
+
+	Purpose:
+
+	    Computes a column ordering (Q) of A such that P(AQ)=LU or
+	    (AQ)'AQ=LL' have less fill-in and require fewer floating point
+	    operations than factorizing the unpermuted matrix A or A'A,
+	    respectively.
+
+	Arguments:
+
+	    int n_row ;
+
+		Number of rows in the matrix A.
+		Restriction:  n_row >= 0.
+		Colamd returns FALSE if n_row is negative.
+
+	    int n_col ;
+
+		Number of columns in the matrix A.
+		Restriction:  n_col >= 0.
+		Colamd returns FALSE if n_col is negative.
+
+	    int Alen ;
+
+		Restriction (see note):
+		Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col + COLAMD_STATS
+		Colamd returns FALSE if these conditions are not met.
+
+		Note:  this restriction makes a modest assumption regarding
+		the size of the two typedef'd structures, below.  We do,
+		however, guarantee that
+		Alen >= colamd_recommended (nnz, n_row, n_col)
+		will be sufficient.
+
+	    int A [Alen] ;	Input argument, stats on output.
+
+		A is an integer array of size Alen.  Alen must be at least as
+		large as the bare minimum value given above, but this is very
+		low, and can result in excessive run time.  For best
+		performance, we recommend that Alen be greater than or equal to
+		colamd_recommended (nnz, n_row, n_col), which adds
+		nnz/5 to the bare minimum value given above.
+
+		On input, the row indices of the entries in column c of the
+		matrix are held in A [(p [c]) ... (p [c+1]-1)].  The row indices
+		in a given column c need not be in ascending order, and
+		duplicate row indices may be present.  However, colamd will
+		work a little faster if both of these conditions are met
+		(Colamd puts the matrix into this format, if it finds that the
+		conditions are not met).
+
+		The matrix is 0-based.  That is, rows are in the range 0 to
+		n_row-1, and columns are in the range 0 to n_col-1.  Colamd
+		returns FALSE if any row index is out of range.
+
+		The contents of A are modified during ordering, and are thus
+		undefined on output with the exception of a few statistics
+		about the ordering (A [0..COLAMD_STATS-1]):
+		A [0]:  number of dense or empty rows ignored.
+		A [1]:  number of dense or empty columns ignored (and ordered
+			last in the output permutation p)
+		A [2]:  number of garbage collections performed.
+		A [3]:  0, if all row indices in each column were in sorted
+			  order, and no duplicates were present.
+			1, otherwise (in which case colamd had to do more work)
+		Note that a row can become "empty" if it contains only
+		"dense" and/or "empty" columns, and similarly a column can
+		become "empty" if it only contains "dense" and/or "empty" rows.
+		Future versions may return more statistics in A, but the usage
+		of these 4 entries in A will remain unchanged.
+
+	    int p [n_col+1] ;	Both input and output argument.
+
+		p is an integer array of size n_col+1.  On input, it holds the
+		"pointers" for the column form of the matrix A.  Column c of
+		the matrix A is held in A [(p [c]) ... (p [c+1]-1)].  The first
+		entry, p [0], must be zero, and p [c] <= p [c+1] must hold
+		for all c in the range 0 to n_col-1.  The value p [n_col] is
+		thus the total number of entries in the pattern of the matrix A.
+		Colamd returns FALSE if these conditions are not met.
+
+		On output, if colamd returns TRUE, the array p holds the column
+		permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is
+		the first column index in the new ordering, and p [n_col-1] is
+		the last.  That is, p [k] = j means that column j of A is the
+		kth pivot column, in AQ, where k is in the range 0 to n_col-1
+		(p [0] = j means that column j of A is the first column in AQ).
+
+		If colamd returns FALSE, then no permutation is returned, and
+		p is undefined on output.
+
+	    double knobs [COLAMD_KNOBS] ;	Input only.
+
+		See colamd_set_defaults for a description.  If the knobs array
+		is not present (that is, if a (double *) NULL pointer is passed
+		in its place), then the default values of the parameters are
+		used instead.
+
+*/
+
+
+/* ========================================================================== */
+/* === Include files ======================================================== */
+/* ========================================================================== */
+
+/* limits.h:  the largest positive integer (INT_MAX) */
+#include <limits.h>
+
+/* colamd.h:  knob array size, stats output size, and global prototypes */
+#include "colamd.h"
+
+/* ========================================================================== */
+/* === Scaffolding code definitions  ======================================== */
+/* ========================================================================== */
+
+/* Ensure that debugging is turned off: */
+#ifndef NDEBUG
+#define NDEBUG
+#endif
+
+/* assert.h:  the assert macro (no debugging if NDEBUG is defined) */
+#include <assert.h>
+
+/*
+   Our "scaffolding code" philosophy:  In our opinion, well-written library
+   code should keep its "debugging" code, and just normally have it turned off
+   by the compiler so as not to interfere with performance.  This serves
+   several purposes:
+
+   (1) assertions act as comments to the reader, telling you what the code
+	expects at that point.  All assertions will always be true (unless
+	there really is a bug, of course).
+
+   (2) leaving in the scaffolding code assists anyone who would like to modify
+	the code, or understand the algorithm (by reading the debugging output,
+	one can get a glimpse into what the code is doing).
+
+   (3) (gasp!) for actually finding bugs.  This code has been heavily tested
+	and "should" be fully functional and bug-free ... but you never know...
+
+    To enable debugging, comment out the "#define NDEBUG" above.  The code will
+    become outrageously slow when debugging is enabled.  To control the level of
+    debugging output, set an environment variable D to 0 (little), 1 (some),
+    2, 3, or 4 (lots).
+*/
+
+/* ========================================================================== */
+/* === Row and Column structures ============================================ */
+/* ========================================================================== */
+
+typedef struct ColInfo_struct
+{			/* per-column record; unions overlay fields by column state */
+    int start ;		/* index for A of first row in this column, or DEAD */
+			/* if column is dead */
+    int length ;	/* number of rows in this column */
+    union
+    {
+	int thickness ;	/* number of original columns represented by this */
+			/* col, if the column is alive */
+	int parent ;	/* parent in parent tree super-column structure, if */
+			/* the column is dead */
+    } shared1 ;
+    union
+    {
+	int score ;	/* the score used to maintain heap, if col is alive */
+	int order ;	/* pivot ordering of this column, if col is dead */
+    } shared2 ;
+    union
+    {
+	int headhash ;	/* head of a hash bucket, if col is at the head of */
+			/* a degree list */
+	int hash ;	/* hash value, if col is not in a degree list */
+	int prev ;	/* previous column in degree list, if col is in a */
+			/* degree list (but not at the head of a degree list) */
+    } shared3 ;
+    union
+    {
+	int degree_next ;	/* next column, if col is in a degree list */
+	int hash_next ;		/* next column, if col is in a hash list */
+    } shared4 ;
+
+} ColInfo ;
+
+typedef struct RowInfo_struct
+{			/* per-row record; unions overlay fields by phase of use */
+    int start ;		/* index for A of first col in this row */
+    int length ;	/* number of principal columns in this row */
+    union
+    {
+	int degree ;	/* number of principal & non-principal columns in row */
+	int p ;		/* used as a row pointer in init_rows_cols () */
+    } shared1 ;
+    union
+    {
+	int mark ;	/* for computing set differences and marking dead rows*/
+	int first_column ;/* first column in row (used in garbage collection) */
+    } shared2 ;
+
+} RowInfo ;
+
+/* ========================================================================== */
+/* === Definitions ========================================================== */
+/* ========================================================================== */
+
+#define MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+
+#define ONES_COMPLEMENT(r) (-(r)-1)
+
+#define TRUE	(1)
+#define FALSE	(0)
+#define EMPTY	(-1)
+
+/* Row and column status */
+#define ALIVE	(0)
+#define DEAD	(-1)
+
+/* Column status */
+#define DEAD_PRINCIPAL		(-1)
+#define DEAD_NON_PRINCIPAL	(-2)
+
+/* Macros for row and column status update and checking. */
+#define ROW_IS_DEAD(r)			ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
+#define ROW_IS_MARKED_DEAD(row_mark)	(row_mark < ALIVE)
+#define ROW_IS_ALIVE(r)			(Row [r].shared2.mark >= ALIVE)
+#define COL_IS_DEAD(c)			(Col [c].start < ALIVE)
+#define COL_IS_ALIVE(c)			(Col [c].start >= ALIVE)
+#define COL_IS_DEAD_PRINCIPAL(c)	(Col [c].start == DEAD_PRINCIPAL)
+#define KILL_ROW(r)			{ Row [r].shared2.mark = DEAD ; }
+#define KILL_PRINCIPAL_COL(c)		{ Col [c].start = DEAD_PRINCIPAL ; }
+#define KILL_NON_PRINCIPAL_COL(c)	{ Col [c].start = DEAD_NON_PRINCIPAL ; }
+
+/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */
+#define PUBLIC
+#define PRIVATE static
+
+/* ========================================================================== */
+/* === Prototypes of PRIVATE routines ======================================= */
+/* ========================================================================== */
+
+PRIVATE int init_rows_cols
+(
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A [],
+    int p []
+) ;
+
+PRIVATE void init_scoring
+(
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A [],
+    int head [],
+    double knobs [COLAMD_KNOBS],
+    int *p_n_row2,
+    int *p_n_col2,
+    int *p_max_deg
+) ;
+
+PRIVATE int find_ordering
+(
+    int n_row,
+    int n_col,
+    int Alen,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A [],
+    int head [],
+    int n_col2,
+    int max_deg,
+    int pfree
+) ;
+
+PRIVATE void order_children
+(
+    int n_col,
+    ColInfo Col [],
+    int p []
+) ;
+
+PRIVATE void detect_super_cols
+(
+#ifndef NDEBUG
+    int n_col,
+    RowInfo Row [],
+#endif
+    ColInfo Col [],
+    int A [],
+    int head [],
+    int row_start,
+    int row_length
+) ;
+
+PRIVATE int garbage_collection
+(
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A [],
+    int *pfree
+) ;
+
+PRIVATE int clear_mark
+(
+    int n_row,
+    RowInfo Row []
+) ;
+
+/* ========================================================================== */
+/* === Debugging definitions ================================================ */
+/* ========================================================================== */
+
+#ifndef NDEBUG
+
+/* === With debugging ======================================================= */
+
+/* stdlib.h: for getenv and atoi, to get debugging level from environment */
+#include <stdlib.h>
+
+/* stdio.h:  for printf (no printing if debugging is turned off) */
+#include <stdio.h>
+
+PRIVATE void debug_deg_lists
+(
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int head [],
+    int min_score,
+    int should,
+    int max_deg
+) ;
+
+PRIVATE void debug_mark
+(
+    int n_row,
+    RowInfo Row [],
+    int tag_mark,
+    int max_mark
+) ;
+
+PRIVATE void debug_matrix
+(
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A []
+) ;
+
+PRIVATE void debug_structures
+(
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A [],
+    int n_col2
+) ;
+
+/* the following is the *ONLY* global variable in this file, and is only */
+/* present when debugging */
+
+PRIVATE int debug_colamd ;	/* debug print level */
+
+#define DEBUG0(params) { (void) printf params ; }
+#define DEBUG1(params) { if (debug_colamd >= 1) (void) printf params ; }
+#define DEBUG2(params) { if (debug_colamd >= 2) (void) printf params ; }
+#define DEBUG3(params) { if (debug_colamd >= 3) (void) printf params ; }
+#define DEBUG4(params) { if (debug_colamd >= 4) (void) printf params ; }
+
+#else
+
+/* === No debugging ========================================================= */
+
+#define DEBUG0(params) ;
+#define DEBUG1(params) ;
+#define DEBUG2(params) ;
+#define DEBUG3(params) ;
+#define DEBUG4(params) ;
+
+#endif
+
+/* ========================================================================== */
+
+
+/* ========================================================================== */
+/* === USER-CALLABLE ROUTINES: ============================================== */
+/* ========================================================================== */
+
+
+/* ========================================================================== */
+/* === colamd_recommended =================================================== */
+/* ========================================================================== */
+
+/*
+    The colamd_recommended routine returns the suggested size for Alen.  This
+    value has been determined to provide good balance between the number of
+    garbage collections and the memory requirements for colamd.
+*/
+
+PUBLIC int colamd_recommended	/* returns recommended value of Alen. */
+(
+    /* === Parameters ======================================================= */
+
+    int nnz,			/* number of nonzeros in A */
+    int n_row,			/* number of rows in A */
+    int n_col			/* number of columns in A */
+)
+{
+    /* === Local variables ================================================== */
+
+    int minimum ;		/* bare minimum requirements */
+    int recommended ;		/* recommended value of Alen */
+
+    if (nnz < 0 || n_row < 0 || n_col < 0)
+    {
+	/* return -1 if any input argument is negative */
+	DEBUG0 (("colamd_recommended error!")) ;
+	DEBUG0 ((" nnz: %d, n_row: %d, n_col: %d\n", nnz, n_row, n_col)) ;
+	return (-1) ;
+    }
+
+    minimum =			/* NOTE(review): no overflow check on this sum */
+	2 * (nnz)		/* for A (column form plus row form) */
+	+ (((n_col) + 1) * sizeof (ColInfo) / sizeof (int))	/* for Col */
+	+ (((n_row) + 1) * sizeof (RowInfo) / sizeof (int))	/* for Row */
+	+ n_col			/* minimum elbow room to guarantee success */
+	+ COLAMD_STATS ;	/* for output statistics */
+
+    /* recommended is equal to the minimum plus enough memory (nnz/5) to */
+    /* keep the number of garbage collections low */
+    recommended = minimum + nnz/5 ;
+
+    return (recommended) ;
+}
+
+
+/* ========================================================================== */
+/* === colamd_set_defaults ================================================== */
+/* ========================================================================== */
+
+/*
+    The colamd_set_defaults routine sets the default values of the user-
+    controllable parameters for colamd:
+
+	knobs [0]	rows with knobs[0]*n_col entries or more are removed
+			prior to ordering.
+
+	knobs [1]	columns with knobs[1]*n_row entries or more are removed
+			prior to ordering, and placed last in the column
+			permutation.
+
+	knobs [2..19]	unused, but future versions might use this
+*/
+
+PUBLIC void colamd_set_defaults
+(
+    /* === Parameters ======================================================= */
+
+    double knobs [COLAMD_KNOBS]		/* knob array (output); may be NULL */
+)
+{
+    /* === Local variables ================================================== */
+
+    int i ;				/* knob index */
+
+    if (!knobs)
+    {
+	return ;			/* NULL array: nothing to initialize */
+    }
+    for (i = 0 ; i < COLAMD_KNOBS ; i++)
+    {
+	knobs [i] = 0 ;			/* zero all knobs, used or not */
+    }
+    knobs [COLAMD_DENSE_ROW] = 0.5 ;	/* ignore rows over 50% dense */
+    knobs [COLAMD_DENSE_COL] = 0.5 ;	/* ignore columns over 50% dense */
+}
+
+
+/* ========================================================================== */
+/* === colamd =============================================================== */
+/* ========================================================================== */
+
+/*
+    The colamd routine computes a column ordering Q of a sparse matrix
+    A such that the LU factorization P(AQ) = LU remains sparse, where P is
+    selected via partial pivoting.   The routine can also be viewed as
+    providing a permutation Q such that the Cholesky factorization
+    (AQ)'(AQ) = LL' remains sparse.
+
+    On input, the nonzero patterns of the columns of A are stored in the
+    array A, in order 0 to n_col-1.  A is held in 0-based form (rows in the
+    range 0 to n_row-1 and columns in the range 0 to n_col-1).  Row indices
+    for column c are located in A [(p [c]) ... (p [c+1]-1)], where p [0] = 0,
+    and thus p [n_col] is the number of entries in A.  The matrix is
+    destroyed on output.  The row indices within each column do not have to
+    be sorted (from small to large row indices), and duplicate row indices
+    may be present.  However, colamd will work a little faster if columns are
+    sorted and no duplicates are present.  Matlab 5.2 always passes the matrix
+    with sorted columns, and no duplicates.
+
+    The integer array A is of size Alen.  Alen must be at least of size
+    (where nnz is the number of entries in A):
+
+	nnz			for the input column form of A
+	+ nnz			for a row form of A that colamd generates
+	+ 6*(n_col+1)		for a ColInfo Col [0..n_col] array
+				(this assumes sizeof (ColInfo) is 6 int's).
+	+ 4*(n_row+1)		for a RowInfo Row [0..n_row] array
+				(this assumes sizeof (RowInfo) is 4 int's).
+	+ elbow_room		must be at least n_col.  We recommend at least
+				nnz/5 in addition to that.  If sufficient,
+				changes in the elbow room affect the ordering
+				time only, not the ordering itself.
+	+ COLAMD_STATS		for the output statistics
+
+    Colamd returns FALSE if memory is insufficient, or TRUE otherwise.
+
+    On input, the caller must specify:
+
+	n_row			the number of rows of A
+	n_col			the number of columns of A
+	Alen			the size of the array A
+	A [0 ... nnz-1]		the row indices, where nnz = p [n_col]
+	A [nnz ... Alen-1]	(need not be initialized by the user)
+	p [0 ... n_col]		the column pointers,  p [0] = 0, and p [n_col]
+				is the number of entries in A.  Column c of A
+				is stored in A [p [c] ... p [c+1]-1].
+	knobs [0 ... 19]	a set of parameters that control the behavior
+				of colamd.  If knobs is a NULL pointer the
+				defaults are used.  The user-callable
+				colamd_set_defaults routine sets the default
+				parameters.  See that routine for a description
+				of the user-controllable parameters.
+
+    If the return value of Colamd is TRUE, then on output:
+
+	p [0 ... n_col-1]	the column permutation. p [0] is the first
+				column index, and p [n_col-1] is the last.
+				That is, p [k] = j means that column j of A
+				is the kth column of AQ.
+
+	A			is undefined on output (the matrix pattern is
+				destroyed), except for the following statistics:
+
+	A [0]			the number of dense (or empty) rows ignored
+	A [1]			the number of dense (or empty) columns.  These
+				are ordered last, in their natural order.
+	A [2]			the number of garbage collections performed.
+				If this is excessive, then you would have
+				gotten your results faster if Alen was larger.
+	A [3]			0, if all row indices in each column were in
+				sorted order and no duplicates were present.
+				1, if there were unsorted or duplicate row
+				indices in the input.  You would have gotten
+				your results faster if A [3] was returned as 0.
+
+    If the return value of Colamd is FALSE, then A and p are undefined on
+    output.
+*/
+
+PUBLIC int colamd		/* returns TRUE if successful, else FALSE */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows in A */
+    int n_col,			/* number of columns in A */
+    int Alen,			/* length of A */
+    int A [],			/* row indices of A */
+    int p [],			/* pointers to columns in A */
+    double knobs [COLAMD_KNOBS]	/* parameters (uses defaults if NULL) */
+)
+{
+    /* === Local variables ================================================== */
+
+    int i ;			/* loop index */
+    int nnz ;			/* nonzeros in A */
+    int Row_size ;		/* size of Row [], in integers */
+    int Col_size ;		/* size of Col [], in integers */
+    int elbow_room ;		/* remaining free space */
+    RowInfo *Row ;		/* pointer into A of Row [0..n_row] array */
+    ColInfo *Col ;		/* pointer into A of Col [0..n_col] array */
+    int n_col2 ;		/* number of non-dense, non-empty columns */
+    int n_row2 ;		/* number of non-dense, non-empty rows */
+    int ngarbage ;		/* number of garbage collections performed */
+    int max_deg ;		/* maximum row degree */
+    double default_knobs [COLAMD_KNOBS] ;	/* used when knobs is NULL */
+    int init_result ;		/* return code from initialization */
+
+#ifndef NDEBUG
+    debug_colamd = 0 ;		/* no debug printing */
+    /* get "D" environment variable, which gives the debug printing level */
+    if (getenv ("D")) debug_colamd = atoi (getenv ("D")) ;
+    DEBUG0 (("debug version, D = %d (THIS WILL BE SLOOOOW!)\n", debug_colamd)) ;
+#endif
+
+    /* === Check the input arguments ======================================== */
+
+    if (n_row < 0 || n_col < 0 || !A || !p)
+    {
+	/* n_row and n_col must be non-negative, A and p must be present */
+	DEBUG0 (("colamd error! %d %d %d\n", n_row, n_col, Alen)) ;
+	return (FALSE) ;
+    }
+    nnz = p [n_col] ;		/* last column pointer is the entry count */
+    if (nnz < 0 || p [0] != 0)
+    {
+	/* nnz must be non-negative, and p [0] must be zero */
+	DEBUG0 (("colamd error! %d %d\n", nnz, p [0])) ;
+	return (FALSE) ;
+    }
+
+    /* === If no knobs, set default parameters ============================== */
+
+    if (!knobs)
+    {
+	knobs = default_knobs ;
+	colamd_set_defaults (knobs) ;
+    }
+
+    /* === Allocate the Row and Col arrays from array A ===================== */
+
+    Col_size = (n_col + 1) * sizeof (ColInfo) / sizeof (int) ;
+    Row_size = (n_row + 1) * sizeof (RowInfo) / sizeof (int) ;
+    elbow_room = Alen - (2*nnz + Col_size + Row_size) ;
+    if (elbow_room < n_col + COLAMD_STATS)
+    {
+	/* not enough space in array A to perform the ordering */
+	DEBUG0 (("colamd error! elbow_room %d, %d\n", elbow_room,n_col)) ;
+	return (FALSE) ;
+    }
+    Alen = 2*nnz + elbow_room ;	/* shrink Alen; Col and Row sit past it in A */
+    Col  = (ColInfo *) &A [Alen] ;
+    Row  = (RowInfo *) &A [Alen + Col_size] ;
+
+    /* === Construct the row and column data structures ===================== */
+
+    init_result = init_rows_cols (n_row, n_col, Row, Col, A, p) ;
+    if (init_result == -1)
+    {
+	/* input matrix is invalid */
+	DEBUG0 (("colamd error! matrix invalid\n")) ;
+	return (FALSE) ;
+    }
+
+    /* === Initialize scores, kill dense rows/columns ======================= */
+
+    init_scoring (n_row, n_col, Row, Col, A, p, knobs,
+	&n_row2, &n_col2, &max_deg) ;	/* p is passed as the head [] array */
+
+    /* === Order the supercolumns =========================================== */
+
+    ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p,
+	n_col2, max_deg, 2*nnz) ;	/* 2*nnz is passed as pfree */
+
+    /* === Order the non-principal columns ================================== */
+
+    order_children (n_col, Col, p) ;
+
+    /* === Return statistics in A =========================================== */
+
+    for (i = 0 ; i < COLAMD_STATS ; i++)
+    {
+	A [i] = 0 ;
+    }
+    A [COLAMD_DENSE_ROW] = n_row - n_row2 ;
+    A [COLAMD_DENSE_COL] = n_col - n_col2 ;
+    A [COLAMD_DEFRAG_COUNT] = ngarbage ;
+    A [COLAMD_JUMBLED_COLS] = init_result ;	/* -1 already returned above */
+
+    return (TRUE) ;
+}
+
+
+/* ========================================================================== */
+/* === NON-USER-CALLABLE ROUTINES: ========================================== */
+/* ========================================================================== */
+
+/* There are no user-callable routines beyond this point in the file */
+
+
+/* ========================================================================== */
+/* === init_rows_cols ======================================================= */
+/* ========================================================================== */
+
+/*
+    Takes the column form of the matrix in A and creates the row form of the
+    matrix.  Also, row and column attributes are stored in the Col and Row
+    structs.  If the columns are un-sorted or contain duplicate row indices,
+    this routine will also sort and remove duplicate row indices from the
+    column form of the matrix.  Returns -1 on error, 1 if columns jumbled,
+    or 0 if columns not jumbled.  Not user-callable.
+*/
+
+PRIVATE int init_rows_cols	/* returns status code */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows of A */
+    int n_col,			/* number of columns of A */
+    RowInfo Row [],		/* of size n_row+1 */
+    ColInfo Col [],		/* of size n_col+1 */
+    int A [],			/* row indices of A, of size Alen */
+    int p []			/* pointers to columns in A, of size n_col+1 */
+)
+{
+    /* === Local variables ================================================== */
+
+    int col ;			/* a column index */
+    int row ;			/* a row index */
+    int *cp ;			/* a column pointer */
+    int *cp_end ;		/* a pointer to the end of a column */
+    int *rp ;			/* a row pointer */
+    int *rp_end ;		/* a pointer to the end of a row */
+    int last_start ;		/* start index of previous column in A */
+    int start ;			/* start index of column in A */
+    int last_row ;		/* previous row */
+    int jumbled_columns ;	/* indicates if columns are jumbled */
+
+    /* === Initialize columns, and check column pointers ==================== */
+
+    last_start = 0 ;
+    for (col = 0 ; col < n_col ; col++)
+    {
+	start = p [col] ;
+	if (start < last_start)
+	{
+	    /* column pointers must be non-decreasing */
+	    DEBUG0 (("colamd error!  last p %d p [col] %d\n",last_start,start));
+	    return (-1) ;
+	}
+	Col [col].start = start ;
+	Col [col].length = p [col+1] - start ;
+	Col [col].shared1.thickness = 1 ;
+	Col [col].shared2.score = 0 ;
+	Col [col].shared3.prev = EMPTY ;
+	Col [col].shared4.degree_next = EMPTY ;
+	last_start = start ;
+    }
+    /* must check the end pointer for last column */
+    if (p [n_col] < last_start)
+    {
+	/* column pointers must be non-decreasing */
+	DEBUG0 (("colamd error!  last p %d p [n_col] %d\n",p[col],last_start)) ;
+	return (-1) ;
+    }
+
+    /* p [0..n_col] no longer needed, used as "head" in subsequent routines */
+
+    /* === Scan columns, compute row degrees, and check row indices ========= */
+
+    jumbled_columns = FALSE ;
+
+    for (row = 0 ; row < n_row ; row++)
+    {
+	Row [row].length = 0 ;
+	Row [row].shared2.mark = -1 ;
+    }
+
+    for (col = 0 ; col < n_col ; col++)
+    {
+	last_row = -1 ;
+
+	cp = &A [p [col]] ;
+	cp_end = &A [p [col+1]] ;
+
+	while (cp < cp_end)
+	{
+	    row = *cp++ ;
+
+	    /* make sure row indices within range */
+	    if (row < 0 || row >= n_row)
+	    {
+		DEBUG0 (("colamd error!  col %d row %d last_row %d\n",
+			 col, row, last_row)) ;
+		return (-1) ;
+	    }
+	    else if (row <= last_row)
+	    {
+		/* row indices are not sorted or repeated, thus cols */
+		/* are jumbled */
+		jumbled_columns = TRUE ;
+	    }
+	    /* prevent repeated row from being counted */
+	    if (Row [row].shared2.mark != col)
+	    {
+		Row [row].length++ ;
+		Row [row].shared2.mark = col ;
+		last_row = row ;
+	    }
+	    else
+	    {
+		/* this is a repeated entry in the column, */
+		/* it will be removed */
+		Col [col].length-- ;
+	    }
+	}
+    }
+
+    /* === Compute row pointers ============================================= */
+
+    /* row form of the matrix starts directly after the column */
+    /* form of matrix in A */
+    Row [0].start = p [n_col] ;
+    Row [0].shared1.p = Row [0].start ;
+    Row [0].shared2.mark = -1 ;
+    for (row = 1 ; row < n_row ; row++)
+    {
+	Row [row].start = Row [row-1].start + Row [row-1].length ;
+	Row [row].shared1.p = Row [row].start ;
+	Row [row].shared2.mark = -1 ;
+    }
+
+    /* === Create row form ================================================== */
+
+    if (jumbled_columns)
+    {
+	/* if cols jumbled, watch for repeated row indices */
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    cp = &A [p [col]] ;
+	    cp_end = &A [p [col+1]] ;
+	    while (cp < cp_end)
+	    {
+		row = *cp++ ;
+		if (Row [row].shared2.mark != col)
+		{
+		    A [(Row [row].shared1.p)++] = col ;
+		    Row [row].shared2.mark = col ;
+		}
+	    }
+	}
+    }
+    else
+    {
+	/* if cols not jumbled, we don't need the mark (this is faster) */
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    cp = &A [p [col]] ;
+	    cp_end = &A [p [col+1]] ;
+	    while (cp < cp_end)
+	    {
+		A [(Row [*cp++].shared1.p)++] = col ;
+	    }
+	}
+    }
+
+    /* === Clear the row marks and set row degrees ========================== */
+
+    for (row = 0 ; row < n_row ; row++)
+    {
+	Row [row].shared2.mark = 0 ;
+	Row [row].shared1.degree = Row [row].length ;
+    }
+
+    /* === See if we need to re-create columns ============================== */
+
+    if (jumbled_columns)
+    {
+
+#ifndef NDEBUG
+	/* make sure column lengths are correct */
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    p [col] = Col [col].length ;
+	}
+	for (row = 0 ; row < n_row ; row++)
+	{
+	    rp = &A [Row [row].start] ;
+	    rp_end = rp + Row [row].length ;
+	    while (rp < rp_end)
+	    {
+		p [*rp++]-- ;
+	    }
+	}
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    assert (p [col] == 0) ;
+	}
+	/* now p is all zero (different than when debugging is turned off) */
+#endif
+
+	/* === Compute col pointers ========================================= */
+
+	/* col form of the matrix starts at A [0]. */
+	/* Note, we may have a gap between the col form and the row */
+	/* form if there were duplicate entries, if so, it will be */
+	/* removed upon the first garbage collection */
+	Col [0].start = 0 ;
+	p [0] = Col [0].start ;
+	for (col = 1 ; col < n_col ; col++)
+	{
+	    /* note that the lengths here are for pruned columns, i.e. */
+	    /* no duplicate row indices will exist for these columns */
+	    Col [col].start = Col [col-1].start + Col [col-1].length ;
+	    p [col] = Col [col].start ;
+	}
+
+	/* === Re-create col form =========================================== */
+
+	for (row = 0 ; row < n_row ; row++)
+	{
+	    rp = &A [Row [row].start] ;
+	    rp_end = rp + Row [row].length ;
+	    while (rp < rp_end)
+	    {
+		A [(p [*rp++])++] = row ;
+	    }
+	}
+	return (1) ;
+    }
+    else
+    {
+	/* no columns jumbled (this is faster) */
+	return (0) ;
+    }
+}
+
+
+/* ========================================================================== */
+/* === init_scoring ========================================================= */
+/* ========================================================================== */
+
+/*
+    Kills dense or empty columns and rows, calculates an initial score for
+    each column, and places all columns in the degree lists.  Not user-callable.
+*/
+
+PRIVATE void init_scoring
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows of A */
+    int n_col,			/* number of columns of A */
+    RowInfo Row [],		/* of size n_row+1 */
+    ColInfo Col [],		/* of size n_col+1 */
+    int A [],			/* column form and row form of A */
+    int head [],		/* of size n_col+1 */
+    double knobs [COLAMD_KNOBS],/* parameters */
+    int *p_n_row2,		/* number of non-dense, non-empty rows */
+    int *p_n_col2,		/* number of non-dense, non-empty columns */
+    int *p_max_deg		/* maximum row degree */
+)
+{
+    /* === Local variables ================================================== */
+
+    int c ;			/* a column index */
+    int r, row ;		/* a row index */
+    int *cp ;			/* a column pointer */
+    int deg ;			/* degree (# entries) of a row or column */
+    int *cp_end ;		/* a pointer to the end of a column */
+    int *new_cp ;		/* new column pointer */
+    int col_length ;		/* length of pruned column */
+    int score ;			/* current column score */
+    int n_col2 ;		/* number of non-dense, non-empty columns */
+    int n_row2 ;		/* number of non-dense, non-empty rows */
+    int dense_row_count ;	/* remove rows with more entries than this */
+    int dense_col_count ;	/* remove cols with more entries than this */
+    int min_score ;		/* smallest column score */
+    int max_deg ;		/* maximum row degree */
+    int next_col ;		/* Used to add to degree list.*/
+#ifndef NDEBUG
+    int debug_count ;		/* debug only. */
+#endif
+
+    /* === Extract knobs ==================================================== */
+    /* thresholds: knob times n_col (or n_row), limited by MIN/MAX to 0..n */
+    dense_row_count = MAX (0, MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
+    dense_col_count = MAX (0, MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
+    DEBUG0 (("densecount: %d %d\n", dense_row_count, dense_col_count)) ;
+    max_deg = 0 ;
+    n_col2 = n_col ;
+    n_row2 = n_row ;
+
+    /* === Kill empty columns =============================================== */
+
+    /* Put the empty columns at the end in their natural order, so that LU */
+    /* factorization can proceed as far as possible. */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	deg = Col [c].length ;
+	if (deg == 0)
+	{
+	    /* this is an empty column, kill and order it last */
+	    Col [c].shared2.order = --n_col2 ;
+	    KILL_PRINCIPAL_COL (c) ;
+	}
+    }
+    DEBUG0 (("null columns killed: %d\n", n_col - n_col2)) ;
+
+    /* === Kill dense columns =============================================== */
+
+    /* Put the dense columns at the end, in their natural order */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	/* skip any dead columns */
+	if (COL_IS_DEAD (c))
+	{
+	    continue ;
+	}
+	deg = Col [c].length ;
+	if (deg > dense_col_count)
+	{
+	    /* this is a dense column, kill and order it last */
+	    Col [c].shared2.order = --n_col2 ;
+	    /* decrement the row degrees */
+	    cp = &A [Col [c].start] ;
+	    cp_end = cp + Col [c].length ;
+	    while (cp < cp_end)
+	    {
+		Row [*cp++].shared1.degree-- ;
+	    }
+	    KILL_PRINCIPAL_COL (c) ;
+	}
+    }
+    DEBUG0 (("Dense and null columns killed: %d\n", n_col - n_col2)) ;
+
+    /* === Kill dense and empty rows ======================================== */
+
+    for (r = 0 ; r < n_row ; r++)
+    {
+	deg = Row [r].shared1.degree ;
+	assert (deg >= 0 && deg <= n_col) ;
+	if (deg > dense_row_count || deg == 0)
+	{
+	    /* kill a dense or empty row */
+	    KILL_ROW (r) ;
+	    --n_row2 ;
+	}
+	else
+	{
+	    /* keep track of max degree of remaining rows */
+	    max_deg = MAX (max_deg, deg) ;
+	}
+    }
+    DEBUG0 (("Dense and null rows killed: %d\n", n_row - n_row2)) ;
+
+    /* === Compute initial column scores ==================================== */
+
+    /* At this point the row degrees are accurate.  They reflect the number */
+    /* of "live" (non-dense) columns in each row.  No empty rows exist. */
+    /* Some "live" columns may contain only dead rows, however.  These are */
+    /* pruned in the code below. */
+
+    /* now find the initial matlab score for each column */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	/* skip dead column */
+	if (COL_IS_DEAD (c))
+	{
+	    continue ;
+	}
+	score = 0 ;
+	cp = &A [Col [c].start] ;
+	new_cp = cp ;		/* write position for the in-place compaction */
+	cp_end = cp + Col [c].length ;
+	while (cp < cp_end)
+	{
+	    /* get a row */
+	    row = *cp++ ;
+	    /* skip if dead */
+	    if (ROW_IS_DEAD (row))
+	    {
+		continue ;
+	    }
+	    /* compact the column */
+	    *new_cp++ = row ;
+	    /* add row's external degree */
+	    score += Row [row].shared1.degree - 1 ;
+	    /* guard against integer overflow */
+	    score = MIN (score, n_col) ;
+	}
+	/* determine pruned column length */
+	col_length = (int) (new_cp - &A [Col [c].start]) ;
+	if (col_length == 0)
+	{
+	    /* a newly-made null column (all rows in this col are "dense" */
+	    /* and have already been killed) */
+	    DEBUG0 (("Newly null killed: %d\n", c)) ;
+	    Col [c].shared2.order = --n_col2 ;
+	    KILL_PRINCIPAL_COL (c) ;
+	}
+	else
+	{
+	    /* set column length and set score */
+	    assert (score >= 0) ;
+	    assert (score <= n_col) ;
+	    Col [c].length = col_length ;
+	    Col [c].shared2.score = score ;
+	}
+    }
+    DEBUG0 (("Dense, null, and newly-null columns killed: %d\n",n_col-n_col2)) ;
+
+    /* At this point, all empty rows and columns are dead.  All live columns */
+    /* are "clean" (containing no dead rows) and simplicial (no supercolumns */
+    /* yet).  Rows may contain dead columns, but all live rows contain at */
+    /* least one live column. */
+
+#ifndef NDEBUG
+    debug_structures (n_row, n_col, Row, Col, A, n_col2) ;
+#endif
+
+    /* === Initialize degree lists ========================================== */
+
+#ifndef NDEBUG
+    debug_count = 0 ;
+#endif
+
+    /* clear the hash buckets */
+    for (c = 0 ; c <= n_col ; c++)
+    {
+	head [c] = EMPTY ;
+    }
+    min_score = n_col ;
+    /* place in reverse order, so low column indices are at the front */
+    /* of the lists.  This is to encourage natural tie-breaking */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	/* only add principal columns to degree lists */
+	if (COL_IS_ALIVE (c))
+	{
+	    DEBUG4 (("place %d score %d minscore %d ncol %d\n",
+		c, Col [c].shared2.score, min_score, n_col)) ;
+
+	    /* === Add columns score to DList =============================== */
+
+	    score = Col [c].shared2.score ;
+
+	    assert (min_score >= 0) ;
+	    assert (min_score <= n_col) ;
+	    assert (score >= 0) ;
+	    assert (score <= n_col) ;
+	    assert (head [score] >= EMPTY) ;
+
+	    /* now add this column to dList at proper score location */
+	    next_col = head [score] ;
+	    Col [c].shared3.prev = EMPTY ;
+	    Col [c].shared4.degree_next = next_col ;
+
+	    /* if there already was a column with the same score, set its */
+	    /* previous pointer to this new column */
+	    if (next_col != EMPTY)
+	    {
+		Col [next_col].shared3.prev = c ;
+	    }
+	    head [score] = c ;
+
+	    /* see if this score is less than current min */
+	    min_score = MIN (min_score, score) ;
+
+#ifndef NDEBUG
+	    debug_count++ ;
+#endif
+	}
+    }
+
+#ifndef NDEBUG
+    DEBUG0 (("Live cols %d out of %d, non-princ: %d\n",
+	debug_count, n_col, n_col-debug_count)) ;
+    assert (debug_count == n_col2) ;
+    debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ;
+#endif
+
+    /* === Return number of remaining rows and columns, and max row degree == */
+
+    *p_n_col2 = n_col2 ;
+    *p_n_row2 = n_row2 ;
+    *p_max_deg = max_deg ;
+}
+
+
+/* ========================================================================== */
+/* === find_ordering ======================================================== */
+/* ========================================================================== */
+
+/*
+    Order the principal columns of the supercolumn form of the matrix
+    (no supercolumns on input).  Uses an approximate column minimum
+    degree ordering method.  Not user-callable.
+*/
+
+PRIVATE int find_ordering	/* return the number of garbage collections */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows of A */
+    int n_col,			/* number of columns of A */
+    int Alen,			/* size of A, 2*nnz + elbow_room or larger */
+    RowInfo Row [],		/* of size n_row+1 */
+    ColInfo Col [],		/* of size n_col+1 */
+    int A [],			/* column form and row form of A */
+    int head [],		/* of size n_col+1 */
+    int n_col2,			/* Remaining columns to order */
+    int max_deg,		/* Maximum row degree */
+    int pfree			/* index of first free slot (2*nnz on entry) */
+)
+{
+    /* === Local variables ================================================== */
+
+    int k ;			/* current pivot ordering step */
+    int pivot_col ;		/* current pivot column */
+    int *cp ;			/* a column pointer */
+    int *rp ;			/* a row pointer */
+    int pivot_row ;		/* current pivot row */
+    int *new_cp ;		/* modified column pointer */
+    int *new_rp ;		/* modified row pointer */
+    int pivot_row_start ;	/* pointer to start of pivot row */
+    int pivot_row_degree ;	/* # of columns in pivot row */
+    int pivot_row_length ;	/* # of supercolumns in pivot row */
+    int pivot_col_score ;	/* score of pivot column */
+    int needed_memory ;		/* free space needed for pivot row */
+    int *cp_end ;		/* pointer to the end of a column */
+    int *rp_end ;		/* pointer to the end of a row */
+    int row ;			/* a row index */
+    int col ;			/* a column index */
+    int max_score ;		/* maximum possible score */
+    int cur_score ;		/* score of current column */
+    unsigned int hash ;		/* hash value for supernode detection */
+    int head_column ;		/* head of hash bucket */
+    int first_col ;		/* first column in hash bucket */
+    int tag_mark ;		/* marker value for mark array */
+    int row_mark ;		/* Row [row].shared2.mark */
+    int set_difference ;	/* set difference size of row with pivot row */
+    int min_score ;		/* smallest column score */
+    int col_thickness ;		/* "thickness" (# of columns in a supercol) */
+    int max_mark ;		/* maximum value of tag_mark */
+    int pivot_col_thickness ;	/* number of columns represented by pivot col */
+    int prev_col ;		/* Used by Dlist operations. */
+    int next_col ;		/* Used by Dlist operations. */
+    int ngarbage ;		/* number of garbage collections performed */
+#ifndef NDEBUG
+    int debug_d ;		/* debug loop counter */
+    int debug_step = 0 ;	/* debug loop counter */
+#endif
+
+    /* === Initialization and clear mark ==================================== */
+
+    max_mark = INT_MAX - n_col ;	/* INT_MAX defined in <limits.h> */
+    tag_mark = clear_mark (n_row, Row) ;
+    min_score = 0 ;
+    ngarbage = 0 ;
+    DEBUG0 (("Ordering.. n_col2=%d\n", n_col2)) ;
+
+    /* === Order the columns ================================================ */
+
+    for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */)
+    {
+
+#ifndef NDEBUG
+	if (debug_step % 100 == 0)
+	{
+	    DEBUG0 (("\n...       Step k: %d out of n_col2: %d\n", k, n_col2)) ;
+	}
+	else
+	{
+	    DEBUG1 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ;
+	}
+	debug_step++ ;
+	debug_deg_lists (n_row, n_col, Row, Col, head,
+		min_score, n_col2-k, max_deg) ;
+	debug_matrix (n_row, n_col, Row, Col, A) ;
+#endif
+
+	/* === Select pivot column, and order it ============================ */
+
+	/* make sure degree list isn't empty */
+	assert (min_score >= 0) ;
+	assert (min_score <= n_col) ;
+	assert (head [min_score] >= EMPTY) ;
+
+#ifndef NDEBUG
+	for (debug_d = 0 ; debug_d < min_score ; debug_d++)
+	{
+	    assert (head [debug_d] == EMPTY) ;
+	}
+#endif
+
+	/* advance to the first non-empty degree list, then pop its head */
+	while (head [min_score] == EMPTY && min_score < n_col)
+	{
+	    min_score++ ;
+	}
+	pivot_col = head [min_score] ;
+	assert (pivot_col >= 0 && pivot_col <= n_col) ;
+	next_col = Col [pivot_col].shared4.degree_next ;
+	head [min_score] = next_col ;
+	if (next_col != EMPTY)
+	{
+	    Col [next_col].shared3.prev = EMPTY ;
+	}
+
+	assert (COL_IS_ALIVE (pivot_col)) ;
+	DEBUG3 (("Pivot col: %d\n", pivot_col)) ;
+
+	/* remember score for defrag check */
+	pivot_col_score = Col [pivot_col].shared2.score ;
+
+	/* the pivot column is the kth column in the pivot order */
+	Col [pivot_col].shared2.order = k ;
+
+	/* increment order count by column thickness */
+	pivot_col_thickness = Col [pivot_col].shared1.thickness ;
+	k += pivot_col_thickness ;
+	assert (pivot_col_thickness > 0) ;
+
+	/* === Garbage_collection, if necessary ============================= */
+
+	needed_memory = MIN (pivot_col_score, n_col - k) ;
+	if (pfree + needed_memory >= Alen)
+	{
+	    pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
+	    ngarbage++ ;
+	    /* after garbage collection we will have enough */
+	    assert (pfree + needed_memory < Alen) ;
+	    /* garbage collection has wiped out the Row[].shared2.mark array */
+	    tag_mark = clear_mark (n_row, Row) ;
+#ifndef NDEBUG
+	    debug_matrix (n_row, n_col, Row, Col, A) ;
+#endif
+	}
+
+	/* === Compute pivot row pattern ==================================== */
+
+	/* get starting location for this new merged row */
+	pivot_row_start = pfree ;
+
+	/* initialize new row counts to zero */
+	pivot_row_degree = 0 ;
+
+	/* tag pivot column as having been visited so it isn't included */
+	/* in merged pivot row */
+	Col [pivot_col].shared1.thickness = -pivot_col_thickness ;
+
+	/* pivot row is the union of all rows in the pivot column pattern */
+	cp = &A [Col [pivot_col].start] ;
+	cp_end = cp + Col [pivot_col].length ;
+	while (cp < cp_end)
+	{
+	    /* get a row */
+	    row = *cp++ ;
+	    DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
+	    /* skip if row is dead */
+	    if (ROW_IS_DEAD (row))
+	    {
+		continue ;
+	    }
+	    rp = &A [Row [row].start] ;
+	    rp_end = rp + Row [row].length ;
+	    while (rp < rp_end)
+	    {
+		/* get a column */
+		col = *rp++ ;
+		/* add the column, if alive and untagged */
+		col_thickness = Col [col].shared1.thickness ;
+		if (col_thickness > 0 && COL_IS_ALIVE (col))
+		{
+		    /* tag column in pivot row (negated thickness = tagged) */
+		    Col [col].shared1.thickness = -col_thickness ;
+		    assert (pfree < Alen) ;
+		    /* place column in pivot row */
+		    A [pfree++] = col ;
+		    pivot_row_degree += col_thickness ;
+		}
+	    }
+	}
+
+	/* clear tag on pivot column */
+	Col [pivot_col].shared1.thickness = pivot_col_thickness ;
+	max_deg = MAX (max_deg, pivot_row_degree) ;
+
+#ifndef NDEBUG
+	DEBUG3 (("check2\n")) ;
+	debug_mark (n_row, Row, tag_mark, max_mark) ;
+#endif
+
+	/* === Kill all rows used to construct pivot row ==================== */
+
+	/* also kill pivot row, temporarily */
+	cp = &A [Col [pivot_col].start] ;
+	cp_end = cp + Col [pivot_col].length ;
+	while (cp < cp_end)
+	{
+	    /* may be killing an already dead row */
+	    row = *cp++ ;
+	    DEBUG2 (("Kill row in pivot col: %d\n", row)) ;
+	    KILL_ROW (row) ;
+	}
+
+	/* === Select a row index to use as the new pivot row =============== */
+
+	pivot_row_length = pfree - pivot_row_start ;
+	if (pivot_row_length > 0)
+	{
+	    /* pick the "pivot" row arbitrarily (first row in col) */
+	    pivot_row = A [Col [pivot_col].start] ;
+	    DEBUG2 (("Pivotal row is %d\n", pivot_row)) ;
+	}
+	else
+	{
+	    /* there is no pivot row, since it is of zero length */
+	    pivot_row = EMPTY ;
+	    assert (pivot_row_length == 0) ;
+	}
+	assert (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
+
+	/* === Approximate degree computation =============================== */
+
+	/* Here begins the computation of the approximate degree.  The column */
+	/* score is the sum of the pivot row "length", plus the size of the */
+	/* set differences of each row in the column minus the pattern of the */
+	/* pivot row itself.  The column ("thickness") itself is also */
+	/* excluded from the column score (we thus use an approximate */
+	/* external degree). */
+
+	/* The time taken by the following code (compute set differences, and */
+	/* add them up) is proportional to the size of the data structure */
+	/* being scanned - that is, the sum of the sizes of each column in */
+	/* the pivot row.  Thus, the amortized time to compute a column score */
+	/* is proportional to the size of that column (where size, in this */
+	/* context, is the column "length", or the number of row indices */
+	/* in that column).  The number of row indices in a column is */
+	/* monotonically non-decreasing, from the length of the original */
+	/* column on input to colamd. */
+
+	/* === Compute set differences ====================================== */
+
+	DEBUG1 (("** Computing set differences phase. **\n")) ;
+
+	/* pivot row is currently dead - it will be revived later. */
+
+	DEBUG2 (("Pivot row: ")) ;
+	/* for each column in pivot row */
+	rp = &A [pivot_row_start] ;
+	rp_end = rp + pivot_row_length ;
+	while (rp < rp_end)
+	{
+	    col = *rp++ ;
+	    assert (COL_IS_ALIVE (col) && col != pivot_col) ;
+	    DEBUG2 (("Col: %d\n", col)) ;
+
+	    /* clear tags used to construct pivot row pattern */
+	    col_thickness = -Col [col].shared1.thickness ;
+	    assert (col_thickness > 0) ;
+	    Col [col].shared1.thickness = col_thickness ;
+
+	    /* === Remove column from degree list =========================== */
+
+	    cur_score = Col [col].shared2.score ;
+	    prev_col = Col [col].shared3.prev ;
+	    next_col = Col [col].shared4.degree_next ;
+	    assert (cur_score >= 0) ;
+	    assert (cur_score <= n_col) ;
+	    assert (cur_score >= EMPTY) ; /* NOTE(review): looks redundant */
+	    if (prev_col == EMPTY)
+	    {
+		head [cur_score] = next_col ;
+	    }
+	    else
+	    {
+		Col [prev_col].shared4.degree_next = next_col ;
+	    }
+	    if (next_col != EMPTY)
+	    {
+		Col [next_col].shared3.prev = prev_col ;
+	    }
+
+	    /* === Scan the column ========================================== */
+
+	    cp = &A [Col [col].start] ;
+	    cp_end = cp + Col [col].length ;
+	    while (cp < cp_end)
+	    {
+		/* get a row */
+		row = *cp++ ;
+		row_mark = Row [row].shared2.mark ;
+		/* skip if dead */
+		if (ROW_IS_MARKED_DEAD (row_mark))
+		{
+		    continue ;
+		}
+		assert (row != pivot_row) ;
+		set_difference = row_mark - tag_mark ;
+		/* check if the row has been seen yet */
+		if (set_difference < 0)
+		{
+		    assert (Row [row].shared1.degree <= max_deg) ;
+		    set_difference = Row [row].shared1.degree ;
+		}
+		/* subtract column thickness from this row's set difference */
+		set_difference -= col_thickness ;
+		assert (set_difference >= 0) ;
+		/* absorb this row if the set difference becomes zero */
+		if (set_difference == 0)
+		{
+		    DEBUG1 (("aggressive absorption. Row: %d\n", row)) ;
+		    KILL_ROW (row) ;
+		}
+		else
+		{
+		    /* save the new mark */
+		    Row [row].shared2.mark = set_difference + tag_mark ;
+		}
+	    }
+	}
+
+#ifndef NDEBUG
+	debug_deg_lists (n_row, n_col, Row, Col, head,
+		min_score, n_col2-k-pivot_row_degree, max_deg) ;
+#endif
+
+	/* === Add up set differences for each column ======================= */
+
+	DEBUG1 (("** Adding set differences phase. **\n")) ;
+
+	/* for each column in pivot row */
+	rp = &A [pivot_row_start] ;
+	rp_end = rp + pivot_row_length ;
+	while (rp < rp_end)
+	{
+	    /* get a column */
+	    col = *rp++ ;
+	    assert (COL_IS_ALIVE (col) && col != pivot_col) ;
+	    hash = 0 ;
+	    cur_score = 0 ;
+	    cp = &A [Col [col].start] ;
+	    /* compact the column */
+	    new_cp = cp ;
+	    cp_end = cp + Col [col].length ;
+
+	    DEBUG2 (("Adding set diffs for Col: %d.\n", col)) ;
+
+	    while (cp < cp_end)
+	    {
+		/* get a row */
+		row = *cp++ ;
+		assert(row >= 0 && row < n_row) ;
+		row_mark = Row [row].shared2.mark ;
+		/* skip if dead */
+		if (ROW_IS_MARKED_DEAD (row_mark))
+		{
+		    continue ;
+		}
+		assert (row_mark > tag_mark) ;
+		/* compact the column */
+		*new_cp++ = row ;
+		/* compute hash function (sum of the surviving row indices) */
+		hash += row ;
+		/* add set difference */
+		cur_score += row_mark - tag_mark ;
+		/* guard against integer overflow by capping at n_col */
+		cur_score = MIN (cur_score, n_col) ;
+	    }
+
+	    /* recompute the column's length */
+	    Col [col].length = (int) (new_cp - &A [Col [col].start]) ;
+
+	    /* === Further mass elimination ================================= */
+
+	    if (Col [col].length == 0)
+	    {
+		DEBUG1 (("further mass elimination. Col: %d\n", col)) ;
+		/* nothing left but the pivot row in this column */
+		KILL_PRINCIPAL_COL (col) ;
+		pivot_row_degree -= Col [col].shared1.thickness ;
+		assert (pivot_row_degree >= 0) ;
+		/* order it */
+		Col [col].shared2.order = k ;
+		/* increment order count by column thickness */
+		k += Col [col].shared1.thickness ;
+	    }
+	    else
+	    {
+		/* === Prepare for supercolumn detection ==================== */
+
+		DEBUG2 (("Preparing supercol detection for Col: %d.\n", col)) ;
+
+		/* save score so far */
+		Col [col].shared2.score = cur_score ;
+
+		/* add column to hash table, for supercolumn detection */
+		hash %= n_col + 1 ;
+
+		DEBUG2 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ;
+		assert (hash <= n_col) ;
+
+		head_column = head [hash] ;
+		if (head_column > EMPTY)
+		{
+		    /* degree list "hash" is non-empty, use prev (shared3) of */
+		    /* first column in degree list as head of hash bucket */
+		    first_col = Col [head_column].shared3.headhash ;
+		    Col [head_column].shared3.headhash = col ;
+		}
+		else
+		{
+		    /* degree list "hash" is empty, use head as hash bucket */
+		    first_col = - (head_column + 2) ;
+		    head [hash] = - (col + 2) ;
+		}
+		Col [col].shared4.hash_next = first_col ;
+
+		/* save hash function in Col [col].shared3.hash */
+		Col [col].shared3.hash = (int) hash ;
+		assert (COL_IS_ALIVE (col)) ;
+	    }
+	}
+
+	/* The approximate external column degree is now computed.  */
+
+	/* === Supercolumn detection ======================================== */
+
+	DEBUG1 (("** Supercolumn detection phase. **\n")) ;
+
+	detect_super_cols (
+#ifndef NDEBUG
+		n_col, Row,
+#endif
+		Col, A, head, pivot_row_start, pivot_row_length) ;
+
+	/* === Kill the pivotal column ====================================== */
+
+	KILL_PRINCIPAL_COL (pivot_col) ;
+
+	/* === Clear mark =================================================== */
+
+	tag_mark += (max_deg + 1) ;
+	if (tag_mark >= max_mark)
+	{
+	    DEBUG1 (("clearing tag_mark\n")) ;
+	    tag_mark = clear_mark (n_row, Row) ;
+	}
+#ifndef NDEBUG
+	DEBUG3 (("check3\n")) ;
+	debug_mark (n_row, Row, tag_mark, max_mark) ;
+#endif
+
+	/* === Finalize the new pivot row, and column scores ================ */
+
+	DEBUG1 (("** Finalize scores phase. **\n")) ;
+
+	/* for each column in pivot row */
+	rp = &A [pivot_row_start] ;
+	/* compact the pivot row */
+	new_rp = rp ;
+	rp_end = rp + pivot_row_length ;
+	while (rp < rp_end)
+	{
+	    col = *rp++ ;
+	    /* skip dead columns */
+	    if (COL_IS_DEAD (col))
+	    {
+		continue ;
+	    }
+	    *new_rp++ = col ;
+	    /* add new pivot row to column */
+	    A [Col [col].start + (Col [col].length++)] = pivot_row ;
+
+	    /* retrieve score so far and add on pivot row's degree. */
+	    /* (we wait until here for this in case the pivot */
+	    /* row's degree was reduced due to mass elimination). */
+	    cur_score = Col [col].shared2.score + pivot_row_degree ;
+
+	    /* calculate the max possible score as the number of */
+	    /* external columns minus the 'k' value minus the */
+	    /* columns thickness */
+	    max_score = n_col - k - Col [col].shared1.thickness ;
+
+	    /* make the score the external degree of the union-of-rows */
+	    cur_score -= Col [col].shared1.thickness ;
+
+	    /* make sure score is less or equal than the max score */
+	    cur_score = MIN (cur_score, max_score) ;
+	    assert (cur_score >= 0) ;
+
+	    /* store updated score */
+	    Col [col].shared2.score = cur_score ;
+
+	    /* === Place column back in degree list ========================= */
+
+	    assert (min_score >= 0) ;
+	    assert (min_score <= n_col) ;
+	    assert (cur_score >= 0) ;
+	    assert (cur_score <= n_col) ;
+	    assert (head [cur_score] >= EMPTY) ;
+	    next_col = head [cur_score] ;
+	    Col [col].shared4.degree_next = next_col ;
+	    Col [col].shared3.prev = EMPTY ;
+	    if (next_col != EMPTY)
+	    {
+		Col [next_col].shared3.prev = col ;
+	    }
+	    head [cur_score] = col ;
+
+	    /* see if this score is less than current min */
+	    min_score = MIN (min_score, cur_score) ;
+
+	}
+
+#ifndef NDEBUG
+	debug_deg_lists (n_row, n_col, Row, Col, head,
+		min_score, n_col2-k, max_deg) ;
+#endif
+
+	/* === Resurrect the new pivot row ================================== */
+
+	if (pivot_row_degree > 0)
+	{
+	    /* update pivot row length to reflect any cols that were killed */
+	    /* during super-col detection and mass elimination */
+	    Row [pivot_row].start  = pivot_row_start ;
+	    Row [pivot_row].length = (int) (new_rp - &A[pivot_row_start]) ;
+	    Row [pivot_row].shared1.degree = pivot_row_degree ;
+	    Row [pivot_row].shared2.mark = 0 ;
+	    /* pivot row is no longer dead */
+	}
+    }
+
+    /* === All principal columns have now been ordered ====================== */
+
+    return (ngarbage) ;
+}
+
+
+/* ========================================================================== */
+/* === order_children ======================================================= */
+/* ========================================================================== */
+
+/*
+    The find_ordering routine has ordered all of the principal columns (the
+    representatives of the supercolumns).  The non-principal columns have not
+    yet been ordered.  This routine orders those columns by walking up the
+    parent tree (a column is a child of the column which absorbed it).  The
+    final permutation vector is then placed in p [0 ... n_col-1], with p [0]
+    being the first column, and p [n_col-1] being the last.
+
+    Although it is not immediately obvious from the code, the time taken by
+    this routine is O (n_col), that is, linear in the number of columns.
+    Not user-callable.
+*/
+
+PRIVATE void order_children
+(
+    /* === Parameters ======================================================= */
+
+    int n_col,			/* number of columns of A */
+    ColInfo Col [],		/* of size n_col+1 */
+    int p []			/* p [0 ... n_col-1] is the column permutation*/
+)
+{
+    /* === Local variables ================================================== */
+
+    int i ;			/* loop counter for all columns */
+    int c ;			/* column index */
+    int parent ;		/* index of column's parent */
+    int order ;			/* column's order */
+
+    /* === Order each non-principal column ================================== */
+
+    for (i = 0 ; i < n_col ; i++)
+    {
+	/* find an un-ordered non-principal column */
+	assert (COL_IS_DEAD (i)) ;
+	if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY)
+	{
+	    parent = i ;
+	    /* once found, find its principal parent */
+	    do
+	    {
+		parent = Col [parent].shared1.parent ;
+	    } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
+
+	    /* now, order all un-ordered non-principal columns along path */
+	    /* to this parent.  collapse tree at the same time */
+	    c = i ;
+	    /* get order of parent */
+	    order = Col [parent].shared2.order ;
+
+	    do
+	    {
+		assert (Col [c].shared2.order == EMPTY) ;
+
+		/* order this column */
+		Col [c].shared2.order = order++ ;
+		/* collapse tree */
+		Col [c].shared1.parent = parent ;
+		/* NOTE(review): parent link was just overwritten above, so */
+		/* this steps to the principal parent, not the immediate one */
+		c = Col [c].shared1.parent ;
+
+		/* continue until we hit an ordered column.  There are */
+		/* guaranteed not to be any more unordered columns */
+		/* above an ordered column */
+	    } while (Col [c].shared2.order == EMPTY) ;
+
+	    /* re-order the super_col parent to largest order for this group */
+	    Col [parent].shared2.order = order ;
+	}
+    }
+
+    /* === Generate the permutation ========================================= */
+
+    for (c = 0 ; c < n_col ; c++)
+    {
+	p [Col [c].shared2.order] = c ;
+    }
+}
+
+
+/* ========================================================================== */
+/* === detect_super_cols ==================================================== */
+/* ========================================================================== */
+
+/*
+    Detects supercolumns by finding matches between columns in the hash buckets.
+    Check amongst columns in the set A [row_start ... row_start + row_length-1].
+    The columns under consideration are currently *not* in the degree lists,
+    and have already been placed in the hash buckets.
+
+    The hash bucket for columns whose hash function is equal to h is stored
+    as follows:
+
+	if head [h] is >= 0, then head [h] contains a degree list, so:
+
+		head [h] is the first column in degree bucket h.
+		Col [head [h]].shared3.headhash is the first column in hash bucket h.
+
+	otherwise, the degree list is empty, and:
+
+		-(head [h] + 2) is the first column in hash bucket h.
+
+    For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous
+    column" pointer.  Col [c].shared3.hash is used instead as the hash number
+    for that column.  The value of Col [c].shared4.hash_next is the next column
+    in the same hash bucket.
+
+    Assuming no, or "few" hash collisions, the time taken by this routine is
+    linear in the sum of the sizes (lengths) of each column whose score has
+    just been computed in the approximate degree computation.
+    Not user-callable.
+*/
+
+PRIVATE void detect_super_cols
+(
+    /* === Parameters ======================================================= */
+
+#ifndef NDEBUG
+    /* these two parameters are only needed when debugging is enabled: */
+    int n_col,			/* number of columns of A */
+    RowInfo Row [],		/* of size n_row+1 */
+#endif
+    ColInfo Col [],		/* of size n_col+1 */
+    int A [],			/* row indices of A */
+    int head [],		/* head of degree lists and hash buckets */
+    int row_start,		/* pointer to set of columns to check */
+    int row_length		/* number of columns to check */
+)
+{
+    /* === Local variables ================================================== */
+
+    int hash ;			/* hash # for a column */
+    int *rp ;			/* pointer to a row */
+    int c ;			/* a column index */
+    int super_c ;		/* column index of the column to absorb into */
+    int *cp1 ;			/* column pointer for column super_c */
+    int *cp2 ;			/* column pointer for column c */
+    int length ;		/* length of column super_c */
+    int prev_c ;		/* column preceding c in hash bucket */
+    int i ;			/* loop counter */
+    int *rp_end ;		/* pointer to the end of the row */
+    int col ;			/* a column index in the row to check */
+    int head_column ;		/* first column in hash bucket or degree list */
+    int first_col ;		/* first column in hash bucket */
+
+    /* === Consider each column in the row ================================== */
+
+    rp = &A [row_start] ;
+    rp_end = rp + row_length ;
+    while (rp < rp_end)
+    {
+	col = *rp++ ;
+	if (COL_IS_DEAD (col))
+	{
+	    continue ;
+	}
+
+	/* get hash number for this column */
+	hash = Col [col].shared3.hash ;
+	assert (hash <= n_col) ;
+
+	/* === Get the first column in this hash bucket ===================== */
+
+	head_column = head [hash] ;
+	if (head_column > EMPTY)
+	{
+	    first_col = Col [head_column].shared3.headhash ;
+	}
+	else
+	{
+	    first_col = - (head_column + 2) ;	/* decode the bucket head */
+	}
+
+	/* === Consider each column in the hash bucket ====================== */
+
+	for (super_c = first_col ; super_c != EMPTY ;
+	    super_c = Col [super_c].shared4.hash_next)
+	{
+	    assert (COL_IS_ALIVE (super_c)) ;
+	    assert (Col [super_c].shared3.hash == hash) ;
+	    length = Col [super_c].length ;
+
+	    /* prev_c is the column preceding column c in the hash bucket */
+	    prev_c = super_c ;
+
+	    /* === Compare super_c with all columns after it ================ */
+
+	    for (c = Col [super_c].shared4.hash_next ;
+		 c != EMPTY ; c = Col [c].shared4.hash_next)
+	    {
+		assert (c != super_c) ;
+		assert (COL_IS_ALIVE (c)) ;
+		assert (Col [c].shared3.hash == hash) ;
+
+		/* not identical if lengths or scores are different */
+		if (Col [c].length != length ||
+		    Col [c].shared2.score != Col [super_c].shared2.score)
+		{
+		    prev_c = c ;
+		    continue ;
+		}
+
+		/* compare the two columns */
+		cp1 = &A [Col [super_c].start] ;
+		cp2 = &A [Col [c].start] ;
+
+		for (i = 0 ; i < length ; i++)
+		{
+		    /* the columns are "clean" (no dead rows) */
+		    assert (ROW_IS_ALIVE (*cp1))  ;
+		    assert (ROW_IS_ALIVE (*cp2))  ;
+		    /* row indices are in the same order for both supercols, */
+		    /* so no gather/scatter is necessary */
+		    if (*cp1++ != *cp2++)
+		    {
+			break ;
+		    }
+		}
+
+		/* the two columns are different if the for-loop "broke" */
+		if (i != length)
+		{
+		    prev_c = c ;
+		    continue ;
+		}
+
+		/* === Got it!  two columns are identical =================== */
+
+		assert (Col [c].shared2.score == Col [super_c].shared2.score) ;
+
+		Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
+		Col [c].shared1.parent = super_c ;
+		KILL_NON_PRINCIPAL_COL (c) ;
+		/* order c later, in order_children() */
+		Col [c].shared2.order = EMPTY ;
+		/* unlink c from the bucket; prev_c is left where it is */
+		Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
+	    }
+	}
+
+	/* === Empty this hash bucket ======================================= */
+
+	if (head_column > EMPTY)
+	{
+	    /* corresponding degree list "hash" is not empty */
+	    Col [head_column].shared3.headhash = EMPTY ;
+	}
+	else
+	{
+	    /* corresponding degree list "hash" is empty */
+	    head [hash] = EMPTY ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === garbage_collection =================================================== */
+/* ========================================================================== */
+
+/*
+    Defragments and compacts columns and rows in the workspace A.  Used when
+    all available memory has been used while performing row merging.  Returns
+    the index of the first free position in A, after garbage collection.  The
+    time taken by this routine is linear in the size of the array A, which is
+    itself linear in the number of nonzeros in the input matrix.
+    Not user-callable.
+*/
+
+PRIVATE int garbage_collection  /* returns the new value of pfree */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows */
+    int n_col,			/* number of columns */
+    RowInfo Row [],		/* row info */
+    ColInfo Col [],		/* column info */
+    int A [],			/* A [0 ... Alen-1] holds the matrix */
+    int *pfree			/* &A [0] ... pfree is in use */
+)
+{
+    /* === Local variables ================================================== */
+
+    int *psrc ;			/* source pointer */
+    int *pdest ;		/* destination pointer */
+    int j ;			/* counter */
+    int r ;			/* a row index */
+    int c ;			/* a column index */
+    int length ;		/* length of a row or column */
+
+#ifndef NDEBUG
+    int debug_rows ;
+    DEBUG0 (("Defrag..\n")) ;
+    for (psrc = &A[0] ; psrc < pfree ; psrc++) assert (*psrc >= 0) ;
+    debug_rows = 0 ;
+#endif
+
+    /* === Defragment the columns =========================================== */
+
+    pdest = &A[0] ;
+    for (c = 0 ; c < n_col ; c++)
+    {
+	if (COL_IS_ALIVE (c))
+	{
+	    psrc = &A [Col [c].start] ;
+
+	    /* move and compact the column, dropping dead rows */
+	    assert (pdest <= psrc) ;
+	    Col [c].start = (int) (pdest - &A [0]) ;
+	    length = Col [c].length ;
+	    for (j = 0 ; j < length ; j++)
+	    {
+		r = *psrc++ ;
+		if (ROW_IS_ALIVE (r))
+		{
+		    *pdest++ = r ;
+		}
+	    }
+	    Col [c].length = (int) (pdest - &A [Col [c].start]) ;
+	}
+    }
+
+    /* === Prepare to defragment the rows =================================== */
+
+    for (r = 0 ; r < n_row ; r++)
+    {
+	if (ROW_IS_ALIVE (r))
+	{
+	    if (Row [r].length == 0)
+	    {
+		/* this row is of zero length.  cannot compact it, so kill it */
+		DEBUG0 (("Defrag row kill\n")) ;
+		KILL_ROW (r) ;
+	    }
+	    else
+	    {
+		/* save first column index in Row [r].shared2.first_column */
+		psrc = &A [Row [r].start] ;
+		Row [r].shared2.first_column = *psrc ;
+		assert (ROW_IS_ALIVE (r)) ;
+		/* flag the start of the row with the one's complement of row */
+		*psrc = ONES_COMPLEMENT (r) ;
+#ifndef NDEBUG
+		debug_rows++ ;
+#endif
+	    }
+	}
+    }
+
+    /* === Defragment the rows ============================================== */
+
+    psrc = pdest ;
+    while (psrc < pfree)
+    {
+	/* find a negative number ... the start of a row */
+	if (*psrc++ < 0)
+	{
+	    psrc-- ;	/* undo the ++ of the test: point at the flag again */
+	    /* get the row index */
+	    r = ONES_COMPLEMENT (*psrc) ;
+	    assert (r >= 0 && r < n_row) ;
+	    /* restore first column index */
+	    *psrc = Row [r].shared2.first_column ;
+	    assert (ROW_IS_ALIVE (r)) ;
+
+	    /* move and compact the row, dropping dead columns */
+	    assert (pdest <= psrc) ;
+	    Row [r].start = (int) (pdest - &A [0]) ;
+	    length = Row [r].length ;
+	    for (j = 0 ; j < length ; j++)
+	    {
+		c = *psrc++ ;
+		if (COL_IS_ALIVE (c))
+		{
+		    *pdest++ = c ;
+		}
+	    }
+	    Row [r].length = (int) (pdest - &A [Row [r].start]) ;
+#ifndef NDEBUG
+	    debug_rows-- ;
+#endif
+	}
+    }
+    /* ensure we found all the rows (debug_rows exists only if !NDEBUG) */
+    assert (debug_rows == 0) ;
+
+    /* === Return the new value of pfree ==================================== */
+
+    return ((int) (pdest - &A [0])) ;
+}
+
+
+/* ========================================================================== */
+/* === clear_mark =========================================================== */
+/* ========================================================================== */
+
+/*
+    Resets Row [r].shared2.mark to zero for every live row r, then hands
+    back 1 as the new tag_mark.  Not user-callable.
+*/
+
+PRIVATE int clear_mark	/* return the new value for tag_mark */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,		/* number of rows in A */
+    RowInfo Row []	/* marks of the live rows in 0 ... n_row-1 are zeroed */
+)
+{
+    /* === Local variables ================================================== */
+    int row = 0 ;	/* index of the row being visited */
+
+    DEBUG0 (("Clear mark\n")) ;
+    while (row < n_row)
+    {
+	if (ROW_IS_ALIVE (row))
+	{
+	    Row [row].shared2.mark = 0 ;
+	}
+	row++ ;
+    }
+    return (1) ;
+}
+
+
+/* ========================================================================== */
+/* === debugging routines =================================================== */
+/* ========================================================================== */
+
+/* When debugging is disabled, the remainder of this file is ignored. */
+
+#ifndef NDEBUG
+
+
+/* ========================================================================== */
+/* === debug_structures ===================================================== */
+/* ========================================================================== */
+
+/*
+    Sanity check for the initial data structures: every live column must be
+    non-empty, "clean" (it holds only live rows) and of thickness 1, i.e. no
+    supercolumns exist yet; every dead column must already carry an order in
+    [n_col2, n_col).  Live rows may hold dead columns but need a live one.
+*/
+
+PRIVATE void debug_structures
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows to check */
+    int n_col,			/* number of columns to check */
+    RowInfo Row [],		/* row info, indexed 0 ... n_row-1 here */
+    ColInfo Col [],		/* column info, indexed 0 ... n_col-1 here */
+    int A [],			/* row and column index lists */
+    int n_col2			/* lower bound on a dead column's order */
+)
+{
+    /* === Local variables ================================================== */
+
+    int i ;			/* a dead column's order, or a row's live count */
+    int c ;			/* a column index */
+    int r ;			/* a row index */
+    int k ;			/* position inside the current index list */
+    int len ;			/* length of the current row or column */
+    int score ;			/* score of a live column */
+    int deg ;			/* degree of a live row */
+    int *list ;			/* index list of the current row or column */
+
+    /* === Check the columns ================================================ */
+
+    for (c = 0 ; c < n_col ; c++)
+    {
+	if (COL_IS_ALIVE (c))
+	{
+	    len = Col [c].length ;
+	    score = Col [c].shared2.score ;
+	    DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ;
+	    assert (len > 0) ;
+	    assert (score >= 0) ;
+	    assert (Col [c].shared1.thickness == 1) ;
+	    /* a live column may reference live rows only */
+	    list = &A [Col [c].start] ;
+	    for (k = 0 ; k < len ; k++)
+	    {
+		r = list [k] ;
+		assert (ROW_IS_ALIVE (r)) ;
+	    }
+	}
+	else
+	{
+	    /* a dead column has already been given its place in the order */
+	    i = Col [c].shared2.order ;
+	    assert (i >= n_col2 && i < n_col) ;
+	}
+    }
+
+    /* === Check the rows =================================================== */
+    for (r = 0 ; r < n_row ; r++)
+    {
+	if (ROW_IS_ALIVE (r))
+	{
+	    len = Row [r].length ;
+	    deg = Row [r].shared1.degree ;
+	    assert (len > 0) ;
+	    assert (deg > 0) ;
+	    /* count the live columns that this row still holds */
+	    list = &A [Row [r].start] ;
+	    i = 0 ;
+	    for (k = 0 ; k < len ; k++)
+	    {
+		c = list [k] ;
+		if (COL_IS_ALIVE (c))
+		{
+		    i++ ;
+		}
+	    }
+	    assert (i > 0) ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === debug_deg_lists ====================================================== */
+/* ========================================================================== */
+
+/*
+    Dumps every degree list and checks that the thicknesses of the listed
+    columns add up to "should".  Afterwards verifies that no live row has
+    a degree above max_deg.  Big inputs are skipped unless debug_colamd > 0.
+*/
+
+PRIVATE void debug_deg_lists
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows to check */
+    int n_col,			/* number of columns; lists 0 ... n_col are read */
+    RowInfo Row [],		/* row info */
+    ColInfo Col [],		/* column info */
+    int head [],		/* head [deg] is the first column of list deg */
+    int min_score,		/* only printed */
+    int should,			/* expected total thickness over all lists */
+    int max_deg			/* upper bound for the degree of a live row */
+)
+{
+    /* === Local variables ================================================== */
+
+    int deg ;			/* degree list being walked */
+    int col ;			/* column inside that list */
+    int row ;			/* row whose degree is checked */
+    int have = 0 ;		/* total thickness found so far */
+
+    /* === Check the degree lists =========================================== */
+
+    if (debug_colamd <= 0 && n_col > 10000)
+    {
+	return ;
+    }
+    DEBUG4 (("Degree lists: %d\n", min_score)) ;
+    for (deg = 0 ; deg <= n_col ; deg++)
+    {
+	/* empty lists print nothing */
+	if (head [deg] != EMPTY)
+	{
+	    DEBUG4 (("%d:", deg)) ;
+	    for (col = head [deg] ; col != EMPTY ;
+		col = Col [col].shared4.degree_next)
+	    {
+		DEBUG4 ((" %d", col)) ;
+		/* each listed column contributes its thickness */
+		have += Col [col].shared1.thickness ;
+		assert (COL_IS_ALIVE (col)) ;
+	    }
+	    DEBUG4 (("\n")) ;
+	}
+    }
+    DEBUG4 (("should %d have %d\n", should, have)) ;
+    assert (should == have) ;
+
+    /* === Check the row degrees ============================================ */
+
+    if (debug_colamd <= 0 && n_row > 10000)
+    {
+	return ;
+    }
+    for (row = 0 ; row < n_row ; row++)
+    {
+	/* dead rows are not checked */
+	if (ROW_IS_ALIVE (row))
+	{
+	    assert (Row [row].shared1.degree <= max_deg) ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === debug_mark =========================================================== */
+/* ========================================================================== */
+
+/*
+    Asserts 0 < tag_mark <= max_mark and Row [r].shared2.mark < tag_mark for
+    all rows (row scan skipped if n_row > 10000 and debug_colamd <= 0).
+*/
+
+PRIVATE void debug_mark
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows */
+    RowInfo Row [],		/* Row [0 ... n_row-1].shared2.mark is checked */
+    int tag_mark,		/* current tag mark */
+    int max_mark		/* largest value tag_mark may take */
+)
+{
+    /* === Local variables ================================================== */
+
+    int r = 0 ;			/* row whose mark is checked */
+
+    /* === Check the Row marks ============================================== */
+
+    assert (tag_mark > 0 && tag_mark <= max_mark) ;
+    if (n_row <= 10000 || debug_colamd > 0)
+    {
+	while (r < n_row)
+	{
+	    assert (Row [r].shared2.mark < tag_mark) ;
+	    r++ ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === debug_matrix ========================================================= */
+/* ========================================================================== */
+
+/*
+    Dumps every row and column with its live/dead flag (debug_colamd >= 3).
+*/
+
+PRIVATE void debug_matrix
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows to print */
+    int n_col,			/* number of columns to print */
+    RowInfo Row [],		/* row info */
+    ColInfo Col [],		/* column info */
+    int A []			/* row and column index lists */
+)
+{
+    /* === Local variables ================================================== */
+
+    int row ;			/* a row index */
+    int col ;			/* a column index */
+    int k ;			/* position inside the current index list */
+    int len ;			/* length of the current row or column */
+    int *list ;			/* index list of the current row or column */
+
+    /* === Dump the rows and columns of the matrix ========================== */
+
+    if (debug_colamd < 3)
+    {
+	return ;
+    }
+    DEBUG3 (("DUMP MATRIX:\n")) ;
+    /* rows first: one status line each, then the entries of live rows */
+    for (row = 0 ; row < n_row ; row++)
+    {
+	DEBUG3 (("Row %d alive? %d\n", row, ROW_IS_ALIVE (row))) ;
+	if (ROW_IS_DEAD (row))
+	{
+	    continue ;
+	}
+	len = Row [row].length ;
+	DEBUG3 (("start %d length %d degree %d\n",
+		Row [row].start, len, Row [row].shared1.degree)) ;
+	list = &A [Row [row].start] ;
+	for (k = 0 ; k < len ; k++)
+	{
+	    col = list [k] ;
+	    DEBUG3 (("	%d col %d\n", COL_IS_ALIVE (col), col)) ;
+	}
+    }
+
+    for (col = 0 ; col < n_col ; col++)
+    {
+	DEBUG3 (("Col %d alive? %d\n", col, COL_IS_ALIVE (col))) ;
+	if (COL_IS_DEAD (col))
+	{
+	    continue ;
+	}
+	len = Col [col].length ;
+	DEBUG3 (("start %d length %d shared1 %d shared2 %d\n",
+		Col [col].start, len,
+		Col [col].shared1.thickness, Col [col].shared2.score)) ;
+	list = &A [Col [col].start] ;
+	for (k = 0 ; k < len ; k++)
+	{
+	    row = list [k] ;
+	    DEBUG3 (("	%d row %d\n", ROW_IS_ALIVE (row), row)) ;
+	}
+    }
+}
+
+#endif
+
diff --git a/SRC/old_colamd.h b/SRC/old_colamd.h
new file mode 100644
index 0000000..68a8ef0
--- /dev/null
+++ b/SRC/old_colamd.h
@@ -0,0 +1,86 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief colamd include file
+ */
+/* ========================================================================== */
+/* === colamd prototypes and definitions ==================================== */
+/* ========================================================================== */
+
+/*
+    This is the colamd include file,
+
+	http://www.cise.ufl.edu/~davis/colamd/colamd.h
+
+    for use in the colamd.c, colamdmex.c, and symamdmex.c files located at
+
+	http://www.cise.ufl.edu/~davis/colamd/
+
+    See those files for a description of colamd and symamd, and for the
+    copyright notice, which also applies to this file.
+
+    August 3, 1998.  Version 1.0.
+*/
+
+/* ========================================================================== */
+/* === Definitions ========================================================== */
+/* ========================================================================== */
+
+/* size of the knobs [ ] array.  Only knobs [0..1] are currently used. */
+#define COLAMD_KNOBS 20
+
+/* number of output statistics.  Only A [0..3] have indices defined below. */
+#define COLAMD_STATS 20
+
+/* knobs [0] and A [0]: dense row knob and output statistic. */
+#define COLAMD_DENSE_ROW 0
+
+/* knobs [1] and A [1]: dense column knob and output statistic. */
+#define COLAMD_DENSE_COL 1
+
+/* A [2]: memory defragmentation count output statistic */
+#define COLAMD_DEFRAG_COUNT 2
+
+/* A [3]: whether or not the input columns were jumbled or had duplicates */
+#define COLAMD_JUMBLED_COLS 3
+
+/* ========================================================================== */
+/* === Prototypes of user-callable routines ================================= */
+/* ========================================================================== */
+
+#ifdef _CRAY
+#define int short	/* NOTE: never #undef'd here; stays set for the includer */
+#elif defined (_LONGINT)
+#define int long	/* NOTE: never #undef'd here; stays set for the includer */
+#endif
+
+int colamd_recommended		/* returns recommended value of Alen */
+(
+    int nnz,			/* nonzeros in A */
+    int n_row,			/* number of rows in A */
+    int n_col			/* number of columns in A */
+) ;
+
+void colamd_set_defaults	/* sets default parameters */
+(				/* knobs argument is modified on output */
+    double knobs [COLAMD_KNOBS]	/* parameter settings for colamd */
+) ;
+
+int colamd			/* returns TRUE if successful, FALSE otherwise*/
+(				/* A and p arguments are modified on output */
+    int n_row,			/* number of rows in A */
+    int n_col,			/* number of columns in A */
+    int Alen,			/* size of the array A */
+    int A [],			/* row indices of A, of size Alen */
+    int p [],			/* column pointers of A, of size n_col+1 */
+    double knobs [COLAMD_KNOBS]	/* parameter settings for colamd */
+) ;
+
diff --git a/SRC/pdGetDiagU.c b/SRC/pdGetDiagU.c
new file mode 100644
index 0000000..f301133
--- /dev/null
+++ b/SRC/pdGetDiagU.c
@@ -0,0 +1,121 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file pdGetDiagU.c
+ * \brief Extracts the main diagonal of matrix U 
+ *
+ * <pre>
+ * -- Auxiliary routine in distributed SuperLU (version 5.1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * Xiaoye S. Li
+ * Created:  April 16, 2002
+ * Modified: May 15, 2016
+ * </pre>
+ */
+
+
+
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * pdGetDiagU gathers the main diagonal of the factor U on every process.
+ *
+ * Arguments
+ * =========
+ *
+ * n        (input) int_t
+ *          Dimension of the matrix.
+ *
+ * LUstruct (input) LUstruct_t*
+ *          Holds the distributed L and U factors (see superlu_ddefs.h).
+ *          Only its Glu_persist and Llu members are read here.
+ *
+ * grid     (input) gridinfo_t*
+ *          The 2D process mesh.  This routine reads the caller's rank
+ *          (grid->iam) and broadcasts over grid->comm, so every process
+ *          of the mesh has to make the call.
+ *          TODO confirm: assumed from the MPI_Bcast usage below.
+ *
+ * diagU    (output) double*, dimension (n)
+ *          The main diagonal of matrix U.
+ *          On exit, it is available on all processes.
+ *
+ *
+ * Note
+ * ====
+ *
+ * The diagonal blocks of the L and U matrices are stored in the L
+ * data structures, and are on the diagonal processes of the
+ * 2D process grid.
+ *
+ * This routine is modified from gather_diag_to_all() in pdgstrs_Bglobal.c.
+ * </pre>
+ */
+void pdGetDiagU(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
+                  double *diagU)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = Glu_persist->xsup; /* presumably read by SuperSize/FstBlockC */
+    int_t nsupers = Glu_persist->supno[n-1] + 1;
+    int_t num_diag_procs, *diag_procs, *diag_len;
+    int_t q, k, lk, r, fill, longest;
+    int iam = grid->iam;
+    int root, width;
+    int lda; /* number of rows in the block L(:,k) (LDA) */
+    double *dst, *buf, *lusup;
+
+    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
+		   &diag_procs, &diag_len);
+    /* One buffer sized for the longest per-process piece serves all rounds. */
+    longest = diag_len[0];
+    for (q = 1; q < num_diag_procs; ++q)
+	longest = SUPERLU_MAX( longest, diag_len[q] );
+    buf = doubleMalloc_dist(longest);
+    if ( !buf ) ABORT("Malloc fails for dwork[]");
+
+    for (q = 0; q < num_diag_procs; ++q) {
+	root = diag_procs[q];
+	if ( iam != root ) {
+	    MPI_Bcast( buf, diag_len[q], MPI_DOUBLE, root, grid->comm );
+	} else {
+	    /* Pack my diagonal entries, supernode by supernode. */
+	    fill = 0;
+	    for (k = q; k < nsupers; k += num_diag_procs) {
+		width = SuperSize( k );
+		lk = LBj( k, grid );
+		lda = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
+		lusup = Llu->Lnzval_bc_ptr[lk];
+		for (r = 0; r < width; ++r) /* stride lda+1 walks the diagonal */
+		    buf[fill+r] = lusup[r*(lda+1)];
+		fill += width;
+	    }
+	    MPI_Bcast( buf, fill, MPI_DOUBLE, root, grid->comm );
+	}
+
+	/* Unpack the broadcast buffer into the global diagU vector. */
+	fill = 0;
+	for (k = q; k < nsupers; k += num_diag_procs) {
+	    width = SuperSize( k );
+	    dst = &diagU[FstBlockC( k )];
+	    for (r = 0; r < width; ++r) dst[r] = buf[fill+r];
+	    fill += width;
+	}
+    }
+
+    SUPERLU_FREE(diag_procs);
+    SUPERLU_FREE(diag_len);
+    SUPERLU_FREE(buf);
+}
diff --git a/SRC/pddistribute.c b/SRC/pddistribute.c
new file mode 100644
index 0000000..bcb0d49
--- /dev/null
+++ b/SRC/pddistribute.c
@@ -0,0 +1,1071 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Re-distribute A on the 2D process mesh.
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Re-distribute A on the 2D process mesh.
+ * 
+ * Arguments
+ * =========
+ * 
+ * A      (input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (input) ScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_freeable (input) *Glu_freeable_t
+ *        The global structure describing the graph of L and U.
+ * 
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * colptr (output) int*
+ *
+ * rowind (output) int*
+ *
+ * a      (output) double*
+ *
+ * Return value
+ * ============
+ * </pre>
+ */
+int_t
+dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
+                Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno,
+                gridinfo_t *grid, int_t *colptr[], int_t *rowind[],
+                double *a[])
+{
+    NRformat_loc *Astore;
+    int_t  *perm_r; /* row permutation vector */
+    int_t  *perm_c; /* column permutation vector */
+    int_t  i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize;
+    int_t  nnz_loc;    /* number of local nonzeros */
+    int_t  SendCnt; /* number of remote nonzeros to be sent */
+    int_t  RecvCnt; /* number of remote nonzeros to be sent */
+    int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
+    int_t  *ia, *ja, **ia_send, *index, *itemp;
+    int_t  *ptr_to_send;
+    double *aij, **aij_send, *nzval, *dtemp;
+    double *nzval_a;
+    int    iam, it, p, procs;
+    MPI_Request *send_req;
+    MPI_Status  status;
+    
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    iam = grid->iam;
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter dReDistribute_A()");
+#endif
+    perm_r = ScalePermstruct->perm_r;
+    perm_c = ScalePermstruct->perm_c;
+    procs = grid->nprow * grid->npcol;
+    Astore = (NRformat_loc *) A->Store;
+    n = A->ncol;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    nnzToRecv = intCalloc_dist(2*procs);
+    nnzToSend = nnzToRecv + procs;
+
+
+    /* ------------------------------------------------------------
+       COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
+       THEN ALLOCATE SPACE.
+       THIS ACCOUNTS FOR THE FIRST PASS OF A.
+       ------------------------------------------------------------*/
+    for (i = 0; i < m_loc; ++i) {
+        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+  	    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
+	    jcol = Astore->colind[j];
+	    gbi = BlockNum( irow );
+	    gbj = BlockNum( jcol );
+	    p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
+	    ++nnzToSend[p]; 
+	}
+    }
+
+    /* All-to-all communication */
+    MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
+		  grid->comm);
+
+    maxnnzToRecv = 0;
+    nnz_loc = SendCnt = RecvCnt = 0;
+
+    for (p = 0; p < procs; ++p) {
+	if ( p != iam ) {
+	    SendCnt += nnzToSend[p];
+	    RecvCnt += nnzToRecv[p];
+	    maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
+	} else {
+	    nnz_loc += nnzToRecv[p];
+	    /*assert(nnzToSend[p] == nnzToRecv[p]);*/
+	}
+    }
+    k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
+
+    /* Allocate space for storing the triplets after redistribution. */
+    if ( k ) { /* count can be zero. */
+        if ( !(ia = intMalloc_dist(2*k)) )
+            ABORT("Malloc fails for ia[].");
+        if ( !(aij = doubleMalloc_dist(k)) )
+            ABORT("Malloc fails for aij[].");
+    }
+    ja = ia + k;
+
+    /* Allocate temporary storage for sending/receiving the A triplets. */
+    if ( procs > 1 ) {
+      if ( !(send_req = (MPI_Request *)
+	     SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) )
+	ABORT("Malloc fails for send_req[].");
+      if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) )
+        ABORT("Malloc fails for ia_send[].");
+      if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) )
+        ABORT("Malloc fails for aij_send[].");
+      if ( SendCnt ) { /* count can be zero */
+          if ( !(index = intMalloc_dist(2*SendCnt)) )
+              ABORT("Malloc fails for index[].");
+          if ( !(nzval = doubleMalloc_dist(SendCnt)) )
+              ABORT("Malloc fails for nzval[].");
+      }
+      if ( !(ptr_to_send = intCalloc_dist(procs)) )
+        ABORT("Malloc fails for ptr_to_send[].");
+      if ( maxnnzToRecv ) { /* count can be zero */
+          if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) )
+              ABORT("Malloc fails for itemp[].");
+          if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) )
+              ABORT("Malloc fails for dtemp[].");
+      }
+
+      for (i = 0, j = 0, p = 0; p < procs; ++p) {
+          if ( p != iam ) {
+	      ia_send[p] = &index[i];
+	      i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
+	      aij_send[p] = &nzval[j];
+	      j += nnzToSend[p];
+	  }
+      }
+    } /* if procs > 1 */
+      
+    if ( !(*colptr = intCalloc_dist(n+1)) )
+        ABORT("Malloc fails for *colptr[].");
+
+    /* ------------------------------------------------------------
+       LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
+       THIS ACCOUNTS FOR THE SECOND PASS OF A.
+       ------------------------------------------------------------*/
+    nnz_loc = 0; /* Reset the local nonzero count. */
+    nzval_a = Astore->nzval;
+    for (i = 0; i < m_loc; ++i) {
+        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+  	    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
+	    jcol = Astore->colind[j];
+	    gbi = BlockNum( irow );
+	    gbj = BlockNum( jcol );
+	    p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
+
+	    if ( p != iam ) { /* remote */
+	        k = ptr_to_send[p];
+	        ia_send[p][k] = irow;
+	        ia_send[p][k + nnzToSend[p]] = jcol;
+		aij_send[p][k] = nzval_a[j];
+		++ptr_to_send[p]; 
+	    } else {          /* local */
+	        ia[nnz_loc] = irow;
+	        ja[nnz_loc] = jcol;
+		aij[nnz_loc] = nzval_a[j];
+		++nnz_loc;
+		++(*colptr)[jcol]; /* Count nonzeros in each column */
+	    }
+	}
+    }
+
+    /* ------------------------------------------------------------
+       PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
+       NOTE: Can possibly use MPI_Alltoallv.
+       ------------------------------------------------------------*/
+    for (p = 0; p < procs; ++p) {
+        if ( p != iam ) {
+	    it = 2*nnzToSend[p];
+	    MPI_Isend( ia_send[p], it, mpi_int_t,
+		       p, iam, grid->comm, &send_req[p] );
+	    it = nnzToSend[p];
+	    MPI_Isend( aij_send[p], it, MPI_DOUBLE,
+	               p, iam+procs, grid->comm, &send_req[procs+p] ); 
+	}
+    }
+
+    for (p = 0; p < procs; ++p) {
+        if ( p != iam ) {
+	    it = 2*nnzToRecv[p];
+	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
+	    it = nnzToRecv[p];
+            MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs,
+		      grid->comm, &status );
+	    for (i = 0; i < nnzToRecv[p]; ++i) {
+	        ia[nnz_loc] = itemp[i];
+		jcol = itemp[i + nnzToRecv[p]];
+		/*assert(jcol<n);*/
+	        ja[nnz_loc] = jcol;
+		aij[nnz_loc] = dtemp[i];
+		++nnz_loc;
+		++(*colptr)[jcol]; /* Count nonzeros in each column */ 
+	    }
+	}
+    }
+
+    for (p = 0; p < procs; ++p) {
+        if ( p != iam ) {
+	    MPI_Wait( &send_req[p], &status);
+	    MPI_Wait( &send_req[procs+p], &status);
+	}
+    }
+
+    /* ------------------------------------------------------------
+       DEALLOCATE TEMPORARY STORAGE
+       ------------------------------------------------------------*/
+
+    SUPERLU_FREE(nnzToRecv);
+
+    if ( procs > 1 ) {
+	SUPERLU_FREE(send_req);
+	SUPERLU_FREE(ia_send);
+	SUPERLU_FREE(aij_send);
+	if ( SendCnt ) {
+            SUPERLU_FREE(index);
+            SUPERLU_FREE(nzval);
+        }
+	SUPERLU_FREE(ptr_to_send);
+        if ( maxnnzToRecv ) {
+            SUPERLU_FREE(itemp);
+            SUPERLU_FREE(dtemp);
+        }
+    }
+
+    /* ------------------------------------------------------------
+       CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT.
+       ------------------------------------------------------------*/
+    if ( nnz_loc ) { /* nnz_loc can be zero */
+        if ( !(*rowind = intMalloc_dist(nnz_loc)) )
+            ABORT("Malloc fails for *rowind[].");
+        if ( !(*a = doubleMalloc_dist(nnz_loc)) )
+            ABORT("Malloc fails for *a[].");
+    }
+
+    /* Initialize the array of column pointers */
+    k = 0;
+    jsize = (*colptr)[0];
+    (*colptr)[0] = 0;
+    for (j = 1; j < n; ++j) {
+	k += jsize;
+	jsize = (*colptr)[j];
+	(*colptr)[j] = k;
+    }
+    
+    /* Copy the triplets into the column oriented storage */
+    for (i = 0; i < nnz_loc; ++i) {
+	j = ja[i];
+	k = (*colptr)[j];
+	(*rowind)[k] = ia[i];
+	(*a)[k] = aij[i];
+	++(*colptr)[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column */
+    for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1];
+    (*colptr)[0] = 0;
+
+    if ( nnz_loc ) {
+        SUPERLU_FREE(ia);
+        SUPERLU_FREE(aij);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit dReDistribute_A()");
+#endif
+ 
+    return 0;
+} /* dReDistribute_A */
+
+float
+pddistribute(fact_t fact, int_t n, SuperMatrix *A,
+	     ScalePermstruct_t *ScalePermstruct,
+	     Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct,
+	     gridinfo_t *grid)
+/*
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Purpose
+ * =======
+ *   Distribute the matrix onto the 2D process mesh.
+ * 
+ * Arguments
+ * =========
+ * 
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (input) int_t
+ *        Dimension of the matrix.
+ *
+ * A      (input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be:
+ *        Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (input) ScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_freeable (input) Glu_freeable_t*
+ *        The global structure describing the graph of L and U.
+ * 
+ * LUstruct (input/output) LUstruct_t*
+ *        Data structures for L and U factors. LUstruct->Llu is set up here,
+ *        except for SamePattern_SameRowPerm where only values are reloaded.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   Working storage used (in bytes). It is only tallied when PRNTlevel>=1;
+ *   otherwise 0 is returned.
+ */
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, 
+          len, len1, nsupc;
+    int_t ljb;  /* local block column number */
+    int_t nrbl; /* number of L blocks in current block column */
+    int_t nrbu; /* number of U blocks in current block column */
+    int_t gb;   /* global block number; 0 <= gb < nsupers */
+    int_t lb;   /* local block number; 0 <= lb < ceil(NSUPERS/Pr) */
+    int iam, jbrow, kcol, mycol, myrow, pc, pr;
+    int_t mybufmax[NBUFFERS];
+    NRformat_loc *Astore;
+    double *a;
+    int_t *asub, *xa;
+    int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
+    int_t *supno = Glu_persist->supno;   
+    int_t *lsub, *xlsub, *usub, *xusub;
+    int_t nsupers;
+    int_t next_lind;      /* next available position in index[*] */
+    int_t next_lval;      /* next available position in nzval[*] */
+    int_t *index;         /* indices consist of headers and row subscripts */
+    int   *index1;        /* temporary pointer to array of int */
+    double *lusup, *uval; /* nonzero values in L and U */
+    double **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
+    int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+    double **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
+    int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
+
+    /*-- Counts to be used in factorization. --*/
+    int  *ToRecv, *ToSendD, **ToSendR;
+
+    /*-- Counts to be used in lower triangular solve. --*/
+    int_t  *fmod;          /* Modification count for L-solve.        */
+    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
+    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
+    int_t  nfsendx = 0;    /* Number of Xk I will send               */
+    int_t  kseen;
+
+    /*-- Counts to be used in upper triangular solve. --*/
+    int_t  *bmod;          /* Modification count for U-solve.        */
+    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
+    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
+    int_t  nbsendx = 0;    /* Number of Xk I will send               */
+    int_t  *ilsum;         /* starting position of each supernode in 
+			      the full array (local)                 */
+
+    /*-- Auxiliary arrays; freed on return --*/
+    int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
+    int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr)             */
+    int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr)      */
+    int_t *Urb_fstnz;  /* # of fstnz in a block row; size ceil(NSUPERS/Pr)  */
+    int_t *Ucbs;       /* number of column blocks in a block row            */
+    int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr)             */
+    int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr)        */
+    int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr)      */
+    int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr)      */
+    double *dense, *dense_col; /* SPA */
+    double zero = 0.0;
+    int_t ldaspa;     /* LDA of SPA */
+    int_t iword, dword; /* sizeof(int_t), sizeof(double); set if PRNTlevel>=1 */
+    float mem_use = 0.0; /* byte tally; only updated if PRNTlevel>=1 */
+
+#if ( PRNTlevel>=1 )
+    int_t nLblocks = 0, nUblocks = 0;
+#endif
+#if ( PROFlevel>=1 ) 
+    double t, t_u, t_l;
+    int_t u_blks;
+#endif
+
+    /* Initialization. */
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
+    nsupers  = supno[n-1] + 1;
+    Astore   = (NRformat_loc *) A->Store;
+
+#if ( PRNTlevel>=1 )
+    iword = sizeof(int_t);
+    dword = sizeof(double);
+#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pddistribute()");
+#endif
+#if ( PROFlevel>=1 )
+    t = SuperLU_timer_();
+#endif
+
+    /* On return xa/asub/a hold this process's part of A, column-wise. */
+    dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno,
+		      grid, &xa, &asub, &a);
+
+#if ( PROFlevel>=1 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf("--------\n"
+		       ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t);
+#endif
+
+    if ( fact == SamePattern_SameRowPerm ) {
+#if ( PROFlevel>=1 )
+	t_l = t_u = 0; u_blks = 0;
+#endif
+	/* We can propagate the new values of A into the existing
+	   L and U data structures.            */
+	ilsum = Llu->ilsum;
+	ldaspa = Llu->ldalsum;
+	if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) )
+	    ABORT("Calloc fails for SPA dense[].");
+	nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */
+	if ( !(Urb_length = intCalloc_dist(nrbu)) )
+	    ABORT("Calloc fails for Urb_length[].");
+	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
+	    ABORT("Malloc fails for Urb_indptr[].");
+	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	Unzval_br_ptr = Llu->Unzval_br_ptr;
+#if ( PRNTlevel>=1 )
+	mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
+#endif
+#if ( PROFlevel>=1 )
+	t = SuperLU_timer_();
+#endif
+
+	/* Initialize Uval to zero. */
+	for (lb = 0; lb < nrbu; ++lb) {
+	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
+	    index = Ufstnz_br_ptr[lb];
+	    if ( index ) {
+		uval = Unzval_br_ptr[lb];
+		len = index[1];
+		for (i = 0; i < len; ++i) uval[i] = zero;
+	    } /* if index != NULL */
+	} /* for lb ... */
+
+	for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
+	    pc = PCOL( jb, grid );
+	    if ( mycol == pc ) { /* Block column jb in my process column */
+		fsupc = FstBlockC( jb );
+		nsupc = SuperSize( jb );
+
+ 		/* Scatter A into SPA (for L), or into U directly. */
+		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
+		    for (i = xa[j]; i < xa[j+1]; ++i) {
+			irow = asub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+ 			    if ( gb < jb ) { /* in U */
+ 				index = Ufstnz_br_ptr[lb];
+ 				uval = Unzval_br_ptr[lb];
+ 				while (  (k = index[Urb_indptr[lb]]) < jb ) {
+ 				    /* Skip nonzero values in this block */
+ 				    Urb_length[lb] += index[Urb_indptr[lb]+1];
+ 				    /* Move pointer to the next block */
+ 				    Urb_indptr[lb] += UB_DESCRIPTOR
+ 					+ SuperSize( k );
+ 				}
+ 				/*assert(k == jb);*/
+ 				/* start fstnz */
+ 				istart = Urb_indptr[lb] + UB_DESCRIPTOR;
+ 				len = Urb_length[lb];
+ 				fsupc1 = FstBlockC( gb+1 );
+ 				k = j - fsupc;
+ 				/* Sum the lengths of the leading columns */
+ 				for (jj = 0; jj < k; ++jj)
+				    len += fsupc1 - index[istart++];
+				/*assert(irow>=index[istart]);*/
+				uval[len + irow - index[istart]] = a[i];
+			    } else { /* in L; put in SPA first */
+  				irow = ilsum[lb] + irow - FstBlockC( gb );
+  				dense_col[irow] = a[i];
+  			    }
+  			}
+		    } /* for i ... */
+  		    dense_col += ldaspa;
+		} /* for j ... */
+
+#if ( PROFlevel>=1 )
+		t_u += SuperLU_timer_() - t;
+		t = SuperLU_timer_();
+#endif
+
+		/* Gather the values of A from SPA into Lnzval[]. */
+		ljb = LBj( jb, grid ); /* Local block number */
+		index = Lrowind_bc_ptr[ljb];
+		if ( index ) {
+		    nrbl = index[0];   /* Number of row blocks. */
+		    len = index[1];    /* LDA of lusup[]. */
+		    lusup = Lnzval_bc_ptr[ljb];
+		    next_lind = BC_HEADER;
+		    next_lval = 0;
+		    for (jj = 0; jj < nrbl; ++jj) {
+			gb = index[next_lind++];
+			len1 = index[next_lind++]; /* Rows in the block. */
+			lb = LBi( gb, grid );
+			for (bnnz = 0; bnnz < len1; ++bnnz) {
+			    irow = index[next_lind++]; /* Global index. */
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    k = next_lval++;
+			    /* Copy one SPA row across the block's columns,
+			       clearing the SPA for the next block column. */
+			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
+				lusup[k] = dense_col[irow];
+				dense_col[irow] = zero;
+				k += len;
+				dense_col += ldaspa;
+			    }
+			} /* for bnnz ... */
+		    } /* for jj ... */
+		} /* if index ... */
+#if ( PROFlevel>=1 )
+		t_l += SuperLU_timer_() - t;
+#endif
+	    } /* if mycol == pc */
+	} /* for jb ... */
+
+	SUPERLU_FREE(dense);
+	SUPERLU_FREE(Urb_length);
+	SUPERLU_FREE(Urb_indptr);
+#if ( PROFlevel>=1 )
+	if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %lld\tnrbu %lld\n",
+			   t_l, t_u, (long long) u_blks, (long long) nrbu);
+#endif
+
+    } else {
+        /* ------------------------------------------------------------
+	   FIRST TIME CREATING THE L AND U DATA STRUCTURES.
+	   ------------------------------------------------------------*/
+
+#if ( PROFlevel>=1 )
+	t_l = t_u = 0; u_blks = 0;
+#endif
+	/* We first need to set up the L and U data structures and then
+	 * propagate the values of A into them.
+	 */
+	lsub = Glu_freeable->lsub;    /* compressed L subscripts */
+	xlsub = Glu_freeable->xlsub;
+	usub = Glu_freeable->usub;    /* compressed U subscripts */
+	xusub = Glu_freeable->xusub;
+    
+	if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) )
+	    ABORT("Malloc fails for ToRecv[].");
+	for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+
+	k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
+	if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
+	    ABORT("Malloc fails for ToSendR[].");
+	j = k * grid->npcol;
+	if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) )
+	    ABORT("Malloc fails for index[].");
+#if ( PRNTlevel>=1 )
+	mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword;
+#endif
+	for (i = 0; i < j; ++i) index1[i] = EMPTY;
+	for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j];
+	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+
+	/* Pointers to the beginning of each block row of U. */
+	if ( !(Unzval_br_ptr = 
+              (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
+	    ABORT("Malloc fails for Unzval_br_ptr[].");
+	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
+	
+	if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
+	    ABORT("Malloc fails for ToSendD[].");
+	for (i = 0; i < k; ++i) ToSendD[i] = NO;
+	if ( !(ilsum = intMalloc_dist(k+1)) )
+	    ABORT("Malloc fails for ilsum[].");
+
+	/* Auxiliary arrays used to set up U block data structures.
+	   They are freed on return. */
+	if ( !(rb_marker = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for rb_marker[].");
+	if ( !(Urb_length = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Urb_length[].");
+	if ( !(Urb_indptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Urb_indptr[].");
+	if ( !(Urb_fstnz = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Urb_fstnz[].");
+	if ( !(Ucbs = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Ucbs[].");
+#if ( PRNTlevel>=1 )	
+	mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword;
+#endif
+	/* Compute ldaspa and ilsum[]. */
+	ldaspa = 0;
+	ilsum[0] = 0;
+	for (gb = 0; gb < nsupers; ++gb) {
+	    if ( myrow == PROW( gb, grid ) ) {
+		i = SuperSize( gb );
+		ldaspa += i;
+		lb = LBi( gb, grid );
+		ilsum[lb + 1] = ilsum[lb] + i;
+	    }
+	}
+	
+#if ( PROFlevel>=1 )
+	t = SuperLU_timer_();
+#endif
+	/* ------------------------------------------------------------
+	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
+	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
+	   ------------------------------------------------------------*/
+	
+	/* Loop through each supernode column. */
+	for (jb = 0; jb < nsupers; ++jb) {
+	    pc = PCOL( jb, grid );
+	    fsupc = FstBlockC( jb );
+	    nsupc = SuperSize( jb );
+	    /* Loop through each column in the block. */
+	    for (j = fsupc; j < fsupc + nsupc; ++j) {
+		/* usub[*] contains only "first nonzero" in each segment. */
+		for (i = xusub[j]; i < xusub[j+1]; ++i) {
+		    irow = usub[i]; /* First nonzero of the segment. */
+		    gb = BlockNum( irow );
+		    kcol = PCOL( gb, grid );
+		    ljb = LBj( gb, grid );
+		    if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
+		    pr = PROW( gb, grid );
+		    lb = LBi( gb, grid );
+		    if ( mycol == pc ) {
+			if  ( myrow == pr ) {
+			    ToSendD[lb] = YES;
+			    /* Count nonzeros in entire block row. */
+			    Urb_length[lb] += FstBlockC( gb+1 ) - irow;
+			    if (rb_marker[lb] <= jb) {/* First see the block */
+				rb_marker[lb] = jb + 1;
+				Urb_fstnz[lb] += nsupc;
+				++Ucbs[lb]; /* Number of column blocks
+					       in block row lb. */
+#if ( PRNTlevel>=1 )
+				++nUblocks;
+#endif
+			    }
+			    ToRecv[gb] = 1;
+			} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
+		    }
+		} /* for i ... */
+	    } /* for j ... */
+	} /* for jb ... */
+	
+	/* Set up the initial pointers for each block row in U. */
+	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+	for (lb = 0; lb < nrbu; ++lb) {
+	    len = Urb_length[lb];
+	    rb_marker[lb] = 0; /* Reset block marker. */
+	    if ( len ) {
+		/* Add room for descriptors */
+		len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
+		if ( !(index = intMalloc_dist(len1+1)) )
+		    ABORT("Malloc fails for Uindex[].");
+		Ufstnz_br_ptr[lb] = index;
+		if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) )
+		    ABORT("Malloc fails for Unzval_br_ptr[*][].");
+		mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
+		mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
+		index[0] = Ucbs[lb]; /* Number of column blocks */
+		index[1] = len;      /* Total length of nzval[] */
+		index[2] = len1;     /* Total length of index[] */
+		index[len1] = -1;    /* End marker */
+	    } else {
+		Ufstnz_br_ptr[lb] = NULL;
+		Unzval_br_ptr[lb] = NULL;
+	    }
+	    Urb_length[lb] = 0; /* Reset block length. */
+	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
+ 	    Urb_fstnz[lb] = BR_HEADER;
+	} /* for lb ... */
+
+	SUPERLU_FREE(Ucbs);
+
+#if ( PROFlevel>=1 )
+	t = SuperLU_timer_() - t;
+	if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t);
+#endif
+#if ( PRNTlevel>=1 )
+        mem_use -= 2.0*k * iword;
+#endif
+	/* Auxiliary arrays used to set up L block data structures.
+	   They are freed on return.
+	   k is the number of local row blocks.   */
+	if ( !(Lrb_length = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Lrb_length[].");
+	if ( !(Lrb_number = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_number[].");
+	if ( !(Lrb_indptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_indptr[].");
+	if ( !(Lrb_valptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_valptr[].");
+	if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) )
+	    ABORT("Calloc fails for SPA dense[].");
+
+	/* These counts will be used for triangular solves. */
+	if ( !(fmod = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for fmod[].");
+	if ( !(bmod = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for bmod[].");
+
+	/* ------------------------------------------------ */
+#if ( PRNTlevel>=1 )	
+	mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword;
+#endif
+	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+
+	/* Pointers to the beginning of each block column of L. */
+	if ( !(Lnzval_bc_ptr = 
+              (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
+	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
+	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
+	Lrowind_bc_ptr[k-1] = NULL; /* Last slot may never be assigned below, */
+	Lnzval_bc_ptr[k-1] = NULL;  /* so keep the pair consistently NULL.    */
+	/* These lists of processes will be used for triangular solves. */
+	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    ABORT("Malloc fails for fsendx_plist[].");
+	len = k * grid->nprow;
+	if ( !(index = intMalloc_dist(len)) )
+	    ABORT("Malloc fails for fsendx_plist[0]");
+	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
+	    fsendx_plist[i] = &index[j];
+	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    ABORT("Malloc fails for bsendx_plist[].");
+	if ( !(index = intMalloc_dist(len)) )
+	    ABORT("Malloc fails for bsendx_plist[0]");
+	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
+	    bsendx_plist[i] = &index[j];
+	/* -------------------------------------------------------------- */
+#if ( PRNTlevel>=1 )
+	mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
+#endif
+
+	/*------------------------------------------------------------
+	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
+	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
+	  ------------------------------------------------------------*/
+
+	for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
+	    pc = PCOL( jb, grid );
+	    if ( mycol == pc ) { /* Block column jb in my process column */
+		fsupc = FstBlockC( jb );
+		nsupc = SuperSize( jb );
+		ljb = LBj( jb, grid ); /* Local block number */
+		
+		/* Scatter A into SPA. */
+		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
+		    for (i = xa[j]; i < xa[j+1]; ++i) {
+			irow = asub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    dense_col[irow] = a[i];
+			}
+		    }
+		    dense_col += ldaspa;
+		} /* for j ... */
+
+		jbrow = PROW( jb, grid );
+
+		/*------------------------------------------------
+		 * SET UP U BLOCKS.
+		 *------------------------------------------------*/
+#if ( PROFlevel>=1 )
+		t = SuperLU_timer_();
+#endif
+		kseen = 0;
+		dense_col = dense;
+		/* Loop through each column in the block column. */
+		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
+		    istart = xusub[j];
+		    /* NOTE: Only the first nonzero index of the segment
+		       is stored in usub[]. */
+		    for (i = istart; i < xusub[j+1]; ++i) {
+			irow = usub[i]; /* First nonzero in the segment. */
+			gb = BlockNum( irow );
+			pr = PROW( gb, grid );
+			if ( pr != jbrow &&
+			     myrow == jbrow &&  /* diag. proc. owning jb */
+			     bsendx_plist[ljb][pr] == EMPTY ) {
+			    bsendx_plist[ljb][pr] = YES;
+			    ++nbsendx;
+                        }
+			if ( myrow == pr ) {
+			    lb = LBi( gb, grid ); /* Local block number */
+			    index = Ufstnz_br_ptr[lb];
+			    uval = Unzval_br_ptr[lb];
+			    fsupc1 = FstBlockC( gb+1 );
+			    if (rb_marker[lb] <= jb) { /* First time see 
+							  the block       */
+				rb_marker[lb] = jb + 1;
+				Urb_indptr[lb] = Urb_fstnz[lb];
+				index[Urb_indptr[lb]] = jb; /* Descriptor */
+				Urb_indptr[lb] += UB_DESCRIPTOR;
+				/* Record the first location in index[] of the
+				   next block */
+				Urb_fstnz[lb] = Urb_indptr[lb] + nsupc;
+				len = Urb_indptr[lb];/* Start fstnz in index */
+				index[len-1] = 0;
+				for (k = 0; k < nsupc; ++k)
+				    index[len+k] = fsupc1;
+				if ( gb != jb )/* Exclude diagonal block. */
+				    ++bmod[lb];/* Mod. count for back solve */
+				if ( kseen == 0 && myrow != jbrow ) {
+				    ++nbrecvx;
+				    kseen = 1;
+				}
+			    } else { /* Already saw the block */
+				len = Urb_indptr[lb];/* Start fstnz in index */
+			    }
+			    jj = j - fsupc;
+			    index[len+jj] = irow;
+			    /* Load the numerical values */
+			    k = fsupc1 - irow; /* No. of nonzeros in segment */
+			    index[len-1] += k; /* Increment block length in
+						  Descriptor */
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    for (ii = 0; ii < k; ++ii) {
+				uval[Urb_length[lb]++] = dense_col[irow + ii];
+				dense_col[irow + ii] = zero;
+			    }
+			} /* if myrow == pr ... */
+		    } /* for i ... */
+                    dense_col += ldaspa;
+		} /* for j ... */
+
+#if ( PROFlevel>=1 )
+		t_u += SuperLU_timer_() - t;
+		t = SuperLU_timer_();
+#endif		
+		/*------------------------------------------------
+		 * SET UP L BLOCKS.
+		 *------------------------------------------------*/
+
+		/* Count number of blocks and length of each block. */
+		nrbl = 0;
+		len = 0; /* Number of row subscripts I own. */
+		kseen = 0;
+		istart = xlsub[fsupc];
+		for (i = istart; i < xlsub[fsupc+1]; ++i) {
+		    irow = lsub[i];
+		    gb = BlockNum( irow ); /* Global block number */
+		    pr = PROW( gb, grid ); /* Process row owning this block */
+		    if ( pr != jbrow &&
+			 myrow == jbrow &&  /* diag. proc. owning jb */
+			 fsendx_plist[ljb][pr] == EMPTY /* first time */ ) {
+			fsendx_plist[ljb][pr] = YES;
+			++nfsendx;
+                    }
+		    if ( myrow == pr ) {
+			lb = LBi( gb, grid );  /* Local block number */
+			if (rb_marker[lb] <= jb) { /* First see this block */
+			    rb_marker[lb] = jb + 1;
+			    Lrb_length[lb] = 1;
+			    Lrb_number[nrbl++] = gb;
+			    if ( gb != jb ) /* Exclude diagonal block. */
+				++fmod[lb]; /* Mod. count for forward solve */
+			    if ( kseen == 0 && myrow != jbrow ) {
+				++nfrecvx;
+				kseen = 1;
+			    }
+#if ( PRNTlevel>=1 )
+			    ++nLblocks;
+#endif
+			} else {
+			    ++Lrb_length[lb];
+			}
+			++len;
+		    }
+		} /* for i ... */
+
+		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
+		    /* Set up the initial pointers for each block in 
+		       index[] and nzval[]. */
+		    /* Add room for descriptors */
+		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+		    if ( !(index = intMalloc_dist(len1)) ) 
+			ABORT("Malloc fails for index[]");
+		    Lrowind_bc_ptr[ljb] = index;
+		    if (!(Lnzval_bc_ptr[ljb] = 
+                         doubleMalloc_dist(len*nsupc))) {
+			fprintf(stderr, "col block " IFMT " ", jb);
+			ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
+		    }
+		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
+		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
+		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
+		    index[0] = nrbl;  /* Number of row blocks */
+		    index[1] = len;   /* LDA of the nzval[] */
+		    next_lind = BC_HEADER;
+		    next_lval = 0;
+		    for (k = 0; k < nrbl; ++k) {
+			gb = Lrb_number[k];
+			lb = LBi( gb, grid );
+			len = Lrb_length[lb];
+			Lrb_length[lb] = 0;  /* Reset vector of block length */
+			index[next_lind++] = gb; /* Descriptor */
+			index[next_lind++] = len; 
+			Lrb_indptr[lb] = next_lind;
+			Lrb_valptr[lb] = next_lval;
+			next_lind += len;
+			next_lval += len;
+		    }
+		    /* Propagate the compressed row subscripts to Lindex[],
+                       and the initial values of A from SPA into Lnzval[]. */
+		    lusup = Lnzval_bc_ptr[ljb];
+		    len = index[1];  /* LDA of lusup[] */
+		    for (i = istart; i < xlsub[fsupc+1]; ++i) {
+			irow = lsub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+			    k = Lrb_indptr[lb]++; /* Random access a block */
+			    index[k] = irow;
+			    k = Lrb_valptr[lb]++;
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
+				lusup[k] = dense_col[irow];
+				dense_col[irow] = zero;
+				k += len;
+				dense_col += ldaspa;
+			    }
+			}
+		    } /* for i ... */
+		} else {
+		    Lrowind_bc_ptr[ljb] = NULL;
+		    Lnzval_bc_ptr[ljb] = NULL;
+		} /* if nrbl ... */
+#if ( PROFlevel>=1 )
+		t_l += SuperLU_timer_() - t;
+#endif
+	    } /* if mycol == pc */
+
+	} /* for jb ... */
+
+	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+	Llu->Unzval_br_ptr = Unzval_br_ptr;
+	Llu->ToRecv = ToRecv;
+	Llu->ToSendD = ToSendD;
+	Llu->ToSendR = ToSendR;
+	Llu->fmod = fmod;
+	Llu->fsendx_plist = fsendx_plist;
+	Llu->nfrecvx = nfrecvx;
+	Llu->nfsendx = nfsendx;
+	Llu->bmod = bmod;
+	Llu->bsendx_plist = bsendx_plist;
+	Llu->nbrecvx = nbrecvx;
+	Llu->nbsendx = nbsendx;
+	Llu->ilsum = ilsum;
+	Llu->ldalsum = ldaspa;
+	
+#if ( PRNTlevel>=1 )
+	if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
+			   nLblocks, nUblocks);
+#endif
+
+	SUPERLU_FREE(rb_marker);
+	SUPERLU_FREE(Urb_fstnz);
+	SUPERLU_FREE(Urb_length);
+	SUPERLU_FREE(Urb_indptr);
+	SUPERLU_FREE(Lrb_length);
+	SUPERLU_FREE(Lrb_number);
+	SUPERLU_FREE(Lrb_indptr);
+	SUPERLU_FREE(Lrb_valptr);
+	SUPERLU_FREE(dense);
+
+	/* Find the maximum buffer size. */
+	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
+		      MPI_MAX, grid->comm);
+
+	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+	if ( !(Llu->mod_bit = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for mod_bit[].");
+
+#if ( PROFlevel>=1 )
+	if ( !iam ) printf(".. 1st distribute time:\n "
+			   "\tL\t%.2f\n\tU\t%.2f\n"
+			   "\tu_blks %lld\tnrbu %lld\n--------\n",
+  			   t_l, t_u, (long long) u_blks, (long long) nrbu);
+#endif
+
+    } /* else fact != SamePattern_SameRowPerm */
+
+    if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */
+        SUPERLU_FREE(asub);
+        SUPERLU_FREE(a);
+    }
+    SUPERLU_FREE(xa);
+
+#if ( DEBUGlevel>=1 )
+    /* Memory allocated but not freed:
+       ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
+    CHECK_MALLOC(iam, "Exit pddistribute()");
+#endif
+    
+    return (mem_use);
+} /* PDDISTRIBUTE */
diff --git a/SRC/pdgsequ.c b/SRC/pdgsequ.c
new file mode 100644
index 0000000..8657313
--- /dev/null
+++ b/SRC/pdgsequ.c
@@ -0,0 +1,244 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Computes row and column scalings
+ *
+ * File name:	pdgsequ.c
+ * History:     Modified from LAPACK routine DGEEQU
+ */
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+
+ <pre>    
+    Purpose   
+    =======   
+
+    PDGSEQU computes row and column scalings intended to equilibrate an   
+    M-by-N sparse matrix A and reduce its condition number. R returns the row
+    scale factors and C the column scale factors, chosen to try to make   
+    the largest element in each row and column of the matrix B with   
+    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
+
+    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
+    number and BIGNUM = largest safe number.  Use of these scaling   
+    factors is not guaranteed to reduce the condition number of A but   
+    works well in practice.   
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+ 
+    Arguments   
+    =========   
+
+    A       (input) SuperMatrix*
+            The matrix of dimension (A->nrow, A->ncol) whose equilibration
+            factors are to be computed. The type of A can be:
+            Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+	    
+    R       (output) double*, size A->nrow
+            If INFO = 0 or INFO > M, R contains the row scale factors   
+            for A.
+	    
+    C       (output) double*, size A->ncol
+            If INFO = 0,  C contains the column scale factors for A.
+	    
+    ROWCND  (output) double*
+            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
+            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
+            AMAX is neither too large nor too small, it is not worth   
+            scaling by R.
+	    
+    COLCND  (output) double*
+            If INFO = 0, COLCND contains the ratio of the smallest   
+            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
+            worth scaling by C.
+	    
+    AMAX    (output) double*
+            Absolute value of largest matrix element.  If AMAX is very   
+            close to overflow or very close to underflow, the matrix   
+            should be scaled.
+	    
+    INFO    (output) int*
+            = 0:  successful exit   
+            < 0:  if INFO = -i, the i-th argument had an illegal value   
+            > 0:  if INFO = i,  and i is   
+                  <= M:  the i-th row of A is exactly zero   
+                  >  M:  the (i-M)-th column of A is exactly zero   
+
+    GRID    (input) gridinfo_t*
+            The 2D process mesh.
+    ===================================================================== 
+</pre>
+*/
+
+void
+pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd,
+	double *colcnd, double *amax, int_t *info, gridinfo_t *grid)
+{
+    /*
+     * Computes row scale factors r[] and column scale factors c[] for the
+     * distributed matrix A (each process holds m_loc consecutive rows
+     * starting at global row fst_row).  On successful return r[] and c[]
+     * hold the complete (global) vectors on every process.
+     *
+     * *info is set to 0 on success, to -1 on an illegal A, to i (1-based,
+     * i <= A->nrow) if global row i is exactly zero, and to A->nrow + j if
+     * global column j is exactly zero.  On the zero-row/zero-column exits
+     * the scale factors are not inverted and r[] is not gathered.
+     */
+
+    /* Local variables */
+    NRformat_loc *Astore;
+    double *Aval;
+    int i, m_loc;         /* m_loc stays 'int': it is sent as MPI_INT below */
+    int_t j, irow, jcol;  /* rowptr[]/colind[] hold int_t, which may be
+			     wider than int; avoid truncating them */
+    int_t loc_zero, glob_zero;
+    double rcmin, rcmax;
+    double bignum, smlnum;
+    double tempmax, tempmin;
+    double *loc_max;
+    int *r_sizes, *displs;
+    double *loc_r;
+    int_t  procs;
+    
+    /* Test the input parameters. */
+    *info = 0;
+    if ( A->nrow < 0 || A->ncol < 0 ||
+	 A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE )
+	*info = -1;
+    if (*info != 0) {
+	i = -(*info);
+	pxerr_dist("pdgsequ", grid, i);
+	return;
+    }
+
+    /* Quick return if possible */
+    if ( A->nrow == 0 || A->ncol == 0 ) {
+	*rowcnd = 1.;
+	*colcnd = 1.;
+	*amax = 0.;
+	return;
+    }
+
+    Astore = A->Store;
+    Aval = Astore->nzval;
+    m_loc = Astore->m_loc;
+    
+    /* Get machine constants. */
+    smlnum = dmach_dist("S");
+    bignum = 1. / smlnum;
+
+    /* Compute row scale factors. */
+    for (i = 0; i < A->nrow; ++i) r[i] = 0.;
+
+    /* Find the maximum element in each row.  Only the locally owned
+       entries r[fst_row .. fst_row+m_loc-1] are meaningful here. */
+    irow = Astore->fst_row;
+    for (i = 0; i < m_loc; ++i) {
+	for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
+	    r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) );
+	++irow;
+    }
+
+    /* Find the maximum and minimum scale factors over the local rows. */
+    rcmin = bignum;
+    rcmax = 0.;
+    for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) {
+	rcmax = SUPERLU_MAX(rcmax, r[i]);
+	rcmin = SUPERLU_MIN(rcmin, r[i]);
+    }
+  
+    /* Get the global MAX and MIN for R */
+    tempmax = rcmax;
+    tempmin = rcmin;
+    MPI_Allreduce( &tempmax, &rcmax, 
+		1, MPI_DOUBLE, MPI_MAX, grid->comm);
+    MPI_Allreduce( &tempmin, &rcmin, 
+		1, MPI_DOUBLE, MPI_MIN, grid->comm);
+
+    *amax = rcmax;
+
+    if (rcmin == 0.) {
+	/* Some row of A is exactly zero.  rcmin is a global value, so every
+	   process takes this branch and the collective below is matched.
+	   Only the local slice of r[] is valid at this point (the other
+	   entries were merely zero-initialized above), so look for the
+	   first zero among the local rows and take the global minimum.
+	   This makes all processes report the same, correct row. */
+	loc_zero = A->nrow; /* sentinel: no zero row on this process */
+	for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i)
+	    if (r[i] == 0.) {
+		loc_zero = i;
+		break;
+	    }
+	MPI_Allreduce( &loc_zero, &glob_zero,
+		1, mpi_int_t, MPI_MIN, grid->comm);
+	*info = glob_zero + 1;
+	return;
+    } else {
+	/* Invert the scale factors.  Non-local entries are overwritten by
+	   the gather at the end of the routine. */
+	for (i = 0; i < A->nrow; ++i)
+	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
+	/* Compute ROWCND = min(R(I)) / max(R(I)) */
+	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
+    }
+
+    /* Compute column scale factors */
+    for (j = 0; j < A->ncol; ++j) c[j] = 0.;
+
+    /* Find the maximum element in each column, assuming the row
+       scalings computed above. */
+    irow = Astore->fst_row;
+    for (i = 0; i < m_loc; ++i) {
+        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+	    jcol = Astore->colind[j];
+	    c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] );
+	}
+	++irow;
+    }
+
+    /* Find the global maximum for c[j] */
+    if ( !(loc_max = doubleMalloc_dist(A->ncol)))
+      ABORT("Malloc fails for loc_max[].");
+    for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j];
+    MPI_Allreduce(loc_max, c, A->ncol, MPI_DOUBLE, MPI_MAX, grid->comm);
+    SUPERLU_FREE(loc_max);
+
+    /* Find the maximum and minimum scale factors.  c[] is already global
+       here, so every process computes the same values. */
+    rcmin = bignum;
+    rcmax = 0.;
+    for (j = 0; j < A->ncol; ++j) {
+	rcmax = SUPERLU_MAX(rcmax, c[j]);
+	rcmin = SUPERLU_MIN(rcmin, c[j]);
+    }
+
+    if (rcmin == 0.) {
+	/* Find the first zero scale factor and return an error code. */
+	for (j = 0; j < A->ncol; ++j)
+	    if ( c[j] == 0. ) {
+		*info = A->nrow + j + 1;
+		return;
+	    }
+    } else {
+	/* Invert the scale factors. */
+	for (j = 0; j < A->ncol; ++j)
+	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
+	/* Compute COLCND = min(C(J)) / max(C(J)) */
+	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
+    }
+
+    /* gather R from each process to get the global R.  */
+
+    procs = grid->nprow * grid->npcol;
+    /* One allocation holds both r_sizes[procs] and displs[procs]. */
+    if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int))))
+      ABORT("Malloc fails for r_sizes[].");
+    displs = r_sizes + procs;
+    if ( !(loc_r = doubleMalloc_dist(m_loc)))
+      ABORT("Malloc fails for loc_r[].");
+    /* Copy the local slice out first: r[] is also the receive buffer. */
+    j = Astore->fst_row;
+    for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++];
+
+    /* First gather the size of each piece. */
+    MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm);
+      
+    /* Set up the displacements for allgatherv */
+    displs[0] = 0;
+    for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1];
+
+    /* Now gather the actual data */
+    MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs,
+                MPI_DOUBLE, grid->comm);
+      
+    SUPERLU_FREE(r_sizes);
+    SUPERLU_FREE(loc_r);
+
+    return;
+
+} /* pdgsequ */
diff --git a/SRC/pdgsmv.c b/SRC/pdgsmv.c
new file mode 100644
index 0000000..538e736
--- /dev/null
+++ b/SRC/pdgsmv.c
@@ -0,0 +1,383 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief  Parallel sparse matrix-vector multiplication
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*
+ * Set up the communication pattern used later by pdgsmv().
+ *
+ * The routine modifies the local part of A in place:
+ *   - within every local row, the entries whose column is owned by this
+ *     process (row_to_proc[jcol] == iam) are swapped to the front;
+ *     extern_start[i] records where the remaining ("external") entries of
+ *     row i begin;
+ *   - every column index is replaced by a local position: jcol - fst_row
+ *     for local columns, or the slot in ind_tosend[] for external ones.
+ *
+ * It also exchanges with every other process the list of x-indices needed,
+ * and allocates the value buffers.  All buffers are stored in *gsmv_comm;
+ * they are released by pdgsmv_finalize().
+ *
+ * Naming note: SendCounts[p] is the number of index requests this process
+ * sends to p (and hence the number of x values it later receives from p);
+ * RecvCounts[p] is the number of requests received from p.
+ */
+void pdgsmv_init
+(
+ SuperMatrix *A,       /* Matrix A permuted by columns (input/output).
+			  The type of A can be:
+			  Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. */
+ int_t *row_to_proc,   /* Input. Mapping between rows and processes. */
+ gridinfo_t *grid,     /* Input */
+ pdgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */
+ )
+{
+    NRformat_loc *Astore;
+    int iam, p, procs;
+    int *SendCounts, *RecvCounts;
+    /* NOTE: m and itemp are not used after being declared/assigned. */
+    int_t i, j, k, l, m, m_loc, n, fst_row, jcol;
+    int_t TotalIndSend, TotalValSend;
+    int_t *colind, *rowptr;
+    int_t *ind_tosend = NULL, *ind_torecv = NULL;
+    int_t *ptr_ind_tosend, *ptr_ind_torecv;
+    int_t *extern_start, *spa, *itemp;
+    double *nzval, *val_tosend = NULL, *val_torecv = NULL, t;
+    MPI_Request *send_req, *recv_req;
+    MPI_Status status;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pdgsmv_init()");
+#endif
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    iam = grid->iam;
+    procs = grid->nprow * grid->npcol;
+    Astore = (NRformat_loc *) A->Store;
+    m = A->nrow;
+    n = A->ncol;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    colind = Astore->colind;
+    rowptr = Astore->rowptr;
+    nzval = Astore->nzval;
+    /* SendCounts[] and RecvCounts[] share a single allocation. */
+    if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) )
+        ABORT("Malloc fails for SendCounts[]");
+    /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/
+    RecvCounts = SendCounts + procs;
+    /* ptr_ind_tosend[] and ptr_ind_torecv[] (procs+1 entries each) also
+       share a single allocation. */
+    if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) )
+        ABORT("Malloc fails for ptr_ind_tosend[]");
+    ptr_ind_torecv = ptr_ind_tosend + procs + 1;
+    if ( !(extern_start = intMalloc_dist(m_loc)) )
+        ABORT("Malloc fails for extern_start[]");
+    for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i];
+
+    /* ------------------------------------------------------------
+       COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS.
+       THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS.
+       SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE
+       LOCAL PART OF X.
+       THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A.
+       ------------------------------------------------------------*/
+    if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */
+        ABORT("Malloc fails for spa[]");
+    for (p = 0; p < procs; ++p) SendCounts[p] = 0;
+    for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+        k = extern_start[i]; /* next free slot in the "local" prefix of row i */
+        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */
+	    jcol = colind[j];
+            p = row_to_proc[jcol];
+	    if ( p != iam ) { /* External */
+	        /* spa[] is used here as a "seen" flag so that each external
+		   column is counted once, however many rows reference it. */
+	        if ( spa[jcol] == 0 ) { /* First time see this index */
+		    ++SendCounts[p];
+		    spa[jcol] = 1;
+                }
+	    } else { /* Swap to beginning the part of A corresponding
+			to the local part of X */
+		l = colind[k];
+		t = nzval[k];
+		colind[k] = jcol;
+		nzval[k] = nzval[j];
+		colind[j] = l;
+		nzval[j] = t;
+		++k;
+	    }
+	}
+	extern_start[i] = k; /* external entries of row i start here */
+    }
+
+    /* ------------------------------------------------------------
+       LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES.
+       THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A.
+       ------------------------------------------------------------*/
+    /* Build pointers to ind_tosend[]. */
+    ptr_ind_tosend[0] = 0;
+    for (p = 0, TotalIndSend = 0; p < procs; ++p) {
+        TotalIndSend += SendCounts[p]; /* Total to send. */
+	ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p];
+    }
+#if 0
+    ptr_ind_tosend[iam] = 0; /* Local part of X */
+#endif
+    if ( TotalIndSend ) {
+        if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) )
+	    ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */
+    }
+
+    /* Build SPA to aid global to local translation. */
+    for (i = 0; i < n; ++i) spa[i] = EMPTY;
+    for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */
+        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+	    jcol = colind[j];
+	    if ( spa[jcol] == EMPTY ) { /* First time see this index */
+	        p = row_to_proc[jcol];
+		if ( p == iam ) { /* Local */
+		  /*assert(jcol>=fst_row);*/
+		  spa[jcol] = jcol - fst_row; /* Relative position in local X */
+		} else {          /* External */
+		  ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */
+		  spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */
+		  /* ptr_ind_tosend[p] is advanced while filling; it is
+		     rewound by SendCounts[p] before the sends below. */
+		  ++ptr_ind_tosend[p];
+		}
+	    }
+	}
+    }
+    
+    /* ------------------------------------------------------------
+       TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES.
+       THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A.
+       ------------------------------------------------------------*/
+    for (i = 0; i < m_loc; ++i) {
+        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+	    jcol = colind[j];
+	    colind[j] = spa[jcol];
+	}
+    }
+
+    /* ------------------------------------------------------------
+       COMMUNICATE THE EXTERNAL INDICES OF X.
+       ------------------------------------------------------------*/
+    MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT,
+		 grid->comm);
+
+    /* Build pointers to ind_torecv[]. */
+    ptr_ind_torecv[0] = 0;
+    for (p = 0, TotalValSend = 0; p < procs; ++p) {
+        TotalValSend += RecvCounts[p]; /* Total to receive. */
+	ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p];
+    }
+    if ( TotalValSend ) {
+        if ( !(ind_torecv = intMalloc_dist(TotalValSend)) )
+	    ABORT("Malloc fails for ind_torecv[]");
+    }
+
+    /* send_req[] and recv_req[] share a single allocation (the abort
+       message below names recv_req although send_req is the base). */
+    if ( !(send_req = (MPI_Request *)
+	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))))
+        ABORT("Malloc fails for recv_req[].");
+    recv_req = send_req + procs;
+    for (p = 0; p < procs; ++p) {
+        ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */
+        /* Messages are tagged with the sender's rank. */
+        if ( SendCounts[p] ) {
+	    MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p],
+		      mpi_int_t, p, iam, grid->comm, &send_req[p]);
+	}
+	if ( RecvCounts[p] ) {
+	    MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p],
+		      mpi_int_t, p, p, grid->comm, &recv_req[p]);
+	}
+    }
+    /* Only requests that were actually posted above are waited on. */
+    for (p = 0; p < procs; ++p) {
+        if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status);
+	if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status);
+    }
+
+    /* Allocate storage for the X values to be transferred.
+       val_torecv mirrors ind_tosend[] (values arriving for the indices we
+       requested); val_tosend mirrors ind_torecv[] (values we must ship). */
+    if ( TotalIndSend &&
+         !(val_torecv = doubleMalloc_dist(TotalIndSend)) )
+        ABORT("Malloc fails for val_torecv[].");
+    if ( TotalValSend &&
+         !(val_tosend = doubleMalloc_dist(TotalValSend)) )
+        ABORT("Malloc fails for val_tosend[].");
+
+    gsmv_comm->extern_start = extern_start;
+    gsmv_comm->ind_tosend = ind_tosend;
+    gsmv_comm->ind_torecv = ind_torecv;
+    gsmv_comm->ptr_ind_tosend = ptr_ind_tosend;
+    gsmv_comm->ptr_ind_torecv = ptr_ind_torecv;
+    gsmv_comm->SendCounts = SendCounts;
+    gsmv_comm->RecvCounts = RecvCounts;
+    gsmv_comm->val_tosend = val_tosend;
+    gsmv_comm->val_torecv = val_torecv;
+    gsmv_comm->TotalIndSend = TotalIndSend;
+    gsmv_comm->TotalValSend = TotalValSend;
+    
+    SUPERLU_FREE(spa);
+    SUPERLU_FREE(send_req);
+
+#if ( DEBUGlevel>=2 )
+    PrintInt10("pdgsmv_init::rowptr", m_loc+1, rowptr);
+    PrintInt10("pdgsmv_init::extern_start", m_loc, extern_start);
+#endif
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgsmv_init()");
+#endif
+
+} /* PDGSMV_INIT */
+
+
+/*
+ * Performs sparse matrix-vector multiplication: ax = A*x, or
+ * ax = abs(A)*abs(x) when 'abs' is nonzero.
+ *
+ * A_internal must already have been processed by pdgsmv_init(): its
+ * column indices are positions either in the local x[] (entries before
+ * extern_start[i] in row i) or in the receive buffer val_torecv[]
+ * (entries from extern_start[i] on).
+ *
+ * The exchange of x values is started with non-blocking MPI calls, the
+ * local part of the product is computed while it is in flight, and the
+ * external part is added after the requests complete.
+ */
+void
+pdgsmv
+(
+ int_t  abs,               /* Input. Do abs(A)*abs(x). */
+ SuperMatrix *A_internal,  /* Input. Matrix A permuted by columns.
+			      The column indices are translated into
+			      the relative positions in the gathered x-vector.
+			      The type of A can be:
+			      Stype = NR_loc; Dtype = SLU_D; Mtype = GE. */
+ gridinfo_t *grid,         /* Input */
+ pdgsmv_comm_t *gsmv_comm, /* Input. The data structure for communication. */
+ double x[],       /* Input. The distributed source vector */
+ double ax[]       /* Output. The distributed destination vector */
+)
+{
+    NRformat_loc *Astore;
+    int iam, procs;
+    /* NOTE: m and n are assigned but never read; ind_tosend is declared
+       but never used in this routine. */
+    int_t i, j, p, m, m_loc, n, fst_row, jcol;
+    int_t *colind, *rowptr;
+    int   *SendCounts, *RecvCounts;
+    int_t *ind_tosend, *ind_torecv, *ptr_ind_tosend, *ptr_ind_torecv;
+    int_t *extern_start, TotalValSend;
+    double *nzval, *val_tosend, *val_torecv;
+    double zero = 0.0;
+    MPI_Request *send_req, *recv_req;
+    MPI_Status status;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pdgsmv()");
+#endif
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    iam = grid->iam;
+    procs = grid->nprow * grid->npcol;
+    Astore = (NRformat_loc *) A_internal->Store;
+    m = A_internal->nrow;
+    n = A_internal->ncol;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    colind = Astore->colind;
+    rowptr = Astore->rowptr;
+    nzval = (double *) Astore->nzval;
+    extern_start = gsmv_comm->extern_start;
+    ind_torecv = gsmv_comm->ind_torecv;
+    ptr_ind_tosend = gsmv_comm->ptr_ind_tosend;
+    ptr_ind_torecv = gsmv_comm->ptr_ind_torecv;
+    SendCounts = gsmv_comm->SendCounts;
+    RecvCounts = gsmv_comm->RecvCounts;
+    val_tosend = (double *) gsmv_comm->val_tosend;
+    val_torecv = (double *) gsmv_comm->val_torecv;
+    TotalValSend = gsmv_comm->TotalValSend;
+
+    /* ------------------------------------------------------------
+       COPY THE X VALUES INTO THE SEND BUFFER.
+       ------------------------------------------------------------*/
+    /* ind_torecv[] holds the global indices other processes requested
+       from this one during pdgsmv_init(). */
+    for (i = 0; i < TotalValSend; ++i) {
+        j = ind_torecv[i] - fst_row; /* Relative index in x[] */
+	val_tosend[i] = x[j];
+    }
+
+    /* ------------------------------------------------------------
+       COMMUNICATE THE X VALUES.
+       ------------------------------------------------------------*/
+    /* send_req[] and recv_req[] share a single allocation (the abort
+       message below names recv_req although send_req is the base). */
+    if ( !(send_req = (MPI_Request *)
+	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))))
+        ABORT("Malloc fails for recv_req[].");
+    recv_req = send_req + procs;
+    /* The roles of the counts are reversed with respect to the index
+       exchange in pdgsmv_init(): RecvCounts[p] values are sent to p and
+       SendCounts[p] values are received from p.  Messages are tagged with
+       the sender's rank. */
+    for (p = 0; p < procs; ++p) {
+        if ( RecvCounts[p] ) {
+	    MPI_Isend(&val_tosend[ptr_ind_torecv[p]], RecvCounts[p],
+                      MPI_DOUBLE, p, iam,
+                      grid->comm, &send_req[p]);
+	}
+	if ( SendCounts[p] ) {
+	    MPI_Irecv(&val_torecv[ptr_ind_tosend[p]], SendCounts[p],
+                      MPI_DOUBLE, p, p,
+                      grid->comm, &recv_req[p]);
+	}
+    }
+    
+    /* ------------------------------------------------------------
+       PERFORM THE ACTUAL MULTIPLICATION.
+       ------------------------------------------------------------*/
+    if ( abs ) { /* Perform abs(A)*abs(x) */
+        /* Multiply the local part. */
+        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+	    ax[i] = 0.0;
+	    for (j = rowptr[i]; j < extern_start[i]; ++j) {
+	        jcol = colind[j];
+		ax[i] += fabs(nzval[j]) * fabs(x[jcol]);
+	    }
+        }
+
+        /* Wait only on the requests that were actually posted. */
+        for (p = 0; p < procs; ++p) {
+            if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status);
+	    if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status);
+        }
+
+        /* Multiply the external part. */
+        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+	    for (j = extern_start[i]; j < rowptr[i+1]; ++j) {
+	        jcol = colind[j]; /* position in val_torecv[] */
+	        ax[i] += fabs(nzval[j]) * fabs(val_torecv[jcol]);
+	    }
+	}
+    } else {
+        /* Multiply the local part. */
+        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+	    ax[i] = zero;
+	    for (j = rowptr[i]; j < extern_start[i]; ++j) {
+	        jcol = colind[j];
+		ax[i] += nzval[j] * x[jcol];
+	    }
+        }
+
+        /* Wait only on the requests that were actually posted. */
+        for (p = 0; p < procs; ++p) {
+            if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status);
+	    if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status);
+        }
+
+        /* Multiply the external part. */
+        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+	    for (j = extern_start[i]; j < rowptr[i+1]; ++j) {
+	        jcol = colind[j]; /* position in val_torecv[] */
+	        ax[i] += nzval[j] * val_torecv[jcol];
+	    }
+	}
+    }
+
+    SUPERLU_FREE(send_req);
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgsmv()");
+#endif
+
+} /* PDGSMV */
+
+/*
+ * Release the buffers that pdgsmv_init() stored in *gsmv_comm.
+ *
+ * ptr_ind_torecv and RecvCounts are not freed separately: pdgsmv_init()
+ * carves them out of the same allocations as ptr_ind_tosend and
+ * SendCounts.  The index/value buffers may be NULL (nothing to exchange)
+ * and are freed only when set.
+ */
+void pdgsmv_finalize(pdgsmv_comm_t *gsmv_comm)
+{
+    /* Always allocated by pdgsmv_init(). */
+    SUPERLU_FREE(gsmv_comm->extern_start);
+
+    /* Index lists: present only when something is exchanged. */
+    if ( gsmv_comm->ind_tosend ) {
+        SUPERLU_FREE(gsmv_comm->ind_tosend);
+    }
+    if ( gsmv_comm->ind_torecv ) {
+        SUPERLU_FREE(gsmv_comm->ind_torecv);
+    }
+
+    /* Base pointers of the two shared allocations. */
+    SUPERLU_FREE(gsmv_comm->ptr_ind_tosend);
+    SUPERLU_FREE(gsmv_comm->SendCounts);
+
+    /* Value buffers: present only when something is exchanged. */
+    if ( gsmv_comm->val_tosend ) {
+        SUPERLU_FREE(gsmv_comm->val_tosend);
+    }
+    if ( gsmv_comm->val_torecv ) {
+        SUPERLU_FREE(gsmv_comm->val_torecv);
+    }
+}
+
diff --git a/SRC/pdgsmv_AXglobal.c b/SRC/pdgsmv_AXglobal.c
new file mode 100644
index 0000000..95ff28c
--- /dev/null
+++ b/SRC/pdgsmv_AXglobal.c
@@ -0,0 +1,324 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Performs sparse matrix-vector multiplication
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+
+static void dcreate_msr_matrix(SuperMatrix *, int_t [], int_t,
+			      double **, int_t **);
+static void dPrintMSRmatrix(int, double [], int_t [], gridinfo_t *);
+
+
+/*
+ * Partition the rows of A among the processes and build, on each process,
+ * the local MSR matrix (val/bindx) used by pdgsmv_AXglobal().
+ *
+ * Outputs: *m is the number of rows owned by this process, *update the
+ * list of their global indices (allocated here when *m > 0), *val/*bindx
+ * the MSR arrays (allocated by dcreate_msr_matrix() when *m > 0), and
+ * mv_sup_to_proc[] the owner of each supernode.  Always returns 0.
+ */
+int pdgsmv_AXglobal_setup
+(
+ SuperMatrix *A,       /* Matrix A permuted by columns (input).
+			  The type of A can be:
+			  Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. */
+ Glu_persist_t *Glu_persist, /* input */
+ gridinfo_t *grid,     /* input */
+ int_t *m,             /* output */
+ int_t *update[],      /* output */
+ double *val[],        /* output */
+ int_t *bindx[],       /* output */
+ int_t *mv_sup_to_proc /* output */
+ )
+{
+    int n;
+    int input_option;
+    int N_update;    /* Number of variables updated on this process (output) */
+    int iam = grid->iam;
+    int nprocs = grid->nprow * grid->npcol;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *supno = Glu_persist->supno;
+    int_t nsupers;
+    int i, nsup, p, t1, t2, t3;
+
+
+    /* Initialize the list of global indices.
+     * NOTE: the list of global indices must be in ascending order.
+     */
+    n = A->nrow;
+    /* The partitioning scheme is fixed here, so the SUPER_BLOCK branch
+       below is never taken as the code stands. */
+    input_option = SUPER_LINEAR;
+    nsupers = supno[n-1] + 1;
+
+#if ( DEBUGlevel>=2 )
+    if ( !iam ) {
+	PrintInt10("xsup", supno[n-1]+1, xsup);
+	PrintInt10("supno", n, supno);
+    }
+#endif
+
+    if ( input_option == SUPER_LINEAR ) { /* Block partitioning based on
+					     individual rows.  */
+	/* Figure out mv_sup_to_proc[] on all processes. */
+	/* Every process walks through all p so that each one fills the
+	   complete mv_sup_to_proc[]; only the iteration p == iam sets
+	   N_update and *update. */
+	for (p = 0; p < nprocs; ++p) {
+	    t1 = n / nprocs;       /* Number of rows */
+	    t2 = n - t1 * nprocs;  /* left-over, which will be assigned
+				      to the first t2 processes.  */
+	    if ( p >= t2 ) t2 += (p * t1); /* Starting row number */
+	    else { /* First t2 processes will get one more row. */
+ 	        ++t1;              /* Number of rows. */
+		t2 = p * t1;       /* Starting row. */
+	    }
+	    /* Make sure the starting and ending rows are at the
+	       supernode boundaries. */
+	    t3 = t2 + t1;      /* Ending row. */
+	    nsup = supno[t2];
+	    if ( t2 > xsup[nsup] ) { /* Round up the starting row. */
+		t1 -= xsup[nsup+1] - t2;
+		t2 = xsup[nsup+1];
+	    }
+	    /* NOTE(review): t3 is one past the last row of the initial
+	       range, so for the last process t3 == n and supno[n] is read
+	       here -- confirm that supno[] has at least n+1 valid entries. */
+	    nsup = supno[t3];
+	    if ( t3 > xsup[nsup] ) /* Round up the ending row. */
+		t1 += xsup[nsup+1] - t3;
+	    t3 = t2 + t1 - 1; /* From here on t3 is the last row, inclusive. */
+	    if ( t1 ) {
+		for (i = supno[t2]; i <= supno[t3]; ++i) {
+		    mv_sup_to_proc[i] = p;
+#if ( DEBUGlevel>=3 )
+		    if ( mv_sup_to_proc[i] == p-1 ) {
+			fprintf(stderr, 
+				"mv_sup_to_proc conflicts at supno %d\n", i);
+			exit(-1);
+		    }
+#endif
+		}
+	    }
+	    
+	    if ( iam == p ) {
+		N_update = t1;
+		/* *update is left untouched when this process owns no rows. */
+		if ( N_update ) {
+		    if ( !(*update = intMalloc_dist(N_update)) )
+			ABORT("Malloc fails for update[]");
+		}
+		for (i = 0; i < N_update; ++i) (*update)[i] = t2 + i;
+#if ( DEBUGlevel>=3 )
+		printf("(%2d) N_update = %4d\t"
+		       "supers %4d to %4d\trows %4d to %4d\n",
+		       iam, N_update, supno[t2], supno[t3], t2, t3);
+#endif
+	    }
+	} /* for p ... */
+    } else if ( input_option == SUPER_BLOCK ) { /* Block partitioning based on
+						   individual supernodes.  */
+	/* This may cause bad load balance, because the blocks are usually
+	   small in the beginning and large toward the end.   */
+	/* Note: this branch does not fill mv_sup_to_proc[]. */
+	t1 = nsupers / nprocs;
+	t2 = nsupers - t1 * nprocs; /* left-over */
+	if ( iam >= t2 ) t2 += (iam * t1);
+	else {
+	    ++t1;          /* Number of blocks. */
+	    t2 = iam * t1; /* Starting block. */
+	}
+	N_update = xsup[t2+t1] - xsup[t2];
+	if ( !(*update = intMalloc_dist(N_update)) )
+	    ABORT("Malloc fails for update[]");
+	for (i = 0; i < N_update; ++i) (*update)[i] = xsup[t2] + i;
+    }
+
+
+    /* Create an MSR matrix in val/bindx to be used by pdgsmv(). */
+    /* dcreate_msr_matrix() returns immediately, leaving *val and *bindx
+       unset, when N_update is 0. */
+    dcreate_msr_matrix(A, *update, N_update, val, bindx);
+
+#if ( DEBUGlevel>=2 )
+    PrintInt10("mv_sup_to_proc", nsupers, mv_sup_to_proc);
+    dPrintMSRmatrix(N_update, *val, *bindx, grid);
+#endif
+
+    *m = N_update;
+    return 0;
+} /* PDGSMV_AXglobal_SETUP */
+
+
+/*! \brief
+ *
+ * <pre>
+ * Create the distributed modified sparse row (MSR) matrix: bindx/val.
+ * For a submatrix of size m-by-n, the MSR arrays are as follows:
+ *    bindx[0]      = m + 1
+ *    bindx[0..m]   = pointer to start of each row
+ *    bindx[ks..ke] = column indices of the off-diagonal nonzeros in row k,
+ *                    where, ks = bindx[k], ke = bindx[k+1]-1
+ *    val[k]        = A(k,k), k < m, diagonal elements
+ *    val[m]        = not used
+ *    val[ki]       = A(k, bindx[ki]), where ks <= ki <= ke
+ * Both arrays are of length nnz + 1.
+ * </pre> 
+*/
+static void dcreate_msr_matrix
+(
+ SuperMatrix *A,       /* Matrix A permuted by columns (input).
+			  The type of A can be:
+			  Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. */
+ int_t update[],       /* input (local) */
+ int_t N_update,       /* input (local) */
+ double **val,         /* output */
+ int_t **bindx         /* output */
+)
+{
+    /*
+     * Extracts from the column-oriented matrix A the rows
+     * update[0] .. update[N_update-1] and stores them in MSR form in
+     * newly allocated *val / *bindx.  The rows are selected by the range
+     * test lo <= irow <= hi and addressed as irow - lo, so update[] is
+     * relied upon to be a contiguous ascending range.
+     * When N_update is 0 nothing is allocated and *val / *bindx are left
+     * untouched.
+     *
+     * All index and counter locals are int_t: they hold positions taken
+     * from colbeg[]/colend[]/bindx[] and nonzero counts, which are int_t
+     * quantities and would be truncated by a plain int whenever int_t is
+     * configured wider than int.
+     */
+    int_t hi, i, irow, j, k, lo, n, nnz_local, nnz_diag;
+    NCPformat *Astore;
+    double *nzval;
+    int_t *rowcnt;
+    double zero = 0.0;
+    
+    if ( !N_update ) return;
+
+    n = A->ncol;
+    Astore = A->Store;
+    nzval = Astore->nzval;
+
+    /* One pass of original matrix A to count nonzeros of each row. */
+    if ( !(rowcnt = (int_t *) intCalloc_dist(N_update)) )
+	ABORT("Malloc fails for rowcnt[]");
+    lo = update[0];
+    hi = update[N_update-1];
+    nnz_local = 0;
+    nnz_diag = 0;
+    for (j = 0; j < n; ++j) {
+	for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) {
+	    irow = Astore->rowind[i];
+	    if ( irow >= lo && irow <= hi ) {
+		if ( irow != j ) /* Exclude diagonal */
+		    ++rowcnt[irow - lo];
+		else ++nnz_diag; /* Count nonzero diagonal entries */
+		++nnz_local;
+	    }
+	}
+    }
+
+    /* Add room for the logical diagonal zeros which are not counted
+       in nnz_local. */
+    nnz_local += (N_update - nnz_diag);
+
+    /* Allocate storage for bindx[] and val[]. */
+    if ( !(*val = (double *) doubleMalloc_dist(nnz_local+1)) )
+	ABORT("Malloc fails for val[]");
+    for (i = 0; i < N_update; ++i) (*val)[i] = zero; /* Initialize diagonal */
+    if ( !(*bindx = (int_t *) SUPERLU_MALLOC((nnz_local+1) * sizeof(int_t))) )
+	ABORT("Malloc fails for bindx[]");
+
+    /* Set up row pointers.  rowcnt[] is recycled in the same sweep: after
+       this loop rowcnt[r] is the next free slot of row r. */
+    (*bindx)[0] = N_update + 1;
+    for (j = 1; j <= N_update; ++j) {
+	(*bindx)[j] = (*bindx)[j-1] + rowcnt[j-1];
+	rowcnt[j-1] = (*bindx)[j-1];
+    }
+
+    /* One pass of original matrix A to fill in matrix entries. */
+    for (j = 0; j < n; ++j) {
+	for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) {
+	    irow = Astore->rowind[i];
+	    if ( irow >= lo && irow <= hi ) {
+		if ( irow == j ) /* Diagonal */
+		    (*val)[irow - lo] = nzval[i];
+		else {
+		    irow -= lo;
+		    k = rowcnt[irow];
+		    (*bindx)[k] = j;
+		    (*val)[k] = nzval[i];
+		    ++rowcnt[irow];
+		}
+	    }
+	}
+    }
+
+    SUPERLU_FREE(rowcnt);
+}
+
+/*! \brief
+ *
+ * <pre>
+ * Sparse matrix-vector product ax = A * X for the local rows.
+ *   - val/bindx hold the local rows of A in MSR form
+ *     (diagonal of row i in val[i], off-diagonals of row i in
+ *      positions bindx[i] .. bindx[i+1]-1)
+ *   - update[i] is the global index of local row i
+ *   - X is the global vector, ax has one entry per local row
+ * Always returns 0; nothing is done when m <= 0.
+ * </pre>
+ */
+int
+pdgsmv_AXglobal(int_t m, int_t update[], double val[], int_t bindx[],
+                double X[], double ax[])
+{
+    int_t row, pos, col;
+
+    if ( m > 0 ) { /* number of rows (local) */
+	for (row = 0; row < m; ++row) {
+	    ax[row] = 0.0;
+
+	    /* Off-diagonal entries of this row. */
+	    pos = bindx[row];
+	    while ( pos < bindx[row+1] ) {
+		col = bindx[pos];       /* column index */
+		ax[row] += val[pos] * X[col];
+		++pos;
+	    }
+
+	    /* The diagonal entry is added last. */
+	    ax[row] += val[row] * X[update[row]];
+	}
+    }
+
+    return 0;
+} /* PDGSMV_AXglobal */
+ 
+/*
+ * Sparse matrix-vector product on absolute values: ax = |A| * |X|
+ * for the local rows.
+ *   - val/bindx hold the local rows of A in MSR form
+ *   - update[i] is the global index of local row i
+ *   - X is the global vector, ax has one entry per local row
+ * Always returns 0; nothing is done when m <= 0.
+ */
+int
+pdgsmv_AXglobal_abs(int_t m, int_t update[], double val[], int_t bindx[],
+	            double X[], double ax[])
+{
+    int_t row, pos, col;
+
+    if ( m > 0 ) { /* number of rows (local) */
+	for (row = 0; row < m; ++row) {
+	    ax[row] = 0.0;
+
+	    /* Off-diagonal entries of this row. */
+	    pos = bindx[row];
+	    while ( pos < bindx[row+1] ) {
+		col = bindx[pos];       /* column index */
+		ax[row] += fabs(val[pos]) * fabs(X[col]);
+		++pos;
+	    }
+
+	    /* The diagonal entry is added last. */
+	    ax[row] += fabs(val[row]) * fabs(X[update[row]]);
+	}
+    }
+
+    return 0;
+} /* PDGSMV_AXglobal_ABS */
+
+/*
+ * Debug helper: print the local MSR matrix (val[] and bindx[]) together
+ * with the rank of the calling process.  Prints nothing when m is 0.
+ */
+static void dPrintMSRmatrix
+(
+ int m,       /* Number of rows of the submatrix. */
+ double val[],
+ int_t bindx[],
+ gridinfo_t *grid
+)
+{
+    if ( m ) {
+	int rank = grid->iam;
+	int length = bindx[m]; /* bindx[m] is passed as the length of both arrays */
+
+	printf("(%2d) MSR submatrix has %d rows -->\n", rank, m);
+	PrintDouble5("val", length, val);
+	PrintInt10("bindx", length, bindx);
+    }
+}
diff --git a/SRC/pdgsrfs.c b/SRC/pdgsrfs.c
new file mode 100644
index 0000000..49a5363
--- /dev/null
+++ b/SRC/pdgsrfs.c
@@ -0,0 +1,262 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Improves the computed solution to a system of linear equations and provides error bounds and backward error estimates
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Last modified:
+ * December 31, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief 
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PDGSRFS improves the computed solution to a system of linear   
+ * equations and provides error bounds and backward error estimates
+ * for the solution. 
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, or the scaled A if equilibration was done.
+ *        A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be:
+ *        Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from pdgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
+ *
+ * ScalePermstruct (input) ScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input) double* (local)
+ *        The m_loc-by-NRHS right-hand side matrix of the possibly
+ *        equilibrated system. That is, B may be overwritten by diag(R)*B.
+ *       
+ * ldb    (input) int (local)
+ *        Leading dimension of matrix B.
+ *
+ * X      (input/output) double* (local)
+ *        On entry, the solution matrix Y, as computed by PDGSTRS, of the
+ *            transformed system A1*Y = Pc*Pr*B, where
+ *            A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X.
+ *        On exit, the improved solution matrix Y.
+ *
+ *        In order to obtain the solution X to the original system,
+ *        Y should be permuted by Pc^T, and premultiplied by diag(C)
+ *        if DiagScale = COL or BOTH.
+ *        This must be done after this routine is called.
+ *
+ * ldx    (input) int (local)
+ *        Leading dimension of matrix X.
+ *
+ * nrhs   (input) int
+ *        Number of right-hand sides.
+ *
+ * SOLVEstruct (output) SOLVEstruct_t* (global)
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * berr   (output) double*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution   
+ *         vector X(j) (i.e., the smallest relative change in   
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the refinement steps.
+ *        See util.h for the definition of SuperLUStat_t.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        
+ * Internal Parameters   
+ * ===================   
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.   
+ * </pre>
+ */
+void
+pdgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
+	ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid,
+	double *B, int_t ldb, double *X, int_t ldx, int nrhs, 
+	SOLVEstruct_t *SOLVEstruct,
+	double *berr, SuperLUStat_t *stat, int *info)
+{
+#define ITMAX 20
+    
+    /* ax, R and dx all point at the first m_loc entries of work[]; temp
+       points at the m_loc entries that follow. */
+    double *ax, *R, *dx, *temp, *work, *B_col, *X_col;
+    int_t count, i, j, lwork, nz;
+    int   iam;
+    double eps, lstres;
+    double s, safmin, safe1, safe2;
+
+    /* Data structures used by matrix-vector multiply routine. */
+    pdgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm;
+    NRformat_loc *Astore;
+    int_t        m_loc, fst_row;
+
+    /* Initialization. */
+    Astore = (NRformat_loc *) A->Store;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    iam = grid->iam;
+
+    /* Test the input parameters.  info = -i reports the i-th argument of
+       this routine: ldb is the 8th, ldx the 10th and nrhs the 11th. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
+	      || A->Dtype != SLU_D || A->Mtype != SLU_GE )
+	*info = -2;
+    else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -8;
+    else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -10;
+    else if ( nrhs < 0 ) *info = -11;
+    if (*info != 0) {
+	i = -(*info);
+	pxerr_dist("PDGSRFS", grid, i);
+	return;
+    }
+
+    /* Quick return if possible. */
+    if ( n == 0 || nrhs == 0 ) {
+	return;
+    }
+
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pdgsrfs()");
+#endif
+
+    lwork = 2 * m_loc;  /* For ax/R/dx and temp */
+    if ( !(work = doubleMalloc_dist(lwork)) )
+	ABORT("Malloc fails for work[]");
+    ax = R = dx = work;
+    temp = ax + m_loc;
+
+    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
+    nz     = A->ncol + 1;
+    eps    = dmach_dist("Epsilon");
+    safmin = dmach_dist("Safe minimum");
+
+    /* Set SAFE1 essentially to be the underflow threshold times the
+       number of additions in each row. */
+    safe1  = nz * safmin;
+    safe2  = safe1 / eps;
+
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
+		       eps, anorm, safe1, safe2);
+#endif
+
+    /* Do for each right-hand side ... */
+    for (j = 0; j < nrhs; ++j) {
+	count = 0;
+	lstres = 3.; /* start value for the berr[j]*2 <= lstres test below */
+	B_col = &B[j*ldb];
+	X_col = &X[j*ldx];
+
+	while (1) { /* Loop until stopping criterion is satisfied. */
+
+	    /* Compute residual R = B - op(A) * X,   
+	       where op(A) = A, A**T, or A**H, depending on TRANS. */
+
+	    /* Matrix-vector multiply. */
+	    pdgsmv(0, A, grid, gsmv_comm, X_col, ax);
+	    
+	    /* Compute residual, stored in R[] (overwrites ax in place). */
+	    for (i = 0; i < m_loc; ++i) R[i] = B_col[i] - ax[i];
+
+	    /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */
+	    pdgsmv(1, A, grid, gsmv_comm, X_col, temp);
+	    for (i = 0; i < m_loc; ++i) temp[i] += fabs(B_col[i]);
+	    
+	    s = 0.0; /* local maximum of |R[i]| / temp[i] */
+	    for (i = 0; i < m_loc; ++i) {
+		if ( temp[i] > safe2 ) {
+		    s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]);
+		} else if ( temp[i] != 0.0 ) {
+                    /* Adding SAFE1 to the numerator guards against
+                       spuriously zero residuals (underflow). */
+                    s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) /temp[i]);
+                }
+                /* If temp[i] is exactly 0.0 (computed by PxGSMV), then
+                   we know the true residual also must be exactly 0.0. */
+	    }
+	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
+		
+#if ( PRNTlevel>= 1 )
+	    if ( !iam )
+		printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]);
+#endif
+	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
+		/* Compute new dx; pdgstrs is handed dx, which aliases R. */
+		pdgstrs(n, LUstruct, ScalePermstruct, grid,
+			dx, m_loc, fst_row, m_loc, 1, 
+			SOLVEstruct, stat, info);
+
+		/* Update solution. */
+		for (i = 0; i < m_loc; ++i) X_col[i] += dx[i];
+
+		lstres = berr[j];
+		++count;
+	    } else {
+		break;
+	    }
+	} /* end while */
+
+	stat->RefineSteps = count; /* overwritten for every right-hand side */
+
+    } /* for j ... */
+
+    /* Deallocate storage. */
+    SUPERLU_FREE(work);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgsrfs()");
+#endif
+
+} /* PDGSRFS */
+
diff --git a/SRC/pdgsrfs_ABXglobal.c b/SRC/pdgsrfs_ABXglobal.c
new file mode 100644
index 0000000..44621be
--- /dev/null
+++ b/SRC/pdgsrfs_ABXglobal.c
@@ -0,0 +1,465 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Improves the computed solution and provides error bounds
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Last modified:
+ * December 31, 2015  version 4.3
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*-- Function prototypes --*/
+static void gather_1rhs_diag_to_all(int_t, double [], Glu_persist_t *,
+                                    LocalLU_t *, gridinfo_t *, int_t, int_t [],
+				    int_t [], double [], double []);
+static void redist_all_to_diag(int_t, double [], Glu_persist_t *,
+                               LocalLU_t *, gridinfo_t *, int_t [], double []);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * pdgsrfs_ABXglobal improves the computed solution to a system of linear   
+ * equations and provides error bounds and backward error estimates
+ * for the solution. 
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, or the scaled A if equilibration was done.
+ *        A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc
+ *        are permutation matrices. The type of A can be:
+ *        Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE.
+ *
+ *        NOTE: Currently, A must reside in all processes when calling
+ *              this routine.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from pdgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input) double* (global)
+ *        The N-by-NRHS right-hand side matrix of the possibly equilibrated
+ *        and row permuted system.
+ *       
+ *        NOTE: Currently, B must reside on all processes when calling
+ *              this routine.
+ *
+ * ldb    (input) int (global)
+ *        Leading dimension of matrix B.
+ *
+ * X      (input/output) double* (global)
+ *        On entry, the solution matrix X, as computed by PDGSTRS.
+ *        On exit, the improved solution matrix X.
+ *        If DiagScale = COL or BOTH, X should be premultiplied by diag(C)
+ *        in order to obtain the solution to the original system.
+ *
+ *        NOTE: Currently, X must reside on all processes when calling
+ *              this routine.
+ *
+ * ldx    (input) int (global)
+ *        Leading dimension of matrix X.
+ *
+ * nrhs   (input) int
+ *        Number of right-hand sides.
+ *
+ * berr   (output) double*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution   
+ *         vector X(j) (i.e., the smallest relative change in   
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the refinement steps.
+ *        See util.h for the definition of SuperLUStat_t.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        
+ * Internal Parameters   
+ * ===================   
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.   
+ * </pre>
+ */
+
+void
+pdgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
+		  gridinfo_t *grid, double *B, int_t ldb, double *X, int_t ldx,
+		  int nrhs, double *berr, SuperLUStat_t *stat, int *info)
+{
+
+#define ITMAX 20
+    
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    /* 
+     * Data structures used by matrix-vector multiply routine.
+     */
+    int_t  N_update; /* Number of variables updated on this process */
+    int_t  *update;  /* vector elements (global index) updated 
+			on this processor.                     */
+    int_t  *bindx;
+    double *val;
+    int_t *mv_sup_to_proc;  /* Supernode to process mapping in
+			       matrix-vector multiply.  */
+    /*-- end data structures for matrix-vector multiply --*/
+    double *b, *ax, *R, *B_col, *temp, *work, *X_col,
+           *x_trs, *dx_trs;
+    int_t count, i, ii, j, jj, k, knsupc, lk, lwork,
+          nprow, nsupers, nz, p;
+    int   iam, pkk;
+    int_t *ilsum, *xsup;
+    double eps, lstres;
+    double s, safmin, safe1, safe2;
+
+    /* NEW STUFF */
+    int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */
+    int_t *diag_len; /* Length of the X vector on diagonal processes. */
+
+    /*-- Function prototypes --*/
+    extern void pdgstrs1(int_t, LUstruct_t *, gridinfo_t *,
+			 double *, int, SuperLUStat_t *, int *);
+    
+    /* Test the input parameters.  info = -i reports the i-th argument of
+       this routine: ldb is the 7th, ldx the 9th and nrhs the 10th. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( A->nrow != A->ncol || A->nrow < 0 ||
+	      A->Stype != SLU_NCP || A->Dtype != SLU_D || A->Mtype != SLU_GE )
+	*info = -2;
+    else if ( ldb < SUPERLU_MAX(0, n) ) *info = -7;
+    else if ( ldx < SUPERLU_MAX(0, n) )	*info = -9;
+    else if ( nrhs < 0 ) *info = -10;
+    if (*info != 0) {
+	i = -(*info);
+	pxerr_dist("pdgsrfs_ABXglobal", grid, i);
+	return;
+    }
+
+    /* Quick return if possible. */
+    if ( n == 0 || nrhs == 0 ) {
+	return;
+    }
+
+    /* Initialization. */
+    iam = grid->iam;
+    nprow = grid->nprow;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    ilsum = Llu->ilsum;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pdgsrfs_ABXglobal()");
+#endif
+
+    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
+		   &diag_procs, &diag_len);
+#if ( PRNTlevel>=1 )
+    if ( !iam ) {
+	printf(".. number of diag processes = " IFMT "\n", num_diag_procs);
+	PrintInt10("diag_procs", num_diag_procs, diag_procs);
+	PrintInt10("diag_len", num_diag_procs, diag_len);
+    }
+#endif
+
+    if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) )
+	ABORT("Calloc fails for mv_sup_to_proc[]");
+
+    pdgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update,
+		          &val, &bindx, mv_sup_to_proc);
+
+    i = CEILING( nsupers, nprow ); /* Number of local block rows */
+    ii = Llu->ldalsum + i * XK_H;
+    k = SUPERLU_MAX(N_update, sp_ienv_dist(3));
+    jj = diag_len[0];
+    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
+    jj = SUPERLU_MAX( jj, N_update );
+    lwork = N_update         /* For ax and R */
+	  + ii               /* For dx_trs */
+	  + ii               /* For x_trs */
+          + k                /* For b */
+	  + jj;              /* for temp */
+    if ( !(work = doubleMalloc_dist(lwork)) )
+	ABORT("Malloc fails for work[]");
+    ax = R = work;
+    dx_trs = work + N_update;
+    x_trs  = dx_trs + ii;
+    b      = x_trs + ii;
+    temp   = b + k; /* also the buffer passed to gather_1rhs_diag_to_all */
+
+#if ( DEBUGlevel>=2 )
+    {
+	double *dwork = doubleMalloc_dist(n);
+	for (i = 0; i < n; ++i) {
+	    if ( i & 1 ) dwork[i] = 1.;
+	    else dwork[i] = 2.;
+        }
+	/* Check correctness of matrix-vector multiply. */
+	pdgsmv_AXglobal(N_update, update, val, bindx, dwork, ax);
+	PrintDouble5("Mult A*x", N_update, ax);
+	SUPERLU_FREE(dwork);
+    }
+#endif
+
+
+    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
+    nz     = A->ncol + 1;
+    eps    = dmach_dist("Epsilon");
+    safmin = dmach_dist("Safe minimum");
+
+    /* Set SAFE1 essentially to be the underflow threshold times the
+       number of additions in each row. */
+    safe1  = nz * safmin;
+    safe2  = safe1 / eps;
+
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
+		       eps, anorm, safe1, safe2);
+#endif
+
+    /* Do for each right-hand side ... */
+    for (j = 0; j < nrhs; ++j) {
+	count = 0;
+	lstres = 3.; /* start value for the berr[j]*2 <= lstres test below */
+
+	/* Copy X into x on the diagonal processes. */
+	B_col = &B[j*ldb];
+	X_col = &X[j*ldx];
+	for (p = 0; p < num_diag_procs; ++p) {
+	    pkk = diag_procs[p];
+	    if ( iam == pkk ) {
+		for (k = p; k < nsupers; k += num_diag_procs) {
+		    knsupc = SuperSize( k );
+		    lk = LBi( k, grid );
+		    ii = ilsum[lk] + (lk+1)*XK_H;
+		    jj = FstBlockC( k );
+		    for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj];
+		    dx_trs[ii-XK_H] = k;/* Block number prepended in header. */
+		}
+	    }
+	}
+	/* Copy B into b distributed the same way as matrix-vector product:
+	   N_update consecutive entries of B_col starting at update[0]. */
+        if ( N_update ) ii = update[0];
+	for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii];
+	while (1) { /* Loop until stopping criterion is satisfied. */
+
+	    /* Compute residual R = B - op(A) * X,   
+	       where op(A) = A, A**T, or A**H, depending on TRANS. */
+
+	    /* Matrix-vector multiply. */
+	    pdgsmv_AXglobal(N_update, update, val, bindx, X_col, ax);
+	    
+	    /* Compute residual (R aliases ax, so ax is overwritten). */
+	    for (i = 0; i < N_update; ++i) R[i] = b[i] - ax[i];
+
+	    /* Compute abs(op(A))*abs(X) + abs(B). */
+	    pdgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, temp);
+	    for (i = 0; i < N_update; ++i) temp[i] += fabs(b[i]);
+	    
+	    s = 0.0; /* local maximum of |R[i]| / temp[i] */
+	    for (i = 0; i < N_update; ++i) {
+		if ( temp[i] > safe2 ) {
+		    s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]);
+		} else if ( temp[i] != 0.0 ) {
+                    /* Adding SAFE1 to the numerator guards against
+                       spuriously zero residuals (underflow). */
+		    s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) / temp[i]);
+                }
+                /* If temp[i] is exactly 0.0 (computed by PxGSMV), then
+                   we know the true residual also must be exactly 0.0. */
+	    }
+	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
+		
+#if ( PRNTlevel>= 1 )
+	    if ( !iam )
+		printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]);
+#endif
+	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
+		/* Compute new dx. */
+		redist_all_to_diag(n, R, Glu_persist, Llu, grid,
+				   mv_sup_to_proc, dx_trs);
+		pdgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info);
+
+		/* Update solution. */
+		for (p = 0; p < num_diag_procs; ++p) 
+		    if ( iam == diag_procs[p] )
+			for (k = p; k < nsupers; k += num_diag_procs) {
+			    lk = LBi( k, grid );
+			    ii = ilsum[lk] + (lk+1)*XK_H;
+			    knsupc = SuperSize( k );
+			    for (i = 0; i < knsupc; ++i)
+				x_trs[i + ii] += dx_trs[i + ii];
+			}
+		lstres = berr[j];
+		++count;
+		/* Transfer x_trs (on diagonal processes) into X
+		   (on all processes). */
+		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, 
+					num_diag_procs, diag_procs, diag_len,
+					X_col, temp);
+	    } else {
+		break;
+	    }
+	} /* end while */
+
+	stat->RefineSteps = count; /* overwritten for every right-hand side */
+
+    } /* for j ... */
+
+
+    /* Deallocate storage used by matrix-vector multiplication. */
+    SUPERLU_FREE(diag_procs);
+    SUPERLU_FREE(diag_len);
+    if ( N_update ) {
+	SUPERLU_FREE(update);
+	SUPERLU_FREE(bindx);
+	SUPERLU_FREE(val);
+    }
+    SUPERLU_FREE(mv_sup_to_proc);
+    SUPERLU_FREE(work);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgsrfs_ABXglobal()");
+#endif
+
+} /* PDGSRFS_ABXGLOBAL */
+
+
+/*! \brief
+ *
+ * <pre>
+ * Moves the residual r[], distributed the same way as the matrix-vector
+ * product, block by block into work[] on the process pkk of each block.
+ * </pre>
+ */
+static void
+redist_all_to_diag(int_t n, double r[], Glu_persist_t *Glu_persist,
+		   LocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[],
+		   double work[])
+{
+    int_t i, ii, k, lk, lr, nsupers;
+    int_t *ilsum, *xsup; /* xsup: presumably read by SuperSize() -- confirm */
+    int iam, knsupc, psrc, pkk;
+    MPI_Status status;
+    
+    iam = grid->iam;
+    nsupers = Glu_persist->supno[n-1] + 1; /* number of blocks looped over */
+    xsup = Glu_persist->xsup;
+    ilsum = Llu->ilsum;
+    lr = 0; /* offset in r[] of the next block held by this process */
+
+    for (k = 0; k < nsupers; ++k) {
+	pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid ); /* receiver */
+	psrc = mv_sup_to_proc[k]; /* process holding block k of r[] */
+	knsupc = SuperSize( k ); /* number of entries moved for block k */
+	lk = LBi( k, grid );
+	ii = ilsum[lk] + (lk+1)*XK_H; /* offset of block k inside work[] */
+	if ( iam == psrc ) {
+	    if ( iam != pkk ) { /* Send this block of r[] to pkk. */
+		MPI_Send( &r[lr], knsupc, MPI_DOUBLE, pkk, Xk,
+			 grid->comm );
+	    } else { /* Local copy: this process is both holder and receiver. */
+		for (i = 0; i < knsupc; ++i)
+		    work[i + ii] = r[i + lr];
+	    }
+	    lr += knsupc; /* advance past the block just handed off */
+	} else {
+	    if ( iam == pkk ) { /* Receive this block from psrc into work[]. */
+		MPI_Recv( &work[ii], knsupc, MPI_DOUBLE, psrc, Xk,
+			 grid->comm, &status );
+	    }
+	}
+    }
+} /* REDIST_ALL_TO_DIAG */
+
+
+/*! \brief
+ *
+ * <pre>
+ * Gather the components of the x vector held on the diagonal processes
+ * onto all processes, and combine them into the global vector y.
+ * </pre>
+ */
+static void
+gather_1rhs_diag_to_all(int_t n, double x[],
+			Glu_persist_t *Glu_persist, LocalLU_t *Llu,
+			gridinfo_t *grid, int_t num_diag_procs,
+			int_t diag_procs[], int_t diag_len[],
+			double y[], double work[])
+{
+    int_t i, ii, k, lk, lwork, nsupers, p;
+    int_t *ilsum, *xsup; /* xsup: presumably read by SuperSize() -- confirm */
+    int iam, knsupc, pkk;
+    
+    iam = grid->iam;
+    nsupers = Glu_persist->supno[n-1] + 1; /* number of blocks looped over */
+    xsup = Glu_persist->xsup;
+    ilsum = Llu->ilsum;
+
+    for (p = 0; p < num_diag_procs; ++p) { /* one broadcast per entry p */
+	pkk = diag_procs[p]; /* root of this broadcast */
+	if ( iam == pkk ) {
+	    /* Pack the blocks p, p+num_diag_procs, ... of x[] into work[]. */
+	    lwork = 0;
+	    for (k = p; k < nsupers; k += num_diag_procs) {
+		knsupc = SuperSize( k );
+		lk = LBi( k, grid );
+		ii = ilsum[lk] + (lk+1)*XK_H; /* offset of block k in x[] */
+		for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii];
+		lwork += knsupc;
+	    }
+	    MPI_Bcast( work, lwork, MPI_DOUBLE, pkk, grid->comm ); /* root */
+	} else { /* the other processes receive diag_len[p] entries */
+	    MPI_Bcast( work, diag_len[p], MPI_DOUBLE, pkk, grid->comm );
+	}
+	/* Scatter work[] into global y vector, on every process. */
+	lwork = 0;
+	for (k = p; k < nsupers; k += num_diag_procs) {
+	    knsupc = SuperSize( k );
+	    ii = FstBlockC( k ); /* offset of block k in y[] */
+	    for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork];
+	    lwork += knsupc;
+	}
+    }
+} /* GATHER_1RHS_DIAG_TO_ALL */
+
diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c
new file mode 100644
index 0000000..dc1bff5
--- /dev/null
+++ b/SRC/pdgssvx.c
@@ -0,0 +1,1463 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Solves a system of linear equations A*X=B
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.1.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * October 22, 2012
+ * October  1, 2014
+ * April 5, 2015
+ * December 31, 2015  version 4.3
+ * December 31, 2016  version 5.1.3
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PDGSSVX solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ * 
+ * where, fst_row is the row number of the first row,
+ *        m_loc is the number of rows local to this processor
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using 
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously 
+ *      solved problem to save time by reusing part or all of 
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *         
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described 
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where 
+ *              Pc is the row permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *     
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply 
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *         
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply 
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the column scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous 
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *         
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous 
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm, 
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply 
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *         
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *         
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or 
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector 
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or 
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity	pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, both row and
+ *             column permutation vectors perm_r and perm_c, and the
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or 
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *           
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *         
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input/output) SuperMatrix* (local)
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PDGSTRF can factorize rectangular matrices.
+ *         On exit, A may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) ScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was 
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the 
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (double*) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double*) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by 
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *         
+ * B       (input/output) double* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0.
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (LocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
+ *
+ * SOLVEstruct (input/output) SOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'.
+ *
+ * berr    (output) double*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution   
+ *         vector X(j) (i.e., the smallest relative change in   
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
+ * </pre>
+ */
+
+void
+pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, 
+	ScalePermstruct_t *ScalePermstruct,
+	double B[], int ldb, int nrhs, gridinfo_t *grid,
+	LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr,
+	SuperLUStat_t *stat, int *info)
+{
+    NRformat_loc *Astore;
+    SuperMatrix GA;      /* Global A in NC format */
+    NCformat *GAstore;
+    double   *a_GA;
+    SuperMatrix GAC;      /* Global A in NCP format (add n end pointers) */
+    NCPformat *GACstore;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    Glu_freeable_t *Glu_freeable;
+            /* The nonzero structures of L and U factors, which are
+	       replicated on all processes.
+	           (lsub, xlsub) contains the compressed subscript of
+		                 supernodes in L.
+          	   (usub, xusub) contains the compressed subscript of
+		                 nonzero segments in U.
+	      If options->Fact != SamePattern_SameRowPerm, they are 
+	      computed by SYMBFACT routine, and then used by PDDISTRIBUTE
+	      routine. They will be freed after PDDISTRIBUTE routine.
+	      If options->Fact == SamePattern_SameRowPerm, these
+	      structures are not used.                                  */
+    fact_t   Fact;
+    double   *a;
+    int_t    *colptr, *rowind;
+    int_t    *perm_r; /* row permutations from partial pivoting */
+    int_t    *perm_c; /* column permutation vector */
+    int_t    *etree;  /* elimination tree */
+    int_t    *rowptr, *colind;  /* Local A in NR*/
+    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
+    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec;
+    int_t    nnz_loc, m_loc, fst_row, icol;
+    int      iam;
+    int      ldx;  /* LDA for matrix X (local). */
+    char     equed[1], norm[1];
+    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
+    double   *X, *b_col, *b_work, *x_col;
+    double   t;
+    float    GA_mem_use;    /* memory usage by global A */
+    float    dist_mem_use; /* memory usage during distribution */
+    superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage;
+#if ( PRNTlevel>= 2 )
+    double   dmin, dsum, dprod;
+#endif
+
+    /* Structures needed for parallel symbolic factorization */
+    int_t *sizes, *fstVtxSep, parSymbFact;
+    int   noDomains, nprocs_num;
+    MPI_Comm symb_comm; /* communicator for symbolic factorization */
+    int   col, key; /* parameters for creating a new communicator */
+    Pslu_freeable_t Pslu_freeable;
+    float  flinfo;
+
+    /* Initialization. */
+    m       = A->nrow;
+    n       = A->ncol;
+    Astore  = (NRformat_loc *) A->Store;
+    nnz_loc = Astore->nnz_loc;
+    m_loc   = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    a       = (double *) Astore->nzval;
+    rowptr  = Astore->rowptr;
+    colind  = Astore->colind;
+    sizes   = NULL;
+    fstVtxSep = NULL;
+    symb_comm = MPI_COMM_NULL;
+
+    /* Test the input parameters. */
+    *info = 0;
+    Fact = options->Fact;
+    if ( Fact < 0 || Fact > FACTORED )
+	*info = -1;
+    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
+	*info = -1;
+    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
+	*info = -1;
+    else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA )
+	*info = -1;
+    else if ( options->IterRefine == SLU_EXTRA ) {
+	*info = -1;
+	printf("ERROR: Extra precise iterative refinement yet to support.\n");
+    } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
+		|| A->Dtype != SLU_D || A->Mtype != SLU_GE )
+	*info = -2;
+    else if ( ldb < m_loc )
+	*info = -5;
+    else if ( nrhs < 0 )
+	*info = -6;
+    if ( sp_ienv_dist(2) > sp_ienv_dist(3) ) {
+        *info = 1;
+	printf("ERROR: Relaxation (NREL) cannot be larger than max. supernode size (NSUP).\n"
+	"\t-> Check parameter setting in sp_ienv_dist.c to correct error.\n");
+    }
+    if ( *info ) {
+	i = -(*info);
+	pxerr_dist("pdgssvx", grid, -*info);
+	return;
+    }
+
+    factored = (Fact == FACTORED);
+    Equil = (!factored && options->Equil == YES);
+    notran = (options->Trans == NOTRANS);
+    parSymbFact = options->ParSymbFact;
+	
+    iam = grid->iam;
+    job = 5;
+    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
+	rowequ = (ScalePermstruct->DiagScale == ROW) ||
+	         (ScalePermstruct->DiagScale == BOTH);
+	colequ = (ScalePermstruct->DiagScale == COL) ||
+	         (ScalePermstruct->DiagScale == BOTH);
+    } else rowequ = colequ = FALSE;
+
+    /* The following arrays are replicated on all processes. */
+    perm_r = ScalePermstruct->perm_r;
+    perm_c = ScalePermstruct->perm_c;
+    etree = LUstruct->etree;
+    R = ScalePermstruct->R;
+    C = ScalePermstruct->C;
+    /********/
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pdgssvx()");
+#endif
+
+    /* Not factored & ask for equilibration */
+    if ( Equil && Fact != SamePattern_SameRowPerm ) { 
+	/* Allocate storage if not done so before. */
+	switch ( ScalePermstruct->DiagScale ) {
+	    case NOEQUIL:
+		if ( !(R = (double *) doubleMalloc_dist(m)) )
+		    ABORT("Malloc fails for R[].");
+	        if ( !(C = (double *) doubleMalloc_dist(n)) )
+		    ABORT("Malloc fails for C[].");
+		ScalePermstruct->R = R;
+		ScalePermstruct->C = C;
+		break;
+	    case ROW: 
+	        if ( !(C = (double *) doubleMalloc_dist(n)) )
+		    ABORT("Malloc fails for C[].");
+		ScalePermstruct->C = C;
+		break;
+	    case COL: 
+		if ( !(R = (double *) doubleMalloc_dist(m)) )
+		    ABORT("Malloc fails for R[].");
+		ScalePermstruct->R = R;
+		break;
+	}
+    }
+
+    /* ------------------------------------------------------------
+       Diagonal scaling to equilibrate the matrix. (simple scheme)
+       ------------------------------------------------------------*/
+    if ( Equil ) {
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC(iam, "Enter equil");
+#endif
+	t = SuperLU_timer_();
+
+	if ( Fact == SamePattern_SameRowPerm ) {
+	    /* Reuse R and C. */
+	    switch ( ScalePermstruct->DiagScale ) {
+	      case NOEQUIL:
+		break;
+	      case ROW:
+		irow = fst_row;
+		for (j = 0; j < m_loc; ++j) {
+		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
+			a[i] *= R[irow];       /* Scale rows. */
+		    }
+		    ++irow;
+		}
+		break;
+	      case COL:
+		for (j = 0; j < m_loc; ++j)
+		    for (i = rowptr[j]; i < rowptr[j+1]; ++i){
+		        icol = colind[i];
+			a[i] *= C[icol];          /* Scale columns. */
+		    }
+		break;
+	      case BOTH:
+		irow = fst_row;
+		for (j = 0; j < m_loc; ++j) {
+		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
+			icol = colind[i];
+			a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */
+		    }
+		    ++irow;
+		}
+	        break;
+	    }
+	} else { /* Compute R & C from scratch */
+            /* Compute the row and column scalings. */
+	    pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);
+
+	    if ( iinfo > 0 ) {
+		if ( iinfo <= m ) {
+#if ( PRNTlevel>=1 )
+		    fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo);
+#endif
+		} else {
+#if ( PRNTlevel>=1 )
+                    fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n);
+#endif
+                }
+ 	    } else if ( iinfo < 0 ) return;
+
+	    /* Now iinfo == 0 */
+
+            /* Equilibrate matrix A if it is badly-scaled. 
+               A <-- diag(R)*A*diag(C)                     */
+	    pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed);
+
+	    if ( strncmp(equed, "R", 1)==0 ) {
+		  ScalePermstruct->DiagScale = ROW;
+		  rowequ = ROW;
+	    } else if ( strncmp(equed, "C", 1)==0 ) {
+		  ScalePermstruct->DiagScale = COL;
+		  colequ = COL;
+	    } else if ( strncmp(equed, "B", 1)==0 ) {
+		  ScalePermstruct->DiagScale = BOTH;
+		  rowequ = ROW;
+		  colequ = COL;
+	    } else ScalePermstruct->DiagScale = NOEQUIL;
+
+#if ( PRNTlevel>=1 )
+	    if ( !iam ) {
+		printf(".. equilibrated? *equed = %c\n", *equed);
+		/*fflush(stdout);*/
+	    }
+#endif
+	} /* end if Fact ... */
+
+	stat->utime[EQUIL] = SuperLU_timer_() - t;
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC(iam, "Exit equil");
+#endif
+    } /* end if Equil ... LAPACK style, not involving MC64 */
+
+    if ( !factored ) { /* Skip this if already factored. */
+        /*
+         * For serial symbolic factorization, gather A from the distributed
+	 * compressed row format to global A in compressed column format.
+         * Numerical values are gathered only when a row permutation
+         * for large diagonal is sought after.
+         */
+	if ( Fact != SamePattern_SameRowPerm &&
+             (parSymbFact == NO || options->RowPerm != NO) ) {
+             /* Performs serial symbolic factorization and/or MC64 */
+
+            need_value = (options->RowPerm == LargeDiag);
+
+            pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA);
+
+            GAstore = (NCformat *) GA.Store;
+            colptr = GAstore->colptr;
+            rowind = GAstore->rowind;
+            nnz = GAstore->nnz;
+            GA_mem_use = (nnz + n + 1) * sizeof(int_t);
+
+            if ( need_value ) {
+                a_GA = (double *) GAstore->nzval;
+                GA_mem_use += nnz * sizeof(double);
+            } else assert(GAstore->nzval == NULL);
+	}
+
+        /* ------------------------------------------------------------
+           Find the row permutation Pr for A, and apply Pr*[GA].
+	   GA is overwritten by Pr*[GA].
+           ------------------------------------------------------------*/
+        if ( options->RowPerm != NO ) {
+	    t = SuperLU_timer_();
+	    if ( Fact != SamePattern_SameRowPerm ) {
+	        if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
+	            /* Permute the global matrix GA for symbfact() */
+	            for (i = 0; i < colptr[n]; ++i) {
+	            	irow = rowind[i]; 
+		    	rowind[i] = perm_r[irow];
+	            }
+	        } else { /* options->RowPerm == LargeDiag */
+	            /* Get a new perm_r[] */
+	            if ( job == 5 ) {
+		        /* Allocate storage for scaling factors. */
+		        if ( !(R1 = doubleMalloc_dist(m)) )
+		            ABORT("SUPERLU_MALLOC fails for R1[]");
+		    	if ( !(C1 = doubleMalloc_dist(n)) )
+		            ABORT("SUPERLU_MALLOC fails for C1[]");
+	            }
+
+	            if ( !iam ) { /* Process 0 finds a row permutation */
+		        iinfo = dldperm_dist(job, m, nnz, colptr, rowind, a_GA,
+		                perm_r, R1, C1);
+		
+                        MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
+		        if ( iinfo == 0 ) {
+		            MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
+		            if ( job == 5 && Equil ) {
+		                MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
+		                MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
+                            }
+		        }
+	            } else {
+		        MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
+			if ( iinfo == 0 ) {
+		            MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
+		            if ( job == 5 && Equil ) {
+		                MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
+		                MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
+                            }
+		        }
+	            }
+
+	    	    if ( iinfo && job == 5) { /* Error return */
+	                SUPERLU_FREE(R1);
+	        	SUPERLU_FREE(C1);
+   	            }
+#if ( PRNTlevel>=2 )
+	            dmin = dmach_dist("Overflow");
+	            dsum = 0.0;
+	            dprod = 1.0;
+#endif
+	            if ( iinfo == 0 ) {
+	              if ( job == 5 ) {
+		        if ( Equil ) {
+		            for (i = 0; i < n; ++i) {
+			        R1[i] = exp(R1[i]);
+			        C1[i] = exp(C1[i]);
+		            }
+
+		            /* Scale the distributed matrix further.
+			       A <-- diag(R1)*A*diag(C1)            */
+		            irow = fst_row;
+		            for (j = 0; j < m_loc; ++j) {
+			        for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
+			            icol = colind[i];
+			            a[i] *= R1[irow] * C1[icol];
+#if ( PRNTlevel>=2 )
+			            if ( perm_r[irow] == icol ) { /* New diagonal */
+			              if ( job == 2 || job == 3 )
+				        dmin = SUPERLU_MIN(dmin, fabs(a[i]));
+			              else if ( job == 4 )
+				        dsum += fabs(a[i]);
+			              else if ( job == 5 )
+				        dprod *= fabs(a[i]);
+			            }
+#endif
+			        }
+			        ++irow;
+		            }
+
+		            /* Multiply together the scaling factors --
+			       R/C from simple scheme, R1/C1 from MC64. */
+		            if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
+		            else for (i = 0; i < m; ++i) R[i] = R1[i];
+		            if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
+		            else for (i = 0; i < n; ++i) C[i] = C1[i];
+		    
+		            ScalePermstruct->DiagScale = BOTH;
+		            rowequ = colequ = 1;
+
+		        } /* end Equil */
+
+                        /* Now permute global GA to prepare for symbfact() */
+                        for (j = 0; j < n; ++j) {
+		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	                        irow = rowind[i];
+		                rowind[i] = perm_r[irow];
+		            }
+		        }
+		        SUPERLU_FREE (R1);
+		        SUPERLU_FREE (C1);
+	              } else { /* job = 2,3,4 */
+		        for (j = 0; j < n; ++j) {
+		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
+			        irow = rowind[i];
+			        rowind[i] = perm_r[irow];
+		            } /* end for i ... */
+		        } /* end for j ... */
+	              } /* end else job ... */
+                    } else { /* if iinfo != 0 */
+			for (i = 0; i < m; ++i) perm_r[i] = i;
+		    }
+
+#if ( PRNTlevel>=2 )
+	            if ( job == 2 || job == 3 ) {
+		        if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
+	            } else if ( job == 4 ) {
+		        if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
+	            } else if ( job == 5 ) {
+		        if ( !iam ) printf("\t product of diagonal %e\n", dprod);
+	            }
+#endif
+                } /* end if options->RowPerm ... */
+
+	        t = SuperLU_timer_() - t;
+	        stat->utime[ROWPERM] = t;
+#if ( PRNTlevel>=1 )
+                if ( !iam ) printf(".. LDPERM job " IFMT "\t time: %.2f\n",
+	                            job, t);
+#endif
+            } /* end if Fact ... */
+
+        } else { /* options->RowPerm == NOROWPERM / NATURAL */
+            for (i = 0; i < m; ++i) perm_r[i] = i;
+        }
+
+#if ( DEBUGlevel>=2 )
+        if ( !iam ) PrintInt10("perm_r",  m, perm_r);
+#endif
+    } /* end if (!factored) */
+
+    if ( !factored || options->IterRefine ) {
+	/* Compute norm(A), which will be used to adjust small diagonal. */
+	if ( notran ) *(unsigned char *)norm = '1';
+	else *(unsigned char *)norm = 'I';
+	anorm = pdlangs(norm, A, grid);
+#if ( PRNTlevel>=1 )
+	if ( !iam ) printf(".. anorm %e\n", anorm);
+#endif
+    }
+
+    /* ------------------------------------------------------------
+       Perform the LU factorization: symbolic factorization, 
+       redistribution, and numerical factorization.
+       ------------------------------------------------------------*/
+    if ( !factored ) {
+	t = SuperLU_timer_();
+	/*
+	 * Get column permutation vector perm_c[], according to permc_spec:
+	 *   permc_spec = NATURAL:  natural ordering 
+	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
+	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
+	 *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
+	 *   permc_spec = PARMETIS: parallel METIS on structure of A'+A
+	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
+	 */
+	permc_spec = options->ColPerm;
+
+	if ( parSymbFact == YES || permc_spec == PARMETIS ) {
+	    nprocs_num = grid->nprow * grid->npcol;
+  	    noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num ))));
+
+	    /* create a new communicator for the first noDomains
+               processes in grid->comm */
+	    key = iam;
+    	    if (iam < noDomains) col = 0;
+	    else col = MPI_UNDEFINED;
+	    MPI_Comm_split (grid->comm, col, key, &symb_comm );
+
+	    if ( permc_spec == NATURAL || permc_spec == MY_PERMC ) {
+		if ( permc_spec == NATURAL ) {
+		     for (j = 0; j < n; ++j) perm_c[j] = j;
+                }
+		if ( !(sizes = intMalloc_dist(2 * noDomains)) ) 
+		     ABORT("SUPERLU_MALLOC fails for sizes.");
+		if ( !(fstVtxSep = intMalloc_dist(2 * noDomains)) )
+		    ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
+		for (i = 0; i < 2*noDomains - 2; ++i) {
+		    sizes[i] = 0;
+		    fstVtxSep[i] = 0;
+		}
+		sizes[2*noDomains - 2] = m;
+		fstVtxSep[2*noDomains - 2] = 0;
+	    } else if ( permc_spec != PARMETIS ) {   /* same as before */
+		printf("{" IFMT "," IFMT "}: pdgssvx: invalid ColPerm option when ParSymbfact is used\n",
+		       MYROW(grid->iam, grid), MYCOL(grid->iam, grid));
+	    }
+        }
+
+	if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
+          /* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */
+	  if ( permc_spec == PARMETIS ) {
+	      /* Get column permutation vector in perm_c.                    *
+	       * This routine takes as input the distributed input matrix A  *
+	       * and does not modify it.  It also allocates memory for       *
+	       * sizes[] and fstVtxSep[] arrays, that contain information    *
+	       * on the separator tree computed by ParMETIS.                 */
+	      flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num,
+                                  	   noDomains, &sizes, &fstVtxSep,
+                                           grid, &symb_comm);
+	      if (flinfo > 0) {
+#if ( PRNTlevel>=1 )
+	          fprintf(stderr, "Insufficient memory for get_perm_c parmetis\n");
+#endif
+		  *info = flinfo;
+		  return;
+     	      }
+	  } else {
+	      get_perm_c_dist(iam, permc_spec, &GA, perm_c);
+          }
+        }
+
+	stat->utime[COLPERM] = SuperLU_timer_() - t;
+
+	/* Compute the elimination tree of Pc*(A^T+A)*Pc^T or Pc*A^T*A*Pc^T
+	   (a.k.a. column etree), depending on the choice of ColPerm.
+	   Adjust perm_c[] to be consistent with a postorder of etree.
+	   Permute columns of A to form A*Pc'. */
+	if ( Fact != SamePattern_SameRowPerm ) {
+	    if ( parSymbFact == NO ) { /* Perform serial symbolic factorization */
+		/* GA = Pr*A, perm_r[] is already applied. */
+	        int_t *GACcolbeg, *GACcolend, *GACrowind;
+
+		/* After this routine, GAC = GA*Pc^T.  */
+	        sp_colorder(options, &GA, perm_c, etree, &GAC); 
+
+	        /* Form Pc*A*Pc^T to preserve the diagonal of the matrix GAC. */
+	        GACstore = (NCPformat *) GAC.Store;
+	        GACcolbeg = GACstore->colbeg;
+	        GACcolend = GACstore->colend;
+	        GACrowind = GACstore->rowind;
+	        for (j = 0; j < n; ++j) {
+	            for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) {
+		        irow = GACrowind[i];
+		        GACrowind[i] = perm_c[irow];
+	            }
+	        }
+
+	        /* Perform a symbolic factorization on Pc*Pr*A*Pc^T and set up
+                   the nonzero data structures for L & U. */
+#if ( PRNTlevel>=1 ) 
+                if ( !iam )
+		  printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
+		          sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
+#endif
+  	        t = SuperLU_timer_();
+	        if ( !(Glu_freeable = (Glu_freeable_t *)
+		      SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
+		    ABORT("Malloc fails for Glu_freeable.");
+
+	    	/* Every process does this. */
+	    	iinfo = symbfact(options, iam, &GAC, perm_c, etree, 
+			     	 Glu_persist, Glu_freeable);
+
+	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
+	    	if ( iinfo <= 0 ) { /* Successful return */
+		    QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
+#if ( PRNTlevel>=1 )
+		    if ( !iam ) {
+		    	printf("\tNo of supers " IFMT "\n", (long long) Glu_persist->supno[n-1]+1);
+		    	printf("\tSize of G(L) " IFMT "\n", (long long) Glu_freeable->xlsub[n]);
+		    	printf("\tSize of G(U) " IFMT "\n", (long long) Glu_freeable->xusub[n]);
+		    	printf("\tint %d, short %d, float %d, double %d\n", 
+			       (int) sizeof(int_t), (int) sizeof(short),
+        		       (int) sizeof(float), (int) sizeof(double));
+		    	printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n",
+			   	symb_mem_usage.for_lu*1e-6, 
+			   	symb_mem_usage.total*1e-6,
+			   	symb_mem_usage.expansions);
+		    }
+#endif
+	    	} else { /* symbfact out of memory */
+#if ( PRNTlevel>=1 )
+		    if ( !iam )
+		        fprintf(stderr,"symbfact() error returns " IFMT "\n",iinfo);
+#endif
+		    *info = iinfo;
+		    return;
+	        }
+	    } /* end serial symbolic factorization */
+	    else {  /* parallel symbolic factorization */
+	    	t = SuperLU_timer_();
+	    	flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r,
+				       sizes, fstVtxSep, &Pslu_freeable, 
+				       &(grid->comm), &symb_comm,
+				       &symb_mem_usage); 
+	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
+	    	if (flinfo > 0) {
+#if ( PRNTlevel>=1 )
+	      	    fprintf(stderr, "Insufficient memory for parallel symbolic factorization.");
+#endif
+		    *info = flinfo;
+		    return;
+                }
+	    }
+
+            /* Destroy global GA */
+            if ( parSymbFact == NO || options->RowPerm != NO )
+                Destroy_CompCol_Matrix_dist(&GA);
+            if ( parSymbFact == NO )
+ 	        Destroy_CompCol_Permuted_dist(&GAC);
+
+	} /* end if Fact ... */
+
+        if (sizes) SUPERLU_FREE (sizes);
+        if (fstVtxSep) SUPERLU_FREE (fstVtxSep);
+	if (symb_comm != MPI_COMM_NULL)
+	  MPI_Comm_free (&symb_comm); 
+
+	if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) {
+	    /* CASE OF SERIAL SYMBOLIC */
+  	    /* Apply column permutation to the original distributed A */
+	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
+
+	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc^T into L and U storage. 
+	       NOTE: the row permutation Pc*Pr is applied internally in the
+  	       distribution routine. */
+	    t = SuperLU_timer_();
+	    dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct,
+                                      Glu_freeable, LUstruct, grid);
+	    stat->utime[DIST] = SuperLU_timer_() - t;
+
+  	    /* Deallocate storage used in symbolic factorization. */
+	    if ( Fact != SamePattern_SameRowPerm ) {
+	        iinfo = symbfact_SubFree(Glu_freeable);
+	        SUPERLU_FREE(Glu_freeable);
+	    }
+	} else { /* CASE OF PARALLEL SYMBOLIC */
+	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
+	       NOTE: the row permutation Pc*Pr is applied internally in the
+	       distribution routine. */
+	    /* Apply column permutation to the original distributed A */
+	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
+
+    	    t = SuperLU_timer_();
+	    dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct,
+		  			   &Pslu_freeable, LUstruct, grid);
+	    if (dist_mem_use > 0)
+	        ABORT ("Not enough memory available for dist_psymbtonum\n");
+            
+	    stat->utime[DIST] = SuperLU_timer_() - t;
+	}
+
+	/*if (!iam) printf ("\tDISTRIBUTE time  %8.2f\n", stat->utime[DIST]);*/
+
+	/* Perform numerical factorization in parallel. */
+	t = SuperLU_timer_();
+	pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
+	stat->utime[FACT] = SuperLU_timer_() - t;
+
+#if 0
+
+// #ifdef GPU_PROF
+
+//  if(!iam )
+//  {
+//      char* ttemp;
+
+//      ttemp = getenv("IO_FILE");
+//      if(ttemp!=NULL)
+//      {   
+//          printf("File being opend is %s\n",ttemp );
+//          FILE* fp;
+//          fp = fopen(ttemp,"w");
+//          if(!fp)
+//          {
+//              fprintf(stderr," Couldn't open output file %s\n",ttemp);
+//          }
+
+//          int nsup=Glu_persist->supno[n-1]+1;
+//          int ii;
+//          for (ii = 0; ii < nsup; ++ii)
+//          {
+//                  fprintf(fp,"%d,%d,%d,%d,%d,%d\n",gs1.mnk_min_stats[ii],gs1.mnk_min_stats[ii+nsup],
+//                  gs1.mnk_min_stats[ii+2*nsup],
+//                  gs1.mnk_max_stats[ii],gs1.mnk_max_stats[ii+nsup],gs1.mnk_max_stats[ii+2*nsup]);
+//          }
+
+//          // lastly put the timeing stats that we need
+
+//          fprintf(fp,"Min %lf Max %lf totaltime %lf \n",gs1.osDgemmMin,gs1.osDgemmMax,stat->utime[FACT]);
+//          fclose(fp);
+//      }
+
+//  }
+// #endif
+
+#endif
+
+	if ( options->PrintStat ) {
+	    int_t TinyPivots;
+	    float for_lu, total, max, avg, temp;
+
+	    dQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
+
+	    if (parSymbFact == TRUE) {
+	        /* The memory used in the redistribution routine
+		   includes the memory used for storing the symbolic
+  		   structure and the memory allocated for numerical
+		   factorization */
+	        temp = SUPERLU_MAX(symb_mem_usage.total, -dist_mem_use);
+                if ( options->RowPerm != NO )
+                    temp = SUPERLU_MAX(temp, GA_mem_use);
+            } else {
+	        temp = SUPERLU_MAX (
+                         symb_mem_usage.total + GA_mem_use, /* symbfact step */
+		         symb_mem_usage.for_lu + dist_mem_use +
+                             num_mem_usage.for_lu  /* distribution step */
+                       );
+            }
+            
+	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
+
+	    MPI_Reduce( &temp, &max,
+		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+	    MPI_Reduce( &temp, &avg,
+		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
+			  MPI_SUM, grid->comm );
+	    stat->TinyPivots = TinyPivots;
+
+	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
+		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	    MPI_Reduce( &num_mem_usage.total, &total,
+		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+
+            if (!iam) {
+		printf("\n** Memory Usage **********************************\n");
+                printf("** NUMfact space (MB): (sum-of-all-processes)\n"
+		       "    L\\U :        %8.2f |  Total : %8.2f\n",
+		       for_lu * 1e-6, total * 1e-6);
+                printf("** Total highmark (MB):\n"
+		       "    Sum-of-all : %8.2f | Avg : %8.2f  | Max : %8.2f\n",
+		       avg * 1e-6,  
+		       avg / grid->nprow / grid->npcol * 1e-6,
+		       max * 1e-6);
+		printf("**************************************************\n");
+            }
+	} /* end printing stats */
+    
+    } /* end if (!factored) */
+
+    
+    if ( options->Fact == DOFACT || options->Fact == SamePattern ) {
+	/* Need to reset the solve's communication pattern,
+	   because perm_r[] and/or perm_c[] is changed.    */
+	if ( options->SolveInitialized == YES ) { /* Initialized before */
+	    dSolveFinalize(options, SOLVEstruct); /* Clean up structure */
+	    options->SolveInitialized = NO;   /* Reset the solve state */
+	}
+     }
+#if 0
+    /* Need to revisit: Why the following is not good enough for X-to-B
+       distribution -- inv_perm_c changed */
+	pxgstrs_finalize(SOLVEstruct->gstrs_comm);
+	pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, 
+	             LUstruct->Glu_persist, SOLVEstruct);
+#endif
+
+	
+    /* ------------------------------------------------------------
+       Compute the solution matrix X.
+       ------------------------------------------------------------*/
+    if ( nrhs && *info == 0 ) {
+
+	if ( !(b_work = doubleMalloc_dist(n)) )
+	    ABORT("Malloc fails for b_work[]");
+
+	/* ------------------------------------------------------------
+	   Scale the right-hand side if equilibration was performed. 
+	   ------------------------------------------------------------*/
+	if ( notran ) {
+	    if ( rowequ ) {
+		b_col = B;
+		for (j = 0; j < nrhs; ++j) {
+		    irow = fst_row;
+		    for (i = 0; i < m_loc; ++i) {
+		        b_col[i] *= R[irow];
+		        ++irow;
+		    }
+		    b_col += ldb;
+		}
+	    }
+	} else if ( colequ ) {
+	    b_col = B;
+	    for (j = 0; j < nrhs; ++j) {
+	        irow = fst_row;
+		for (i = 0; i < m_loc; ++i) {
+		    b_col[i] *= C[irow];
+		    ++irow;
+		}
+		b_col += ldb;
+	    }
+	}
+
+	/* Save a copy of the right-hand side. */
+	ldx = ldb;
+	if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) )
+	    ABORT("Malloc fails for X[]");
+	x_col = X;  b_col = B;
+	for (j = 0; j < nrhs; ++j) {
+#if 0 /* Sherry */
+	    for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
+#endif
+            memcpy(x_col, b_col, m_loc * sizeof(double));
+	    x_col += ldx;  b_col += ldb;
+	}
+
+	/* ------------------------------------------------------------
+	   Solve the linear system.
+	   ------------------------------------------------------------*/
+	if ( options->SolveInitialized == NO ) { /* First time */
+	    dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid,
+		       SOLVEstruct);
+            /* Inside this routine, SolveInitialized is set to YES.
+	       For repeated calls to pdgssvx(), no need to re-initialize
+	       the Solve data & communication structures, unless a new
+	       factorization with Fact == DOFACT or SamePattern is asked for. */
+	} 
+
+	pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, 
+		fst_row, ldb, nrhs, SOLVEstruct, stat, info);
+
+	/* ------------------------------------------------------------
+	   Use iterative refinement to improve the computed solution and
+	   compute error bounds and backward error estimates for it.
+	   ------------------------------------------------------------*/
+	if ( options->IterRefine ) {
+	    /* Improve the solution by iterative refinement. */
+	    int_t *it;
+            int_t *colind_gsmv = SOLVEstruct->A_colind_gsmv;
+	          /* This was allocated and set to NULL in dSolveInit() */
+	    SOLVEstruct_t *SOLVEstruct1;  /* Used by refinement. */
+
+	    t = SuperLU_timer_();
+	    if ( options->RefineInitialized == NO || Fact == DOFACT ) {
+	        /* All these cases need to re-initialize gsmv structure */
+	        if ( options->RefineInitialized )
+		    pdgsmv_finalize(SOLVEstruct->gsmv_comm);
+	        pdgsmv_init(A, SOLVEstruct->row_to_proc, grid,
+			    SOLVEstruct->gsmv_comm);
+	       
+                /* Save a copy of the transformed local col indices
+		   in colind_gsmv[]. */
+	        if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv);
+	        if ( !(it = intMalloc_dist(nnz_loc)) )
+		    ABORT("Malloc fails for colind_gsmv[]");
+	        colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
+	        for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
+	        options->RefineInitialized = YES;
+	    } else if ( Fact == SamePattern ||
+			Fact == SamePattern_SameRowPerm ) {
+	        double atemp;
+	        int_t k, jcol, p;
+	        /* Swap to beginning the part of A corresponding to the
+		   local part of X, as was done in pdgsmv_init() */
+	        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+		    k = rowptr[i];
+		    for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+		        jcol = colind[j];
+		        p = SOLVEstruct->row_to_proc[jcol];
+		        if ( p == iam ) { /* Local */
+		            atemp = a[k]; a[k] = a[j]; a[j] = atemp;
+		            ++k;
+		        }
+		    }
+	        }
+	      
+	        /* Re-use the local col indices of A obtained from the
+		   previous call to pdgsmv_init() */
+	        for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i];
+	    }
+
+	    if ( nrhs == 1 ) { /* Use the existing solve structure */
+	        SOLVEstruct1 = SOLVEstruct;
+	    } else { /* For nrhs > 1, since refinement is performed for RHS
+			one at a time, the communication structure for pdgstrs
+			is different than the solve with nrhs RHS. 
+			So we use SOLVEstruct1 for the refinement step.
+		     */
+	        if ( !(SOLVEstruct1 = (SOLVEstruct_t *) 
+		                       SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) )
+		    ABORT("Malloc fails for SOLVEstruct1");
+	        /* Copy the same stuff */
+	        SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
+	        SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
+	        SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
+	        SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
+	        SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
+	        SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
+	        SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
+		
+		/* Initialize the *gstrs_comm for 1 RHS. */
+		if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
+		       SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
+		    ABORT("Malloc fails for gstrs_comm[]");
+		pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, 
+			     Glu_persist, SOLVEstruct1);
+	    }
+
+	    pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid,
+		    B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);
+
+            /* Deallocate the storage associated with SOLVEstruct1 */
+	    if ( nrhs > 1 ) {
+	        pxgstrs_finalize(SOLVEstruct1->gstrs_comm);
+	        SUPERLU_FREE(SOLVEstruct1);
+	    }
+
+	    stat->utime[REFINE] = SuperLU_timer_() - t;
+	} /* end if IterRefine */
+
+	/* Permute the solution matrix B <= Pc'*X. */
+	pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc,
+			       SOLVEstruct->inv_perm_c,
+			       X, ldx, B, ldb, nrhs, grid);
+#if ( DEBUGlevel>=2 )
+	printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam);
+	for (i = 0; i < m_loc; ++i)
+	  printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]);
+#endif
+	
+	/* Transform the solution matrix X to a solution of the original
+	   system before equilibration. */
+	if ( notran ) {
+	    if ( colequ ) {
+		b_col = B;
+		for (j = 0; j < nrhs; ++j) {
+		    irow = fst_row;
+		    for (i = 0; i < m_loc; ++i) {
+		        b_col[i] *= C[irow];
+		        ++irow;
+		    }
+		    b_col += ldb;
+		}
+	    }
+	} else if ( rowequ ) {
+	    b_col = B;
+	    for (j = 0; j < nrhs; ++j) {
+	        irow = fst_row;
+		for (i = 0; i < m_loc; ++i) {
+		    b_col[i] *= R[irow];
+		    ++irow;
+		}
+		b_col += ldb;
+	    }
+	}
+
+	SUPERLU_FREE(b_work);
+	SUPERLU_FREE(X);
+
+    } /* end if nrhs != 0 && *info == 0 */
+
+#if ( PRNTlevel>=1 )
+    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
+#endif
+
+    /* Deallocate R and/or C if it was not used. */
+    if ( Equil && Fact != SamePattern_SameRowPerm ) {
+	switch ( ScalePermstruct->DiagScale ) {
+	    case NOEQUIL:
+	        SUPERLU_FREE(R);
+		SUPERLU_FREE(C);
+		break;
+	    case ROW: 
+		SUPERLU_FREE(C);
+		break;
+	    case COL: 
+		SUPERLU_FREE(R);
+		break;
+	}
+    }
+
+#if 0
+    if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
+ 	Destroy_CompCol_Permuted_dist(&GAC);
+#endif
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgssvx()");
+#endif
+
+}
diff --git a/SRC/pdgssvx_ABglobal.c b/SRC/pdgssvx_ABglobal.c
new file mode 100644
index 0000000..48b8ad8
--- /dev/null
+++ b/SRC/pdgssvx_ABglobal.c
@@ -0,0 +1,1105 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Solves a system of linear equations A*X=B,
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Last modified:
+ * December 31, 2015   version 4.3
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * pdgssvx_ABglobal solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ *
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right hand sides, and its dimensions ldb and nrhs
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using 
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has several options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously 
+ *      solved problem to save time by reusing part or all of 
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *      -  A, the input matrix
+ *
+ *      as well as the following options, which are described in more 
+ *      detail below:
+ *
+ *      -  options->Equil,   to specify how to scale the rows and columns
+ *                           of A to "equilibrate" it (to try to reduce its
+ *                           condition number and so improve the
+ *                           accuracy of the computed solution)
+ *
+ *      -  options->RowPerm, to specify how to permute the rows of A
+ *                           (typically to control numerical stability)
+ *
+ *      -  options->ColPerm, to specify how to permute the columns of A
+ *                           (typically to control fill-in and enhance
+ *                           parallelism during factorization)
+ *
+ *      -  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                           pivots encountered during factorization
+ *                           (to control numerical stability)
+ *
+ *      The outputs returned include
+ *         
+ *      -  ScalePermstruct,  modified to describe how the input matrix A
+ *                           was equilibrated and permuted:
+ *         -  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                        columns of A were scaled
+ *         -  ScalePermstruct->R, array of row scale factors
+ *         -  ScalePermstruct->C, array of column scale factors
+ *         -  ScalePermstruct->perm_r, row permutation vector
+ *         -  ScalePermstruct->perm_c, column permutation vector
+ *
+ *            (part of ScalePermstruct may also need to be supplied on input,
+ *             depending on options->RowPerm and options->ColPerm as described 
+ *             later).
+ *
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *                Pc*Pr*diag(R)*A*diag(C)
+ *             where 
+ *                Pr and Pc are row and column permutation matrices determined
+ *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c, 
+ *                  respectively, and 
+ *                diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
+ *                  ScalePermstruct->C
+ *
+ *      -  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *              (Note that A1 = Aout * Pc^T, where Aout is the matrix stored
+ *               in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *     
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In this
+ *            case the algorithm saves time by reusing the previously computed
+ *            column permutation vector stored in ScalePermstruct->perm_c
+ *            and the "elimination tree" of A stored in LUstruct->etree.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *      -  options->Equil
+ *      -  options->RowPerm
+ *      -  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply 
+ *
+ *      -  A, the input matrix
+ *      -  ScalePermstruct->perm_c, the column permutation
+ *      -  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *         
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *            as described above
+ *      -  ScalePermstruct,  modified to describe how the input matrix A was
+ *                           equilibrated and row permuted
+ *      -  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *      -  options->Equil
+ *      -  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are ignored.
+ *      This is because the permutations from ScalePermstruct->perm_r and
+ *      ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply 
+ *
+ *      -  A, the input matrix
+ *      -  ScalePermstruct->DiagScale, how the previous matrix was row and/or
+ *                                     column scaled
+ *      -  ScalePermstruct->R, the row scalings of the previous matrix, if any
+ *      -  ScalePermstruct->C, the column scalings of the previous matrix, 
+ *                             if any
+ *      -  ScalePermstruct->perm_r, the row permutation of the previous matrix
+ *      -  ScalePermstruct->perm_c, the column permutation of the previous 
+ *                                  matrix
+ *      -  all of LUstruct, the previously computed information about L and U
+ *                (the actual numerical values of L and U stored in
+ *                 LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *         
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *            as described above
+ *      -  ScalePermstruct,  modified to describe how the input matrix A was
+ *                           equilibrated 
+ *                  (thus ScalePermstruct->DiagScale, R and C may be modified)
+ *      -  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous 
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm, 
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply 
+ *
+ *      -  A, the unfactored matrix, only in the case that iterative refinement
+ *            is to be done (specifically A must be the output A from 
+ *            the previous call, so that it has been scaled and permuted)
+ *      -  all of ScalePermstruct
+ *      -  all of LUstruct, including the actual numerical values of L and U
+ *
+ *      all of which are unmodified on output.
+ *         
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *         
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or 
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector 
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or 
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (Glu_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C,
+ *             both row and column permutation vectors perm_r and perm_c,
+ *             and the distributed data structure set up from the previous
+ *             symbolic factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or 
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *           
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *         
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input/output) SuperMatrix*
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *         The number of linear equations is A->nrow. The type of A must be:
+ *         Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. That is, A is stored in
+ *         compressed column format (also known as Harwell-Boeing format).
+ *         See supermatrix.h for the definition of 'SuperMatrix'.
+ *         This routine only handles square A, however, the LU factorization
+ *         routine pdgstrf can factorize rectangular matrices.
+ *         On exit, A may be overwritten by Pc*Pr*diag(R)*A*diag(C),
+ *         depending on ScalePermstruct->DiagScale, options->RowPerm and
+ *         options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->RowPerm != NATURAL, A is further overwritten by
+ *                Pr*diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                Pc*Pr*diag(R)*A*diag(C).
+ *         If all the above conditions are true, the LU decomposition is
+ *         performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ *         NOTE: Currently, A must reside in all processes when calling
+ *               this routine.
+ *
+ * ScalePermstruct (input/output) ScalePermstruct_t*
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was 
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the 
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (double*) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double*) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by 
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *         
+ * B       (input/output) double*
+ *         On entry, the right-hand side matrix of dimension (A->nrow, nrhs).
+ *         On exit, the solution matrix if info = 0.
+ *
+ *         NOTE: Currently, B must reside in all processes when calling
+ *               this routine.
+ *
+ * ldb     (input) int (global)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t*
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (LocalLU_t*)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
+ *
+ * berr    (output) double*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution   
+ *         vector X(j) (i.e., the smallest relative change in   
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
+ * </pre>
+ */
+void
+pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, 
+		 ScalePermstruct_t *ScalePermstruct,
+		 double B[], int ldb, int nrhs, gridinfo_t *grid,
+		 LUstruct_t *LUstruct, double *berr,
+		 SuperLUStat_t *stat, int *info)
+{
+    SuperMatrix AC;
+    NCformat *Astore;
+    NCPformat *ACstore;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    Glu_freeable_t *Glu_freeable;
+            /* The nonzero structures of L and U factors, which are
+	       replicated on all processes.
+	           (lsub, xlsub) contains the compressed subscript of
+		                 supernodes in L.
+          	   (usub, xusub) contains the compressed subscript of
+		                 nonzero segments in U.
+	      If options->Fact != SamePattern_SameRowPerm, they are 
+	      computed by SYMBFACT routine, and then used by DDISTRIBUTE
+	      routine. They will be freed after DDISTRIBUTE routine.
+	      If options->Fact == SamePattern_SameRowPerm, these
+	      structures are not used.                                  */
+    fact_t   Fact;
+    double   *a;
+    int_t    *perm_r; /* row permutation vector (dldperm_dist or user) */
+    int_t    *perm_c; /* column permutation vector */
+    int_t    *etree;  /* elimination tree */
+    int_t    *colptr, *rowind;
+    int_t    Equil, factored, job, notran, colequ, rowequ;
+    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
+    int      iam;
+    int      ldx;  /* LDA for matrix X (global). */
+    char     equed[1] = {'N'}, norm[1]; /* 'N' until dlaqgs_dist() sets it */
+    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
+    double   *X, *b_col, *b_work, *x_col;
+    double   t;
+    static superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage;
+#if ( PRNTlevel>= 2 )
+    double   dmin, dsum, dprod;
+#endif
+
+    /* Test input parameters. */
+    *info = 0;
+    Fact = options->Fact;
+    if ( Fact < 0 || Fact > FACTORED )
+	*info = -1;
+    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
+	*info = -1;
+    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
+	*info = -1;
+    else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA )
+	*info = -1;
+    else if ( options->IterRefine == SLU_EXTRA ) {
+	*info = -1;
+	fprintf(stderr, "Extra precise iterative refinement yet to support.\n");
+    } else if ( A->nrow != A->ncol || A->nrow < 0 ||
+         A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE )
+	*info = -2;
+    else if ( ldb < A->nrow )
+	*info = -5;
+    else if ( nrhs < 0 )
+	*info = -6;
+    if ( *info ) {
+	pxerr_dist("pdgssvx_ABglobal", grid, -*info);
+	return;
+    }
+
+    /* Initialization */
+    factored = (Fact == FACTORED);
+    Equil = (!factored && options->Equil == YES);
+    notran = (options->Trans == NOTRANS);
+    iam = grid->iam;
+    job = 5; /* dldperm_dist job; 5 also yields scaling vectors R1, C1 */
+    m = A->nrow;
+    n = A->ncol;
+    Astore = A->Store;
+    nnz = Astore->nnz;
+    a = Astore->nzval;
+    colptr = Astore->colptr;
+    rowind = Astore->rowind;
+    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
+	rowequ = (ScalePermstruct->DiagScale == ROW) ||
+	         (ScalePermstruct->DiagScale == BOTH);
+	colequ = (ScalePermstruct->DiagScale == COL) ||
+	         (ScalePermstruct->DiagScale == BOTH);
+    } else rowequ = colequ = FALSE;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pdgssvx_ABglobal()");
+#endif
+
+    perm_r = ScalePermstruct->perm_r;
+    perm_c = ScalePermstruct->perm_c;
+    etree = LUstruct->etree;
+    R = ScalePermstruct->R;
+    C = ScalePermstruct->C;
+    if ( Equil && Fact != SamePattern_SameRowPerm ) {
+	/* Allocate storage if not done so before. */
+	switch ( ScalePermstruct->DiagScale ) {
+	    case NOEQUIL:
+		if ( !(R = (double *) doubleMalloc_dist(m)) )
+		    ABORT("Malloc fails for R[].");
+	        if ( !(C = (double *) doubleMalloc_dist(n)) )
+		    ABORT("Malloc fails for C[].");
+		ScalePermstruct->R = R;
+		ScalePermstruct->C = C;
+		break;
+	    case ROW: 
+	        if ( !(C = (double *) doubleMalloc_dist(n)) )
+		    ABORT("Malloc fails for C[].");
+		ScalePermstruct->C = C;
+		break;
+	    case COL: 
+		if ( !(R = (double *) doubleMalloc_dist(m)) )
+		    ABORT("Malloc fails for R[].");
+		ScalePermstruct->R = R;
+		break;
+	}
+    }
+
+    /* ------------------------------------------------------------
+       Diagonal scaling to equilibrate the matrix.
+       ------------------------------------------------------------*/
+    if ( Equil ) {
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC(iam, "Enter equil");
+#endif
+	t = SuperLU_timer_();
+
+	if ( Fact == SamePattern_SameRowPerm ) {
+	    /* Reuse R and C. */
+	    switch ( ScalePermstruct->DiagScale ) {
+	      case NOEQUIL:
+		break;
+	      case ROW:
+		for (j = 0; j < n; ++j) {
+		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
+			irow = rowind[i];
+			a[i] *= R[irow];       /* Scale rows. */
+		    }
+		}
+		break;
+	      case COL:
+		for (j = 0; j < n; ++j)
+		    for (i = colptr[j]; i < colptr[j+1]; ++i)
+			a[i] *= C[j];          /* Scale columns. */
+		break;
+	      case BOTH: 
+		for (j = 0; j < n; ++j) {
+		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
+			irow = rowind[i];
+			a[i] *= R[irow] * C[j]; /* Scale rows and columns. */
+		    }
+		}
+	        break;
+	    }
+	} else {
+	    if ( !iam ) {
+		/* Compute row and column scalings to equilibrate matrix A. */
+		dgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo);
+	    
+		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
+		if ( iinfo == 0 ) {
+		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( C,       n, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
+		} else {
+		    if ( iinfo > 0 ) {
+			if ( iinfo <= m ) {
+#if ( PRNTlevel>=1 )
+			    fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", 
+				    iinfo);
+#endif
+			} else {
+#if ( PRNTlevel>=1 )
+                            fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", 
+				     iinfo-n);
+#endif
+                        }
+		    }
+		}
+	    } else {
+		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
+		if ( iinfo == 0 ) {
+		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( C,       n, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
+		} 
+	    }
+	
+            if ( iinfo == 0 ) {
+	        /* Equilibrate matrix A. */
+	        dlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed);
+	        if ( strncmp(equed, "R", 1)==0 ) {
+		    ScalePermstruct->DiagScale = ROW;
+		    rowequ = ROW;
+	        } else if ( strncmp(equed, "C", 1)==0 ) {
+		    ScalePermstruct->DiagScale = COL;
+		    colequ = COL;
+	        } else if ( strncmp(equed, "B", 1)==0 ) {
+		    ScalePermstruct->DiagScale = BOTH;
+		    rowequ = ROW;
+		    colequ = COL;
+	        } else ScalePermstruct->DiagScale = NOEQUIL;
+            }
+
+#if ( PRNTlevel>=1 )
+	    if ( !iam ) {
+		printf(".. equilibrated? *equed = %c\n", *equed);
+	    }
+#endif
+	} /* if Fact ... */
+
+	stat->utime[EQUIL] = SuperLU_timer_() - t;
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC(iam, "Exit equil");
+#endif
+    } /* end if Equil ... */
+    
+    /* ------------------------------------------------------------
+       Permute rows of A. 
+       ------------------------------------------------------------*/
+    if ( options->RowPerm != NO ) {
+	t = SuperLU_timer_();
+
+	if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */
+	    || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */
+	    for (i = 0; i < colptr[n]; ++i) {
+		    irow = rowind[i]; 
+		    rowind[i] = perm_r[irow];
+	    }
+	} else if ( !factored ) {
+	    if ( job == 5 ) {
+		/* Allocate storage for scaling factors. */
+		if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) 
+		    ABORT("SUPERLU_MALLOC fails for R1[]");
+		if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) )
+		    ABORT("SUPERLU_MALLOC fails for C1[]");
+	    }
+
+	    if ( !iam ) {
+		/* Process 0 finds a row permutation for large diagonal. */
+		iinfo = dldperm_dist(job, m, nnz, colptr, rowind, a,
+                                perm_r, R1, C1);
+
+                MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );		
+		if ( iinfo == 0 ) {
+		    MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
+		    if ( job == 5 && Equil ) {
+		       MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
+		       MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
+		   }
+		}
+	    } else {
+		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
+		if ( iinfo == 0 ) {
+		   MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
+		   if ( job == 5 && Equil ) {
+		      MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
+		      MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
+		   }
+		}
+	    }
+
+	    if ( iinfo && job == 5) {
+	        SUPERLU_FREE(R1);
+	        SUPERLU_FREE(C1);
+   	    }
+
+#if ( PRNTlevel>=2 )
+	    dmin = dmach_dist("Overflow");
+	    dsum = 0.0;
+	    dprod = 1.0;
+#endif
+	    if ( iinfo == 0 ) {
+	      if ( job == 5 ) {
+		if ( Equil ) {
+		    for (i = 0; i < n; ++i) {
+			R1[i] = exp(R1[i]);
+			C1[i] = exp(C1[i]);
+		    }
+		    for (j = 0; j < n; ++j) {
+			for (i = colptr[j]; i < colptr[j+1]; ++i) {
+			    irow = rowind[i];
+			    a[i] *= R1[irow] * C1[j]; /* Scale the matrix. */
+			    rowind[i] = perm_r[irow];
+#if ( PRNTlevel>=2 )
+			    if ( rowind[i] == j ) /* New diagonal */
+				dprod *= fabs(a[i]);
+#endif
+			}
+		    }
+
+		    /* Multiply together the scaling factors. */
+		    if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
+		    else for (i = 0; i < m; ++i) R[i] = R1[i];
+		    if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
+		    else for (i = 0; i < n; ++i) C[i] = C1[i];
+		    
+		    ScalePermstruct->DiagScale = BOTH;
+		    rowequ = colequ = 1;
+		} else { /* No equilibration. */
+		    for (i = colptr[0]; i < colptr[n]; ++i) {
+		        irow = rowind[i];
+			rowind[i] = perm_r[irow];
+         	    }
+		}
+		SUPERLU_FREE (R1);
+		SUPERLU_FREE (C1);
+	      } else { /* job = 2,3,4 */
+		for (j = 0; j < n; ++j) {
+		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
+			irow = rowind[i];
+			rowind[i] = perm_r[irow];
+#if ( PRNTlevel>=2 )
+			if ( rowind[i] == j ) { /* New diagonal */
+			    if ( job == 2 || job == 3 )
+				dmin = SUPERLU_MIN(dmin, fabs(a[i]));
+			    else if ( job == 4 )
+				dsum += fabs(a[i]);
+			    else if ( job == 5 )
+				dprod *= fabs(a[i]);
+			}
+#endif
+		    } /* end for i ... */
+		} /* end for j ... */
+              } /* end else */
+            } else { /* if iinfo != 0 */
+		for (i = 0; i < m; ++i) perm_r[i] = i;
+	    }
+
+#if ( PRNTlevel>=2 )
+	    if ( job == 2 || job == 3 ) {
+		if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
+	    } else if ( job == 4 ) {
+		if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
+	    } else if ( job == 5 ) {
+		if ( !iam ) printf("\t product of diagonal %e\n", dprod);
+	    }
+#endif
+        } /* else !factored */
+
+	t = SuperLU_timer_() - t;
+	stat->utime[ROWPERM] = t;
+#if ( PRNTlevel>=1 )
+	if ( !iam ) printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t);
+#endif
+    
+    } else { /* options->RowPerm == NOROWPERM */
+        for (i = 0; i < m; ++i) perm_r[i] = i;
+    }
+
+    if ( !factored || options->IterRefine ) {
+	/* Compute norm(A), which will be used to adjust small diagonal. */
+	if ( notran ) *(unsigned char *)norm = '1';
+	else *(unsigned char *)norm = 'I';
+	anorm = dlangs_dist(norm, A);
+#if ( PRNTlevel>=1 )
+	if ( !iam ) printf(".. anorm %e\n", anorm);
+#endif
+    }
+
+    /* ------------------------------------------------------------
+       Perform the LU factorization.
+       ------------------------------------------------------------*/
+    if ( !factored ) {
+	t = SuperLU_timer_();
+	/*
+	 * Get column permutation vector perm_c[], according to permc_spec:
+	 *   permc_spec = NATURAL:  natural ordering 
+	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
+	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
+	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
+	 */
+	permc_spec = options->ColPerm;
+	if ( permc_spec != MY_PERMC && Fact == DOFACT )
+	    /* Use an ordering provided by SuperLU */
+	    get_perm_c_dist(iam, permc_spec, A, perm_c);
+
+	/* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
+	   (a.k.a. column etree), depending on the choice of ColPerm.
+	   Adjust perm_c[] to be consistent with a postorder of etree.
+	   Permute columns of A to form A*Pc'. */
+	sp_colorder(options, A, perm_c, etree, &AC);
+
+	/* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */
+	ACstore = AC.Store;
+	for (j = 0; j < n; ++j) 
+	    for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) {
+		irow = ACstore->rowind[i];
+		ACstore->rowind[i] = perm_c[irow];
+	    }
+	stat->utime[COLPERM] = SuperLU_timer_() - t;
+
+	/* Perform a symbolic factorization on matrix A and set up the
+	   nonzero data structures which are suitable for supernodal GENP. */
+	if ( Fact != SamePattern_SameRowPerm ) {
+#if ( PRNTlevel>=1 ) 
+	    if ( !iam ) 
+		printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
+		       sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
+#endif
+	    t = SuperLU_timer_();
+	    if ( !(Glu_freeable = (Glu_freeable_t *)
+		   SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
+		ABORT("Malloc fails for Glu_freeable.");
+
+	    iinfo = symbfact(options, iam, &AC, perm_c, etree, 
+			     Glu_persist, Glu_freeable);
+
+	    stat->utime[SYMBFAC] = SuperLU_timer_() - t;
+
+	    if ( iinfo <= 0 ) {
+		QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
+#if ( PRNTlevel>=1 ) 
+		if ( !iam ) {
+		    printf("\tNo of supers %lld\n", (long long)Glu_persist->supno[n-1]+1);
+		    printf("\tSize of G(L) %lld\n", (long long)Glu_freeable->xlsub[n]);
+		    printf("\tSize of G(U) %lld\n", (long long)Glu_freeable->xusub[n]);
+		    printf("\tint %d, short %d, float %d, double %d\n", 
+			   (int) sizeof(int_t), (int) sizeof(short), 
+ 			   (int) sizeof(float), (int) sizeof(double));
+		    printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n",
+			   symb_mem_usage.for_lu*1e-6, 
+			   symb_mem_usage.total*1e-6,
+			   symb_mem_usage.expansions);
+		}
+#endif
+	    } else { /* symbfact out of memory */
+#if ( PRNTlevel>=1 )
+		if ( !iam )
+		    fprintf(stderr, "symbfact() error returns " IFMT "\n", iinfo);
+#endif
+		/* Release what was allocated above so the early return does
+		   not leak; arrays managed inside symbfact() are not touched. */
+		SUPERLU_FREE(Glu_freeable);
+		Destroy_CompCol_Permuted_dist(&AC);
+                *info = iinfo;  
+                return;
+	    }
+	}
+
+	/* Distribute the L and U factors onto the process grid. */
+	t = SuperLU_timer_();
+	dist_mem_use = ddistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid);
+	stat->utime[DIST] = SuperLU_timer_() - t;
+
+	/* Deallocate storage used in symbolic factor. */
+	if ( Fact != SamePattern_SameRowPerm ) {
+	    iinfo = symbfact_SubFree(Glu_freeable);
+	    SUPERLU_FREE(Glu_freeable);
+	}
+
+	/* Perform numerical factorization in parallel. */
+	t = SuperLU_timer_();
+	pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
+	stat->utime[FACT] = SuperLU_timer_() - t;
+
+#if ( PRNTlevel>=1 )
+	{
+	    int_t TinyPivots;
+	    float for_lu, total, max, avg, temp;
+	    dQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
+	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
+		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	    MPI_Reduce( &num_mem_usage.total, &total,
+		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	    temp = SUPERLU_MAX(symb_mem_usage.total,
+			       symb_mem_usage.for_lu +
+			       (float)dist_mem_use + num_mem_usage.for_lu);
+	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
+	    MPI_Reduce( &temp, &max,
+		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+	    MPI_Reduce( &temp, &avg,
+		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
+			  MPI_SUM, grid->comm );
+	    stat->TinyPivots = TinyPivots;
+	    if ( !iam ) {
+		printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
+		       for_lu*1e-6, total*1e-6);
+		printf("\tAll space (MB):"
+		       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
+		       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
+		printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
+		printf(".. pdgstrf INFO = %d\n", *info);
+	    }
+	}
+#endif
+    
+    } else if ( options->IterRefine ) { /* options->Fact==FACTORED */
+	/* Permute columns of A to form A*Pc' using the existing perm_c.
+	 * NOTE: rows of A were previously permuted to Pc*A.
+	 */
+	sp_colorder(options, A, perm_c, NULL, &AC);
+    } /* if !factored ... */
+	
+    /* ------------------------------------------------------------
+       Compute the solution matrix X.
+       ------------------------------------------------------------*/
+    if ( nrhs && *info == 0 ) {
+
+	if ( !(b_work = doubleMalloc_dist(n)) )
+	    ABORT("Malloc fails for b_work[]");
+
+	/* ------------------------------------------------------------
+	   Scale the right-hand side if equilibration was performed. 
+	   ------------------------------------------------------------*/
+	if ( notran ) {
+	    if ( rowequ ) {
+		b_col = B;
+		for (j = 0; j < nrhs; ++j) {
+		    for (i = 0; i < m; ++i) b_col[i] *= R[i];
+		    b_col += ldb;
+		}
+	    }
+	} else if ( colequ ) {
+	    b_col = B;
+	    for (j = 0; j < nrhs; ++j) {
+		for (i = 0; i < m; ++i) b_col[i] *= C[i];
+		b_col += ldb;
+	    }
+	}
+
+	/* ------------------------------------------------------------
+	   Permute the right-hand side to form Pr*B.
+	   ------------------------------------------------------------*/
+	if ( options->RowPerm != NO ) {
+	    if ( notran ) {
+		b_col = B;
+		for (j = 0; j < nrhs; ++j) {
+		    for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i];
+		    for (i = 0; i < m; ++i) b_col[i] = b_work[i];
+		    b_col += ldb;
+		}
+	    }
+	}
+
+	/* ------------------------------------------------------------
+	   Permute the right-hand side to form Pc*B.
+	   ------------------------------------------------------------*/
+	if ( notran ) {
+	    b_col = B;
+	    for (j = 0; j < nrhs; ++j) {
+		for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i];
+		for (i = 0; i < m; ++i) b_col[i] = b_work[i];
+		b_col += ldb;
+	    }
+	}
+
+	/* Save a copy of the right-hand side. */
+	ldx = ldb;
+	if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) )
+	    ABORT("Malloc fails for X[]");
+	x_col = X;  b_col = B;
+	for (j = 0; j < nrhs; ++j) {
+	    for (i = 0; i < ldb; ++i) x_col[i] = b_col[i];
+	    x_col += ldx;  b_col += ldb;
+	}
+
+	/* ------------------------------------------------------------
+	   Solve the linear system.
+	   ------------------------------------------------------------*/
+	pdgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info);
+
+	/* ------------------------------------------------------------
+	   Use iterative refinement to improve the computed solution and
+	   compute error bounds and backward error estimates for it.
+	   ------------------------------------------------------------*/
+	if ( options->IterRefine ) {
+	    /* Improve the solution by iterative refinement. */
+	    t = SuperLU_timer_();
+	    pdgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb,
+			      X, ldx, nrhs, berr, stat, info);
+	    stat->utime[REFINE] = SuperLU_timer_() - t;
+	}
+
+	/* Permute the solution matrix X <= Pc'*X. */
+	for (j = 0; j < nrhs; j++) {
+	    b_col = &B[j*ldb];
+	    x_col = &X[j*ldx];
+	    for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]];
+	}
+	
+	/* Transform the solution matrix X to a solution of the original system
+	   before the equilibration. */
+	if ( notran ) {
+	    if ( colequ ) {
+		b_col = B;
+		for (j = 0; j < nrhs; ++j) {
+		    for (i = 0; i < n; ++i) b_col[i] *= C[i];
+		    b_col += ldb;
+		}
+	    }
+	} else if ( rowequ ) {
+	    b_col = B;
+	    for (j = 0; j < nrhs; ++j) {
+		for (i = 0; i < n; ++i) b_col[i] *= R[i];
+		b_col += ldb;
+	    }
+	}
+
+	SUPERLU_FREE(b_work);
+	SUPERLU_FREE(X);
+
+    } /* end if nrhs != 0 */
+
+#if ( PRNTlevel>=1 )
+    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
+#endif
+
+    /* Deallocate R and/or C if it is not used. */
+    if ( Equil && Fact != SamePattern_SameRowPerm ) {
+	switch ( ScalePermstruct->DiagScale ) {
+	    case NOEQUIL:
+	        SUPERLU_FREE(R);
+		SUPERLU_FREE(C);
+		break;
+	    case ROW: 
+		SUPERLU_FREE(C);
+		break;
+	    case COL: 
+		SUPERLU_FREE(R);
+		break;
+	}
+    }
+    if ( !factored || (factored && options->IterRefine) )
+	Destroy_CompCol_Permuted_dist(&AC);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgssvx_ABglobal()");
+#endif
+}
+
diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c
new file mode 100644
index 0000000..00aaeba
--- /dev/null
+++ b/SRC/pdgstrf.c
@@ -0,0 +1,1820 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Performs LU factorization in parallel
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *     September 1, 1999
+ *     February 7, 2001  use MPI_Isend/MPI_Irecv
+ *     October 15, 2008  latency-reducing panel factorization
+ *     July    12, 2011  static scheduling and arbitrary look-ahead
+ *     March   13, 2013  change NTAGS to MPI_TAG_UB value
+ *     September 24, 2015 replace xLAMCH by xMACH, using C99 standard.
+ *     December 31, 2015 rename xMACH to xMACH_DIST
+ *
+ * Sketch of the algorithm 
+ *
+ * ======================= 
+ *    
+ * The following relations hold:
+ *     * A_kk = L_kk * U_kk
+ *     * L_ik = A_ik * U_kk^(-1)
+ *     * U_kj = L_kk^(-1) * A_kj
+ *
+ *              ----------------------------------
+ *              |   |                            |
+ *              ----|-----------------------------
+ *              |   | \ U_kk|                    |
+ *              |   |   \   |        U_kj        |
+ *              |   |L_kk \ |         ||         |
+ *              ----|-------|---------||----------
+ *              |   |       |         \/         |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   | L_ik ==>       A_ij        |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              ----------------------------------
+ *
+ * Handle the first block of columns separately.
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity. ( pdgstrf2(0), one column at a time )
+ *     * Compute block row of U
+ *     * Update trailing matrix
+ *
+ * Loop over the remaining blocks of columns.
+ *   mycol = MYCOL( iam, grid );
+ *   myrow = MYROW( iam, grid );
+ *   N = nsupers;
+ *   For (k = 1; k < N; ++k) {
+ *       krow = PROW( k, grid );
+ *       kcol = PCOL( k, grid );
+ *       Pkk = PNUM( krow, kcol, grid );
+ *
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity.
+ *       if ( mycol == kcol ) {
+ *           pdgstrf2(k), one column at a time
+ *       }
+ *
+ *     * Parallel triangular solve
+ *       if ( iam == Pkk ) multicast L_k,k to this process row;
+ *       if ( myrow == krow && mycol != kcol ) {
+ *          Recv L_k,k from process Pkk;
+ *          for (j = k+1; j < N; ++j)
+ *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
+ *                 U_k,j = L_k,k \ A_k,j;
+ *       }
+ *
+ *     * Parallel rank-k update
+ *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
+ *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
+ *       if ( myrow != krow ) {
+ *          Pkj = PNUM( krow, mycol, grid );
+ *          Recv U_k,k+1:N from process Pkj;
+ *       }
+ *       if ( mycol != kcol ) {
+ *          Pik = PNUM( myrow, kcol, grid );
+ *          Recv L_k+1:N,k from process Pik;
+ *       }
+ *       for (j = k+1; j < N; ++j) {
+ *          for (i = k+1; i < N; ++i)
+ *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+ *                   && L_i,k != 0 && U_k,j != 0 )
+ *                 A_i,j = A_i,j - L_i,k * U_k,j;
+ *       }
+ *  }
+ *
+ * </pre>
+ */
+
+#include <math.h>
+/*#include "mkl.h"*/
+#include "superlu_ddefs.h"
+
+#ifdef GPU_ACC
+#include "cublas_utils.h"
+/*#include "cublas_dgemm.h"*/
+// #define NUM_CUDA_STREAMS 16
+// #define NUM_CUDA_STREAMS 16
+#endif 
+
+/* Various definitions     */
+/* 
+    Name    : SUPERNODE_PROFILE  
+    Purpose : For SuperNode Level profiling of various measurements such as gigaflop/sec
+    obtained, bandwidth achieved.
+    Overhead : Low 
+*/
+// #define SUPERNODE_PROFILE   
+
+/* 
+    Name    :   BASELINE
+    Purpose : baseline to compare performance against
+    Overhead : NA : this won't be used for running experiments
+*/
+// #define BASELINE
+
+/* 
+    Name    :   PHI_FRAMEWORK
+    Purpose : To simulate and test algorithm used for offloading Phi
+    Overhead : NA : this won't be used for running experiments
+*/
+#define PHI_FRAMEWORK
+
+#define PDGSTRF2 pdgstrf2_trsm
+#define PDGSTRS2 pdgstrs2_omp
+
+extern void PDGSTRF2 (superlu_dist_options_t *, int_t, int_t, double,
+                        Glu_persist_t *, gridinfo_t *, LocalLU_t *,
+                        MPI_Request *, int, SuperLUStat_t *, int *);
+#ifdef _CRAY
+extern void PDGSTRS2 (int_t, int_t, Glu_persist_t *, gridinfo_t *,
+                      LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);
+#else
+extern void PDGSTRS2 (int_t, int_t, Glu_persist_t *, gridinfo_t *,
+                      LocalLU_t *, SuperLUStat_t *);
+#endif
+
+#ifdef ISORT
+extern void isort (int_t N, int_t * ARRAY1, int_t * ARRAY2);
+extern void isort1 (int_t N, int_t * ARRAY);
+
+#else
+
+int
+superlu_sort_perm (const void *arg1, const void *arg2)
+{   /* qsort() comparator: ascending by the int_t value each argument points to. */
+    const int_t *val1 = (const int_t *) arg1;
+    const int_t *val2 = (const int_t *) arg2;
+    return (*val1 > *val2) - (*val1 < *val2);  /* <0, 0, >0 as qsort() requires */
+}
+#endif
+
+
+/************************************************************************/
+
+#include "dscatter.c"
+
+/************************************************************************/
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PDGSTRF performs the LU factorization in parallel.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *         xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) LocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * </pre>
+ */
+int_t
+pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
+       LUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat, int *info)
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd ("N", strlen ("N"));
+    _fcd ftcs1 = _cptofcd ("L", strlen ("L"));
+    _fcd ftcs2 = _cptofcd ("N", strlen ("N"));
+    _fcd ftcs3 = _cptofcd ("U", strlen ("U"));
+#endif
+    double zero = 0.0, alpha = 1.0, beta = 0.0;
+    int_t *xsup;
+    int_t *lsub, *lsub1, *usub, *Usub_buf;
+    int_t **Lsub_buf_2, **Usub_buf_2;
+    double **Lval_buf_2, **Uval_buf_2;          /* pointers to starts of bufs */
+    double *lusup, *lusup1, *uval, *Uval_buf;   /* pointer to current buf     */
+    int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
+        lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
+        nlb, nub, nsupc, rel, rukp, il, iu;
+    int_t Pc, Pr;
+    int iam, kcol, krow, yourcol, mycol, myrow, pi, pj;
+    int j, k, lk, nsupers;  /* k - current panel to work on */
+    int k0;        /* counter of the next supernode to be factored */
+    int kk, kk0, kk1, kk2, jj0; /* panels in the look-ahead window */
+    int iukp0, rukp0, flag0, flag1;
+    int nsupr, nbrow, segsize;
+    int msg0, msg2;
+    int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr;
+    double **Unzval_br_ptr, **Lnzval_bc_ptr;
+    int_t *index;
+    double *nzval;
+    int_t *iuip, *ruip; /* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
+    double *ucol;
+    int *indirect, *indirect2;
+    double *tempv, *tempv2d;
+    int iinfo;
+    int *ToRecv, *ToSendD, **ToSendR;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    superlu_scope_t *scp;
+    float s_eps;
+    double thresh;
+    double *tempU2d, *tempu;
+    int full, ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks;
+    int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows,
+        *Lblock, *Lrows, *perm_u, *sf_block, *sf_block_l, *nnodes_l,
+        *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno;
+    float edag_supno_l_bytes;
+#ifdef ISORT
+    int_t *iperm_u;
+#endif
+    int *msgcnt;   /* Count the size of the message xfer'd in each buffer:
+		    *     0 : transferred in Lsub_buf[]
+		    *     1 : transferred in Lval_buf[]
+		    *     2 : transferred in Usub_buf[]
+		    *     3 : transferred in Uval_buf[]
+		    */
+    int **msgcnts, **msgcntsU; /* counts for each panel in the
+                                  look-ahead window */
+    int *factored;  /* factored[j]==0 : L col panel j is factorized */
+    int *factoredU; /* factoredU[i]==1 : U row panel i is factorized */
+    int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows;
+    etree_node *head, *tail, *ptr;
+    int *num_child;
+    int num_look_aheads, look_id, *look_ahead;
+    int_t *perm_c_supno, *iperm_c_supno;
+    MPI_Request *recv_req, **recv_reqs, **send_reqs, **send_reqs_u,
+        **recv_reqs_u;
+    MPI_Request *send_req, *U_diag_blk_send_req = NULL;
+    MPI_Status status;
+    void *attr_val;
+    int flag;
+
+    int iword = sizeof (int_t);
+    int dword = sizeof (double);
+
+    /* For measuring load imbalance in omp threads*/
+    double omp_load_imblc = 0.0;
+    double *omp_loop_time;
+
+    double CPUOffloadTimer      = 0;
+    double CPUOffloadFlop       = 0;
+    double CPUOffloadMop        = 0;
+    double schur_flop_timer     = 0.0;
+    double pdgstrf2_timer       = 0.0;
+    double pdgstrs2_timer       = 0.0;
+    double lookaheadupdatetimer = 0.0;
+    double InitTimer            = 0.0; /* including compute schedule, malloc */
+    double tt_start, tt_end;
+
+#if !defined( GPU_ACC )
+    /* Counter for counting memory operations */
+    double scatter_mem_op_counter  = 0.0;
+    double scatter_mem_op_timer    = 0.0;
+    double scatterL_mem_op_counter = 0.0;
+    double scatterL_mem_op_timer   = 0.0;
+    double scatterU_mem_op_counter = 0.0;
+    double scatterU_mem_op_timer   = 0.0;
+
+    double GatherLTimer            = 0.0;
+    double LookAheadRowSepMOP      = 0.0;
+    double GatherUTimer             = 0.0;
+    double GatherMOP               = 0.0;
+    double LookAheadGEMMTimer      = 0.0;
+    double LookAheadGEMMFlOp       = 0.0;
+    double LookAheadScatterTimer   = 0.0;
+    double LookAheadScatterMOP     = 0.0;
+    double RemainGEMMTimer         = 0.0;
+    double RemainScatterTimer      = 0.0;
+    double NetSchurUpTimer         = 0.0;
+    double schur_flop_counter      = 0.0;
+#endif
+
+#if ( PRNTlevel>= 1)
+    /* count GEMM max dimensions */
+    int gemm_max_m = 0, gemm_max_n = 0, gemm_max_k = 0;
+#endif
+
+#if ( DEBUGlevel>=2 )
+    int_t num_copy = 0, num_update = 0;
+#endif
+#if ( PRNTlevel==3 )
+    int zero_msg = 0, total_msg = 0;
+#endif
+#if ( PROFlevel>=1 )
+    double t1, t2;
+    float msg_vol = 0, msg_cnt = 0;
+#endif
+
+    /* Test the input parameters. */
+    *info = 0;
+    if (m < 0)
+        *info = -2;
+    else if (n < 0)
+        *info = -3;
+    if (*info) {
+        pxerr_dist ("pdgstrf", grid, -*info);
+        return (-1);
+    }
+
+    /* Quick return if possible. */
+    if (m == 0 || n == 0) return 0;
+ 
+    /* 
+     * Initialization.  
+     */
+    iam = grid->iam;
+    Pc = grid->npcol; 
+    Pr = grid->nprow;
+    myrow = MYROW (iam, grid);
+    mycol = MYCOL (iam, grid);
+    nsupers = Glu_persist->supno[n - 1] + 1;
+    xsup = Glu_persist->xsup;
+    s_eps = smach_dist("Epsilon");
+    thresh = s_eps * anorm;
+
+    MPI_Attr_get (MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag);
+    if (!flag) {
+        fprintf (stderr, "Could not get TAG_UB\n");
+        return (-1);
+    }
+    int tag_ub = *(int *) attr_val;
+
+#if ( PRNTlevel>=1 )
+    if (!iam)
+        printf ("MPI tag upper bound = %d\n", tag_ub);
+#endif
+
+#if ( DEBUGlevel>=1 )
+    if (s_eps == 0.0)
+        printf (" ***** warning s_eps = %e *****\n", s_eps);
+    CHECK_MALLOC (iam, "Enter pdgstrf()");
+#endif
+
+    stat->ops[FACT]      = 0.0;
+    stat->current_buffer = 0.0;
+    stat->peak_buffer    = 0.0;
+    stat->gpu_buffer     = 0.0;
+
+    /* make sure the range of look-ahead window [0, MAX_LOOKAHEADS-1] */
+    num_look_aheads = SUPERLU_MAX(0, SUPERLU_MIN(options->num_lookaheads, MAX_LOOKAHEADS - 1));
+
+    if (Pr * Pc > 1) {
+        if (!(U_diag_blk_send_req =
+              (MPI_Request *) SUPERLU_MALLOC (Pr * sizeof (MPI_Request))))
+            ABORT ("Malloc fails for U_diag_blk_send_req[].");
+	/* flag no outstanding Isend */
+        U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL; /* used 0 before */
+
+        /* allocating buffers for look-ahead */
+        i = Llu->bufmax[0];
+        if (i != 0) {
+            if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * ((size_t) i))) )
+                ABORT ("Malloc fails for Lsub_buf.");
+            for (jj = 0; jj < num_look_aheads; jj++)
+                Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i;
+        }
+        i = Llu->bufmax[1];
+        if (i != 0) {
+            if (!(Llu->Lval_buf_2[0] = doubleMalloc_dist ((num_look_aheads + 1) * ((size_t) i))))
+                ABORT ("Malloc fails for Lval_buf[].");
+            for (jj = 0; jj < num_look_aheads; jj++)
+                Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i;
+        }
+        i = Llu->bufmax[2];
+        if (i != 0) {
+            if (!(Llu->Usub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * i)))
+                ABORT ("Malloc fails for Usub_buf_2[].");
+            for (jj = 0; jj < num_look_aheads; jj++)
+                Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i;
+        }
+        i = Llu->bufmax[3];
+        if (i != 0) {
+            if (!(Llu->Uval_buf_2[0] = doubleMalloc_dist ((num_look_aheads + 1) * i)))
+                ABORT ("Malloc fails for Uval_buf_2[].");
+            for (jj = 0; jj < num_look_aheads; jj++)
+                Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i;
+        }
+    }
+
+    log_memory( (Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) 
+		* iword +
+		(Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) 
+		* dword, stat );
+
+    /* creating pointers to the look-ahead buffers */
+    if (! (Lsub_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int_t *))))
+        ABORT ("Malloc fails for Lsub_buf_2[].");
+    if (! (Lval_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (double *))))
+        ABORT ("Malloc fails for Lval_buf_2[].");
+    if (! (Usub_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int_t *))))
+        ABORT ("Malloc fails for Uval_buf_2[].");
+    if (! (Uval_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (double *))))
+        ABORT ("Malloc fails for buf_2[].");
+    for (i = 0; i <= num_look_aheads; i++) {
+        Lval_buf_2[i] = Llu->Lval_buf_2[i];
+        Lsub_buf_2[i] = Llu->Lsub_buf_2[i];
+        Uval_buf_2[i] = Llu->Uval_buf_2[i];
+        Usub_buf_2[i] = Llu->Usub_buf_2[i];
+    }
+
+    if (!(msgcnts = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int *))))
+        ABORT ("Malloc fails for msgcnts[].");
+    if (!(msgcntsU = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int *))))
+        ABORT ("Malloc fails for msgcntsU[].");
+    for (i = 0; i <= num_look_aheads; i++) {
+        if (!(msgcnts[i] = SUPERLU_MALLOC (4 * sizeof (int))))
+            ABORT ("Malloc fails for msgcnts[].");
+        if (!(msgcntsU[i] = SUPERLU_MALLOC (4 * sizeof (int))))
+            ABORT ("Malloc fails for msgcntsU[].");
+    }
+
+    if (! (recv_reqs_u = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *))))
+        ABORT ("Malloc fails for recv_reqs_u[].");
+    if (! (send_reqs_u = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *))))
+        ABORT ("Malloc fails for send_reqs_u[].");
+    if (! (send_reqs = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *))))
+        ABORT ("Malloc fails for send_reqs_u[].");
+    if (! (recv_reqs = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *))))
+        ABORT ("Malloc fails for recv_reqs[].");
+    for (i = 0; i <= num_look_aheads; i++) {
+        if (!(recv_reqs_u[i] = (MPI_Request *) SUPERLU_MALLOC (2 * sizeof (MPI_Request))))
+            ABORT ("Malloc fails for recv_req_u[i].");
+        if (!(send_reqs_u[i] = (MPI_Request *) SUPERLU_MALLOC (2 * Pr * sizeof (MPI_Request))))
+            ABORT ("Malloc fails for send_req_u[i].");
+        if (!(send_reqs[i] = (MPI_Request *) SUPERLU_MALLOC (2 * Pc * sizeof (MPI_Request))))
+            ABORT ("Malloc fails for send_reqs[i].");
+        if (!(recv_reqs[i] = (MPI_Request *) SUPERLU_MALLOC (4 * sizeof (MPI_Request))))
+            ABORT ("Malloc fails for recv_req[].");
+        send_reqs[i][0] = send_reqs[i][1] = MPI_REQUEST_NULL;
+        recv_reqs[i][0] = recv_reqs[i][1] = MPI_REQUEST_NULL;
+    }
+
+    if (!(factored = SUPERLU_MALLOC (nsupers * sizeof (int_t))))
+        ABORT ("Malloc fails for factored[].");
+    if (!(factoredU = SUPERLU_MALLOC (nsupers * sizeof (int_t))))
+        ABORT ("Malloc fails for factoredU[].");
+    for (i = 0; i < nsupers; i++) factored[i] = factoredU[i] = -1;
+    log_memory(2 * nsupers * iword, stat);
+
+    int num_threads = 1;
+#ifdef _OPENMP
+#pragma omp parallel default(shared)
+    {
+        if (omp_get_thread_num () == 0) {
+            num_threads = omp_get_num_threads ();
+        }
+    }
+#endif
+
+#if 0
+    omp_loop_time = (double *) _mm_malloc (sizeof (double) * num_threads,64);
+#else
+    omp_loop_time = (double *) doubleMalloc_dist(num_threads);
+#endif
+
+#if ( PRNTlevel>=1 )
+    if(!iam) printf(".. Starting with %d OpenMP threads \n", num_threads );
+#endif
+    double tt1 = SuperLU_timer_ ();
+
+    nblocks = 0;
+    ncb = nsupers / Pc; /* number of column blocks, horizontal */
+    nrb = nsupers / Pr; /* number of row blocks, vertical  */
+
+    /* in order to have dynamic scheduling */
+    int *full_u_cols;
+    int *blk_ldu;
+#if 0
+    full_u_cols = (int_t *) _mm_malloc (sizeof (int_t) * ncb,64);
+    blk_ldu = (int_t *) _mm_malloc (sizeof (int_t) * ncb,64);
+#else
+    full_u_cols = SUPERLU_MALLOC(ncb * sizeof(int));
+    blk_ldu = SUPERLU_MALLOC(ncb * sizeof(int));
+#endif
+    log_memory(2 * ncb * iword, stat);
+
+
+    /* insert a check condition here */
+
+#if 0  /* Sherry: not used? */
+    /* This bunch is used for static scheduling */
+    pair *full_col_count = (pair *) _mm_malloc (sizeof (pair) * ncb,64);
+    int_t *count_cols, *sum_cols, *partition;
+    count_cols = (int_t *) _mm_malloc (sizeof (int_t) * num_threads,64);
+    sum_cols = (int_t *) _mm_malloc (sizeof (int_t) * num_threads,64);
+    partition = (int_t *) _mm_malloc (sizeof (int_t) * num_threads * ncb,64);
+    int_t ldp = ncb;
+#endif
+
+    /* ##################################################################
+     *  Compute a good static schedule based on the factorization task graph.
+     * ################################################################## */
+    perm_c_supno = SUPERLU_MALLOC (2 * nsupers * sizeof (int_t));
+    iperm_c_supno = perm_c_supno + nsupers;
+
+    static_schedule(options, m, n, LUstruct, grid, stat,
+		    perm_c_supno, iperm_c_supno, info);
+
+#if ( DEBUGlevel >= 2 )
+    PrintInt10("schedule:perm_c_supno", nsupers, perm_c_supno);
+    
+    /* Turn off static schedule */
+    printf("[%d] .. Turn off static schedule for debugging ..\n", iam);
+    for (i = 0; i < nsupers; ++i) perm_c_supno[i] = iperm_c_supno[i] = i;
+#endif
+     /* ################################################################## */
+
+    /* constructing look-ahead table to indicate the last dependency */
+    int *look_ahead_l; /* Sherry: add comment on look_ahead_l[] */
+    stat->num_look_aheads = num_look_aheads;
+
+    look_ahead_l = SUPERLU_MALLOC (nsupers * sizeof (int));
+    look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int));
+    for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1;
+    log_memory(3 * nsupers * iword, stat);
+
+    /* go through U-factor */
+    for (lb = 0; lb < nrb; ++lb) {
+        ib = lb * Pr + myrow;
+        index = Llu->Ufstnz_br_ptr[lb];
+        if (index) { /* Not an empty row */
+            k = BR_HEADER;
+            for (j = 0; j < index[0]; ++j) {
+                jb = index[k]; /* global block number */
+                if (jb != ib)
+                    look_ahead_l[jb] =
+                        SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]);
+                k += UB_DESCRIPTOR + SuperSize (index[k]);
+            }
+        }
+    }
+    if (myrow < nsupers % grid->nprow) {
+        ib = nrb * Pr + myrow;
+        index = Llu->Ufstnz_br_ptr[nrb];
+        if (index) {             /* Not an empty row */
+            k = BR_HEADER;
+            for (j = 0; j < index[0]; ++j) {
+                jb = index[k];
+                if (jb != ib)
+                    look_ahead_l[jb] =
+                        SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]);
+                k += UB_DESCRIPTOR + SuperSize (index[k]);
+            }
+        }
+    }
+
+    if (options->SymPattern == NO) {
+        /* go through L-factor */
+        for (lb = 0; lb < ncb; lb++) {
+            ib = lb * Pc + mycol;
+            index = Llu->Lrowind_bc_ptr[lb];
+            if (index) {
+                k = BC_HEADER;
+                for (j = 0; j < index[0]; j++) {
+                    jb = index[k];
+                    if (jb != ib)
+                        look_ahead_l[jb] =
+                            SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]);
+                    k += LB_DESCRIPTOR + index[k + 1];
+                }
+            }
+        }
+        if (mycol < nsupers % grid->npcol) {
+            ib = ncb * Pc + mycol;
+            index = Llu->Lrowind_bc_ptr[ncb];
+            if (index) {
+                k = BC_HEADER;
+                for (j = 0; j < index[0]; j++) {
+                    jb = index[k];
+                    if (jb != ib)
+                        look_ahead_l[jb] =
+                            SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]);
+                    k += LB_DESCRIPTOR + index[k + 1];
+                }
+            }
+        }
+    }
+    MPI_Allreduce (look_ahead_l, look_ahead, nsupers, MPI_INT, MPI_MAX, grid->comm);
+    SUPERLU_FREE (look_ahead_l);
+
+#ifdef ISORT
+    iperm_u = SUPERLU_MALLOC (nsupers * sizeof (int_t));
+    perm_u = SUPERLU_MALLOC (nsupers * sizeof (int_t));
+#else
+    perm_u = SUPERLU_MALLOC (2 * nsupers * sizeof (int_t));
+#endif
+    log_memory(nsupers * iword, stat);
+
+    k = sp_ienv_dist (3);       /* max supernode size */
+#if 0
+    if ( !(Llu->ujrow = doubleMalloc_dist(k*(k+1)/2)) )
+         ABORT("Malloc fails for ujrow[].");
+#else
+    /* Instead of half storage, we'll do full storage */
+    if (!(Llu->ujrow = doubleCalloc_dist (k * k)))
+        ABORT ("Malloc fails for ujrow[].");
+    log_memory(k * k * iword, stat);
+#endif
+
+#if ( PRNTlevel>=1 )
+    if (!iam) {
+        printf (".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm,
+                thresh);
+        printf
+            (".. Buffer size: Lsub %ld\tLval %ld\tUsub %ld\tUval %ld\tLDA %ld\n",
+             (long int) Llu->bufmax[0], (long int) Llu->bufmax[1],
+             (long int) Llu->bufmax[2], (long int) Llu->bufmax[3],
+             (long int) Llu->bufmax[4]);
+    }
+#endif
+   
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    Unzval_br_ptr = Llu->Unzval_br_ptr;
+    ToRecv = Llu->ToRecv; 
+    ToSendD = Llu->ToSendD;
+    ToSendR = Llu->ToSendR;
+
+    ldt = sp_ienv_dist (3);     /* Size of maximum supernode */
+    k = CEILING (nsupers, Pr);  /* Number of local block rows */
+
+    /* Following circuit is for finding maximum block size */
+    int local_max_row_size = 0;
+    int max_row_size;
+
+    for (int i = 0; i < nsupers; ++i) {
+        int tpc = PCOL (i, grid);
+        if (mycol == tpc) {
+            lk = LBj (i, grid);
+            lsub = Lrowind_bc_ptr[lk];
+            if (lsub != NULL) {
+                local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
+            }
+        }
+
+    }
+
+    /* Max row size is global reduction of within A row */
+    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, (grid->rscp.comm));
+
+    /* Buffer size is max of look ahead window */
+    /* int_t buffer_size =
+         SUPERLU_MAX (max_row_size * num_threads * ldt,
+                      get_max_buffer_size ());           */
+            
+#ifdef GPU_ACC
+    int cublas_nb = get_cublas_nb();
+    int nstreams = get_num_cuda_streams ();
+
+    int buffer_size  = SUPERLU_MAX(max_row_size*nstreams*cublas_nb,get_max_buffer_size());
+    /* array holding last column blk for each partition,
+       used in SchCompUdt--CUDA.c         */
+  #if 0
+    int *stream_end_col = (int_t *) _mm_malloc (sizeof (int_t) * nstreams,64);
+  #else
+    int *stream_end_col = SUPERLU_MALLOC( nstreams * sizeof(int) );
+  #endif
+
+#else /* not to use GPU */
+
+    int Threads_per_process = get_thread_per_process();
+    int buffer_size  = SUPERLU_MAX(max_row_size*Threads_per_process*ldt,get_max_buffer_size());
+#endif /* end ifdef GPU_ACC */
+
+#if 0
+    /* symmetric assumption -- using L's supernode to estimate. */
+    /* Note that in following expression 8 can be anything
+       as long as its not too big */
+    int bigu_size = 8 * sp_ienv_dist (3) * (max_row_size);
+#else
+    int_t bigu_size = estimate_bigu_size( nsupers, ldt, 
+					  Ufstnz_br_ptr,
+					  Glu_persist, grid, perm_u );
+#endif
+
+    /* bigU and bigV are either on CPU or on GPU, not both. */
+    double* bigU; /* for storing entire U(k,:) panel, prepare for GEMM.
+                     bigU has the same size either on CPU or on GPU. */
+    double* bigV; /* for GEMM output matrix, i.e. update matrix. 
+                     On CPU, bigV is small for block-by-block update.
+	             On GPU, bigV is large to hold the aggregate GEMM output.*/
+
+#if ( PRNTlevel>=1 )
+    if(!iam) printf("[%d] .. BIG U bigu_size " IFMT " (same either on CPU or GPU)\n", iam, bigu_size);
+#endif
+
+#ifdef GPU_ACC
+
+    if ( checkCuda(cudaHostAlloc((void**)&bigU,  bigu_size * sizeof(double), cudaHostAllocDefault)) )
+        ABORT("Malloc fails for dgemm buffer U ");
+
+    int bigv_size = buffer_size;
+#if ( PRNTlevel>=1 )
+    if (!iam) printf("[%d] .. BIG V bigv_size %d, using buffer_size %d (on GPU)\n", iam, bigv_size, buffer_size);
+#endif
+    if ( checkCuda(cudaHostAlloc((void**)&bigV, bigv_size * sizeof(double) ,cudaHostAllocDefault)) )
+        ABORT("Malloc fails for dgemm buffer V");
+ 
+    DisplayHeader();
+
+#if ( PRNTlevel>=1 )
+    printf(" Starting with %d Cuda Streams \n",nstreams );
+#endif
+
+    cublasHandle_t *handle;
+    handle = (cublasHandle_t *) SUPERLU_MALLOC(sizeof(cublasHandle_t)*nstreams);
+    for(int i = 0; i < nstreams; i++) handle[i] = create_handle();
+
+    // creating streams 
+    cudaStream_t *streams;
+    streams = (cudaStream_t *) SUPERLU_MALLOC(sizeof(cudaStream_t)*nstreams);
+    for (int i = 0; i < nstreams; ++i)
+        checkCuda( cudaStreamCreate(&streams[i]) );
+    
+    // allocating data in device 
+    double *dA, *dB, *dC;
+    cudaError_t cudaStat;
+#if 0
+    // cudaStat = cudaMalloc( (void**)&dA, m*k*sizeof(double));
+    // HOw much should be the size of dA?
+    // for time being just making it 
+    // cudaStat = cudaMalloc( (void**)&dA, ((max_row_size*sp_ienv_dist(3)))* sizeof(double));
+#endif
+
+    cudaStat = cudaMalloc( (void**)&dA, max_row_size*sp_ienv_dist(3)* sizeof(double));
+    if (cudaStat!= cudaSuccess) {
+        fprintf(stderr, "!!!! Error in allocating A in the device %ld \n",m*k*sizeof(double) );
+        return 1;
+    }
+
+    // size of B should be max_supernode_size*buffer
+
+    cudaStat = cudaMalloc((void**)&dB, bigu_size * sizeof(double));
+    if (cudaStat!= cudaSuccess) {
+        fprintf(stderr, "!!!! Error in allocating B in the device %ld \n",n*k*sizeof(double));
+        return 1;
+    }
+
+    cudaStat = cudaMalloc((void**)&dC, buffer_size* sizeof(double) );
+    if (cudaStat!= cudaSuccess) {
+        fprintf(stderr, "!!!! Error in allocating C in the device \n" );
+        return 1;
+    }
+
+    stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3) 
+			  + bigu_size + buffer_size ) * dword;
+
+#else  /* not to use GPU */
+    
+    if ( !(bigU = doubleMalloc_dist(bigu_size)) )
+        ABORT ("Malloc fails for dgemm u buff U"); 
+          //Maximum size of bigU= sqrt(buffsize) ?
+
+    int bigv_size = 8 * ldt * ldt * num_threads;
+#if ( PRNTlevel>=1 )
+    if (!iam) printf("[%d] .. BIG V size (on CPU) %d\n", iam, bigv_size);
+#endif
+    if ( !(bigV = doubleMalloc_dist(bigv_size)) )
+        ABORT ("Malloc failed for dgemm buffer V");
+
+#endif /* end ifdef GPU_ACC */
+
+    log_memory((bigv_size + bigu_size) * dword, stat);
+
+    // mlock(bigU,(bigu_size) * sizeof (double));   
+
+#if ( PRNTlevel>=1 )
+    if(!iam) {
+	printf ("  Max row size is %d \n", max_row_size);
+        printf ("  Threads per process %d \n", num_threads);
+	/* printf ("  Using buffer_size of %d \n", buffer_size); */
+    }
+#endif
+
+    if (!(tempv2d = doubleCalloc_dist (2 * ((size_t) ldt) * ldt)))
+        ABORT ("Calloc fails for tempv2d[].");
+    tempU2d = tempv2d + ldt * ldt;
+    if (!(indirect = SUPERLU_MALLOC (ldt * num_threads * sizeof(int))))
+        ABORT ("Malloc fails for indirect[].");
+    if (!(indirect2 = SUPERLU_MALLOC (ldt * num_threads * sizeof(int))))
+        ABORT ("Malloc fails for indirect[].");
+    if (!(iuip = intMalloc_dist (k)))  ABORT ("Malloc fails for iuip[].");
+    if (!(ruip = intMalloc_dist (k)))  ABORT ("Malloc fails for ruip[].");
+
+    log_memory(2 * ldt *ldt * dword + 2 * ldt * num_threads * iword
+	       + 2 * k * iword, stat);
+
+    int_t *lookAheadFullRow,*lookAheadStRow,*lookAhead_lptr,*lookAhead_ib,
+        *RemainFullRow,*RemainStRow,*Remain_lptr,*Remain_ib;
+
+    lookAheadFullRow   = intMalloc_dist( (num_look_aheads+1) );
+    lookAheadStRow     = intMalloc_dist( (num_look_aheads+1) );
+    lookAhead_lptr     = intMalloc_dist( (num_look_aheads+1) );
+    lookAhead_ib       = intMalloc_dist( (num_look_aheads+1) );
+
+    int_t mrb=    (nsupers+Pr-1) / Pr;
+    int_t mcb=    (nsupers+Pc-1) / Pc;
+    
+    RemainFullRow   = intMalloc_dist(mrb); 
+    RemainStRow     = intMalloc_dist(mrb);
+#if 0
+    Remain_lptr     = (int *) _mm_malloc(sizeof(int)*mrb,1);
+#else
+    Remain_lptr     = intMalloc_dist(mrb);
+#endif
+    // mlock(Remain_lptr, sizeof(int)*mrb );
+    Remain_ib       = intMalloc_dist(mrb);
+    
+    Remain_info_t *Remain_info;
+#if 0
+    Remain_info = (Remain_info_t *) _mm_malloc(mrb*sizeof(Remain_info_t),64);
+#else
+    Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb*sizeof(Remain_info_t));
+#endif
+    log_memory(4 * mrb * iword + mrb * sizeof(Remain_info_t), stat);
+
+    double *lookAhead_L_buff, *Remain_L_buff;
+    Ublock_info_t *Ublock_info;
+    ldt = sp_ienv_dist (3);       /* max supernode size */
+    lookAhead_L_buff = doubleMalloc_dist(ldt*ldt* (num_look_aheads+1) );
+    log_memory(ldt * ldt * (num_look_aheads+1) * dword, stat);
+
+#if 0
+    Remain_L_buff = (double *) _mm_malloc( sizeof(double)*(Llu->bufmax[1]),64);
+    Ublock_info = (Ublock_info_t *) _mm_malloc(mcb*sizeof(Ublock_info_t),64);
+    int * Ublock_info_iukp = (int *) _mm_malloc(mcb*sizeof(int),64);
+    int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64);
+    int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64);
+#else
+    Remain_L_buff = doubleMalloc_dist(Llu->bufmax[1]);
+    Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb*sizeof(Ublock_info_t));
+    int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
+    int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
+    int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
+#endif
+    log_memory(Llu->bufmax[1] * dword, stat);
+
+    InitTimer = SuperLU_timer_() - tt1;
+
+    double pxgstrfTimer = SuperLU_timer_();
+
+    /* ##################################################################
+       ** Handle first block column separately to start the pipeline. **
+       ################################################################## */
+    look_id = 0;
+    msgcnt = msgcnts[0]; /* First count in the window */
+    send_req = send_reqs[0];
+    recv_req = recv_reqs[0];
+
+    k0 = 0;
+    k = perm_c_supno[0];
+    kcol = PCOL (k, grid);
+    krow = PROW (k, grid);
+    if (mycol == kcol) {
+        double ttt1 = SuperLU_timer_();
+
+	/* panel factorization */
+        PDGSTRF2 (options, k0, k, thresh, Glu_persist, grid, Llu,
+                  U_diag_blk_send_req, tag_ub, stat, info);
+
+        pdgstrf2_timer += SuperLU_timer_()-ttt1; 
+
+        scp = &grid->rscp;      /* The scope of process row. */
+
+        /* Multicasts numeric values of L(:,0) to process rows. */
+        lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        if (lsub) {
+            msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+            msgcnt[1] = lsub[1] * SuperSize (k);
+        } else {
+            msgcnt[0] = msgcnt[1] = 0;
+        }
+
+        for (pj = 0; pj < Pc; ++pj) {
+            if (ToSendR[lk][pj] != EMPTY) {
+#if ( PROFlevel>=1 )
+                TIC (t1);
+#endif
+
+                MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj, SLU_MPI_TAG (0, 0) /* 0 */ ,
+                           scp->comm, &send_req[pj]);
+                MPI_Isend (lusup, msgcnt[1], MPI_DOUBLE, pj, SLU_MPI_TAG (1, 0) /* 1 */ ,
+                           scp->comm, &send_req[pj + Pc]);
+#if ( DEBUGlevel>=2 )
+                printf ("[%d] first block cloumn Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
+                        iam, 0, msgcnt[0], msgcnt[1], pj);
+#endif
+
+#if ( PROFlevel>=1 )
+                TOC (t2, t1);
+                stat->utime[COMM] += t2;
+                msg_cnt += 2;
+                msg_vol += msgcnt[0] * iword + msgcnt[1] * dword;
+#endif
+            } /* end if */
+        }  /* end for pj ... */
+    } else {  /* Post immediate receives. */
+        if (ToRecv[k] >= 1) {   /* Recv block column L(:,0). */
+            scp = &grid->rscp;  /* The scope of process row. */
+            MPI_Irecv (Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, kcol,
+                       SLU_MPI_TAG (0, 0) /* 0 */ ,
+                       scp->comm, &recv_req[0]);
+            MPI_Irecv (Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, kcol,
+                       SLU_MPI_TAG (1, 0) /* 1 */ ,
+                       scp->comm, &recv_req[1]);
+        }
+    } /* end if mycol == kcol */
+
+    factored[k] = 0; /* flag column k as factored. */
+
+    /* post receive of first U-row */
+    if (myrow != krow) {
+        if (ToRecv[k] == 2) {   /* Recv block row U(k,:). */
+            scp = &grid->cscp;  /* The scope of process column. */
+            Usub_buf = Llu->Usub_buf_2[0];
+            Uval_buf = Llu->Uval_buf_2[0];
+            MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+                       SLU_MPI_TAG (2, 0) /* 2%tag_ub */ ,
+                       scp->comm, &recv_reqs_u[0][0]);
+            MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow,
+                       SLU_MPI_TAG (3, 0) /* 3%tag_ub */ ,
+                       scp->comm, &recv_reqs_u[0][1]);
+        }
+    }
+
+    /* ##################################################################
+       **** MAIN LOOP ****
+       ################################################################## */
+    for (k0 = 0; k0 < nsupers; ++k0) {
+        k = perm_c_supno[k0];
+
+        /* ============================================ *
+         * ======= look-ahead the new L columns ======= *
+         * ============================================ */
+        /* tt1 = SuperLU_timer_(); */
+        if (k0 == 0) { /* look-ahead all the columns in the window */
+            kk1 = k0 + 1;
+            kk2 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1);
+        } else {  /* look-ahead one new column after the current window */
+            kk1 = k0 + num_look_aheads;
+            kk2 = SUPERLU_MIN (kk1, nsupers - 1);
+        }
+
+        for (kk0 = kk1; kk0 <= kk2; kk0++) {
+	    /* loop through look-ahead window in L */
+
+            kk = perm_c_supno[kk0]; /* use the ordering from static schedule */
+            look_id = kk0 % (1 + num_look_aheads); /* which column in window */
+
+            if (look_ahead[kk] < k0) { /* does not depend on current column */
+                kcol = PCOL (kk, grid);
+                if (mycol == kcol) { /* I own this panel */
+
+                    /* Panel factorization -- Factor diagonal and subdiagonal
+                       L blocks and test for exact singularity.  */
+                    factored[kk] = 0; /* flag column kk as factored */
+                    double ttt1 = SuperLU_timer_();
+
+                    PDGSTRF2 (options, kk0, kk, thresh, Glu_persist,
+                              grid, Llu, U_diag_blk_send_req, tag_ub, stat, info);
+
+                     pdgstrf2_timer += SuperLU_timer_() - ttt1; 
+
+                    /* Multicasts numeric values of L(:,kk) to process rows. */
+                    /* ttt1 = SuperLU_timer_(); */
+                    msgcnt = msgcnts[look_id];  /* point to the proper count array */
+                    send_req = send_reqs[look_id];
+
+                    lk = LBj (kk, grid);    /* Local block number in L */
+                    lsub1 = Lrowind_bc_ptr[lk];
+                    if (lsub1) {
+                        msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR; /* size of metadata */
+                        msgcnt[1] = lsub1[1] * SuperSize (kk); /* Lval_buf[] size */
+                    } else {
+                        msgcnt[0] = 0;
+                        msgcnt[1] = 0;
+                    }
+                    scp = &grid->rscp;  /* The scope of process row. */
+                    for (pj = 0; pj < Pc; ++pj) {
+                        if (ToSendR[lk][pj] != EMPTY) {
+                            lusup1 = Lnzval_bc_ptr[lk];
+                            MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
+                                       SLU_MPI_TAG (0, kk0),  /* (4*kk0)%tag_ub */
+                                       scp->comm, &send_req[pj]);
+                            MPI_Isend (lusup1, msgcnt[1], MPI_DOUBLE, pj,
+                                       SLU_MPI_TAG (1, kk0),  /* (4*kk0+1)%tag_ub */
+                                       scp->comm, &send_req[pj + Pc]);
+#if ( DEBUGlevel>=2 )
+			    printf ("[%d] -1- Send L(:,%4d): #lsub1 %4d, #lusup1 %4d right to Pj %2d\n",
+				    iam, kk, msgcnt[0], msgcnt[1], pj);
+#endif
+                        }
+                    }
+                    /* stat->time9 += SuperLU_timer_() - ttt1; */
+                } else {     /* Post Recv of block column L(:,kk). */
+                    /* double ttt1 = SuperLU_timer_(); */
+                    if (ToRecv[kk] >= 1) {
+                        scp = &grid->rscp;  /* The scope of process row. */
+                        recv_req = recv_reqs[look_id];
+
+                        MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0],
+                                   mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
+                                   scp->comm, &recv_req[0]);
+                        MPI_Irecv (Lval_buf_2[look_id], Llu->bufmax[1],
+                                   MPI_DOUBLE, kcol,
+                                   SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
+                                   scp->comm, &recv_req[1]);
+                    }
+                    /* stat->time10 += SuperLU_timer_() - ttt1; */
+                }  /* end if mycol == Pc(kk) */
+            }  /* end if look-ahead in L supernodes */
+
+            /* post irecv for U-row look-ahead */
+            krow = PROW (kk, grid);
+            if (myrow != krow) {
+                if (ToRecv[kk] == 2) { /* post iRecv block row U(kk,:). */
+                    scp = &grid->cscp;  /* The scope of process column. */
+                    Usub_buf = Llu->Usub_buf_2[look_id];
+                    Uval_buf = Llu->Uval_buf_2[look_id];
+
+                    MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+                               SLU_MPI_TAG (2, kk0) /* (4*kk0+2)%tag_ub */ ,
+                               scp->comm, &recv_reqs_u[look_id][0]);
+                    MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow,
+                               SLU_MPI_TAG (3, kk0) /* (4*kk0+3)%tag_ub */ ,
+                               scp->comm, &recv_reqs_u[look_id][1]);
+                }
+            }
+
+        }  /* end for each column in look-ahead window for L supernodes */
+
+        /* stat->time4 += SuperLU_timer_()-tt1; */
+
+        /* ================================= *
+         * ==== look-ahead the U rows    === *
+         * ================================= */
+        kk1 = k0;
+        kk2 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1);
+        for (kk0 = kk1; kk0 < kk2; kk0++) {
+            kk = perm_c_supno[kk0]; /* order determined from static schedule */  
+            if (factoredU[kk0] != 1 && look_ahead[kk] < k0) {
+                kcol = PCOL (kk, grid);
+                krow = PROW (kk, grid);
+                lk = LBj (kk, grid);  /* Local block number across row. NOT USED?? -- Sherry */
+
+                look_id = kk0 % (1 + num_look_aheads);
+                msgcnt = msgcntsU[look_id];
+                recv_req = recv_reqs[look_id];
+
+                /* ================================================= *
+                 * Check if diagonal block has been received         *
+                 * for panel factorization of U in look-ahead window *
+                 * ================================================= */
+
+                if (mycol == kcol) {  /* I own this column panel, no need
+                                         to receive L  */
+                    flag0 = flag1 = 1;
+                    msgcnt[0] = msgcnt[1] = -1; /* No need to transfer Lsub, nor Lval */
+                } else { /* Check to receive L(:,kk) from the left */
+                    flag0 = flag1 = 0;
+                    if ( ToRecv[kk] >= 1 ) {
+                        if ( recv_req[0] != MPI_REQUEST_NULL ) {
+                            MPI_Test (&recv_req[0], &flag0, &status);
+                            if ( flag0 ) {
+                                MPI_Get_count (&status, mpi_int_t, &msgcnt[0]);
+                                recv_req[0] = MPI_REQUEST_NULL;
+                            }
+                        } else flag0 = 1;
+
+                        if ( recv_req[1] != MPI_REQUEST_NULL ) {
+                            MPI_Test (&recv_req[1], &flag1, &status);
+                            if ( flag1 ) {
+                                MPI_Get_count (&status, mpi_int_t, &msgcnt[1]);
+                                recv_req[1] = MPI_REQUEST_NULL;
+                            }
+                        } else flag1 = 1;
+                    } else msgcnt[0] = 0;
+                }
+
+                if (flag0 && flag1) { /* L(:,kk) is ready */
+                    /* tt1 = SuperLU_timer_(); */
+                    scp = &grid->cscp;  /* The scope of process column. */
+                    if (myrow == krow) {
+                        factoredU[kk0] = 1;
+                        /* Parallel triangular solve across process row *krow* --
+                           U(k,j) = L(k,k) \ A(k,j).  */
+                        /* double ttt2 = SuperLU_timer_(); */
+                        double ttt2 = SuperLU_timer_();
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+			{
+                            PDGSTRS2 (kk0, kk, Glu_persist, grid, Llu,
+                                      stat);
+                        }
+    
+                        pdgstrs2_timer += SuperLU_timer_()-ttt2;
+                        /* stat->time8 += SuperLU_timer_()-ttt2; */
+
+                        /* Multicasts U(kk,:) to process columns. */
+                        lk = LBi (kk, grid);
+                        usub = Ufstnz_br_ptr[lk];
+                        uval = Unzval_br_ptr[lk];
+                        if (usub) {
+                            msgcnt[2] = usub[2]; /* metadata size */
+                            msgcnt[3] = usub[1]; /* Uval[] size */
+                        } else {
+                            msgcnt[2] = msgcnt[3] = 0;
+                        }
+
+                        if (ToSendD[lk] == YES) {
+                            for (pi = 0; pi < Pr; ++pi) {
+                                if (pi != myrow) {
+#if ( PROFlevel>=1 )
+                                    TIC (t1);
+#endif
+
+                                    MPI_Isend (usub, msgcnt[2], mpi_int_t, pi,
+                                               SLU_MPI_TAG (2, kk0), /* (4*kk0+2)%tag_ub */
+                                               scp->comm, &send_reqs_u[look_id][pi]);
+                                    MPI_Isend (uval, msgcnt[3], MPI_DOUBLE,
+                                               pi, SLU_MPI_TAG (3, kk0), /* (4*kk0+3)%tag_ub */
+                                               scp->comm, &send_reqs_u[look_id][pi + Pr]);
+
+#if ( PROFlevel>=1 )
+                                    TOC (t2, t1);
+                                    stat->utime[COMM] += t2;
+                                    msg_cnt += 2;
+                                    msg_vol += msgcnt[2] * iword + msgcnt[3] * dword;
+#endif
+#if ( DEBUGlevel>=2 )
+                                    printf ("[%d] Send U(%4d,:) to Pr %2d\n",
+                                            iam, k, pi);
+#endif
+                                }   /* if pi ... */
+                            }   /* for pi ... */
+                        }       /* if ToSendD ... */
+
+                        /* stat->time2 += SuperLU_timer_()-tt1; */
+
+                    } /* end if myrow == krow */
+                } /* end if flag0 ... */
+            } /* end if factoredU[] ... */
+        } /* end for kk0 ... */
+
+        /* ============================================== *
+         * == start processing the current row of U(k,:) *
+         * ============================================== */
+        knsupc = SuperSize (k);
+        krow = PROW (k, grid);
+        kcol = PCOL (k, grid);
+
+        /* tt1 = SuperLU_timer_(); */
+        look_id = k0 % (1 + num_look_aheads);
+        recv_req = recv_reqs[look_id];
+        send_req = send_reqs[look_id];
+        msgcnt = msgcnts[look_id];
+        Usub_buf = Llu->Usub_buf_2[look_id];
+        Uval_buf = Llu->Uval_buf_2[look_id];
+
+        if (mycol == kcol) {
+            lk = LBj (k, grid); /* Local block number in L */
+
+            for (pj = 0; pj < Pc; ++pj) {
+                /* Wait for Isend to complete before using lsub/lusup buffer */
+                if (ToSendR[lk][pj] != EMPTY) {
+                    MPI_Wait (&send_req[pj], &status);
+                    MPI_Wait (&send_req[pj + Pc], &status);
+                }
+            }
+            lsub = Lrowind_bc_ptr[lk];
+            lusup = Lnzval_bc_ptr[lk];
+        } else {
+            if (ToRecv[k] >= 1) { /* Recv block column L(:,k). */
+
+                scp = &grid->rscp;  /* The scope of process row. */
+
+                /* ============================================= *
+                 * Waiting for L(:,kk) for outer-product update  *
+                 * if iam in U(kk,:), then the diagonal block    *
+		 * did not reach in time for panel factorization *
+		 * of U(k,:)           	                         *
+                 * ============================================= */
+#if ( PROFlevel>=1 )
+                TIC (t1);
+#endif
+                if (recv_req[0] != MPI_REQUEST_NULL) {
+                    MPI_Wait (&recv_req[0], &status);
+                    MPI_Get_count (&status, mpi_int_t, &msgcnt[0]);
+                    recv_req[0] = MPI_REQUEST_NULL;
+                } else {
+                    msgcnt[0] = msgcntsU[look_id][0];
+#if (DEBUGlevel>=2)
+		    printf("\t[%d] k=%d, look_id=%d, recv_req[0] == MPI_REQUEST_NULL, msgcnt[0] = %d\n", 
+			   iam, k, look_id, msgcnt[0]);
+#endif
+                }
+
+                if (recv_req[1] != MPI_REQUEST_NULL) {
+                    MPI_Wait (&recv_req[1], &status);
+                    MPI_Get_count (&status, MPI_DOUBLE, &msgcnt[1]);
+                    recv_req[1] = MPI_REQUEST_NULL;
+                } else {
+                    msgcnt[1] = msgcntsU[look_id][1];
+#if (DEBUGlevel>=2)
+		    printf("\t[%d] k=%d, look_id=%d, recv_req[1] == MPI_REQUEST_NULL, msgcnt[1] = %d\n", 
+			   iam, k, look_id, msgcnt[1]);
+#endif
+                }
+
+#if ( PROFlevel>=1 )
+                TOC (t2, t1);
+                stat->utime[COMM] += t2;
+#endif
+#if ( DEBUGlevel>=2 )
+                printf("[%d] Recv L(:,%4d): #lsub %4d, #lusup %4d from Pc %2d\n",
+                     iam, k, msgcnt[0], msgcnt[1], kcol);
+                fflush (stdout);
+#endif
+
+#if ( PRNTlevel==3 )
+                ++total_msg;
+                if (!msgcnt[0])  ++zero_msg;
+#endif
+            } else {
+                msgcnt[0] = 0;
+	    }
+
+            lsub = Lsub_buf_2[look_id];
+            lusup = Lval_buf_2[look_id];
+        }                       /* if mycol = Pc(k) */
+        /* stat->time1 += SuperLU_timer_()-tt1; */
+
+        scp = &grid->cscp;      /* The scope of process column. */
+
+        /* tt1 = SuperLU_timer_(); */
+        if (myrow == krow) { /* I own U(k,:) */
+            lk = LBi (k, grid);
+            usub = Ufstnz_br_ptr[lk];
+            uval = Unzval_br_ptr[lk];
+
+            if (factoredU[k0] == -1) {
+                /* Parallel triangular solve across process row *krow* --
+                   U(k,j) = L(k,k) \ A(k,j).  */
+                 double ttt2 = SuperLU_timer_(); 
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+                {
+                    PDGSTRS2 (k0, k, Glu_persist, grid, Llu, stat);
+                }
+                pdgstrs2_timer += SuperLU_timer_() - ttt2; 
+
+	        /* Sherry -- need to set factoredU[k0] = 1; ?? */
+
+                /* Multicasts U(k,:) along process columns. */
+                if ( usub ) {
+                    msgcnt[2] = usub[2]; /* metadata size */
+                    msgcnt[3] = usub[1]; /* Uval[] size */
+                } else {
+                    msgcnt[2] = msgcnt[3] = 0;
+                }
+
+                if (ToSendD[lk] == YES) {
+                    for (pi = 0; pi < Pr; ++pi) {
+                        if (pi != myrow) {
+#if ( PROFlevel>=1 )
+                            TIC (t1);
+#endif
+                            MPI_Send (usub, msgcnt[2], mpi_int_t, pi,
+                                      SLU_MPI_TAG (2, k0), /* (4*k0+2)%tag_ub */
+                                      scp->comm);
+                            MPI_Send (uval, msgcnt[3], MPI_DOUBLE, pi,
+                                      SLU_MPI_TAG (3, k0), /* (4*k0+3)%tag_ub */ 
+                                      scp->comm);
+#if ( PROFlevel>=1 )
+                            TOC (t2, t1);
+                            stat->utime[COMM] += t2;
+                            msg_cnt += 2;
+                            msg_vol += msgcnt[2] * iword + msgcnt[3] * dword;
+#endif
+#if ( DEBUGlevel>=2 )
+                            printf ("[%d] Send U(%4d,:) down to Pr %2d\n", iam, k, pi);
+#endif
+                        } /* if pi ... */
+                    } /* for pi ... */
+                } /* if ToSendD ... */
+
+            } else { /* Panel U(k,:) already factorized */
+
+               /* ================================================ *
+                 * Wait for downward sending of U(k,:) to complete *
+		 * for outer-product update                        *
+                 * =============================================== */
+
+                if (ToSendD[lk] == YES) {
+                    for (pi = 0; pi < Pr; ++pi) {
+                        if (pi != myrow) {
+                            MPI_Wait (&send_reqs_u[look_id][pi], &status);
+                            MPI_Wait (&send_reqs_u[look_id][pi + Pr], &status);
+                        }
+                    }
+                }
+                msgcnt[2] = msgcntsU[look_id][2];
+                msgcnt[3] = msgcntsU[look_id][3];
+            }
+            /* stat->time2 += SuperLU_timer_()-tt1; */
+
+        } else {    /* myrow != krow */
+
+            /* ========================================= *
+             * wait for U(k,:) for outer-product updates *
+             * ========================================= */
+
+            if (ToRecv[k] == 2) { /* Recv block row U(k,:). */
+#if ( PROFlevel>=1 )
+                TIC (t1);
+#endif
+                MPI_Wait (&recv_reqs_u[look_id][0], &status);
+                MPI_Get_count (&status, mpi_int_t, &msgcnt[2]);
+                MPI_Wait (&recv_reqs_u[look_id][1], &status);
+                MPI_Get_count (&status, MPI_DOUBLE, &msgcnt[3]);
+
+#if ( PROFlevel>=1 )
+                TOC (t2, t1);
+                stat->utime[COMM] += t2;
+#endif
+                usub = Usub_buf;
+                uval = Uval_buf;
+#if ( DEBUGlevel>=2 )
+                printf ("[%d] Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
+#endif
+#if ( PRNTlevel==3 )
+                ++total_msg;
+                if (!msgcnt[2])  ++zero_msg;
+#endif
+            } else {
+                msgcnt[2] = 0;
+	    }
+            /* stat->time6 += SuperLU_timer_()-tt1; */
+        } /* end if myrow == Pr(k) */
+
+        /*
+         * Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
+         *  for (j = k+1; j < N; ++j) {
+         *     for (i = k+1; i < N; ++i)
+         *         if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+         *              && L(i,k) != 0 && U(k,j) != 0 )
+         *             A(i,j) = A(i,j) - L(i,k) * U(k,j);
+         */
+        msg0 = msgcnt[0];
+        msg2 = msgcnt[2];
+        /* tt1 = SuperLU_timer_(); */
+        if (msg0 && msg2) {     /* L(:,k) and U(k,:) are not empty. */
+            nsupr = lsub[1];    /* LDA of lusup. */
+            if (myrow == krow) { /* Skip diagonal block L(k,k). */
+                lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+                luptr0 = knsupc;
+                nlb = lsub[0] - 1;
+            } else {
+                lptr0 = BC_HEADER;
+                luptr0 = 0;
+                nlb = lsub[0];
+            }
+            iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+            rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+            nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+            klst = FstBlockC (k + 1);
+
+            /* -------------------------------------------------------------
+               Update the look-ahead block columns A(:,k+1:k+num_look_ahead)
+               ------------------------------------------------------------- */
+            iukp0 = iukp;
+            rukp0 = rukp;
+            /* reorder the remaining columns in bottom-up */
+            /* TAU_STATIC_TIMER_START("LOOK_AHEAD_UPDATE"); */
+            for (jj = 0; jj < nub; jj++) {
+#ifdef ISORT
+                iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+                perm_u[jj] = jj;
+#else
+                perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+                perm_u[2 * jj + 1] = jj;
+#endif
+                jb = usub[iukp];    /* Global block number of block U(k,j). */
+                nsupc = SuperSize (jb);
+                iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+                iukp += nsupc;
+            }
+            iukp = iukp0;
+#ifdef ISORT
+            isort (nub, iperm_u, perm_u);
+#else
+            qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+                   &superlu_sort_perm);
+#endif
+            j = jj0 = 0;
+
+/************************************************************************/
+            double ttx =SuperLU_timer_();
+
+#include "dlook_ahead_update.c"
+
+            lookaheadupdatetimer += SuperLU_timer_() - ttx;
+/************************************************************************/
+
+            /*ifdef OMP_LOOK_AHEAD */
+            /* TAU_STATIC_TIMER_STOP("LOOK_AHEAD_UPDATE"); */
+        }                       /* if L(:,k) and U(k,:) not empty */
+
+        /* stat->time3 += SuperLU_timer_()-tt1; */
+
+        /* ================== */
+        /* == post receive == */
+        /* ================== */
+        kk1 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1);
+        for (kk0 = k0 + 1; kk0 <= kk1; kk0++) {
+            kk = perm_c_supno[kk0];
+            kcol = PCOL (kk, grid);
+
+            if (look_ahead[kk] == k0) {
+                if (mycol != kcol) {
+                    if (ToRecv[kk] >= 1) {
+                        scp = &grid->rscp;  /* The scope of process row. */
+
+                        look_id = kk0 % (1 + num_look_aheads);
+                        recv_req = recv_reqs[look_id];
+                        MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0],
+                                   mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
+                                   scp->comm, &recv_req[0]);
+                        MPI_Irecv (Lval_buf_2[look_id], Llu->bufmax[1],
+                                   MPI_DOUBLE, kcol,
+                                   SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
+                                   scp->comm, &recv_req[1]);
+                    }
+                } else {
+                    lk = LBj (kk, grid);    /* Local block number. */
+                    lsub1 = Lrowind_bc_ptr[lk];
+                    lusup1 = Lnzval_bc_ptr[lk];
+                    if (factored[kk] == -1) {
+                        /* Factor diagonal and subdiagonal blocks and
+			   test for exact singularity.  */
+                        factored[kk] = 0; /* flag column kk as factored */
+                        double ttt1 = SuperLU_timer_(); 
+                        PDGSTRF2 (options, kk0, kk, thresh,
+                                  Glu_persist, grid, Llu, U_diag_blk_send_req,
+                                  tag_ub, stat, info);
+                        pdgstrf2_timer += SuperLU_timer_() - ttt1; 
+
+                        /* Process column *kcol+1* multicasts numeric
+			   values of L(:,k+1) to process rows. */
+                        look_id = kk0 % (1 + num_look_aheads);
+                        send_req = send_reqs[look_id];
+                        msgcnt = msgcnts[look_id];
+
+                        if (lsub1) {
+                            msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR;
+                            msgcnt[1] = lsub1[1] * SuperSize (kk);
+                        } else {
+                            msgcnt[0] = 0;
+                            msgcnt[1] = 0;
+                        }
+
+                        scp = &grid->rscp;  /* The scope of process row. */
+                        for (pj = 0; pj < Pc; ++pj) {
+                            if (ToSendR[lk][pj] != EMPTY) {
+                                MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
+                                           SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
+                                           scp->comm, &send_req[pj]);
+                                MPI_Isend (lusup1, msgcnt[1], MPI_DOUBLE, pj,
+                                           SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
+                                           scp->comm, &send_req[pj + Pc]);
+                            }
+                        }
+                    }           /* end if factored[kk] == -1 */
+                }
+            }
+        }
+
+        double tsch = SuperLU_timer_();
+
+	/*******************************************************************/
+
+#ifdef GPU_ACC
+
+#include "dSchCompUdt-cuda.c"
+
+#else 
+
+/*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/
+#include "dSchCompUdt-2Ddynamic.c"
+
+#endif 
+	/*uncomment following to compare against SuperLU 3.3 baseline*/
+        /* #include "SchCompUdt--baseline.c"  */
+	/************************************************************************/
+        
+        NetSchurUpTimer += SuperLU_timer_() - tsch;
+
+    }  /* for k0 = 0, ... */
+
+    /* ##################################################################
+       ** END MAIN LOOP: for k0 = ...
+       ################################################################## */
+    
+    pxgstrfTimer = SuperLU_timer_() - pxgstrfTimer;
+
+    /* updating total flops */
+#if ( PRNTlevel>=1 )
+    if ( iam==0 ) {
+	printf("\nInitialization time\t%8.2lf seconds\n"
+	       "\t Serial: compute static schedule, allocate storage\n", InitTimer);
+        printf("\n---- Time breakdown in factorization ----\n");
+	printf("Time in Look-ahead update \t %8.2lf seconds\n", lookaheadupdatetimer);
+        printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer);
+        printf(".. Time to Gather L buffer\t %8.2lf  (Separate L panel by Lookahead/Remain)\n", GatherLTimer);
+        printf(".. Time to Gather U buffer\t %8.2lf \n", GatherUTimer);
+	       
+        printf(".. Time in GEMM %8.2lf \n",
+	       LookAheadGEMMTimer + RemainGEMMTimer);
+        printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer);
+        printf("\t* Remain\t %8.2lf \n", RemainGEMMTimer);
+
+        printf(".. Time to Scatter %8.2lf \n", 
+	       LookAheadScatterTimer + RemainScatterTimer);
+        printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer);
+        printf("\t* Remain\t %8.2lf \n", RemainScatterTimer);
+
+        printf("Total Time in Factorization            \t: %8.2lf seconds, \n", pxgstrfTimer);
+        printf("Total time in Schur update with offload\t  %8.2lf seconds,\n",CPUOffloadTimer );
+        printf("--------\n");
+	printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n);
+    }
+#endif
+    
+#if ( DEBUGlevel>=2 )
+    for (i = 0; i < Pr * Pc; ++i) {
+        if (iam == i) {
+            dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
+            dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
+            printf ("(%d)\n", iam);
+            PrintInt10 ("Recv", nsupers, Llu->ToRecv);
+        }
+        MPI_Barrier (grid->comm);
+    }
+#endif
+
+    // printf("Debug : MPI buffers 1\n");
+
+    /********************************************************
+     * Free memory                                          *
+     ********************************************************/
+
+    if (Pr * Pc > 1) {
+        SUPERLU_FREE (Lsub_buf_2[0]);   /* also free Lsub_buf_2[1] */
+        SUPERLU_FREE (Lval_buf_2[0]);   /* also free Lval_buf_2[1] */
+        if (Llu->bufmax[2] != 0)
+            SUPERLU_FREE (Usub_buf_2[0]);
+        if (Llu->bufmax[3] != 0)
+            SUPERLU_FREE (Uval_buf_2[0]);
+        if (U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL) {
+            /* wait for last Isend requests to complete, deallocate objects */
+            for (krow = 0; krow < Pr; ++krow) {
+                if (krow != myrow)
+                    MPI_Wait (U_diag_blk_send_req + krow, &status);
+            }
+        }
+        SUPERLU_FREE (U_diag_blk_send_req);
+    }
+
+    log_memory( -((Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) * iword +
+		  (Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) * dword),
+		stat );
+    
+    SUPERLU_FREE (Lsub_buf_2);
+    SUPERLU_FREE (Lval_buf_2);
+    SUPERLU_FREE (Usub_buf_2);
+    SUPERLU_FREE (Uval_buf_2);
+    SUPERLU_FREE (perm_c_supno);
+    SUPERLU_FREE (perm_u);
+#ifdef ISORT
+    SUPERLU_FREE (iperm_u);
+#endif
+    SUPERLU_FREE (look_ahead);
+    SUPERLU_FREE (factoredU);
+    SUPERLU_FREE (factored);
+    log_memory(-(6 * nsupers * iword), stat);
+
+
+    for (i = 0; i <= num_look_aheads; i++) {
+        SUPERLU_FREE (msgcnts[i]);
+        SUPERLU_FREE (msgcntsU[i]);
+    }
+    SUPERLU_FREE (msgcnts);
+    SUPERLU_FREE (msgcntsU);
+
+    for (i = 0; i <= num_look_aheads; i++) {
+        SUPERLU_FREE (send_reqs_u[i]);
+        SUPERLU_FREE (recv_reqs_u[i]);
+        SUPERLU_FREE (send_reqs[i]);
+        SUPERLU_FREE (recv_reqs[i]);
+    }
+
+    SUPERLU_FREE (recv_reqs_u);
+    SUPERLU_FREE (send_reqs_u);
+    SUPERLU_FREE (recv_reqs);
+    SUPERLU_FREE (send_reqs);
+
+    // printf("Debug : MPI buffers 3\n");
+
+#ifdef GPU_ACC
+    checkCuda (cudaFreeHost (bigV));
+    checkCuda (cudaFreeHost (bigU));
+    cudaFree( (void*)dA ); /* Sherry added */
+    cudaFree( (void*)dB );
+    cudaFree( (void*)dC );
+    SUPERLU_FREE( handle );
+    SUPERLU_FREE( streams );
+    SUPERLU_FREE( stream_end_col );
+#else
+    SUPERLU_FREE (bigV);
+    SUPERLU_FREE (bigU);
+#endif
+
+    log_memory(-(bigv_size + bigu_size) * dword, stat);
+    // printf("Debug : MPI buffers 5\n");
+
+    SUPERLU_FREE (Llu->ujrow);
+    SUPERLU_FREE (tempv2d);
+    SUPERLU_FREE (indirect);
+    SUPERLU_FREE (indirect2); /* Sherry added */
+    SUPERLU_FREE (iuip);
+    SUPERLU_FREE (ruip);
+
+    ldt = sp_ienv_dist(3);
+    log_memory( -(3 * ldt *ldt * dword + 2 * ldt * num_threads * iword
+		  + 2 * k * iword), stat );
+
+    /* Sherry added */
+    SUPERLU_FREE(omp_loop_time);
+    SUPERLU_FREE(full_u_cols);
+    SUPERLU_FREE(blk_ldu);
+    log_memory(-2 * ncb * dword, stat);
+
+    SUPERLU_FREE(lookAheadFullRow);
+    SUPERLU_FREE(lookAheadStRow);
+    SUPERLU_FREE(lookAhead_lptr);
+    SUPERLU_FREE(lookAhead_ib);
+
+    SUPERLU_FREE(RemainFullRow);
+    SUPERLU_FREE(RemainStRow);
+    SUPERLU_FREE(Remain_lptr);
+    SUPERLU_FREE(Remain_ib);
+    SUPERLU_FREE(Remain_info);
+    SUPERLU_FREE(lookAhead_L_buff);
+    SUPERLU_FREE(Remain_L_buff);
+    log_memory( -(4 * mrb * iword + mrb * sizeof(Remain_info_t) + 
+		  ldt * ldt * (num_look_aheads + 1) * dword +
+		  Llu->bufmax[1] * dword), stat );
+
+    SUPERLU_FREE(Ublock_info);
+    SUPERLU_FREE(Ublock_info_iukp);
+    SUPERLU_FREE(Ublock_info_rukp);
+    SUPERLU_FREE(Ublock_info_jb);
+
+
+#if ( PROFlevel>=1 )
+    TIC (t1);
+#endif
+
+    /* Prepare error message - find the smallest index i that U(i,i)==0 */
+    if ( *info == 0 ) *info = n + 1;
+    MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid->comm);
+    if ( iinfo == n + 1 ) *info = 0;
+    else *info = iinfo;
+
+    // printf("test out\n");
+
+#if ( PROFlevel>=1 )
+    TOC (t2, t1);
+    stat->utime[COMM] += t2;
+    {
+        float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
+
+        MPI_Reduce (&msg_cnt, &msg_cnt_sum,
+                    1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
+        MPI_Reduce (&msg_cnt, &msg_cnt_max,
+                    1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
+        MPI_Reduce (&msg_vol, &msg_vol_sum,
+                    1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
+        MPI_Reduce (&msg_vol, &msg_vol_max,
+                    1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
+        if (!iam) {
+            printf ("\tPDGSTRF comm stat:"
+                    "\tAvg\tMax\t\tAvg\tMax\n"
+                    "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
+                    msg_cnt_sum / Pr / Pc, msg_cnt_max,
+                    msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6);
+        }
+    }
+#endif
+
+#if ( PRNTlevel==3 )
+    MPI_Allreduce (&zero_msg, &iinfo, 1, MPI_INT, MPI_SUM, grid->comm);
+    if (!iam)
+        printf (".. # msg of zero size\t%d\n", iinfo);
+    MPI_Allreduce (&total_msg, &iinfo, 1, MPI_INT, MPI_SUM, grid->comm);
+    if (!iam)
+        printf (".. # total msg\t%d\n", iinfo);
+#endif
+
+#if ( DEBUGlevel>=2 )
+    for (i = 0; i < Pr * Pc; ++i) {
+        if (iam == i) {
+            dPrintLblocks (iam, nsupers, grid, Glu_persist, Llu);
+            dPrintUblocks (iam, nsupers, grid, Glu_persist, Llu);
+            printf ("(%d)\n", iam);
+            PrintInt10 ("Recv", nsupers, Llu->ToRecv);
+        }
+        MPI_Barrier (grid->comm);
+    }
+#endif
+
+#if ( DEBUGlevel>=3 )
+    printf ("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
+#endif
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit pdgstrf()");
+#endif
+
+    return 0;
+} /* PDGSTRF */
+
diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c
new file mode 100644
index 0000000..06f0f37
--- /dev/null
+++ b/SRC/pdgstrf2.c
@@ -0,0 +1,375 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Performs panel LU factorization.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ *
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/* Panel factorization of block column k; the off-diagonal rows of the panel are computed with dtrsm_. */
+void
+pdgstrf2_trsm
+    (superlu_dist_options_t * options, int_t k0, int_t k, double thresh,
+     Glu_persist_t * Glu_persist, gridinfo_t * grid, LocalLU_t * Llu,
+     MPI_Request * U_diag_blk_send_req, int tag_ub,
+     SuperLUStat_t * stat, int *info)
+{
+    /* printf("entering pdgstrf2 %d \n", grid->iam); */
+    int cols_left, iam, l, pkk, pr;
+    int incx = 1, incy = 1;
+
+    int nsupr;                  /* number of rows in the block (LDA) */
+    int nsupc;                /* number of columns in the block */
+    int luptr;                  /* offset in lusup[] of the current diagonal entry */
+    int_t i, myrow, krow, j, jfst, jlst, u_diag_cnt;
+    int_t *xsup = Glu_persist->xsup;  /* presumably referenced by the FstBlockC/SuperSize macros - TODO confirm */
+    double *lusup, temp;
+    double *ujrow, *ublk_ptr;   /* pointer to the U block */
+    double alpha = -1, zero = 0.0;  /* alpha = -1 scales the dger_ rank-1 update */
+    int_t Pr;
+    MPI_Status status;
+    MPI_Comm comm = (grid->cscp).comm;  /* communicator used to ship the U block */
+
+    /* Initialization. */
+    iam = grid->iam;
+    Pr = grid->nprow;
+    myrow = MYROW (iam, grid);
+    krow = PROW (k, grid);
+    pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);  /* the diagonal process for block k */
+    j = LBj (k, grid);          /* Local block number */
+    jfst = FstBlockC (k);
+    jlst = FstBlockC (k + 1);
+    lusup = Llu->Lnzval_bc_ptr[j];
+    nsupc = SuperSize (k);
+    if (Llu->Lrowind_bc_ptr[j])
+        nsupr = Llu->Lrowind_bc_ptr[j][1];
+    else
+        nsupr = 0;              /* no local index array for this block column */
+#ifdef PI_DEBUG
+    printf ("rank %d  Iter %d  k=%d \t dtrsm nsuper %d \n",
+            iam, k0, k, nsupr);
+#endif
+    ublk_ptr = ujrow = Llu->ujrow;  /* both start at the beginning of the U block buffer */
+
+    luptr = 0;                  /* Point to the diagonal entries. */
+    cols_left = nsupc;          /* supernode size */
+    int ld_ujrow = nsupc;       /* leading dimension of ujrow */
+    u_diag_cnt = 0;             /* only read by the disabled (#if 0) packed copy below */
+    incy = ld_ujrow;            /* stride between consecutive entries of one row of ujrow[] */
+
+    if ( U_diag_blk_send_req && 
+	 U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) {
+        /* There are pending sends - wait for all Isend to complete */
+        for (pr = 0; pr < Pr; ++pr)
+            if (pr != myrow) {
+                MPI_Wait (U_diag_blk_send_req + pr, &status);
+            }
+
+	/* flag no more outstanding send request. */
+	U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL;
+    }
+
+    if (iam == pkk) {            /* diagonal process */
+        for (j = 0; j < jlst - jfst; ++j) {  /* for each column in panel */
+            /* Diagonal pivot */
+            i = luptr;
+            /* Not to replace zero pivot.  */
+            if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0 )  {
+                if (fabs (lusup[i]) < thresh) {  /* Diagonal */
+
+#if ( PRNTlevel>=2 )
+                    printf ("(%d) .. col %d, tiny pivot %e  ",
+                            iam, jfst + j, lusup[i]);
+#endif
+                    /* Keep the new diagonal entry with the same sign. */
+                    if (lusup[i] < 0)  lusup[i] = -thresh;
+                    else  lusup[i] = thresh;
+#if ( PRNTlevel>=2 )
+                    printf ("replaced by %e\n", lusup[i]);
+#endif
+                    ++(stat->TinyPivots);
+                }
+            }
+
+#if 0
+            for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt)
+                 ublk_ptr[u_diag_cnt] = lusup[i]; /* copy one row of U */
+#endif
+
+            /* storing U in full form: row j of the diagonal block is written with stride ld_ujrow */
+            int st;
+            for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) {
+                st = j * ld_ujrow + j;  /* position of entry (j,j) inside ublk_ptr[] */
+                ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */
+            }
+
+            if ( ujrow[0] == zero ) { /* Test for singularity. */
+                *info = j + jfst + 1;  /* 1-based column index of the zero pivot; later zero pivots overwrite it */
+            } else {              /* Scale the j-th column within diag. block. */
+                temp = 1.0 / ujrow[0];
+                for (i = luptr + 1; i < luptr - j + nsupc; ++i)
+		    lusup[i] *= temp;
+                stat->ops[FACT] += nsupc - j - 1;
+            }
+
+            /* Rank-1 update of the trailing submatrix within diag. block. */
+            if (--cols_left) {
+                /* l = nsupr - j - 1;  */
+                l = nsupc - j - 1;  /* Piyush */
+                dger_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx,
+                       &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1],
+                       &nsupr);
+                stat->ops[FACT] += 2 * l * cols_left;
+            }
+
+            /* ujrow = ublk_ptr + u_diag_cnt;  */
+            ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */
+            luptr += nsupr + 1; /* move to next column */
+
+        }                       /* for column j ...  first loop */
+
+	/* ++++++++++second step: send the U block, then solve for the rows below the diagonal block ====== */
+
+        ublk_ptr = ujrow = Llu->ujrow;  /* rewind to the start of the U block */
+
+        if (U_diag_blk_send_req && iam == pkk)  { /* Send the U block (iam == pkk already holds in this branch) */
+            /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+            for (pr = 0; pr < Pr; ++pr)
+                if (pr != krow) {
+                    /* tag = ((k0<<2)+2) % tag_ub;        */
+                    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+                    MPI_Isend (ublk_ptr, nsupc * nsupc, MPI_DOUBLE, pr,
+                               SLU_MPI_TAG (4, k0) /* tag */ ,
+                               comm, U_diag_blk_send_req + pr);
+
+                }
+
+	    /* flag outstanding Isend: the krow slot is a marker, not a real request, and is never passed to MPI_Wait */
+            U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* Sherry */
+        }
+
+        /* pragma below would be changed by an MKL call */
+
+        char uplo = 'u', side = 'r', transa = 'n', diag = 'n';  /* right side, upper, no transpose, non-unit diagonal */
+
+        l = nsupr - nsupc;      /* number of panel rows below the diagonal block */
+        // n = nsupc;
+        double alpha = 1.0;     /* shadows the outer alpha (-1) that was used by dger_ */
+#ifdef PI_DEBUG
+        printf ("calling dtrsm\n");
+        printf ("dtrsm diagonal param 11:  %d \n", nsupr);
+#endif
+
+#if defined (USE_VENDOR_BLAS)
+        dtrsm_ (&side, &uplo, &transa, &diag,
+                &l, &nsupc,
+                &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr,
+		1, 1, 1, 1);    /* presumably the hidden Fortran character-length arguments - TODO confirm */
+#else
+        dtrsm_ (&side, &uplo, &transa, &diag,
+                &l, &nsupc,
+                &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr);
+#endif
+
+    } else {  /* non-diagonal process */
+        /* ================================================ *
+         * Receive the diagonal block of U                  *
+         * for panel factorization of L(:,k)                *
+         * note: we block for panel factorization of L(:,k) *
+         * but panel factorization of U(:,k) don't          *
+         * ================================================ */
+
+        /* tag = ((k0<<2)+2) % tag_ub;        */
+        /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+        // printf("hello message receiving%d %d\n",(nsupc*(nsupc+1))>>1,SLU_MPI_TAG(4,k0));
+        MPI_Recv (ublk_ptr, (nsupc * nsupc), MPI_DOUBLE, krow,
+                  SLU_MPI_TAG (4, k0) /* tag */ ,
+                  comm, &status);
+        if (nsupr > 0) {        /* nsupr is 0 when this process has no index array for the block column */
+            char uplo = 'u', side = 'r', transa = 'n', diag = 'n';  /* same dtrsm_ options as on the diagonal process */
+            double alpha = 1.0;
+
+#ifdef PI_DEBUG
+            printf ("dtrsm non diagonal param 11:  %d \n", nsupr);
+            if (!lusup)
+                printf (" Rank :%d \t Empty block column occured :\n", iam);
+#endif
+#if defined (USE_VENDOR_BLAS)
+            dtrsm_ (&side, &uplo, &transa, &diag,
+                    &nsupr, &nsupc,
+                    &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1);
+#else
+            dtrsm_ (&side, &uplo, &transa, &diag,
+                    &nsupr, &nsupc,
+                    &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr);
+#endif
+        }
+
+    }                           /* end if pkk ... */
+
+    /* printf("exiting pdgstrf2 %d \n", grid->iam);  */
+
+}  /* PDGSTRF2_trsm */
+
+
+/* Triangular solves on block row U(k,:): every nonzero segment is solved in place with dtrsv_ (lower, no-transpose, unit diagonal) against L(k,k). */
+void pdgstrs2_omp
+/* k0 selects the look-ahead buffer slot (k0 % (1 + stat->num_look_aheads)); k is the block row; stat->ops[FACT] is updated by thread 0 only. */
+(int_t k0, int_t k, Glu_persist_t * Glu_persist,
+ gridinfo_t * grid, LocalLU_t * Llu, SuperLUStat_t * stat)
+{
+#ifdef PI_DEBUG
+    printf("====Entering pdgstrs2==== \n");
+#endif
+    int iam, pkk;
+    int incx = 1;
+    int nsupr;                /* number of rows in the block L(:,k) (LDA) */
+    int segsize;
+    int nsupc;                /* number of columns in the block */
+    int_t luptr, iukp, rukp;  /* luptr: offset in lusup[]; iukp/rukp: cursors into usub[]/uval[] */
+    int_t b, gb, j, klst, knsupc, lk, nb;
+    int_t *xsup = Glu_persist->xsup;  /* presumably referenced by the FstBlockC/SuperSize macros - TODO confirm */
+    int_t *usub;
+    double *lusup, *uval;
+
+#ifdef _OPENMP
+    int thread_id = omp_get_thread_num ();
+    int num_thread = omp_get_num_threads ();
+#else
+    int thread_id = 0;
+    int num_thread = 1;
+#endif
+
+    /* Quick return. */
+    lk = LBi (k, grid);         /* Local block number */
+    if (!Llu->Unzval_br_ptr[lk]) return;
+
+    /* Initialization. */
+    iam = grid->iam;
+    pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int k_row_cycle = k / grid->nprow;  /* for which cycle k exist (to assign rowwise thread blocking) */
+    int gb_col_cycle;  /* cycle through block columns  */
+    klst = FstBlockC (k + 1);
+    knsupc = SuperSize (k);
+    usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+    uval = Llu->Unzval_br_ptr[lk];
+    nb = usub[0];               /* number of blocks to visit in this block row */
+    iukp = BR_HEADER;
+    rukp = 0;
+    if (iam == pkk) {           /* L(k,k) is taken from the local block column */
+        lk = LBj (k, grid);
+        nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
+        lusup = Llu->Lnzval_bc_ptr[lk];
+    } else {                    /* otherwise L(k,k) is taken from the look-ahead buffers */
+        nsupr = Llu->Lsub_buf_2[k0 % (1 + stat->num_look_aheads)][1];   /* LDA of lusup[] */
+        lusup = Llu->Lval_buf_2[k0 % (1 + stat->num_look_aheads)];
+    }
+
+    /* Loop through all the row blocks. */
+    for (b = 0; b < nb; ++b)  {
+        /* assuming column cyclic distribution of data among threads */
+        gb = usub[iukp];
+        gb_col_cycle = gb / grid->npcol;
+        nsupc = SuperSize (gb);
+        iukp += UB_DESCRIPTOR;
+
+        /* Loop through all the segments in the block. */
+        for (j = 0; j < nsupc; ++j) {
+            segsize = klst - usub[iukp++];
+#ifdef PI_DEBUG  /* traced after the assignment so that segsize is never read uninitialized */
+            printf("segsize %d klst %d usub[%d] : %d",segsize,(int)klst ,(int)(iukp-1),(int)usub[iukp-1]);
+#endif 
+            if (segsize) {    /* Nonzero segment. */
+                luptr = (knsupc - segsize) * (nsupr + 1);  /* diagonal entry at row/column knsupc - segsize */
+
+		/* if gb belongs to present thread then do the factorize */
+                if ((gb_col_cycle + k_row_cycle + 1) % num_thread == thread_id) {
+#ifdef PI_DEBUG
+                    printf ("dtrsv param 4 %d param 6 %d\n", segsize, nsupr);
+#endif
+#if defined (USE_VENDOR_BLAS)
+                    dtrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
+                            &uval[rukp], &incx, 1, 1, 1);
+#else
+                    dtrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
+                            &uval[rukp], &incx);
+#endif
+                }
+
+                if (thread_id == 0)
+                    stat->ops[FACT] += segsize * (segsize + 1); // master thread updated the stats
+                rukp += segsize;  /* every thread advances the cursor, including for segments it skipped */
+            }
+        }
+    }                           /* for b ... */
+
+} /* PDGSTRS2_omp */
+
diff --git a/SRC/pdgstrf_X1.c b/SRC/pdgstrf_X1.c
new file mode 100644
index 0000000..e02b74a
--- /dev/null
+++ b/SRC/pdgstrf_X1.c
@@ -0,0 +1,1347 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Performs the LU factorization in parallel
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *
+ * Sketch of the algorithm
+ * =======================
+ *
+ * The following relations hold:
+ *     * A_kk = L_kk * U_kk
+ *     * L_ik = Aik * U_kk^(-1)
+ *     * U_kj = L_kk^(-1) * A_kj
+ *
+ *              ----------------------------------
+ *              |   |                            |
+ *              ----|-----------------------------
+ *              |   | \ U_kk|                    |
+ *              |   |   \   |        U_kj        |
+ *              |   |L_kk \ |         ||         |
+ *              ----|-------|---------||----------
+ *              |   |       |         \/         |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   | L_ik ==>       A_ij        |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              ----------------------------------
+ *
+ * Handle the first block of columns separately.
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity. ( pdgstrf2(0), one column at a time )
+ *     * Compute block row of U
+ *     * Update trailing matrix
+ * 
+ * Loop over the remaining blocks of columns.
+ *   mycol = MYCOL( iam, grid );
+ *   myrow = MYROW( iam, grid );
+ *   N = nsupers;
+ *   For (k = 1; k < N; ++k) {
+ *       krow = PROW( k, grid );
+ *       kcol = PCOL( k, grid );
+ *       Pkk = PNUM( krow, kcol, grid );
+ *
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity.
+ *       if ( mycol == kcol ) {
+ *           pdgstrf2(k), one column at a time 
+ *       }
+ *
+ *     * Parallel triangular solve
+ *       if ( iam == Pkk ) multicast L_k,k to this process row;
+ *       if ( myrow == krow && mycol != kcol ) {
+ *          Recv L_k,k from process Pkk;
+ *          for (j = k+1; j < N; ++j) 
+ *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
+ *                 U_k,j = L_k,k \ A_k,j;
+ *       }
+ *
+ *     * Parallel rank-k update
+ *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
+ *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
+ *       if ( myrow != krow ) {
+ *          Pkj = PNUM( krow, mycol, grid );
+ *          Recv U_k,k+1:N from process Pkj;
+ *       }
+ *       if ( mycol != kcol ) {
+ *          Pik = PNUM( myrow, kcol, grid );
+ *          Recv L_k+1:N,k from process Pik;
+ *       }
+ *       for (j = k+1; j < N; ++j) {
+ *          for (i = k+1; i < N; ++i) 
+ *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+ *                   && L_i,k != 0 && U_k,j != 0 )
+ *                 A_i,j = A_i,j - L_i,k * U_k,j;
+ *       }
+ *  }
+ *
+ *
+ * Remaining issues
+ *   (1) Use local indices for L subscripts and SPA.  [DONE]
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+#define CRAY_X1
+#if ( VAMPIR>=1 )
+#include <VT.h>
+#endif
+
+/*
+ * Internal prototypes
+ */
+static void pdgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *,
+		     gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *);
+#ifdef _CRAY
+static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
+		     LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);
+#else
+static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
+		     LocalLU_t *, SuperLUStat_t *);
+#endif
+
+/* 
+ 
+ *
+ */
+/************************************************************************/
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ *  PDGSTRF performs the LU factorization in parallel.
+ *
+ * Arguments
+ * =========
+ * 
+ * options (input) superlu_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) LocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * </pre>
+ */
+void pdgstrf
+/************************************************************************/
+(
+ superlu_options_t *options, int m, int n, double anorm,
+ LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info
+ )
+
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd("N", strlen("N"));
+    _fcd ftcs1 = _cptofcd("L", strlen("L"));
+    _fcd ftcs2 = _cptofcd("N", strlen("N"));
+    _fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+    double alpha = 1.0, beta = 0.0;
+    int_t *xsup;
+    int_t *lsub, *lsub1, *usub, *Usub_buf,
+          *Lsub_buf_2[2];  /* Need 2 buffers to implement Irecv. */
+    double *lusup, *lusup1, *uval, *Uval_buf,
+           *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */
+    int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
+          lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
+          nlb, nub, nsupc, rel, rukp;
+    int_t Pc, Pr;
+    int   iam, kcol, krow, mycol, myrow, pi, pj;
+    int   j, k, lk, nsupers;
+    int   nsupr, nbrow, segsize;
+    int   msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
+		      *     0 : transferred in Lsub_buf[]
+		      *     1 : transferred in Lval_buf[]
+		      *     2 : transferred in Usub_buf[] 
+		      *     3 : transferred in Uval_buf[]
+		      */
+    int_t  msg0, msg2;
+    int_t  **Ufstnz_br_ptr, **Lrowind_bc_ptr;
+    double **Unzval_br_ptr, **Lnzval_bc_ptr;
+    int_t  *index;
+    double *nzval;
+    int_t  *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
+    double *ucol;
+    int_t  *indirect;
+    double *tempv, *tempv2d;
+    int_t iinfo;
+    int_t *ToRecv, *ToSendD, **ToSendR;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    superlu_scope_t *scp;
+    double s_eps, thresh;
+    double *tempU2d, *tempu;
+    int    full, ldt, ldu, lead_zero, ncols;
+    MPI_Request recv_req[4], *send_req;
+    MPI_Status status;
+#ifdef CRAY_X1
+    int nonzero_segs;
+#endif
+#if ( DEBUGlevel>=2 ) 
+    int_t num_copy=0, num_update=0;
+#endif
+#if ( PRNTlevel==3 )
+    int_t  zero_msg = 0, total_msg = 0;
+#endif
+#if ( PROFlevel>=1 )
+    double t1, t2;
+    float msg_vol = 0, msg_cnt = 0;
+    int_t iword = sizeof(int_t), dword = sizeof(double);
+#endif
+
+    /* Test the input parameters. */
+    *info = 0;
+    if ( m < 0 ) *info = -2;
+    else if ( n < 0 ) *info = -3;
+    if ( *info ) {
+	pxerbla("pdgstrf", grid, -*info);
+	return;
+    }
+
+    /* Quick return if possible. */
+    if ( m == 0 || n == 0 ) return;
+
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    s_eps = slamch_("Epsilon");
+    thresh = s_eps * anorm;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pdgstrf()");
+#endif
+
+    stat->ops[FACT] = 0.0;
+
+    if ( Pr*Pc > 1 ) {
+	i = Llu->bufmax[0];
+	if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) )
+	    ABORT("Malloc fails for Lsub_buf.");
+	Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i;
+	i = Llu->bufmax[1];
+	if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) )
+	    ABORT("Malloc fails for Lval_buf[].");
+	Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i;
+	if ( Llu->bufmax[2] != 0 ) 
+	    if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) )
+		ABORT("Malloc fails for Usub_buf[].");
+	if ( Llu->bufmax[3] != 0 ) 
+	    if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) )
+		ABORT("Malloc fails for Uval_buf[].");
+	if ( !(send_req =
+	       (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request))))
+	    ABORT("Malloc fails for send_req[].");
+    }
+    if ( !(Llu->ujrow = doubleMalloc_dist(sp_ienv_dist(3))) )
+	ABORT("Malloc fails for ujrow[].");
+
+#if ( PRNTlevel>=1 )
+    if ( !iam ) {
+	printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh);
+	printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n",
+	       Llu->bufmax[0], Llu->bufmax[1], 
+	       Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]);
+    }
+#endif
+
+    Lsub_buf_2[0] = Llu->Lsub_buf_2[0];
+    Lsub_buf_2[1] = Llu->Lsub_buf_2[1];
+    Lval_buf_2[0] = Llu->Lval_buf_2[0];
+    Lval_buf_2[1] = Llu->Lval_buf_2[1];
+    Usub_buf = Llu->Usub_buf;
+    Uval_buf = Llu->Uval_buf;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    Unzval_br_ptr = Llu->Unzval_br_ptr;
+    ToRecv = Llu->ToRecv;
+    ToSendD = Llu->ToSendD;
+    ToSendR = Llu->ToSendR;
+
+    ldt = sp_ienv_dist(3); /* Size of maximum supernode */
+    if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) )
+	ABORT("Calloc fails for tempv2d[].");
+    tempU2d = tempv2d + ldt*ldt;
+#ifdef CRAY_X1
+    if ( !(indirect = intMalloc_dist(2*ldt)) )
+	ABORT("Malloc fails for indirect[].");
+#else
+    if ( !(indirect = intMalloc_dist(ldt)) )
+	ABORT("Malloc fails for indirect[].");
+#endif
+    k = CEILING( nsupers, Pr ); /* Number of local block rows */
+    if ( !(iuip = intMalloc_dist(k)) )
+	ABORT("Malloc fails for iuip[].");
+    if ( !(ruip = intMalloc_dist(k)) )
+	ABORT("Malloc fails for ruip[].");
+
+#if ( VAMPIR>=1 )
+    VT_symdef(1, "Send-L", "Comm");
+    VT_symdef(2, "Recv-L", "Comm");
+    VT_symdef(3, "Send-U", "Comm");
+    VT_symdef(4, "Recv-U", "Comm");
+    VT_symdef(5, "TRF2", "Factor");
+    VT_symdef(100, "Factor", "Factor");
+    VT_begin(100);
+    VT_traceon();
+#endif
+
+    /* ---------------------------------------------------------------
+       Handle the first block column separately to start the pipeline.
+       --------------------------------------------------------------- */
+    if ( mycol == 0 ) {
+#if ( VAMPIR>=1 )
+	VT_begin(5);
+#endif
+	pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info);
+#if ( VAMPIR>=1 )
+	VT_end(5);
+#endif
+
+	scp = &grid->rscp; /* The scope of process row. */
+
+	/* Process column *kcol* multicasts numeric values of L(:,k) 
+	   to process rows. */
+	lsub = Lrowind_bc_ptr[0];
+	lusup = Lnzval_bc_ptr[0];
+	if ( lsub ) {
+	    msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR;
+	    msgcnt[1] = lsub[1] * SuperSize( 0 );
+	} else {
+	    msgcnt[0] = msgcnt[1] = 0;
+	}
+	
+	for (pj = 0; pj < Pc; ++pj) {
+	    if ( ToSendR[0][pj] != EMPTY ) {
+#if ( PROFlevel>=1 )
+		TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+		VT_begin(1);
+#endif
+		MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm,
+			  &send_req[pj] );
+		MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm,
+			  &send_req[pj+Pc] );
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
+		       iam, 0, msgcnt[0], msgcnt[1], pj);
+#endif
+#if ( VAMPIR>=1 )
+		VT_end(1);
+#endif
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[COMM] += t2;
+		msg_cnt += 2;
+		msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
+#endif
+	    }
+	} /* for pj ... */
+    } else { /* Post immediate receives. */
+	if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */
+	    scp = &grid->rscp; /* The scope of process row. */
+	    MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0,
+		      0, scp->comm, &recv_req[0] );
+	    MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0,
+		      1, scp->comm, &recv_req[1] );
+#if ( DEBUGlevel>=2 )
+	    printf("(%d) Post Irecv L(:,%4d)\n", iam, 0);
+#endif
+	}
+    } /* if mycol == 0 */
+
+    /* ------------------------------------------
+       MAIN LOOP: Loop through all block columns.
+       ------------------------------------------ */
+    for (k = 0; k < nsupers; ++k) {
+
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+
+	if ( mycol == kcol ) {
+	    lk = LBj( k, grid ); /* Local block number. */
+
+	    for (pj = 0; pj < Pc; ++pj) {
+                /* Wait for Isend to complete before using lsub/lusup. */
+		if ( ToSendR[lk][pj] != EMPTY ) {
+		    MPI_Wait( &send_req[pj], &status );
+		    MPI_Wait( &send_req[pj+Pc], &status );
+		}
+	    }
+	    lsub = Lrowind_bc_ptr[lk];
+	    lusup = Lnzval_bc_ptr[lk];
+	} else {
+	    if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */
+		scp = &grid->rscp; /* The scope of process row. */
+#if ( PROFlevel>=1 )
+		TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+		VT_begin(2);
+#endif
+		/*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, 
+		  Llu->bufmax[0]);*/
+		/*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, 
+			 (4*k)%NTAGS, scp->comm, &status );*/
+		MPI_Wait( &recv_req[0], &status );
+		MPI_Get_count( &status, mpi_int_t, &msgcnt[0] );
+		/*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, 
+		  Llu->bufmax[1]);*/
+		/*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, 
+			 (4*k+1)%NTAGS, scp->comm, &status );*/
+		MPI_Wait( &recv_req[1], &status );
+		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] );
+#if ( VAMPIR>=1 )
+		VT_end(2);
+#endif
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[COMM] += t2;
+#endif
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n",
+		       iam, k, msgcnt[0], msgcnt[1], kcol);
+		fflush(stdout);
+#endif
+		lsub = Lsub_buf_2[k%2];
+		lusup = Lval_buf_2[k%2];
+#if ( PRNTlevel==3 )
+		++total_msg;
+		if ( !msgcnt[0] ) ++zero_msg;
+#endif
+	    } else msgcnt[0] = 0;
+	} /* if mycol = Pc(k) */
+
+	scp = &grid->cscp; /* The scope of process column. */
+
+	if ( myrow == krow ) {
+	    /* Parallel triangular solve across process row *krow* --
+	       U(k,j) = L(k,k) \ A(k,j).  */
+#ifdef _CRAY
+	    pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3);
+#else
+	    pdgstrs2(n, k, Glu_persist, grid, Llu, stat);
+#endif
+
+	    /* Multicasts U(k,:) to process columns. */
+	    lk = LBi( k, grid );
+	    usub = Ufstnz_br_ptr[lk];
+	    uval = Unzval_br_ptr[lk];
+	    if ( usub )	{
+		msgcnt[2] = usub[2];
+		msgcnt[3] = usub[1];
+	    } else {
+		msgcnt[2] = msgcnt[3] = 0;
+	    }
+
+	    if ( ToSendD[lk] == YES ) {
+		for (pi = 0; pi < Pr; ++pi) {
+		    if ( pi != myrow ) {
+#if ( PROFlevel>=1 )
+			TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+			VT_begin(3);
+#endif
+			MPI_Send( usub, msgcnt[2], mpi_int_t, pi,
+				 (4*k+2)%NTAGS, scp->comm);
+			MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi,
+				 (4*k+3)%NTAGS, scp->comm);
+#if ( VAMPIR>=1 )
+			VT_end(3);
+#endif
+#if ( PROFlevel>=1 )
+			TOC(t2, t1);
+			stat->utime[COMM] += t2;
+			msg_cnt += 2;
+			msg_vol += msgcnt[2]*iword + msgcnt[3]*dword;
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi);
+#endif
+		    } /* if pi ... */
+		} /* for pi ... */
+	    } /* if ToSendD ... */
+	} else { /* myrow != krow */
+	    if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */
+#if ( PROFlevel>=1 )
+		TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+		VT_begin(4);
+#endif
+		/*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, 
+		  Llu->bufmax[2]);*/
+		MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+			 (4*k+2)%NTAGS, scp->comm, &status );
+		MPI_Get_count( &status, mpi_int_t, &msgcnt[2] );
+		/*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, 
+		  Llu->bufmax[3]);*/
+		MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, 
+			 (4*k+3)%NTAGS, scp->comm, &status );
+		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] );
+#if ( VAMPIR>=1 )
+		VT_end(4);
+#endif
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[COMM] += t2;
+#endif
+		usub = Usub_buf;
+		uval = Uval_buf;
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
+#endif
+#if ( PRNTlevel==3 )
+		++total_msg;
+		if ( !msgcnt[2] ) ++zero_msg;
+#endif
+	    } else msgcnt[2] = 0;
+	} /* if myrow == Pr(k) */
+	  
+	/* 
+	 * Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
+	 *  for (j = k+1; k < N; ++k) {
+	 *     for (i = k+1; i < N; ++i) 
+	 *         if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+	 *              && L(i,k) != 0 && U(k,j) != 0 )
+	 *             A(i,j) = A(i,j) - L(i,k) * U(k,j);
+	 */
+	msg0 = msgcnt[0];
+	msg2 = msgcnt[2];
+	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+	    nsupr = lsub[1]; /* LDA of lusup. */
+	    if ( myrow == krow ) { /* Skip diagonal block L(k,k). */
+		lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1];
+		luptr0 = knsupc;
+		nlb = lsub[0] - 1;
+	    } else {
+		lptr0 = BC_HEADER;
+		luptr0 = 0;
+		nlb = lsub[0];
+	    }
+	    lptr = lptr0;
+	    for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */
+		ib = lsub[lptr];
+		lib = LBi( ib, grid );
+		iuip[lib] = BR_HEADER;
+		ruip[lib] = 0;
+		lptr += LB_DESCRIPTOR + lsub[lptr+1];
+	    }
+	    nub = usub[0];    /* Number of blocks in the block row U(k,:) */
+	    iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */
+	    rukp = 0;         /* Pointer to nzval[] of U(k,:) */
+	    klst = FstBlockC( k+1 );
+	    
+	    /* ---------------------------------------------------
+	       Update the first block column A(:,k+1).
+	       --------------------------------------------------- */
+	    jb = usub[iukp];   /* Global block number of block U(k,j). */
+	    if ( jb == k+1 ) { /* First update (k+1)-th block. */
+		--nub;
+		lptr = lptr0;
+		luptr = luptr0;
+		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
+		nsupc = SuperSize( jb );
+		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+
+		/* Prepare to call DGEMM. */
+		jj = iukp;
+		while ( usub[jj] == klst ) ++jj;
+		ldu = klst - usub[jj++];
+		ncols = 1;
+		full = 1;
+		for (; jj < iukp+nsupc; ++jj) {
+		    segsize = klst - usub[jj];
+		    if ( segsize ) {
+		        ++ncols;
+			if ( segsize != ldu ) full = 0;
+		        if ( segsize > ldu ) ldu = segsize;
+		    }
+		}
+#if ( DEBUGlevel>=3 )
+		++num_update;
+#endif
+		if ( full ) {
+		    tempu = &uval[rukp];
+		} else { /* Copy block U(k,j) into tempU2d. */
+#if ( DEBUGlevel>=3 )
+		  printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+			 iam, full, k, jb, ldu, ncols, nsupc);
+		  ++num_copy;
+#endif
+		    tempu = tempU2d;
+		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
+		        segsize = klst - usub[jj];
+			if ( segsize ) {
+			    lead_zero = ldu - segsize;
+			    for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
+			    tempu += lead_zero;
+			    for (i = 0; i < segsize; ++i)
+				tempu[i] = uval[rukp+i];
+			    rukp += segsize;
+			    tempu += segsize;
+			}
+		    }
+		    tempu = tempU2d;
+		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+		} /* if full ... */
+
+		for (lb = 0; lb < nlb; ++lb) { 
+		    ib = lsub[lptr]; /* Row block L(i,k). */
+		    nbrow = lsub[lptr+1];  /* Number of full rows. */
+		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+		    tempv = tempv2d;
+#ifdef _CRAY
+		    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
+			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			  tempu, &ldu, &beta, tempv, &ldt);
+#else
+		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			   tempu, &ldu, &beta, tempv, &ldt);
+#endif
+		    stat->ops[FACT] += 2 * nbrow * ldu * ncols;
+
+		    /* Now gather the result into the destination block. */
+		    if ( ib < jb ) { /* A(i,j) is in U. */
+			ilst = FstBlockC( ib+1 );
+			lib = LBi( ib, grid );
+			index = Ufstnz_br_ptr[lib];
+			ijb = index[iuip[lib]];
+			while ( ijb < jb ) { /* Search for dest block. */
+			    ruip[lib] += index[iuip[lib]+1];
+			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
+			    ijb = index[iuip[lib]];
+			}
+			iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */
+
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    fnz = index[iuip[lib]++];
+			    if ( segsize ) { /* Nonzero segment in U(k.j). */
+				ucol = &Unzval_br_ptr[lib][ruip[lib]];
+				for (i = 0, it = 0; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    ucol[rel] -= tempv[it++];
+				}
+				tempv += ldt;
+			    }
+			    ruip[lib] += ilst - fnz;
+			}
+		    } else { /* A(i,j) is in L. */
+			index = Lrowind_bc_ptr[ljb];
+			ldv = index[1];   /* LDA of the dest lusup. */
+			lptrj = BC_HEADER;
+			luptrj = 0;
+			ijb = index[lptrj];
+			while ( ijb != ib ) { /* Search for dest block -- 
+						 blocks are not ordered! */
+			    luptrj += index[lptrj+1];
+			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
+			    ijb = index[lptrj];
+			}
+			/*
+			 * Build indirect table. This is needed because the
+			 * indices are not sorted.
+			 */
+			fnz = FstBlockC( ib );
+			lptrj += LB_DESCRIPTOR;
+			for (i = 0; i < index[lptrj-1]; ++i) {
+			    rel = index[lptrj + i] - fnz;
+			    indirect[rel] = i;
+			}
+			nzval = Lnzval_bc_ptr[ljb] + luptrj;
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    if ( segsize ) {
+/*#pragma _CRI cache_bypass nzval,tempv*/
+				for (it = 0, i = 0; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    nzval[indirect[rel]] -= tempv[it++];
+				}
+				tempv += ldt;
+			    }
+			    nzval += ldv;
+			}
+		    } /* if ib < jb ... */
+		    lptr += nbrow;
+		    luptr += nbrow;
+		} /* for lb ... */
+		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
+		iukp += nsupc;
+	    }  /* if jb == k+1 */
+	} /* if L(:,k) and U(k,:) not empty */
+
+
+	if ( k+1 < nsupers ) {
+	  kcol = PCOL( k+1, grid );
+	  if ( mycol == kcol ) {
+#if ( VAMPIR>=1 )
+	    VT_begin(5);
+#endif
+	    /* Factor diagonal and subdiagonal blocks and test for exact
+	       singularity.  */
+	    pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info);
+#if ( VAMPIR>=1 )
+	    VT_end(5);
+#endif
+
+	    /* Process column *kcol+1* multicasts numeric values of L(:,k+1) 
+	       to process rows. */
+	    lk = LBj( k+1, grid ); /* Local block number. */
+	    lsub1 = Lrowind_bc_ptr[lk];
+ 	    if ( lsub1 ) {
+		msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR;
+		msgcnt[1] = lsub1[1] * SuperSize( k+1 );
+	    } else {
+		msgcnt[0] = 0;
+		msgcnt[1] = 0;
+	    }
+	    scp = &grid->rscp; /* The scope of process row. */
+	    for (pj = 0; pj < Pc; ++pj) {
+		if ( ToSendR[lk][pj] != EMPTY ) {
+		    lusup1 = Lnzval_bc_ptr[lk];
+#if ( PROFlevel>=1 )
+		    TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+		    VT_begin(1);
+#endif
+		    MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj,
+			      (4*(k+1))%NTAGS, scp->comm, &send_req[pj] );
+		    MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj,
+			     (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] );
+#if ( VAMPIR>=1 )
+		    VT_end(1);
+#endif
+#if ( PROFlevel>=1 )
+		    TOC(t2, t1);
+		    stat->utime[COMM] += t2;
+		    msg_cnt += 2;
+		    msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
+#endif
+#if ( DEBUGlevel>=2 )
+		    printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
+			   iam, k+1, msgcnt[0], msgcnt[1], pj);
+#endif
+		}
+	    } /* for pj ... */
+	  } else { /* Post Recv of block column L(:,k+1). */
+	    if ( ToRecv[k+1] >= 1 ) {
+		scp = &grid->rscp; /* The scope of process row. */
+		MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol,
+			  (4*(k+1))%NTAGS, scp->comm, &recv_req[0]);
+		MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, 
+			  (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]);
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1);
+#endif
+	    }
+	  } /* if mycol == Pc(k+1) */
+        } /* if k+1 < nsupers */
+
+	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+	    /* ---------------------------------------------------
+	       Update all other blocks using block row U(k,:)
+	       --------------------------------------------------- */
+	    for (j = 0; j < nub; ++j) { 
+		lptr = lptr0;
+		luptr = luptr0;
+		jb = usub[iukp];  /* Global block number of block U(k,j). */
+		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
+		nsupc = SuperSize( jb );
+		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+
+		/* Prepare to call DGEMM. */
+		jj = iukp;
+		while ( usub[jj] == klst ) ++jj;
+		ldu = klst - usub[jj++];
+		ncols = 1;
+		full = 1;
+		for (; jj < iukp+nsupc; ++jj) {
+		    segsize = klst - usub[jj];
+		    if ( segsize ) {
+		        ++ncols;
+			if ( segsize != ldu ) full = 0;
+		        if ( segsize > ldu ) ldu = segsize;
+		    }
+		}
+#if ( DEBUGlevel>=3 )
+		printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+		       iam, full, k, jb, ldu, ncols, nsupc);
+		++num_update;
+#endif
+		if ( full ) {
+		    tempu = &uval[rukp];
+		} else { /* Copy block U(k,j) into tempU2d. */
+#if ( DEBUGlevel>=3 )
+		    ++num_copy;
+#endif
+		    tempu = tempU2d;
+		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
+		        segsize = klst - usub[jj];
+			if ( segsize ) {
+			    lead_zero = ldu - segsize;
+			    for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
+			    tempu += lead_zero;
+			    for (i = 0; i < segsize; ++i)
+			        tempu[i] = uval[rukp+i];
+			    rukp += segsize;
+			    tempu += segsize;
+			}
+		    }
+		    tempu = tempU2d;
+		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+		} /* if full ... */
+
+		for (lb = 0; lb < nlb; ++lb) { 
+		    ib = lsub[lptr];       /* Row block L(i,k). */
+		    nbrow = lsub[lptr+1];  /* Number of full rows. */
+		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+		    tempv = tempv2d;
+#ifdef _CRAY
+		    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
+			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			  tempu, &ldu, &beta, tempv, &ldt);
+#else
+		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			   tempu, &ldu, &beta, tempv, &ldt);
+#endif
+		    stat->ops[FACT] += 2 * nbrow * ldu * ncols;
+
+		    /* Now gather the result into the destination block. */
+		    if ( ib < jb ) { /* A(i,j) is in U. */
+			ilst = FstBlockC( ib+1 );
+			lib = LBi( ib, grid );
+			index = Ufstnz_br_ptr[lib];
+			ijb = index[iuip[lib]];
+			while ( ijb < jb ) { /* Search for dest block. */
+			    ruip[lib] += index[iuip[lib]+1];
+			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
+			    ijb = index[iuip[lib]];
+			}
+			/* Skip descriptor.  Now point to fstnz index of 
+			   block U(i,j). */
+			iuip[lib] += UB_DESCRIPTOR;
+
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    fnz = index[iuip[lib]++];
+			    if ( segsize ) { /* Nonzero segment in U(k.j). */
+				ucol = &Unzval_br_ptr[lib][ruip[lib]];
+				for (i = 0 ; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    ucol[rel] -= tempv[i];
+				}
+				tempv += ldt;
+			    }
+			    ruip[lib] += ilst - fnz;
+			}
+		    } else { /* A(i,j) is in L. */
+			index = Lrowind_bc_ptr[ljb];
+			ldv = index[1];   /* LDA of the dest lusup. */
+			lptrj = BC_HEADER;
+			luptrj = 0;
+			ijb = index[lptrj];
+			while ( ijb != ib ) { /* Search for dest block -- 
+						 blocks are not ordered! */
+			    luptrj += index[lptrj+1];
+			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
+			    ijb = index[lptrj];
+			}
+			/*
+			 * Build indirect table. This is needed because the
+			 * indices are not sorted for the L blocks.
+			 */
+			fnz = FstBlockC( ib );
+			lptrj += LB_DESCRIPTOR;
+			for (i = 0; i < index[lptrj-1]; ++i) {
+			    rel = index[lptrj + i] - fnz;
+			    indirect[rel] = i;
+			}
+			nzval = Lnzval_bc_ptr[ljb] + luptrj;
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    if ( segsize ) {
+/*#pragma _CRI cache_bypass nzval,tempv*/
+				for (i = 0; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    nzval[indirect[rel]] -= tempv[i];
+				}
+				tempv += ldt;
+			    }
+			    nzval += ldv;
+			}
+		    } /* if ib < jb ... */
+		    lptr += nbrow;
+		    luptr += nbrow;
+		} /* for lb ... */
+		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
+		iukp += nsupc;
+	    } /* for j ... */
+	} /* if  k L(:,k) and U(k,:) are not empty */
+
+    } 
+    /* ------------------------------------------
+       END MAIN LOOP: for k = ...
+       ------------------------------------------ */
+
+#if ( VAMPIR>=1 )
+    VT_end(100);
+    VT_traceoff();
+#endif
+
+    if ( Pr*Pc > 1 ) {
+	SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */
+	SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */
+	if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf);
+	if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf);
+	SUPERLU_FREE(send_req);
+    }
+
+    SUPERLU_FREE(Llu->ujrow);
+    SUPERLU_FREE(tempv2d);
+    SUPERLU_FREE(indirect);
+    SUPERLU_FREE(iuip);
+    SUPERLU_FREE(ruip);
+
+    /* Prepare error message. */
+    if ( *info == 0 ) *info = n + 1;
+#if ( PROFlevel>=1 )
+    TIC(t1);
+#endif
+    MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm );
+#if ( PROFlevel>=1 )
+    TOC(t2, t1);
+    stat->utime[COMM] += t2;
+    {
+	float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
+	
+	MPI_Reduce( &msg_cnt, &msg_cnt_sum,
+		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	MPI_Reduce( &msg_cnt, &msg_cnt_max,
+		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+	MPI_Reduce( &msg_vol, &msg_vol_sum,
+		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	MPI_Reduce( &msg_vol, &msg_vol_max,
+		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+	if ( !iam ) {
+	    printf("\tPDGSTRF comm stat:"
+		   "\tAvg\tMax\t\tAvg\tMax\n"
+		   "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
+		   msg_cnt_sum/Pr/Pc, msg_cnt_max,
+		   msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6);
+	}
+    }
+#endif
+    if ( iinfo == n + 1 ) *info = 0;
+    else *info = iinfo;
+
+
+#if ( PRNTlevel==3 )
+    MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
+    if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo);
+    MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
+    if ( !iam ) printf(".. # total msg\t%d\n", iinfo);
+#endif
+
+#if ( PRNTlevel==2 )
+    for (i = 0; i < Pr * Pc; ++i) {
+	if ( iam == i ) {
+	    dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
+	    dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
+	    printf("(%d)\n", iam);
+	    PrintInt10("Recv", nsupers, Llu->ToRecv);
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif
+
+#if ( DEBUGlevel>=3 )
+    printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
+#endif
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgstrf()");
+#endif
+} /* PDGSTRF */
+
+
+/************************************************************************/
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the process column that owns block column *k* participates
+ *   in the work.
+ * 
+ * Arguments
+ * =========
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * </pre>
+ */
+static void pdgstrf2
+/************************************************************************/
+(
+ superlu_options_t *options, /* only options->ReplaceTinyPivot is read here */
+ int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+ LocalLU_t *Llu, SuperLUStat_t *stat, int* info
+ )
+
+{
+    int    c, iam, l, pkk; /* c: columns not yet eliminated; pkk: diagonal process */
+    int    incx = 1, incy = 1;
+    int    nsupr; /* number of rows in the block (LDA) */
+    int    luptr; /* offset into lusup[] where the current column's work starts */
+    int_t  i, krow, j, jfst, jlst;
+    int_t  nsupc; /* number of columns in the block */
+    int_t  *xsup = Glu_persist->xsup;
+    double *lusup, temp;
+    double *ujrow; /* row j of the diagonal block, from the pivot rightwards */
+    double alpha = -1; /* scalar passed to the rank-1 update below */
+    *info = 0; /* NOTE(review): overwrites whatever the caller left in *info */
+
+    /* Quick return. (No quick-return test is implemented.) */
+
+    /* Initialization. */
+    iam   = grid->iam;
+    krow  = PROW( k, grid );
+    pkk   = PNUM( PROW(k, grid), PCOL(k, grid), grid );
+    j     = LBj( k, grid ); /* Local block number */
+    jfst  = FstBlockC( k );
+    jlst  = FstBlockC( k+1 );
+    lusup = Llu->Lnzval_bc_ptr[j];
+    nsupc = SuperSize( k );
+    if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1];
+    ujrow = Llu->ujrow;
+    /* NOTE(review): nsupr is left unset when Lrowind_bc_ptr[j] is NULL. */
+    luptr = 0; /* Point to the diagonal entries. */
+    c = nsupc;
+    for (j = 0; j < jlst - jfst; ++j) { /* j now indexes columns within block k */
+	/* Broadcast the j-th row (nsupc - j) elements to
+	   the process column. */
+	if ( iam == pkk ) { /* Diagonal process. */
+	    i = luptr;
+	    if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) {
+		if ( fabs(lusup[i]) < thresh ) { /* Diagonal */
+#if ( PRNTlevel>=2 )
+		    printf("(%d) .. col %d, tiny pivot %e  ",
+			   iam, jfst+j, lusup[i]);
+#endif
+		    /* Keep the replaced diagonal with the same sign. */
+		    if ( lusup[i] < 0 ) lusup[i] = -thresh;
+		    else lusup[i] = thresh;
+#if ( PRNTlevel>=2 )
+		    printf("replaced by %e\n", lusup[i]);
+#endif
+		    ++(stat->TinyPivots);
+		}
+	    }
+	    for (l = 0; l < c; ++l, i += nsupr)	ujrow[l] = lusup[i];
+	}
+#if 0
+	dbcast_col(ujrow, c, pkk, UjROW, grid, &c);
+#else
+	MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm);
+	/*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS,
+		   grid, COMM_COLUMN, &c);*/
+#endif
+
+#if ( DEBUGlevel>=2 )
+if ( k == 3329 && j == 2 ) { /* debug print for one hard-coded (k,j) pair */
+	if ( iam == pkk ) {
+	    printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]);
+	} else {
+	    printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]);
+	}
+}
+#endif
+
+	if ( !lusup ) { /* Empty block column. */
+	    --c;
+	    if ( ujrow[0] == 0.0 ) *info = j+jfst+1;
+	    continue;
+	}
+
+	/* Test for singularity. */
+	if ( ujrow[0] == 0.0 ) {
+	    *info = j+jfst+1; /* 1-based column index of the zero pivot */
+	} else {
+	    /* Scale the j-th column of the matrix by 1/ujrow[0] (the pivot). */
+	    temp = 1.0 / ujrow[0];
+	    if ( iam == pkk ) {
+		for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp;
+		stat->ops[FACT] += nsupr-j-1;
+	    } else {
+		for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp;
+		stat->ops[FACT] += nsupr;
+	    }
+	}
+	    
+	/* Rank-1 update of the trailing submatrix. */
+	if ( --c ) {
+	    if ( iam == pkk ) {
+		l = nsupr - j - 1; /* local rows below the diagonal entry */
+#ifdef _CRAY
+		SGER(&l, &c, &alpha, &lusup[luptr+1], &incx,
+		     &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
+#else
+		dger_(&l, &c, &alpha, &lusup[luptr+1], &incx,
+		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
+#endif
+		stat->ops[FACT] += 2 * l * c;
+	    } else {
+#ifdef _CRAY
+		SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
+		     &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
+#else
+		dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
+		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
+#endif
+		stat->ops[FACT] += 2 * nsupr * c;
+	    }
+	}
+	
+	/* Move to the next column (the diagonal process also steps down a row). */
+	if ( iam == pkk ) luptr += nsupr + 1;
+	else luptr += nsupr;
+
+    } /* for j ... */
+
+} /* PDGSTRF2 */
+
+
+/************************************************************************/
+/*! \brief
+ *
+ * <pre> 
+ * Purpose
+ * =======
+ *   Perform parallel triangular solves
+ *           U(k,:) := L(k,k) \ A(k,:). 
+ *   Only the process row that owns block row *k* participates
+ *   in the work.
+ * 
+ * Arguments
+ * =========
+ *
+ * m      (input) int (global)
+ *        Number of rows in the matrix.
+ *
+ * k      (input) int (global)
+ *        The row number of the block row to be factorized.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization; 
+ *        See SuperLUStat_t structure defined in util.h.
+ * </pre>
+ */
+static void pdgstrs2
+/************************************************************************/
+#ifdef _CRAY
+(
+ int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+ LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3
+ )
+#else
+(
+ int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+ LocalLU_t *Llu, SuperLUStat_t *stat
+ )
+#endif
+/* NOTE(review): the parameter m is never referenced in the body below. */
+{
+    int    iam, pkk; /* pkk: process at grid position (PROW(k), PCOL(k)) */
+    int    incx = 1;
+    int    nsupr; /* number of rows in the block L(:,k) (LDA) */
+    int    segsize; /* length of the current column segment of U(k,:) */
+    int_t  nsupc; /* number of columns in the block */
+    int_t  luptr, iukp, rukp; /* cursors into lusup[], usub[] and uval[] */
+    int_t  b, gb, j, klst, knsupc, lk, nb;
+    int_t  *xsup = Glu_persist->xsup;
+    int_t  *usub;
+    double *lusup, *uval;
+
+    /* Quick return when Unzval_br_ptr[] has no entry for block row k. */
+    lk = LBi( k, grid ); /* Local block number */
+    if ( !Llu->Unzval_br_ptr[lk] ) return;
+
+    /* Initialization. */
+    iam  = grid->iam;
+    pkk  = PNUM( PROW(k, grid), PCOL(k, grid), grid );
+    klst = FstBlockC( k+1 );
+    knsupc = SuperSize( k );
+    usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
+    uval = Llu->Unzval_br_ptr[lk];
+    nb = usub[0];
+    iukp = BR_HEADER;
+    rukp = 0;
+    if ( iam == pkk ) { /* Read L(:,k) from the local block-column storage. */
+	lk = LBj( k, grid );
+	nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
+	lusup = Llu->Lnzval_bc_ptr[lk];
+    } else { /* Read L(:,k) from the buffer pair, slot k%2. */
+	nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */
+	lusup = Llu->Lval_buf_2[k%2];
+    }
+
+    /* Loop through all the row blocks. */
+    for (b = 0; b < nb; ++b) {
+	gb = usub[iukp];
+	nsupc = SuperSize( gb );
+	iukp += UB_DESCRIPTOR;
+
+	/* Loop through all the segments in the block. */
+	for (j = 0; j < nsupc; ++j) {
+	    segsize = klst - usub[iukp++]; 
+	    if ( segsize ) { /* Nonzero segment: triangular solve on uval[rukp..]. */
+		luptr = (knsupc - segsize) * (nsupr + 1); /* entry (c,c), c = knsupc-segsize */
+#ifdef _CRAY
+		STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, 
+		      &uval[rukp], &incx);
+#else
+		dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, 
+		       &uval[rukp], &incx);
+#endif
+		stat->ops[FACT] += segsize * (segsize + 1);
+		rukp += segsize;
+	    }
+	}
+    } /* for b ... */
+
+} /* PDGSTRS2 */
+
+static int /* Always returns 0; calls exit(-1) if the message is too large. */
+probe_recv(int iam, int source, int tag, MPI_Datatype datatype, MPI_Comm comm,
+	   int buf_size) /* buf_size: buffer capacity, counted in datatype elements */
+{
+    MPI_Status status;
+    int count; /* Number of datatype elements in the probed message. */
+    /* Probe the message from (source, tag) on comm and read its element count. */
+    MPI_Probe( source, tag, comm, &status );
+    MPI_Get_count( &status, datatype, &count );
+    if ( count > buf_size ) { /* Message is larger than the stated buffer size. */
+        printf("(%d) Recv'ed count %d > buffer size %d\n",
+	       iam, count, buf_size);
+	exit(-1);
+    }
+    return 0;
+}
diff --git a/SRC/pdgstrf_irecv.c b/SRC/pdgstrf_irecv.c
new file mode 100644
index 0000000..ff65da0
--- /dev/null
+++ b/SRC/pdgstrf_irecv.c
@@ -0,0 +1,1345 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Performs LU factorization in parallel
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *
+ *
+ * Sketch of the algorithm
+ * =======================
+ *
+ * The following relations hold:
+ *     * A_kk = L_kk * U_kk
+ *     * L_ik = A_ik * U_kk^(-1)
+ *     * U_kj = L_kk^(-1) * A_kj
+ *
+ *              ----------------------------------
+ *              |   |                            |
+ *              ----|-----------------------------
+ *              |   | \ U_kk|                    |
+ *              |   |   \   |        U_kj        |
+ *              |   |L_kk \ |         ||         |
+ *              ----|-------|---------||----------
+ *              |   |       |         \/         |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   | L_ik ==>       A_ij        |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              ----------------------------------
+ *
+ * Handle the first block of columns separately.
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity. ( pdgstrf2(0), one column at a time )
+ *     * Compute block row of U
+ *     * Update trailing matrix
+ * 
+ * Loop over the remaining blocks of columns.
+ *   mycol = MYCOL( iam, grid );
+ *   myrow = MYROW( iam, grid );
+ *   N = nsupers;
+ *   For (k = 1; k < N; ++k) {
+ *       krow = PROW( k, grid );
+ *       kcol = PCOL( k, grid );
+ *       Pkk = PNUM( krow, kcol, grid );
+ *
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity.
+ *       if ( mycol == kcol ) {
+ *           pdgstrf2(k), one column at a time 
+ *       }
+ *
+ *     * Parallel triangular solve
+ *       if ( iam == Pkk ) multicast L_k,k to this process row;
+ *       if ( myrow == krow && mycol != kcol ) {
+ *          Recv L_k,k from process Pkk;
+ *          for (j = k+1; j < N; ++j) 
+ *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
+ *                 U_k,j = L_k,k \ A_k,j;
+ *       }
+ *
+ *     * Parallel rank-k update
+ *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
+ *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
+ *       if ( myrow != krow ) {
+ *          Pkj = PNUM( krow, mycol, grid );
+ *          Recv U_k,k+1:N from process Pkj;
+ *       }
+ *       if ( mycol != kcol ) {
+ *          Pik = PNUM( myrow, kcol, grid );
+ *          Recv L_k+1:N,k from process Pik;
+ *       }
+ *       for (j = k+1; j < N; ++j) {
+ *          for (i = k+1; i < N; ++i) 
+ *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+ *                   && L_i,k != 0 && U_k,j != 0 )
+ *                 A_i,j = A_i,j - L_i,k * U_k,j;
+ *       }
+ *  }
+ *
+ *
+ * Remaining issues
+ *   (1) Use local indices for L subscripts and SPA.  [DONE]
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+#if ( VAMPIR>=1 )
+#include <VT.h>
+#endif
+
+/*
+ * Internal prototypes
+ */
+static void pdgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *,
+		     gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *);
+#ifdef _CRAY
+static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
+		     LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);
+#else
+static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
+		     LocalLU_t *, SuperLUStat_t *);
+#endif
+
+/************************************************************************/
+
+/*! \brief
+ * 
+ * <pre>
+ * Purpose
+ * =======
+ *
+ *  PDGSTRF performs the LU factorization in parallel.
+ *
+ * Arguments
+ * =========
+ * 
+ * options (input) superlu_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) LocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * </pre>
+ */
+int_t pdgstrf
+/* Returns 0 on normal completion, -1 if m or n is negative (see *info). */
+(
+ superlu_options_t *options, int m, int n, double anorm,
+ LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info
+ )
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd("N", strlen("N"));
+    _fcd ftcs1 = _cptofcd("L", strlen("L"));
+    _fcd ftcs2 = _cptofcd("N", strlen("N"));
+    _fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+    double alpha = 1.0, beta = 0.0;
+    int_t *xsup;
+    int_t *lsub, *lsub1, *usub, *Usub_buf,
+          *Lsub_buf_2[2];  /* Need 2 buffers to implement Irecv. */
+    double *lusup, *lusup1, *uval, *Uval_buf,
+           *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */
+    int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
+          lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
+          nlb, nub, nsupc, rel, rukp;
+    int_t Pc, Pr;
+    int   iam, kcol, krow, mycol, myrow, pi, pj;
+    int   j, k, lk, nsupers;
+    int   nsupr, nbrow, segsize;
+    int   msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
+		      *     0 : transferred in Lsub_buf[]
+		      *     1 : transferred in Lval_buf[]
+		      *     2 : transferred in Usub_buf[]
+		      *     3 : transferred in Uval_buf[]
+		      */
+    int_t  msg0, msg2;
+    int_t  **Ufstnz_br_ptr, **Lrowind_bc_ptr;
+    double **Unzval_br_ptr, **Lnzval_bc_ptr;
+    int_t  *index;
+    double *nzval;
+    int_t  *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
+    double *ucol;
+    int_t  *indirect;
+    double *tempv, *tempv2d;
+    int_t iinfo, linfo; /* Reduced / local status, held as int_t to match mpi_int_t. */
+    int_t *ToRecv, *ToSendD, **ToSendR;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    superlu_scope_t *scp;
+    float s_eps;
+    double thresh;
+    double *tempU2d, *tempu;
+    int    full, ldt, ldu, lead_zero, ncols;
+    MPI_Request recv_req[4], *send_req = NULL; /* send_req is allocated only if Pr*Pc > 1. */
+    MPI_Status status;
+#if ( DEBUGlevel>=2 )
+    int_t num_copy=0, num_update=0;
+#endif
+#if ( PRNTlevel==3 )
+    int_t  zero_msg = 0, total_msg = 0;
+#endif
+#if ( PROFlevel>=1 )
+    double t1, t2;
+    float msg_vol = 0, msg_cnt = 0;
+    int_t iword = sizeof(int_t), dword = sizeof(double);
+#endif
+
+    /* Test the input parameters. */
+    *info = 0;
+    if ( m < 0 ) *info = -2;
+    else if ( n < 0 ) *info = -3;
+    if ( *info ) {
+	pxerbla("pdgstrf", grid, -*info);
+	return (-1);
+    }
+
+    /* Quick return if possible. */
+    if ( m == 0 || n == 0 ) return 0;
+
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    s_eps = slamch_("Epsilon");
+    thresh = s_eps * anorm;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pdgstrf()");
+#endif
+
+    stat->ops[FACT] = 0.0;
+
+    if ( Pr*Pc > 1 ) {
+	i = Llu->bufmax[0];
+	if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) )
+	    ABORT("Malloc fails for Lsub_buf.");
+	Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i; /* Second half of the same allocation. */
+	i = Llu->bufmax[1];
+	if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) )
+	    ABORT("Malloc fails for Lval_buf[].");
+	Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i; /* Second half of the same allocation. */
+	if ( Llu->bufmax[2] != 0 )
+	    if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) )
+		ABORT("Malloc fails for Usub_buf[].");
+	if ( Llu->bufmax[3] != 0 )
+	    if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) )
+		ABORT("Malloc fails for Uval_buf[].");
+	if ( !(send_req =
+	       (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request))))
+	    ABORT("Malloc fails for send_req[].");
+    }
+    if ( !(Llu->ujrow = doubleMalloc_dist(sp_ienv_dist(3))) )
+	ABORT("Malloc fails for ujrow[].");
+
+#if ( PRNTlevel>=1 )
+    if ( !iam ) {
+	printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh);
+	printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n",
+	       Llu->bufmax[0], Llu->bufmax[1],
+	       Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]);
+    }
+#endif
+
+    Lsub_buf_2[0] = Llu->Lsub_buf_2[0];
+    Lsub_buf_2[1] = Llu->Lsub_buf_2[1];
+    Lval_buf_2[0] = Llu->Lval_buf_2[0];
+    Lval_buf_2[1] = Llu->Lval_buf_2[1];
+    Usub_buf = Llu->Usub_buf;
+    Uval_buf = Llu->Uval_buf;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    Unzval_br_ptr = Llu->Unzval_br_ptr;
+    ToRecv = Llu->ToRecv;
+    ToSendD = Llu->ToSendD;
+    ToSendR = Llu->ToSendR;
+
+    ldt = sp_ienv_dist(3); /* Size of maximum supernode */
+    if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) )
+	ABORT("Calloc fails for tempv2d[].");
+    tempU2d = tempv2d + ldt*ldt; /* Second ldt-by-ldt half of the same allocation. */
+    if ( !(indirect = intMalloc_dist(ldt)) )
+	ABORT("Malloc fails for indirect[].");
+    k = CEILING( nsupers, Pr ); /* Number of local block rows */
+    if ( !(iuip = intMalloc_dist(k)) )
+	ABORT("Malloc fails for iuip[].");
+    if ( !(ruip = intMalloc_dist(k)) )
+	ABORT("Malloc fails for ruip[].");
+
+#if ( VAMPIR>=1 )
+    VT_symdef(1, "Send-L", "Comm");
+    VT_symdef(2, "Recv-L", "Comm");
+    VT_symdef(3, "Send-U", "Comm");
+    VT_symdef(4, "Recv-U", "Comm");
+    VT_symdef(5, "TRF2", "Factor");
+    VT_symdef(100, "Factor", "Factor");
+    VT_begin(100);
+    VT_traceon();
+#endif
+
+    /* ---------------------------------------------------------------
+       Handle the first block column separately to start the pipeline.
+       --------------------------------------------------------------- */
+    if ( mycol == 0 ) {
+#if ( VAMPIR>=1 )
+	VT_begin(5);
+#endif
+	pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info);
+#if ( VAMPIR>=1 )
+	VT_end(5);
+#endif
+
+	scp = &grid->rscp; /* The scope of process row. */
+
+	/* Process column 0 multicasts the index and numeric values of
+	   L(:,0) along its process row. */
+	lsub = Lrowind_bc_ptr[0];
+	lusup = Lnzval_bc_ptr[0];
+	if ( lsub ) {
+	    msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR;
+	    msgcnt[1] = lsub[1] * SuperSize( 0 );
+	} else {
+	    msgcnt[0] = msgcnt[1] = 0;
+	}
+
+	for (pj = 0; pj < Pc; ++pj) {
+	    if ( ToSendR[0][pj] != EMPTY ) {
+#if ( PROFlevel>=1 )
+		TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+		VT_begin(1);
+#endif
+		MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm,
+			  &send_req[pj] );
+		MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm,
+			  &send_req[pj+Pc] );
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
+		       iam, 0, msgcnt[0], msgcnt[1], pj);
+#endif
+#if ( VAMPIR>=1 )
+		VT_end(1);
+#endif
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[COMM] += t2;
+		msg_cnt += 2;
+		msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
+#endif
+	    }
+	} /* for pj ... */
+    } else { /* Post immediate receives. */
+	if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */
+	    scp = &grid->rscp; /* The scope of process row. */
+	    MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0,
+		      0, scp->comm, &recv_req[0] );
+	    MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0,
+		      1, scp->comm, &recv_req[1] );
+#if ( DEBUGlevel>=2 )
+	    printf("(%d) Post Irecv L(:,%4d)\n", iam, 0);
+#endif
+	}
+    } /* if mycol == 0 */
+
+    /* ------------------------------------------
+       MAIN LOOP: Loop through all block columns.
+       ------------------------------------------ */
+    for (k = 0; k < nsupers; ++k) {
+
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+
+	if ( mycol == kcol ) {
+	    lk = LBj( k, grid ); /* Local block number. */
+
+	    for (pj = 0; pj < Pc; ++pj) {
+                /* Wait for Isend to complete before using lsub/lusup. */
+		if ( ToSendR[lk][pj] != EMPTY ) {
+		    MPI_Wait( &send_req[pj], &status );
+		    MPI_Wait( &send_req[pj+Pc], &status );
+		}
+	    }
+	    lsub = Lrowind_bc_ptr[lk];
+	    lusup = Lnzval_bc_ptr[lk];
+	} else {
+	    if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */
+		scp = &grid->rscp; /* The scope of process row. */
+#if ( PROFlevel>=1 )
+		TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+		VT_begin(2);
+#endif
+		/*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm,
+		  Llu->bufmax[0]);*/
+		/*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol,
+			 (4*k)%NTAGS, scp->comm, &status );*/
+		MPI_Wait( &recv_req[0], &status );
+		MPI_Get_count( &status, mpi_int_t, &msgcnt[0] );
+		/*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm,
+		  Llu->bufmax[1]);*/
+		/*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol,
+			 (4*k+1)%NTAGS, scp->comm, &status );*/
+		MPI_Wait( &recv_req[1], &status );
+		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] );
+#if ( VAMPIR>=1 )
+		VT_end(2);
+#endif
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[COMM] += t2;
+#endif
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n",
+		       iam, k, msgcnt[0], msgcnt[1], kcol);
+		fflush(stdout);
+#endif
+		lsub = Lsub_buf_2[k%2]; /* Buffers alternate with the parity of k. */
+		lusup = Lval_buf_2[k%2];
+#if ( PRNTlevel==3 )
+		++total_msg;
+		if ( !msgcnt[0] ) ++zero_msg;
+#endif
+	    } else msgcnt[0] = 0;
+	} /* if mycol = Pc(k) */
+
+	scp = &grid->cscp; /* The scope of process column. */
+
+	if ( myrow == krow ) {
+	    /* Parallel triangular solve across process row *krow* --
+	       U(k,j) = L(k,k) \ A(k,j).  */
+#ifdef _CRAY
+	    pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3);
+#else
+	    pdgstrs2(n, k, Glu_persist, grid, Llu, stat);
+#endif
+
+	    /* Multicasts U(k,:) to process columns. */
+	    lk = LBi( k, grid );
+	    usub = Ufstnz_br_ptr[lk];
+	    uval = Unzval_br_ptr[lk];
+	    if ( usub )	{
+		msgcnt[2] = usub[2];
+		msgcnt[3] = usub[1];
+	    } else {
+		msgcnt[2] = msgcnt[3] = 0;
+	    }
+
+	    if ( ToSendD[lk] == YES ) {
+		for (pi = 0; pi < Pr; ++pi) {
+		    if ( pi != myrow ) {
+#if ( PROFlevel>=1 )
+			TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+			VT_begin(3);
+#endif
+			MPI_Send( usub, msgcnt[2], mpi_int_t, pi,
+				 (4*k+2)%NTAGS, scp->comm);
+			MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi,
+				 (4*k+3)%NTAGS, scp->comm);
+#if ( VAMPIR>=1 )
+			VT_end(3);
+#endif
+#if ( PROFlevel>=1 )
+			TOC(t2, t1);
+			stat->utime[COMM] += t2;
+			msg_cnt += 2;
+			msg_vol += msgcnt[2]*iword + msgcnt[3]*dword;
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi);
+#endif
+		    } /* if pi ... */
+		} /* for pi ... */
+	    } /* if ToSendD ... */
+	} else { /* myrow != krow */
+	    if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */
+#if ( PROFlevel>=1 )
+		TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+		VT_begin(4);
+#endif
+		/*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm,
+		  Llu->bufmax[2]);*/
+		MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+			 (4*k+2)%NTAGS, scp->comm, &status );
+		MPI_Get_count( &status, mpi_int_t, &msgcnt[2] );
+		/*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm,
+		  Llu->bufmax[3]);*/
+		MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow,
+			 (4*k+3)%NTAGS, scp->comm, &status );
+		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] );
+#if ( VAMPIR>=1 )
+		VT_end(4);
+#endif
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[COMM] += t2;
+#endif
+		usub = Usub_buf;
+		uval = Uval_buf;
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
+#endif
+#if ( PRNTlevel==3 )
+		++total_msg;
+		if ( !msgcnt[2] ) ++zero_msg;
+#endif
+	    } else msgcnt[2] = 0;
+	} /* if myrow == Pr(k) */
+
+	/*
+	 * Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
+	 *  for (j = k+1; j < N; ++j) {
+	 *     for (i = k+1; i < N; ++i)
+	 *         if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+	 *              && L(i,k) != 0 && U(k,j) != 0 )
+	 *             A(i,j) = A(i,j) - L(i,k) * U(k,j);
+	 */
+	msg0 = msgcnt[0]; /* Saved: msgcnt[0..1] are overwritten when L(:,k+1) is sent. */
+	msg2 = msgcnt[2];
+	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+	    nsupr = lsub[1]; /* LDA of lusup. */
+	    if ( myrow == krow ) { /* Skip diagonal block L(k,k). */
+		lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1];
+		luptr0 = knsupc;
+		nlb = lsub[0] - 1;
+	    } else {
+		lptr0 = BC_HEADER;
+		luptr0 = 0;
+		nlb = lsub[0];
+	    }
+	    lptr = lptr0;
+	    for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */
+		ib = lsub[lptr];
+		lib = LBi( ib, grid );
+		iuip[lib] = BR_HEADER;
+		ruip[lib] = 0;
+		lptr += LB_DESCRIPTOR + lsub[lptr+1];
+	    }
+	    nub = usub[0];    /* Number of blocks in the block row U(k,:) */
+	    iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */
+	    rukp = 0;         /* Pointer to nzval[] of U(k,:) */
+	    klst = FstBlockC( k+1 );
+
+	    /* ---------------------------------------------------
+	       Update the first block column A(:,k+1).
+	       --------------------------------------------------- */
+	    jb = usub[iukp];   /* Global block number of block U(k,j). */
+	    if ( jb == k+1 ) { /* First update (k+1)-th block. */
+		--nub;
+		lptr = lptr0;
+		luptr = luptr0;
+		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
+		nsupc = SuperSize( jb );
+		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+
+		/* Prepare to call DGEMM. */
+		jj = iukp;
+		while ( usub[jj] == klst ) ++jj; /* Skip leading empty segments. */
+		ldu = klst - usub[jj++];
+		ncols = 1;
+		full = 1;
+		for (; jj < iukp+nsupc; ++jj) {
+		    segsize = klst - usub[jj];
+		    if ( segsize ) {
+		        ++ncols;
+			if ( segsize != ldu ) full = 0;
+		        if ( segsize > ldu ) ldu = segsize;
+		    }
+		}
+#if ( DEBUGlevel>=3 )
+		++num_update;
+#endif
+		if ( full ) {
+		    tempu = &uval[rukp];
+		} else { /* Copy block U(k,j) into tempU2d. */
+#if ( DEBUGlevel>=3 )
+		  printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+			 iam, full, k, jb, ldu, ncols, nsupc);
+		  ++num_copy;
+#endif
+		    tempu = tempU2d;
+		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
+		        segsize = klst - usub[jj];
+			if ( segsize ) {
+			    lead_zero = ldu - segsize;
+			    for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
+			    tempu += lead_zero;
+			    for (i = 0; i < segsize; ++i)
+				tempu[i] = uval[rukp+i];
+			    rukp += segsize;
+			    tempu += segsize;
+			}
+		    }
+		    tempu = tempU2d;
+		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+		} /* if full ... */
+
+		for (lb = 0; lb < nlb; ++lb) {
+		    ib = lsub[lptr]; /* Row block L(i,k). */
+		    nbrow = lsub[lptr+1];  /* Number of full rows. */
+		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+		    tempv = tempv2d;
+#ifdef _CRAY
+		    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha,
+			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+			  tempu, &ldu, &beta, tempv, &ldt);
+#elif defined (USE_VENDOR_BLAS)
+		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+			   tempu, &ldu, &beta, tempv, &ldt, 1, 1);
+#else
+		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+			   tempu, &ldu, &beta, tempv, &ldt);
+#endif
+		    stat->ops[FACT] += 2 * nbrow * ldu * ncols;
+
+		    /* Now gather the result into the destination block. */
+		    if ( ib < jb ) { /* A(i,j) is in U. */
+			ilst = FstBlockC( ib+1 );
+			lib = LBi( ib, grid );
+			index = Ufstnz_br_ptr[lib];
+			ijb = index[iuip[lib]];
+			while ( ijb < jb ) { /* Search for dest block. */
+			    ruip[lib] += index[iuip[lib]+1];
+			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
+			    ijb = index[iuip[lib]];
+			}
+			iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */
+
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    fnz = index[iuip[lib]++];
+			    if ( segsize ) { /* Nonzero segment in U(k.j). */
+				ucol = &Unzval_br_ptr[lib][ruip[lib]];
+				for (i = 0, it = 0; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    ucol[rel] -= tempv[it++];
+				}
+				tempv += ldt;
+			    }
+			    ruip[lib] += ilst - fnz;
+			}
+		    } else { /* A(i,j) is in L. */
+			index = Lrowind_bc_ptr[ljb];
+			ldv = index[1];   /* LDA of the dest lusup. */
+			lptrj = BC_HEADER;
+			luptrj = 0;
+			ijb = index[lptrj];
+			while ( ijb != ib ) { /* Search for dest block --
+						 blocks are not ordered! */
+			    luptrj += index[lptrj+1];
+			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
+			    ijb = index[lptrj];
+			}
+			/*
+			 * Build indirect table. This is needed because the
+			 * indices are not sorted.
+			 */
+			fnz = FstBlockC( ib );
+			lptrj += LB_DESCRIPTOR;
+			for (i = 0; i < index[lptrj-1]; ++i) {
+			    rel = index[lptrj + i] - fnz;
+			    indirect[rel] = i;
+			}
+			nzval = Lnzval_bc_ptr[ljb] + luptrj;
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    if ( segsize ) {
+/*#pragma _CRI cache_bypass nzval,tempv*/
+				for (it = 0, i = 0; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    nzval[indirect[rel]] -= tempv[it++];
+				}
+				tempv += ldt;
+			    }
+			    nzval += ldv;
+			}
+		    } /* if ib < jb ... */
+		    lptr += nbrow;
+		    luptr += nbrow;
+		} /* for lb ... */
+		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
+		iukp += nsupc;
+	    }  /* if jb == k+1 */
+	} /* if L(:,k) and U(k,:) not empty */
+
+
+	if ( k+1 < nsupers ) {
+	  kcol = PCOL( k+1, grid );
+	  if ( mycol == kcol ) {
+#if ( VAMPIR>=1 )
+	    VT_begin(5);
+#endif
+	    /* Factor diagonal and subdiagonal blocks and test for exact
+	       singularity.  */
+	    pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info);
+#if ( VAMPIR>=1 )
+	    VT_end(5);
+#endif
+
+	    /* Process column *kcol* (owner of block column k+1) multicasts
+	       the values of L(:,k+1) along its process row. */
+	    lk = LBj( k+1, grid ); /* Local block number. */
+	    lsub1 = Lrowind_bc_ptr[lk];
+ 	    if ( lsub1 ) {
+		msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR;
+		msgcnt[1] = lsub1[1] * SuperSize( k+1 );
+	    } else {
+		msgcnt[0] = 0;
+		msgcnt[1] = 0;
+	    }
+	    scp = &grid->rscp; /* The scope of process row. */
+	    for (pj = 0; pj < Pc; ++pj) {
+		if ( ToSendR[lk][pj] != EMPTY ) {
+		    lusup1 = Lnzval_bc_ptr[lk];
+#if ( PROFlevel>=1 )
+		    TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+		    VT_begin(1);
+#endif
+		    MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj,
+			      (4*(k+1))%NTAGS, scp->comm, &send_req[pj] );
+		    MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj,
+			     (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] );
+#if ( VAMPIR>=1 )
+		    VT_end(1);
+#endif
+#if ( PROFlevel>=1 )
+		    TOC(t2, t1);
+		    stat->utime[COMM] += t2;
+		    msg_cnt += 2;
+		    msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
+#endif
+#if ( DEBUGlevel>=2 )
+		    printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
+			   iam, k+1, msgcnt[0], msgcnt[1], pj);
+#endif
+		}
+	    } /* for pj ... */
+	  } else { /* Post Recv of block column L(:,k+1). */
+	    if ( ToRecv[k+1] >= 1 ) {
+		scp = &grid->rscp; /* The scope of process row. */
+		MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol,
+			  (4*(k+1))%NTAGS, scp->comm, &recv_req[0]);
+		MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol,
+			  (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]);
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1);
+#endif
+	    }
+	  } /* if mycol == Pc(k+1) */
+        } /* if k+1 < nsupers */
+
+	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+	    /* ---------------------------------------------------
+	       Update all other blocks using block row U(k,:)
+	       --------------------------------------------------- */
+	    for (j = 0; j < nub; ++j) {
+		lptr = lptr0;
+		luptr = luptr0;
+		jb = usub[iukp];  /* Global block number of block U(k,j). */
+		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
+		nsupc = SuperSize( jb );
+		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+
+		/* Prepare to call DGEMM. */
+		jj = iukp;
+		while ( usub[jj] == klst ) ++jj; /* Skip leading empty segments. */
+		ldu = klst - usub[jj++];
+		ncols = 1;
+		full = 1;
+		for (; jj < iukp+nsupc; ++jj) {
+		    segsize = klst - usub[jj];
+		    if ( segsize ) {
+		        ++ncols;
+			if ( segsize != ldu ) full = 0;
+		        if ( segsize > ldu ) ldu = segsize;
+		    }
+		}
+#if ( DEBUGlevel>=3 )
+		printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+		       iam, full, k, jb, ldu, ncols, nsupc);
+		++num_update;
+#endif
+		if ( full ) {
+		    tempu = &uval[rukp];
+		} else { /* Copy block U(k,j) into tempU2d. */
+#if ( DEBUGlevel>=3 )
+		    ++num_copy;
+#endif
+		    tempu = tempU2d;
+		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
+		        segsize = klst - usub[jj];
+			if ( segsize ) {
+			    lead_zero = ldu - segsize;
+			    for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
+			    tempu += lead_zero;
+			    for (i = 0; i < segsize; ++i)
+			        tempu[i] = uval[rukp+i];
+			    rukp += segsize;
+			    tempu += segsize;
+			}
+		    }
+		    tempu = tempU2d;
+		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+		} /* if full ... */
+
+		for (lb = 0; lb < nlb; ++lb) {
+		    ib = lsub[lptr];       /* Row block L(i,k). */
+		    nbrow = lsub[lptr+1];  /* Number of full rows. */
+		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+		    tempv = tempv2d;
+#ifdef _CRAY
+		    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha,
+			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+			  tempu, &ldu, &beta, tempv, &ldt);
+#elif defined (USE_VENDOR_BLAS)
+		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+			   tempu, &ldu, &beta, tempv, &ldt, 1, 1);
+#else
+		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+			   tempu, &ldu, &beta, tempv, &ldt);
+#endif
+		    stat->ops[FACT] += 2 * nbrow * ldu * ncols;
+
+		    /* Now gather the result into the destination block. */
+		    if ( ib < jb ) { /* A(i,j) is in U. */
+			ilst = FstBlockC( ib+1 );
+			lib = LBi( ib, grid );
+			index = Ufstnz_br_ptr[lib];
+			ijb = index[iuip[lib]];
+			while ( ijb < jb ) { /* Search for dest block. */
+			    ruip[lib] += index[iuip[lib]+1];
+			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
+			    ijb = index[iuip[lib]];
+			}
+			/* Skip descriptor.  Now point to fstnz index of
+			   block U(i,j). */
+			iuip[lib] += UB_DESCRIPTOR;
+
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    fnz = index[iuip[lib]++];
+			    if ( segsize ) { /* Nonzero segment in U(k.j). */
+				ucol = &Unzval_br_ptr[lib][ruip[lib]];
+				for (i = 0 ; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    ucol[rel] -= tempv[i];
+				}
+				tempv += ldt;
+			    }
+			    ruip[lib] += ilst - fnz;
+			}
+		    } else { /* A(i,j) is in L. */
+			index = Lrowind_bc_ptr[ljb];
+			ldv = index[1];   /* LDA of the dest lusup. */
+			lptrj = BC_HEADER;
+			luptrj = 0;
+			ijb = index[lptrj];
+			while ( ijb != ib ) { /* Search for dest block --
+						 blocks are not ordered! */
+			    luptrj += index[lptrj+1];
+			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
+			    ijb = index[lptrj];
+			}
+			/*
+			 * Build indirect table. This is needed because the
+			 * indices are not sorted for the L blocks.
+			 */
+			fnz = FstBlockC( ib );
+			lptrj += LB_DESCRIPTOR;
+			for (i = 0; i < index[lptrj-1]; ++i) {
+			    rel = index[lptrj + i] - fnz;
+			    indirect[rel] = i;
+			}
+			nzval = Lnzval_bc_ptr[ljb] + luptrj;
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    if ( segsize ) {
+/*#pragma _CRI cache_bypass nzval,tempv*/
+				for (i = 0; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    nzval[indirect[rel]] -= tempv[i];
+				}
+				tempv += ldt;
+			    }
+			    nzval += ldv;
+			}
+		    } /* if ib < jb ... */
+		    lptr += nbrow;
+		    luptr += nbrow;
+		} /* for lb ... */
+		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
+		iukp += nsupc;
+	    } /* for j ... */
+	} /* if L(:,k) and U(k,:) are not empty */
+
+    }
+    /* ------------------------------------------
+       END MAIN LOOP: for k = ...
+       ------------------------------------------ */
+
+#if ( VAMPIR>=1 )
+    VT_end(100);
+    VT_traceoff();
+#endif
+
+    if ( Pr*Pc > 1 ) {
+	SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */
+	SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */
+	if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf);
+	if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf);
+	SUPERLU_FREE(send_req);
+    }
+
+    SUPERLU_FREE(Llu->ujrow);
+    SUPERLU_FREE(tempv2d);
+    SUPERLU_FREE(indirect);
+    SUPERLU_FREE(iuip);
+    SUPERLU_FREE(ruip);
+
+    /* Agree on the smallest nonzero info over all processes in the grid. */
+    linfo = ( *info == 0 ) ? n + 1 : *info; /* n+1 encodes "no error" for MPI_MIN. */
+#if ( PROFlevel>=1 )
+    TIC(t1);
+#endif
+    MPI_Allreduce( &linfo, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm );
+#if ( PROFlevel>=1 )
+    TOC(t2, t1);
+    stat->utime[COMM] += t2;
+    {
+	float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
+
+	MPI_Reduce( &msg_cnt, &msg_cnt_sum,
+		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	MPI_Reduce( &msg_cnt, &msg_cnt_max,
+		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+	MPI_Reduce( &msg_vol, &msg_vol_sum,
+		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	MPI_Reduce( &msg_vol, &msg_vol_max,
+		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+	if ( !iam ) {
+	    printf("\tPDGSTRF comm stat:"
+		   "\tAvg\tMax\t\tAvg\tMax\n"
+		   "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
+		   msg_cnt_sum/Pr/Pc, msg_cnt_max,
+		   msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6);
+	}
+    }
+#endif
+    if ( iinfo == n + 1 ) *info = 0;
+    else *info = iinfo;
+
+#if ( PRNTlevel==3 )
+    MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
+    if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo);
+    MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
+    if ( !iam ) printf(".. # total msg\t%d\n", iinfo);
+#endif
+
+#if ( DEBUGlevel>=2 )
+    for (i = 0; i < Pr * Pc; ++i) {
+	if ( iam == i ) {
+	    dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
+	    dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
+	    printf("(%d)\n", iam);
+	    PrintInt10("Recv", nsupers, Llu->ToRecv);
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif
+
+#if ( DEBUGlevel>=3 )
+    printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
+#endif
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgstrf()");
+#endif
+    return 0;
+} /* PDGSTRF */
+
+
+/************************************************************************/
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the process column that owns block column *k* participates.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *        Only ReplaceTinyPivot is read. If it is YES, or the diagonal entry
+ *        is exactly zero, an entry with |value| < thresh becomes +/-thresh.
+ *
+ * k      (input) int_t (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * stat   (input/output) SuperLUStat_t*
+ *        ops[FACT] and TinyPivots are updated. See util.h for the structure.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but U is exactly singular, so division by
+ *             zero will occur if it is used to solve a system of equations.
+ * </pre>
+ */
+static void pdgstrf2
+/************************************************************************/
+(
+ superlu_options_t *options,
+ int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+ LocalLU_t *Llu, SuperLUStat_t *stat, int* info
+ )
+{
+    int    c, iam, l, pkk;
+    int    incx = 1, incy = 1;
+    int    nsupr; /* number of rows in the block (LDA) */
+    int    luptr;
+    int_t  i, krow, j, jfst, jlst;
+    int_t  nsupc; /* number of columns in the block */
+    int_t  *xsup = Glu_persist->xsup;
+    double *lusup, temp;
+    double *ujrow;
+    double alpha = -1;
+    *info = 0;
+
+    /* Initialization. */
+    iam   = grid->iam;
+    krow  = PROW( k, grid );
+    pkk   = PNUM( PROW(k, grid), PCOL(k, grid), grid );
+    j     = LBj( k, grid ); /* Local block number; j is the loop index below */
+    jfst  = FstBlockC( k );
+    jlst  = FstBlockC( k+1 );
+    lusup = Llu->Lnzval_bc_ptr[j];
+    nsupc = SuperSize( k );
+    if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1];
+    else nsupr = 0; /* Empty block column: never leave nsupr indeterminate. */
+    ujrow = Llu->ujrow;
+
+    luptr = 0; /* Point to the diagonal entries. */
+    c = nsupc;
+    for (j = 0; j < jlst - jfst; ++j) {
+	/* Broadcast the j-th row (nsupc - j) elements to
+	   the process column. */
+	if ( iam == pkk ) { /* Diagonal process. */
+	    i = luptr;
+	    /* Exact zeros are tested too, whatever ReplaceTinyPivot says. */
+	    if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) {
+		if ( fabs(lusup[i]) < thresh ) { /* Diagonal */
+#if ( PRNTlevel>=2 )
+		    printf("(%d) .. col %d, tiny pivot %e  ",
+			   iam, jfst+j, lusup[i]);
+#endif
+		    /* Keep the replaced diagonal with the same sign. */
+		    if ( lusup[i] < 0 ) lusup[i] = -thresh;
+		    else lusup[i] = thresh;
+#if ( PRNTlevel>=2 )
+		    printf("replaced by %e\n", lusup[i]);
+#endif
+		    ++(stat->TinyPivots);
+		}
+	    }
+	    for (l = 0; l < c; ++l, i += nsupr)	ujrow[l] = lusup[i];
+	}
+#if 0
+	dbcast_col(ujrow, c, pkk, UjROW, grid, &c);
+#else
+	MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm);
+	/*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS,
+		   grid, COMM_COLUMN, &c);*/
+#endif
+
+#if ( DEBUGlevel>=2 )
+if ( k == 3329 && j == 2 ) {
+	if ( iam == pkk ) {
+	    printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]);
+	} else {
+	    printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]);
+	}
+}
+#endif
+
+	if ( !lusup ) { /* Empty block column. */
+	    --c;
+	    if ( ujrow[0] == 0.0 ) *info = j+jfst+1;
+	    continue;
+	}
+
+	/* Test for singularity. */
+	if ( ujrow[0] == 0.0 ) {
+	    *info = j+jfst+1;
+	} else {
+	    /* Scale the j-th column of the matrix. */
+	    temp = 1.0 / ujrow[0];
+	    if ( iam == pkk ) {
+		for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp;
+		stat->ops[FACT] += nsupr-j-1;
+	    } else {
+		for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp;
+		stat->ops[FACT] += nsupr;
+	    }
+	}
+	    
+	/* Rank-1 update of the trailing submatrix. */
+	if ( --c ) {
+	    if ( iam == pkk ) {
+		l = nsupr - j - 1;
+#ifdef _CRAY
+		SGER(&l, &c, &alpha, &lusup[luptr+1], &incx,
+		     &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
+#else
+		dger_(&l, &c, &alpha, &lusup[luptr+1], &incx,
+		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
+#endif
+		stat->ops[FACT] += 2 * l * c;
+	    } else {
+#ifdef _CRAY
+		SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
+		     &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
+#else
+		dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
+		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
+#endif
+		stat->ops[FACT] += 2 * nsupr * c;
+	    }
+	}
+	
+	/* Move to the next column. */
+	if ( iam == pkk ) luptr += nsupr + 1;
+	else luptr += nsupr;
+
+    } /* for j ... */
+
+} /* PDGSTRF2 */
+
+
+/************************************************************************/
+static void pdgstrs2
+/************************************************************************/
+#ifdef _CRAY
+(
+ int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+ LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3
+ )
+#else
+(
+ int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+ LocalLU_t *Llu, SuperLUStat_t *stat
+ )
+#endif
+/*
+ * Purpose
+ * =======
+ *   Perform the triangular solves, one column segment at a time,
+ *           U(k,:) := L(k,k) \ A(k,:),
+ *   overwriting the stored values of U(k,:). Only the process row that owns
+ *   block row *k* participates; it returns at once if U(k,:) is empty locally.
+ *
+ * Arguments
+ * =========
+ *
+ * m      (input) int_t
+ *        Not referenced in this routine.
+ *
+ * k      (input) int_t (global)
+ *        The row number of the block row of U to be solved.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures; only xsup is taken from it here.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local L and U storage. L(k,k) is read from Lnzval_bc_ptr[] on the
+ *        diagonal process and from Lval_buf_2[k%2] on the other processes.
+ *
+ * stat   (input/output) SuperLUStat_t*
+ *        ops[FACT] grows by len*(len+1) for every solved segment of length len.
+ * ftcs1, ftcs2, ftcs3 (input, _CRAY only) _fcd: passed through to STRSV.
+ */
+{
+    int_t  *xsup = Glu_persist->xsup; /* not named below; presumably read by
+					 FstBlockC()/SuperSize() -- verify */
+    int_t  *usub;            /* index[] of block row U(k,:) */
+    double *uval, *lusup;    /* values of U(k,:) and of block column L(:,k) */
+    int_t  row_lk = LBi( k, grid ); /* Local block row number */
+    int_t  col_lk;           /* Local block column number */
+    int_t  kend, ksize;      /* FstBlockC(k+1) and SuperSize(k) */
+    int_t  nblk, blk, gblk, ncol, col;
+    int_t  upos = BR_HEADER; /* cursor in usub[], starts at BR_HEADER */
+    int_t  vpos = 0;         /* cursor in uval[] */
+    int_t  loff;             /* offset of the current triangle in lusup[] */
+    int    me, diag_proc;
+    int    lda, len;         /* LDA of lusup[]; length of current segment */
+    int    stride = 1;
+
+    /* Quick return: no values of U(k,:) are stored on this process. */
+    uval = Llu->Unzval_br_ptr[row_lk];
+    if ( !uval ) return;
+
+    usub = Llu->Ufstnz_br_ptr[row_lk];
+    nblk = usub[0];
+    kend = FstBlockC( k+1 );
+    ksize = SuperSize( k );
+
+    /* Locate L(:,k): own storage on the diagonal process, otherwise the
+       buffer Lval_buf_2[] selected by the parity of k. */
+    me = grid->iam;
+    diag_proc = PNUM( PROW(k, grid), PCOL(k, grid), grid );
+    if ( me != diag_proc ) {
+	lda = Llu->Lsub_buf_2[k%2][1];
+	lusup = Llu->Lval_buf_2[k%2];
+    } else {
+	col_lk = LBj( k, grid );
+	lda = Llu->Lrowind_bc_ptr[col_lk][1];
+	lusup = Llu->Lnzval_bc_ptr[col_lk];
+    }
+
+    /* Visit every block U(k,gblk) stored in this block row. */
+    for (blk = 0; blk < nblk; ++blk) {
+	gblk = usub[upos];
+	ncol = SuperSize( gblk );
+	upos += UB_DESCRIPTOR;
+	/* usub[] holds one entry per column of the block. */
+	for (col = 0; col < ncol; ++col, ++upos) {
+	    len = kend - usub[upos];
+	    if ( !len ) continue; /* Empty segment: nothing to solve. */
+	    loff = (ksize - len) * (lda + 1); /* diagonal entry ksize-len */
+#ifdef _CRAY
+	    STRSV(ftcs1, ftcs2, ftcs3, &len, &lusup[loff], &lda,
+		  &uval[vpos], &stride);
+#elif defined (USE_VENDOR_BLAS)
+	    dtrsv_("L", "N", "U", &len, &lusup[loff], &lda,
+		   &uval[vpos], &stride, 1, 1, 1);
+#else
+	    dtrsv_("L", "N", "U", &len, &lusup[loff], &lda,
+		   &uval[vpos], &stride);
+#endif
+	    stat->ops[FACT] += len * (len + 1);
+	    vpos += len;
+	}
+    } /* for blk ... */
+} /* PDGSTRS2 */
+
+/* Wait until a message from (source, tag) is pending on comm, then abort the
+ * program if it holds more than buf_size elements of datatype; else return 0. */
+static int probe_recv(int iam, int source, int tag, MPI_Datatype datatype,
+		      MPI_Comm comm, int buf_size)
+{
+    MPI_Status status;
+    int count; /* element count of the pending message */
+
+    MPI_Probe( source, tag, comm, &status ); /* blocks; message stays queued */
+    MPI_Get_count( &status, datatype, &count );
+    if ( count > buf_size ) { /* message would overflow the receive buffer */
+        printf("(%d) Recv'ed count %d > buffer size %d\n", iam, count, buf_size);
+	exit(-1);
+    }
+    return 0;
+}
diff --git a/SRC/pdgstrf_sherry.c b/SRC/pdgstrf_sherry.c
new file mode 100644
index 0000000..9709699
--- /dev/null
+++ b/SRC/pdgstrf_sherry.c
@@ -0,0 +1,1389 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+#if ( VAMPIR>=1 )
+#include <VT.h>
+#endif
+
+/* Internal prototypes: pdgstrf2 factors the diagonal and subdiagonal blocks
+ * of one block column; pdgstrs2 performs the triangular solves U(k,j) =
+ * L(k,k) \ A(k,j) for one block row. */
+static void pdgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *,
+		     gridinfo_t *, LocalLU_t *, MPI_Request *,
+		     SuperLUStat_t *, int *);
+#ifdef _CRAY
+static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
+		     LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);
+#else
+static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
+		     LocalLU_t *, SuperLUStat_t *);
+#endif
+
+
+/* 
+ * Sketch of the algorithm
+ * =======================
+ *
+ * The following relations hold:
+ *     * A_kk = L_kk * U_kk
+ *     * L_ik = A_ik * U_kk^(-1)
+ *     * U_kj = L_kk^(-1) * A_kj
+ *
+ *              ----------------------------------
+ *              |   |                            |
+ *              ----|-----------------------------
+ *              |   | \ U_kk|                    |
+ *              |   |   \   |        U_kj        |
+ *              |   |L_kk \ |         ||         |
+ *              ----|-------|---------||----------
+ *              |   |       |         \/         |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   | L_ik ==>       A_ij        |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              ----------------------------------
+ *
+ * Handle the first block of columns separately.
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity. ( pdgstrf2(0), one column at a time )
+ *     * Compute block row of U
+ *     * Update trailing matrix
+ * 
+ * Loop over the remaining blocks of columns.
+ *   mycol = MYCOL( iam, grid );
+ *   myrow = MYROW( iam, grid );
+ *   N = nsupers;
+ *   For (k = 1; k < N; ++k) {
+ *       krow = PROW( k, grid );
+ *       kcol = PCOL( k, grid );
+ *       Pkk = PNUM( krow, kcol, grid );
+ *
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity.
+ *       if ( mycol == kcol ) {
+ *           pdgstrf2(k), one column at a time 
+ *       }
+ *
+ *     * Parallel triangular solve
+ *       if ( iam == Pkk ) multicast L_k,k to this process row;
+ *       if ( myrow == krow && mycol != kcol ) {
+ *          Recv L_k,k from process Pkk;
+ *          for (j = k+1; j < N; ++j) 
+ *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
+ *                 U_k,j = L_k,k \ A_k,j;
+ *       }
+ *
+ *     * Parallel rank-k update
+ *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
+ *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
+ *       if ( myrow != krow ) {
+ *          Pkj = PNUM( krow, mycol, grid );
+ *          Recv U_k,k+1:N from process Pkj;
+ *       }
+ *       if ( mycol != kcol ) {
+ *          Pik = PNUM( myrow, kcol, grid );
+ *          Recv L_k+1:N,k from process Pik;
+ *       }
+ *       for (j = k+1; j < N; ++j) {
+ *          for (i = k+1; i < N; ++i) 
+ *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+ *                   && L_i,k != 0 && U_k,j != 0 )
+ *                 A_i,j = A_i,j - L_i,k * U_k,j;
+ *       }
+ *  }
+ *
+ *
+ * Remaining issues
+ *   (1) Use local indices for L subscripts and SPA.  [DONE]
+ *
+ */
+/************************************************************************/
+int_t pdgstrf
+/************************************************************************/
+(
+ superlu_options_t *options, int m, int n, double anorm,
+ LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info
+ )
+/* 
+ * Purpose
+ * =======
+ *
+ *  PDGSTRF performs the LU factorization in parallel.
+ *
+ * Arguments
+ * =========
+ * 
+ * options (input) superlu_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) LocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ *
+ */
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd("N", strlen("N"));
+    _fcd ftcs1 = _cptofcd("L", strlen("L"));
+    _fcd ftcs2 = _cptofcd("N", strlen("N"));
+    _fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+    double alpha = 1.0, beta = 0.0;
+    int_t *xsup;
+    int_t *lsub, *lsub1, *usub, *Usub_buf,
+          *Lsub_buf_2[2];  /* Need 2 buffers to implement Irecv. */
+    double *lusup, *lusup1, *uval, *Uval_buf,
+           *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */
+    int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
+          lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
+          nlb, nub, nsupc, rel, rukp;
+    int_t Pc, Pr;
+    int   iam, kcol, krow, mycol, myrow, pi, pj;
+    int   j, k, lk, nsupers;
+    int   nsupr, nbrow, segsize;
+    int   msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
+		      *     0 : transferred in Lsub_buf[]
+		      *     1 : transferred in Lval_buf[]
+		      *     2 : transferred in Usub_buf[] 
+		      *     3 : transferred in Uval_buf[]
+		      */
+    int_t  msg0, msg2;
+    int_t  **Ufstnz_br_ptr, **Lrowind_bc_ptr;
+    double **Unzval_br_ptr, **Lnzval_bc_ptr;
+    int_t  *index;
+    double *nzval;
+    int_t  *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
+    double *ucol;
+    int_t  *indirect;
+    double *tempv, *tempv2d;
+    int_t iinfo;
+    int_t *ToRecv, *ToSendD, **ToSendR;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    superlu_scope_t *scp;
+    float s_eps;
+    double thresh;
+    double *tempU2d, *tempu;
+    int    full, ldt, ldu, lead_zero, ncols;
+    MPI_Request recv_req[4], *send_req, *U_diag_blk_send_req = NULL;
+    MPI_Status status;
+#if ( DEBUGlevel>=2 ) 
+    int_t num_copy=0, num_update=0;
+#endif
+#if ( PRNTlevel==3 )
+    int_t  zero_msg = 0, total_msg = 0;
+#endif
+#if ( PROFlevel>=1 )
+    double t1, t2;
+    float msg_vol = 0, msg_cnt = 0;
+    int_t iword = sizeof(int_t), dword = sizeof(double);
+#endif
+
+    /* Test the input parameters. */
+    *info = 0;
+    if ( m < 0 ) *info = -2;
+    else if ( n < 0 ) *info = -3;
+    if ( *info ) {
+	pxerbla("pdgstrf", grid, -*info);
+	return (-1);
+    }
+
+    /* Quick return if possible. */
+    if ( m == 0 || n == 0 ) return 0;
+
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    s_eps = slamch_("Epsilon");
+    thresh = s_eps * anorm;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pdgstrf()");
+#endif
+
+    stat->ops[FACT] = 0.0;
+
+    if ( Pr*Pc > 1 ) {
+	i = Llu->bufmax[0];
+	if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) )
+	    ABORT("Malloc fails for Lsub_buf.");
+	Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i;
+	i = Llu->bufmax[1];
+	if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) )
+	    ABORT("Malloc fails for Lval_buf[].");
+	Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i;
+	if ( Llu->bufmax[2] != 0 ) 
+	    if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) )
+		ABORT("Malloc fails for Usub_buf[].");
+	if ( Llu->bufmax[3] != 0 ) 
+	    if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) )
+		ABORT("Malloc fails for Uval_buf[].");
+	if ( !(U_diag_blk_send_req =
+	       (MPI_Request *) SUPERLU_MALLOC(Pr*sizeof(MPI_Request))))
+	    ABORT("Malloc fails for U_diag_blk_send_req[].");
+        U_diag_blk_send_req[myrow] = 0; /* flag no outstanding Isend */
+	if ( !(send_req =
+	       (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request))))
+	    ABORT("Malloc fails for send_req[].");
+    }
+    k = sp_ienv_dist(3); /* max supernode size */
+    if ( !(Llu->ujrow = doubleMalloc_dist(k*(k+1)/2)) )
+	ABORT("Malloc fails for ujrow[].");
+
+#if ( PRNTlevel>=1 )
+    if ( !iam ) {
+	printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh);
+	printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n",
+	       Llu->bufmax[0], Llu->bufmax[1], 
+	       Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]);
+    }
+#endif
+
+    Lsub_buf_2[0] = Llu->Lsub_buf_2[0];
+    Lsub_buf_2[1] = Llu->Lsub_buf_2[1];
+    Lval_buf_2[0] = Llu->Lval_buf_2[0];
+    Lval_buf_2[1] = Llu->Lval_buf_2[1];
+    Usub_buf = Llu->Usub_buf;
+    Uval_buf = Llu->Uval_buf;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    Unzval_br_ptr = Llu->Unzval_br_ptr;
+    ToRecv = Llu->ToRecv;
+    ToSendD = Llu->ToSendD;
+    ToSendR = Llu->ToSendR;
+
+    ldt = sp_ienv_dist(3); /* Size of maximum supernode */
+    if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) )
+	ABORT("Calloc fails for tempv2d[].");
+    tempU2d = tempv2d + ldt*ldt;
+    if ( !(indirect = intMalloc_dist(ldt)) )
+	ABORT("Malloc fails for indirect[].");
+    k = CEILING( nsupers, Pr ); /* Number of local block rows */
+    if ( !(iuip = intMalloc_dist(k)) )
+	ABORT("Malloc fails for iuip[].");
+    if ( !(ruip = intMalloc_dist(k)) )
+	ABORT("Malloc fails for ruip[].");
+
+#if ( VAMPIR>=1 )
+    VT_symdef(1, "Send-L", "Comm");
+    VT_symdef(2, "Recv-L", "Comm");
+    VT_symdef(3, "Send-U", "Comm");
+    VT_symdef(4, "Recv-U", "Comm");
+    VT_symdef(5, "TRF2", "Factor");
+    VT_symdef(100, "Factor", "Factor");
+    VT_begin(100);
+    VT_traceon();
+#endif
+
+    /* ---------------------------------------------------------------
+       Handle the first block column separately to start the pipeline.
+       --------------------------------------------------------------- */
+    if ( mycol == 0 ) {
+
+#if ( VAMPIR>=1 )
+	VT_begin(5);
+#endif
+	pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, 
+		 U_diag_blk_send_req, stat, info);
+
+#if ( VAMPIR>=1 )
+	VT_end(5);
+#endif
+
+	scp = &grid->rscp; /* The scope of process row. */
+
+	/* Process column *kcol* multicasts numeric values of L(:,k) 
+	   to process rows. */
+	lsub = Lrowind_bc_ptr[0];
+	lusup = Lnzval_bc_ptr[0];
+	if ( lsub ) {
+	    msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR;
+	    msgcnt[1] = lsub[1] * SuperSize( 0 );
+	} else {
+	    msgcnt[0] = msgcnt[1] = 0;
+	}
+	
+	for (pj = 0; pj < Pc; ++pj) {
+	    if ( ToSendR[0][pj] != EMPTY ) {
+#if ( PROFlevel>=1 )
+		TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+		VT_begin(1);
+#endif
+		MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm,
+			  &send_req[pj] );
+		MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm,
+			  &send_req[pj+Pc] );
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
+		       iam, 0, msgcnt[0], msgcnt[1], pj);
+#endif
+#if ( VAMPIR>=1 )
+		VT_end(1);
+#endif
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[COMM] += t2;
+		msg_cnt += 2;
+		msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
+#endif
+	    }
+	} /* for pj ... */
+    } else { /* Post immediate receives. */
+	if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */
+	    scp = &grid->rscp; /* The scope of process row. */
+	    MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0,
+		      0, scp->comm, &recv_req[0] );
+	    MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0,
+		      1, scp->comm, &recv_req[1] );
+#if ( DEBUGlevel>=2 )
+	    printf("(%d) Post Irecv L(:,%4d)\n", iam, 0);
+#endif
+	}
+    } /* if mycol == 0 */
+
+    /* ------------------------------------------
+       MAIN LOOP: Loop through all block columns.
+       ------------------------------------------ */
+    for (k = 0; k < nsupers; ++k) {
+
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+
+	if ( mycol == kcol ) {
+	    lk = LBj( k, grid ); /* Local block number. */
+
+	    for (pj = 0; pj < Pc; ++pj) {
+                /* Wait for Isend to complete before using lsub/lusup. */
+		if ( ToSendR[lk][pj] != EMPTY ) {
+		    MPI_Wait( &send_req[pj], &status );
+		    MPI_Wait( &send_req[pj+Pc], &status );
+		}
+	    }
+	    lsub = Lrowind_bc_ptr[lk];
+	    lusup = Lnzval_bc_ptr[lk];
+	} else {
+	    if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */
+		scp = &grid->rscp; /* The scope of process row. */
+#if ( PROFlevel>=1 )
+		TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+		VT_begin(2);
+#endif
+		/*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, 
+		  Llu->bufmax[0]);*/
+		/*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, 
+			 (4*k)%NTAGS, scp->comm, &status );*/
+		MPI_Wait( &recv_req[0], &status );
+		MPI_Get_count( &status, mpi_int_t, &msgcnt[0] );
+		/*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, 
+		  Llu->bufmax[1]);*/
+		/*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, 
+			 (4*k+1)%NTAGS, scp->comm, &status );*/
+		MPI_Wait( &recv_req[1], &status );
+		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] );
+#if ( VAMPIR>=1 )
+		VT_end(2);
+#endif
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[COMM] += t2;
+#endif
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n",
+		       iam, k, msgcnt[0], msgcnt[1], kcol);
+		fflush(stdout);
+#endif
+		lsub = Lsub_buf_2[k%2];
+		lusup = Lval_buf_2[k%2];
+#if ( PRNTlevel==3 )
+		++total_msg;
+		if ( !msgcnt[0] ) ++zero_msg;
+#endif
+	    } else msgcnt[0] = 0;
+	} /* if mycol = Pc(k) */
+
+	scp = &grid->cscp; /* The scope of process column. */
+
+	if ( myrow == krow ) {
+	    /* Parallel triangular solve across process row *krow* --
+	       U(k,j) = L(k,k) \ A(k,j).  */
+#ifdef _CRAY
+	    pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3);
+#else
+	    pdgstrs2(n, k, Glu_persist, grid, Llu, stat);
+#endif
+
+	    /* Multicasts U(k,:) to process columns. */
+	    lk = LBi( k, grid );
+	    usub = Ufstnz_br_ptr[lk];
+	    uval = Unzval_br_ptr[lk];
+	    if ( usub )	{
+		msgcnt[2] = usub[2];
+		msgcnt[3] = usub[1];
+	    } else {
+		msgcnt[2] = msgcnt[3] = 0;
+	    }
+
+	    if ( ToSendD[lk] == YES ) {
+		for (pi = 0; pi < Pr; ++pi) {
+		    if ( pi != myrow ) {
+#if ( PROFlevel>=1 )
+			TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+			VT_begin(3);
+#endif
+			MPI_Send( usub, msgcnt[2], mpi_int_t, pi,
+				 (4*k+2)%NTAGS, scp->comm);
+			MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi,
+				 (4*k+3)%NTAGS, scp->comm);
+#if ( VAMPIR>=1 )
+			VT_end(3);
+#endif
+#if ( PROFlevel>=1 )
+			TOC(t2, t1);
+			stat->utime[COMM] += t2;
+			msg_cnt += 2;
+			msg_vol += msgcnt[2]*iword + msgcnt[3]*dword;
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi);
+#endif
+		    } /* if pi ... */
+		} /* for pi ... */
+	    } /* if ToSendD ... */
+	} else { /* myrow != krow */
+	    if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */
+#if ( PROFlevel>=1 )
+		TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+		VT_begin(4);
+#endif
+		/*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, 
+		  Llu->bufmax[2]);*/
+		MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+			 (4*k+2)%NTAGS, scp->comm, &status );
+		MPI_Get_count( &status, mpi_int_t, &msgcnt[2] );
+		/*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, 
+		  Llu->bufmax[3]);*/
+		MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, 
+			 (4*k+3)%NTAGS, scp->comm, &status );
+		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] );
+#if ( VAMPIR>=1 )
+		VT_end(4);
+#endif
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[COMM] += t2;
+#endif
+		usub = Usub_buf;
+		uval = Uval_buf;
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
+#endif
+#if ( PRNTlevel==3 )
+		++total_msg;
+		if ( !msgcnt[2] ) ++zero_msg;
+#endif
+	    } else msgcnt[2] = 0;
+	} /* if myrow == Pr(k) */
+	  
+	/* 
+	 * Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
+	 *  for (j = k+1; j < N; ++j) {
+	 *     for (i = k+1; i < N; ++i) 
+	 *         if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+	 *              && L(i,k) != 0 && U(k,j) != 0 )
+	 *             A(i,j) = A(i,j) - L(i,k) * U(k,j);
+	 */
+	msg0 = msgcnt[0];
+	msg2 = msgcnt[2];
+	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+	    nsupr = lsub[1]; /* LDA of lusup. */
+	    if ( myrow == krow ) { /* Skip diagonal block L(k,k). */
+		lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1];
+		luptr0 = knsupc;
+		nlb = lsub[0] - 1;
+	    } else {
+		lptr0 = BC_HEADER;
+		luptr0 = 0;
+		nlb = lsub[0];
+	    }
+	    lptr = lptr0;
+	    for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */
+		ib = lsub[lptr];
+		lib = LBi( ib, grid );
+		iuip[lib] = BR_HEADER;
+		ruip[lib] = 0;
+		lptr += LB_DESCRIPTOR + lsub[lptr+1];
+	    }
+	    nub = usub[0];    /* Number of blocks in the block row U(k,:) */
+	    iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */
+	    rukp = 0;         /* Pointer to nzval[] of U(k,:) */
+	    klst = FstBlockC( k+1 );
+	    
+	    /* ---------------------------------------------------
+	       Update the first block column A(:,k+1).
+	       --------------------------------------------------- */
+	    jb = usub[iukp];   /* Global block number of block U(k,j). */
+	    if ( jb == k+1 ) { /* First update (k+1)-th block. */
+		--nub;
+		lptr = lptr0;
+		luptr = luptr0;
+		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
+		nsupc = SuperSize( jb );
+		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+
+		/* Prepare to call DGEMM. */
+		jj = iukp;
+		while ( usub[jj] == klst ) ++jj;
+		ldu = klst - usub[jj++];
+		ncols = 1;
+		full = 1;
+		for (; jj < iukp+nsupc; ++jj) {
+		    segsize = klst - usub[jj];
+		    if ( segsize ) {
+		        ++ncols;
+			if ( segsize != ldu ) full = 0;
+		        if ( segsize > ldu ) ldu = segsize;
+		    }
+		}
+#if ( DEBUGlevel>=3 )
+		++num_update;
+#endif
+		if ( full ) {
+		    tempu = &uval[rukp];
+		} else { /* Copy block U(k,j) into tempU2d. */
+#if ( DEBUGlevel>=3 )
+		  printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+			 iam, full, k, jb, ldu, ncols, nsupc);
+		  ++num_copy;
+#endif
+		    tempu = tempU2d;
+		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
+		        segsize = klst - usub[jj];
+			if ( segsize ) {
+			    lead_zero = ldu - segsize;
+			    for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
+			    tempu += lead_zero;
+			    for (i = 0; i < segsize; ++i)
+				tempu[i] = uval[rukp+i];
+			    rukp += segsize;
+			    tempu += segsize;
+			}
+		    }
+		    tempu = tempU2d;
+		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+		} /* if full ... */
+
+		for (lb = 0; lb < nlb; ++lb) { 
+		    ib = lsub[lptr]; /* Row block L(i,k). */
+		    nbrow = lsub[lptr+1];  /* Number of full rows. */
+		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+		    tempv = tempv2d;
+#ifdef _CRAY
+		    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
+			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			  tempu, &ldu, &beta, tempv, &ldt);
+#elif defined (USE_VENDOR_BLAS)
+		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			   tempu, &ldu, &beta, tempv, &ldt, 1, 1);
+#else
+		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			   tempu, &ldu, &beta, tempv, &ldt);
+#endif
+		    stat->ops[FACT] += 2 * nbrow * ldu * ncols;
+
+		    /* Now gather the result into the destination block. */
+		    if ( ib < jb ) { /* A(i,j) is in U. */
+			ilst = FstBlockC( ib+1 );
+			lib = LBi( ib, grid );
+			index = Ufstnz_br_ptr[lib];
+			ijb = index[iuip[lib]];
+			while ( ijb < jb ) { /* Search for dest block. */
+			    ruip[lib] += index[iuip[lib]+1];
+			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
+			    ijb = index[iuip[lib]];
+			}
+			iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */
+
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    fnz = index[iuip[lib]++];
+			    if ( segsize ) { /* Nonzero segment in U(k.j). */
+				ucol = &Unzval_br_ptr[lib][ruip[lib]];
+				for (i = 0, it = 0; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    ucol[rel] -= tempv[it++];
+				}
+				tempv += ldt;
+			    }
+			    ruip[lib] += ilst - fnz;
+			}
+		    } else { /* A(i,j) is in L. */
+			index = Lrowind_bc_ptr[ljb];
+			ldv = index[1];   /* LDA of the dest lusup. */
+			lptrj = BC_HEADER;
+			luptrj = 0;
+			ijb = index[lptrj];
+			while ( ijb != ib ) { /* Search for dest block -- 
+						 blocks are not ordered! */
+			    luptrj += index[lptrj+1];
+			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
+			    ijb = index[lptrj];
+			}
+			/*
+			 * Build indirect table. This is needed because the
+			 * indices are not sorted.
+			 */
+			fnz = FstBlockC( ib );
+			lptrj += LB_DESCRIPTOR;
+			for (i = 0; i < index[lptrj-1]; ++i) {
+			    rel = index[lptrj + i] - fnz;
+			    indirect[rel] = i;
+			}
+			nzval = Lnzval_bc_ptr[ljb] + luptrj;
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    if ( segsize ) {
+/*#pragma _CRI cache_bypass nzval,tempv*/
+				for (it = 0, i = 0; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    nzval[indirect[rel]] -= tempv[it++];
+				}
+				tempv += ldt;
+			    }
+			    nzval += ldv;
+			}
+		    } /* if ib < jb ... */
+		    lptr += nbrow;
+		    luptr += nbrow;
+		} /* for lb ... */
+		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
+		iukp += nsupc;
+	    }  /* if jb == k+1 */
+	} /* if L(:,k) and U(k,:) not empty */
+
+
+	if ( k+1 < nsupers ) {
+	  kcol = PCOL( k+1, grid );
+	  if ( mycol == kcol ) {
+#if ( VAMPIR>=1 )
+	    VT_begin(5);
+#endif
+	    /* Factor diagonal and subdiagonal blocks and test for exact
+	       singularity.  */
+	    pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu,
+		     U_diag_blk_send_req, stat, info);
+
+#if ( VAMPIR>=1 )
+	    VT_end(5);
+#endif
+
+	    /* Process column *kcol* multicasts numeric values of L(:,k+1) 
+	       to process rows. */
+	    lk = LBj( k+1, grid ); /* Local block number. */
+	    lsub1 = Lrowind_bc_ptr[lk];
+ 	    if ( lsub1 ) {
+		msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR;
+		msgcnt[1] = lsub1[1] * SuperSize( k+1 );
+	    } else {
+		msgcnt[0] = 0;
+		msgcnt[1] = 0;
+	    }
+	    scp = &grid->rscp; /* The scope of process row. */
+	    for (pj = 0; pj < Pc; ++pj) {
+		if ( ToSendR[lk][pj] != EMPTY ) {
+		    lusup1 = Lnzval_bc_ptr[lk];
+#if ( PROFlevel>=1 )
+		    TIC(t1);
+#endif
+#if ( VAMPIR>=1 )
+		    VT_begin(1);
+#endif
+		    MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj,
+			      (4*(k+1))%NTAGS, scp->comm, &send_req[pj] );
+		    MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj,
+			     (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] );
+#if ( VAMPIR>=1 )
+		    VT_end(1);
+#endif
+#if ( PROFlevel>=1 )
+		    TOC(t2, t1);
+		    stat->utime[COMM] += t2;
+		    msg_cnt += 2;
+		    msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
+#endif
+#if ( DEBUGlevel>=2 )
+		    printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
+			   iam, k+1, msgcnt[0], msgcnt[1], pj);
+#endif
+		}
+	    } /* for pj ... */
+	  } else { /* Post Recv of block column L(:,k+1). */
+	    if ( ToRecv[k+1] >= 1 ) {
+		scp = &grid->rscp; /* The scope of process row. */
+		MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol,
+			  (4*(k+1))%NTAGS, scp->comm, &recv_req[0]);
+		MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, 
+			  (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]);
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1);
+#endif
+	    }
+	  } /* if mycol == Pc(k+1) */
+        } /* if k+1 < nsupers */
+
+	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+	    /* ---------------------------------------------------
+	       Update all other blocks using block row U(k,:)
+	       --------------------------------------------------- */
+	    for (j = 0; j < nub; ++j) { 
+		lptr = lptr0;
+		luptr = luptr0;
+		jb = usub[iukp];  /* Global block number of block U(k,j). */
+		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
+		nsupc = SuperSize( jb );
+		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+
+		/* Prepare to call DGEMM. */
+		jj = iukp;
+		while ( usub[jj] == klst ) ++jj;
+		ldu = klst - usub[jj++];
+		ncols = 1;
+		full = 1;
+		for (; jj < iukp+nsupc; ++jj) {
+		    segsize = klst - usub[jj];
+		    if ( segsize ) {
+		        ++ncols;
+			if ( segsize != ldu ) full = 0;
+		        if ( segsize > ldu ) ldu = segsize;
+		    }
+		}
+#if ( DEBUGlevel>=3 )
+		printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+		       iam, full, k, jb, ldu, ncols, nsupc);
+		++num_update;
+#endif
+		if ( full ) {
+		    tempu = &uval[rukp];
+		} else { /* Copy block U(k,j) into tempU2d. */
+#if ( DEBUGlevel>=3 )
+		    ++num_copy;
+#endif
+		    tempu = tempU2d;
+		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
+		        segsize = klst - usub[jj];
+			if ( segsize ) {
+			    lead_zero = ldu - segsize;
+			    for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
+			    tempu += lead_zero;
+			    for (i = 0; i < segsize; ++i)
+			        tempu[i] = uval[rukp+i];
+			    rukp += segsize;
+			    tempu += segsize;
+			}
+		    }
+		    tempu = tempU2d;
+		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+		} /* if full ... */
+
+		for (lb = 0; lb < nlb; ++lb) { 
+		    ib = lsub[lptr];       /* Row block L(i,k). */
+		    nbrow = lsub[lptr+1];  /* Number of full rows. */
+		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+		    tempv = tempv2d;
+#ifdef _CRAY
+		    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
+			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			  tempu, &ldu, &beta, tempv, &ldt);
+#elif defined (USE_VENDOR_BLAS)
+		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			   tempu, &ldu, &beta, tempv, &ldt, 1, 1);
+#else
+		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			   tempu, &ldu, &beta, tempv, &ldt);
+#endif
+		    stat->ops[FACT] += 2 * nbrow * ldu * ncols;
+
+		    /* Now gather the result into the destination block. */
+		    if ( ib < jb ) { /* A(i,j) is in U. */
+			ilst = FstBlockC( ib+1 );
+			lib = LBi( ib, grid );
+			index = Ufstnz_br_ptr[lib];
+			ijb = index[iuip[lib]];
+			while ( ijb < jb ) { /* Search for dest block. */
+			    ruip[lib] += index[iuip[lib]+1];
+			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
+			    ijb = index[iuip[lib]];
+			}
+			/* Skip descriptor.  Now point to fstnz index of 
+			   block U(i,j). */
+			iuip[lib] += UB_DESCRIPTOR;
+
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    fnz = index[iuip[lib]++];
+			    if ( segsize ) { /* Nonzero segment in U(k.j). */
+				ucol = &Unzval_br_ptr[lib][ruip[lib]];
+				for (i = 0 ; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    ucol[rel] -= tempv[i];
+				}
+				tempv += ldt;
+			    }
+			    ruip[lib] += ilst - fnz;
+			}
+		    } else { /* A(i,j) is in L. */
+			index = Lrowind_bc_ptr[ljb];
+			ldv = index[1];   /* LDA of the dest lusup. */
+			lptrj = BC_HEADER;
+			luptrj = 0;
+			ijb = index[lptrj];
+			while ( ijb != ib ) { /* Search for dest block -- 
+						 blocks are not ordered! */
+			    luptrj += index[lptrj+1];
+			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
+			    ijb = index[lptrj];
+			}
+			/*
+			 * Build indirect table. This is needed because the
+			 * indices are not sorted for the L blocks.
+			 */
+			fnz = FstBlockC( ib );
+			lptrj += LB_DESCRIPTOR;
+			for (i = 0; i < index[lptrj-1]; ++i) {
+			    rel = index[lptrj + i] - fnz;
+			    indirect[rel] = i;
+			}
+			nzval = Lnzval_bc_ptr[ljb] + luptrj;
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    if ( segsize ) {
+/*#pragma _CRI cache_bypass nzval,tempv*/
+				for (i = 0; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    nzval[indirect[rel]] -= tempv[i];
+				}
+				tempv += ldt;
+			    }
+			    nzval += ldv;
+			}
+		    } /* if ib < jb ... */
+		    lptr += nbrow;
+		    luptr += nbrow;
+		} /* for lb ... */
+		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
+		iukp += nsupc;
+	    } /* for j ... */
+	} /* if  k L(:,k) and U(k,:) are not empty */
+
+    } 
+    /* ------------------------------------------
+       END MAIN LOOP: for k = ...
+       ------------------------------------------ */
+
+#if ( VAMPIR>=1 )
+    VT_end(100);
+    VT_traceoff();
+#endif
+
+    if ( Pr*Pc > 1 ) {
+	SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */
+	SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */
+	if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf);
+	if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf);
+	SUPERLU_FREE(send_req);
+	if ( U_diag_blk_send_req[myrow] ) {
+	    /* wait for last Isend requests to complete, deallocate objects */ 
+	    for (krow = 0; krow < Pr; ++krow)
+		if ( krow != myrow )
+                    MPI_Wait(U_diag_blk_send_req + krow, &status);
+	}
+	SUPERLU_FREE(U_diag_blk_send_req);
+    }
+
+    SUPERLU_FREE(Llu->ujrow);
+    SUPERLU_FREE(tempv2d);
+    SUPERLU_FREE(indirect);
+    SUPERLU_FREE(iuip);
+    SUPERLU_FREE(ruip);
+
+    /* Prepare error message. */
+    if ( *info == 0 ) *info = n + 1;
+#if ( PROFlevel>=1 )
+    TIC(t1);
+#endif
+    MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm );
+#if ( PROFlevel>=1 )
+    TOC(t2, t1);
+    stat->utime[COMM] += t2;
+    {
+	float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
+	
+	MPI_Reduce( &msg_cnt, &msg_cnt_sum,
+		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	MPI_Reduce( &msg_cnt, &msg_cnt_max,
+		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+	MPI_Reduce( &msg_vol, &msg_vol_sum,
+		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	MPI_Reduce( &msg_vol, &msg_vol_max,
+		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+	if ( !iam ) {
+	    printf("\tPDGSTRF comm stat:"
+		   "\tAvg\tMax\t\tAvg\tMax\n"
+		   "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
+		   msg_cnt_sum/Pr/Pc, msg_cnt_max,
+		   msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6);
+	}
+    }
+#endif
+    if ( iinfo == n + 1 ) *info = 0;
+    else *info = iinfo;
+
+
+#if ( PRNTlevel==3 )
+    MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
+    if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo);
+    MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
+    if ( !iam ) printf(".. # total msg\t%d\n", iinfo);
+#endif
+
+#if ( DEBUGlevel>=2 )
+    for (i = 0; i < Pr * Pc; ++i) {
+	if ( iam == i ) {
+	    dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
+	    dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
+	    printf("(%d)\n", iam);
+	    PrintInt10("Recv", nsupers, Llu->ToRecv);
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif
+
+#if ( DEBUGlevel>=3 )
+    printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
+#endif
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgstrf()");
+#endif
+} /* PDGSTRF */
+
+
+/************************************************************************/
+static void pdgstrf2
+/************************************************************************/
+(
+ superlu_options_t *options,
+ int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+ LocalLU_t *Llu, MPI_Request *U_diag_blk_send_req,
+ SuperLUStat_t *stat, int* info
+ )
+/* 
+ * Purpose
+ * =======
+ *
+ *   Panel factorization -- block column k
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the process column that owns block column *k* participates
+ *   in the work.
+ * 
+ * Arguments
+ * =========
+ * options (input) superlu_options_t*
+ *        Only options->ReplaceTinyPivot is read by this routine.
+ * k      (input) int_t (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit (reset to 0 on entry; never negative here)
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ *
+ */
+{
+    int    cols_left, iam, l, pkk, pr;
+    int    incx = 1, incy = 1;
+    int    nsupr; /* number of rows in the block (LDA) */
+    int    luptr;
+    int_t  i, krow, j, jfst, jlst, u_diag_cnt;
+    int_t  nsupc; /* number of columns in the block */
+    int_t  *xsup = Glu_persist->xsup;
+    int_t  Pr;
+    MPI_Status status;
+    MPI_Comm comm = (grid->cscp).comm;
+    double *lusup, temp;
+    double *ujrow, *ublk_ptr; /* pointer to the U block */
+    double alpha = -1;
+    *info = 0;
+
+    /* No quick-return test is performed in this routine. */
+
+    /* Initialization. */
+    iam   = grid->iam;
+    Pr    = grid->nprow;
+    krow  = PROW( k, grid );
+    pkk   = PNUM( PROW(k, grid), PCOL(k, grid), grid );
+    j     = LBj( k, grid ); /* Local block number */
+    jfst  = FstBlockC( k );
+    jlst  = FstBlockC( k+1 );
+    lusup = Llu->Lnzval_bc_ptr[j];
+    nsupc = SuperSize( k );
+    if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1];
+    ublk_ptr = ujrow = Llu->ujrow;
+
+    luptr = 0; /* point to the diagonal entries. */
+    cols_left = nsupc; /* supernode size */
+    u_diag_cnt = 0;
+
+    if ( iam == pkk ) { /* diagonal process */
+
+        if ( U_diag_blk_send_req && U_diag_blk_send_req[krow] ) {
+	    /* There are pending sends - wait for all Isend to complete */
+            for (pr = 0; pr < Pr; ++pr)
+                if (pr != krow)
+                    MPI_Wait(U_diag_blk_send_req + pr, &status);
+        }
+
+	for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */
+	    
+	    /* Diagonal pivot */
+	    i = luptr;
+	    if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) {
+		if ( fabs(lusup[i]) < thresh ) { /* incl. exact 0 if thresh > 0 */
+#if ( PRNTlevel>=2 )
+		    printf("(%d) .. col %d, tiny pivot %e  ",
+			   iam, jfst+j, lusup[i]);
+#endif
+		    /* Keep the new diagonal entry with the same sign. */
+		    if ( lusup[i] < 0 ) lusup[i] = -thresh;
+		    else lusup[i] = thresh;
+#if ( PRNTlevel>=2 )
+		    printf("replaced by %e\n", lusup[i]);
+#endif
+		    ++(stat->TinyPivots);
+		}
+	    }
+
+	    for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt)
+                ublk_ptr[u_diag_cnt] = lusup[i]; /* copy one row of U */
+
+	    if ( ujrow[0] == 0.0 ) { /* Test for singularity. */
+		*info = j+jfst+1;
+	    } else { /* Scale the j-th column. */
+		temp = 1.0 / ujrow[0];
+		for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp;
+		stat->ops[FACT] += nsupr-j-1;
+	    }
+
+	    /* Rank-1 update of the trailing submatrix. */
+	    if ( --cols_left ) {
+		l = nsupr - j - 1;
+#ifdef _CRAY
+		SGER(&l, &cols_left, &alpha, &lusup[luptr+1], &incx,
+		     &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
+#else
+		dger_(&l, &cols_left, &alpha, &lusup[luptr+1], &incx,
+		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
+#endif
+		stat->ops[FACT] += 2 * l * cols_left;
+
+	    }
+	    ujrow = ublk_ptr + u_diag_cnt;  /* move to next row of U */
+	    luptr += nsupr + 1;	                 /* move to next column */
+
+	} /* for column j ... */
+
+	if ( U_diag_blk_send_req && iam == pkk ) { /* Send the U block */
+	    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+	    for (pr = 0; pr < Pr; ++pr)
+		if (pr != krow)
+		    MPI_Isend(ublk_ptr, u_diag_cnt, MPI_DOUBLE, pr,
+			      ((k<<2)+2)%NTAGS, comm, U_diag_blk_send_req + pr);
+	    /* Slot krow is never used for an Isend; it only flags outstanding
+	       sends.  The cast keeps this valid for pointer-typed handles. */
+	    U_diag_blk_send_req[krow] = (MPI_Request) 1;
+	}
+    } else  { /* non-diagonal process */
+	/* Receive the diagonal block of U */
+        MPI_Recv(ublk_ptr, (nsupc*(nsupc+1))>>1, MPI_DOUBLE,
+		 krow, ((k<<2)+2)%NTAGS, comm, &status);
+
+	for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */
+	    u_diag_cnt += cols_left;
+
+	    if ( !lusup ) { /* empty block column */
+		--cols_left;
+		if ( ujrow[0] == 0.0 ) *info = j+jfst+1;
+		ujrow = ublk_ptr + u_diag_cnt; /* still move to next row of U */
+		continue;
+	    }
+
+	    /* Test for singularity. */
+	    if ( ujrow[0] == 0.0 ) {
+		*info = j+jfst+1;
+	    } else {
+		/* Scale the j-th column. */
+		temp = 1.0 / ujrow[0];
+		for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp;
+		stat->ops[FACT] += nsupr;
+	    }
+
+	    /* Rank-1 update of the trailing submatrix. */
+	    if ( --cols_left ) {
+#ifdef _CRAY
+		SGER(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, 
+		     &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
+#else
+		dger_(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, 
+		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
+#endif
+		stat->ops[FACT] += 2 * nsupr * cols_left;
+	    }
+
+	    ujrow = ublk_ptr + u_diag_cnt; /* move to next row of U */
+	    luptr += nsupr;                      /* move to next column */
+
+	} /* for column j ... */
+
+    } /* end if pkk ... */
+
+} /* PDGSTRF2 */
+
+
+/************************************************************************/
+static void pdgstrs2
+/************************************************************************/
+#ifdef _CRAY
+(
+ int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+ LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3
+ )
+#else
+(
+ int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+ LocalLU_t *Llu, SuperLUStat_t *stat
+ )
+#endif
+/* 
+ * Purpose
+ * =======
+ *   Perform parallel triangular solves
+ *           U(k,:) := L(k,k) \ A(k,:)   (L(k,k) unit lower triangular).
+ *   Only the process row that owns block row *k* participates
+ *   in the work.
+ * 
+ * Arguments
+ * =========
+ *
+ * m      (input) int_t (global)
+ *        Number of rows in the matrix. Not referenced in this routine.
+ *
+ * k      (input) int_t (global)
+ *        The row number of the block row to be factorized.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization; 
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ */
+{
+    int    iam, pkk;
+    int    incx = 1;
+    int    nsupr; /* number of rows in the block L(:,k) (LDA) */
+    int    segsize;
+    int_t  nsupc; /* number of columns in the block */
+    int_t  luptr, iukp, rukp;
+    int_t  b, gb, j, klst, knsupc, lk, nb;
+    int_t  *xsup = Glu_persist->xsup;
+    int_t  *usub;
+    double *lusup, *uval;
+
+    /* Quick return if the local part of block row U(k,:) is empty. */
+    lk = LBi( k, grid ); /* Local block number */
+    if ( !Llu->Unzval_br_ptr[lk] ) return;
+
+    /* Initialization. */
+    iam  = grid->iam;
+    pkk  = PNUM( PROW(k, grid), PCOL(k, grid), grid );
+    klst = FstBlockC( k+1 );
+    knsupc = SuperSize( k );
+    usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
+    uval = Llu->Unzval_br_ptr[lk];
+    nb = usub[0];
+    iukp = BR_HEADER;
+    rukp = 0;
+    if ( iam == pkk ) {
+	lk = LBj( k, grid );
+	nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
+	lusup = Llu->Lnzval_bc_ptr[lk];
+    } else {
+	nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */
+	lusup = Llu->Lval_buf_2[k%2];
+    }
+
+    /* Loop through all the blocks in block row U(k,:). */
+    for (b = 0; b < nb; ++b) {
+	gb = usub[iukp];
+	nsupc = SuperSize( gb );
+	iukp += UB_DESCRIPTOR;
+
+	/* Loop through all the segments in the block. */
+	for (j = 0; j < nsupc; ++j) {
+	    segsize = klst - usub[iukp++]; 
+	    if ( segsize ) { /* Nonzero segment. */
+		luptr = (knsupc - segsize) * (nsupr + 1); /* diagonal entry */
+#ifdef _CRAY
+		STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, 
+		      &uval[rukp], &incx);
+#elif defined (USE_VENDOR_BLAS)
+		dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, 
+		       &uval[rukp], &incx, 1, 1, 1);
+#else
+		dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, 
+		       &uval[rukp], &incx);
+#endif
+		stat->ops[FACT] += segsize * (segsize + 1);
+		rukp += segsize;
+	    }
+	}
+    } /* for b ... */
+
+} /* PDGSTRS2 */
+
+static int /* Check that the pending message fits in buf_size elements. */
+probe_recv(int iam, int source, int tag, MPI_Datatype datatype, MPI_Comm comm,
+	   int buf_size)
+{
+    MPI_Status status; /* envelope of the probed message */
+    int count;         /* number of `datatype` elements in that message */
+
+    MPI_Probe( source, tag, comm, &status );
+    MPI_Get_count( &status, datatype, &count );
+    if ( count > buf_size ) { /* message is larger than the stated buffer */
+        printf("(%d) Recv'ed count %d > buffer size %d\n",
+	       iam, count, buf_size);
+	exit(-1);
+    }
+    return 0; /* only reached when the message fits */
+}
diff --git a/SRC/pdgstrs.c b/SRC/pdgstrs.c
new file mode 100644
index 0000000..34f1a07
--- /dev/null
+++ b/SRC/pdgstrs.c
@@ -0,0 +1,1341 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Solves a system of distributed linear equations A*X = B with a
+ * general N-by-N matrix A using the LU factors computed previously.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+/*
+ * Sketch of the algorithm for L-solve:
+ * =======================
+ *
+ * Self-scheduling loop:
+ *
+ *   while ( not finished ) { .. use message counter to control
+ *
+ *      receive a message;
+ * 	
+ * 	if ( message is Xk ) {
+ * 	    perform local block modifications into lsum[];
+ *                 lsum[i] -= L_i,k * X[k]
+ *          if all local updates done, Isend lsum[] to diagonal process;
+ *
+ *      } else if ( message is LSUM ) { .. this must be a diagonal process 
+ *          accumulate LSUM;
+ *          if ( all LSUM are received ) {
+ *              perform triangular solve for Xi;
+ *              Isend Xi down to the current process column;
+ *              perform local block modifications into lsum[];
+ *          }
+ *      }
+ *   }
+ *
+ * 
+ * Auxiliary data structures: lsum[] / ilsum (pointer to lsum array)
+ * =======================
+ *
+ * lsum[] array (local)
+ *   + lsum has "nrhs" columns, row-wise is partitioned by supernodes
+ *   + stored by row blocks, column wise storage within a row block
+ *   + prepend a header recording the global block number.
+ *
+ *         lsum[]                        ilsum[nsupers + 1]
+ *
+ *         -----
+ *         | | |  <- header of size 2     ---
+ *         --------- <--------------------| |
+ *         | | | | |			  ---
+ * 	   | | | | |	      |-----------| |		
+ *         | | | | | 	      |           ---
+ *	   ---------          |   |-------| |
+ *         | | |  <- header   |   |       ---
+ *         --------- <--------|   |  |----| |
+ *         | | | | |		  |  |    ---
+ * 	   | | | | |              |  |
+ *         | | | | |              |  |
+ *	   ---------              |  |
+ *         | | |  <- header       |  |
+ *         --------- <------------|  |
+ *         | | | | |                 |
+ * 	   | | | | |                 |
+ *         | | | | |                 |
+ *	   --------- <---------------|
+ */
+  
+/*#define ISEND_IRECV*/
+
+/*
+ * Function prototypes
+ */
+#ifdef _CRAY
+fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
+		   double*, int*, double*, int*);
+_fcd ftcs1;
+_fcd ftcs2;
+_fcd ftcs3;
+#endif
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Re-distribute B on the diagonal processes of the 2D process mesh.
+ * 
+ * Note
+ * ====
+ *   This routine can only be called after the routine pxgstrs_init(),
+ *   in which the structures of the send and receive buffers are set up.
+ *
+ * Arguments
+ * =========
+ * 
+ * B      (input) double*
+ *        The distributed right-hand side matrix of the possibly
+ *        equilibrated system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * ldb    (input) int (local)
+ *        Leading dimension of matrix B.
+ *
+ * fst_row (input) int (global)
+ *        The row number of B's first row in the global matrix.
+ *
+ * ilsum  (input) int* (global)
+ *        Starting position of each supernode in a full array.
+ *
+ * x      (output) double*
+ *        The solution vector. It is valid only on the diagonal processes.
+ *
+ * ScalePermstruct (input) ScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * SOLVEstruct (input) SOLVEstruct_t*
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * Return value
+ * ============
+ * </pre>
+ */
+
+int_t
+pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb,
+                      int_t fst_row, int_t *ilsum, double *x,
+		      ScalePermstruct_t *ScalePermstruct,
+		      Glu_persist_t *Glu_persist,
+		      gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct)
+{
+    int  *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
+    int  *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
+    int  *ptr_to_ibuf, *ptr_to_dbuf;
+    int_t  *perm_r, *perm_c; /* row and column permutation vectors */
+    int_t  *send_ibuf, *recv_ibuf;
+    double *send_dbuf, *recv_dbuf;
+    int_t  *xsup, *supno;
+    int_t  i, ii, irow, gbi, j, jj, k, knsupc, l, lk;
+    int    p, procs;
+    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pdReDistribute_B_to_X()");
+#endif
+
+    /* ------------------------------------------------------------
+       INITIALIZATION: the 8 count/displacement arrays are slices of
+       B_to_X_SendCnt, spaced procs apart. -----------------------*/
+    perm_r = ScalePermstruct->perm_r;
+    perm_c = ScalePermstruct->perm_c;
+    procs = grid->nprow * grid->npcol;
+    xsup = Glu_persist->xsup;
+    supno = Glu_persist->supno;
+    SendCnt      = gstrs_comm->B_to_X_SendCnt;
+    SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt +   procs;
+    RecvCnt      = gstrs_comm->B_to_X_SendCnt + 2*procs;
+    RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs;
+    sdispls      = gstrs_comm->B_to_X_SendCnt + 4*procs;
+    sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs;
+    rdispls      = gstrs_comm->B_to_X_SendCnt + 6*procs;
+    rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs;
+    ptr_to_ibuf  = gstrs_comm->ptr_to_ibuf;
+    ptr_to_dbuf  = gstrs_comm->ptr_to_dbuf;
+
+    /* ------------------------------------------------------------
+       NOW COMMUNICATE THE ACTUAL DATA.
+       ------------------------------------------------------------*/
+    k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
+    l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
+    if ( !(send_ibuf = intMalloc_dist(k + l)) )
+        ABORT("Malloc fails for send_ibuf[].");
+    recv_ibuf = send_ibuf + k; /* receive part follows the k send entries */
+    if ( !(send_dbuf = doubleMalloc_dist((k + l)* (size_t)nrhs)) )
+        ABORT("Malloc fails for send_dbuf[].");
+    recv_dbuf = send_dbuf + k * nrhs; /* same layout, nrhs values per row */
+    
+    for (p = 0; p < procs; ++p) { /* per-destination write cursors */
+        ptr_to_ibuf[p] = sdispls[p];
+        ptr_to_dbuf[p] = sdispls[p] * nrhs;
+    }
+
+    /* Copy the row indices and values to the send buffer. */
+    for (i = 0, l = fst_row; i < m_loc; ++i, ++l) {
+        irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */
+	gbi = BlockNum( irow );
+	p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */
+	k = ptr_to_ibuf[p];
+	send_ibuf[k] = irow;
+	k = ptr_to_dbuf[p];
+	RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
+	    send_dbuf[k++] = B[i + j*ldb];
+	}
+	++ptr_to_ibuf[p];
+	ptr_to_dbuf[p] += nrhs;
+    }
+
+    /* Communicate the (permuted) row indices. */
+    MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
+		  recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
+
+    /* Communicate the numerical values. */
+    MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE,
+		  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE,
+		  grid->comm);
+    
+    /* ------------------------------------------------------------
+       Copy buffer into X on the diagonal processes.
+       ------------------------------------------------------------*/
+    ii = 0;
+    for (p = 0; p < procs; ++p) {
+        jj = rdispls_nrhs[p];
+        for (i = 0; i < RecvCnt[p]; ++i) {
+	    /* Only the diagonal processes do this; the off-diagonal processes
+	       have 0 RecvCnt. */
+	    irow = recv_ibuf[ii]; /* The permuted row index. */
+	    k = BlockNum( irow );
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );  /* Local block number. */
+	    l = X_BLK( lk );
+	    x[l - XK_H] = k;      /* Block number prepended in the header. */
+	    irow = irow - FstBlockC(k); /* Relative row number in X-block */
+	    RHS_ITERATE(j) { /* X-block is column major, leading dim knsupc */
+	        x[l + irow + j*knsupc] = recv_dbuf[jj++];
+	    }
+	    ++ii;
+	}
+    }
+
+    SUPERLU_FREE(send_ibuf); /* recv_ibuf lives in the same allocation */
+    SUPERLU_FREE(send_dbuf); /* recv_dbuf lives in the same allocation */
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Exit pdReDistribute_B_to_X()");
+#endif
+    return 0;
+} /* pdReDistribute_B_to_X */
+
+/*! \brief Scatter the solution X held on the diagonal processes back into B.
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Re-distribute X on the diagonal processes to B distributed on all
+ *   the processes.  Each diagonal process packs one record per row of its
+ *   X-blocks (global row index plus nrhs values) for the process named by
+ *   row_to_proc[]; two MPI_Alltoallv calls exchange the records; every
+ *   process then stores the rows it received into its local B.
+ *
+ * Note
+ * ====
+ *   This routine can only be called after the routine pxgstrs_init(),
+ *   in which the structures of the send and receive buffers are set up.
+ *
+ * Return value
+ * ============
+ *   Always 0.  A failed buffer allocation goes through ABORT().
+ * </pre>
+ */
+
+int_t
+pdReDistribute_X_to_B(int_t n, double *B, int_t m_loc, int_t ldb, int_t fst_row,
+		      int_t nrhs, double *x, int_t *ilsum,
+		      ScalePermstruct_t *ScalePermstruct,
+		      Glu_persist_t *Glu_persist, gridinfo_t *grid,
+		      SOLVEstruct_t *SOLVEstruct)
+{
+    /* xsup is never named explicitly below; presumably the block macros
+       (SuperSize, FstBlockC) expand to it, so the local name is kept. */
+    int_t  *xsup = Glu_persist->xsup;
+    int_t  *supno = Glu_persist->supno;
+    int_t  nsupers = supno[n-1] + 1;
+
+    /* Process grid and diagonal-process bookkeeping. */
+    int  iam = grid->iam;
+    int  procs = grid->nprow * grid->npcol;
+    int_t  num_diag_procs = SOLVEstruct->num_diag_procs;
+    int_t  *diag_procs = SOLVEstruct->diag_procs;
+    int_t  *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */
+
+    /* Eight int tables of procs entries each, laid out back to back from
+       X_to_B_SendCnt; the *_nrhs ones drive the exchange of the values. */
+    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
+    int  *tab = gstrs_comm->X_to_B_SendCnt;
+    int  *SendCnt = tab,           *SendCnt_nrhs = tab +   procs;
+    int  *RecvCnt = tab + 2*procs, *RecvCnt_nrhs = tab + 3*procs;
+    int  *sdispls = tab + 4*procs, *sdispls_nrhs = tab + 5*procs;
+    int  *rdispls = tab + 6*procs, *rdispls_nrhs = tab + 7*procs;
+    int  *ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; /* per-destination index cursor */
+    int  *ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; /* per-destination value cursor */
+
+    /* Total number of row records this process sends / receives. */
+    int_t  nsend = sdispls[procs-1] + SendCnt[procs-1];
+    int_t  nrecv = rdispls[procs-1] + RecvCnt[procs-1];
+
+    int_t  *send_ibuf, *recv_ibuf;
+    double *send_dbuf, *recv_dbuf, *vals;
+    int_t  i, irow, j, k, knsupc, lk, xoff;
+    int  p, q;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pdReDistribute_X_to_B()");
+#endif
+
+    /* ------------------------------------------------------------
+       ALLOCATE THE BUFFERS: SEND PART FIRST, RECEIVE PART BEHIND IT.
+       ------------------------------------------------------------*/
+    send_ibuf = intMalloc_dist(nsend + nrecv);
+    if ( !send_ibuf ) ABORT("Malloc fails for send_ibuf[].");
+    recv_ibuf = send_ibuf + nsend;
+    send_dbuf = doubleMalloc_dist((nsend + nrecv)*nrhs);
+    if ( !send_dbuf ) ABORT("Malloc fails for send_dbuf[].");
+    recv_dbuf = send_dbuf + nsend * nrhs;
+
+    /* Start every per-destination cursor at that destination's offset. */
+    for (p = 0; p < procs; ++p) {
+        ptr_to_ibuf[p] = sdispls[p];
+        ptr_to_dbuf[p] = sdispls_nrhs[p];
+    }
+
+    /* ------------------------------------------------------------
+       PACK THE X-BLOCKS HELD BY THIS PROCESS (DIAGONAL PROCESSES ONLY).
+       ------------------------------------------------------------*/
+    for (p = 0; p < num_diag_procs; ++p) {
+	if ( iam != diag_procs[p] ) continue;
+
+	/* Diagonal process number p walks blocks p, p+num_diag_procs, ... */
+	for (k = p; k < nsupers; k += num_diag_procs) {
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );   /* Local block number */
+	    xoff = X_BLK( lk );    /* Start of block k inside x[] */
+	    for (i = 0; i < knsupc; ++i) {
+		irow = FstBlockC( k ) + i; /* Row index; Pc' is not applied here */
+		q = row_to_proc[irow];     /* Destination process */
+		send_ibuf[ptr_to_ibuf[q]++] = irow;
+		vals = send_dbuf + ptr_to_dbuf[q];
+		RHS_ITERATE(j) { /* RHS stored in row major in buffer. */
+		    vals[j] = x[xoff + i + j*knsupc];
+		}
+		ptr_to_dbuf[q] += nrhs;
+	    }
+	}
+    }
+
+    /* COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES. */
+    MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
+		  recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
+    MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE,
+		  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE,
+		  grid->comm);
+
+    /* COPY THE BUFFER INTO B. */
+    vals = recv_dbuf;
+    for (i = 0; i < m_loc; ++i) {
+	irow = recv_ibuf[i] - fst_row; /* Relative row number */
+	RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
+	    B[irow + j*ldb] = *vals++;
+	}
+    }
+
+    SUPERLU_FREE(send_ibuf);
+    SUPERLU_FREE(send_dbuf);
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Exit pdReDistribute_X_to_B()");
+#endif
+    return 0;
+
+} /* pdReDistribute_X_to_B */
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PDGSTRS solves a system of distributed linear equations
+ * A*X = B with a general N-by-N matrix A using the LU factorization
+ * computed by PDGSTRF.
+ * If the equilibration, and row and column permutations were performed,
+ * the LU factorization was performed for A1 where
+ *     A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ * and the linear system solved is
+ *     A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and
+ * the permutation to B1 by Pc*Pr is applied internally in this routine.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures storing the L and U factors that
+ *        PDGSTRF computed for the possibly scaled and permuted matrix
+ *        A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U (see superlu_ddefs.h).
+ *
+ * ScalePermstruct (input) ScalePermstruct_t*
+ *        Scaling/permutation data; only forwarded to pdReDistribute_*().
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) double*
+ *        On entry, the distributed right-hand side matrix of the possibly
+ *        equilibrated system. That is, B may be overwritten by diag(R)*B.
+ *        On exit, the distributed solution matrix Y of the possibly
+ *        equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X,
+ *        and X is the solution of the original system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of matrix B.
+ *
+ * fst_row (input) int (global)
+ *        The row number of B's first row in the global matrix.
+ *
+ * ldb    (input) int (local)
+ *        The leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ * 
+ * SOLVEstruct (input) SOLVEstruct_t* (global)
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * </pre>       
+ */
+
+void
+pdgstrs(int_t n, LUstruct_t *LUstruct, 
+	ScalePermstruct_t *ScalePermstruct,
+	gridinfo_t *grid, double *B,
+	int_t m_loc, int_t fst_row, int_t ldb, int nrhs,
+	SOLVEstruct_t *SOLVEstruct,
+	SuperLUStat_t *stat, int *info)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    double alpha = 1.0;
+    double zero = 0.0;
+    double *lsum;  /* Local running sum of the updates to B-components */
+    double *x;     /* X component at step k. */
+		    /* NOTE: x and lsum are of same size. */
+    double *lusup, *dest;
+    double *recvbuf, *tempv;
+    double *rtemp; /* Result of full matrix-vector multiply. */
+    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
+    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+    int_t  kcol, krow, mycol, myrow;
+    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
+    int_t  nb, nlb, nub, nsupers;
+    int_t  *xsup, *supno, *lsub, *usub;
+    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
+    int    Pc, Pr, iam;
+    int    knsupc, nsupr;
+    int    ldalsum;   /* Number of lsum entries locally owned. */
+    int    maxrecvsz, p, pi;
+    int_t  **Lrowind_bc_ptr;
+    double **Lnzval_bc_ptr;
+    MPI_Status status;
+    MPI_Request *send_req, recv_req;
+    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
+
+    /*-- Counts used for L-solve --*/
+    int_t  *fmod;         /* Modification count for L-solve --
+                             Count the number of local block products to
+                             be summed into lsum[lk]. */
+    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int_t  *frecv;        /* Count of lsum[lk] contributions to be received
+                             from processes in this row. 
+                             It is only valid on the diagonal processes. */
+    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int_t  nleaf = 0, nroot = 0;
+
+    /*-- Counts used for U-solve --*/
+    int_t  *bmod;         /* Modification count for U-solve. */
+    int_t  **bsendx_plist = Llu->bsendx_plist;
+    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int_t  *brecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
+    double t, t_start; /* t: per-phase timer (reset when PRNTlevel>=2); t_start: entry time */
+#if ( DEBUGlevel>=2 )
+    int_t Ublocks = 0;
+#endif
+
+    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+ 
+    t = t_start = SuperLU_timer_();
+
+    /* Test input parameters. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( nrhs < 0 ) *info = -9;
+    if ( *info ) {
+	pxerr_dist("PDGSTRS", grid, -*info);
+	return;
+    }
+	
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    xsup = Glu_persist->xsup;
+    supno = Glu_persist->supno;
+    nsupers = supno[n-1] + 1;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pdgstrs()");
+#endif
+
+    stat->ops[SOLVE] = 0.0;
+    Llu->SolveMsgSent = 0;
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PDGSTRS. */
+    if ( !(fmod = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for fmod[].");
+    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
+    if ( !(frecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for frecv[].");
+    Llu->frecv = frecv;
+
+    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
+    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
+	ABORT("Malloc fails for send_req[].");
+
+#ifdef _CRAY
+    ftcs1 = _cptofcd("L", strlen("L"));
+    ftcs2 = _cptofcd("N", strlen("N"));
+    ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+
+    /* Obtain ilsum[] and ldalsum for process column 0. */
+    ilsum = Llu->ilsum;
+    ldalsum = Llu->ldalsum;
+
+    /* Allocate working storage.  x and lsum have the same size, so both
+       sizes are computed in size_t: ldalsum*nrhs can overflow an int. */
+    knsupc = sp_ienv_dist(3);
+    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
+    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) )
+	ABORT("Calloc fails for lsum[].");
+    if ( !(x = doubleCalloc_dist(((size_t)ldalsum)*nrhs + nlb*XK_H)) )
+	ABORT("Calloc fails for x[].");
+    if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for recvbuf[].");
+    if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) )
+	ABORT("Calloc fails for rtemp[].");
+
+    /*---------------------------------------------------
+     * Forward solve Ly = b.
+     *---------------------------------------------------*/
+    /* Redistribute B into X on the diagonal processes. */
+    pdReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, 
+			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
+
+    /* Set up the headers in lsum[]. */
+    ii = 0;
+    for (k = 0; k < nsupers; ++k) {
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    lk = LBi( k, grid );   /* Local block number. */
+	    il = LSUM_BLK( lk );
+	    lsum[il - LSUM_H] = k; /* Block number prepended in the header. */
+	}
+	ii += knsupc;
+    }
+
+    /*
+     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+#if 1
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid );
+		if ( mycol != kcol && fmod[lk] )
+		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
+	    }
+	}
+	/*PrintInt10("mod_bit", nlb, mod_bit);*/
+	
+#if ( PROFlevel>=2 )
+	t_reduce_tmp = SuperLU_timer_();
+#endif
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+#if ( PROFlevel>=2 )
+	t_reduce += SuperLU_timer_() - t_reduce_tmp;
+#endif
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid );
+		if ( mycol == kcol ) { /* diagonal process */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+		}
+	    }
+	}
+
+#else /* old */
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && fmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
+		    assert( frecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+#endif
+    }
+
+    /* ---------------------------------------------------------
+       Solve the leaf nodes first by all the diagonal processes.
+       --------------------------------------------------------- */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nleaf %4d\n", iam, nleaf);
+#endif
+    for (k = 0; k < nsupers && nleaf; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
+		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
+		--nleaf;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p) {
+		    if ( fsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   MPI_DOUBLE, pi, Xk, grid->comm,
+                                   &send_req[Llu->SolveMsgSent++]);
+#if 0
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				 MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		}
+		/*
+		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		 */
+		nb = lsub[0] - 1;
+		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		luptr = knsupc; /* Skip diagonal block L(k,k). */
+		
+		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			   send_req, stat);
+	    }
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+    /* -----------------------------------------------------------
+       Compute the internal nodes asynchronously by all processes.
+       ----------------------------------------------------------- */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
+	   iam, nfrecvx, nfrecvmod, nleaf);
+#endif
+
+    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE,
+                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
+
+	k = *recvbuf; /* Block number, stored as a double in the header. */
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+	
+	switch ( status.MPI_TAG ) {
+	  case Xk:
+	      --nfrecvx;
+	      lk = LBj( k, grid ); /* Local block number, column-wise. */
+	      lsub = Lrowind_bc_ptr[lk];
+	      lusup = Lnzval_bc_ptr[lk];
+	      if ( lsub ) {
+		  nb   = lsub[0];
+		  lptr = BC_HEADER;
+		  luptr = 0;
+		  knsupc = SuperSize( k );
+
+		  /*
+		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		   */
+		  dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			     send_req, stat);
+	      } /* if lsub */
+
+	      break;
+
+	  case LSUM: /* Receiver must be a diagonal process */
+	      --nfrecvmod;
+	      lk = LBi( k, grid ); /* Local block number, row-wise. */
+	      ii = X_BLK( lk );
+	      knsupc = SuperSize( k );
+	      tempv = &recvbuf[LSUM_H];
+	      RHS_ITERATE(j) {
+		  for (i = 0; i < knsupc; ++i)
+		      x[i + ii + j*knsupc] += tempv[i + j*knsupc];
+	      }
+
+	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
+		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
+		  lk = LBj( k, grid ); /* Local block number, column-wise. */
+		  lsub = Lrowind_bc_ptr[lk];
+		  lusup = Lnzval_bc_ptr[lk];
+		  nsupr = lsub[1];
+#ifdef _CRAY
+		  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+			lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		  stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
+#if ( DEBUGlevel>=2 )
+		  printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		  /*
+		   * Send Xk to process column Pc[k].
+		   */
+		  kcol = PCOL( k, grid );
+		  for (p = 0; p < Pr; ++p) {
+		      if ( fsendx_plist[lk][p] != EMPTY ) {
+			  pi = PNUM( p, kcol, grid );
+
+			  MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H,
+                                     MPI_DOUBLE, pi, Xk, grid->comm,
+                                     &send_req[Llu->SolveMsgSent++]);
+#if 0
+			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				    MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				 iam, x[ii-XK_H], pi);
+#endif
+		      }
+                  }
+		  /*
+		   * Perform local block modifications.
+		   */
+		  nb = lsub[0] - 1;
+		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		  luptr = knsupc; /* Skip diagonal block L(k,k). */
+
+		  dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
+			     send_req, stat);
+	      } /* if */
+
+	      break;
+
+#if ( DEBUGlevel>=2 )
+	    default:
+	      printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+	      break;
+#endif
+	  } /* switch */
+
+    } /* while not finished ... */
+
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+#if ( DEBUGlevel==2 )
+    {
+      printf("(%d) .. After L-solve: y =\n", iam);
+      for (i = 0, k = 0; k < nsupers; ++k) {
+	  krow = PROW( k, grid );
+	  kcol = PCOL( k, grid );
+	  if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	      knsupc = SuperSize( k );
+	      lk = LBi( k, grid );
+	      ii = X_BLK( lk );
+	      for (j = 0; j < knsupc; ++j)
+		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
+	      fflush(stdout);
+	  }
+	  MPI_Barrier( grid->comm );
+      }
+    }
+#endif
+
+    SUPERLU_FREE(fmod);
+    SUPERLU_FREE(frecv); /* NOTE(review): Llu->frecv still points at this freed array. */
+    SUPERLU_FREE(rtemp);
+
+    /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/
+
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status);
+    Llu->SolveMsgSent = 0;
+
+    MPI_Barrier( grid->comm );
+
+
+    /*---------------------------------------------------
+     * Back solve Ux = y.
+     *
+     * The Y components from the forward solve are already
+     * on the diagonal processes.
+     *---------------------------------------------------*/
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PDGSTRS. */
+    if ( !(bmod = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for bmod[].");
+    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
+    if ( !(brecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for brecv[].");
+    Llu->brecv = brecv;
+
+    /*
+     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+#if 1
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid ); /* root process in this row scope */
+		if ( mycol != kcol && bmod[lk] )
+		    mod_bit[lk] = 1;  /* Contribution from off-diagonal */
+	    }
+	}
+
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid ); /* root process in this row scope. */
+		if ( mycol == kcol ) { /* diagonal process */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+
+#else /* old */
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && bmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+#endif
+    }
+
+    /* Re-initialize lsum to zero. Each block header is already in place. */
+    for (k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    il = LSUM_BLK( lk );
+	    dest = &lsum[il];
+	    RHS_ITERATE(j) {
+		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero;
+	    }
+	}
+    }
+
+    /* Set up additional pointers for the index and value arrays of U.
+       nub is the number of local block columns. */
+    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
+    if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) )
+	ABORT("Calloc fails for Urbs[]"); /* Record number of nonzero
+					     blocks in a block column. */
+    Urbs1 = Urbs + nub;
+    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
+        ABORT("Malloc fails for Ucb_indptr[]");
+    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
+        ABORT("Malloc fails for Ucb_valptr[]");
+
+    /* Count number of row blocks in a block column. 
+       One pass of the skeleton graph of U. */
+    for (lk = 0; lk < nlb; ++lk) {
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    /* usub[0] -- number of column blocks in this block row. */
+#if ( DEBUGlevel>=2 )
+	    Ublocks += usub[0];
+#endif
+	    i = BR_HEADER; /* Pointer in index array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];            /* Global block number */
+		++Urbs[LBj(k,grid)];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+    /* Set up the vertical linked lists for the row blocks.
+       One pass of the skeleton graph of U. */
+    for (lb = 0; lb < nub; ++lb) {
+	if ( Urbs[lb] ) { /* Not an empty block column. */
+	    if ( !(Ucb_indptr[lb]
+		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+		ABORT("Malloc fails for Ucb_indptr[lb][]");
+	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
+		ABORT("Malloc fails for Ucb_valptr[lb][]");
+	}
+    }
+    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    i = BR_HEADER; /* Pointer in index array. */
+	    j = 0;         /* Pointer in nzval array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];          /* Global block number, column-wise. */
+		ljb = LBj( k, grid ); /* Local block number, column-wise. */
+		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
+		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
+		Ucb_valptr[ljb][Urbs1[ljb]] = j;
+		++Urbs1[ljb];
+		j += usub[i+1];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+#if ( DEBUGlevel>=2 )
+    for (p = 0; p < Pr*Pc; ++p) {
+	if (iam == p) {
+	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("(%2d) Local col %2d: # row blocks %2d\n",
+		       iam, lb, Urbs[lb]);
+		if ( Urbs[lb] ) {
+		    for (i = 0; i < Urbs[lb]; ++i)
+			printf("(%2d) .. row blk %2d:\
+                               lbnum %d, indpos %d, valpos %d\n",
+			       iam, i, 
+			       Ucb_indptr[lb][i].lbnum,
+			       Ucb_indptr[lb][i].indpos,
+			       Ucb_valptr[lb][i]);
+		}
+	    }
+	}
+	MPI_Barrier( grid->comm );
+    }
+    for (p = 0; p < Pr*Pc; ++p) {
+	if ( iam == p ) {
+	    printf("\n(%d) bsendx_plist[][]", iam);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("\n(%d) .. local col %2d: ", iam, lb);
+		for (i = 0; i < Pr; ++i)
+		    printf("%4d", bsendx_plist[lb][i]);
+	    }
+	    printf("\n");
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif /* DEBUGlevel */
+
+
+#if ( PRNTlevel>=3 )
+    t = SuperLU_timer_() - t;
+    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+    /*
+     * Solve the roots first by all the diagonal processes.
+     */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nroot %4d\n", iam, nroot);
+#endif
+    for (k = nsupers-1; k >= 0 && nroot; --k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid ); /* Local block number, row-wise. */
+	    if ( brecv[lk]==0 && bmod[lk]==0 ) {
+		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
+		--nroot;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p) {
+		    if ( bsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                   MPI_DOUBLE, pi, Xk, grid->comm,
+                                   &send_req[Llu->SolveMsgSent++]);
+#if 0
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                  MPI_DOUBLE, pi, Xk,
+                                  grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		}
+		/*
+		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
+		 */
+		if ( Urbs[lk] ) 
+		    dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+			       send_req, stat);
+	    } /* if root ... */
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+
+    /*
+     * Compute the internal nodes asynchronously by all processes.
+     */
+    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE,
+                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
+	k = *recvbuf; /* Block number, stored as a double in the header. */
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+
+	switch ( status.MPI_TAG ) {
+	    case Xk:
+	        --nbrecvx;
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		/*
+		 * Perform local block modifications:
+		 *         lsum[i] -= U_i,k * X[k]
+		 */
+		dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
+			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
+			   send_req, stat);
+
+	        break;
+
+	    case LSUM: /* Receiver must be a diagonal process */
+		--nbrecvmod;
+		lk = LBi( k, grid ); /* Local block number, row-wise. */
+		ii = X_BLK( lk );
+		knsupc = SuperSize( k );
+		tempv = &recvbuf[LSUM_H];
+		RHS_ITERATE(j) {
+		    for (i = 0; i < knsupc; ++i)
+			x[i + ii + j*knsupc] += tempv[i + j*knsupc];
+		}
+
+		if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
+		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
+		    lk = LBj( k, grid ); /* Local block number, column-wise. */
+		    lsub = Lrowind_bc_ptr[lk];
+		    lusup = Lnzval_bc_ptr[lk];
+		    nsupr = lsub[1];
+#ifdef _CRAY
+		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+			  lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		    stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    kcol = PCOL( k, grid );
+		    for (p = 0; p < Pr; ++p) {
+			if ( bsendx_plist[lk][p] != EMPTY ) {
+			    pi = PNUM( p, kcol, grid );
+
+			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                       MPI_DOUBLE, pi, Xk, grid->comm,
+                                       &send_req[Llu->SolveMsgSent++] );
+#if 0
+			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                      MPI_DOUBLE, pi, Xk,
+                                      grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				   iam, x[ii - XK_H], pi);
+#endif
+			}
+		    }
+		    /*
+		     * Perform local block modifications: 
+		     *         lsum[i] -= U_i,k * X[k]
+		     */
+		    if ( Urbs[lk] )
+			dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+				   send_req, stat);
+		} /* if becomes solvable */
+		
+		break;
+
+#if ( DEBUGlevel>=2 )
+	      default:
+		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+		break;
+#endif		
+
+	} /* switch */
+
+    } /* while not finished ... */
+
+#if ( PRNTlevel>=3 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
+#endif
+
+#if ( DEBUGlevel>=2 )
+    {
+	double *x_col;
+	int diag;
+	printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam);
+	ii = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    knsupc = SuperSize( k );
+	    krow = PROW( k, grid );
+	    kcol = PCOL( k, grid );
+	    diag = PNUM( krow, kcol, grid);
+	    if ( iam == diag ) { /* Diagonal process. */
+		lk = LBi( k, grid );
+		jj = X_BLK( lk );
+		x_col = &x[jj];
+		RHS_ITERATE(j) {
+		    for (i = 0; i < knsupc; ++i) { /* X stored in blocks */
+			printf("\t(%d)\t%4d\t%.10f\n",
+			       iam, xsup[k]+i, x_col[i]);
+		    }
+		    x_col += knsupc;
+		}
+	    }
+	    ii += knsupc;
+	} /* for k ... */
+    }
+#endif
+
+    pdReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum,
+			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
+
+
+    /* Deallocate storage. */
+    SUPERLU_FREE(lsum);
+    SUPERLU_FREE(x);
+    SUPERLU_FREE(recvbuf);
+    for (i = 0; i < nub; ++i) {
+	if ( Urbs[i] ) {
+	    SUPERLU_FREE(Ucb_indptr[i]);
+	    SUPERLU_FREE(Ucb_valptr[i]);
+	}
+    }
+    SUPERLU_FREE(Ucb_indptr);
+    SUPERLU_FREE(Ucb_valptr);
+    SUPERLU_FREE(Urbs);
+    SUPERLU_FREE(bmod);
+    SUPERLU_FREE(brecv); /* NOTE(review): Llu->brecv still points at this freed array. */
+
+    /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/
+
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status);
+    SUPERLU_FREE(send_req);
+
+    MPI_Barrier( grid->comm );
+
+    /* Measure from entry: t may have been reset by the PRNTlevel timing above. */
+    stat->utime[SOLVE] = SuperLU_timer_() - t_start;
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgstrs()");
+#endif
+
+    return;
+} /* PDGSTRS */
+
diff --git a/SRC/pdgstrs1.c b/SRC/pdgstrs1.c
new file mode 100644
index 0000000..86f0e04
--- /dev/null
+++ b/SRC/pdgstrs1.c
@@ -0,0 +1,910 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Solves a system of distributed linear equations
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     October 15, 2008  use fewer MPI_Reduce
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+#define ISEND_IRECV
+
+/*
+ * Function prototypes
+ */
+#ifdef _CRAY
+fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
+		   double*, int*, double*, int*);
+fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, 
+		   int*, double*, int*, double*, double*, int*);
+_fcd ftcs1;
+_fcd ftcs2;
+_fcd ftcs3;
+#endif
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PDGSTRS1 solves a system of distributed linear equations
+ *
+ *                   op( sub(A) ) * X = sub( B )
+ *
+ * with a general N-by-N distributed matrix sub( A ) using the LU
+ * factorization computed by PDGSTRF.
+ *
+ * This routine is used only in the iterative refinement routine
+ * pdgsrfs_ABXglobal, assuming that the right-hand side is already
+ * distributed in the diagonal processes.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures to store L and U factors,
+ *        and the permutation vectors.
+ *        See superlu_ddefs.h for the definition of 'LUstruct_t' structure.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * x      (input/output) double*
+ *        On entry, the right hand side matrix.
+ *        On exit, the solution matrix if info = 0;
+ *
+ *        NOTE: the right-hand side matrix is already distributed on
+ *              the diagonal processes.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves; 
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * </pre>      
+ */
+
+void pdgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
+	      double *x, int nrhs, SuperLUStat_t *stat, int *info)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    double alpha = 1.0;
+    double *lsum;  /* Local running sum of the updates to B-components */
+    double *lusup, *dest;
+    double *recvbuf, *tempv;
+    double *rtemp; /* Result of full matrix-vector multiply. */
+    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
+    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+    int    iam, kcol, krow, mycol, myrow;
+    int_t  i, ii, il, j, k, lb, ljb, lk, lptr, luptr;
+    int_t  nb, nlb, nub, nsupers;
+    int_t  *xsup, *lsub, *usub;
+    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
+    int_t  Pc, Pr;
+    int    knsupc, nsupr;
+    int    ldalsum;   /* Number of lsum entries locally owned. */
+    int    maxrecvsz, p, pi;
+    int_t  **Lrowind_bc_ptr;
+    double **Lnzval_bc_ptr;
+    MPI_Status status;
+#ifdef ISEND_IRECV
+    MPI_Request *send_req, recv_req;
+#endif
+
+    /*-- Counts used for L-solve --*/
+    int_t  *fmod;         /* Modification count for L-solve. */
+    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int_t  *frecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int_t  nleaf = 0, nroot = 0;
+
+    /*-- Counts used for U-solve --*/
+    int_t  *bmod;         /* Modification count for U-solve. */
+    int_t  **bsendx_plist = Llu->bsendx_plist;
+    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int_t  *brecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
+    double t, t_start;    /* t: per-phase timer; t_start: time at entry. */
+#if ( DEBUGlevel>=2 )
+    int_t Ublocks = 0;
+#endif
+
+    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+
+    t = t_start = SuperLU_timer_();
+
+    /* Test input parameters (info = -i flags the i-th argument). */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( nrhs < 0 ) *info = -5; /* nrhs is the 5th argument. */
+    if ( *info ) {
+	pxerr_dist("PDGSTRS1", grid, -*info);
+	return;
+    }
+
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
+    Llu->SolveMsgSent = 0;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pdgstrs1()");
+#endif
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PDGSTRS1. */
+    if ( !(fmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for fmod[].");
+    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
+    if ( !(frecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for frecv[].");
+    Llu->frecv = frecv;
+
+#ifdef ISEND_IRECV
+    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
+    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
+	ABORT("Malloc fails for send_req[].");
+#endif
+
+#ifdef _CRAY
+    ftcs1 = _cptofcd("L", strlen("L"));
+    ftcs2 = _cptofcd("N", strlen("N"));
+    ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+
+    /* Use the ilsum[] and ldalsum already stored in Llu. */
+    ilsum = Llu->ilsum;
+    ldalsum = Llu->ldalsum;
+
+    /* Allocate working storage. */
+    knsupc = sp_ienv_dist(3);
+    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs 
+        + nlb * LSUM_H)) )
+	ABORT("Calloc fails for lsum[].");
+    maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H);
+    if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for recvbuf[].");
+    if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for rtemp[].");
+
+    
+    /*---------------------------------------------------
+     * Forward solve Ly = b.
+     *---------------------------------------------------*/
+
+    /*
+     * Prepend the block number in the header for lsum[].
+     */
+    for (k = 0; k < nsupers; ++k) {
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    lk = LBi( k, grid );   /* Local block number. */
+	    il = LSUM_BLK( lk );
+	    lsum[il - LSUM_H] = k; 
+	}
+    }
+
+    /*
+     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+#if 1
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid );
+		if ( mycol != kcol && fmod[lk] )
+		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
+	    }
+	}
+	/*PrintInt10("mod_bit", nlb, mod_bit);*/
+	
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid );
+		if ( mycol == kcol ) { /* diagonal process */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+		}
+	    }
+	}
+
+#else /* old */
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && fmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
+		    assert( frecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+#endif
+    }
+
+    /* ---------------------------------------------------------
+       Solve the leaf nodes first by all the diagonal processes.
+       --------------------------------------------------------- */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nleaf %4d\n", iam, nleaf);
+#endif
+    for (k = 0; k < nsupers && nleaf; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    if ( !frecv[lk] && !fmod[lk] ) {
+		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		/*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
+		--nleaf;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p)
+		    if ( fsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                   MPI_DOUBLE, pi, Xk, grid->comm,
+                                   &send_req[Llu->SolveMsgSent++]);
+#else
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				 MPI_DOUBLE, 
+                                 pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		
+		/*
+		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		 */
+		nb = lsub[0] - 1;
+		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		luptr = knsupc; /* Skip diagonal block L(k,k). */
+		
+		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			   fmod, nb, lptr, luptr, xsup, grid, Llu,
+			   send_req, stat);
+	    }
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+    /*
+     * Compute the internal nodes asynchronously by all processes.
+     */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
+	   iam, nfrecvx, nfrecvmod, nleaf);
+#endif
+
+    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+#ifdef ISEND_IRECV
+	/* -MPI- FATAL: Remote protocol queue full */
+	MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
+		 MPI_ANY_TAG, grid->comm, &recv_req );
+	MPI_Wait( &recv_req, &status );
+#else
+	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
+		 MPI_ANY_TAG, grid->comm, &status );
+#endif
+
+	k = *recvbuf; /* Block number carried in the message header. */
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+	
+	switch ( status.MPI_TAG ) {
+	  case Xk:
+	      --nfrecvx;
+	      lk = LBj( k, grid ); /* Local block number, column-wise. */
+	      lsub = Lrowind_bc_ptr[lk];
+	      lusup = Lnzval_bc_ptr[lk];
+	      if ( lsub ) {
+		  nb   = lsub[0];
+		  lptr = BC_HEADER;
+		  luptr = 0;
+		  knsupc = SuperSize( k );
+
+		  /*
+		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		   */
+		  dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
+			     send_req, stat);
+	      } /* if lsub */
+
+	      break;
+
+	  case LSUM:
+	      --nfrecvmod;
+	      lk = LBi( k, grid ); /* Local block number, row-wise. */
+	      ii = X_BLK( lk );
+	      knsupc = SuperSize( k );
+	      tempv = &recvbuf[LSUM_H];
+	      RHS_ITERATE(j)
+		  for (i = 0; i < knsupc; ++i)
+		      x[i + ii + j*knsupc] += tempv[i + j*knsupc];
+
+	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
+		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
+		  lk = LBj( k, grid ); /* Local block number, column-wise. */
+		  lsub = Lrowind_bc_ptr[lk];
+		  lusup = Lnzval_bc_ptr[lk];
+		  nsupr = lsub[1];
+#ifdef _CRAY
+		  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+			lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		  /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
+#if ( DEBUGlevel>=2 )
+		  printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		  /*
+		   * Send Xk to process column Pc[k].
+		   */
+		  kcol = PCOL( k, grid );
+		  for (p = 0; p < Pr; ++p)
+		      if ( fsendx_plist[lk][p] != EMPTY ) {
+			  pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			  MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                     MPI_DOUBLE, pi, Xk, grid->comm,
+				     &send_req[Llu->SolveMsgSent++] );
+#else
+			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				 iam, x[ii-XK_H], pi);
+#endif
+		      }
+
+		  /*
+		   * Perform local block modifications.
+		   */
+		  nb = lsub[0] - 1;
+		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		  luptr = knsupc; /* Skip diagonal block L(k,k). */
+
+		  dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
+			     send_req, stat);
+	      } /* if */
+
+	      break;
+
+#if ( DEBUGlevel>=2 )
+	    default:
+	      printf("(%2d) Recv'd wrong message tag %4d\n", iam,  status.MPI_TAG);
+	      break;
+#endif
+	  } /* switch */
+
+    } /* while not finished ... */
+
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+#if ( DEBUGlevel>=2 )
+    if ( !iam ) printf("\n.. After L-solve: y =\n");
+    for (i = 0, k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    ii = X_BLK( lk );
+	    for (j = 0; j < knsupc; ++j)
+		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif
+
+    SUPERLU_FREE(fmod);
+    SUPERLU_FREE(frecv);
+    SUPERLU_FREE(rtemp);
+
+#ifdef ISEND_IRECV
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status); /* x[]/lsum[] are reused below */
+    Llu->SolveMsgSent = 0;
+#endif
+
+
+    /*---------------------------------------------------
+     * Back solve Ux = y.
+     *
+     * The Y components from the forward solve are already
+     * on the diagonal processes.
+     *---------------------------------------------------*/
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PDGSTRS1. */
+    if ( !(bmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for bmod[].");
+    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
+    if ( !(brecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for brecv[].");
+    Llu->brecv = brecv;
+
+    /*
+     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+#if 1
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && bmod[lk] )
+		    mod_bit[lk] = 1;  /* Contribution from off-diagonal */
+	    }
+	}
+
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+
+#else
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && bmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+#endif
+    }
+
+    /* Re-initialize lsum to zero. Each block header is already in place. */
+    for (k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    il = LSUM_BLK( lk );
+	    dest = &lsum[il];
+	    RHS_ITERATE(j)
+		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0;
+	}
+    }
+
+    /* Set up additional pointers for the index and value arrays of U.
+       nub is the number of local block columns. */
+    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
+    if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) )
+	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
+					     blocks in a block column. */
+    Urbs1 = Urbs + nub; /* Second half of Urbs[]: fill cursor per column. */
+    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
+        ABORT("Malloc fails for Ucb_indptr[]");
+    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
+        ABORT("Malloc fails for Ucb_valptr[]");
+
+    /* Count number of row blocks in a block column. 
+       One pass of the skeleton graph of U. */
+    for (lk = 0; lk < nlb; ++lk) {
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    /* usub[0] -- number of column blocks in this block row. */
+#if ( DEBUGlevel>=2 )
+	    Ublocks += usub[0];
+#endif
+	    i = BR_HEADER; /* Pointer in index array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];            /* Global block number */
+		++Urbs[LBj(k,grid)];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+    /* Set up the vertical linked lists for the row blocks.
+       One pass of the skeleton graph of U. */
+    for (lb = 0; lb < nub; ++lb)
+	if ( Urbs[lb] ) { /* Not an empty block column. */
+	    if ( !(Ucb_indptr[lb]
+		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+		ABORT("Malloc fails for Ucb_indptr[lb][]");
+	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
+		ABORT("Malloc fails for Ucb_valptr[lb][]");
+	}
+    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    i = BR_HEADER; /* Pointer in index array. */
+	    j = 0;         /* Pointer in nzval array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];          /* Global block number, column-wise. */
+		ljb = LBj( k, grid ); /* Local block number, column-wise. */
+		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
+		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
+		Ucb_valptr[ljb][Urbs1[ljb]] = j;
+		++Urbs1[ljb];
+		j += usub[i+1];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+#if ( DEBUGlevel>=2 )
+    for (p = 0; p < Pr*Pc; ++p) {
+	if (iam == p) {
+	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("(%2d) Local col %2d: # row blocks %2d\n",
+		       iam, lb, Urbs[lb]);
+		if ( Urbs[lb] ) {
+		    for (i = 0; i < Urbs[lb]; ++i)
+			printf("(%2d) .. row blk %2d:\
+                               lbnum %d, indpos %d, valpos %d\n",
+			       iam, i, 
+			       Ucb_indptr[lb][i].lbnum,
+			       Ucb_indptr[lb][i].indpos,
+			       Ucb_valptr[lb][i]);
+		}
+	    }
+	}
+	MPI_Barrier( grid->comm );
+    }
+    for (p = 0; p < Pr*Pc; ++p) {
+	if ( iam == p ) {
+	    printf("\n(%d) bsendx_plist[][]", iam);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("\n(%d) .. local col %2d: ", iam, lb);
+		for (i = 0; i < Pr; ++i)
+		    printf("%4d", bsendx_plist[lb][i]);
+	    }
+	    printf("\n");
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif /* DEBUGlevel */
+
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+    /*
+     * Solve the roots first by all the diagonal processes.
+     */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nroot %4d\n", iam, nroot);
+#endif
+    for (k = nsupers-1; k >= 0 && nroot; --k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid ); /* Local block number, row-wise. */
+	    if ( !brecv[lk] && !bmod[lk] ) {
+		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		/*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/
+		--nroot;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p)
+		    if ( bsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                   MPI_DOUBLE, pi, Xk, grid->comm,
+				   &send_req[Llu->SolveMsgSent++] );
+#else
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				 MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		
+		/*
+		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
+		 */
+		if ( Urbs[lk] ) 
+		    dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+			       send_req, stat);
+	    } /* if root ... */
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+
+    /*
+     * Compute the internal nodes asynchronously by all processes.
+     */
+    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, 
+                 MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
+	k = *recvbuf; /* Block number carried in the message header. */
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+
+	switch ( status.MPI_TAG ) {
+	    case Xk:
+	        --nbrecvx;
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		/*
+		 * Perform local block modifications:
+		 *         lsum[i] -= U_i,k * X[k]
+		 */
+		dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
+			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+			   send_req, stat);
+
+	        break;
+
+	    case LSUM:
+		--nbrecvmod;
+		lk = LBi( k, grid ); /* Local block number, row-wise. */
+		ii = X_BLK( lk );
+		knsupc = SuperSize( k );
+		tempv = &recvbuf[LSUM_H];
+		RHS_ITERATE(j)
+		    for (i = 0; i < knsupc; ++i)
+			x[i + ii + j*knsupc] += tempv[i + j*knsupc];
+
+		if ( !(--brecv[lk]) && !bmod[lk] ) {
+		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
+		    lk = LBj( k, grid ); /* Local block number, column-wise. */
+		    lsub = Lrowind_bc_ptr[lk];
+		    lusup = Lnzval_bc_ptr[lk];
+		    nsupr = lsub[1];
+#ifdef _CRAY
+		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+			  lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		    /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    kcol = PCOL( k, grid );
+		    for (p = 0; p < Pr; ++p)
+			if ( bsendx_plist[lk][p] != EMPTY ) {
+			    pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                       MPI_DOUBLE, pi, Xk, grid->comm,
+				       &send_req[Llu->SolveMsgSent++] );
+#else
+			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				     MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				   iam, x[ii - XK_H], pi);
+#endif
+			}
+		
+		    /*
+		     * Perform local block modifications: 
+		     *         lsum[i] -= U_i,k * X[k]
+		     */
+		    if ( Urbs[lk] )
+			dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+				   send_req, stat);
+		} /* if becomes solvable */
+		
+		break;
+
+#if ( DEBUGlevel>=2 )
+	      default:
+		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+		break;
+#endif		
+
+	} /* switch */
+
+    } /* while not finished ... */
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
+#endif
+
+    stat->utime[SOLVE] = SuperLU_timer_() - t_start; /* whole-routine time */
+
+    /* Deallocate storage; finish pending sends first (lsum[] may still be a send buffer). */
+#ifdef ISEND_IRECV
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status);
+    SUPERLU_FREE(send_req);
+#endif
+
+    SUPERLU_FREE(lsum);
+    SUPERLU_FREE(recvbuf);
+    for (i = 0; i < nub; ++i)
+	if ( Urbs[i] ) {
+	    SUPERLU_FREE(Ucb_indptr[i]);
+	    SUPERLU_FREE(Ucb_valptr[i]);
+	}
+    SUPERLU_FREE(Ucb_indptr);
+    SUPERLU_FREE(Ucb_valptr);
+    SUPERLU_FREE(Urbs);
+    SUPERLU_FREE(bmod);
+    SUPERLU_FREE(brecv);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgstrs1()");
+#endif
+
+} /* PDGSTRS1 */
diff --git a/SRC/pdgstrsL.c b/SRC/pdgstrsL.c
new file mode 100644
index 0000000..d740e23
--- /dev/null
+++ b/SRC/pdgstrsL.c
@@ -0,0 +1,848 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Solves a lower triangular system L*X = B,  with L being the
+ * lower triangular factor computed previously by PDGSTRF.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+#define ISEND_IRECV
+
+/*
+ * Function prototypes
+ */
+#ifdef _CRAY
+fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
+		   double*, int*, double*, int*);
+_fcd ftcs1;
+_fcd ftcs2;
+_fcd ftcs3;
+#endif
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Re-distribute B on the diagonal processes of the 2D process mesh.
+ * 
+ * Note
+ * ====
+ *   This routine can only be called after the routine pxgstrs_init(),
+ *   in which the structures of the send and receive buffers are set up.
+ *
+ * Arguments
+ * =========
+ * 
+ * B      (input) double*
+ *        The distributed right-hand side matrix of the possibly
+ *        equilibrated system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * ldb    (input) int (local)
+ *        Leading dimension of matrix B.
+ *
+ * fst_row (input) int (global)
+ *        The row number of B's first row in the global matrix.
+ *
+ * ilsum  (input) int* (global)
+ *        Starting position of each supernode in a full array.
+ *
+ * x      (output) double*
+ *        The solution vector. It is valid only on the diagonal processes.
+ *
+ * ScalePermstruct (input) ScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * SOLVEstruct (input) SOLVEstruct_t*
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * Return value
+ * ============
+ * </pre>
+ */
+
+int_t
+pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb,
+                      int_t fst_row, int_t *ilsum, double *x,
+		      ScalePermstruct_t *ScalePermstruct,
+		      Glu_persist_t *Glu_persist,
+		      gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct)
+{
+    int  *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; /* MPI_Alltoallv counts */
+    int  *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; /* MPI_Alltoallv displacements */
+    int  *ptr_to_ibuf, *ptr_to_dbuf; /* per-process write cursors into the send buffers */
+    int_t  *perm_r, *perm_c; /* row and column permutation vectors */
+    int_t  *send_ibuf, *recv_ibuf; /* row-index buffers (one allocation, see below) */
+    double *send_dbuf, *recv_dbuf; /* numerical-value buffers (one allocation, see below) */
+    int_t  *xsup, *supno; /* NOTE(review): presumably read by the BlockNum/SuperSize/FstBlockC macros - confirm */
+    int_t  i, ii, irow, gbi, j, jj, k, knsupc, l, lk;
+    int    p, procs;
+    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pdReDistribute_B_to_X()");
+#endif
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    perm_r = ScalePermstruct->perm_r;
+    perm_c = ScalePermstruct->perm_c;
+    procs = grid->nprow * grid->npcol;
+    xsup = Glu_persist->xsup;
+    supno = Glu_persist->supno;
+    SendCnt      = gstrs_comm->B_to_X_SendCnt; /* eight slices, each offset by procs, of one array */
+    SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt +   procs;
+    RecvCnt      = gstrs_comm->B_to_X_SendCnt + 2*procs;
+    RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs;
+    sdispls      = gstrs_comm->B_to_X_SendCnt + 4*procs;
+    sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs;
+    rdispls      = gstrs_comm->B_to_X_SendCnt + 6*procs;
+    rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs;
+    ptr_to_ibuf  = gstrs_comm->ptr_to_ibuf;
+    ptr_to_dbuf  = gstrs_comm->ptr_to_dbuf;
+
+    /* ------------------------------------------------------------
+       NOW COMMUNICATE THE ACTUAL DATA.
+       ------------------------------------------------------------*/
+    k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
+    l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
+    if ( !(send_ibuf = intMalloc_dist(k + l)) ) /* k send slots followed by l receive slots */
+        ABORT("Malloc fails for send_ibuf[].");
+    recv_ibuf = send_ibuf + k; /* receive area starts right after the send area */
+    if ( !(send_dbuf = doubleMalloc_dist((k + l)* (size_t)nrhs)) ) /* nrhs values per row */
+        ABORT("Malloc fails for send_dbuf[].");
+    recv_dbuf = send_dbuf + k * nrhs; /* receive area starts right after the send area */
+    
+    for (p = 0; p < procs; ++p) { /* start each cursor at its process's send segment */
+        ptr_to_ibuf[p] = sdispls[p];
+        ptr_to_dbuf[p] = sdispls[p] * nrhs;
+    }
+
+    /* Copy the row indices and values to the send buffer. */
+    for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { /* i: local row, l: its global row */
+        irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */
+	gbi = BlockNum( irow );
+	p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */
+	k = ptr_to_ibuf[p];
+	send_ibuf[k] = irow;
+	k = ptr_to_dbuf[p];
+	RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
+	    send_dbuf[k++] = B[i + j*ldb];
+	}
+	++ptr_to_ibuf[p]; /* one index consumed for process p */
+	ptr_to_dbuf[p] += nrhs; /* nrhs values consumed for process p */
+    }
+
+    /* Communicate the (permuted) row indices. */
+    MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
+		  recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
+
+    /* Communicate the numerical values. */
+    MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE,
+		  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE,
+		  grid->comm);
+    
+    /* ------------------------------------------------------------
+       Copy buffer into X on the diagonal processes.
+       ------------------------------------------------------------*/
+    ii = 0; /* running position in recv_ibuf across all source processes */
+    for (p = 0; p < procs; ++p) {
+        jj = rdispls_nrhs[p]; /* first value received from process p */
+        for (i = 0; i < RecvCnt[p]; ++i) {
+	    /* Only the diagonal processes do this; the off-diagonal processes
+	       have 0 RecvCnt. */
+	    irow = recv_ibuf[ii]; /* The permuted row index. */
+	    k = BlockNum( irow );
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );  /* Local block number. */
+	    l = X_BLK( lk );
+	    x[l - XK_H] = k;      /* Block number prepended in the header. */
+	    irow = irow - FstBlockC(k); /* Relative row number in X-block */
+	    RHS_ITERATE(j) { /* buffer is row major; each X-block column is knsupc apart */
+	        x[l + irow + j*knsupc] = recv_dbuf[jj++];
+	    }
+	    ++ii;
+	}
+    }
+
+    SUPERLU_FREE(send_ibuf); /* also releases recv_ibuf (same allocation) */
+    SUPERLU_FREE(send_dbuf); /* also releases recv_dbuf (same allocation) */
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Exit pdReDistribute_B_to_X()");
+#endif
+    return 0; /* the only return path; allocation failures go through ABORT */
+} /* pdReDistribute_B_to_X */
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Re-distribute X on the diagonal processes to B distributed on all
+ *   the processes.
+ *
+ * Note
+ * ====
+ *   This routine can only be called after the routine pxgstrs_init(),
+ *   in which the structures of the send and receive buffers are set up.
+ * </pre>
+ */
+
+int_t
+pdReDistribute_X_to_B(int_t n, double *B, int_t m_loc, int_t ldb, int_t fst_row,
+		      int_t nrhs, double *x, int_t *ilsum,
+		      ScalePermstruct_t *ScalePermstruct,
+		      Glu_persist_t *Glu_persist, gridinfo_t *grid,
+		      SOLVEstruct_t *SOLVEstruct)
+{
+    int_t  i, ii, irow, j, jj, k, knsupc, nsupers, l, lk;
+    int_t  *xsup, *supno; /* NOTE(review): xsup is presumably read by SuperSize/FstBlockC - confirm */
+    int  *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; /* MPI_Alltoallv counts */
+    int  *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs; /* MPI_Alltoallv displacements */
+    int  *ptr_to_ibuf, *ptr_to_dbuf; /* per-process write cursors into the send buffers */
+    int_t  *send_ibuf, *recv_ibuf; /* row-index buffers (one allocation, see below) */
+    double *send_dbuf, *recv_dbuf; /* numerical-value buffers (one allocation, see below) */
+    int_t  *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */
+    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
+    int  iam, p, q, pkk, procs;
+    int_t  num_diag_procs, *diag_procs;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pdReDistribute_X_to_B()");
+#endif
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    xsup = Glu_persist->xsup;
+    supno = Glu_persist->supno;
+    nsupers = Glu_persist->supno[n-1] + 1; /* supernode number of the last column, plus one */
+    iam = grid->iam;
+    procs = grid->nprow * grid->npcol;
+ 
+    SendCnt      = gstrs_comm->X_to_B_SendCnt; /* eight slices, each offset by procs, of one array */
+    SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt +   procs;
+    RecvCnt      = gstrs_comm->X_to_B_SendCnt + 2*procs;
+    RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs;
+    sdispls      = gstrs_comm->X_to_B_SendCnt + 4*procs;
+    sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs;
+    rdispls      = gstrs_comm->X_to_B_SendCnt + 6*procs;
+    rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs;
+    ptr_to_ibuf  = gstrs_comm->ptr_to_ibuf;
+    ptr_to_dbuf  = gstrs_comm->ptr_to_dbuf;
+
+    k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
+    l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
+    if ( !(send_ibuf = intMalloc_dist(k + l)) ) /* k send slots followed by l receive slots */
+        ABORT("Malloc fails for send_ibuf[].");
+    recv_ibuf = send_ibuf + k; /* receive area starts right after the send area */
+    if ( !(send_dbuf = doubleMalloc_dist((k + l)*nrhs)) ) /* nrhs values per row */
+        ABORT("Malloc fails for send_dbuf[].");
+    recv_dbuf = send_dbuf + k * nrhs; /* receive area starts right after the send area */
+    for (p = 0; p < procs; ++p) { /* start each cursor at its process's send segment */
+        ptr_to_ibuf[p] = sdispls[p];
+        ptr_to_dbuf[p] = sdispls_nrhs[p];
+    }
+    num_diag_procs = SOLVEstruct->num_diag_procs;
+    diag_procs = SOLVEstruct->diag_procs;
+
+    for (p = 0; p < num_diag_procs; ++p) {  /* For all diagonal processes. */
+	pkk = diag_procs[p];
+	if ( iam == pkk ) { /* only the p-th diagonal process packs data here */
+	    for (k = p; k < nsupers; k += num_diag_procs) { /* blocks p, p+num_diag_procs, ... */
+		knsupc = SuperSize( k );
+		lk = LBi( k, grid ); /* Local block number */
+		irow = FstBlockC( k );
+		l = X_BLK( lk );
+		for (i = 0; i < knsupc; ++i) {
+#if 0
+		    ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */
+#else
+		    ii = irow; /* active branch: the row index is sent unpermuted */
+#endif
+		    q = row_to_proc[ii]; /* destination process for row ii */
+		    jj = ptr_to_ibuf[q];
+		    send_ibuf[jj] = ii;
+		    jj = ptr_to_dbuf[q];
+		    RHS_ITERATE(j) { /* RHS stored in row major in buffer. */
+		        send_dbuf[jj++] = x[l + i + j*knsupc];
+		    }
+		    ++ptr_to_ibuf[q]; /* one index consumed for process q */
+		    ptr_to_dbuf[q] += nrhs; /* nrhs values consumed for process q */
+		    ++irow;
+		}
+	    }
+	}
+    }
+    
+    /* ------------------------------------------------------------
+        COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES.
+       ------------------------------------------------------------*/
+    MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
+		  recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
+    MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, 
+		  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE,
+		  grid->comm);
+
+    /* ------------------------------------------------------------
+       COPY THE BUFFER INTO B.
+       ------------------------------------------------------------*/
+    for (i = 0, k = 0; i < m_loc; ++i) { /* reads m_loc received rows; NOTE(review): presumably equals the receive total - confirm */
+	irow = recv_ibuf[i];
+	irow -= fst_row; /* Relative row number */
+	RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
+	    B[irow + j*ldb] = recv_dbuf[k++];
+	}
+    }
+
+    SUPERLU_FREE(send_ibuf); /* also releases recv_ibuf (same allocation) */
+    SUPERLU_FREE(send_dbuf); /* also releases recv_dbuf (same allocation) */
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Exit pdReDistribute_X_to_B()");
+#endif
+    return 0; /* the only return path; allocation failures go through ABORT */
+
+} /* pdReDistribute_X_to_B */
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PDGSTRSL solves a lower triangular system L*X = B,  with L being the
+ * lower triangular factor computed previously by PDGSTRF.
+ * If the equilibration, and row and column permutations were performed,
+ * the LU factorization was performed for A1 where
+ *     A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ * and the linear system solved is
+ *     A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and
+ * the permutation to B1 by Pc*Pr is applied internally in this routine.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from PDGSTRF for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
+ *        A may be scaled and permuted into A1, so that
+ *        A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) double*
+ *        On entry, the distributed right-hand side matrix of the possibly
+ *        equilibrated system. That is, B may be overwritten by diag(R)*B.
+ *        On exit, the distributed solution matrix Y of the possibly
+ *        equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X,
+ *        and X is the solution of the original system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of matrix B.
+ *
+ * fst_row (input) int (global)
+ *        The row number of B's first row in the global matrix.
+ *
+ * ldb    (input) int (local)
+ *        The leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ * 
+ * SOLVEstruct (output) SOLVEstruct_t* (global)
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * </pre>       
+ */
+
+void
+pdgstrsL(int_t n, LUstruct_t *LUstruct, 
+	 ScalePermstruct_t *ScalePermstruct,
+	 gridinfo_t *grid, double *B,
+	 int_t m_loc, int_t fst_row, int_t ldb, int nrhs,
+	 SOLVEstruct_t *SOLVEstruct,
+	 SuperLUStat_t *stat, int *info)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    double alpha = 1.0;
+    double zero = 0.0;
+    double *lsum;  /* Local running sum of the updates to B-components */
+    double *x;     /* X component at step k. */
+		    /* NOTE: x and lsum are of same size. */
+    double *lusup, *dest;
+    double *recvbuf, *tempv;
+    double *rtemp; /* Result of full matrix-vector multiply. */
+    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    int_t  iam, kcol, krow, mycol, myrow;
+    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
+    int_t  nb, nlb, nub, nsupers;
+    int_t  *xsup, *supno, *lsub, *usub;
+    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
+    int_t  Pc, Pr;
+    int    knsupc, nsupr;
+    int    ldalsum;   /* Number of lsum entries locally owned. */
+    int    maxrecvsz, p, pi;
+    int_t  **Lrowind_bc_ptr;
+    double **Lnzval_bc_ptr;
+    MPI_Status status;
+#ifdef ISEND_IRECV
+    MPI_Request *send_req, recv_req;
+#endif
+    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
+
+    /*-- Counts used for L-solve --*/
+    int_t  *fmod;         /* Modification count for L-solve --
+                             Count the number of local block products to
+                             be summed into lsum[lk]. */
+    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int_t  *frecv;        /* Count of lsum[lk] contributions to be received
+                             from processes in this row. 
+                             It is only valid on the diagonal processes. */
+    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int_t  nleaf = 0, nroot = 0;
+
+    double t;
+#if ( DEBUGlevel>=2 )
+    int_t Ublocks = 0;
+#endif
+
+    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+ 
+    t = SuperLU_timer_(); /* start time; kept untouched until utime[SOLVE] is set */
+
+    /* Test input parameters. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( nrhs < 0 ) *info = -9;
+    if ( *info ) {
+	pxerbla("PDGSTRS", grid, -*info);
+	return;
+    }
+	
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    xsup = Glu_persist->xsup;
+    supno = Glu_persist->supno;
+    nsupers = supno[n-1] + 1;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pdgstrsL()");
+#endif
+
+    stat->ops[SOLVE] = 0.0;
+    Llu->SolveMsgSent = 0; /* index of the next free slot in send_req[] */
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PDGSTRS. */
+    if ( !(fmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for fmod[].");
+    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
+    if ( !(frecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for frecv[].");
+    Llu->frecv = frecv; /* NOTE(review): left pointing at frecv after it is freed below - confirm no later reader */
+
+#ifdef ISEND_IRECV
+    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; /* capacity of send_req[] */
+    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
+	ABORT("Malloc fails for send_req[].");
+#endif
+
+#ifdef _CRAY
+    ftcs1 = _cptofcd("L", strlen("L"));
+    ftcs2 = _cptofcd("N", strlen("N"));
+    ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+
+    /* Obtain ilsum[] and ldalsum for process column 0. */
+    ilsum = Llu->ilsum;
+    ldalsum = Llu->ldalsum;
+
+    /* Allocate working storage. */
+    knsupc = sp_ienv_dist(3); /* NOTE(review): presumably the maximum supernode size - confirm */
+    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
+    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) )
+	ABORT("Calloc fails for lsum[].");
+    if ( !(x = doubleMalloc_dist(((size_t)ldalsum)*nrhs + nlb*XK_H)) ) /* size_t, as for lsum: int product could overflow */
+	ABORT("Malloc fails for x[].");
+    if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for recvbuf[].");
+    if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for rtemp[].");
+
+    
+    /*---------------------------------------------------
+     * Forward solve Ly = b.
+     *---------------------------------------------------*/
+    /* Redistribute B into X on the diagonal processes. */
+    pdReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, 
+			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
+
+    /* Set up the headers in lsum[]. */
+    ii = 0;
+    for (k = 0; k < nsupers; ++k) {
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    lk = LBi( k, grid );   /* Local block number. */
+	    il = LSUM_BLK( lk );
+	    lsum[il - LSUM_H] = k; /* Block number prepended in the header. */
+	}
+	ii += knsupc;
+    }
+
+    /*
+     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid );
+		if ( mycol != kcol && fmod[lk] )
+		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
+	    }
+	}
+	/*PrintInt10("mod_bit", nlb, mod_bit);*/
+	
+#if ( PROFlevel>=2 )
+	t_reduce_tmp = SuperLU_timer_();
+#endif
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+#if ( PROFlevel>=2 )
+	t_reduce += SuperLU_timer_() - t_reduce_tmp;
+#endif
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid );
+		if ( mycol == kcol ) { /* diagonal process */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+		}
+	    }
+	}
+
+    }
+
+    /* ---------------------------------------------------------
+       Solve the leaf nodes first by all the diagonal processes.
+       --------------------------------------------------------- */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nleaf %4d\n", iam, nleaf);
+#endif
+    for (k = 0; k < nsupers && nleaf; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
+		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
+		--nleaf;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p) {
+		    if ( fsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   MPI_DOUBLE, pi, Xk, grid->comm,
+                                   &send_req[Llu->SolveMsgSent++]);
+#else
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				 MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		}
+		/*
+		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		 */
+		nb = lsub[0] - 1;
+		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		luptr = knsupc; /* Skip diagonal block L(k,k). */
+		
+		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			   send_req, stat);
+	    }
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+    /* -----------------------------------------------------------
+       Compute the internal nodes asynchronously by all processes.
+       ----------------------------------------------------------- */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
+	   iam, nfrecvx, nfrecvmod, nleaf);
+#endif
+
+    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+#ifdef ISEND_IRECV
+	/* -MPI- FATAL: Remote protocol queue full */
+	MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE,
+                 MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req );
+	MPI_Wait( &recv_req, &status );
+#else
+	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE,
+                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
+#endif
+
+	k = *recvbuf; /* block number carried in the first header word of the message */
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+	
+	switch ( status.MPI_TAG ) {
+	  case Xk:
+	      --nfrecvx;
+	      lk = LBj( k, grid ); /* Local block number, column-wise. */
+	      lsub = Lrowind_bc_ptr[lk];
+	      lusup = Lnzval_bc_ptr[lk];
+	      if ( lsub ) {
+		  nb   = lsub[0];
+		  lptr = BC_HEADER;
+		  luptr = 0;
+		  knsupc = SuperSize( k );
+
+		  /*
+		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		   */
+		  dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			     send_req, stat);
+	      } /* if lsub */
+
+	      break;
+
+	  case LSUM: /* Receiver must be a diagonal process */
+	      --nfrecvmod;
+	      lk = LBi( k, grid ); /* Local block number, row-wise. */
+	      ii = X_BLK( lk );
+	      knsupc = SuperSize( k );
+	      tempv = &recvbuf[LSUM_H];
+	      RHS_ITERATE(j) {
+		  for (i = 0; i < knsupc; ++i)
+		      x[i + ii + j*knsupc] += tempv[i + j*knsupc];
+	      }
+
+	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
+		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
+		  lk = LBj( k, grid ); /* Local block number, column-wise. */
+		  lsub = Lrowind_bc_ptr[lk];
+		  lusup = Lnzval_bc_ptr[lk];
+		  nsupr = lsub[1];
+#ifdef _CRAY
+		  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+			lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		  stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
+#if ( DEBUGlevel>=2 )
+		  printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		  /*
+		   * Send Xk to process column Pc[k].
+		   */
+		  kcol = PCOL( k, grid );
+		  for (p = 0; p < Pr; ++p) {
+		      if ( fsendx_plist[lk][p] != EMPTY ) {
+			  pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			  MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H,
+                                     MPI_DOUBLE, pi, Xk, grid->comm,
+                                     &send_req[Llu->SolveMsgSent++]);
+#else
+			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				    MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				 iam, x[ii-XK_H], pi);
+#endif
+		      }
+                  }
+		  /*
+		   * Perform local block modifications.
+		   */
+		  nb = lsub[0] - 1;
+		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		  luptr = knsupc; /* Skip diagonal block L(k,k). */
+
+		  dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
+			     send_req, stat);
+	      } /* if */
+
+	      break;
+
+#if ( DEBUGlevel>=2 )
+	    default:
+	      printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+	      break;
+#endif
+	  } /* switch */
+
+    } /* while not finished ... */
+
+
+#if ( PRNTlevel>=2 )
+    /* Print the elapsed time without overwriting the start time t, so that
+       stat->utime[SOLVE] below is the same at every PRNTlevel. */
+    if ( !iam ) printf(".. L-solve time\t%8.2f\n", SuperLU_timer_() - t);
+#endif
+
+#if ( DEBUGlevel==2 )
+    {
+      printf("(%d) .. After L-solve: y =\n", iam);
+      for (i = 0, k = 0; k < nsupers; ++k) {
+	  krow = PROW( k, grid );
+	  kcol = PCOL( k, grid );
+	  if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	      knsupc = SuperSize( k );
+	      lk = LBi( k, grid );
+	      ii = X_BLK( lk );
+	      for (j = 0; j < knsupc; ++j)
+		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
+	      fflush(stdout);
+	  }
+	  MPI_Barrier( grid->comm );
+      }
+    }
+#endif
+
+    SUPERLU_FREE(fmod);
+    SUPERLU_FREE(frecv);
+    SUPERLU_FREE(rtemp);
+
+#ifdef ISEND_IRECV
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
+    Llu->SolveMsgSent = 0;
+#endif
+
+    /* Re-distribute X on the diagonal processes to B distributed on all
+       the processes.   */
+    pdReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum,
+			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
+
+
+    /* Deallocate storage. */
+    SUPERLU_FREE(lsum);
+    SUPERLU_FREE(x);
+    SUPERLU_FREE(recvbuf);
+#ifdef ISEND_IRECV
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); /* no-op: count was reset to 0 above */
+    SUPERLU_FREE(send_req);
+#endif
+
+    stat->utime[SOLVE] = SuperLU_timer_() - t; /* time since entry to this routine */
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgstrsL()");
+#endif
+
+} /* pdgstrsL */
+
diff --git a/SRC/pdgstrs_Bglobal.c b/SRC/pdgstrs_Bglobal.c
new file mode 100644
index 0000000..b921a0a
--- /dev/null
+++ b/SRC/pdgstrs_Bglobal.c
@@ -0,0 +1,1040 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Solves a system of distributed linear equations A*X = B with a general N-by-N matrix A using the LU factorization
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     October 15, 2008  use fewer MPI_Reduce
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+#define ISEND_IRECV
+
+/*
+ * Function prototypes
+ */
+#ifdef _CRAY
+fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
+		   double*, int*, double*, int*);
+fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, 
+		   int*, double*, int*, double*, double*, int*);
+_fcd ftcs1;
+_fcd ftcs2;
+_fcd ftcs3;
+#endif
+static void gather_diag_to_all(int_t, int_t, double [], Glu_persist_t *,
+                               LocalLU_t *, gridinfo_t *, int_t, int_t [],
+                               int_t [], double [], int_t, double []);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * pdgstrs_Bglobal solves a system of distributed linear equations
+ * A*X = B with a general N-by-N matrix A using the LU factorization
+ * computed by pdgstrf.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from pdgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) double*
+ *        On entry, the right-hand side matrix of the possibly equilibrated
+ *        and row permuted system.
+ *        On exit, the solution matrix of the possibly equilibrated
+ *        and row permuted system if info = 0;
+ *
+ *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all 
+ *              processes when calling this routine.
+ *
+ * ldb    (input) int (global)
+ *        Leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * </pre>    
+ */
+
+void
+pdgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, 
+                double *B, int_t ldb, int nrhs, 
+                SuperLUStat_t *stat, int *info)
+{   /* Forward solve L*y = b, back solve U*x = y, then copy X into B on all processes. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    double alpha = 1.0;
+    double *lsum;  /* Local running sum of the updates to B-components */
+    double *x;     /* X component at step k. */
+    double *lusup, *dest;
+    double *recvbuf, *tempv;
+    double *rtemp; /* Result of full matrix-vector multiply. */
+    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
+    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+    int_t  kcol, krow, mycol, myrow;
+    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
+    int_t  nb, nlb, nub, nsupers;
+    int_t  *xsup, *lsub, *usub;
+    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
+    int    Pc, Pr, iam;
+    int    knsupc, nsupr;
+    int    ldalsum;   /* Number of lsum entries locally owned. */
+    int    maxrecvsz, p, pi;
+    int_t  **Lrowind_bc_ptr;
+    double **Lnzval_bc_ptr;
+    MPI_Status status;
+#if defined (ISEND_IRECV) || defined (BSEND)
+    MPI_Request *send_req, recv_req;
+#endif
+
+    /*-- Counts used for L-solve --*/
+    int_t  *fmod;         /* Modification count for L-solve. */
+    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int_t  *frecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int_t  nleaf = 0, nroot = 0;
+
+    /*-- Counts used for U-solve --*/
+    int_t  *bmod;         /* Modification count for U-solve. */
+    int_t  **bsendx_plist = Llu->bsendx_plist;
+    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int_t  *brecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
+    double t;
+#if ( DEBUGlevel>=2 )
+    int_t Ublocks = 0;
+#endif
+
+    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+ 
+    t = SuperLU_timer_();
+
+    /* Test input parameters; -info is the position of the bad argument. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( nrhs < 0 ) *info = -6; /* nrhs is the 6th argument. */
+    if ( *info ) {
+	pxerr_dist("PDGSTRS_BGLOBAL", grid, -*info);
+	return;
+    }
+	
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    nsupers = Glu_persist->supno[n-1] + 1; /* NOTE(review): n == 0 would read supno[-1]. */
+    xsup = Glu_persist->xsup;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
+    stat->ops[SOLVE] = 0.0;
+    Llu->SolveMsgSent = 0;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pdgstrs_Bglobal()");
+#endif
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PDGSTRS_BGLOBAL. */
+    if ( !(fmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for fmod[].");
+    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
+    if ( !(frecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for frecv[].");
+    Llu->frecv = frecv;
+
+#if defined (ISEND_IRECV) || defined (BSEND)
+    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
+    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
+	ABORT("Malloc fails for send_req[].");
+#endif
+
+#ifdef _CRAY
+    ftcs1 = _cptofcd("L", strlen("L"));
+    ftcs2 = _cptofcd("N", strlen("N"));
+    ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+
+    /* Obtain ilsum[] and ldalsum for process column 0. */
+    ilsum = Llu->ilsum;
+    ldalsum = Llu->ldalsum;
+
+    /* Allocate working storage. */
+    knsupc = sp_ienv_dist(3);
+    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
+    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs 
+        + nlb * LSUM_H)) )
+	ABORT("Calloc fails for lsum[].");
+    if ( !(x = doubleMalloc_dist(((size_t)ldalsum) * nrhs 
+        + nlb * XK_H)) )
+	ABORT("Malloc fails for x[].");
+    if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for recvbuf[].");
+    if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for rtemp[].");
+
+    
+    /*---------------------------------------------------
+     * Forward solve Ly = b.
+     *---------------------------------------------------*/
+
+    /*
+     * Copy B into X on the diagonal processes.
+     */
+    ii = 0;
+    for (k = 0; k < nsupers; ++k) {
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    lk = LBi( k, grid );   /* Local block number. */
+	    il = LSUM_BLK( lk );
+	    lsum[il - LSUM_H] = k; /* Block number prepended in the header. */
+	    kcol = PCOL( k, grid );
+	    if ( mycol == kcol ) { /* Diagonal process. */
+		jj = X_BLK( lk );
+		x[jj - XK_H] = k;  /* Block number prepended in the header. */
+		RHS_ITERATE(j)
+		    for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */
+			x[i + jj + j*knsupc] = B[i + ii + j*ldb];
+	    }
+	}
+	ii += knsupc;
+    }
+
+    /*
+     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+#if 1
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid );
+		if ( mycol != kcol && fmod[lk] )
+		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
+	    }
+	}
+	
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+		}
+	    }
+	}
+
+#else /* old */
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && fmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
+		    assert( frecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+#endif
+    }
+
+    /* ---------------------------------------------------------
+       Solve the leaf nodes first by all the diagonal processes.
+       --------------------------------------------------------- */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nleaf %4d\n", iam, nleaf);
+#endif
+    for (k = 0; k < nsupers && nleaf; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
+		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
+		--nleaf;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p) {
+		    if ( fsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   MPI_DOUBLE, pi, Xk, grid->comm,
+                                   &send_req[Llu->SolveMsgSent++]);
+#else
+#ifdef BSEND
+			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                   MPI_DOUBLE, pi, Xk, grid->comm );
+#else
+
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				  MPI_DOUBLE, 
+                                  pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		}
+		/*
+		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		 */
+		nb = lsub[0] - 1;
+		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		luptr = knsupc; /* Skip diagonal block L(k,k). */
+		
+		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			   send_req,stat);
+	    }
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+    /* -----------------------------------------------------------
+       Compute the internal nodes asynchronously by all processes.
+       ----------------------------------------------------------- */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
+	   iam, nfrecvx, nfrecvmod, nleaf);
+#endif
+
+    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+#ifdef ISEND_IRECV
+	/* -MPI- FATAL: Remote protocol queue full */
+	MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
+		 MPI_ANY_TAG, grid->comm, &recv_req );
+	MPI_Wait( &recv_req, &status );
+#else
+	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
+		 MPI_ANY_TAG, grid->comm, &status );
+#endif
+
+	k = *recvbuf; /* The message header carries the global block number. */
+
+
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+	
+	switch ( status.MPI_TAG ) {
+	  case Xk:
+	      --nfrecvx;
+	      lk = LBj( k, grid ); /* Local block number, column-wise. */
+	      lsub = Lrowind_bc_ptr[lk];
+	      lusup = Lnzval_bc_ptr[lk];
+	      if ( lsub ) {
+		  nb   = lsub[0];
+		  lptr = BC_HEADER;
+		  luptr = 0;
+		  knsupc = SuperSize( k );
+
+		  /*
+		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		   */
+		  dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			     send_req, stat);
+	      } /* if lsub */
+
+	      break;
+
+	  case LSUM: /* Receiver must be a diagonal process */
+	      --nfrecvmod;
+	      lk = LBi( k, grid ); /* Local block number, row-wise. */
+	      ii = X_BLK( lk );
+	      knsupc = SuperSize( k );
+	      tempv = &recvbuf[LSUM_H];
+	      RHS_ITERATE(j)
+		  for (i = 0; i < knsupc; ++i)
+		      x[i + ii + j*knsupc] += tempv[i + j*knsupc];
+
+	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
+		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
+		  lk = LBj( k, grid ); /* Local block number, column-wise. */
+		  lsub = Lrowind_bc_ptr[lk];
+		  lusup = Lnzval_bc_ptr[lk];
+		  nsupr = lsub[1];
+#ifdef _CRAY
+		  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+			lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		  stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
+
+#if ( DEBUGlevel>=2 )
+		  printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		  /*
+		   * Send Xk to process column Pc[k].
+		   */
+		  kcol = PCOL( k, grid );
+		  for (p = 0; p < Pr; ++p) {
+		      if ( fsendx_plist[lk][p] != EMPTY ) {
+			  pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			  MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				     MPI_DOUBLE, pi, Xk, grid->comm, 
+				     &send_req[Llu->SolveMsgSent++]);
+#else
+#ifdef BSEND
+			  MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                     MPI_DOUBLE, pi, Xk, grid->comm );
+#else
+			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				 iam, x[ii-XK_H], pi);
+#endif
+		      }
+                  }
+		  /*
+		   * Perform local block modifications.
+		   */
+		  nb = lsub[0] - 1;
+		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		  luptr = knsupc; /* Skip diagonal block L(k,k). */
+
+		  dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
+			     send_req, stat);
+	      } /* if */
+
+	      break;
+
+#if ( DEBUGlevel>=2 )	      
+	    default:
+	      printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+	      break;
+#endif
+	  } /* switch */
+
+    } /* while not finished ... */
+
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+#if ( DEBUGlevel>=2 )
+    printf("\n(%d) .. After L-solve: y =\n", iam);
+    for (i = 0, k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    ii = X_BLK( lk );
+	    for (j = 0; j < knsupc; ++j)
+		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif
+
+    SUPERLU_FREE(fmod);
+    SUPERLU_FREE(frecv);
+    SUPERLU_FREE(rtemp);
+
+#ifdef ISEND_IRECV
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
+    Llu->SolveMsgSent = 0;
+#endif
+
+
+    /*---------------------------------------------------
+     * Back solve Ux = y.
+     *
+     * The Y components from the forward solve are already
+     * on the diagonal processes.
+     *---------------------------------------------------*/
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PDGSTRS_BGLOBAL. */
+    if ( !(bmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for bmod[].");
+    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
+    if ( !(brecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for brecv[].");
+    Llu->brecv = brecv;
+
+    /*
+     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+#if 1
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && bmod[lk] )
+		    mod_bit[lk] = 1;  /* Contribution from off-diagonal */
+	    }
+	}
+
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+
+#else /* old */
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && bmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+#endif
+    }
+
+    /* Re-initialize lsum to zero. Each block header is already in place. */
+    for (k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    il = LSUM_BLK( lk );
+	    dest = &lsum[il];
+	    RHS_ITERATE(j)
+		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0;
+	}
+    }
+
+    /* Set up additional pointers for the index and value arrays of U.
+       nub is the number of local block columns. */
+    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
+    if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) )
+	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
+					     blocks in a block column. */
+    Urbs1 = Urbs + nub;
+    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
+        ABORT("Malloc fails for Ucb_indptr[]");
+    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
+        ABORT("Malloc fails for Ucb_valptr[]");
+
+    /* Count number of row blocks in a block column. 
+       One pass of the skeleton graph of U. */
+    for (lk = 0; lk < nlb; ++lk) {
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    /* usub[0] -- number of column blocks in this block row. */
+#if ( DEBUGlevel>=2 )
+	    Ublocks += usub[0];
+#endif
+	    i = BR_HEADER; /* Pointer in index array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];            /* Global block number */
+		++Urbs[LBj(k,grid)];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+    /* Set up the vertical linked lists for the row blocks.
+       One pass of the skeleton graph of U. */
+    for (lb = 0; lb < nub; ++lb) {
+	if ( Urbs[lb] ) { /* Not an empty block column. */
+	    if ( !(Ucb_indptr[lb]
+		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+		ABORT("Malloc fails for Ucb_indptr[lb][]");
+	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
+		ABORT("Malloc fails for Ucb_valptr[lb][]");
+	}
+    }
+    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    i = BR_HEADER; /* Pointer in index array. */
+	    j = 0;         /* Pointer in nzval array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];          /* Global block number, column-wise. */
+		ljb = LBj( k, grid ); /* Local block number, column-wise. */
+		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
+		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
+		Ucb_valptr[ljb][Urbs1[ljb]] = j;
+		++Urbs1[ljb];
+		j += usub[i+1];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+#if ( DEBUGlevel>=2 )
+    for (p = 0; p < Pr*Pc; ++p) {
+	if (iam == p) {
+	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("(%2d) Local col %2d: # row blocks %2d\n",
+		       iam, lb, Urbs[lb]);
+		if ( Urbs[lb] ) {
+		    for (i = 0; i < Urbs[lb]; ++i)
+			printf("(%2d) .. row blk %2d:\
+                               lbnum %d, indpos %d, valpos %d\n",
+			       iam, i, 
+			       Ucb_indptr[lb][i].lbnum,
+			       Ucb_indptr[lb][i].indpos,
+			       Ucb_valptr[lb][i]);
+		}
+	    }
+	}
+	MPI_Barrier( grid->comm );
+    }
+    for (p = 0; p < Pr*Pc; ++p) {
+	if ( iam == p ) {
+	    printf("\n(%d) bsendx_plist[][]", iam);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("\n(%d) .. local col %2d: ", iam, lb);
+		for (i = 0; i < Pr; ++i)
+		    printf("%4d", bsendx_plist[lb][i]);
+	    }
+	    printf("\n");
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif /* DEBUGlevel */
+
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+    /*
+     * Solve the roots first by all the diagonal processes.
+     */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nroot %4d\n", iam, nroot);
+#endif
+    for (k = nsupers-1; k >= 0 && nroot; --k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid ); /* Local block number, row-wise. */
+	    if ( brecv[lk]==0 && bmod[lk]==0 ) {
+		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
+		--nroot;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p) {
+		    if ( bsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                   MPI_DOUBLE, pi, Xk, grid->comm,
+                                   &send_req[Llu->SolveMsgSent++]);
+#else
+#ifdef BSEND
+			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                   MPI_DOUBLE, pi, Xk, grid->comm );
+#else
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                  MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		}
+		/*
+		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
+		 */
+		if ( Urbs[lk] ) 
+		    dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+			       send_req, stat);
+	    } /* if root ... */
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+
+    /*
+     * Compute the internal nodes asynchronously by all processes.
+     */
+    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
+		 MPI_ANY_TAG, grid->comm, &status );
+	
+	k = *recvbuf; /* The message header carries the global block number. */
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+
+	switch ( status.MPI_TAG ) {
+	    case Xk:
+	        --nbrecvx;
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		/*
+		 * Perform local block modifications:
+		 *         lsum[i] -= U_i,k * X[k]
+		 */
+		dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
+			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
+			   send_req, stat);
+
+	        break;
+
+	    case LSUM: /* Receiver must be a diagonal process */
+		--nbrecvmod;
+		lk = LBi( k, grid ); /* Local block number, row-wise. */
+		ii = X_BLK( lk );
+		knsupc = SuperSize( k );
+		tempv = &recvbuf[LSUM_H];
+		RHS_ITERATE(j)
+		    for (i = 0; i < knsupc; ++i)
+			x[i + ii + j*knsupc] += tempv[i + j*knsupc];
+
+		if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
+		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
+		    lk = LBj( k, grid ); /* Local block number, column-wise. */
+		    lsub = Lrowind_bc_ptr[lk];
+		    lusup = Lnzval_bc_ptr[lk];
+		    nsupr = lsub[1];
+#ifdef _CRAY
+		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+			  lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		    stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    kcol = PCOL( k, grid );
+		    for (p = 0; p < Pr; ++p) {
+			if ( bsendx_plist[lk][p] != EMPTY ) {
+			    pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                       MPI_DOUBLE, pi, Xk, grid->comm,
+				       &send_req[Llu->SolveMsgSent++] );
+#else
+#ifdef BSEND
+			    MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                       MPI_DOUBLE, pi, Xk, grid->comm );
+#else
+			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                      MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				   iam, x[ii - XK_H], pi);
+#endif
+			}
+		    }
+		    /*
+		     * Perform local block modifications: 
+		     *         lsum[i] -= U_i,k * X[k]
+		     */
+		    if ( Urbs[lk] )
+			dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+				   send_req, stat);
+		} /* if becomes solvable */
+		
+		break;
+
+#if ( DEBUGlevel>=2 )
+	      default:
+		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+		break;
+#endif		
+
+	} /* switch */
+
+    } /* while not finished ... */
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
+#endif
+
+
+    /* Copy the solution X into B (on all processes). */
+    {
+	int_t num_diag_procs, *diag_procs, *diag_len;
+	double *work;
+
+	get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
+		       &diag_procs, &diag_len);
+	jj = diag_len[0];
+	for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]);
+	if ( !(work = doubleMalloc_dist(((size_t)jj)*nrhs)) )
+	    ABORT("Malloc fails for work[]");
+	gather_diag_to_all(n, nrhs, x, Glu_persist, Llu,
+			   grid, num_diag_procs, diag_procs, diag_len,
+			   B, ldb, work);
+	SUPERLU_FREE(diag_procs);
+	SUPERLU_FREE(diag_len);
+	SUPERLU_FREE(work);
+    }
+
+    /* Deallocate storage. */
+
+    SUPERLU_FREE(lsum);
+    SUPERLU_FREE(x);
+    SUPERLU_FREE(recvbuf);
+    for (i = 0; i < nub; ++i)
+	if ( Urbs[i] ) {
+	    SUPERLU_FREE(Ucb_indptr[i]);
+	    SUPERLU_FREE(Ucb_valptr[i]);
+	}
+    SUPERLU_FREE(Ucb_indptr);
+    SUPERLU_FREE(Ucb_valptr);
+    SUPERLU_FREE(Urbs);
+    SUPERLU_FREE(bmod);
+    SUPERLU_FREE(brecv);
+#ifdef ISEND_IRECV
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
+    SUPERLU_FREE(send_req);
+#endif
+#if defined (BSEND) && !defined (ISEND_IRECV)
+    SUPERLU_FREE(send_req);
+#endif /* Guarded: the ISEND_IRECV branch above already freed send_req. */
+
+    stat->utime[SOLVE] = SuperLU_timer_() - t;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgstrs_Bglobal()");
+#endif
+
+} /* PDGSTRS_BGLOBAL */
+
+
+/*
+ * Broadcast the x blocks owned by every diagonal process to all
+ * processes, and assemble them into the global vector y.
+ *
+ * x              local solution blocks; read only by the broadcast roots
+ * num_diag_procs number of diagonal processes
+ * diag_procs[p]  rank used as the broadcast root in round p
+ * diag_len[p]    receivers expect diag_len[p]*nrhs values in round p
+ * y, ldy         global result (written on every process) and its
+ *                leading dimension
+ * work           scratch buffer holding one broadcast message
+ *
+ * Round p covers supernodes p, p+num_diag_procs, p+2*num_diag_procs, ...
+ */
+static void
+gather_diag_to_all(int_t n, int_t nrhs, double x[],
+		   Glu_persist_t *Glu_persist, LocalLU_t *Llu,
+		   gridinfo_t *grid, int_t num_diag_procs,
+		   int_t diag_procs[], int_t diag_len[],
+		   double y[], int_t ldy, double work[])
+{
+    int_t row, rhs, blk, lblk, pos, count, d;
+    int_t *ilsum = Llu->ilsum;       /* Presumably read by the X_BLK() macro. */
+    int_t *xsup = Glu_persist->xsup; /* Presumably read by SuperSize()/FstBlockC(). */
+    int_t nsupers = Glu_persist->supno[n-1] + 1;
+    int iam = grid->iam, knsupc, root;
+    double *src, *dst;
+
+    for (d = 0; d < num_diag_procs; ++d) {
+	root = diag_procs[d];
+	count = diag_len[d]*nrhs;  /* Message length used by the receivers. */
+	if ( iam == root ) {
+	    /* Owner: pack its x blocks contiguously into work[]. */
+	    pos = 0;
+	    for (blk = d; blk < nsupers; blk += num_diag_procs) {
+		knsupc = SuperSize( blk );
+		lblk = LBi( blk, grid );
+		src = &x[X_BLK( lblk )];
+		for (rhs = 0; rhs < nrhs; ++rhs, src += knsupc)
+		    for (row = 0; row < knsupc; ++row) work[pos++] = src[row];
+	    }
+	    count = pos;  /* The owner sends exactly what it packed. */
+	}
+	MPI_Bcast( work, count, MPI_DOUBLE, root, grid->comm );
+
+	/* Every process: unpack work[] into the global y vector. */
+	pos = 0;
+	for (blk = d; blk < nsupers; blk += num_diag_procs) {
+	    knsupc = SuperSize( blk );
+	    dst = &y[FstBlockC( blk )];
+	    for (rhs = 0; rhs < nrhs; ++rhs, dst += ldy)
+		for (row = 0; row < knsupc; ++row) dst[row] = work[pos++];
+	}
+    }
+} /* GATHER_DIAG_TO_ALL */
+
diff --git a/SRC/pdgstrs_Bglobal_Bsend.c b/SRC/pdgstrs_Bglobal_Bsend.c
new file mode 100644
index 0000000..af26ed7
--- /dev/null
+++ b/SRC/pdgstrs_Bglobal_Bsend.c
@@ -0,0 +1,1017 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Solves a system of distributed linear equations
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+
+/*#define ISEND_IRECV*/
+
+/* Parry's change
+   Use MPI_Bsend with a large buffer attached in the main program */
+#define BSEND 1
+
+/* Function prototypes (the static helper must be declared at file scope). */
+#ifdef _CRAY
+fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
+		   double*, int*, double*, int*);
+fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, 
+		   int*, double*, int*, double*, double*, int*);
+_fcd ftcs1;
+_fcd ftcs2;
+_fcd ftcs3;
+#endif
+static void gather_diag_to_all(int_t, int_t, double [], Glu_persist_t *,
+			       LocalLU_t *, gridinfo_t *, int_t, int_t [],
+			       int_t [], double [], int_t, double []);
+/*! \brief Forward and back triangular solves for a globally replicated B.
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * pdgstrs_Bglobal solves the distributed system A*X = B, A general N-by-N,
+ * using the LU factorization computed by pdgstrf.  When BSEND is defined and
+ * ISEND_IRECV is not, solved X blocks are sent with MPI_Bsend.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from pdgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) double*
+ *        On entry, the right-hand side matrix of the possibly equilibrated
+ *        and row permuted system.
+ *        On exit, the solution matrix of the possibly equilibrated
+ *        and row permuted system if info = 0;
+ *
+ *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all 
+ *              processes when calling this routine.
+ *
+ * ldb    (input) int (global)
+ *        Leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: illegal argument; the code sets -1 for n < 0 and -9 for nrhs < 0
+ * </pre>      
+ */
+void
+pdgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *B,
+		int_t ldb, int nrhs, SuperLUStat_t *stat, int *info)
+{
+
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    double alpha = 1.0;
+    double *lsum;  /* Local running sum of the updates to B-components */
+    double *x;     /* X component at step k. */
+    double *lusup, *dest;
+    double *recvbuf, *tempv;
+    double *rtemp; /* Result of full matrix-vector multiply. */
+    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
+    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+    int_t  iam, kcol, krow, mycol, myrow;
+    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
+    int_t  nb, nlb, nub, nsupers;
+    int_t  *xsup, *lsub, *usub;
+    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
+    int_t  Pc, Pr;
+    int    knsupc, nsupr;
+    int    ldalsum;   /* Number of lsum entries locally owned. */
+    int    maxrecvsz, p, pi;
+    int_t  **Lrowind_bc_ptr;
+    double **Lnzval_bc_ptr;
+    MPI_Status status;
+#if defined(ISEND_IRECV) || defined(BSEND)
+    MPI_Request *send_req, recv_req;
+    int test_flag;
+#endif
+
+    /*-- Counts used for L-solve --*/
+    int_t  *fmod;         /* Modification count for L-solve. */
+    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int_t  *frecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int_t  nleaf = 0, nroot = 0;
+
+    /*-- Counts used for U-solve --*/
+    int_t  *bmod;         /* Modification count for U-solve. */
+    int_t  **bsendx_plist = Llu->bsendx_plist;
+    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int_t  *brecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
+    double t;
+#if ( DEBUGlevel>=2 )
+    int_t Ublocks = 0;
+#endif
+    /*-- Function prototypes.  NOTE(review): the definition below is static, so
+         a file-scope static declaration should be visible before this extern. --*/
+    extern void gather_diag_to_all(int_t, int_t, double [], Glu_persist_t *,
+				   LocalLU_t *, gridinfo_t *, int_t, int_t [],
+				   int_t [], double [], int_t, double []);
+    t = SuperLU_timer_();
+
+    /* Test input parameters. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( nrhs < 0 ) *info = -9; /* NOTE(review): nrhs is argument 6 here */
+    if ( *info ) {
+	pxerbla("PDGSTRS_BGLOBAL", grid, -*info);
+	return;
+    }
+	
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+#ifdef BSEND
+    if(!iam) {
+      printf("Using MPI_Bsend in triangular solve\n");
+      fflush(stdout);
+    }
+#endif
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
+
+    stat->ops[SOLVE] = 0.0;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pdgstrs_Bglobal()");
+#endif
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PDGSTRS_BGLOBAL. */
+    if ( !(fmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for fmod[].");
+    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
+    if ( !(frecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for frecv[].");
+    Llu->frecv = frecv; /* NOTE(review): left dangling by the free after the L-solve */
+
+#if defined(ISEND_IRECV) || defined(BSEND)
+    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(Pr*sizeof(MPI_Request))) )
+	ABORT("Malloc fails for send_req[].");
+    for (i = 0; i < Pr; ++i) send_req[i] = MPI_REQUEST_NULL;
+#endif
+
+#ifdef _CRAY
+    ftcs1 = _cptofcd("L", strlen("L"));
+    ftcs2 = _cptofcd("N", strlen("N"));
+    ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+
+    /* Obtain ilsum[] and ldalsum for process column 0. */
+    ilsum = Llu->ilsum;
+    ldalsum = Llu->ldalsum;
+
+    /* Allocate working storage. */
+    knsupc = sp_ienv_dist(3);
+    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
+    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H)))
+	ABORT("Calloc fails for lsum[].");
+    if ( !(x = doubleMalloc_dist(((size_t)ldalsum) * nrhs + nlb * XK_H)) )
+	ABORT("Malloc fails for x[].");
+    if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for recvbuf[].");
+    if ( !(rtemp = doubleMalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for rtemp[].");
+
+
+    
+    /*---------------------------------------------------
+     * Forward solve Ly = b.
+     *---------------------------------------------------*/
+
+    /*
+     * Copy B into X on the diagonal processes.
+     */
+    ii = 0;
+    for (k = 0; k < nsupers; ++k) {
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    lk = LBi( k, grid );   /* Local block number. */
+	    il = LSUM_BLK( lk );
+	    lsum[il - LSUM_H] = k; /* Block number prepended in the header. */
+	    kcol = PCOL( k, grid );
+	    if ( mycol == kcol ) { /* Diagonal process. */
+		jj = X_BLK( lk );
+		x[jj - XK_H] = k;  /* Block number prepended in the header. */
+		RHS_ITERATE(j)
+		    for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */
+			x[i + jj + j*knsupc] = B[i + ii + j*ldb];
+	    }
+	}
+	ii += knsupc;
+    }
+
+    /*
+     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && fmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
+		    assert( frecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+    }
+
+    /* ---------------------------------------------------------
+       Solve the leaf nodes first by all the diagonal processes.
+       --------------------------------------------------------- */
+#if ( DEBUGlevel>=1 )
+    printf("(%2d) nleaf %4d\n", iam, nleaf);
+#endif
+    for (k = 0; k < nsupers && nleaf; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
+		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#else
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
+		--nleaf;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p)
+		    if ( fsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+#if 1
+			MPI_Test( &send_req[p], &test_flag, &status );
+#else
+			if ( send_req[p] != MPI_REQUEST_NULL ) 
+			    MPI_Wait( &send_req[p], &status );
+#endif
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				  MPI_DOUBLE, pi, Xk, grid->comm, &send_req[p]);
+#else
+#ifdef BSEND
+			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				  MPI_DOUBLE, pi, Xk, grid->comm );
+#else
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				 MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		
+		/*
+		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		 */
+		nb = lsub[0] - 1;
+		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		luptr = knsupc; /* Skip diagonal block L(k,k). */
+		
+		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			   send_req,stat);
+#ifdef ISEND_IRECV
+		/* Wait for previous Isends to complete. */
+		for (p = 0; p < Pr; ++p) {
+		    if ( fsendx_plist[lk][p] != EMPTY )
+			/*MPI_Wait( &send_req[p], &status );*/
+			MPI_Test( &send_req[p], &test_flag, &status );
+		}
+#endif
+	    }
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+    /* -----------------------------------------------------------
+       Compute the internal nodes asynchronously by all processes.
+       ----------------------------------------------------------- */
+#if ( DEBUGlevel>=1 )
+    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
+	   iam, nfrecvx, nfrecvmod, nleaf);
+#endif
+
+    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+#ifdef ISEND_IRECV
+	/* -MPI- FATAL: Remote protocol queue full */
+	MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
+		 MPI_ANY_TAG, grid->comm, &recv_req );
+	MPI_Wait( &recv_req, &status );
+#else
+	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
+		 MPI_ANY_TAG, grid->comm, &status );
+#endif
+
+	k = *recvbuf; /* The block number travels in the message header. */
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+	
+	switch ( status.MPI_TAG ) {
+	  case Xk:
+	      --nfrecvx;
+	      lk = LBj( k, grid ); /* Local block number, column-wise. */
+	      lsub = Lrowind_bc_ptr[lk];
+	      lusup = Lnzval_bc_ptr[lk];
+	      if ( lsub ) {
+		  nb   = lsub[0];
+		  lptr = BC_HEADER;
+		  luptr = 0;
+		  knsupc = SuperSize( k );
+
+		  /*
+		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		   */
+		  dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			     send_req, stat);
+	      } /* if lsub */
+
+	      break;
+
+	  case LSUM:
+	      --nfrecvmod;
+	      lk = LBi( k, grid ); /* Local block number, row-wise. */
+	      ii = X_BLK( lk );
+	      knsupc = SuperSize( k );
+	      tempv = &recvbuf[LSUM_H];
+	      RHS_ITERATE(j)
+		  for (i = 0; i < knsupc; ++i)
+		      x[i + ii + j*knsupc] += tempv[i + j*knsupc];
+
+	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
+		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
+		  lk = LBj( k, grid ); /* Local block number, column-wise. */
+		  lsub = Lrowind_bc_ptr[lk];
+		  lusup = Lnzval_bc_ptr[lk];
+		  nsupr = lsub[1];
+#ifdef _CRAY
+		  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+			lusup, &nsupr, &x[ii], &knsupc);
+#else
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		  stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
+#if ( DEBUGlevel>=2 )
+		  printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		  /*
+		   * Send Xk to process column Pc[k].
+		   */
+		  kcol = PCOL( k, grid );
+		  for (p = 0; p < Pr; ++p)
+		      if ( fsendx_plist[lk][p] != EMPTY ) {
+			  pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+#if 1
+			  MPI_Test( &send_req[p], &test_flag, &status );
+#else
+			  if ( send_req[p] != MPI_REQUEST_NULL )
+			    MPI_Wait( &send_req[p], &status );
+#endif
+			  MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H,
+				    MPI_DOUBLE, pi, Xk, grid->comm, 
+				    &send_req[p]);
+#else
+#ifdef BSEND
+			  MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   MPI_DOUBLE, pi, Xk, grid->comm );
+#else
+			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				 iam, x[ii-XK_H], pi);
+#endif
+		      }
+
+		  /*
+		   * Perform local block modifications.
+		   */
+		  nb = lsub[0] - 1;
+		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		  luptr = knsupc; /* Skip diagonal block L(k,k). */
+
+		  dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
+			     send_req, stat);
+#ifdef ISEND_IRECV
+		  /* Wait for the previous Isends to complete. */
+		  for (p = 0; p < Pr; ++p) {
+		      if ( fsendx_plist[lk][p] != EMPTY )
+			  MPI_Test( &send_req[p], &test_flag, &status );
+		  }
+#endif
+	      } /* if */
+
+	      break;
+
+#if ( DEBUGlevel>=1 )	      
+	    default:
+	      printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+	      break;
+#endif
+	  } /* switch */
+
+    } /* while not finished ... */
+
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+#if ( PRNTlevel==2 )
+    if ( !iam ) printf("\n.. After L-solve: y =\n");
+    for (i = 0, k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    ii = X_BLK( lk );
+	    for (j = 0; j < knsupc; ++j)
+		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif
+
+    SUPERLU_FREE(fmod);
+    SUPERLU_FREE(frecv);
+    SUPERLU_FREE(rtemp);
+
+    /* MPI_Barrier( grid->comm );  Drain messages in the forward solve. */
+
+
+    /*---------------------------------------------------
+     * Back solve Ux = y.
+     *
+     * The Y components from the forward solve is already
+     * on the diagonal processes.
+     *---------------------------------------------------*/
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PDGSTRS_BGLOBAL. */
+    if ( !(bmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for bmod[].");
+    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
+    if ( !(brecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for brecv[].");
+    Llu->brecv = brecv; /* NOTE(review): left dangling by the free at the end */
+
+    /*
+     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && bmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+    }
+
+    /* Re-initialize lsum to zero. Each block header is already in place. */
+    for (k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    il = LSUM_BLK( lk );
+	    dest = &lsum[il];
+	    RHS_ITERATE(j)
+		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0;
+	}
+    }
+
+    /* Set up additional pointers for the index and value arrays of U.
+       nlb is the number of local block rows. */
+    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
+    if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) )
+	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
+					     blocks in a block column. */
+    Urbs1 = Urbs + nub;
+    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
+        ABORT("Malloc fails for Ucb_indptr[]");
+    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
+        ABORT("Malloc fails for Ucb_valptr[]");
+
+    /* Count number of row blocks in a block column. 
+       One pass of the skeleton graph of U. */
+    for (lk = 0; lk < nlb; ++lk) {
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    /* usub[0] -- number of column blocks in this block row. */
+#if ( DEBUGlevel>=2 )
+	    Ublocks += usub[0];
+#endif
+	    i = BR_HEADER; /* Pointer in index array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];            /* Global block number */
+		++Urbs[LBj(k,grid)];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+    /* Set up the vertical linked lists for the row blocks.
+       One pass of the skeleton graph of U. */
+    for (lb = 0; lb < nub; ++lb)
+	if ( Urbs[lb] ) { /* Not an empty block column. */
+	    if ( !(Ucb_indptr[lb]
+		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+		ABORT("Malloc fails for Ucb_indptr[lb][]");
+	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
+		ABORT("Malloc fails for Ucb_valptr[lb][]");
+	}
+    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    i = BR_HEADER; /* Pointer in index array. */
+	    j = 0;         /* Pointer in nzval array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];          /* Global block number, column-wise. */
+		ljb = LBj( k, grid ); /* Local block number, column-wise. */
+		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
+		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
+		Ucb_valptr[ljb][Urbs1[ljb]] = j;
+		++Urbs1[ljb];
+		j += usub[i+1];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+#if ( DEBUGlevel>=2 )
+    for (p = 0; p < Pr*Pc; ++p) {
+	if (iam == p) {
+	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("(%2d) Local col %2d: # row blocks %2d\n",
+		       iam, lb, Urbs[lb]);
+		if ( Urbs[lb] ) {
+		    for (i = 0; i < Urbs[lb]; ++i)
+			printf("(%2d) .. row blk %2d:\
+                               lbnum %d, indpos %d, valpos %d\n",
+			       iam, i, 
+			       Ucb_indptr[lb][i].lbnum,
+			       Ucb_indptr[lb][i].indpos,
+			       Ucb_valptr[lb][i]);
+		}
+	    }
+	}
+	MPI_Barrier( grid->comm );
+    }
+    for (p = 0; p < Pr*Pc; ++p) {
+	if ( iam == p ) {
+	    printf("\n(%d) bsendx_plist[][]", iam);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("\n(%d) .. local col %2d: ", iam, lb);
+		for (i = 0; i < Pr; ++i)
+		    printf("%4d", bsendx_plist[lb][i]);
+	    }
+	    printf("\n");
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif /* DEBUGlevel */
+
+
+#if ( PRNTlevel>=3 )
+    t = SuperLU_timer_() - t;
+    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+    /*
+     * Solve the roots first by all the diagonal processes.
+     */
+#if ( DEBUGlevel>=1 )
+    printf("(%2d) nroot %4d\n", iam, nroot);
+#endif
+    for (k = nsupers-1; k >= 0 && nroot; --k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid ); /* Local block number, row-wise. */
+	    if ( brecv[lk]==0 && bmod[lk]==0 ) {
+		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#else
+		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
+		--nroot;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p)
+		    if ( bsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+#if 1
+			MPI_Test( &send_req[p], &test_flag, &status );
+#else
+			if ( send_req[p] != MPI_REQUEST_NULL )
+			  MPI_Wait( &send_req[p], &status );
+#endif
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				  MPI_DOUBLE, pi, Xk, grid->comm, &send_req[p]);
+#else
+#ifdef BSEND
+			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				 MPI_DOUBLE, pi, Xk, grid->comm );
+#else
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				 MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		
+		/*
+		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
+		 */
+		if ( Urbs[lk] ) 
+		    dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+			       send_req, stat);
+#ifdef ISEND_IRECV
+		/* Wait for the previous Isends to complete. */
+		for (p = 0; p < Pr; ++p) {
+		    if ( bsendx_plist[lk][p] != EMPTY )
+			/*MPI_Wait( &send_req[p], &status );*/
+			MPI_Test( &send_req[p], &test_flag, &status );
+		}
+#endif
+	    } /* if root ... */
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+
+    /*
+     * Compute the internal nodes asynchronously by all processes.
+     */
+    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
+		 MPI_ANY_TAG, grid->comm, &status );
+	
+	k = *recvbuf; /* The block number travels in the message header. */
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+
+	switch ( status.MPI_TAG ) {
+	    case Xk:
+	        --nbrecvx;
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		/*
+		 * Perform local block modifications:
+		 *         lsum[i] -= U_i,k * X[k]
+		 */
+		dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
+			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
+			   send_req, stat);
+
+	        break;
+
+	    case LSUM:
+		--nbrecvmod;
+		lk = LBi( k, grid ); /* Local block number, row-wise. */
+		ii = X_BLK( lk );
+		knsupc = SuperSize( k );
+		tempv = &recvbuf[LSUM_H];
+		RHS_ITERATE(j)
+		    for (i = 0; i < knsupc; ++i)
+			x[i + ii + j*knsupc] += tempv[i + j*knsupc];
+
+		if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
+		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
+		    lk = LBj( k, grid ); /* Local block number, column-wise. */
+		    lsub = Lrowind_bc_ptr[lk];
+		    lusup = Lnzval_bc_ptr[lk];
+		    nsupr = lsub[1];
+#ifdef _CRAY
+		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+			  lusup, &nsupr, &x[ii], &knsupc);
+#else
+		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		    stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    kcol = PCOL( k, grid );
+		    for (p = 0; p < Pr; ++p)
+			if ( bsendx_plist[lk][p] != EMPTY ) {
+			    pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+#if 1
+			    MPI_Test( &send_req[p], &test_flag, &status );
+#else
+			    if ( send_req[p] != MPI_REQUEST_NULL )
+			        MPI_Wait( &send_req[p], &status );
+#endif
+			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				      MPI_DOUBLE, pi, Xk, grid->comm,
+				      &send_req[p] );
+#else
+#ifdef BSEND
+			    MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				     MPI_DOUBLE, pi, Xk, grid->comm );
+#else
+			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				     MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				   iam, x[ii - XK_H], pi);
+#endif
+			}
+		
+		    /*
+		     * Perform local block modifications: 
+		     *         lsum[i] -= U_i,k * X[k]
+		     */
+		    if ( Urbs[lk] )
+			dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+				   send_req, stat);
+#ifdef ISEND_IRECV
+		    /* Wait for the previous Isends to complete. */
+		    for (p = 0; p < Pr; ++p) {
+			if ( bsendx_plist[lk][p] != EMPTY )
+			    /*MPI_Wait( &send_req[p], &status );*/
+			    MPI_Test( &send_req[p], &test_flag, &status );
+		    }
+#endif
+		} /* if becomes solvable */
+		
+		break;
+
+#if ( DEBUGlevel>=1 )
+	      default:
+		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+		break;
+#endif		
+
+	} /* switch */
+
+    } /* while not finished ... */
+
+#if ( PRNTlevel>=3 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
+#endif
+
+
+    /* Copy the solution X into B (on all processes). */
+    {
+	int_t num_diag_procs, *diag_procs, *diag_len;
+	double *work;
+
+	get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
+		       &diag_procs, &diag_len);
+	jj = diag_len[0];
+	for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]);
+	if ( !(work = doubleMalloc_dist(jj*nrhs)) )
+	    ABORT("Malloc fails for work[]");
+	gather_diag_to_all(n, nrhs, x, Glu_persist, Llu,
+			   grid, num_diag_procs, diag_procs, diag_len,
+			   B, ldb, work);
+	SUPERLU_FREE(diag_procs);
+	SUPERLU_FREE(diag_len);
+	SUPERLU_FREE(work);
+    }
+
+    /* Deallocate storage. */
+
+    SUPERLU_FREE(lsum);
+    SUPERLU_FREE(x);
+    SUPERLU_FREE(recvbuf);
+    for (i = 0; i < nub; ++i)
+	if ( Urbs[i] ) {
+	    SUPERLU_FREE(Ucb_indptr[i]);
+	    SUPERLU_FREE(Ucb_valptr[i]);
+	}
+    SUPERLU_FREE(Ucb_indptr);
+    SUPERLU_FREE(Ucb_valptr);
+    SUPERLU_FREE(Urbs);
+    SUPERLU_FREE(bmod);
+    SUPERLU_FREE(brecv);
+#if defined(ISEND_IRECV) || defined(BSEND) /* same guard as the allocation */
+    for (p = 0; p < Pr; ++p) {
+        if ( send_req[p] != MPI_REQUEST_NULL )
+	    MPI_Wait( &send_req[p], &status );
+    }
+    SUPERLU_FREE(send_req);
+#endif
+
+    stat->utime[SOLVE] = SuperLU_timer_() - t;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pdgstrs_Bglobal()");
+#endif
+/* Chao debug */
+
+  MPI_Barrier( grid->comm );  /* Drain messages in the forward solve. */
+
+} /* PDGSTRS_BGLOBAL */
+
+
+/*! \brief
+ *
+ * <pre>
+ * Broadcast the x blocks held by each diagonal process, one process at
+ * a time, and unpack them on every process into the global vector y.
+ * </pre>
+ */
+static void
+gather_diag_to_all(int_t n, int_t nrhs, double x[],
+		   Glu_persist_t *Glu_persist, LocalLU_t *Llu,
+		   gridinfo_t *grid, int_t num_diag_procs,
+		   int_t diag_procs[], int_t diag_len[],
+		   double y[], int_t ldy, double work[])
+{
+    /* xsup and ilsum are never named below; presumably the SuperSize(),
+       FstBlockC() and X_BLK() macros expand to them -- keep both names. */
+    int_t *xsup = Glu_persist->xsup;
+    int_t *ilsum = Llu->ilsum;
+    int_t nsupers = Glu_persist->supno[n-1] + 1;
+    int_t d, blk, col, row, pos, len, lk, ii;
+    int iam = grid->iam;
+    int root, bsz;
+    double *src, *dst;
+
+    /* Each diagonal process takes one turn as the broadcast root. */
+    for (d = 0; d < num_diag_procs; ++d) {
+	root = diag_procs[d];
+
+	if ( iam != root ) {
+	    /* Receivers take the message length from diag_len[]. */
+	    len = diag_len[d] * nrhs;
+	} else {
+	    /* The root packs its blocks of x back to back into work[]. */
+	    len = 0;
+	    for (blk = d; blk < nsupers; blk += num_diag_procs) {
+		bsz = SuperSize( blk );
+		lk = LBi( blk, grid );
+		ii = X_BLK( lk );
+		src = &x[ii];
+		for (col = 0; col < nrhs; ++col, src += bsz, len += bsz)
+		    for (row = 0; row < bsz; ++row) work[len + row] = src[row];
+	    }
+	}
+	/* Root and receivers alike make this one broadcast call. */
+	MPI_Bcast( work, len, MPI_DOUBLE, root, grid->comm );
+
+	/* Unpack work[]: each block lands at offset FstBlockC(blk) of y,
+	   with successive right-hand sides ldy entries apart. */
+	pos = 0;
+	for (blk = d; blk < nsupers; blk += num_diag_procs) {
+	    bsz = SuperSize( blk );
+	    ii = FstBlockC( blk );
+	    dst = &y[ii];
+	    for (col = 0; col < nrhs; ++col, dst += ldy, pos += bsz)
+		for (row = 0; row < bsz; ++row) dst[row] = work[pos + row];
+	}
+    }
+} /* GATHER_DIAG_TO_ALL */
diff --git a/SRC/pdgstrs_lsum.c b/SRC/pdgstrs_lsum.c
new file mode 100644
index 0000000..8d3da84
--- /dev/null
+++ b/SRC/pdgstrs_lsum.c
@@ -0,0 +1,374 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k]
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+#define ISEND_IRECV
+
+/*
+ * Function prototypes
+ */
+#ifdef _CRAY
+fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
+		   double*, int*, double*, int*);
+fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, 
+		   int*, double*, int*, double*, double*, int*);
+_fcd ftcs1;
+_fcd ftcs2;
+_fcd ftcs3;
+#endif
+
+/************************************************************************/
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * </pre>
+ */
+void dlsum_fmod
+/************************************************************************/
+(
+ double *lsum,    /* Sum of local modifications.                        */
+ double *x,       /* X array (local)                                    */
+ double *xk,      /* X[k].                                              */
+ double *rtemp,   /* Result of full matrix-vector multiply.             */
+ int   nrhs,      /* Number of right-hand sides.                        */
+ int   knsupc,    /* Size of supernode k.                               */
+ int_t k,         /* The k-th component of X.                           */
+ int_t *fmod,     /* Modification count for L-solve.                    */
+ int_t nlb,       /* Number of L blocks.                                */
+ int_t lptr,      /* Starting position in lsub[*].                      */
+ int_t luptr,     /* Starting position in lusup[*].                     */
+ int_t *xsup,     /* xsup[ik] is used as the first global row of block ik. */
+ gridinfo_t *grid, /* Process grid; iam, nprow and comm are read.        */
+ LocalLU_t *Llu,  /* Local L data; Llu->SolveMsgSent is incremented.    */
+ MPI_Request send_req[], /* input/output: one slot filled per MPI_Isend posted */
+ SuperLUStat_t *stat /* stat->ops[SOLVE] is accumulated.                */
+)
+{
+    double alpha = 1.0, beta = 0.0; /* Scalars passed to dgemm/dtrsm. */
+    double *lusup, *lusup1;
+    double *dest;
+    int    iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi;
+    int_t  i, ii, ik, il, ikcol, irow, j, lb, lk, rel;
+    int_t  *lsub, *lsub1, nlb1, lptr1, luptr1;
+    int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
+    int_t  *frecv = Llu->frecv; /* frecv[lk]==0 is required before X[ik] is solved here. */
+    int_t  **fsendx_plist = Llu->fsendx_plist; /* [lk][p] != EMPTY: process row p is sent X. */
+    MPI_Status status; /* Not referenced in this routine. */
+    int test_flag;     /* Not referenced in this routine. */
+
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    lk = LBj( k, grid ); /* Local block number, column-wise. */
+    lsub = Llu->Lrowind_bc_ptr[lk];
+    lusup = Llu->Lnzval_bc_ptr[lk];
+    nsupr = lsub[1]; /* Passed to dgemm as the leading dimension of lusup. */
+
+    for (lb = 0; lb < nlb; ++lb) {
+	ik = lsub[lptr]; /* Global block number, row-wise. */
+	nbrow = lsub[lptr+1]; /* Number of rows in this block. */
+#ifdef _CRAY
+	SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc,
+	      &alpha, &lusup[luptr], &nsupr, xk,
+	      &knsupc, &beta, rtemp, &nbrow );
+#elif defined (USE_VENDOR_BLAS)
+	dgemm_( "N", "N", &nbrow, &nrhs, &knsupc,
+	       &alpha, &lusup[luptr], &nsupr, xk,
+	       &knsupc, &beta, rtemp, &nbrow, 1, 1 );
+#else
+	dgemm_( "N", "N", &nbrow, &nrhs, &knsupc,
+	       &alpha, &lusup[luptr], &nsupr, xk,
+	       &knsupc, &beta, rtemp, &nbrow );
+#endif
+	stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; /* Operation count for the product and the subtraction below. */
+   
+	lk = LBi( ik, grid ); /* Local block number, row-wise. */
+	iknsupc = SuperSize( ik );
+	il = LSUM_BLK( lk );
+	dest = &lsum[il];
+	lptr += LB_DESCRIPTOR; /* Step over the block descriptor to the row subscripts. */
+	rel = xsup[ik]; /* Global row index of block ik. */
+	for (i = 0; i < nbrow; ++i) {
+	    irow = lsub[lptr++] - rel; /* Relative row. */
+	    RHS_ITERATE(j)
+		dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; /* lsum -= L(ik,k) * X[k] */
+	}
+	luptr += nbrow; /* Next block's values follow nbrow entries later in lusup[]. */
+		    
+	if ( (--fmod[lk])==0 ) { /* Local accumulation done. */
+	    ikcol = PCOL( ik, grid );
+	    p = PNUM( myrow, ikcol, grid );
+	    if ( iam != p ) {
+#ifdef ISEND_IRECV
+		MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, /* Message includes the LSUM_H entries preceding the block. */
+			   MPI_DOUBLE, p, LSUM, grid->comm,
+                           &send_req[Llu->SolveMsgSent++] );
+#else
+#ifdef BSEND
+		MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			   MPI_DOUBLE, p, LSUM, grid->comm );
+#else
+		MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			 MPI_DOUBLE, p, LSUM, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n",
+		       iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
+#endif
+	    } else { /* Diagonal process: X[i] += lsum[i]. */
+		ii = X_BLK( lk );
+		RHS_ITERATE(j)
+		    for (i = 0; i < iknsupc; ++i)
+			x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc];
+		if ( frecv[lk]==0 ) { /* Becomes a leaf node. */
+		    fmod[lk] = -1; /* Do not solve X[k] in the future. */
+		    lk = LBj( ik, grid );/* Local block number, column-wise. */
+		    lsub1 = Llu->Lrowind_bc_ptr[lk];
+		    lusup1 = Llu->Lnzval_bc_ptr[lk];
+		    nsupr1 = lsub1[1];
+#ifdef _CRAY
+		    STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
+			  lusup1, &nsupr1, &x[ii], &iknsupc);
+#elif defined (USE_VENDOR_BLAS)
+		    dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+			   lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);
+#else
+		    dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+			   lusup1, &nsupr1, &x[ii], &iknsupc);
+#endif
+		    stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; /* Operation count for the triangular solve. */
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, ik);
+#endif
+		
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    for (p = 0; p < grid->nprow; ++p) {
+			if ( fsendx_plist[lk][p] != EMPTY ) {
+			    pi = PNUM( p, ikcol, grid );
+#ifdef ISEND_IRECV
+			    MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, /* Message includes the XK_H entries preceding the block. */
+				       MPI_DOUBLE, pi, Xk, grid->comm,
+				       &send_req[Llu->SolveMsgSent++] );
+#else
+#ifdef BSEND
+			    MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				       MPI_DOUBLE, pi, Xk, grid->comm );
+#else
+			    MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				     MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				   iam, x[ii-XK_H], pi);
+#endif
+			}
+                    }
+		    /*
+		     * Perform local block modifications.
+		     */
+		    nlb1 = lsub1[0] - 1; /* Block count from lsub1[0], minus the diagonal block. */
+		    lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; /* First off-diagonal block in lsub1[]. */
+		    luptr1 = iknsupc; /* Skip diagonal block L(I,I). */
+
+		    dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, /* Recurse with the newly solved X[ik]. */
+			       fmod, nlb1, lptr1, luptr1, xsup,
+			       grid, Llu, send_req, stat);
+		} /* if frecv[lk] == 0 */
+	    } /* if iam == p */
+	} /* if fmod[lk] == 0 */
+
+    } /* for lb ... */
+
+} /* dLSUM_FMOD */
+
+
+/************************************************************************/
+void dlsum_bmod
+/************************************************************************/
+(
+ double *lsum,        /* Sum of local modifications.                    */
+ double *x,           /* X array (local).                               */
+ double *xk,          /* X[k].                                          */
+ int    nrhs,	      /* Number of right-hand sides.                    */
+ int_t  k,            /* The k-th component of X.                       */
+ int_t  *bmod,        /* Modification count for U-solve.                */
+ int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
+ Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
+ int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
+ int_t  *xsup,        /* NOTE(review): not named in the body; presumably read by SuperSize/FstBlockC -- confirm. */
+ gridinfo_t *grid,    /* Process grid; iam, nprow and comm are read.    */
+ LocalLU_t *Llu,      /* Local L/U data; Llu->SolveMsgSent is incremented. */
+ MPI_Request send_req[], /* input/output: one slot filled per MPI_Isend posted */
+ SuperLUStat_t *stat  /* stat->ops[SOLVE] is accumulated.               */
+ )
+{
+/*
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= U_i,k * X[k].
+ */
+    double alpha = 1.0; /* Scalar passed to dtrsm. */
+    int    iam, iknsupc, knsupc, myrow, nsupr, p, pi;
+    int_t  fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow,
+           j, jj, lk, lk1, nub, ub, uptr;
+    int_t  *usub;
+    double *uval, *dest, *y;
+    int_t  *lsub;
+    double *lusup;
+    int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
+    int_t  *brecv = Llu->brecv; /* brecv[ik]==0 is required before X[gik] is solved here. */
+    int_t  **bsendx_plist = Llu->bsendx_plist; /* [lk1][p] != EMPTY: process row p is sent X. */
+    MPI_Status status; /* Not referenced in this routine. */
+    int test_flag;     /* Not referenced in this routine. */
+
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    knsupc = SuperSize( k );
+    lk = LBj( k, grid ); /* Local block number, column-wise. */
+    nub = Urbs[lk];      /* Number of U blocks in block column lk */
+
+    for (ub = 0; ub < nub; ++ub) {
+	ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
+	usub = Llu->Ufstnz_br_ptr[ik];
+	uval = Llu->Unzval_br_ptr[ik];
+	i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */
+	i += UB_DESCRIPTOR; /* Step over the block descriptor to the per-column entries. */
+	il = LSUM_BLK( ik );
+	gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */
+	iknsupc = SuperSize( gik );
+	ikfrow = FstBlockC( gik );
+	iklrow = FstBlockC( gik+1 ); /* One past the last row of block gik. */
+
+	RHS_ITERATE(j) {
+	    dest = &lsum[il + j*iknsupc];
+	    y = &xk[j*knsupc];
+	    uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */
+	    for (jj = 0; jj < knsupc; ++jj) {
+		fnz = usub[i + jj]; /* First row of the stored segment in column jj. */
+		if ( fnz < iklrow ) { /* Nonzero segment. */
+		    /* AXPY */
+		    for (irow = fnz; irow < iklrow; ++irow)
+			dest[irow - ikfrow] -= uval[uptr++] * y[jj];
+		    stat->ops[SOLVE] += 2 * (iklrow - fnz);
+		}
+	    } /* for jj ... */
+	}
+
+	if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */
+	    gikcol = PCOL( gik, grid );
+	    p = PNUM( myrow, gikcol, grid );
+	    if ( iam != p ) {
+#ifdef ISEND_IRECV
+		MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, /* Message includes the LSUM_H entries preceding the block. */
+			   MPI_DOUBLE, p, LSUM, grid->comm,
+                           &send_req[Llu->SolveMsgSent++] );
+#else
+#ifdef BSEND
+		MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			   MPI_DOUBLE, p, LSUM, grid->comm );
+#else
+		MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			  MPI_DOUBLE, p, LSUM, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n",
+		       iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
+#endif
+	    } else { /* Diagonal process: X[i] += lsum[i]. */
+		ii = X_BLK( ik );
+		dest = &x[ii];
+		RHS_ITERATE(j)
+		    for (i = 0; i < iknsupc; ++i)
+			dest[i + j*iknsupc] += lsum[i + il + j*iknsupc];
+		if ( !brecv[ik] ) { /* Becomes a leaf node. */
+		    bmod[ik] = -1; /* Do not solve X[k] in the future. */
+		    lk1 = LBj( gik, grid ); /* Local block number. */
+		    lsub = Llu->Lrowind_bc_ptr[lk1];
+		    lusup = Llu->Lnzval_bc_ptr[lk1];
+		    nsupr = lsub[1]; /* Passed to dtrsm as the leading dimension of lusup. */
+#ifdef _CRAY
+		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
+			  lusup, &nsupr, &x[ii], &iknsupc);
+#elif defined (USE_VENDOR_BLAS)
+		    dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);
+#else
+		    dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &iknsupc);
+#endif
+		    stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; /* Operation count for the triangular solve. */
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, gik);
+#endif
+
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    for (p = 0; p < grid->nprow; ++p) {
+			if ( bsendx_plist[lk1][p] != EMPTY ) {
+			    pi = PNUM( p, gikcol, grid );
+#ifdef ISEND_IRECV
+			    MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, /* Message includes the XK_H entries preceding the block. */
+				       MPI_DOUBLE, pi, Xk, grid->comm,
+				       &send_req[Llu->SolveMsgSent++] );
+#else
+#ifdef BSEND
+			    MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				       MPI_DOUBLE, pi, Xk, grid->comm );
+#else
+			    MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				     MPI_DOUBLE, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				   iam, x[ii-XK_H], pi);
+#endif
+			}
+                     }
+		    /*
+		     * Perform local block modifications.
+		     */
+		    if ( Urbs[lk1] ) /* Recurse only when block column lk1 has U blocks. */
+			dlsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs,
+				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+				   send_req, stat);
+		} /* if brecv[ik] == 0 */
+	    }
+	} /* if bmod[ik] == 0 */
+
+    } /* for ub ... */
+
+} /* dlSUM_BMOD */
+
diff --git a/SRC/pdlangs.c b/SRC/pdlangs.c
new file mode 100644
index 0000000..1106765
--- /dev/null
+++ b/SRC/pdlangs.c
@@ -0,0 +1,145 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Returns the value of the one norm, or the Frobenius norm, or the infinity norm, or the element of largest value
+ *
+ * <pre>
+ * File name:	pdlangs.c
+ * History:     Modified from lapack routine DLANGE
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+
+<pre> 
+    Purpose   
+    =======   
+
+    PDLANGS returns the value of the one norm, or the Frobenius norm, or 
+    the infinity norm, or the element of largest absolute value of a 
+    real matrix A.   
+
+    Description   
+    ===========   
+
+    PDLANGS returns the value   
+
+       PDLANGS = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
+                 (   
+                 ( norm1(A),         NORM = '1', 'O' or 'o'   
+                 (   
+                 ( normI(A),         NORM = 'I' or 'i'   
+                 (   
+                 ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
+
+    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
+    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
+    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
+    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
+
+    Arguments   
+    =========   
+
+    NORM    (input) CHARACTER*1   
+            Specifies the value to be returned in PDLANGS as described above.   
+    A       (input) SuperMatrix*
+            The M by N sparse matrix A. 
+    GRID    (input) gridinfo_t*
+            The 2D process mesh.
+   ===================================================================== 
+</pre>
+*/
+
+double pdlangs(char *norm, SuperMatrix *A, gridinfo_t *grid)
+{   
+    /* Local variables */
+    NRformat_loc *Astore;   /* Row-compressed storage of the locally owned rows. */
+    int_t    m_loc;         /* Number of locally owned rows. */
+    double   *Aval;         /* Nonzero values of the local rows. */
+    int_t    i, j, jcol;
+    double   value=0., sum;
+    double   *rwork;        /* Local column sums (one-norm only). */
+    double   tempvalue;     /* Receive buffer for scalar reductions. */
+    double   *temprwork;    /* Global column sums (one-norm only). */
+
+    Astore = (NRformat_loc *) A->Store;
+    m_loc = Astore->m_loc;
+    Aval   = (double *) Astore->nzval;
+    
+    /* Only the upper-case selectors "M", "O", "I", "F", "E" and '1' are matched below. */
+    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
+	value = 0.;
+    } else if ( strncmp(norm, "M", 1)==0 ) {
+	/* Find max(abs(A(i,j))): local maximum, then MPI_MAX over the grid. */
+	value = 0.;
+	for (i = 0; i < m_loc; ++i) {
+	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
+		value = SUPERLU_MAX( value, fabs(Aval[j]) );
+	}
+
+	MPI_Allreduce(&value, &tempvalue, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+	value = tempvalue;
+
+    } else if ( strncmp(norm, "O", 1)==0 || *(unsigned char *)norm == '1') {
+	/* Find norm1(A): sum the column sums over the grid, then take the maximum. */
+	value = 0.;
+#if 0
+	for (j = 0; j < A->ncol; ++j) {
+	    sum = 0.;
+	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
+		sum += fabs(Aval[i]);
+	    value = SUPERLU_MAX(value,sum);
+	}
+#else /* XSL ==> */
+	if ( !(rwork = (double *) doubleCalloc_dist(A->ncol)) )
+	    ABORT("doubleCalloc_dist fails for rwork.");
+	for (i = 0; i < m_loc; ++i) {
+	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+	        jcol = Astore->colind[j];
+		rwork[jcol] += fabs(Aval[j]);
+	    }
+	}
+
+	if ( !(temprwork = (double *) doubleCalloc_dist(A->ncol)) )
+	    ABORT("doubleCalloc_dist fails for temprwork.");
+	MPI_Allreduce(rwork, temprwork, A->ncol, MPI_DOUBLE, MPI_SUM, grid->comm);
+	value = 0.;
+	for (j = 0; j < A->ncol; ++j) {
+	    value = SUPERLU_MAX(value, temprwork[j]);
+	}
+	SUPERLU_FREE (temprwork);
+	SUPERLU_FREE (rwork);
+#endif	
+    } else if ( strncmp(norm, "I", 1)==0 ) {
+	/* Find normI(A): maximum row sum; each row is entirely local. */
+	value = 0.;
+	for (i = 0; i < m_loc; ++i) {
+	    sum = 0.; /* Reset per row; a running total would return the sum of all local entries. */
+	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
+	        sum += fabs(Aval[j]);
+	    value = SUPERLU_MAX(value, sum);
+	}
+	MPI_Allreduce(&value, &tempvalue, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+	value = tempvalue;
+
+    } else if ( strncmp(norm, "F", 1)==0 || strncmp(norm, "E", 1)==0 ) {
+	/* Find normF(A). */
+	ABORT("Not implemented.");
+    } else {
+	ABORT("Illegal norm specified.");
+    }
+    
+    return (value);
+
+} /* pdlangs */
diff --git a/SRC/pdlaqgs.c b/SRC/pdlaqgs.c
new file mode 100644
index 0000000..5d3af5b
--- /dev/null
+++ b/SRC/pdlaqgs.c
@@ -0,0 +1,151 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Equilibrates a general sparse M by N matrix
+ *
+ * <pre>
+ * File name:	pdlaqgs.c
+ * History:     Modified from LAPACK routine DLAQGE
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief
+
+<pre>
+    Purpose   
+    =======   
+
+    PDLAQGS equilibrates a general sparse M by N matrix A using the row
+    and column scaling factors in the vectors R and C.   
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+
+    Arguments   
+    =========   
+
+    A       (input/output) SuperMatrix*
+            On exit, the equilibrated matrix.  See EQUED for the form of 
+            the equilibrated matrix. The type of A can be:
+	    Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+	    
+    R       (input) double*, dimension (A->nrow)
+            The row scale factors for A.
+	    
+    C       (input) double*, dimension (A->ncol)
+            The column scale factors for A.
+	    
+    ROWCND  (input) double
+            Ratio of the smallest R(i) to the largest R(i).
+	    
+    COLCND  (input) double
+            Ratio of the smallest C(i) to the largest C(i).
+	    
+    AMAX    (input) double
+            Absolute value of largest matrix entry.
+	    
+    EQUED   (output) char*
+            Specifies the form of equilibration that was done.   
+            = 'N':  No equilibration   
+            = 'R':  Row equilibration, i.e., A has been premultiplied by  
+                    diag(R).   
+            = 'C':  Column equilibration, i.e., A has been postmultiplied  
+                    by diag(C).   
+            = 'B':  Both row and column equilibration, i.e., A has been
+                    replaced by diag(R) * A * diag(C).   
+
+    Internal Parameters   
+    ===================   
+
+    THRESH is a threshold value used to decide if row or column scaling   
+    should be done based on the ratio of the row or column scaling   
+    factors.  If ROWCND < THRESH, row scaling is done, and if   
+    COLCND < THRESH, column scaling is done.   
+
+    LARGE and SMALL are threshold values used to decide if row scaling   
+    should be done based on the absolute size of the largest matrix   
+    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
+
+    ===================================================================== 
+</pre>
+*/
+
+void
+pdlaqgs(SuperMatrix *A, double *r, double *c, 
+       double rowcnd, double colcnd, double amax, char *equed)
+{
+
+#define THRESH    (0.1)
+
+    NRformat_loc *Astore;        /* storage holding the locally owned rows  */
+    double *nzval;               /* nonzero values, scaled in place         */
+    int_t  row, pos, grow, m_loc;
+    double lo, hi;               /* bounds on AMAX that avoid row scaling   */
+    int    scale_rows, scale_cols;
+    char   form;                 /* value reported through *equed           */
+
+    /* Degenerate matrix: nothing to scale. */
+    if (A->nrow <= 0 || A->ncol <= 0) {
+	equed[0] = 'N';
+	return;
+    }
+
+    Astore = A->Store;
+    nzval  = Astore->nzval;
+    m_loc  = Astore->m_loc;
+
+    /* Bounds on AMAX outside of which row scaling is forced. */
+    lo = dmach_dist("Safe minimum") / dmach_dist("Precision");
+    hi = 1. / lo;
+
+    /* Rows are scaled unless ROWCND is acceptable and AMAX is in range; */
+    /* columns are scaled unless COLCND is acceptable.                   */
+    scale_rows = !(rowcnd >= THRESH && amax >= lo && amax <= hi);
+    scale_cols = !(colcnd >= THRESH);
+    if (scale_rows)
+	form = scale_cols ? 'B' : 'R';
+    else
+	form = scale_cols ? 'C' : 'N';
+
+    switch (form) {
+    case 'C':   /* A := A * diag(C) */
+	for (row = 0; row < m_loc; ++row)
+	    for (pos = Astore->rowptr[row]; pos < Astore->rowptr[row+1]; ++pos)
+		nzval[pos] *= c[Astore->colind[pos]];
+	break;
+
+    case 'R':   /* A := diag(R) * A; R is indexed starting at Astore->fst_row */
+	grow = Astore->fst_row;
+	for (row = 0; row < m_loc; ++row, ++grow)
+	    for (pos = Astore->rowptr[row]; pos < Astore->rowptr[row+1]; ++pos)
+		nzval[pos] *= r[grow];
+	break;
+
+    case 'B':   /* A := diag(R) * A * diag(C) */
+	grow = Astore->fst_row;
+	for (row = 0; row < m_loc; ++row, ++grow)
+	    for (pos = Astore->rowptr[row]; pos < Astore->rowptr[row+1]; ++pos)
+		nzval[pos] = nzval[pos] * r[grow] * c[Astore->colind[pos]];
+	break;
+
+    default:    /* 'N': leave A untouched */
+	break;
+    }
+
+    *equed = form;
+
+    return;
+
+} /* pdlaqgs */
+
diff --git a/SRC/pdsymbfact_distdata.c b/SRC/pdsymbfact_distdata.c
new file mode 100644
index 0000000..c301c81
--- /dev/null
+++ b/SRC/pdsymbfact_distdata.c
@@ -0,0 +1,1974 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Redistribute the symbolic structure of L and U from the distribution
+ *
+ * <pre>
+ * -- Parallel symbolic factorization auxiliary routine (version 2.3) --
+ * -- Distributes the data from parallel symbolic factorization 
+ * -- to numeric factorization
+ * INRIA France -  July 1, 2004
+ * Laura Grigori
+ *
+ * November 1, 2007
+ * February 20, 2008
+ * October 15, 2008
+ * </pre>
+ */
+
+/* limits.h:  the largest positive integer (INT_MAX) */
+#include <limits.h>
+
+#include "superlu_ddefs.h"
+#include "psymbfact.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ * 
+ * Redistribute the symbolic structure of L and U from the distribution
+ * used in the parallel symbolic factorization step to the distribution
+ * used in the parallel numeric factorization step.  On exit, the L and U
+ * structure for the 2D distribution used in the numeric factorization step is
+ * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal 
+ * information is also computed and it is stored in Glu_persist->supno
+ * and Glu_persist->xsup.
+ *
+ * This routine allocates memory for storing the structure of L and U
+ * and the supernodes information.  This represents the arrays:
+ * p_xlsub, p_lsub, p_xusub, p_usub,
+ * Glu_persist->supno,  Glu_persist->xsup.
+ *
+ * This routine also deallocates memory allocated during symbolic 
+ * factorization routine.  That is, the following arrays are freed:
+ * Pslu_freeable->xlsub,  Pslu_freeable->lsub, 
+ * Pslu_freeable->xusub, Pslu_freeable->usub, 
+ * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, 
+ * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (Input) int_t
+ *        Order of the input matrix
+ * Pslu_freeable  (Input) Pslu_freeable_t *
+ *        Local L and U structure, 
+ *        global to local indexing information.
+ * 
+ * Glu_persist (Output) Glu_persist_t *
+ *        Stores on output the information on supernodes mapping.
+ * 
+ * p_xlsub (Output) int_t **
+ *         Pointer to structure of L distributed on a 2D grid 
+ *         of processors, stored by columns.
+ * 
+ * p_lsub  (Output) int_t **
+ *         Structure of L distributed on a 2D grid of processors, 
+ *         stored by columns.
+ *
+ * p_xusub (Output) int_t **
+ *         Pointer to structure of U distributed on a 2D grid 
+ *         of processors, stored by rows.
+ * 
+ * p_usub  (Output) int_t **
+ *         Structure of U distributed on a 2D grid of processors, 
+ *         stored by rows.
+ * 
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes allocated on return from the dist_symbLU.
+ *   > 0, number of bytes allocated in this routine when out of memory.
+ *        (an approximation).
+ * </pre>
+ */
+
+static float
+dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, 
+	     Glu_persist_t *Glu_persist, 
+	     int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub,
+	     gridinfo_t *grid
+	     )
+{
+  int   iam, nprocs, pc, pr, p, np, p_diag;
+  int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, 
+    *tmp_ptrToSend, *mem;
+  int_t *nnzToRecv_l, *nnzToRecv_u;
+  int_t *send_1, *send_2, nsend_1, nsend_2;
+  int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind;
+  int_t nsupers, nsupers_i, nsupers_j;
+  int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc;
+  int_t maxszsn, maxNvtcsPProc;
+  int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s;
+  int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s;
+  int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n;
+  int_t *xsub_s, *sub_s, *xsub_n, *sub_n;
+  int_t *globToLoc, nvtcs_loc;
+  int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc,
+    RecvCnt_l, RecvCnt_u, ind_loc;
+  int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc;
+  int_t nelts, isize;
+  float memAux;  /* Memory used during this routine and freed on return */
+  float memRet; /* Memory allocated and not freed on return */
+  int_t iword, dword;
+  
+  /* ------------------------------------------------------------
+     INITIALIZATION.
+     ------------------------------------------------------------*/
+  iam = grid->iam;
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Enter dist_symbLU()");
+#endif
+  nprocs = (int) grid->nprow * grid->npcol;
+  xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub;
+  xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub;
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  globToLoc     = Pslu_freeable->globToLoc;
+  nvtcs_loc     = Pslu_freeable->nvtcs_loc;
+  xsup_beg_s    = Pslu_freeable->xsup_beg_loc;
+  xsup_end_s    = Pslu_freeable->xsup_end_loc;
+  supno_s       = Pslu_freeable->supno_loc;
+  rcv_luind     = NULL;
+  iword = sizeof(int_t);
+  dword = sizeof(double);
+  memAux = 0.; memRet = 0.;
+  
+  mem           = intCalloc_dist(12 * nprocs);
+  if (!mem)
+    return (ERROR_RET);
+  memAux     = (float) (12 * nprocs * sizeof(int_t));
+  nnzToRecv     = mem;
+  nnzToSend     = nnzToRecv + 2*nprocs;
+  nnzToSend_l   = nnzToSend + 2 * nprocs;
+  nnzToSend_u   = nnzToSend_l + nprocs;
+  send_1        = nnzToSend_u + nprocs;
+  send_2        = send_1 + nprocs;
+  tmp_ptrToSend = send_2 + nprocs;
+  nnzToRecv_l   = tmp_ptrToSend + nprocs;
+  nnzToRecv_u   = nnzToRecv_l + nprocs;
+  
+  ptrToSend = nnzToSend;
+  ptrToRecv = nnzToSend + nprocs;
+
+  nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int));
+  intBuf1 = nvtcs + nprocs;
+  intBuf2 = nvtcs + 2 * nprocs;
+  intBuf3 = nvtcs + 3 * nprocs;
+  intBuf4 = nvtcs + 4 * nprocs;
+  memAux += 5 * nprocs * sizeof(int);
+
+  maxszsn   = sp_ienv_dist(3);
+  
+  /* Allocate space for storing Glu_persist_n. */
+  if ( !(supno_n = intMalloc_dist(n+1)) ) {
+    fprintf (stderr, "Malloc fails for supno_n[].");
+    return (memAux);
+  }
+  memRet += (float) ((n+1) * sizeof(int_t));
+
+  /* ------------------------------------------------------------
+     DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION
+     ------------------------------------------------------------*/
+  
+  if (nvtcs_loc > INT_MAX)
+    ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n");
+  intNvtcs_loc = (int) nvtcs_loc;
+  MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT,
+	      0, grid->comm);
+
+  if (!iam) {
+    /* set ptrToRecv to point to the beginning of the data for
+       each processor */
+    for (k = 0, p = 0; p < nprocs; p++) {
+      ptrToRecv[p] = k;
+      k += nvtcs[p];
+    }
+  }
+  
+  if (nprocs > 1) {
+    temp = NULL;
+    if (!iam ) {
+      if ( !(temp = intMalloc_dist (n+1)) ) {
+	fprintf (stderr, "Malloc fails for temp[].");
+	return (memAux + memRet);
+      }
+      memAux += (float) (n+1) * iword;
+    }
+#if defined (_LONGINT)
+    for (p=0; p<nprocs; p++) {
+      if (ptrToRecv[p] > INT_MAX)
+	ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
+      intBuf1[p] = (int) ptrToRecv[p];
+    }
+#else  /* Default */
+    intBuf1 = ptrToRecv;
+#endif
+    MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, 
+		 temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm);
+  }
+  else
+    temp = supno_s;
+
+  if (!iam) {
+    nsupers = 0;
+    p = (int) OWNER( globToLoc[0] );
+    gb = temp[ptrToRecv[p]];
+    supno_n[0] = nsupers;
+    ptrToRecv[p] ++;
+    szsn = 1;
+    for (j = 1; j < n; j ++) {
+      if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) {
+	nsupers ++;
+	p  = (int) OWNER( globToLoc[j] );
+	gb = temp[ptrToRecv[p]];
+	szsn = 1;
+      }
+      else {
+	szsn ++;
+      }
+      ptrToRecv[p] ++;
+      supno_n[j] = nsupers;
+    }
+    nsupers++;
+    if (nprocs > 1) {
+      SUPERLU_FREE (temp);
+      memAux -= (float) (n+1) * iword;
+    }
+    supno_n[n] = nsupers;
+  }
+
+  /* reset to 0 nnzToSend */
+  for (p = 0; p < 2 *nprocs; p++)
+    nnzToSend[p] = 0;
+  
+  MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm);
+  nsupers = supno_n[n];
+  /* Allocate space for storing Glu_persist_n. */
+  if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) {
+    fprintf (stderr, "Malloc fails for xsup_n[].");
+    return (memAux + memRet);
+  }
+  memRet += (float) (nsupers+1) * iword;  
+
+  /* ------------------------------------------------------------
+     COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
+     THEN ALLOCATE SPACE.
+     THIS ACCOUNTS FOR THE FIRST PASS OF L and U.
+     ------------------------------------------------------------*/
+  gb = EMPTY;
+  for (i = 0; i < n; i++) {
+    if (gb != supno_n[i]) {
+      /* a new supernode starts */
+      gb = supno_n[i];
+      xsup_n[gb] = i;
+    }
+  }
+  xsup_n[nsupers] = n;
+  
+  for (p = 0; p < nprocs; p++) {
+    send_1[p] = FALSE;
+    send_2[p] = FALSE;
+  }
+  for (gb_n = 0; gb_n < nsupers; gb_n ++) {
+    i = xsup_n[gb_n];
+    if (iam == (int) OWNER( globToLoc[i] )) {
+      pc = PCOL( gb_n, grid );
+      pr = PROW( gb_n, grid );
+      p_diag = PNUM( pr, pc, grid);
+      
+      i_loc = LOCAL_IND( globToLoc[i] );
+      gb_s  = supno_s[i_loc];
+      fst_s = xsup_beg_s[gb_s];
+      lst_s = xsup_end_s[gb_s];
+      fst_s_l = LOCAL_IND( globToLoc[fst_s] );
+      for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) {
+	k = lsub_s[j];
+	if (k >= i) {
+	  gb = supno_n[k];
+	  p = (int) PNUM( PROW(gb, grid), pc, grid );
+	  nnzToSend[2*p] ++;
+	  send_1[p] = TRUE;
+	}
+      }
+      for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) {
+	k = usub_s[j];
+	if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) {
+	  gb = supno_n[k];
+	  p = PNUM( pr, PCOL(gb, grid), grid);
+	  nnzToSend[2*p+1] ++;	
+	  send_2[p] = TRUE;
+	}
+      }
+      
+      nsend_2 = 0;
+      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
+	nnzToSend[2*p+1] += 2;
+	if (send_2[p])  nsend_2 ++;	  
+      }
+      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) 
+	if (send_2[p] || p == p_diag) {
+	  if (p == p_diag && !send_2[p])
+	    nnzToSend[2*p+1] += nsend_2;
+	  else
+	    nnzToSend[2*p+1] += nsend_2-1;
+	  send_2[p] = FALSE;
+	}
+      nsend_1 = 0;
+      for (p = pc; p < nprocs; p += grid->npcol) {
+	nnzToSend[2*p] += 2;
+	if (send_1[p]) nsend_1 ++;
+      }
+      for (p = pc; p < nprocs; p += grid->npcol) 
+	if (send_1[p]) {
+	  nnzToSend[2*p] += nsend_1-1;
+	  send_1[p] = FALSE;
+	}
+	else
+	  nnzToSend[2*p] += nsend_1;
+    }
+  }
+  
+  /* All-to-all communication */
+  MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t,
+		grid->comm);
+  
+  nnz_loc_l = nnz_loc_u = 0;
+  SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0;  
+  for (p = 0; p < nprocs; p++) {
+    if ( p != iam ) {
+      SendCnt_l += nnzToSend[2*p];   nnzToSend_l[p] = nnzToSend[2*p];
+      SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; 
+      RecvCnt_l += nnzToRecv[2*p];   nnzToRecv_l[p] = nnzToRecv[2*p];
+      RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1];
+    } else {
+      nnz_loc_l += nnzToRecv[2*p];
+      nnz_loc_u += nnzToRecv[2*p+1];
+      nnzToSend_l[p] = 0; nnzToSend_u[p] = 0;
+      nnzToRecv_l[p] = nnzToRecv[2*p]; 
+      nnzToRecv_u[p] = nnzToRecv[2*p+1];
+    }
+  }
+  
+  /* Allocate space for storing the symbolic structure after redistribution. */
+  nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+  nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
+  if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) {
+    fprintf (stderr, "Malloc fails for xlsub_n[].");
+    return (memAux + memRet);
+  }
+  memRet += (float) (nsupers_j+1) * iword;
+
+  if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) {
+    fprintf (stderr, "Malloc fails for xusub_n[].");
+    return (memAux + memRet);
+  }
+  memRet += (float) (nsupers_i+1) * iword;  
+
+  /* Allocate temp storage for sending/receiving the L/U symbolic structure. */
+  if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) {
+    if (!(rcv_luind = 
+	  intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) {
+      fprintf (stderr, "Malloc fails for rcv_luind[].");
+      return (memAux + memRet);
+    }
+    memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) 
+      * iword;
+  }
+  if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
+    if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) {
+      fprintf (stderr, "Malloc fails for index[].");
+      return (memAux + memRet);
+    }
+    memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
+  } 
+  
+  /* ------------------------------------------------------------------
+     LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND.
+     THIS ACCOUNTS FOR THE SECOND PASS OF L and U.
+     ------------------------------------------------------------------*/
+  sendL = TRUE;
+  sendU = FALSE;
+  while (sendL || sendU) {
+    if (sendL) {
+      xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n;
+      nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l;
+    }
+    if (sendU) {
+      xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n;
+      nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u;
+    }
+    for (i = 0, j = 0, p = 0; p < nprocs; p++) {
+      if ( p != iam ) {
+	ptrToSend[p] = i;  i += nnzToSend[p];
+      }
+      ptrToRecv[p] = j;  j += nnzToRecv[p];
+    }
+    nnzToRecv[iam] = 0;
+    
+    ind_loc = ptrToRecv[iam];
+    for (gb_n = 0; gb_n < nsupers; gb_n++) {
+      nsend_2 = 0;    
+      i = xsup_n[gb_n];
+      if (iam == OWNER( globToLoc[i] )) {
+	pc = PCOL( gb_n, grid );
+	pr = PROW( gb_n, grid );
+	p_diag = PNUM( pr, pc, grid );
+	
+	i_loc = LOCAL_IND( globToLoc[i] );
+	gb_s  = supno_s[i_loc];
+	fst_s = xsup_beg_s[gb_s];
+	lst_s = xsup_end_s[gb_s];
+	fst_s_l = LOCAL_IND( globToLoc[fst_s] );
+
+	if (sendL) {
+	  p = pc;                np = grid->nprow;	  
+	} else {
+	  p = pr * grid->npcol;  np = grid->npcol;
+	}
+	for (j = 0; j < np; j++) {
+	  if (p == iam) {
+	    rcv_luind[ind_loc] = gb_n;
+	    rcv_luind[ind_loc+1] = 0;
+	    tmp_ptrToSend[p] = ind_loc + 1;
+	    ind_loc += 2;	 
+	  }
+	  else {
+	    snd_luind[ptrToSend[p]] = gb_n;
+	    snd_luind[ptrToSend[p]+1] = 0;
+	    tmp_ptrToSend[p] = ptrToSend[p] + 1;
+	    ptrToSend[p] += 2;	 
+	  }
+	  if (sendL) p += grid->npcol;
+	  if (sendU) p++;
+	}
+	for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) {
+	  k = sub_s[j];
+	  if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) {
+	    gb = supno_n[k];
+	    if (sendL)
+	      p = PNUM( PROW(gb, grid), pc, grid );
+	    else 
+	      p = PNUM( pr, PCOL(gb, grid), grid);
+	    if (send_1[p] == FALSE) {
+	      send_1[p] = TRUE;
+	      send_2[nsend_2] = k; nsend_2 ++;
+	    }
+	    if (p == iam) {
+	      rcv_luind[ind_loc] = k;  ind_loc++;
+	      if (sendL)
+		xsub_n[LBj( gb_n, grid )] ++;
+	      else
+		xsub_n[LBi( gb_n, grid )] ++;
+	    }
+	    else {
+	      snd_luind[ptrToSend[p]] = k;
+	      ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
+	    }
+	  }
+	}
+	if (sendL)
+	  for (p = pc; p < nprocs; p += grid->npcol) {
+	      for (k = 0; k < nsend_2; k++) {
+		gb = supno_n[send_2[k]];
+		if (PNUM(PROW(gb, grid), pc, grid) != p) {
+		  if (p == iam) {
+		    rcv_luind[ind_loc] = send_2[k];  ind_loc++;
+		    xsub_n[LBj( gb_n, grid )] ++;
+		  }
+		  else {
+		    snd_luind[ptrToSend[p]] = send_2[k];
+		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
+		  }
+		}
+	      }
+	      send_1[p] = FALSE;
+	  }  
+	if (sendU)
+	  for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
+	    if (send_1[p] || p == p_diag) {	      
+	      for (k = 0; k < nsend_2; k++) {
+		gb = supno_n[send_2[k]];
+		if(PNUM( pr, PCOL(gb, grid), grid) != p) {
+		  if (p == iam) {
+		    rcv_luind[ind_loc] = send_2[k];  ind_loc++;
+		    xsub_n[LBi( gb_n, grid )] ++;
+		  }
+		  else {
+		    snd_luind[ptrToSend[p]] = send_2[k];
+		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
+		  }	     
+		}
+	      } 
+	      send_1[p] = FALSE;
+	    }
+	  }
+      }
+    }
+    
+    /* reset ptrToSnd to point to the beginning of the data for
+       each processor (structure needed in MPI_Alltoallv) */
+    for (i = 0, p = 0; p < nprocs; p++) {
+      ptrToSend[p] = i;  i += nnzToSend[p];
+    }
+
+    /* ------------------------------------------------------------
+       PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
+       Note: it uses MPI_Alltoallv.
+       ------------------------------------------------------------*/
+    if (nprocs > 1) {
+#if defined (_LONGINT)
+      nnzToSend[iam] = 0;
+      for (p=0; p<nprocs; p++) {
+	if (nnzToSend[p] > INT_MAX || ptrToSend[p] > INT_MAX ||
+	    nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX)
+	  ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
+	intBuf1[p] = (int) nnzToSend[p];
+	intBuf2[p] = (int) ptrToSend[p];
+	intBuf3[p] = (int) nnzToRecv[p];
+	intBuf4[p] = (int) ptrToRecv[p];
+      }
+#else  /* Default */
+      intBuf1 = nnzToSend;  intBuf2 = ptrToSend;
+      intBuf3 = nnzToRecv;  intBuf4 = ptrToRecv;
+#endif
+
+      MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, 
+		     rcv_luind, intBuf3, intBuf4, mpi_int_t,
+		     grid->comm);
+    }
+    if (sendL)
+      nnzToRecv[iam] = nnz_loc_l;
+    else 
+      nnzToRecv[iam] = nnz_loc_u;
+    
+    /* ------------------------------------------------------------
+       DEALLOCATE TEMPORARY STORAGE.
+       -------------------------------------------------------------*/
+    if (sendU) 
+      if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
+	SUPERLU_FREE (snd_luind);
+	memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
+      }
+    
+    /* ------------------------------------------------------------
+       CONVERT THE FORMAT.
+       ------------------------------------------------------------*/
+    /* Initialize the array of column of L/ row of U pointers */
+    k = 0;
+    for (p = 0; p < nprocs; p ++) {
+      if (p != iam) {
+	i = k;
+	while (i < k + nnzToRecv[p]) {
+	  gb = rcv_luind[i];
+	  nelts = rcv_luind[i+1];
+	  if (sendL)
+	    xsub_n[LBj( gb, grid )] = nelts;
+	  else
+	    xsub_n[LBi( gb, grid )] = nelts;
+	  i += nelts + 2;
+	}
+      }
+      k += nnzToRecv[p];
+    }
+
+    if (sendL) j = nsupers_j;
+    else j = nsupers_i;
+    k = 0; 
+    isize = xsub_n[0];
+    xsub_n[0] = 0; 
+    for (gb_l = 1; gb_l < j; gb_l++) {
+      k += isize;
+      isize = xsub_n[gb_l];
+      xsub_n[gb_l] = k;
+    }
+    xsub_n[gb_l] = k + isize;
+    nnz_loc = xsub_n[gb_l];
+    if (sendL) {
+      lsub_n = NULL;
+      if (nnz_loc) {
+	if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) {
+	  fprintf (stderr, "Malloc fails for lsub_n[].");
+	  return (memAux + memRet);
+	}
+	memRet += (float) (nnz_loc * iword);
+      }
+      sub_n = lsub_n;
+    }
+    if (sendU) {
+      usub_n = NULL;
+      if (nnz_loc) {
+	if ( !(usub_n = intMalloc_dist(nnz_loc)) ) {
+	  fprintf (stderr, "Malloc fails for usub_n[].");
+	  return (memAux + memRet);
+	}
+	memRet += (float) (nnz_loc * iword);
+      }
+      sub_n = usub_n;
+    }
+    
+    /* Copy the data into the L column / U row oriented storage */
+    k = 0;
+    for (p = 0; p < nprocs; p++) {
+      i = k;
+      while (i < k + nnzToRecv[p]) {
+	gb = rcv_luind[i];
+	if (gb >= nsupers)
+	  printf ("Pe[%d] p %d gb " IFMT " nsupers " IFMT " i " IFMT " i-k " IFMT "\n",
+		  iam, p, gb, nsupers, i, i-k);
+	i += 2;
+	if (sendL) gb_l = LBj( gb, grid );
+	if (sendU) gb_l = LBi( gb, grid );
+	for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) {
+	  sub_n[j] = rcv_luind[i];
+	}
+      }      
+      k += nnzToRecv[p];
+    }
+    if (sendL) {
+      sendL = FALSE;  sendU = TRUE;
+    }
+    else
+      sendU = FALSE;
+  }
+
+  /* deallocate memory allocated during symbolic factorization routine */
+  if (rcv_luind != NULL) {
+    SUPERLU_FREE (rcv_luind);
+    memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword;
+  }
+  SUPERLU_FREE (mem);  
+  memAux -= (float) (12 * nprocs * iword);
+  SUPERLU_FREE(nvtcs);
+  memAux -= (float) (5 * nprocs * sizeof(int));
+  
+  if (xlsub_s != NULL) {
+    SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s);
+  }
+  if (xusub_s != NULL) {
+    SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s);
+  }
+  SUPERLU_FREE (globToLoc); 
+  if (supno_s != NULL) {
+    SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s);
+    SUPERLU_FREE (supno_s);
+  }
+  
+  Glu_persist->supno = supno_n;  Glu_persist->xsup  = xsup_n;
+  *p_xlsub = xlsub_n; *p_lsub = lsub_n;
+  *p_xusub = xusub_n; *p_usub = usub_n;
+
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Exit dist_symbLU()");
+#endif
+  
+  return (-memRet);
+}
+ 
+/*! \brief Re-distribute the matrix A on the 2D process mesh.
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Re-distribute A on the 2D process mesh.  The lower part is
+ *   stored using a column format and the upper part
+ *   is stored using a row format.
+ * 
+ * Arguments
+ * =========
+ * 
+ * A      (Input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (Input) ScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_persist  (Input) Glu_persist_t *
+ *        Information on supernodes mapping.
+ * 
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * p_ainf_colptr (Output) int_t**
+ *         Column pointers of the lower part of A distributed on a 2D grid 
+ *         of processors, stored by columns.
+ *
+ * p_ainf_rowind (Output) int_t**
+ *         Row indices (structure) of the lower part of A distributed on a 
+ *         2D grid of processors, stored by columns.
+ *
+ * p_ainf_val    (Output) double**
+ *         Numerical values of the lower part of A, distributed on a 
+ *         2D grid of processors, stored by columns.
+ *
+ * p_asup_rowptr (Output) int_t**
+ *         Row pointers of the upper part of A distributed on a 2D grid 
+ *         of processors, stored by rows.
+ *
+ * p_asup_colind (Output) int_t**
+ *         Column indices (structure) of the upper part of A distributed on a 
+ *         2D grid of processors, stored by rows.
+ *
+ * p_asup_val    (Output) double**
+ *         Numerical values of the upper part of A, distributed on a 
+ *         2D grid of processors, stored by rows.
+ *
+ * ilsum_i  (Input) int_t *
+ *       Starting position of each supernode in 
+ *       the full array (local, block row wise).
+ *
+ * ilsum_j  (Input) int_t *
+ *       Starting position of each supernode in 
+ *       the full array (local, block column wise).
+ *
+ * Return value
+ * ============
+ *   < 0, negated number of bytes allocated and kept on return from ddist_A
+ *   > 0, number of bytes allocated when out of memory
+ *        (an approximation); ERROR_RET if the very first allocation fails.
+ * </pre>
+ */
+ 
+static float
+ddist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
+	Glu_persist_t *Glu_persist, gridinfo_t *grid, 
+	int_t **p_ainf_colptr, int_t **p_ainf_rowind, double **p_ainf_val,
+	int_t **p_asup_rowptr, int_t **p_asup_colind, double **p_asup_val,
+	int_t *ilsum_i, int_t *ilsum_j
+	)
+{
+  int    iam, p, procs;
+  NRformat_loc *Astore;
+  int_t  *perm_r; /* row permutation vector */
+  int_t  *perm_c; /* column permutation vector */
+  int_t  i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize;
+  int_t  nsupers, nsupers_i, nsupers_j;
+  int_t  nnz_loc, nnz_loc_ainf, nnz_loc_asup;    /* number of local nonzeros */
+  int_t  SendCnt; /* number of remote nonzeros to be sent */
+  int_t  RecvCnt; /* number of remote nonzeros to be received */
+  int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
+  double *asup_val, *ainf_val;
+  int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv; /* per-process counts; largest incoming count */
+  int_t  *ia, *ja, **ia_send, *index, *itemp; /* ia/ja: row/column indices of the triplets */
+  int_t  *ptr_to_send; /* per-process fill position in the send buffers */
+  double *aij, **aij_send, *nzval, *dtemp; /* aij: values of the triplets */
+  double *nzval_a; /* values of the local part of A */
+  MPI_Request *send_req;
+  MPI_Status  status;
+  int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
+  int_t *supno = Glu_persist->supno;   
+  float memAux;  /* Memory used during this routine and freed on return */
+  float memRet; /* Memory allocated and not freed on return */
+  int_t iword, dword, szbuf; /* sizeof(int_t), sizeof(double), length of the triplet buffers */
+
+  /* ------------------------------------------------------------
+     INITIALIZATION.
+     ------------------------------------------------------------*/
+  iam = grid->iam;
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Enter ddist_A()");
+#endif
+  iword = sizeof(int_t);
+  dword = sizeof(double);
+  
+  perm_r = ScalePermstruct->perm_r;
+  perm_c = ScalePermstruct->perm_c;
+  procs = grid->nprow * grid->npcol;
+  Astore = (NRformat_loc *) A->Store;
+  n = A->ncol;
+  m_loc = Astore->m_loc;
+  fst_row = Astore->fst_row;
+  if (!(nnzToRecv = intCalloc_dist(2*procs))) {
+    fprintf (stderr, "Malloc fails for nnzToRecv[].");
+    return (ERROR_RET);
+  }
+  memAux = (float) (2 * procs * iword);
+  memRet = 0.;
+  nnzToSend = nnzToRecv + procs; /* second half of the allocation made for nnzToRecv */
+  nsupers  = supno[n-1] + 1;  /* supernode number of the last column, plus one */
+
+  /* ------------------------------------------------------------
+     COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
+     THEN ALLOCATE SPACE.
+     THIS ACCOUNTS FOR THE FIRST PASS OF A.
+     ------------------------------------------------------------*/
+  for (i = 0; i < m_loc; ++i) {
+    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+      irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
+      jcol = Astore->colind[j];
+      gbi = BlockNum( irow );
+      gbj = BlockNum( jcol );
+      p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); /* process this entry is counted for */
+      ++nnzToSend[p]; 
+    }
+  }
+  
+  /* All-to-all communication: exchange the per-process counts */
+  MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
+		grid->comm);
+  
+  maxnnzToRecv = 0;
+  nnz_loc = SendCnt = RecvCnt = 0;
+  
+  for (p = 0; p < procs; ++p) {
+    if ( p != iam ) {
+      SendCnt += nnzToSend[p];
+      RecvCnt += nnzToRecv[p];
+      maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
+    } else {
+      nnz_loc += nnzToRecv[p];
+      /*assert(nnzToSend[p] == nnzToRecv[p]);*/
+    }
+  }
+  k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
+  szbuf = k; /* saved for the memAux update at the end; k is reused below */
+
+  /* Allocate space for storing the triplets after redistribution. */
+  if ( !(ia = intMalloc_dist(2*k)) ) {
+    fprintf (stderr, "Malloc fails for ia[].");
+    return (memAux);
+  }
+  memAux += (float) (2*k*iword);
+  ja = ia + k; /* ja is the second half of the allocation made for ia */
+  if ( !(aij = doubleMalloc_dist(k)) ) {
+    fprintf (stderr, "Malloc fails for aij[].");
+    return (memAux);
+  }
+  memAux += (float) (k*dword);
+  
+  /* Allocate temporary storage for sending/receiving the A triplets. */
+  if ( procs > 1 ) {
+    if ( !(send_req = (MPI_Request *)
+	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) {
+      fprintf (stderr, "Malloc fails for send_req[].");
+      return (memAux);
+    }
+    memAux += (float) (2*procs *sizeof(MPI_Request));
+    if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) {
+      fprintf(stderr, "Malloc fails for ia_send[].");
+      return (memAux);
+    }
+    memAux += (float) (procs*sizeof(int_t*));
+    if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) {
+      fprintf(stderr, "Malloc fails for aij_send[].");
+      return (memAux);
+    }
+    memAux += (float) (procs*sizeof(double*));    
+    if ( !(index = intMalloc_dist(2*SendCnt)) ) {
+      fprintf(stderr, "Malloc fails for index[].");
+      return (memAux);
+    }
+    memAux += (float) (2*SendCnt*iword);
+    if ( !(nzval = doubleMalloc_dist(SendCnt)) ) {
+      fprintf(stderr, "Malloc fails for nzval[].");
+      return (memAux);
+    }
+    memAux += (float) (SendCnt * dword);
+    if ( !(ptr_to_send = intCalloc_dist(procs)) ) {
+      fprintf(stderr, "Malloc fails for ptr_to_send[].");
+      return (memAux);
+    }
+    memAux += (float) (procs * iword);
+    if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) {
+      fprintf(stderr, "Malloc fails for itemp[].");
+      return (memAux);
+    }
+    memAux += (float) (2*maxnnzToRecv*iword);
+    if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) {
+      fprintf(stderr, "Malloc fails for dtemp[].");
+      return (memAux);
+    }
+    memAux += (float) (maxnnzToRecv * dword);
+    
+    for (i = 0, j = 0, p = 0; p < procs; ++p) { /* carve index[] and nzval[] into per-process slices */
+      if ( p != iam ) {
+	ia_send[p] = &index[i];
+	i += 2 * nnzToSend[p]; /* nnzToSend[p] row indices followed by as many column indices */
+	aij_send[p] = &nzval[j];
+	j += nnzToSend[p];
+      }
+    }
+  } /* if procs > 1 */
+  
+  nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+  nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
+  if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) {
+    fprintf (stderr, "Malloc fails for *ainf_colptr[].");
+    return (memAux);
+  }
+  memRet += (float) (ilsum_j[nsupers_j] + 1) * iword;
+  if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) {
+    fprintf (stderr, "Malloc fails for *asup_rowptr[].");
+    return (memAux+memRet);
+  }
+  memRet += (float) (ilsum_i[nsupers_i] + 1) * iword;
+  
+  /* ------------------------------------------------------------
+     LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
+     THIS ACCOUNTS FOR THE SECOND PASS OF A.
+     ------------------------------------------------------------*/
+  nnz_loc = 0; /* Reset the local nonzero count. */
+  nnz_loc_ainf = nnz_loc_asup = 0;
+  nzval_a = Astore->nzval;
+  for (i = 0; i < m_loc; ++i) {
+    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+      irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
+      jcol = Astore->colind[j];
+      gbi = BlockNum( irow );
+      gbj = BlockNum( jcol );
+      p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
+      
+      if ( p != iam ) { /* remote */
+	k = ptr_to_send[p];
+	ia_send[p][k] = irow;
+	ia_send[p][k + nnzToSend[p]] = jcol; /* column indices follow the row indices */
+	aij_send[p][k] = nzval_a[j];
+	++ptr_to_send[p]; 
+      } else {          /* local */
+	ia[nnz_loc] = irow;
+	ja[nnz_loc] = jcol;
+	aij[nnz_loc] = nzval_a[j];
+	++nnz_loc;
+	/* Count nonzeros in each column of L / row of U */
+	if (gbi >= gbj) { /* block row >= block column: counted in the lower part */
+	  ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
+	  nnz_loc_ainf ++;
+	}
+	else { /* counted in the upper part */
+	  asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
+	  nnz_loc_asup ++;
+	}
+      }
+    }
+  }
+
+  /* ------------------------------------------------------------
+     PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
+     NOTE: Can possibly use MPI_Alltoallv.
+     ------------------------------------------------------------*/
+  for (p = 0; p < procs; ++p) {
+    if ( p != iam ) {
+      it = 2*nnzToSend[p];
+      MPI_Isend( ia_send[p], it, mpi_int_t,
+		 p, iam, grid->comm, &send_req[p] ); /* index message, tagged with my rank */
+      it = nnzToSend[p];
+      MPI_Isend( aij_send[p], it, MPI_DOUBLE,
+		 p, iam+procs, grid->comm, &send_req[procs+p] ); /* value message, tag offset by procs */
+    }
+  }
+  
+  for (p = 0; p < procs; ++p) {
+    if ( p != iam ) {
+      it = 2*nnzToRecv[p];
+      MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); /* tag is the sender's rank */
+      it = nnzToRecv[p];
+      MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs,
+		grid->comm, &status );
+      for (i = 0; i < nnzToRecv[p]; ++i) { /* append the received triplets after the ones already stored */
+	ia[nnz_loc] = itemp[i];
+	irow = itemp[i];
+	jcol = itemp[i + nnzToRecv[p]]; /* column indices follow the row indices */
+	/* assert(jcol<n); */
+	ja[nnz_loc] = jcol;
+	aij[nnz_loc] = dtemp[i];
+	++nnz_loc;
+	
+	gbi = BlockNum( irow );
+	gbj = BlockNum( jcol );
+	/* Count nonzeros in each column of L / row of U */
+	if (gbi >= gbj) {
+	  ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
+	  nnz_loc_ainf ++;
+	}
+	else {
+	  asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
+	  nnz_loc_asup ++;
+	}
+      }
+    }
+  }
+  
+  for (p = 0; p < procs; ++p) { /* complete all sends before the send buffers are freed */
+    if ( p != iam ) {
+      MPI_Wait( &send_req[p], &status);
+      MPI_Wait( &send_req[procs+p], &status);
+    }
+  }
+  
+  /* ------------------------------------------------------------
+     DEALLOCATE TEMPORARY STORAGE
+     ------------------------------------------------------------*/
+  
+  SUPERLU_FREE(nnzToRecv); /* nnzToSend points into this allocation as well */
+  memAux -= 2 * procs * iword;
+  if ( procs > 1 ) {
+    SUPERLU_FREE(send_req);
+    SUPERLU_FREE(ia_send);
+    SUPERLU_FREE(aij_send);
+    SUPERLU_FREE(index);
+    SUPERLU_FREE(nzval);
+    SUPERLU_FREE(ptr_to_send);
+    SUPERLU_FREE(itemp);
+    SUPERLU_FREE(dtemp);
+    memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) +
+      procs*sizeof(double*) + 2*SendCnt * iword +
+      SendCnt* dword + procs*iword +
+      2*maxnnzToRecv*iword + maxnnzToRecv*dword;
+  }
+  
+  /* ------------------------------------------------------------
+     CONVERT THE TRIPLET FORMAT.
+     ------------------------------------------------------------*/
+  if (nnz_loc_ainf != 0) {
+    if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) {
+      fprintf (stderr, "Malloc fails for *ainf_rowind[].");
+      return (memAux+memRet);
+    }
+    memRet += (float) (nnz_loc_ainf * iword);
+    if ( !(ainf_val = doubleMalloc_dist(nnz_loc_ainf)) ) {
+      fprintf (stderr, "Malloc fails for *ainf_val[].");
+      return (memAux+memRet);
+    }
+    memRet += (float) (nnz_loc_ainf * dword);
+  }
+  else {
+    ainf_rowind = NULL;
+    ainf_val = NULL;
+  }
+  if (nnz_loc_asup != 0) {
+    if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) {
+      fprintf (stderr, "Malloc fails for *asup_colind[].");
+      return (memAux + memRet);
+    }
+    memRet += (float) (nnz_loc_asup * iword);
+    if ( !(asup_val = doubleMalloc_dist(nnz_loc_asup)) ) {
+      fprintf (stderr, "Malloc fails for *asup_val[].");
+      return (memAux  + memRet);
+    }
+    memRet += (float) (nnz_loc_asup * dword);
+  }
+  else {
+    asup_colind = NULL;
+    asup_val = NULL;
+  }
+
+  /* Turn the per-column / per-row counts into starting offsets (running sums) */
+  k = 0; 
+  jsize = ainf_colptr[0];  ainf_colptr[0] = 0; 
+  for (j = 1; j < ilsum_j[nsupers_j]; j++) {
+    k += jsize;              
+    jsize = ainf_colptr[j];  
+    ainf_colptr[j] = k;
+  }
+  ainf_colptr[ilsum_j[nsupers_j]] = k + jsize;
+  i = 0;
+  isize = asup_rowptr[0];  asup_rowptr[0] = 0;
+  for (j = 1; j < ilsum_i[nsupers_i]; j++) {
+    i += isize;
+    isize = asup_rowptr[j];  
+    asup_rowptr[j] = i;
+  }
+  asup_rowptr[ilsum_i[nsupers_i]] = i + isize;
+
+  /* Copy the triplets into the column oriented storage */
+  for (i = 0; i < nnz_loc; ++i) {
+    jcol = ja[i];
+    irow = ia[i];
+    gbi = BlockNum( irow );
+    gbj = BlockNum( jcol );
+    /* Place the entry in its column of the lower part / row of the upper part */
+    if (gbi >= gbj) {
+      j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj );
+      k = ainf_colptr[j];
+      ainf_rowind[k] = irow;
+      ainf_val[k] = aij[i];
+      ainf_colptr[j] ++; /* advance the insertion position of this column */
+    }
+    else {
+      j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi );
+      k = asup_rowptr[j];
+      asup_colind[k] = jcol;
+      asup_val[k] = aij[i];
+      asup_rowptr[j] ++; /* advance the insertion position of this row */
+    }
+  }
+
+  /* Reset the column/row pointers to the beginning of each column/row (shift back by one) */
+  for (j = ilsum_j[nsupers_j]; j > 0; j--) 
+    ainf_colptr[j] = ainf_colptr[j-1];
+  for (j = ilsum_i[nsupers_i]; j > 0; j--) 
+    asup_rowptr[j] = asup_rowptr[j-1];
+  ainf_colptr[0] = 0;
+  asup_rowptr[0] = 0;
+  
+  SUPERLU_FREE(ia); /* ja points into this allocation, so it is released here too */
+  SUPERLU_FREE(aij);
+  memAux -= 2*szbuf*iword + szbuf*dword;
+  
+  *p_ainf_colptr = ainf_colptr;
+  *p_ainf_rowind = ainf_rowind; 
+  *p_ainf_val    = ainf_val;
+  *p_asup_rowptr = asup_rowptr;
+  *p_asup_colind = asup_colind;
+  *p_asup_val    = asup_val;
+
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Exit ddist_A()");
+  fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6);
+#endif
+
+  return (-memRet);
+} /* ddist_A */
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Distribute the input matrix onto the 2D process mesh.
+ * 
+ * Arguments
+ * =========
+ * 
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *          This routine should not be called for this case, an error
+ *          is generated.  Instead, pddistribute routine should be called.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (Input) int_t
+ *        Dimension of the matrix.
+ *
+ * A      (Input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
+ *        The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE.
+ *
+ * ScalePermstruct (Input) ScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Pslu_freeable (Input) Pslu_freeable_t*
+ *        The global structure describing the graph of L and U.
+ * 
+ * LUstruct (Input) LUstruct_t*
+ *        Data structures for L and U factors.
+ *
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes allocated on return from the dist_symbLU
+ *   > 0, number of bytes allocated for performing the distribution
+ *       of the data, when out of memory.
+ *        (an approximation).
+ * </pre>
+ */
+
+float
+ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
+		ScalePermstruct_t *ScalePermstruct,
+		Pslu_freeable_t *Pslu_freeable, 
+		LUstruct_t *LUstruct, gridinfo_t *grid)
+{
+  Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+  Glu_freeable_t Glu_freeable_n;
+  LocalLU_t *Llu = LUstruct->Llu;
+  int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, 
+    len, len1, nsupc, nsupc_gb, ii, nprocs;
+  int_t ljb;  /* local block column number */
+  int_t nrbl; /* number of L blocks in current block column */
+  int_t nrbu; /* number of U blocks in current block column */
+  int_t gb;   /* global block number; 0 < gb <= nsuper */
+  int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
+  int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p;
+  int_t mybufmax[NBUFFERS];
+  NRformat_loc *Astore;
+  double *a;
+  int_t *asub, *xa;
+  int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
+  double *asup_val, *ainf_val;
+  int_t *xsup, *supno;    /* supernode and column mapping */
+  int_t *lsub, *xlsub, *usub, *xusub;
+  int_t nsupers, nsupers_i, nsupers_j, nsupers_ij;
+  int_t next_ind;      /* next available position in index[*] */
+  int_t next_val;      /* next available position in nzval[*] */
+  int_t *index;        /* indices consist of headers and row subscripts */
+  int   *index1;       /* temporary pointer to array of int */
+  double *lusup, *uval; /* nonzero values in L and U */
+  int_t *recvBuf;
+  int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
+  double **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
+  int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+  double **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
+  int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
+  
+  /*-- Counts to be used in factorization. --*/
+  int  *ToRecv, *ToSendD, **ToSendR;
+  
+  /*-- Counts to be used in lower triangular solve. --*/
+  int_t  *fmod;          /* Modification count for L-solve.        */
+  int_t  **fsendx_plist; /* Column process list to send down Xk.   */
+  int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
+  int_t  nfsendx = 0;    /* Number of Xk I will send               */
+  int_t  kseen;
+  
+  /*-- Counts to be used in upper triangular solve. --*/
+  int_t  *bmod;          /* Modification count for U-solve.        */
+  int_t  **bsendx_plist; /* Column process list to send down Xk.   */
+  int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
+  int_t  nbsendx = 0;    /* Number of Xk I will send               */  
+  int_t  *ilsum;         /* starting position of each supernode in 
+			    the full array (local)                 */  
+  int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in 
+				the full array (local, block column wise) */  
+  /*-- Auxiliary arrays; freed on return --*/
+  int_t *Urb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
+  int_t *LUb_length; /* L,U block length; size nsupers_ij */
+  int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */
+  int_t *LUb_number; /* global block number; size nsupers_ij */
+  int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc)      */
+  int_t *Lrb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
+  double *dense, *dense_col; /* SPA */
+  double zero = 0.0;
+  int_t ldaspa;     /* LDA of SPA */
+  int_t iword, dword;
+  float memStrLU, memA,
+        memDist = 0.; /* memory used for redistributing the data, which does
+		         not include the memory for the numerical values
+                         of L and U (positive number)*/
+  float  memNLU = 0.; /* memory allocated for storing the numerical values of 
+		         L and U, that will be used in the numeric
+                         factorization (positive number) */
+
+#if ( PRNTlevel>=1 )
+  int_t nLblocks = 0, nUblocks = 0;
+#endif
+  
+  /* Initialization. */
+  iam = grid->iam;
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Enter dist_psymbtonum()");
+#endif
+  myrow = MYROW( iam, grid );
+  mycol = MYCOL( iam, grid );
+  nprocs = grid->npcol * grid->nprow;
+  for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
+  Astore   = (NRformat_loc *) A->Store;
+  
+  iword = sizeof(int_t);
+  dword = sizeof(double);
+
+  if (fact == SamePattern_SameRowPerm) {
+    ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm.");  
+  }
+
+  if ((memStrLU = 
+       dist_symbLU (n, Pslu_freeable, 
+		    Glu_persist, &xlsub, &lsub, &xusub, &usub,	grid)) > 0)
+    return (memStrLU);
+  memDist += (-memStrLU);
+  xsup  = Glu_persist->xsup;    /* supernode and column mapping */
+  supno = Glu_persist->supno;   
+  nsupers  = supno[n-1] + 1;
+  nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */
+  nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */
+  nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j);
+  if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) {
+    fprintf (stderr, "Malloc fails for ilsum[].");  
+    return (memDist + memNLU);
+  }
+  memNLU += (nsupers_i+1) * iword;
+  if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) {
+    fprintf (stderr, "Malloc fails for ilsum_j[].");
+    return (memDist + memNLU);
+  }
+  memDist += (nsupers_j+1) * iword;
+
+  /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */
+  ilsum[0] = 0;
+  ldaspa = 0;
+  for (gb = 0; gb < nsupers; gb++) 
+    if ( myrow == PROW( gb, grid ) ) {
+      i = SuperSize( gb );
+      ldaspa += i;
+      lb = LBi( gb, grid );
+      ilsum[lb + 1] = ilsum[lb] + i;
+    }
+  ilsum[nsupers_i] = ldaspa;
+
+  ldaspa_j = 0; ilsum_j[0] = 0;  
+  for (gb = 0; gb < nsupers; gb++) 
+    if (mycol == PCOL( gb, grid )) {
+      i = SuperSize( gb );
+      ldaspa_j += i;
+      lb = LBj( gb, grid );
+      ilsum_j[lb + 1] = ilsum_j[lb] + i;
+    }
+  ilsum_j[nsupers_j] = ldaspa_j;
+  
+  if ((memA = ddist_A(A, ScalePermstruct, Glu_persist,
+		      grid, &ainf_colptr, &ainf_rowind, &ainf_val,
+		      &asup_rowptr, &asup_colind, &asup_val,
+		      ilsum, ilsum_j)) > 0)
+    return (memDist + memA + memNLU);
+  memDist += (-memA);
+
+  /* ------------------------------------------------------------
+     FIRST TIME CREATING THE L AND U DATA STRUCTURES.
+     ------------------------------------------------------------*/
+  
+  /* We first need to set up the L and U data structures and then
+   * propagate the values of A into them.
+   */
+  if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) ) {
+    fprintf(stderr, "Calloc fails for ToRecv[].");
+    return (memDist + memNLU);
+  }
+  for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+  memNLU += nsupers * iword;
+  
+  k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
+  if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) {
+    fprintf(stderr, "Malloc fails for ToSendR[].");
+    return (memDist + memNLU);
+  }
+  memNLU += k*sizeof(int_t*);
+  j = k * grid->npcol;
+  if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) {
+    fprintf(stderr, "Malloc fails for index[].");
+    return (memDist + memNLU);
+  }
+  memNLU += j*iword;
+  
+  for (i = 0; i < j; ++i) index1[i] = EMPTY;
+  for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j];
+  
+  /* Auxiliary arrays used to set up L and U block data structures.
+     They are freed on return. */
+  if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) {
+    fprintf(stderr, "Calloc fails for LUb_length[].");
+    return (memDist + memNLU);
+  }
+  if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) {
+    fprintf(stderr, "Malloc fails for LUb_indptr[].");
+    return (memDist + memNLU);
+  }
+  if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) {
+    fprintf(stderr, "Calloc fails for LUb_number[].");
+    return (memDist + memNLU);
+  }    
+  if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) {
+    fprintf(stderr, "Calloc fails for LUb_valptr[].");
+    return (memDist + memNLU);
+  }
+  memDist += 4 * nsupers_ij * iword;
+  
+  k = CEILING( nsupers, grid->nprow ); 
+  /* Pointers to the beginning of each block row of U. */
+  if ( !(Unzval_br_ptr = 
+	 (double**)SUPERLU_MALLOC(nsupers_i * sizeof(double*))) ) {
+    fprintf(stderr, "Malloc fails for Unzval_br_ptr[].");
+    return (memDist + memNLU);
+  }
+  if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) {
+    fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[].");
+    return (memDist + memNLU);
+  }
+  memNLU += nsupers_i*sizeof(double*) + nsupers_i*sizeof(int_t*);
+  Unzval_br_ptr[nsupers_i-1] = NULL;
+  Ufstnz_br_ptr[nsupers_i-1] = NULL;
+
+  if ( !(ToSendD = SUPERLU_MALLOC(nsupers_i * sizeof(int))) ) {
+    fprintf(stderr, "Malloc fails for ToSendD[].");
+    return (memDist + memNLU);
+  }
+  for (i = 0; i < nsupers_i; ++i) ToSendD[i] = NO;
+
+  memNLU += nsupers_i*iword;  
+  if ( !(Urb_marker = intCalloc_dist(nsupers_j))) {
+    fprintf(stderr, "Calloc fails for rb_marker[].");
+    return (memDist + memNLU);
+  }
+  if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) {
+    fprintf(stderr, "Calloc fails for rb_marker[].");
+    return (memDist + memNLU);
+  }
+  memDist += (nsupers_i + nsupers_j)*iword;
+  
+  /* Auxiliary arrays used to set up L, U block data structures.
+     They are freed on return.
+     k is the number of local row blocks.   */
+  if ( !(dense = doubleCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) 
+				   * sp_ienv_dist(3))) ) {
+    fprintf(stderr, "Calloc fails for SPA dense[].");
+    return (memDist + memNLU);
+  }
+  /* These counts will be used for triangular solves. */
+  if ( !(fmod = intCalloc_dist(nsupers_i)) ) {
+    fprintf(stderr, "Calloc fails for fmod[].");
+    return (memDist + memNLU);
+  }
+  if ( !(bmod = intCalloc_dist(nsupers_i)) ) {
+    fprintf(stderr, "Calloc fails for bmod[].");
+    return (memDist + memNLU);
+  }
+  /* ------------------------------------------------ */
+  memNLU += 2*nsupers_i*iword + 
+    SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; 
+  
+  /* Pointers to the beginning of each block column of L. */
+  if ( !(Lnzval_bc_ptr = 
+	 (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) {
+    fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[].");
+    return (memDist + memNLU);
+  }
+  if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) {
+    fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[].");
+    return (memDist + memNLU);
+  }
+  memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*);
+  Lnzval_bc_ptr[nsupers_j-1] = NULL;
+  Lrowind_bc_ptr[nsupers_j-1] = NULL;
+  
+  /* These lists of processes will be used for triangular solves. */
+  if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
+    fprintf(stderr, "Malloc fails for fsendx_plist[].");
+    return (memDist + memNLU);
+  }
+  len = nsupers_j * grid->nprow;
+  if ( !(index = intMalloc_dist(len)) ) {
+    fprintf(stderr, "Malloc fails for fsendx_plist[0]");
+    return (memDist + memNLU);
+  }
+  for (i = 0; i < len; ++i) index[i] = EMPTY;
+  for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
+    fsendx_plist[i] = &index[j];
+  if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
+    fprintf(stderr, "Malloc fails for bsendx_plist[].");
+    return (memDist + memNLU);
+  }
+  if ( !(index = intMalloc_dist(len)) ) {
+    fprintf(stderr, "Malloc fails for bsendx_plist[0]");
+    return (memDist + memNLU);
+  }
+  for (i = 0; i < len; ++i) index[i] = EMPTY;
+  for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
+    bsendx_plist[i] = &index[j];
+  /* -------------------------------------------------------------- */
+  memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
+  
+  /*------------------------------------------------------------
+    PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
+    THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
+    ------------------------------------------------------------*/
+  for (jb = 0; jb < nsupers; jb++) {
+    jbcol = PCOL( jb, grid );
+    jbrow = PROW( jb, grid );
+    ljb_j = LBj( jb, grid ); /* Local block number column wise */
+    ljb_i = LBi( jb, grid);  /* Local block number row wise */
+    fsupc = FstBlockC( jb );
+    nsupc = SuperSize( jb );
+    
+    if ( myrow == jbrow ) { /* Block row jb in my process row */
+      /* Scatter A into SPA. */
+      for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) {
+	for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) {
+	  if (i >= asup_rowptr[ilsum[nsupers_i]]) 
+	    printf ("ERR7\n");
+	  jcol = asup_colind[i];
+	  if (jcol >= n)
+	    printf ("Pe[%d] ERR distsn jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n",
+		    iam, jb, gb, j, jcol);
+	  gb = BlockNum( jcol );
+	  lb = LBj( gb, grid );
+	  if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n");
+	  jcol = ilsum_j[lb] + jcol - FstBlockC( gb );
+	  if (jcol >= ldaspa_j)
+	    printf ("Pe[%d] ERR1 jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n",
+		    iam, jb, gb, j, jcol);
+	  dense_col[jcol] = asup_val[i];
+	}
+	dense_col += ldaspa_j;
+      }
+      
+      /*------------------------------------------------
+       * SET UP U BLOCKS.
+       *------------------------------------------------*/
+      /* Count number of blocks and length of each block. */
+      nrbu = 0;
+      len = 0; /* Number of column subscripts I own. */
+      len1 = 0; /* number of fstnz subscripts */
+      for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
+	if (i >= xusub[nsupers_i]) printf ("ERR10\n");
+	jcol = usub[i];
+	gb = BlockNum( jcol ); /* Global block number */
+	
+	/*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986)
+	  printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n",
+	  iam, jb, gb, jcol, jbcol, pc); */
+	
+	lb = LBj( gb, grid );  /* Local block number */
+	pc = PCOL( gb, grid ); /* Process col owning this block */
+	if (mycol == jbcol) ToSendR[ljb_j][pc] = YES;
+	/* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */
+	pr = PROW( gb, grid );
+	if ( pr != jbrow  && mycol == pc)
+	  bsendx_plist[lb][jbrow] = YES; 
+	if (mycol == pc) {
+	  len += nsupc;
+	  LUb_length[lb] += nsupc;
+	  ToSendD[ljb_i] = YES;
+	  if (Urb_marker[lb] <= jb) { /* First see this block */
+	    if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++;
+	    Urb_marker[lb] = jb + 1;
+	    LUb_number[nrbu] = gb;
+	    /* if (gb == 391825 && jb == 145361)
+	       printf ("Pe[%d] T1 [%d %d] nrbu %d \n",
+	       iam, jb, gb, nrbu); */
+	    nrbu ++;
+	    len1 += SuperSize( gb );
+	    if ( gb != jb )/* Exclude diagonal block. */
+	      ++bmod[ljb_i];/* Mod. count for back solve */
+#if ( PRNTlevel>=1 )
+	    ++nUblocks;
+#endif
+	  }
+	}
+      } /* for i ... */
+      
+      if ( nrbu ) { 
+	/* Sort the blocks of U in increasing block column index.
+	   SuperLU_DIST assumes this is true */
+	/* simple insertion sort algorithm */
+	/* to be replaced by a quick sort */
+	for (j = 1; j < nrbu; j++) {
+	  k = LUb_number[j];
+	  for (i=j-1; i>=0 && LUb_number[i] > k; i--) {
+	    LUb_number[i+1] = LUb_number[i];
+	  }
+	  LUb_number[i+1] = k;
+	} 
+	
+	/* Set up the initial pointers for each block in
+	   index[] and nzval[]. */
+	/* Add room for descriptors */
+	len1 += BR_HEADER + nrbu * UB_DESCRIPTOR;
+	if ( !(index = intMalloc_dist(len1+1)) ) {
+	  fprintf (stderr, "Malloc fails for Uindex[]");
+	  return (memDist + memNLU);
+	}
+	Ufstnz_br_ptr[ljb_i] = index;
+	if (!(Unzval_br_ptr[ljb_i] =
+	      doubleMalloc_dist(len))) {
+	  fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]");
+	  return (memDist + memNLU);
+	}
+	memNLU += (len1+1)*iword + len*dword;
+	uval = Unzval_br_ptr[ljb_i];
+	mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
+	mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
+	index[0] = nrbu;  /* Number of column blocks */
+	index[1] = len;   /* Total length of nzval[] */
+	index[2] = len1;  /* Total length of index */
+	index[len1] = -1; /* End marker */
+	next_ind = BR_HEADER;
+	next_val = 0;
+	for (k = 0; k < nrbu; k++) {
+	  gb = LUb_number[k];
+	  lb = LBj( gb, grid );
+	  len = LUb_length[lb];
+	  LUb_length[lb] = 0;  /* Reset vector of block length */
+	  index[next_ind++] = gb; /* Descriptor */
+	  index[next_ind++] = len;
+	  LUb_indptr[lb] = next_ind;
+	  for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++)
+	    index[next_ind] = FstBlockC( jb + 1 );
+	  LUb_valptr[lb] = next_val;
+	  next_val += len;
+	}
+	/* Propagate the fstnz subscripts to Ufstnz_br_ptr[],
+	   and the initial values of A from SPA into Unzval_br_ptr[]. */
+	for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
+	  jcol = usub[i];
+	  gb = BlockNum( jcol );
+	  
+	  if ( mycol == PCOL( gb, grid ) ) {
+	    lb = LBj( gb, grid );
+	    k = LUb_indptr[lb]; /* Start fstnz in index */
+	    index[k + jcol - FstBlockC( gb )] = FstBlockC( jb );
+	  }
+	}  /* for i ... */
+	
+	for (i = 0; i < nrbu; i++) {
+	  gb = LUb_number[i];
+	  lb = LBj( gb, grid );   
+	  next_ind = LUb_indptr[lb];
+	  k = FstBlockC( jb + 1);
+	  jcol = ilsum_j[lb];
+	  for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) {
+	    dense_col = dense;
+	    j = index[next_ind+jj];
+	    for (ii = j; ii < k; ii++) {
+	      uval[LUb_valptr[lb]++] = dense_col[jcol];
+	      dense_col[jcol] = zero;
+	      dense_col += ldaspa_j;	      
+	    }
+	  }
+	}
+      } else {
+	Ufstnz_br_ptr[ljb_i] = NULL;
+	Unzval_br_ptr[ljb_i] = NULL;
+      } /* if nrbu ... */	
+    } /* if myrow == jbrow */
+    
+      /*------------------------------------------------
+       * SET UP L BLOCKS.
+       *------------------------------------------------*/
+    if (mycol == jbcol) {  /* Block column jb in my process column */
+      /* Scatter A_inf into SPA. */
+      for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) {
+	for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) {
+	  irow = ainf_rowind[i];
+	  if (irow >= n) printf ("Pe[%d] ERR1\n", iam);
+	  gb = BlockNum( irow );
+	  if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam);
+	  if ( myrow == PROW( gb, grid ) ) {
+	    lb = LBi( gb, grid );
+	    irow = ilsum[lb] + irow - FstBlockC( gb );
+	    if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam);
+	    dense_col[irow] = ainf_val[i];
+	  }
+	}
+	dense_col += ldaspa;
+      }      
+      
+      /* sort the indices of the diagonal block at the beginning of xlsub */
+      if (myrow == jbrow) {
+	k = xlsub[ljb_j];
+	for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
+	  irow = lsub[i];
+	  if (irow < nsupc + fsupc && i != k+irow-fsupc) {
+	    lsub[i] = lsub[k + irow - fsupc];
+	    lsub[k + irow - fsupc] = irow;
+	    i --;
+	  }
+	}
+      }
+      
+      /* Count number of blocks and length of each block. */
+      nrbl = 0;
+      len = 0; /* Number of row subscripts I own. */
+      kseen = 0;
+      for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
+	irow = lsub[i];
+	gb = BlockNum( irow ); /* Global block number */	  
+	pr = PROW( gb, grid ); /* Process row owning this block */
+	if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY &&
+	     myrow == jbrow) {
+	  fsendx_plist[ljb_j][pr] = YES;
+	  ++nfsendx;
+	}
+	if ( myrow == pr ) {
+	  lb = LBi( gb, grid );  /* Local block number */
+	  if (Lrb_marker[lb] <= jb) { /* First see this block */
+	    Lrb_marker[lb] = jb + 1;
+	    LUb_length[lb] = 1;
+	    LUb_number[nrbl++] = gb;
+	    if ( gb != jb ) /* Exclude diagonal block. */
+	      ++fmod[lb]; /* Mod. count for forward solve */
+	    if ( kseen == 0 && myrow != jbrow ) {
+	      ++nfrecvx;
+	      kseen = 1;
+	    }
+#if ( PRNTlevel>=1 )
+	    ++nLblocks;
+#endif
+	  } else 
+	    ++LUb_length[lb];	    
+	  ++len;
+	}
+      } /* for i ... */
+      
+      if ( nrbl ) { /* Do not ensure the blocks are sorted! */
+	/* Set up the initial pointers for each block in 
+	   index[] and nzval[]. */
+	/* If I am the owner of the diagonal block, order it first in LUb_number.
+	   Necessary for SuperLU_DIST routines */
+	kseen = EMPTY;
+	for (j = 0; j < nrbl; j++) {
+	  if (LUb_number[j] == jb)
+	    kseen = j;
+	}
+	if (kseen != EMPTY && kseen != 0) {
+	  LUb_number[kseen] = LUb_number[0];
+	  LUb_number[0] = jb;
+	}
+	
+	/* Add room for descriptors */
+	len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+	if ( !(index = intMalloc_dist(len1)) ) {
+	  fprintf (stderr, "Malloc fails for index[]");
+	  return (memDist + memNLU);
+	}
+	Lrowind_bc_ptr[ljb_j] = index;
+	if (!(Lnzval_bc_ptr[ljb_j] = 
+	      doubleMalloc_dist(len*nsupc))) {
+	  fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block " IFMT, jb);
+	  return (memDist + memNLU);
+	}
+	memNLU += len1*iword + len*nsupc*dword;
+	
+	lusup = Lnzval_bc_ptr[ljb_j];
+	mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
+	mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
+	mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
+	index[0] = nrbl;  /* Number of row blocks */
+	index[1] = len;   /* LDA of the nzval[] */
+	next_ind = BC_HEADER;
+	next_val = 0;
+	for (k = 0; k < nrbl; ++k) {
+	  gb = LUb_number[k];
+	  lb = LBi( gb, grid );
+	  len = LUb_length[lb];
+	  LUb_length[lb] = 0;
+	  index[next_ind++] = gb; /* Descriptor */
+	  index[next_ind++] = len; 
+	  LUb_indptr[lb] = next_ind;
+	    LUb_valptr[lb] = next_val;
+	    next_ind += len;
+	    next_val += len;
+	  }
+	  /* Propagate the compressed row subscripts to Lindex[],
+	     and the initial values of A from SPA into Lnzval[]. */
+	  len = index[1];  /* LDA of lusup[] */
+	  for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
+	    irow = lsub[i];
+	    gb = BlockNum( irow );
+	    if ( myrow == PROW( gb, grid ) ) {
+	      lb = LBi( gb, grid );
+	      k = LUb_indptr[lb]++; /* Random access a block */
+	      index[k] = irow;
+	      k = LUb_valptr[lb]++;
+	      irow = ilsum[lb] + irow - FstBlockC( gb );
+	      for (j = 0, dense_col = dense; j < nsupc; ++j) {
+		lusup[k] = dense_col[irow];
+		dense_col[irow] = zero;
+		k += len;
+		dense_col += ldaspa;
+	      }
+	    }
+	  } /* for i ... */
+	} else {
+	  Lrowind_bc_ptr[ljb_j] = NULL;
+	  Lnzval_bc_ptr[ljb_j] = NULL;
+	} /* if nrbl ... */		  
+      } /* if mycol == pc */
+  } /* for jb ... */
+
+  SUPERLU_FREE(ilsum_j);
+  SUPERLU_FREE(Urb_marker);
+  SUPERLU_FREE(LUb_length);
+  SUPERLU_FREE(LUb_indptr);
+  SUPERLU_FREE(LUb_number);
+  SUPERLU_FREE(LUb_valptr);
+  SUPERLU_FREE(Lrb_marker);
+  SUPERLU_FREE(dense);
+  
+  /* Free the memory used for storing L and U */
+  SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub);
+  if (lsub != NULL)
+    SUPERLU_FREE(lsub);  
+  if (usub != NULL)
+    SUPERLU_FREE(usub);
+  
+  /* Free the memory used for storing A */
+  SUPERLU_FREE(ainf_colptr);
+  if (ainf_rowind != NULL) {
+    SUPERLU_FREE(ainf_rowind);
+    SUPERLU_FREE(ainf_val);
+  }
+  SUPERLU_FREE(asup_rowptr);
+  if (asup_colind != NULL) {
+    SUPERLU_FREE(asup_colind);	
+    SUPERLU_FREE(asup_val);	
+  }
+  
+  /* exchange information about ToSendR among all processes of the grid */
+  k = SUPERLU_MAX( grid->nprow, grid->npcol);
+  if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) {
+    fprintf (stderr, "Malloc fails for recvBuf[].");
+    return (memDist + memNLU);
+  }
+  if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
+    fprintf (stderr, "Malloc fails for nnzToRecv[].");
+    return (memDist + memNLU);
+  }
+  if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
+    fprintf (stderr, "Malloc fails for ptrToRecv[].");
+    return (memDist + memNLU);
+  }
+  if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
+    fprintf (stderr, "Malloc fails for nnzToRecv[].");
+    return (memDist + memNLU);
+  }
+  if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
+    fprintf (stderr, "Malloc fails for ptrToRecv[].");
+    return (memDist + memNLU);
+  }
+  
+  if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int)))
+    memDist = nsupers*k*iword +4*nprocs * sizeof(int);
+  
+  for (p = 0; p < nprocs; p++)
+    nnzToRecv[p] = 0;
+  
+  for (jb = 0; jb < nsupers; jb++) {
+    jbcol = PCOL( jb, grid );
+    jbrow = PROW( jb, grid );
+    p = PNUM(jbrow, jbcol, grid);
+    nnzToRecv[p] += grid->npcol;
+  }    
+  i = 0;
+  for (p = 0; p < nprocs; p++) {
+    ptrToRecv[p] = i;
+    i += nnzToRecv[p];
+    ptrToSend[p] = 0;
+    if (p != iam)
+      nnzToSend[p] = nnzToRecv[iam];
+    else
+      nnzToSend[p] = 0;
+  }
+  nnzToRecv[iam] = 0;
+  i = ptrToRecv[iam];
+  for (jb = 0; jb < nsupers; jb++) {
+    jbcol = PCOL( jb, grid );
+    jbrow = PROW( jb, grid );
+    p = PNUM(jbrow, jbcol, grid);
+    if (p == iam) {
+      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
+      for (j = 0; j < grid->npcol; j++, i++)
+	recvBuf[i] = ToSendR[ljb_j][j];
+    }
+  }   
+  
+  MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
+		 recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
+  
+  for (jb = 0; jb < nsupers; jb++) {
+    jbcol = PCOL( jb, grid );
+    jbrow = PROW( jb, grid );
+    p = PNUM(jbrow, jbcol, grid);
+    ljb_j = LBj( jb, grid ); /* Local block number column wise */	
+    ljb_i = LBi( jb, grid ); /* Local block number row wise */	
+    /* (myrow == jbrow) {
+       if (ToSendD[ljb_i] == YES)
+       ToRecv[jb] = 1;
+       }
+       else {
+       if (recvBuf[ptrToRecv[p] + mycol] == YES)
+       ToRecv[jb] = 2;
+       } */
+    if (recvBuf[ptrToRecv[p] + mycol] == YES) {
+      if (myrow == jbrow)
+	ToRecv[jb] = 1;
+      else
+	ToRecv[jb] = 2;
+    }
+    if (mycol == jbcol) {
+      for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) 
+	ToSendR[ljb_j][i] = recvBuf[j];  
+      ToSendR[ljb_j][mycol] = EMPTY;
+    }
+    ptrToRecv[p] += grid->npcol;
+  }   
+  
+  /* combine bsendx_plist (MPI_MAX) across the processes of grid->cscp.comm */
+  MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
+		 MPI_MAX, grid->cscp.comm);
+  
+  for (jb = 0; jb < nsupers; jb ++) {
+    jbcol = PCOL( jb, grid);
+    jbrow = PROW( jb, grid);
+    if (mycol == jbcol) {
+      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
+      if (myrow == jbrow ) {
+	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) {
+	  (*bsendx_plist)[k] = recvBuf[k];
+	  if ((*bsendx_plist)[k] != EMPTY)
+	    nbsendx ++;
+	}
+      }
+      else {
+	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) 
+	  (*bsendx_plist)[k] = EMPTY;
+      }
+    }
+  }
+  
+  SUPERLU_FREE(nnzToRecv);
+  SUPERLU_FREE(ptrToRecv);
+  SUPERLU_FREE(nnzToSend);
+  SUPERLU_FREE(ptrToSend);
+  SUPERLU_FREE(recvBuf);
+  
+  Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+  Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+  Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+  Llu->Unzval_br_ptr = Unzval_br_ptr;
+  Llu->ToRecv = ToRecv;
+  Llu->ToSendD = ToSendD;
+  Llu->ToSendR = ToSendR;
+  Llu->fmod = fmod;
+  Llu->fsendx_plist = fsendx_plist;
+  Llu->nfrecvx = nfrecvx;
+  Llu->nfsendx = nfsendx;
+  Llu->bmod = bmod;
+  Llu->bsendx_plist = bsendx_plist;
+  Llu->nbrecvx = nbrecvx;
+  Llu->nbsendx = nbsendx;
+  Llu->ilsum = ilsum;
+  Llu->ldalsum = ldaspa;
+  LUstruct->Glu_persist = Glu_persist;	
+#if ( PRNTlevel>=1 )
+  if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
+		     nLblocks, nUblocks);
+#endif
+  
+  k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+  if ( !(Llu->mod_bit = intMalloc_dist(k)) )
+      ABORT("Malloc fails for mod_bit[].");
+
+  /* Find the maximum buffer size. */
+  MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
+		MPI_MAX, grid->comm);
+  
+#if ( DEBUGlevel>=1 )
+  /* Memory allocated but not freed:
+     ilsum, fmod, fsendx_plist, bmod, bsendx_plist,
+     ToRecv, ToSendR, ToSendD, mod_bit
+  */
+  CHECK_MALLOC(iam, "Exit dist_psymbtonum()");
+#endif
+    
+  return (- (memDist+memNLU));
+} /* ddist_psymbtonum */
+
diff --git a/SRC/pdutil.c b/SRC/pdutil.c
new file mode 100644
index 0000000..05975b0
--- /dev/null
+++ b/SRC/pdutil.c
@@ -0,0 +1,538 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Several matrix utilities
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief Gather A from the distributed compressed row format to global A in compressed column format.
+ */
+int pdCompRow_loc_to_CompCol_global
+(
+ int_t need_value, /* Input. Whether need to gather numerical values */
+ SuperMatrix *A,   /* Input. Distributed matrix in NRformat_loc format. */
+ gridinfo_t *grid, /* Input */
+ SuperMatrix *GA   /* Output */
+)
+{
+    NRformat_loc *Astore;
+    NCformat *GAstore;
+    double *a, *a_loc;
+    int_t *colind, *rowptr;
+    int_t *colptr_loc, *rowind_loc;
+    int_t m_loc, n, i, j, k, l;
+    int_t colnnz, fst_row, nnz_loc, nnz;
+    double *a_recv;  /* Buffer to receive the blocks of values. */
+    double *a_buf;   /* Buffer to merge blocks into block columns. */
+    int_t *itemp;
+    int_t *colptr_send; /* Buffer to redistribute the column pointers of the 
+			   local block rows.
+			   Use n_loc+1 pointers for each block. */
+    int_t *colptr_blk;  /* The column pointers for each block, after
+			   redistribution to the local block columns. 
+			   Use n_loc+1 pointers for each block. */
+    int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */
+    int_t *rowind_buf;  /* Buffer to merge blocks into block columns. */
+    int_t *fst_rows, *n_locs;
+    int   *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32;
+    int   it, n_loc, procs;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pdCompRow_loc_to_CompCol_global");
+#endif
+
+    /* Initialization. */
+    n = A->ncol;
+    Astore = (NRformat_loc *) A->Store;
+    nnz_loc = Astore->nnz_loc;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    a = Astore->nzval;
+    rowptr = Astore->rowptr;
+    colind = Astore->colind;
+    n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */
+
+    /* ------------------------------------------------------------
+       FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN.
+       ------------------------------------------------------------*/
+    dCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc,
+                             &rowind_loc, &colptr_loc);
+    /* Change local row index numbers to global numbers. */
+    for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row;
+
+#if ( DEBUGlevel>=2 )
+    printf("Proc %d\n", grid->iam);
+    PrintInt10("rowind_loc", nnz_loc, rowind_loc);
+    PrintInt10("colptr_loc", n+1, colptr_loc);
+#endif
+
+    procs = grid->nprow * grid->npcol;
+    if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) )
+	  ABORT("Malloc fails for fst_rows[]");
+    n_locs = fst_rows + procs;
+    MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t,
+		  grid->comm);
+    for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i];
+    n_locs[procs-1] = n - fst_rows[procs-1];
+    if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) )
+	  ABORT("Malloc fails for recvcnts[]");
+    sendcnts = recvcnts + procs;
+    rdispls = sendcnts + procs;
+    sdispls = rdispls + procs;
+    itemp_32 = sdispls + procs;
+
+    /* All-to-all transfer column pointers of each block.
+       Now the matrix view is P-by-P block-partition. */
+    /* n column starts for each column, and procs column ends for each block */
+    if ( !(colptr_send = intMalloc_dist(n + procs)) )
+	   ABORT("Malloc fails for colptr_send[]");
+    if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) )
+	   ABORT("Malloc fails for colptr_blk[]");
+    for (i = 0, j = 0; i < procs; ++i) {
+        for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k];
+	colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */
+	sendcnts[i] = n_locs[i] + 1;
+#if ( DEBUGlevel>=1 )
+	assert(j == fst_rows[i]);
+#endif
+	sdispls[i] = j + i;
+	recvcnts[i] = n_loc + 1;
+	rdispls[i] = i * (n_loc + 1);
+	j += n_locs[i]; /* First column of next block in colptr_loc[] */
+    }
+    MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t,
+		  colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm);
+
+    /* Adjust colptr_blk[] so that they contain the local indices of the
+       column pointers in the receive buffer. */
+    nnz = 0; /* The running sum of the nonzeros counted by far */
+    k = 0;
+    for (i = 0; i < procs; ++i) {
+	for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) {
+	    colnnz = colptr_blk[j+1] - colptr_blk[j];
+	    /*assert(k<=j);*/
+	    colptr_blk[k] = nnz;
+	    nnz += colnnz; /* Start of the next column */
+	    ++k;
+	}
+	colptr_blk[k++] = nnz; /* Add an END marker for each block */
+    }
+    /*assert(k == (n_loc+1)*procs);*/
+
+    /* Now prepare to transfer row indices and values. */
+    sdispls[0] = 0;
+    for (i = 0; i < procs-1; ++i) {
+        sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]];
+	sdispls[i+1] = sdispls[i] + sendcnts[i];
+    }
+    sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]];
+    for (i = 0; i < procs; ++i) {
+        j = rdispls[i]; /* Point to this block in colptr_blk[]. */
+	recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j];
+    }
+    rdispls[0] = 0; /* Recompute rdispls[] for row indices. */
+    for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i];
+
+    k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */
+    if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) )
+        ABORT("Malloc fails for rowind_recv[]");
+    rowind_buf = rowind_recv + k;
+    MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t,
+		  rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm);
+    if ( need_value ) {
+        if ( !(a_recv = (double *) doubleMalloc_dist(2*k)) )
+	    ABORT("Malloc fails for rowind_recv[]");
+	a_buf = a_recv + k;
+	MPI_Alltoallv(a_loc, sendcnts, sdispls, MPI_DOUBLE,
+                      a_recv, recvcnts, rdispls, MPI_DOUBLE,
+                      grid->comm);
+    }
+      
+    /* Reset colptr_loc[] to point to the n_loc global columns. */
+    colptr_loc[0] = 0;
+    itemp = colptr_send;
+    for (j = 0; j < n_loc; ++j) {
+        colnnz = 0;
+	for (i = 0; i < procs; ++i) {
+	    k = i * (n_loc + 1) + j; /* j-th column in i-th block */
+	    colnnz += colptr_blk[k+1] - colptr_blk[k];
+	}
+	colptr_loc[j+1] = colptr_loc[j] + colnnz;
+	itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */
+    }
+    itemp[n_loc] = colptr_loc[n_loc];
+      
+    /* Merge blocks of row indices into columns of row indices. */
+    for (i = 0; i < procs; ++i) {
+        k = i * (n_loc + 1);
+	for (j = 0; j < n_loc; ++j) { /* i-th block */
+	    for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) {
+	        rowind_buf[itemp[j]] = rowind_recv[l];
+		++itemp[j];
+	    }
+	}
+    }
+
+    if ( need_value ) {
+        for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j];
+        for (i = 0; i < procs; ++i) {
+	    k = i * (n_loc + 1);
+	    for (j = 0; j < n_loc; ++j) { /* i-th block */
+	        for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) {
+		    a_buf[itemp[j]] = a_recv[l];
+		    ++itemp[j];
+		}
+	    }
+	}
+    }
+
+    /* ------------------------------------------------------------
+       SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT.
+       ------------------------------------------------------------*/
+    GA->nrow  = A->nrow;
+    GA->ncol  = A->ncol;
+    GA->Stype = SLU_NC;
+    GA->Dtype = A->Dtype;
+    GA->Mtype = A->Mtype;
+    GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) );
+    if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore");
+
+    /* First gather the size of each piece. */
+    nnz_loc = colptr_loc[n_loc];
+    MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm);
+    for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i];
+    GAstore->nnz = nnz;
+    
+    if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) )
+        ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]");
+    if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) )
+        ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]");
+      
+    /* Allgatherv for row indices. */
+    rdispls[0] = 0;
+    for (i = 0; i < procs-1; ++i) {
+        rdispls[i+1] = rdispls[i] + itemp[i];
+        itemp_32[i] = itemp[i];
+    }
+    itemp_32[procs-1] = itemp[procs-1];
+    it = nnz_loc;
+    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, 
+		   itemp_32, rdispls, mpi_int_t, grid->comm);
+    if ( need_value ) {
+      if ( !(GAstore->nzval = (double *) doubleMalloc_dist (nnz)) )
+          ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]");
+      MPI_Allgatherv(a_buf, it, MPI_DOUBLE, GAstore->nzval, 
+		     itemp_32, rdispls, MPI_DOUBLE, grid->comm);
+    } else GAstore->nzval = NULL;
+
+    /* Now gather the column pointers. */
+    rdispls[0] = 0;
+    for (i = 0; i < procs-1; ++i) {
+        rdispls[i+1] = rdispls[i] + n_locs[i];
+        itemp_32[i] = n_locs[i];
+    }
+    itemp_32[procs-1] = n_locs[procs-1];
+    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, 
+		   itemp_32, rdispls, mpi_int_t, grid->comm);
+
+    /* Recompute column pointers. */
+    for (i = 1; i < procs; ++i) {
+        k = rdispls[i];
+	for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1];
+	itemp[i] += itemp[i-1]; /* prefix sum */
+    }
+    GAstore->colptr[n] = nnz;
+
+#if ( DEBUGlevel>=2 )
+    if ( !grid->iam ) {
+        printf("After pdCompRow_loc_to_CompCol_global()\n");
+	dPrint_CompCol_Matrix_dist(GA);
+    }
+#endif
+
+    SUPERLU_FREE(a_loc);
+    SUPERLU_FREE(rowind_loc);
+    SUPERLU_FREE(colptr_loc);
+    SUPERLU_FREE(fst_rows);
+    SUPERLU_FREE(recvcnts);
+    SUPERLU_FREE(colptr_send);
+    SUPERLU_FREE(colptr_blk);
+    SUPERLU_FREE(rowind_recv);
+    if ( need_value) SUPERLU_FREE(a_recv);
+#if ( DEBUGlevel>=1 )
+    if ( !grid->iam ) printf("sizeof(NCformat) %lu\n", sizeof(NCformat));
+    CHECK_MALLOC(grid->iam, "Exit pdCompRow_loc_to_CompCol_global");
+#endif
+    return 0;
+} /* pdCompRow_loc_to_CompCol_global */
+
+
+/*! \brief Permute the distributed dense matrix: B <= perm(X).
+ *
+ * perm[i] = j means the i-th row of X is in the j-th row of B.
+ *
+ * Every local row of X is tagged with its destination global row index
+ * and shipped, together with its nrhs values, to the process given by
+ * row_to_proc[]; the rows received are then scattered into the local
+ * part of B.  Always returns 0; allocation failures go through ABORT.
+ */
+int pdPermute_Dense_Matrix
+(
+ int_t fst_row,
+ int_t m_loc,
+ int_t row_to_proc[],
+ int_t perm[],
+ double X[], int ldx,
+ double B[], int ldb,
+ int nrhs,
+ gridinfo_t *grid
+)
+{
+    int_t row, dest, rhs, pos;
+    int_t nsend, nrecv;          /* total rows sent / received */
+    int pe, nprocs;
+    int send_off, recv_off;      /* running displacements (in rows) */
+    int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs;
+    int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
+    int *ptr_to_ibuf, *ptr_to_dbuf;
+    int_t *send_ibuf, *recv_ibuf;
+    double *send_dbuf, *recv_dbuf;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pdPermute_Dense_Matrix()");
+#endif
+
+    nprocs = grid->nprow * grid->npcol;
+
+    /* A single allocation is carved into ten int arrays of nprocs each. */
+    if ( !(sendcnts = SUPERLU_MALLOC(10*nprocs * sizeof(int))) )
+        ABORT("Malloc fails for sendcnts[].");
+    sendcnts_nrhs = sendcnts      + nprocs;
+    recvcnts      = sendcnts_nrhs + nprocs;
+    recvcnts_nrhs = recvcnts      + nprocs;
+    sdispls       = recvcnts_nrhs + nprocs;
+    sdispls_nrhs  = sdispls       + nprocs;
+    rdispls       = sdispls_nrhs  + nprocs;
+    rdispls_nrhs  = rdispls       + nprocs;
+    ptr_to_ibuf   = rdispls_nrhs  + nprocs;
+    ptr_to_dbuf   = ptr_to_ibuf   + nprocs;
+
+    /* Count the number of X entries to be sent to each process.*/
+    for (pe = 0; pe < nprocs; ++pe) sendcnts[pe] = 0;
+    for (row = 0; row < m_loc; ++row) {
+        pe = row_to_proc[perm[fst_row + row]];
+        ++sendcnts[pe];
+    }
+    MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm);
+
+    /* Exclusive prefix sums give the displacements; the *_nrhs arrays
+       are the same quantities scaled by the number of right-hand sides. */
+    send_off = 0;
+    recv_off = 0;
+    for (pe = 0; pe < nprocs; ++pe) {
+        sdispls[pe]       = send_off;
+        rdispls[pe]       = recv_off;
+        sdispls_nrhs[pe]  = sdispls[pe]  * nrhs;
+        rdispls_nrhs[pe]  = rdispls[pe]  * nrhs;
+        sendcnts_nrhs[pe] = sendcnts[pe] * nrhs;
+        recvcnts_nrhs[pe] = recvcnts[pe] * nrhs;
+        send_off += sendcnts[pe];
+        recv_off += recvcnts[pe];
+    }
+    nsend = send_off;   /* Total number of sends */
+    nrecv = recv_off;   /* Total number of recvs */
+    /*assert(nsend == m_loc);*/
+    /*assert(nrecv == m_loc);*/
+
+    /* Send and receive buffers share one allocation each. */
+    if ( !(send_ibuf = intMalloc_dist(nsend + nrecv)) )
+        ABORT("Malloc fails for send_ibuf[].");
+    recv_ibuf = send_ibuf + nsend;
+    if ( !(send_dbuf = doubleMalloc_dist((nsend + nrecv)*nrhs)) )
+        ABORT("Malloc fails for send_dbuf[].");
+    recv_dbuf = send_dbuf + nsend * nrhs;
+
+    /* Per-destination write cursors into the two send buffers. */
+    for (pe = 0; pe < nprocs; ++pe) {
+        ptr_to_ibuf[pe] = sdispls[pe];
+        ptr_to_dbuf[pe] = sdispls_nrhs[pe];
+    }
+
+    /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */
+    for (row = 0; row < m_loc; ++row) {
+        dest = perm[fst_row + row];      /* destination global row */
+        pe = row_to_proc[dest];
+        send_ibuf[ptr_to_ibuf[pe]++] = dest;
+        pos = ptr_to_dbuf[pe];
+        RHS_ITERATE(rhs) { /* RHS stored in row major in the buffer */
+            send_dbuf[pos++] = X[row + rhs*ldx];
+        }
+        ptr_to_dbuf[pe] += nrhs;
+    }
+
+    /* Transfer the (permuted) row indices and numerical values. */
+    MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t,
+                  recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm);
+    MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, MPI_DOUBLE,
+                  recv_dbuf, recvcnts_nrhs, rdispls_nrhs, MPI_DOUBLE,
+                  grid->comm);
+
+    /* Copy the buffer into b. */
+    pos = 0;
+    for (row = 0; row < m_loc; ++row) {
+        dest = recv_ibuf[row] - fst_row; /* Relative row number */
+        RHS_ITERATE(rhs) { /* RHS stored in row major in the buffer */
+            B[dest + rhs*ldb] = recv_dbuf[pos++];
+        }
+    }
+
+    SUPERLU_FREE(send_dbuf);
+    SUPERLU_FREE(send_ibuf);
+    SUPERLU_FREE(sendcnts);
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Exit pdPermute_Dense_Matrix()");
+#endif
+    return 0;
+} /* pdPermute_Dense_Matrix */
+
+
+/*! \brief Initialize the data structure for the solution phase.
+ *
+ * Allocates and fills SOLVEstruct->row_to_proc (global row -> owning
+ * process) and SOLVEstruct->inv_perm_c (inverse of perm_c), obtains the
+ * diagonal-process information via get_diag_procs(), sets up the
+ * communication structure via pxgstrs_init(), and allocates (without
+ * initializing) the gsmv communication structure.
+ *
+ * On return options->SolveInitialized is YES and the result is 0;
+ * allocation failures go through ABORT.
+ */
+int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A, 
+	       int_t perm_r[], int_t perm_c[], int_t nrhs,
+	       LUstruct_t *LUstruct, gridinfo_t *grid,
+	       SOLVEstruct_t *SOLVEstruct)
+{
+    int_t *row_to_proc, *inv_perm_c, *itemp;
+    NRformat_loc *Astore;
+    int_t        i, fst_row, m_loc, p;
+    int          procs;
+
+    Astore = (NRformat_loc *) A->Store;
+    fst_row = Astore->fst_row;
+    m_loc = Astore->m_loc;
+    procs = grid->nprow * grid->npcol;
+    
+    if ( !(row_to_proc = intMalloc_dist(A->nrow)) )
+	ABORT("Malloc fails for row_to_proc[]");
+    SOLVEstruct->row_to_proc = row_to_proc;
+    if ( !(inv_perm_c = intMalloc_dist(A->ncol)) )
+        ABORT("Malloc fails for inv_perm_c[].");
+    /* Invert the column permutation: inv_perm_c[perm_c[i]] == i. */
+    for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i;
+    SOLVEstruct->inv_perm_c = inv_perm_c;
+
+    /* ------------------------------------------------------------
+       EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION.
+       SET UP THE MAPPING BETWEEN ROWS AND PROCESSES.
+       
+       NOTE: For those processes that do not own any row, fst_row
+             must be set so that fst_row == A->nrow. 
+       ------------------------------------------------------------*/
+    if ( !(itemp = intMalloc_dist(procs+1)) )
+        ABORT("Malloc fails for itemp[]");
+    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
+		  grid->comm);
+    itemp[procs] = A->nrow; /* sentinel: one past the last global row */
+    for (p = 0; p < procs; ++p) {
+        /* Rows [itemp[p], itemp[p+1]) belong to process p. */
+        for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p;
+    }
+#if ( DEBUGlevel>=2 )
+    if ( !grid->iam ) {
+      /* fst_row is an int_t, which may be wider than int: print it
+         through long long instead of passing it to %d. */
+      printf("fst_row = %lld\n", (long long) fst_row);
+      PrintInt10("row_to_proc", A->nrow, row_to_proc);
+      PrintInt10("inv_perm_c", A->ncol, inv_perm_c);
+    }
+#endif
+    SUPERLU_FREE(itemp);
+
+#if 0
+    /* Compute the mapping between rows and processes. */
+    /* XSL NOTE: What happens if # of mapped processes is smaller
+       than total Procs?  For the processes without any row, let
+       fst_row be EMPTY (-1). Make sure this case works! */
+    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
+		  grid->comm);
+    itemp[procs] = n;
+    for (p = 0; p < procs; ++p) {
+        j = itemp[p];
+	if ( j != EMPTY ) {
+	    k = itemp[p+1];
+	    if ( k == EMPTY ) k = n;
+	    for (i = j ; i < k; ++i) row_to_proc[i] = p;
+	}
+    }
+#endif    
+
+    get_diag_procs(A->ncol, LUstruct->Glu_persist, grid,
+		   &SOLVEstruct->num_diag_procs,
+		   &SOLVEstruct->diag_procs,
+		   &SOLVEstruct->diag_len);
+
+    /* Setup communication pattern for redistribution of B and X. */
+    if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *)
+	   SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
+        ABORT("Malloc fails for gstrs_comm[]");
+    pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, 
+		 LUstruct->Glu_persist, SOLVEstruct);
+
+    /* Only the container is allocated here; A_colind_gsmv starts out
+       NULL so that dSolveFinalize can tell whether it was ever set. */
+    if ( !(SOLVEstruct->gsmv_comm = (pdgsmv_comm_t *)
+           SUPERLU_MALLOC(sizeof(pdgsmv_comm_t))) )
+        ABORT("Malloc fails for gsmv_comm[]");
+    SOLVEstruct->A_colind_gsmv = NULL;
+    
+    options->SolveInitialized = YES;
+    return 0;
+} /* dSolveInit */
+
+/*! \brief Release the resources used for the solution phase.
+ *
+ * Finalizes the gstrs communication structure, finalizes the gsmv
+ * structure when refinement had been initialized, frees the arrays held
+ * by SOLVEstruct, and resets the Solve/Refine "initialized" flags in
+ * options to NO.
+ */
+void dSolveFinalize(superlu_dist_options_t *options, SOLVEstruct_t *SOLVEstruct)
+{
+    pxgstrs_finalize(SOLVEstruct->gstrs_comm);
+
+    /* The gsmv structure has contents to finalize only if the
+       refinement step initialized it. */
+    if ( options->RefineInitialized ) {
+        pdgsmv_finalize(SOLVEstruct->gsmv_comm);
+        options->RefineInitialized = NO;
+    }
+    SUPERLU_FREE(SOLVEstruct->gsmv_comm);
+
+    /* A_colind_gsmv is optional: free it only when it was set. */
+    if ( SOLVEstruct->A_colind_gsmv != NULL )
+        SUPERLU_FREE(SOLVEstruct->A_colind_gsmv);
+
+    SUPERLU_FREE(SOLVEstruct->diag_len);
+    SUPERLU_FREE(SOLVEstruct->diag_procs);
+    SUPERLU_FREE(SOLVEstruct->inv_perm_c);
+    SUPERLU_FREE(SOLVEstruct->row_to_proc);
+
+    options->SolveInitialized = NO;
+} /* dSolveFinalize */
+
+/*! \brief Check the inf-norm of the error vector 
+ *
+ * For each of the nrhs columns, computes max|x - xtrue| and max|x| over
+ * the n local entries, reduces both with MPI_MAX over grid->comm, and
+ * has the process with iam == 0 print the ratio.
+ *
+ * x and xtrue are addressed column by column with leading dimensions
+ * ldx and ldxtrue.  There is no guard against a zero global norm of x:
+ * in that case the printed ratio is not a finite number.
+ */
+void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx,
+		      double xtrue[], int_t ldxtrue, gridinfo_t *grid) 
+{
+    double err, xnorm, temperr, tempxnorm;
+    double *x_work, *xtrue_work;
+    /* Indices have the same type as their bounds (n, nrhs are int_t) so
+       they cannot overflow when int_t is wider than int. */
+    int_t i, j;
+
+    for (j = 0; j < nrhs; j++) {
+      x_work = &x[j*ldx];
+      xtrue_work = &xtrue[j*ldxtrue];
+      err = xnorm = 0.0;
+      for (i = 0; i < n; i++) {
+	err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i]));
+	xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i]));
+      }
+
+      /* get the global max err & xnorm */
+      temperr = err;
+      tempxnorm = xnorm;
+      MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+      MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+
+      err = err / xnorm;
+      /* %2d expects an int, so narrow the int_t index explicitly. */
+      if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", (int) j, err);
+    }
+}
+
diff --git a/SRC/psymbfact.c b/SRC/psymbfact.c
new file mode 100644
index 0000000..8410f15
--- /dev/null
+++ b/SRC/psymbfact.c
@@ -0,0 +1,5225 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Implements parallel symbolic factorization
+ *
+ * <pre>
+ * -- Parallel symbolic factorization routine  (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003
+ * INRIA France - January 2004
+ * Laura Grigori
+ *
+ * November 1, 2007
+ * February 20, 2008
+ * October 15, 2008
+ *
+ * The function symbfact_dist implements the parallel symbolic factorization
+ * algorithm described in the paper:
+ *
+ * Parallel Symbolic Factorization for Sparse LU with Static Pivoting,
+ * Laura Grigori, James W. Demmel and Xiaoye S. Li,
+ * Pages 1289-1314, SIAM Journal on Scientific Computing, Volume 29, Issue 3.
+ * </pre>
+ */
+
+/* limits.h:  the largest positive integer (INT_MAX) */
+#include <limits.h>
+#include <math.h>
+#include "superlu_ddefs.h"
+#include "psymbfact.h"
+
+/*
+ * Internal prototypes: forward declarations of the file-local (static)
+ * helper routines.
+ */
+
+static int_t *
+intMalloc_symbfact(int_t );
+
+static int_t *
+intCalloc_symbfact(int_t );
+
+static int_t
+initParmsAndStats
+(psymbfact_stat_t *PS);
+
+static void
+estimate_memUsage
+(int_t, int, superlu_dist_mem_usage_t *, float *, float *,
+ Pslu_freeable_t *, Llu_symbfact_t *,
+ vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *);
+
+static void
+symbfact_free 
+(int, int, Llu_symbfact_t *, vtcsInfo_symbfact_t *, comm_symbfact_t *);
+
+static int_t
+denseSep_symbfact 
+(int , int_t, int, int, int, int_t *, int_t *, int, 
+ int,  int, int_t, int_t, int_t *, int_t *, int_t *,
+ int_t *, int_t *, MPI_Comm, MPI_Comm *, Llu_symbfact_t *,
+ Pslu_freeable_t *_freeable, vtcsInfo_symbfact_t *, 
+ comm_symbfact_t *, psymbfact_stat_t * );
+
+static int_t
+dnsUpSeps_symbfact
+(int_t, int, int, int, int, int_t *, int_t *, int_t,
+ Llu_symbfact_t *, Pslu_freeable_t *, vtcsInfo_symbfact_t *, 
+ comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *);
+
+static void
+intraLvl_symbfact 
+(SuperMatrix *, int, int, int, int, int, int_t *, int_t *, int, 
+ int, int_t, int_t,  Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, 
+ comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *, int_t *, 
+ int_t *, int_t *, int_t *, MPI_Comm, MPI_Comm *);
+
+static void
+initLvl_symbfact
+(int_t, int, int_t, int_t, Pslu_freeable_t *, 
+ Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *, MPI_Comm, 
+ int_t *, int_t, int_t);
+
+static void
+createComm (int, int, MPI_Comm *, MPI_Comm *);
+
+static void
+freeComm (int, int, MPI_Comm *, MPI_Comm *);
+
+static void
+domain_symbfact
+(SuperMatrix *, int, int, int,  int, int, int_t *, int_t *,
+ int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *,
+ comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *, int_t *, 
+ int_t *, int_t *, int_t *);
+
+static float
+allocPrune_domain
+(int_t, int_t, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *);
+
+static float
+allocPrune_lvl
+(Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *);
+
+static int
+symbfact_alloc
+(int_t, int, Pslu_freeable_t *, Llu_symbfact_t *, 
+ vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *);
+
+static float 
+symbfact_mapVtcs
+(int, int, int, SuperMatrix *, int_t *, int_t *, 
+ Pslu_freeable_t *, vtcsInfo_symbfact_t *, int_t *, int_t, psymbfact_stat_t *);
+
+static void 
+symbfact_distributeMatrix 
+(int, int, int, SuperMatrix *, int_t *, int_t *, matrix_symbfact_t *, 
+ Pslu_freeable_t *, vtcsInfo_symbfact_t *, int_t *, MPI_Comm *);
+
+static int_t
+interLvl_symbfact
+(SuperMatrix *, int, int, int, int, int, int, int, 
+ int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *,
+ Llu_symbfact_t *, Pslu_freeable_t*, comm_symbfact_t *, vtcsInfo_symbfact_t *,
+ psymbfact_stat_t *, MPI_Comm, MPI_Comm *);
+
+static float
+cntsVtcs 
+(int_t, int, int, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, 
+ int_t *, int_t *, int_t *, psymbfact_stat_t *, MPI_Comm *);
+
+/************************************************************************/
+float symbfact_dist
+/************************************************************************/
+(
+ int         nprocs_num,  /* Input - no of processors */
+ int         nprocs_symb, /* Input - no of processors for the symbolic
+			     factorization */
+ SuperMatrix *A,          /* Input - distributed input matrix */
+ int_t       *perm_c,     /* Input - column permutation */
+ int_t       *perm_r,     /* Input - row permutation */
+ int_t       *sizes,      /* Input - sizes of each node in the separator tree */
+ int_t       *fstVtxSep,  /* Input - first vertex of each node in the tree */
+ Pslu_freeable_t *Pslu_freeable, /* Output - local L and U structure, 
+				    global to local indexing information */
+ MPI_Comm    *num_comm,   /* Input - communicator for numerical factorization */
+ MPI_Comm    *symb_comm,  /* Input - communicator for symbolic factorization */
+ superlu_dist_mem_usage_t *symb_mem_usage
+ )
+{
+/*! \brief
+ *
+ * <pre> 
+ * Purpose
+ * =======
+ *   symbfact_dist() performs symbolic factorization of matrix A suitable
+ *   for performing the supernodal Gaussian elimination with no pivoting (GEPP). 
+ *   This routine computes the structure of one column of L and one row of U 
+ *   at a time.  It uses:
+ *        o distributed input matrix
+ *        o supernodes
+ *        o symmetric structure pruning
+ *
+ *
+ * Arguments
+ * =========
+ *
+ * nprocs_num (input) int
+ *         Number of processors SuperLU_DIST is executed on, and the input 
+ *         matrix is distributed on.
+ *
+ * nprocs_symb (input) int
+ *         Number of processors on which the symbolic factorization is
+ *         performed.  It is equal to the number of independent domains
+ *         identified in the graph partitioning algorithm executed
+ *         previously and has to be a power of 2.  It corresponds to
+ *         number of leaves in the separator tree.
+ *
+ * A       (input) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The
+ *         number of the linear equations is A->nrow.  Matrix A is
+ *         distributed in NRformat_loc format.
+ *         Matrix A is not yet permuted by perm_c.
+ *
+ * perm_c  (input) int_t*
+ *	   Column permutation vector of size A->ncol, which defines the 
+ *         permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *         in position j in A*Pc.
+ *
+ * perm_r  (input) int_t*
+ *	   Row permutation vector of size A->nrow, which defines the 
+ *         permutation matrix Pr; perm_r[i] = j means row i of A is 
+ *         in position j in Pr*A.
+ *
+ * sizes   (input) int_t*
+ *         Contains the number of vertices in each separator.
+ *
+ * fstVtxSep (input) int_t*
+ *         Contains first vertex for each separator.
+ *
+ * Pslu_freeable (output) Pslu_freeable_t*
+ *         Returns the local L and U structure, and global to local
+ *         information on the indexing of the vertices.  Contains all
+ *         the information necessary for performing the data
+ *         distribution towards the numeric factorization.
+ *				    
+ * num_comm (input) MPI_Comm*
+ *         Communicator for numerical factorization 
+ *
+ * symb_comm (input) MPI_Comm*
+ *         Communicator for symbolic factorization 
+ *
+ * symb_mem_usage (input) superlu_dist_mem_usage_t *
+ *         Statistics on memory usage.
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes allocated on return from the symbolic factorization.
+ *   > 0, number of bytes allocated when out of memory.
+ *
+ * Sketch of the algorithm
+ * =======================
+ *
+ *  Distribute the vertices on the processors using a subtree to
+ *  subcube algorithm.
+ *
+ *  Redistribute the structure of the input matrix A according to the
+ *  subtree to subcube computed previously for the symbolic
+ *  factorization routine.  This implies in particular a distribution
+ *  from nprocs_num processors to nprocs_symb processors.
+ *
+ *  Perform symbolic factorization guided by the separator tree provided by
+ *  a graph partitioning algorithm.  The symbolic factorization uses a 
+ *  combined left-looking, right-looking approach. 
+ * </pre>
+ */
+  NRformat_loc *Astore;
+  int iam, szSep, fstP, lstP, npNode, nlvls, lvl, p, iSep, jSep;
+  int iinfo; /* return code */
+  int_t m, n;
+  int_t nextl, nextu, neltsZr, neltsTotal, nsuper_loc, szLGr, szUGr;
+  int_t ind_blk, nsuper, vtx, min_mn, szsn;
+  long long int nnzL, nnzU, nnzLU;
+  float stat_loc[23], stat_glob[23], mem_glob[15];
+  
+  Llu_symbfact_t Llu_symbfact; /* local L and U and pruned L and U data structures */
+  vtcsInfo_symbfact_t VInfo; /* local information on number of blocks,
+				number of vertices in a block etc */
+  matrix_symbfact_t   AS; /* temporary storage for the input matrix after redistribution */
+  comm_symbfact_t CS;  /* information on communication */
+  /* relaxation parameters (for future release) and 
+     statistics collected during the symbolic factorization */
+  psymbfact_stat_t PS; 
+  /* temp array of size n, used as a marker by the subroutines */
+  int_t *tempArray; 
+  int_t i, j, k;
+  int_t fstVtx, lstVtx, mark, fstVtx_lid, vtx_lid, maxNvtcsPProc;
+  int_t nnz_asup_loc, nnz_ainf_loc, fill_rcmd;
+  float totalMemLU, overestimMem;
+  MPI_Comm *commLvls;  
+
+  /* maximum block size */
+  int_t  maxSzBlk;
+  float flinfo;
+#if ( PRNTlevel >= 1)
+  float stat_msgs_l[10], stat_msgs_g[10]; 
+#endif  
+#if ( PROFlevel>=1 )
+  double t, t_symbFact[3], t_symbFact_loc[3];
+  double *time_lvlsT, *time_lvls, t1, t2, time_lvlsg[9];
+#endif
+  
+  /* Initialization */
+  MPI_Comm_rank ((*num_comm), &iam);
+  commLvls = NULL;
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Enter psymbfact()");
+#endif
+  initParmsAndStats (&PS);
+  if (nprocs_symb != 1) {
+    if (!(commLvls = (MPI_Comm *) SUPERLU_MALLOC(2*nprocs_symb*sizeof(MPI_Comm)))) {
+      fprintf (stderr, "Malloc fails for commLvls[].");  
+      return (PS.allocMem);
+    }
+    PS.allocMem += 2 * nprocs_symb * sizeof(MPI_Comm);
+  }
+  
+  nlvls = (int) LOG2( nprocs_num ) + 1;
+#if ( PROFlevel>=1 )
+  time_lvlsT = (double *) SUPERLU_MALLOC(3*nprocs_symb*(nlvls+1) 
+					 * sizeof(double));
+  time_lvls  = (double *) SUPERLU_MALLOC(3*(nlvls+1) * sizeof(double));
+  if (!time_lvls || !time_lvlsT) {
+    fprintf (stderr, "Malloc fails for time_lvls[].");  
+    return (PS.allocMem);
+  }
+  PS.allocMem += (3*nprocs_symb*(nlvls+1) + 3*(nlvls+1)) * sizeof(double);
+#endif
+  
+  VInfo.xlsub_nextLvl  = 0;
+  VInfo.xusub_nextLvl  = 0;
+  VInfo.maxSzBlk = sp_ienv_dist(3);
+  maxSzBlk = VInfo.maxSzBlk;
+  
+  mark = EMPTY;
+  nsuper_loc = 0;
+  nextl   = 0; nextu      = 0;
+  neltsZr = 0; neltsTotal = 0;
+  
+  m = A->nrow;
+  n = A->ncol;
+  min_mn = SUPERLU_MIN( m, n );
+  
+  if (!(tempArray = intMalloc_symbfact(n))) {
+    fprintf (stderr, "Malloc fails for tempArray[].\n");  
+    return (PS.allocMem);
+  }
+  PS.allocMem += n * sizeof(int_t);
+  
+#if ( PROFlevel>=1 )  
+  t = SuperLU_timer_();
+#endif
+  
+  /* Distribute vertices on processors */
+  if ((flinfo = 
+       symbfact_mapVtcs (iam, nprocs_num, nprocs_symb, A, fstVtxSep, sizes, 
+			 Pslu_freeable, &VInfo, tempArray, maxSzBlk, &PS)) > 0) 
+    return (flinfo);
+
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  
+  /* Redistribute matrix A on processors following the distribution found
+     in symbfact_mapVtcs.  Store the redistributed A temporarily into AS */
+  symbfact_distributeMatrix (iam, nprocs_num, nprocs_symb,  A, 
+			     perm_c, perm_r, &AS, 
+			     Pslu_freeable, &VInfo, tempArray, num_comm);
+  
+  /* THE REST OF THE SYMBOLIC FACTORIZATION IS EXECUTED ONLY BY NPROCS_SYMB
+     PROCESSORS */
+  if ( iam < nprocs_symb ) {
+    
+#if ( PROFlevel>=1 )
+    t_symbFact_loc[0] = SuperLU_timer_() - t;
+    t = SuperLU_timer_();
+    t_symbFact_loc[1] = t;
+#endif
+
+    /* Allocate storage common to the symbolic factor routines */
+    if (iinfo = symbfact_alloc (n, nprocs_symb, Pslu_freeable, 
+				&Llu_symbfact, &VInfo, &CS, &PS)) 
+      return (PS.allocMem);
+    /* Copy the redistributed input matrix AS at the end of the memory buffer
+       allocated to store L and U.  That is, copy (AS.x_ainf, AS.ind_ainf) in
+       (xlsub, lsub), (AS.x_asup, AS.ind_asup) in (xusub, usub).  Free the
+       memory used to store the input matrix */
+    nnz_ainf_loc = VInfo.nnz_ainf_loc;
+    nnz_asup_loc = VInfo.nnz_asup_loc;
+    j = Llu_symbfact.szUsub - VInfo.nnz_asup_loc;
+    k = Llu_symbfact.szLsub - VInfo.nnz_ainf_loc;
+    for (i = 0; i <= VInfo.nvtcs_loc; i++) {
+      Llu_symbfact.xusub[i] = AS.x_asup[i] + j;
+      Llu_symbfact.xlsub[i] = AS.x_ainf[i] + k;
+    }
+    
+    for (i = 0; i < VInfo.nnz_asup_loc; i++, j++) 
+      Llu_symbfact.usub[j] = AS.ind_asup[i];
+    for (i = 0; i < VInfo.nnz_ainf_loc; i++, k++)   
+      Llu_symbfact.lsub[k] = AS.ind_ainf[i];
+    SUPERLU_FREE( AS.x_ainf );
+    SUPERLU_FREE( AS.x_asup );  
+    SUPERLU_FREE( AS.ind_ainf );  
+    SUPERLU_FREE( AS.ind_asup );  
+
+    if (nprocs_symb != 1) {
+      createComm (iam, nprocs_symb, commLvls, symb_comm);    
+
+#if ( PROFlevel>=1 )
+      t_symbFact_loc[2] = SuperLU_timer_();
+#endif
+      if ((flinfo = cntsVtcs (n, iam, nprocs_symb, Pslu_freeable, &Llu_symbfact, 
+			      &VInfo, tempArray, fstVtxSep, sizes, &PS, commLvls)) > 0) 
+	return (flinfo);
+			     
+#if ( PROFlevel>=1 )
+      t_symbFact_loc[2] = SuperLU_timer_() - t_symbFact_loc[2];
+#endif
+    }
+    
+    /* set to EMPTY marker[] array */
+    for (i = 0; i < n; i++)
+      tempArray[i] = EMPTY;
+    
+    szSep = nprocs_symb;
+    iSep = 0;
+    lvl = 0;
+    while (szSep >= 1) {
+      /* for each level in the separator tree */
+      npNode = nprocs_symb / szSep; 
+      fstP = 0; 
+      /* for each node in the level */
+      for (jSep = iSep; jSep < iSep + szSep; jSep++) {
+	fstVtx = fstVtxSep[jSep];
+	lstVtx  = fstVtx + sizes[jSep];
+	/* if this is the first level */
+	if (szSep == nprocs_symb) {
+	  /* compute symbolic factorization for my domain */
+	  if (fstP == iam) {
+	    /* allocate storage for the pruned structures */
+#if ( PROFlevel>=1 )
+	    t1 = SuperLU_timer_();
+#endif
+	    if ((flinfo = allocPrune_domain (fstVtx, lstVtx, 
+					     &Llu_symbfact, &VInfo, &PS)) > 0)
+	      return (flinfo);
+	    if (fstVtx < lstVtx)
+	      VInfo.fstVtx_nextLvl = VInfo.begEndBlks_loc[2];
+	    
+	    domain_symbfact 
+	      (A, iam, lvl, szSep, iSep, jSep, sizes, fstVtxSep, fstVtx, lstVtx, 
+	       Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS, tempArray, 
+	       &mark, &nextl, &nextu, &neltsZr, &neltsTotal, &nsuper_loc);
+
+	    PS.estimLSz = nextl;
+	    PS.estimUSz = nextu;
+	    if (nprocs_symb != 1) 
+	      if((flinfo = allocPrune_lvl (&Llu_symbfact, &VInfo, &PS)) > 0)
+		return (flinfo);
+#if ( PROFlevel>=1 )
+	    t2 = SuperLU_timer_();
+	    time_lvls[lvl] = 0.; time_lvls[lvl+1] = 0.;
+	    time_lvls[lvl + 2] = t2 - t1;
+#endif
+	  }
+	}
+	else {
+	  lstP = fstP + npNode;
+	  if (fstP <= iam && iam < lstP) {
+#if ( PROFlevel>=1 )
+	    t1 = SuperLU_timer_();	  
+#endif
+	    if (VInfo.filledSep != FILLED_SEPS)
+	      initLvl_symbfact(n, iam, fstVtx, lstVtx,
+			       Pslu_freeable, &Llu_symbfact, &VInfo, &PS, commLvls[jSep], 
+			       tempArray, nextl, nextu);
+#if ( PROFlevel>=1 )
+	    t2 = SuperLU_timer_();
+	    time_lvls[3*lvl] = t2 - t1;
+#endif
+	    interLvl_symbfact (A, iam, lvl, szSep, fstP, lstP,
+			       iSep, jSep, sizes, fstVtxSep, 
+			       &nextl, &nextu, &nsuper_loc, &mark, tempArray,
+			       &Llu_symbfact, Pslu_freeable, &CS, &VInfo, &PS,
+			       commLvls[jSep], symb_comm);
+#if ( PROFlevel>=1 )
+	    t1 = SuperLU_timer_();
+	    time_lvls[3*lvl+1] = t1 - t2;
+#endif
+	    if (VInfo.filledSep != FILLED_SEPS)
+	      intraLvl_symbfact 
+		(A, iam, lvl, szSep, iSep, jSep, sizes, fstVtxSep, fstP, lstP, 
+		 fstVtx, lstVtx, Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS,
+		 tempArray, &mark, &nextl, &nextu, &neltsZr, &neltsTotal, 
+		 &nsuper_loc, commLvls[jSep], symb_comm);
+#if ( PROFlevel>=1 )
+	    t2 = SuperLU_timer_();
+	    time_lvls[3*lvl+2] = t2 - t1;		 
+#endif
+	  }
+	}
+	fstP += npNode;
+      }
+      iSep += szSep;
+      szSep = szSep / 2;
+      lvl ++;
+    }
+  
+    SUPERLU_FREE( tempArray );
+    
+    /* Set up global information and collect statistics */
+    if (PS.maxSzLPr < Llu_symbfact.indLsubPr)
+      PS.maxSzLPr = Llu_symbfact.indLsubPr;
+    if (PS.maxSzUPr < Llu_symbfact.indUsubPr)
+      PS.maxSzUPr = Llu_symbfact.indUsubPr;
+    
+    Llu_symbfact.xlsub[VInfo.nvtcs_loc] = nextl;
+    Llu_symbfact.xusub[VInfo.nvtcs_loc] = nextu;
+    fill_rcmd = SUPERLU_MAX( nextl / (nnz_ainf_loc+1), nextu / (nnz_asup_loc+1)) + 1;
+    Pslu_freeable->xsup_beg_loc = intMalloc_dist (nsuper_loc+1);
+    Pslu_freeable->xsup_end_loc = intMalloc_dist (nsuper_loc+1);
+    if (!Pslu_freeable->xsup_beg_loc || !Pslu_freeable->xsup_end_loc) {
+      fprintf (stderr, "Malloc fails for xsup_beg_loc, xsup_end_loc.");
+      return (PS.allocMem);
+    }
+    PS.allocMem += 2 * (nsuper_loc+1) * sizeof(int_t);
+    maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+    nnzL = 0; nnzU = 0;
+    
+    i = 0;
+    nsuper = 0;
+    ind_blk = 0;
+    for (ind_blk = 0; ind_blk < VInfo.nblks_loc; ind_blk ++) {
+      fstVtx = VInfo.begEndBlks_loc[2 * ind_blk];
+      lstVtx = VInfo.begEndBlks_loc[2 * ind_blk + 1];
+      fstVtx_lid = LOCAL_IND( Pslu_freeable->globToLoc[fstVtx] );
+      nsuper = Pslu_freeable->supno_loc[fstVtx_lid];
+      Pslu_freeable->xsup_beg_loc[nsuper] = fstVtx;
+      szsn = 1;
+      if (INT_MAX - nnzL <= Llu_symbfact.xlsub[fstVtx_lid + 1] - 
+	  Llu_symbfact.xlsub[fstVtx_lid])
+	printf ("PE[%d] ERR nnzL %lld\n", iam, nnzL); 
+      if (INT_MAX - nnzU <= Llu_symbfact.xusub[fstVtx_lid + 1] - 
+	  Llu_symbfact.xusub[fstVtx_lid])
+	printf ("PE[%d] ERR nnzU %lld\n", iam, nnzU);
+      
+      j = Llu_symbfact.xlsub[fstVtx_lid + 1] - Llu_symbfact.xlsub[fstVtx_lid];
+      k = Llu_symbfact.xusub[fstVtx_lid + 1] - Llu_symbfact.xusub[fstVtx_lid];
+      nnzL += j;
+      nnzU += k;
+
+      for (vtx = fstVtx + 1, vtx_lid = fstVtx_lid + 1; 
+	   vtx < lstVtx; vtx++, vtx_lid ++) {
+	if (Pslu_freeable->supno_loc[vtx_lid] != nsuper) {
+	  nsuper = Pslu_freeable->supno_loc[vtx_lid];
+	  Pslu_freeable->xsup_end_loc[nsuper-1] = vtx;
+	  Pslu_freeable->xsup_beg_loc[nsuper] = vtx;
+	  szsn = 1;
+	  j = Llu_symbfact.xlsub[vtx_lid + 1] - Llu_symbfact.xlsub[vtx_lid];
+	  k = Llu_symbfact.xusub[vtx_lid + 1] - Llu_symbfact.xusub[vtx_lid];
+	}
+	else {
+	  szsn ++;
+	}
+	nnzL += j - szsn + 1;
+	nnzU += k - szsn + 1;
+      }
+      Pslu_freeable->xsup_end_loc[nsuper] = lstVtx;
+    }
+    Pslu_freeable->supno_loc[VInfo.nvtcs_loc] = nsuper_loc;
+    Pslu_freeable->nvtcs_loc = VInfo.nvtcs_loc; 
+
+    /* set up xsup data */
+    Pslu_freeable->lsub = Llu_symbfact.lsub;
+    Pslu_freeable->xlsub = Llu_symbfact.xlsub;
+    Pslu_freeable->usub = Llu_symbfact.usub;
+    Pslu_freeable->xusub = Llu_symbfact.xusub;
+    Pslu_freeable->szLsub = Llu_symbfact.szLsub;
+    Pslu_freeable->szUsub = Llu_symbfact.szUsub;
+    
+#if ( PROFlevel>=1 )
+    t_symbFact_loc[1] = SuperLU_timer_() - t_symbFact_loc[1];
+#endif  
+
+#if ( PRNTlevel>=1 )
+    estimate_memUsage (n, iam,  symb_mem_usage, 
+		       &totalMemLU, &overestimMem, 
+		       Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS);
+    stat_loc[0] = (float) nnzL;
+    stat_loc[1] = (float) nnzU;  
+    stat_loc[2] = (float) nsuper_loc;
+    stat_loc[3] = (float) Pslu_freeable->xlsub[VInfo.nvtcs_loc];
+    stat_loc[4] = (float) Pslu_freeable->xusub[VInfo.nvtcs_loc];
+    stat_loc[5] = totalMemLU;
+    stat_loc[6] = overestimMem;
+    stat_loc[7] = totalMemLU - overestimMem;
+    stat_loc[8] = (float) PS.maxSzBuf;
+    stat_loc[9] = (float) PS.nDnsUpSeps;
+    stat_loc[10] = (float) PS.nDnsCurSep;
+    stat_loc[11] = (float) (Llu_symbfact.no_expand + Llu_symbfact.no_expcp +
+			    Llu_symbfact.no_expand_pr);
+    stat_loc[12] = (float) Llu_symbfact.no_expand;
+    stat_loc[13] = (float) Llu_symbfact.no_expcp;
+    stat_loc[14] = (float) Llu_symbfact.no_expand_pr;
+    stat_loc[15] = (float) fill_rcmd;
+    stat_loc[16] = PS.nops;
+    stat_loc[17] = PS.fill_pelt[1];
+    stat_loc[18] = PS.fill_pelt[4];
+    stat_loc[19] = PS.fill_pelt[0];
+    stat_loc[20] = PS.fill_pelt[2];
+    stat_loc[21] = PS.fill_pelt[3];
+    stat_loc[22] = PS.fill_pelt[5];
+    
+    MPI_Reduce (stat_loc, stat_glob, 23, MPI_FLOAT, 
+		MPI_SUM, 0, (*symb_comm));
+    MPI_Reduce (&(stat_loc[5]), mem_glob, 14, MPI_FLOAT, 
+		MPI_MAX, 0, (*symb_comm));
+    fill_rcmd = (int_t) mem_glob[10];
+    PS.fill_pelt[0] = stat_glob[19];
+    PS.fill_pelt[1] = mem_glob[12];
+    PS.fill_pelt[2] = stat_glob[20];
+    PS.fill_pelt[3] = stat_glob[21];
+    PS.fill_pelt[4] = mem_glob[13];
+    PS.fill_pelt[5] = stat_glob[22];
+    if (PS.fill_pelt[2] == 0.) PS.fill_pelt[2] = 1.;
+    if (PS.fill_pelt[5] == 0.) PS.fill_pelt[5] = 1.;
+    
+#if ( PROFlevel>=1 )
+    MPI_Reduce (t_symbFact_loc, t_symbFact, 3, MPI_DOUBLE,
+		MPI_MAX, 0, (*symb_comm));
+    MPI_Gather (time_lvls, 3 * nlvls, MPI_DOUBLE,
+		time_lvlsT, 3 * nlvls , MPI_DOUBLE,
+		0, (*symb_comm));
+#endif
+    
+    stat_msgs_l[0] = (float) PS.maxsz_msgSnd;
+    stat_msgs_l[1] = (float) PS.maxsz_msgSnd;
+    if (PS.maxsz_msgSnd < PS.maxsz_msgCol)
+      stat_msgs_l[1] = PS.maxsz_msgCol;
+    stat_msgs_l[2] = PS.no_shmSnd + PS.no_msgsSnd + 
+      PS.no_shmRcvd + PS.no_msgsRcvd;
+    stat_msgs_l[3] = stat_msgs_l[2] + PS.no_msgsCol;
+    stat_msgs_l[4] = stat_msgs_l[2];
+    stat_msgs_l[5] = stat_msgs_l[3]; 
+    stat_msgs_l[6] = PS.no_msgsSnd;
+    stat_msgs_l[7] = PS.no_msgsSnd + PS.no_msgsCol;  
+    stat_msgs_l[8] = PS.sz_msgsSnd;
+    stat_msgs_l[9] = PS.sz_msgsSnd + PS.sz_msgsCol;
+    MPI_Reduce (stat_msgs_l, stat_msgs_g, 4, MPI_FLOAT,
+		MPI_MAX, 0, (*symb_comm));
+    MPI_Reduce (&(stat_msgs_l[4]), &(stat_msgs_g[4]), 6, MPI_FLOAT,
+		MPI_SUM, 0, (*symb_comm));
+    if (stat_msgs_g[6] == 0) stat_msgs_g[6] = 1;
+    if (stat_msgs_g[7] == 0) stat_msgs_g[7] = 1;
+    
+    if (!iam) {
+      nnzL   = (long long) stat_glob[0]; nnzU  = (long long) stat_glob[1];
+      nsuper = (int_t) stat_glob[2];
+      szLGr  = (int_t) stat_glob[3]; szUGr = (int_t) stat_glob[4];
+      printf("\tMax szBlk          %ld\n", (long long) VInfo.maxSzBlk);
+#if ( PRNTlevel>=2 )
+      printf("\t relax_gen %.2f, relax_curSep %.2f, relax_seps %.2f\n",
+	     PS.relax_gen, PS.relax_curSep, PS.relax_seps);
+#endif
+      printf("\tParameters: fill mem %ld fill pelt %ld\n",
+	     (long long) sp_ienv_dist(6), (long long) PS.fill_par);
+      printf("\tNonzeros in L       %ld\n", nnzL);
+      printf("\tNonzeros in U       %ld\n", nnzU);
+      nnzLU = nnzL + nnzU;
+      printf("\tnonzeros in L+U-I   %ld\n", nnzLU);
+      printf("\tNo of supers   %ld\n", (long long) nsuper);
+      printf("\tSize of G(L)   %ld\n", (long long) szLGr);
+      printf("\tSize of G(U)   %ld\n", (long long) szUGr);
+      printf("\tSize of G(L+U) %ld\n", (long long) szLGr+szUGr);
+
+      printf("\tParSYMBfact (MB)      :\tL\\U MAX %.2f\tAVG %.2f\n",
+	     mem_glob[0]*1e-6, 
+	     stat_glob[5]/nprocs_symb*1e-6);
+#if ( PRNTlevel>=2 )
+      printf("\tRL overestim (MB):\tL\\U MAX %.2f\tAVG %.2f\n",
+	     mem_glob[1]*1e-6, 
+	     stat_glob[6]/nprocs_symb*1e-6);
+      printf("\tsnd/rcv buffers (MB):\tL\\U MAX %.2f\tAVG %.2f\n",
+	     mem_glob[3]*1e-6, 
+	     stat_glob[8]/nprocs_symb*1e-6);
+      printf("\tSYMBfact 2*n+4*nvtcs_loc+2*maxNvtcsNds_loc:\tL\\U %.2f\n",
+	     (float) (2 * n * sizeof(int_t)) *1e-6);
+      printf("\tint_t %d, int %d, long int %d, short %d, float %d, double %d\n", 
+	     sizeof(int_t), sizeof(int),  sizeof(long int), sizeof(short), sizeof(float),
+	     sizeof(double));
+      printf("\tDNS ALLSEPS:\t MAX %d\tAVG %.2f\n",
+	     (int_t) mem_glob[4], stat_glob[9]/nprocs_symb);
+      printf("\tDNS CURSEP:\t MAX %d\tAVG %.2f\n\n",
+	     (int_t) mem_glob[5], stat_glob[10]/nprocs_symb);
+
+      printf("\t MAX FILL Mem(L+U) / Mem(A) per processor %ld\n", fill_rcmd);    
+      printf("\t      Per elt MAX %ld AVG %ld\n", 
+	     (int_t) PS.fill_pelt[4], (int_t)(PS.fill_pelt[3]/PS.fill_pelt[5]));
+      printf("\t      Per elt RL MAX %ld AVG %ld\n",
+	     (int_t) PS.fill_pelt[1], (int_t)(PS.fill_pelt[0]/PS.fill_pelt[2]));
+      printf("\tM Nops:\t MAX %.2f\tAVG %.2f\n",
+	     mem_glob[11]*1e-6, (stat_glob[16]/nprocs_symb)*1e-6);
+      
+      
+      printf("\tEXPANSIONS: MAX/AVG\n");
+      printf("\tTOTAL: %d / %.2f\n",
+	     (int_t) mem_glob[6], stat_glob[11]/nprocs_symb);
+      printf("\tREALLOC: %.f / %.2f RL_CP %.f / %.2f PR_CP %.f / %.2f\n",
+	     mem_glob[7], stat_glob[12]/nprocs_symb,
+	     mem_glob[8], stat_glob[13]/nprocs_symb,
+	     mem_glob[9], stat_glob[14]/nprocs_symb);
+      
+      printf ("\n\tDATA MSGS  noMsgs*10^3 %.3f/%.3f size (MB) %.3f/%.3f \n",
+	      stat_msgs_g[2]*1e-3, stat_msgs_g[4]/nprocs_symb*1e-3,
+	      stat_msgs_g[0]*1e-6, stat_msgs_g[8] / stat_msgs_g[6]*1e-6);
+      printf ("\tTOTAL MSGS noMsgs*10^3 %.3f/%.3f size (MB) %.3f/%.3f \n",
+	      stat_msgs_g[3]*1e-3, stat_msgs_g[5]/nprocs_symb*1e-3,
+	      stat_msgs_g[1]*1e-6, stat_msgs_g[9]/stat_msgs_g[7]*1e-6);
+#endif      
+
+#if ( PROFlevel>=1 )
+      printf("Distribute matrix time = %8.3f\n", t_symbFact[0]);
+      printf("Count vertices time    = %8.3f\n", t_symbFact[2]);
+      printf("Symbfact DIST time     = %8.3f\n", t_symbFact[1]);
+      
+      printf("\nLvl\t    Time\t    Init\t   Inter\t    Intra\n");
+      time_lvlsg[0] = 0.;
+      for (i = 0; i < nlvls; i++) {
+	for (j = 1; j < 9; j++)
+	  time_lvlsg[j] = 0.;
+	for (p = 0; p < nprocs_symb; p++) {
+	  k = p * 3 * nlvls;
+	  t = time_lvlsT[i*3+k] + time_lvlsT[i*3+k+1] + time_lvlsT[i*3+k+2];
+	  if (t > time_lvlsg[1]) {
+	    time_lvlsg[1] = t; j = p;
+	  }
+	  time_lvlsg[2] += t;
+	  if (time_lvlsT[i*3+k] > time_lvlsg[3])
+	    time_lvlsg[3] = time_lvlsT[i*3+k];
+	  time_lvlsg[4] += time_lvlsT[i*3+k];
+	  if (time_lvlsT[i*3+k+1] > time_lvlsg[5])
+	    time_lvlsg[5] = time_lvlsT[i*3+k+1];
+	  time_lvlsg[6] += time_lvlsT[i*3+k+1];
+	  if (time_lvlsT[i*3+k+2] > time_lvlsg[7])
+	    time_lvlsg[7] = time_lvlsT[i*3+k+2];
+	  time_lvlsg[8] += time_lvlsT[i*3+k+2];
+	}
+	time_lvlsg[0] += time_lvlsg[1];
+	printf ("%d \t%.3f/%.3f\t%.3f/%.3f\t%.3f/%.3f\t%.3f/%.3f\n", i,
+		time_lvlsg[1], time_lvlsg[2] / nprocs_symb,
+		time_lvlsg[3], time_lvlsg[4] / nprocs_symb,
+		time_lvlsg[5], time_lvlsg[6] /nprocs_symb,
+		time_lvlsg[7], time_lvlsg[8] / nprocs_symb); 
+      }
+      printf("\t   %8.3f \n", time_lvlsg[0]);    
+#endif
+    }
+#endif
+#if ( PROFlevel>=1 )
+    SUPERLU_FREE (time_lvls);
+    SUPERLU_FREE (time_lvlsT);
+#endif
+    symbfact_free (iam, nprocs_symb, &Llu_symbfact, &VInfo, &CS);
+  } /* if (iam < nprocs_symb) */  
+  else {
+    /* update Pslu_freeable before returning */
+    Pslu_freeable->nvtcs_loc = 0; 
+    Pslu_freeable->xlsub = NULL; Pslu_freeable->lsub = NULL; 
+    Pslu_freeable->xusub = NULL; Pslu_freeable->usub = NULL; 
+    Pslu_freeable->supno_loc = NULL;
+    Pslu_freeable->xsup_beg_loc = NULL;     
+    Pslu_freeable->xsup_end_loc = NULL;
+    
+    SUPERLU_FREE( tempArray );
+    PS.allocMem -= n * sizeof(int_t);
+  }
+
+  if (iam < nprocs_symb && nprocs_symb != 1) 
+    freeComm (iam, nprocs_symb, commLvls, symb_comm);     
+  if (commLvls != NULL)
+    SUPERLU_FREE( commLvls );
+  
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Exit psymbfact()");
+#endif
+
+  return (- PS.allocMem);
+} /* SYMBFACT_DIST */
+
+
+static int_t
+initParmsAndStats
+(
+ psymbfact_stat_t *PS /* Output - statistics record to be reset */
+)
+/*! \brief
+ * <pre>
+ * Purpose
+ * =======
+ * Put the relaxation parameters and all statistics fields of PS into
+ * their starting state: the three relaxation factors are set to 1.0,
+ * the fill parameter is taken from sp_ienv_dist(6), and every counter,
+ * size and fill-per-element slot is set to zero.
+ *
+ * Return value
+ * ============
+ * Always 0.
+ * </pre>
+ */
+{
+  int  slot;
+
+  /* tunable parameters */
+  PS->fill_par = sp_ienv_dist(6);
+  PS->relax_seps = 1.0;
+  PS->relax_curSep = 1.0;
+  PS->relax_gen = 1.0;
+
+  /* dense separator counters and operation count */
+  PS->nDnsUpSeps = 0;
+  PS->nDnsCurSep = 0;
+  PS->nops = 0.;
+
+  /* messages sent */
+  PS->no_shmSnd = 0.;
+  PS->no_msgsSnd = 0.;
+  PS->sz_msgsSnd = 0.;
+  PS->maxsz_msgSnd = 0;
+
+  /* messages received */
+  PS->no_shmRcvd = 0.;
+  PS->no_msgsRcvd = 0.;
+  PS->sz_msgsRcvd = 0.;
+  PS->maxsz_msgRcvd = 0;
+
+  /* collective messages */
+  PS->no_msgsCol = 0.;
+  PS->sz_msgsCol = 0.;
+  PS->maxsz_msgCol = 0;
+
+  /* fill-per-element statistics (six slots) */
+  slot = 0;
+  while (slot < 6) {
+    PS->fill_pelt[slot] = 0.;
+    slot++;
+  }
+
+  /* size estimates and memory accounting */
+  PS->estimLSz = 0;
+  PS->estimUSz = 0;
+  PS->maxSzUPr = 0;
+  PS->maxSzLPr = 0;
+  PS->szDnsSep = 0;
+  PS->maxSzBuf = 0;
+  PS->allocMem = 0;
+
+  return 0;
+}
+
+static float
+cntsVtcs 
+(
+ int_t  n,           /* Input - order of the input matrix */
+ int    iam,         /* Input - my processor number */
+ int    nprocs_symb, /* Input - no of processors for symbolic factorization */
+ Pslu_freeable_t *Pslu_freeable, /* Input -globToLoc and maxNvtcsPProc */
+ Llu_symbfact_t  *Llu_symbfact, /* Input/Output -local L, U data structures */
+ vtcsInfo_symbfact_t *VInfo,  /* Input - local info on vertices distribution */
+ int_t            *tempArray, /* Input - temporary storage; entries 0..n-1
+				 are overwritten by this routine */
+ int_t            *fstVtxSep, /* Input - first vertex of each node in the tree */
+ int_t            *sizes,     /* Input - sizes of each node in the tree */
+ psymbfact_stat_t *PS,  /* Input/Output -statistics */
+ MPI_Comm         *commLvls /* Input - communicators indexed by tree node;
+				commLvls[j] is used for the reduction
+				performed at node j */
+ )
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ * 
+ * Computes an estimation of the number of elements in columns of L
+ * and rows of U.  Stores this information in cntelt_vtcs, and it will
+ * be used in the right-looking symbolic factorization.
+ *
+ * Outline
+ * =======
+ *
+ * The separator tree is visited level by level, starting with the
+ * nprocs_symb leaves (szSep nodes per level, halved after each level,
+ * npNode = nprocs_symb / szSep processes per node).  A process only
+ * works on the node of each level whose process range contains iam.
+ *
+ * For such a node the local blocks that begin before the end of the
+ * node are scanned, and for every index found in the lsub / usub lists
+ * of a local vertex, tempArray[index] receives the number of the
+ * scanned vertex if it still holds the marker value n.
+ *
+ * At the leaf level nothing else is done.  At the other levels the
+ * entries fstVtx..n-1 of tempArray are combined with MPI_MIN over
+ * commLvls[j] into minElt_vtx, and the result is turned into per-block
+ * counts that are written to cntelt_vtcs for the local vertices of the
+ * node.
+ *
+ * Return value
+ * ============
+ *
+ * SUCCES_RET on normal completion; the current value of PS->allocMem
+ * when the allocation of minElt_vtx fails.
+ * </pre>
+ */
+{
+  int   fstP, lstP, szSep, npNode, i, j;
+  int_t nvtcs_loc, ind_blk, vtx, vtx_lid, ii, jj, lv, vtx_elt, cur_blk;
+  int_t fstVtx, lstVtx, fstVtx_blk, lstVtx_blk;
+  int_t nelts, nelts_new_blk;
+  int_t *xlsub, *lsub, *xusub, *usub, *globToLoc, maxNvtcsPProc;
+  int_t *minElt_vtx, *cntelt_vtcs;
+  
+  /* Initialization: keep the local L/U index arrays and the vertex
+     distribution data in local variables */
+  xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub;
+  xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub;
+  cntelt_vtcs = Llu_symbfact->cntelt_vtcs;
+  globToLoc = Pslu_freeable->globToLoc;
+  nvtcs_loc = VInfo->nvtcs_loc;
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  if (Llu_symbfact->szLsub - VInfo->nnz_ainf_loc > n)
+    minElt_vtx = lsub; /* use lsub itself as the reduction buffer.
+			  NOTE(review): the buffer starts at lsub[0] and is
+			  written by MPI_Allreduce below while lsub[] is
+			  still read in later iterations - confirm that this
+			  overlap is intended */
+  else { 
+    /* allocate memory for minElt_vtx */
+    if (!(minElt_vtx = intMalloc_dist(n))) {
+      fprintf(stderr, "Malloc fails for minElt_vtx[].");
+      return (PS->allocMem);
+    }
+    PS->allocMem += n * sizeof (int_t);
+  } 
+  
+  for (ii = 0; ii < n; ii++) 
+    tempArray[ii] = n; /* n marks "not reached by any vertex yet" */
+  for (ii = 0; ii < nvtcs_loc; ii++)
+    cntelt_vtcs[ii] = 0;
+
+  szSep = nprocs_symb; /* number of nodes in the current level */
+  i = 0;               /* index of the first node of the current level */
+  cur_blk = 0;         /* position in begEndBlks_loc of the next local block */
+  vtx_lid = 0;         /* local index of the next local vertex */
+  while (szSep >= 1) {
+    /* for each level in the separator tree */
+    npNode = nprocs_symb / szSep; 
+    fstP = 0; 
+    /* for each node in the level */
+    for (j = i; j < i + szSep; j++) {
+      fstVtx = fstVtxSep[j];
+      lstVtx  = fstVtx + sizes[j]; /* one past the last vertex of node j */
+      lstP = fstP + npNode;
+
+      if (fstP <= iam && iam < lstP) {      /* iam takes part in node j */
+	ind_blk = cur_blk;
+	ii = vtx_lid;
+	/* scan the local blocks that begin before the end of node j.
+	   begEndBlks_loc[ind_blk] is read before ind_blk is compared with
+	   2 * nblks_loc; symbfact_mapVtcs allocates 2 * nblks_loc + 1
+	   entries and stores n in the last one, so the read at
+	   ind_blk == 2 * nblks_loc is inside the array */
+	while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && 
+	       ind_blk < 2 * VInfo->nblks_loc) {	  
+	  fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
+	  lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
+	  ind_blk += 2;
+	  for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, ii++) {
+	    for (jj = xlsub[ii]; jj < xlsub[ii+1]; jj++) {
+	      vtx_elt = lsub[jj];
+	      if (tempArray[vtx_elt] == n) { /* first vertex reaching vtx_elt */
+		tempArray[vtx_elt] = vtx;
+	      }
+	    }
+	    for (jj = xusub[ii]; jj < xusub[ii+1]; jj++) {
+	      vtx_elt = usub[jj];
+	      if (tempArray[vtx_elt] == n) { /* same marking for the U lists */
+		tempArray[vtx_elt] = vtx;
+	      }
+	    }
+	  }	  
+	} 
+	if (szSep == nprocs_symb) 
+	  vtx_lid = ii; /* leaf level: only skip over the scanned vertices */
+	else {
+	  /* element-wise minimum, over the processes of node j, of the
+	     entries fstVtx..n-1 of tempArray */
+	  MPI_Allreduce (&(tempArray[fstVtx]), &(minElt_vtx[fstVtx]), 
+			 (int) (n - fstVtx), mpi_int_t, MPI_MIN, commLvls[j]);
+#if ( PRNTlevel>=1 )
+	  PS->no_msgsCol += (float) (2 * (int) LOG2( npNode ));
+	  PS->sz_msgsCol += (float) (n - fstVtx);
+	  if (PS->maxsz_msgCol < n - fstVtx) 
+	    PS->maxsz_msgCol = n - fstVtx;      
+#endif
+
+	  nelts = 0; /* entries whose minimum lies before this node */
+	  for (ii = fstVtx; ii < lstVtx; ii++)
+	    tempArray[ii] = 0; /* from here on used as per-vertex counters */
+	  for (ii = fstVtx; ii < n; ii++) {
+	    if (minElt_vtx[ii] != n) {
+	      if (minElt_vtx[ii] < fstVtx)
+		nelts ++;
+	      else
+		tempArray[minElt_vtx[ii]] ++;
+	      if (ii > lstVtx) /* NOTE(review): lstVtx is one past the node,
+				  so index lstVtx itself is not refreshed
+				  here - confirm that > (not >=) is meant */
+		tempArray[ii] = minElt_vtx[ii];
+	    }
+	  }
+	
+	  ind_blk = cur_blk;
+	  lv = fstVtx; /* first vertex not yet added to nelts */
+	  while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && 
+		 ind_blk < 2 * VInfo->nblks_loc) {	  
+	    fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
+	    lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
+	    ind_blk += 2;
+	    
+	    /* add the counters of the vertices lying between the previous
+	       local block and this one */
+	    for (ii = lv; ii < fstVtx_blk; ii++)
+	      nelts += tempArray[ii];
+	    lv = lstVtx_blk;
+
+	    nelts_new_blk = 0;
+	    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
+	      nelts_new_blk += tempArray[vtx];
+	      cntelt_vtcs[vtx_lid] = nelts; /* same value for every vertex
+					       of the block */
+	    }
+	    nelts += nelts_new_blk;
+	  }
+	} /* if (szSep != nprocs_symb) */
+	cur_blk = ind_blk;
+      }
+      fstP += npNode;
+    }
+    i += szSep;
+    szSep = szSep / 2;
+  }
+  /* free memory (only when minElt_vtx was allocated above) */
+  if (minElt_vtx != lsub) {
+    SUPERLU_FREE (minElt_vtx);
+    PS->allocMem -= n * sizeof(int_t);
+  }
+  return (SUCCES_RET);
+}
+
+static float
+symbfact_mapVtcs
+(
+ int iam,             /* Input -process number */
+ int nprocs_num,      /* Input -number of processors */
+ int nprocs_symb,     /* Input -number of procs for symbolic factorization */
+ SuperMatrix *A,      /* Input -input distributed matrix A */
+ int_t *fstVtxSep,    /* Input -first vertex in each separator */
+ int_t *sizes,        /* Input -size of each separator in the separator tree */
+ Pslu_freeable_t *Pslu_freeable, /* Output -globToLoc and maxNvtcsPProc
+                                    computed */
+ vtcsInfo_symbfact_t *VInfo, /* Output -local info on vertices distribution */
+ int_t *tempArray,    /* Input -temp array of size n = order of the matrix */
+ int_t  maxSzBlk,     /* Input -maximum number of vertices in a block */
+ psymbfact_stat_t *PS /* Input/Output -statistics */
+ )
+{
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ *  symbfact_mapVtcs maps the vertices of the graph of the input
+ *  matrix A on nprocs_symb processors, using the separator tree
+ *  returned by a graph partitioning algorithm from the previous step
+ *  of the symbolic factorization.  The number of processors
+ *  nprocs_symb must be a power of 2.
+ *
+ * Description of the algorithm
+ * ============================
+ *
+ *  A subtree to subcube algorithm is used first to map the processors
+ *  on the nodes of the separator tree.
+ *
+ *  For each node of the separator tree, its corresponding vertices
+ *  are distributed on the processors assigned to this node, using a
+ *  block cyclic distribution with blocks of at most maxSzBlk vertices.
+ *
+ *  After the distribution, fields of the VInfo structure are
+ *  computed (only on processes with iam < nprocs_symb).  The array
+ *  globToLoc and maxNvtcsPProc of Pslu_freeable are also computed;
+ *  globToLoc[k] ends up holding
+ *      (owner of k) * maxNvtcsPProc + (local index of k on its owner).
+ *
+ * Return value
+ * ============
+ *
+ *  SUCCES_RET on success.  When the allocation of globToLoc, avail_pes
+ *  or vtcs_pe fails, a message is written to stderr, the arrays
+ *  already allocated by this routine are released and the current
+ *  value of PS->allocMem is returned.  A failed allocation of
+ *  begEndBlks_loc aborts.
+ * </pre>
+ */
+  int szSep, npNode, firstP, p, iSep, jSep, ind_ap_s;
+  int ind_ap_d = 0;    /* next free slot in avail_pes; reset at every
+                          even-numbered node of a level */
+  int_t k, n, kk;
+  int_t fstVtx, lstVtx;
+  int_t fstVtxBlk, ind_blk;
+  int_t noVtcsProc, noBlk;
+  int_t nvtcs_loc; /* number of vertices owned by process iam */
+  int_t nblks_loc; /* no of blocks owned by process iam */
+  int_t *globToLoc;    /* global indexing to local indexing */
+  int_t maxNvtcsPProc, maxNvtcsNds_loc, nvtcsNds_loc, maxNeltsVtx;
+  /* begin and end vertex of each local block; stays NULL on processes
+     that do not take part in the symbolic factorization */
+  int_t *begEndBlks_loc = NULL;
+  int_t *vtcs_pe;  /* contains the number of vertices on each processor */
+  int   *avail_pes; /* contains the processors to be used at each level */
+
+  n = A->ncol;
+  /* allocate memory; on failure release what this routine already
+     obtained, since nothing has been handed to the caller yet */
+  if (!(globToLoc = intMalloc_dist(n + 1))) {
+    fprintf (stderr, "Malloc fails for globToLoc[].");
+    return (PS->allocMem);
+  }
+  PS->allocMem += (n+1) * sizeof(int_t);
+  if (!(avail_pes = (int *) SUPERLU_MALLOC(nprocs_symb*sizeof(int)))) {
+    fprintf (stderr, "Malloc fails for avail_pes[].");
+    SUPERLU_FREE (globToLoc);
+    return (PS->allocMem);
+  }
+  PS->allocMem += nprocs_symb*sizeof(int);
+  if (!(vtcs_pe = (int_t *) SUPERLU_MALLOC(nprocs_symb*sizeof(int_t)))) {
+    fprintf (stderr, "Malloc fails for vtcs_pe[].");
+    SUPERLU_FREE (avail_pes);
+    SUPERLU_FREE (globToLoc);
+    return (PS->allocMem);
+  }
+  PS->allocMem += nprocs_symb*sizeof(int_t);
+
+  /* Initialization */
+  globToLoc[n] = n;   /* sentinel entry after the last vertex */
+  for (p = 0; p < nprocs_symb; p++) {
+    vtcs_pe[p] = 0;
+    avail_pes[p] = EMPTY;
+  }
+  nvtcs_loc = 0;
+  nblks_loc = 0;
+  maxNvtcsNds_loc = 0;
+  maxNeltsVtx     = 0;
+
+  /* distribute data among processors */
+  szSep = nprocs_symb;
+  iSep = 0;
+  while (szSep >= 1) {
+    /* for each level in the separator tree */
+    npNode = nprocs_symb / szSep;
+    firstP = 0;
+    nvtcsNds_loc = 0;
+
+    for (jSep = iSep; jSep < iSep + szSep; jSep++) {
+      /* for each node in the level */
+      fstVtx = fstVtxSep[jSep];
+      lstVtx = fstVtx + sizes[jSep];
+      if (firstP <= iam && iam < firstP + npNode)
+        maxNeltsVtx += lstVtx - fstVtx;
+
+      if (szSep == nprocs_symb) {
+        /* leaves of the separator tree: the whole node goes to firstP */
+        for (k = fstVtx; k < lstVtx; k++) {
+          globToLoc[k] = (int_t) firstP;
+          vtcs_pe[firstP] ++;
+        }
+        if (firstP == iam) {
+          nvtcs_loc += lstVtx - fstVtx;
+          if (fstVtx != lstVtx)
+            nblks_loc ++;
+        }
+      }
+      else {
+        /* superior levels of the separator tree */
+        k = fstVtx;
+        noVtcsProc = maxSzBlk;
+        fstVtxBlk = fstVtx;
+        if ((jSep - iSep) % 2 == 0) ind_ap_d = (jSep - iSep) * npNode;
+        /* first allocate processors from previous levels */
+        for (ind_ap_s = (jSep-iSep) * npNode;
+             ind_ap_s < (jSep-iSep+1) * npNode; ind_ap_s ++) {
+          p = avail_pes[ind_ap_s];
+          if (p != EMPTY && k < lstVtx) {
+            /* for each column in the separator */
+            avail_pes[ind_ap_s] = EMPTY;
+            kk = 0;
+            while (kk < noVtcsProc && k < lstVtx) {
+              globToLoc[k] = p;
+              vtcs_pe[p] ++;
+              k ++;
+              kk ++;
+            }
+            if (p == iam) {
+              nvtcs_loc += kk;
+              nblks_loc ++;
+              nvtcsNds_loc += kk;
+            }
+          }
+          else {
+            if (p != EMPTY && k == lstVtx) {
+              /* processor still unused: keep it for the next level */
+              avail_pes[ind_ap_s] = EMPTY;
+              avail_pes[ind_ap_d] = p; ind_ap_d ++;
+            }
+          }
+        }
+        noBlk = 0;
+        p = firstP + npNode;
+        while (k < lstVtx) {
+          /* for each column in the separator */
+          kk = 0;
+          p = (int) (noBlk % (int_t) npNode) + firstP;
+          while (kk < noVtcsProc && k < lstVtx) {
+            globToLoc[k] = p;
+            vtcs_pe[p] ++;
+            k ++;
+            kk ++;
+          }
+          if (p == iam) {
+            nvtcs_loc += kk;
+            nblks_loc ++;
+            nvtcsNds_loc += kk;
+          }
+          noBlk ++;
+        } /* while (k < lstVtx) */
+        /* Add the unused processors to the avail_pes list of pes */
+        for (p = p + 1; p < firstP + npNode; p ++) {
+          avail_pes[ind_ap_d] = p; ind_ap_d ++;
+        }
+      }
+      firstP += npNode;
+    }
+    if (maxNvtcsNds_loc < nvtcsNds_loc && szSep != nprocs_symb)
+      maxNvtcsNds_loc = nvtcsNds_loc;
+    iSep += szSep;
+    szSep = szSep / 2;
+  }
+
+#if ( PRNTlevel>=2 )
+  if (!iam)
+    PrintInt10 (" novtcs_pe", nprocs_symb, vtcs_pe);
+#endif
+  /* determine maximum number of vertices among processors; vtcs_pe is
+     cleared at the same time because it is reused below as the running
+     local index of each processor */
+  maxNvtcsPProc = vtcs_pe[0];
+  vtcs_pe[0] = 0;
+  for (p = 1; p < nprocs_symb; p++) {
+    if (maxNvtcsPProc < vtcs_pe[p])
+      maxNvtcsPProc = vtcs_pe[p];
+    vtcs_pe[p] = 0;
+  }
+#if ( PRNTlevel>=2 )
+  /* int_t may be wider than int, so print it through long long */
+  if (!iam)
+    printf ("  MaxNvtcsPerProc %lld MaxNvtcs/Avg %e\n\n",
+            (long long) maxNvtcsPProc,
+            ((float) maxNvtcsPProc * nprocs_symb)/(float)n);
+#endif
+
+  if (iam < nprocs_symb)
+    if (!(begEndBlks_loc = intMalloc_symbfact(2 * nblks_loc + 1)))
+      ABORT("Malloc fails for begEndBlks_loc[].");
+
+  /* Convert globToLoc from "owner" to "owner * maxNvtcsPProc + local
+     index" and record the limits of the blocks owned by iam.  Owners
+     are smaller than nprocs_symb, so p == iam never holds on a process
+     for which begEndBlks_loc was not allocated. */
+  ind_blk = 0;
+  k = 0;
+  while (k < n) {
+    p = globToLoc[k];
+    if (p == iam)
+      begEndBlks_loc[ind_blk] = k;
+    /* test the bound first so the loop does not rely on the sentinel */
+    while (k < n && globToLoc[k] == p) {
+      globToLoc[k] = globToLoc[k] * maxNvtcsPProc + vtcs_pe[p];
+      vtcs_pe[p] ++;
+      k ++;
+    }
+    if (p == iam) {
+      begEndBlks_loc[ind_blk + 1] = k;
+      ind_blk += 2;
+    }
+  }
+  if (iam < nprocs_symb)
+    begEndBlks_loc[2 * nblks_loc] = n;  /* end marker after the last block */
+
+  SUPERLU_FREE (avail_pes);
+  SUPERLU_FREE (vtcs_pe);
+
+  Pslu_freeable->maxNvtcsPProc   = maxNvtcsPProc;
+  Pslu_freeable->globToLoc       = globToLoc;
+  if (iam < nprocs_symb) {
+    VInfo->maxNvtcsNds_loc = maxNvtcsNds_loc;
+    VInfo->nblks_loc       = nblks_loc;
+    VInfo->nvtcs_loc       = nvtcs_loc;
+    VInfo->curblk_loc      = 0;
+    VInfo->maxNeltsVtx     = maxNeltsVtx;
+    VInfo->filledSep       = FALSE;
+    VInfo->xlsub_nextLvl   = 0;
+    VInfo->xusub_nextLvl   = 0;
+    VInfo->begEndBlks_loc  = begEndBlks_loc;
+    VInfo->fstVtx_nextLvl  = begEndBlks_loc[0];
+  }
+  return SUCCES_RET;
+}
+
+static void
+symbfact_distributeMatrix
+(
+ int   iam,             /* Input - my processor number */
+ int   nprocs_num,      /* Input - number of processors */
+ int   nprocs_symb,     /* Input - number of processors for the
+                           symbolic factorization */
+ SuperMatrix *A,        /* Input - input matrix A */
+ int_t *perm_c,         /* Input - column permutation */
+ int_t *perm_r,         /* Input - row permutation */
+ matrix_symbfact_t *AS, /* Output - temporary storage for the
+                           redistributed matrix */
+ Pslu_freeable_t *Pslu_freeable, /* Input - global to local information */
+ vtcsInfo_symbfact_t *VInfo,  /* Input - local info on vertices
+                                 distribution */
+ int_t  *tempArray,     /* Input/Output - temporary array of size n
+                           (order of the matrix) */
+ MPI_Comm    *num_comm  /* Input - communicator for nprocs_num procs */
+ )
+{
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * Distribute input matrix A for the symbolic factorization routine.
+ * Only structural information is distributed.  The redistributed
+ * matrix has its rows and columns permuted according to perm_r and
+ * perm_c. A is not modified during this routine.
+ *
+ * All nprocs_num processes take part in the exchange.  Only the
+ * processes with iam < nprocs_symb convert what they received and
+ * fill AS (x_ainf, ind_ainf, x_asup, ind_asup) and the nnz_ainf_loc /
+ * nnz_asup_loc fields of VInfo.  The work buffers allocated here are
+ * released on every process before returning.  Any allocation
+ * failure aborts.
+ * </pre>
+ */
+/* Notations:
+ * Ainf : inferior part of A, including diagonal.
+ * Asup : superior part of A.
+ */
+  int p, p_irow, code_err, ainf_data;
+  int_t n, m_loc, fst_row;
+  int_t i, j, k, irow, jcol;
+  NRformat_loc *Astore;
+  int_t  nnz_loc, nnz_iam;    /* number of local nonzeros */
+  int_t  nnz_remote; /* number of remote nonzeros to be sent */
+  int_t  SendCnt; /* number of remote nonzeros to be sent */
+  int_t  RecvCnt; /* number of remote nonzeros to be received */
+  /* number of nonzeros to send/receive per processor */
+  int_t  *nnzToSend, *nnzToRecv;
+  int_t *nnzAinf_toSnd; /* nnz in Ainf to send */
+  /* VInfo data structures */
+  int_t *globToLoc, *begEndBlks_loc, nblks_loc, nvtcs_loc, maxNvtcsPProc;
+
+  int_t neltsRow, vtx, vtx_lid, nelts, ind;
+  int_t *snd_aind, *rcv_aind;
+  int_t *ptr_toSnd, *buf, *ptr_toRcv;
+  /* matrix_symbfact_t *As data */
+  int_t *x_ainf, *x_asup, *ind_ainf, *ind_asup;
+  int  *intBuf1, *intBuf2, *intBuf3, *intBuf4;
+
+  /* ------------------------------------------------------------
+     INITIALIZATION.
+     ------------------------------------------------------------*/
+  Astore = (NRformat_loc *) A->Store;
+  n = A->ncol;
+  m_loc = Astore->m_loc;
+  fst_row = Astore->fst_row;
+  globToLoc = Pslu_freeable->globToLoc;
+  /* presumably read by the OWNER / LOCAL_IND macros (defined outside
+     this routine) - verify before removing */
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  /* one zeroed allocation holds three arrays of nprocs_num entries:
+     nnzToRecv, nnzToSend and nnzAinf_toSnd */
+  if ( !(nnzToRecv = intCalloc_symbfact(3 * (int_t)nprocs_num)) )
+    ABORT("Malloc fails for nnzToRecv[].");
+  nnzToSend = nnzToRecv + nprocs_num;
+  nnzAinf_toSnd = nnzToRecv + 2 * nprocs_num;
+
+  /* ---------------------------------------------------------------------
+    COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, THEN ALLOCATE
+    SPACE.  THIS ACCOUNTS FOR THE FIRST PASS OF A.
+    ----------------------------------------------------------------------*/
+  /* tempArray stores the number of nonzeros in each column of ainf */
+  for (i = 0; i < n; i++)
+    tempArray[i] = 0;
+  for (i = 0; i < m_loc; i++) {
+    irow   = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
+    p_irow = OWNER(globToLoc[irow]);
+    neltsRow = 0;
+
+    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; j++) {
+      jcol = perm_c[Astore->colind[j]];
+      if (jcol <= irow) {
+        /* entry of Ainf: goes to the owner of its column */
+        p = OWNER(globToLoc[jcol]);
+        if (tempArray[jcol] == 0) {
+          /* first entry of this column: two header words */
+          nnzToSend[p] += 2;
+          nnzAinf_toSnd[p] += 2;
+        }
+        tempArray[jcol] ++;
+        nnzAinf_toSnd[p] ++;
+      }
+      else {
+        /* entry of Asup: goes to the owner of its row */
+        p = p_irow;
+        neltsRow ++;
+      }
+      nnzToSend[p] ++;
+    }
+    if (neltsRow != 0) {
+      /* two header words for the row */
+      nnzToSend[p_irow] += 2;
+    }
+  }
+
+  /* add one entry which will separate columns of Ainf from rows
+     of Asup */
+  for (p = 0; p < nprocs_num; p++)
+    if (nnzToSend[p] != 0)
+      nnzToSend[p] ++;
+
+  /* All-to-all communication */
+  MPI_Alltoall (nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
+                (*num_comm));
+
+  nnz_loc = SendCnt = RecvCnt = 0;
+  for (p = 0; p < nprocs_num; p++) {
+    if ( p != iam ) {
+      SendCnt += nnzToSend[p];
+      RecvCnt += nnzToRecv[p];
+    } else {
+      nnz_loc += nnzToRecv[p];
+      nnzToSend[p] = 0;
+    }
+  }
+  nnz_iam = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
+
+  /* Allocate temporary storage for sending/receiving the A triplets. */
+  if (!(snd_aind = intMalloc_symbfact(SendCnt)) && SendCnt != 0)
+    ABORT("Malloc fails for snd_aind[].");
+  if ( !(rcv_aind = intMalloc_symbfact(nnz_iam + 1)))
+    ABORT("Malloc fails for rcv_aind[].");
+  if ( !(ptr_toSnd = intCalloc_symbfact((int_t) nprocs_num)) )
+    ABORT("Malloc fails for ptr_toSnd[].");
+  if ( !(ptr_toRcv = intCalloc_symbfact((int_t) nprocs_num)) )
+    ABORT("Malloc fails for ptr_toRcv[].");
+
+  /* setup ptr_toSnd[p] to point to data in snd_aind to be send to
+     processor p; the local part (p == iam) is written directly into
+     rcv_aind, at the offset where it would have been received */
+  for (i = 0, j = 0, p = 0; p < nprocs_num; p++) {
+    if ( p != iam )
+      ptr_toSnd[p] = i;
+    else
+      ptr_toSnd[p] = j;
+    i += nnzToSend[p];
+    j += nnzToRecv[p];
+  }
+
+  /* write the two header words of every nonempty Ainf column and make
+     tempArray[i] the position where its first index will be stored */
+  for (i = 0; i < n; i++) {
+    if (tempArray[i] != 0) {
+      /* column i of Ainf will be send to a processor  */
+      p = OWNER( globToLoc[i] );
+      if (p == iam) {
+        buf = &(rcv_aind[ptr_toSnd[p]]);
+      }
+      else {
+        buf = &(snd_aind[ptr_toSnd[p]]);
+      }
+      buf[0] = tempArray[i];
+      buf[1] = i;
+      tempArray[i] = ptr_toSnd[p] + 2;
+      ptr_toSnd[p] += 2 + buf[0];
+    }
+  }
+
+  /* set ptr_toSnd to point to Asup data (stored by rows); the entry in
+     front of it receives the EMPTY separator */
+  for (i = 0, j = 0, p = 0; p < nprocs_num; p++) {
+    if ( p != iam ) {
+      if (nnzToSend[p] != 0) {
+        snd_aind[i + nnzAinf_toSnd[p]] = EMPTY;
+        ptr_toSnd[p] = i + nnzAinf_toSnd[p] + 1;
+      }
+    }
+    else {
+      if (nnzToRecv[p] != 0) {
+        rcv_aind[j + nnzAinf_toSnd[p]] = EMPTY;
+        ptr_toSnd[p] = j + nnzAinf_toSnd[p] + 1;
+      }
+    }
+    i += nnzToSend[p];
+    j += nnzToRecv[p];
+  }
+
+  /* ------------------------------------------------------------
+     LOAD THE ENTRIES OF A INTO THE snd_aind STRUCTURE TO SEND.
+     THIS ACCOUNTS FOR THE SECOND PASS OF A.
+     For each processor, we store first the columns to be sent,
+     and then the rows to be sent. For each row/column sent:
+     entry 0            : x = number of elements in that row/column
+     entry 1            : row/column number
+     entries 2 .. x + 2 : row/column indices.
+     ------------------------------------------------------------*/
+  for (i = 0; i < m_loc; i++) {
+    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*A */
+    p_irow = OWNER( globToLoc[irow] );
+    ptr_toSnd[p_irow] +=2;   /* leave room for the row header */
+    neltsRow = 0;
+    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; j++) {
+      jcol = perm_c[Astore->colind[j]];
+      if (jcol <= irow) {
+        p = OWNER( globToLoc[jcol] );
+        k = tempArray[jcol];
+        tempArray[jcol] ++;
+        if (p == iam) { /* local */
+          rcv_aind[k] = irow;
+        }
+        else {
+          snd_aind[k] = irow;
+        }
+      }
+      else {
+        p = p_irow;
+        neltsRow ++;
+        k = ptr_toSnd[p];
+        ptr_toSnd[p] ++;
+        if (p == iam) { /* local */
+          rcv_aind[k] = jcol;
+        }
+        else {
+          snd_aind[k] = jcol;
+        }
+      }
+    }
+
+    if (neltsRow == 0)
+      ptr_toSnd[p_irow] -= 2;   /* no Asup entry: give the header back */
+    else {
+      /* store entry 0 and entry 1 */
+      if (p_irow == iam) { /* local */
+        rcv_aind[ptr_toSnd[p_irow] - neltsRow - 2] = neltsRow;
+        rcv_aind[ptr_toSnd[p_irow] - neltsRow - 1] = irow;
+      }
+      else { /* remote */
+        snd_aind[ptr_toSnd[p_irow] - neltsRow - 2] = neltsRow;
+        snd_aind[ptr_toSnd[p_irow] - neltsRow - 1] = irow;
+      }
+    }
+  }
+
+  /* reset ptr_toSnd to point to the beginning of the data for
+     each processor (structure needed in MPI_Alltoallv) */
+  for (i = 0, j = 0, p = 0; p < nprocs_num; p++) {
+    ptr_toSnd[p] = i;
+    i += nnzToSend[p];
+    ptr_toRcv[p] = j;
+    j += nnzToRecv[p];
+  }
+
+  /* ------------------------------------------------------------
+     PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
+     Note: it uses MPI_Alltoallv.
+     ------------------------------------------------------------*/
+  if (nprocs_num > 1) {
+#if defined (_LONGINT)
+    /* MPI_Alltoallv takes int counts and displacements: copy them into
+       int buffers after checking that they fit */
+    if ( !(intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs_num * sizeof(int))) )
+      ABORT("Malloc fails for intBuf1[].");
+    intBuf2 = intBuf1 + nprocs_num;
+    intBuf3 = intBuf1 + 2 * nprocs_num;
+    intBuf4 = intBuf1 + 3 * nprocs_num;
+
+    for (p=0; p<nprocs_num; p++) {
+      if (nnzToSend[p] > INT_MAX || ptr_toSnd[p] > INT_MAX ||
+          nnzToRecv[p] > INT_MAX || ptr_toRcv[p] > INT_MAX)
+        ABORT("ERROR in symbfact_distributeMatrix size to send > INT_MAX\n");
+      intBuf1[p] = (int) nnzToSend[p];
+      intBuf2[p] = (int) ptr_toSnd[p];
+      intBuf3[p] = (int) nnzToRecv[p];
+      intBuf4[p] = (int) ptr_toRcv[p];
+    }
+    intBuf1[iam]=0; /* This corresponds to nnzToSend[iam] */
+    intBuf3[iam]=0; /* This corresponds to nnzToRecv[iam] */
+#else  /* Default */
+    intBuf1 = nnzToSend;  intBuf2 = ptr_toSnd;
+    intBuf3 = nnzToRecv;  intBuf4 = ptr_toRcv;
+    /* the local part is already in rcv_aind: exchange nothing with
+       myself, and put nnzToRecv[iam] back after the call */
+    i = nnzToRecv[iam];
+    nnzToRecv[iam] = 0;
+    nnzToSend[iam] = 0;
+#endif
+
+    MPI_Alltoallv (snd_aind, intBuf1, intBuf2, mpi_int_t,
+                   rcv_aind, intBuf3, intBuf4, mpi_int_t,
+                   (*num_comm));
+
+#if defined (_LONGINT)
+    SUPERLU_FREE (intBuf1);
+#else  /* Default */
+    nnzToRecv[iam] = i;
+#endif
+  }
+
+  /* ------------------------------------------------------------
+     DEALLOCATE SEND STORAGE
+     ------------------------------------------------------------*/
+  if (snd_aind) SUPERLU_FREE( snd_aind );
+  SUPERLU_FREE( ptr_toSnd );
+
+  /* ------------------------------------------------------------
+     CONVERT THE RECEIVED FORMAT INTO THE SYMBOLIC FORMAT.
+     THIS IS PERFORMED ONLY BY NPROCS_SYMB PROCESSORS
+     ------------------------------------------------------------*/
+  if (iam < nprocs_symb) {
+    nblks_loc = VInfo->nblks_loc;
+    begEndBlks_loc = VInfo->begEndBlks_loc;
+    nvtcs_loc = VInfo->nvtcs_loc;
+    /* ------------------------------------------------------------
+       Allocate space for storing indices of A after redistribution.
+       ------------------------------------------------------------*/
+    if (!(x_ainf = intCalloc_symbfact (nvtcs_loc + 1)))
+      ABORT("Malloc fails for x_ainf[].");
+    if (!(x_asup = intCalloc_symbfact (nvtcs_loc + 1)))
+      ABORT("Malloc fails for x_asup[].");
+
+    /* Initialize the array of columns/rows pointers: first store the
+       number of indices of every local column of Ainf (pieces of one
+       column may come from several processes, hence +=) and of every
+       local row of Asup */
+    for (i = 0, p = 0; p < nprocs_num; p++) {
+      ainf_data = TRUE;
+      k = 0;
+      while (k < nnzToRecv[p]) {
+        j = rcv_aind[i + k];
+        if (j == EMPTY) {
+          /* separator: what follows is Asup data */
+          ainf_data = FALSE;
+          k ++;
+        }
+        else {
+          nelts = rcv_aind[i + k];
+          vtx = rcv_aind[i + k + 1];
+          vtx_lid = LOCAL_IND( globToLoc[vtx] );
+          k += nelts + 2;
+          if (ainf_data)
+            x_ainf[vtx_lid] += nelts;
+          else
+            x_asup[vtx_lid] = nelts;
+        }
+      }
+      i += nnzToRecv[p];
+    }
+
+    /* copy received information: turn the counts into start offsets;
+       tempArray[vtx] becomes the write position for column vtx */
+    vtx_lid = 0;
+    for (i = 0, k = 0, j = 0; i < nblks_loc; i++) {
+      for (vtx = begEndBlks_loc[2*i]; vtx < begEndBlks_loc[2*i+1];
+           vtx++, vtx_lid ++) {
+        nelts = x_ainf[vtx_lid];
+        x_ainf[vtx_lid] = k;
+        k += nelts;
+        nelts = x_asup[vtx_lid];
+        x_asup[vtx_lid] = j;
+        j += nelts;
+        tempArray[vtx] = x_ainf[vtx_lid];
+      }
+    }
+    x_ainf[nvtcs_loc] = k;
+    x_asup[nvtcs_loc] = j;
+
+    /* Allocate space for storing indices of A after conversion */
+    if ( !(ind_ainf = intMalloc_symbfact(x_ainf[nvtcs_loc])) &&
+         x_ainf[nvtcs_loc] != 0 )
+      ABORT("Malloc fails for ind_ainf[].");
+    if ( !(ind_asup = intMalloc_symbfact(x_asup[nvtcs_loc])) &&
+         x_asup[nvtcs_loc] != 0)
+      ABORT("Malloc fails for ind_asup[].");
+
+    /* Copy the data into the row/column oriented storage */
+    for (i = 0, p = 0; p < nprocs_num; p++) {
+      ainf_data = TRUE;
+      k = 0;
+      while (k < nnzToRecv[p]) {
+        j = rcv_aind[i + k];
+        if (ainf_data && j == EMPTY) {
+          ainf_data = FALSE;
+          k ++;
+        }
+        else {
+          nelts = rcv_aind[i + k];
+          vtx = rcv_aind[i + k + 1];
+          vtx_lid = LOCAL_IND( globToLoc[vtx] );
+          if (ainf_data) {
+            /* traverse ainf data */
+            ind = tempArray[vtx];
+            for (j = i + k + 2; j < i + k + 2 + nelts; j++, ind ++)
+              ind_ainf[ind] = rcv_aind[j];
+            tempArray[vtx] = ind;
+          }
+          else {
+            /* traverse asup data */
+            ind = x_asup[vtx_lid];
+            for (j = i + k + 2; j < i + k + 2 + nelts; j++, ind ++)
+              ind_asup[ind] = rcv_aind[j];
+          }
+          k += nelts + 2;
+        }
+      }
+      i += nnzToRecv[p];
+    }
+
+    AS->x_ainf = x_ainf;
+    AS->x_asup = x_asup;
+    AS->ind_ainf = ind_ainf;
+    AS->ind_asup = ind_asup;
+
+    VInfo->nnz_asup_loc = x_asup[nvtcs_loc];
+    VInfo->nnz_ainf_loc = x_ainf[nvtcs_loc];
+  }
+
+  /* ------------------------------------------------------------
+     DEALLOCATE TEMPORARY STORAGE
+     These buffers are allocated on every process, so they are
+     released on every process as well, including those that do
+     not take part in the symbolic factorization.  nnzToSend and
+     nnzAinf_toSnd live inside the nnzToRecv allocation.
+     ------------------------------------------------------------*/
+  SUPERLU_FREE( ptr_toRcv );
+  SUPERLU_FREE( rcv_aind );
+  SUPERLU_FREE( nnzToRecv );
+}
+
+static
+float allocPrune_lvl
+(
+ Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data
+				  structures */
+ vtcsInfo_symbfact_t *VInfo,   /* Input -local info on vertices
+				  distribution */
+ psymbfact_stat_t *PS          /* Input/Output -statistics (maxSzLPr and
+				  maxSzUPr may be raised) */
+ )
+/*! \brief
+ *
+ * <pre>
+ * Allocate storage for data structures necessary for pruned graphs
+ * during the upper levels of the separator tree.
+ * The pruned-graph arrays currently held by Llu_symbfact are released
+ * first.  For the arrays of unpredictable size the first guess is
+ * 2 * FILL * (number of vertices); when an allocation fails the guess
+ * is halved, multiplied by alpha, and tried again.
+ * Return value:
+ *     0 if enough memory was available;
+ *     otherwise PS->allocMem.  In that case every array obtained here
+ *     has been released and the pruned-graph pointers of Llu_symbfact
+ *     are NULL.
+ * </pre>
+ */
+{
+  int_t  lword;
+  int_t  nzlmaxPr, nzumaxPr, *xlsubPr, *xusubPr, *lsubPr, *usubPr;
+  int_t  no_expand_pr, x_sz;
+  float  alpha = 1.5;
+  int_t  FILL = sp_ienv_dist(6);
+  
+  no_expand_pr = 0;
+  lword     = (int_t) sizeof(int_t);
+  
+  /* free memory allocated for the domain symbolic factorization.
+     Each pointer is cleared (and each size zeroed) as soon as it is
+     released, so that an early return below never leaves a dangling
+     pointer behind in Llu_symbfact. */
+  if (Llu_symbfact->szLsubPr) {
+    SUPERLU_FREE( Llu_symbfact->lsubPr );
+    Llu_symbfact->lsubPr   = NULL;
+    Llu_symbfact->szLsubPr = 0;
+  }
+  if (Llu_symbfact->szUsubPr) {
+    SUPERLU_FREE( Llu_symbfact->usubPr );
+    Llu_symbfact->usubPr   = NULL;
+    Llu_symbfact->szUsubPr = 0;
+  }
+  if (Llu_symbfact->xlsubPr) {
+    SUPERLU_FREE( Llu_symbfact->xlsubPr );
+    Llu_symbfact->xlsubPr = NULL;
+  }
+  if (Llu_symbfact->xusubPr) {
+    SUPERLU_FREE( Llu_symbfact->xusubPr );
+    Llu_symbfact->xusubPr = NULL;
+  }
+  
+  Llu_symbfact->xlsub_rcvd = intMalloc_symbfact (VInfo->maxSzBlk + 1);
+  Llu_symbfact->xusub_rcvd = intMalloc_symbfact (VInfo->maxSzBlk + 1);
+
+  /* allocate memory to use during superior levels of sep_tree */
+  x_sz = SUPERLU_MIN( VInfo->maxNvtcsNds_loc, VInfo->maxSzBlk);
+  nzlmaxPr = 2 * FILL * VInfo->maxNvtcsNds_loc;
+  nzumaxPr = 2 * FILL * VInfo->maxSzBlk;  
+
+  /* Integer pointers for L\U factors */
+  if (x_sz != 0) {
+    xlsubPr   = intMalloc_symbfact(VInfo->maxNvtcsNds_loc + 1);
+    xusubPr   = intMalloc_symbfact(VInfo->maxNvtcsNds_loc + 1);
+    
+    lsubPr = (int_t *) SUPERLU_MALLOC (nzlmaxPr * lword);
+    usubPr = (int_t *) SUPERLU_MALLOC (nzumaxPr * lword);
+    
+    while ( !lsubPr || !usubPr ) {
+      /* release whichever of the two did succeed before retrying */
+      if ( lsubPr ) SUPERLU_FREE( lsubPr ); 
+      if ( usubPr ) SUPERLU_FREE( usubPr );
+      
+      nzlmaxPr /= 2;     nzlmaxPr = alpha * (float) nzlmaxPr;
+      nzumaxPr /= 2;     nzumaxPr = alpha * (float) nzumaxPr;
+      
+      if ( nzumaxPr < x_sz ) {
+        fprintf(stderr, "Not enough memory to perform factorization.\n");
+        /* the index arrays are not handed over to Llu_symbfact on
+           this path, so they must be released here */
+        if ( xlsubPr ) SUPERLU_FREE( xlsubPr );
+        if ( xusubPr ) SUPERLU_FREE( xusubPr );
+        return (PS->allocMem);
+      }
+      lsubPr  = (int_t *) SUPERLU_MALLOC(nzlmaxPr * lword);
+      usubPr  = (int_t *) SUPERLU_MALLOC(nzumaxPr * lword);
+      ++no_expand_pr;
+    }
+  }    
+  else {
+    xlsubPr = NULL; lsubPr = NULL;
+    xusubPr = NULL; usubPr = NULL;
+    nzlmaxPr = 0; nzumaxPr = 0;
+  }
+  
+  if (VInfo->maxNvtcsNds_loc)
+    Llu_symbfact->cntelt_vtcsA_lvl = 
+      (int_t *) SUPERLU_MALLOC (VInfo->maxNvtcsNds_loc * lword);
+
+  /* record the high-water marks reached by the previous pruned graphs
+     before the fill indices are reset below */
+  if (PS->maxSzLPr < Llu_symbfact->indLsubPr)
+    PS->maxSzLPr = Llu_symbfact->indLsubPr;
+  if (PS->maxSzUPr < Llu_symbfact->indUsubPr)
+    PS->maxSzUPr = Llu_symbfact->indUsubPr;
+  
+  Llu_symbfact->lsubPr   = lsubPr;
+  Llu_symbfact->xlsubPr  = xlsubPr;
+  Llu_symbfact->usubPr   = usubPr;
+  Llu_symbfact->xusubPr  = xusubPr;
+  Llu_symbfact->szLsubPr = nzlmaxPr;
+  Llu_symbfact->szUsubPr = nzumaxPr;
+  Llu_symbfact->indLsubPr = 0;
+  Llu_symbfact->indUsubPr = 0;
+
+  Llu_symbfact->no_expand_pr += no_expand_pr;
+  return 0;
+}
+
+static float 
+allocPrune_domain
+(
+ int_t fstVtx,  /* Input - first vertex of current node */ 
+ int_t lstVtx,  /* Input - last vertex of current node */
+ Llu_symbfact_t *Llu_symbfact, /* Output - local L, U data
+				  structures */
+ vtcsInfo_symbfact_t *VInfo,   /* Input -local info on vertices
+				  distribution */
+ psymbfact_stat_t *PS           /* Input/Output -statistics (maxSzLPr and
+				   maxSzUPr are reset) */
+ )
+/*! \brief
+ *
+ * <pre>
+ * Allocate storage for data structures necessary for pruned graphs
+ * during the symbolic factorization of the local domain.
+ * For the arrays of unpredictable size the first guess is
+ * 2 * FILL * (lstVtx - fstVtx), with FILL = 2 * sp_ienv_dist(6); when an
+ * allocation fails the guess is halved, multiplied by alpha, and tried
+ * again.
+ * Return value:
+ *     0 if enough memory was available;
+ *     otherwise PS->allocMem.  In that case every array obtained here
+ *     has been released and Llu_symbfact is left untouched.
+ * </pre>
+ */
+{
+  int_t  lword;
+  int_t  nzlmaxPr, nzumaxPr, *xlsubPr, *xusubPr, *lsubPr, *usubPr;
+  int_t  no_expand_pr, x_sz;
+  float  alpha = 1.5;
+  int_t  FILL = 2 * sp_ienv_dist(6);
+  
+  no_expand_pr = 0;
+  lword     = (int_t) sizeof(int_t);
+  
+  /* allocate memory to use during domain_symbolic routine */
+  /* Guess for prune graph */
+  x_sz = lstVtx - fstVtx;
+  nzlmaxPr = nzumaxPr = 2*FILL * x_sz;
+  
+  /* Integer pointers for L\U factors */
+  if (x_sz != 0) {
+    xlsubPr   = intMalloc_symbfact(x_sz+1);
+    xusubPr   = intMalloc_symbfact(x_sz+1);
+    
+    lsubPr = (int_t *) SUPERLU_MALLOC (nzlmaxPr * lword);
+    usubPr = (int_t *) SUPERLU_MALLOC (nzumaxPr * lword);
+    
+    while ( !lsubPr || !usubPr ) {
+      /* release whichever of the two did succeed before retrying */
+      if ( lsubPr ) SUPERLU_FREE(lsubPr); 
+      if ( usubPr ) SUPERLU_FREE(usubPr);
+      
+      nzlmaxPr /= 2;     nzlmaxPr = alpha * (float) nzlmaxPr;
+      nzumaxPr /= 2;     nzumaxPr = alpha * (float) nzumaxPr;
+      
+      if ( nzumaxPr < x_sz ) {
+        fprintf(stderr, "Not enough memory to perform factorization.\n");
+        /* the index arrays are not handed over to Llu_symbfact on
+           this path, so they must be released here */
+        if ( xlsubPr ) SUPERLU_FREE(xlsubPr);
+        if ( xusubPr ) SUPERLU_FREE(xusubPr);
+        return (PS->allocMem);
+      }
+      lsubPr  = (int_t *) SUPERLU_MALLOC(nzlmaxPr * lword);
+      usubPr  = (int_t *) SUPERLU_MALLOC(nzumaxPr * lword);
+      ++no_expand_pr;
+    }
+  }    
+  else {
+    /* empty domain: nothing to allocate.  All four pointers must be
+       given a defined value because they are stored in Llu_symbfact
+       below (the sizes are already 0 since x_sz is 0). */
+    xlsubPr = NULL; lsubPr = NULL;
+    xusubPr = NULL; usubPr = NULL;
+  }
+  
+  Llu_symbfact->lsubPr   = lsubPr;
+  Llu_symbfact->xlsubPr  = xlsubPr;
+  Llu_symbfact->usubPr   = usubPr;
+  Llu_symbfact->xusubPr  = xusubPr;
+  Llu_symbfact->szLsubPr = nzlmaxPr;
+  Llu_symbfact->szUsubPr = nzumaxPr;
+  Llu_symbfact->indLsubPr = 0;
+  Llu_symbfact->indUsubPr = 0;
+  Llu_symbfact->xlsub_rcvd = NULL;
+  Llu_symbfact->xusub_rcvd = NULL;
+  Llu_symbfact->cntelt_vtcsA_lvl = NULL;
+
+  /* both fill indices were just reset, so the recorded maxima restart
+     from 0 as well */
+  PS->maxSzLPr = Llu_symbfact->indLsubPr;
+  PS->maxSzUPr = Llu_symbfact->indUsubPr;
+
+  Llu_symbfact->no_expand_pr = no_expand_pr;
+  Llu_symbfact->no_expcp = 0;
+  return 0;
+}
+
+/************************************************************************/
+static
+int symbfact_alloc
+/************************************************************************/
+(
+ int_t n,       /* Input - order of the matrix */
+ int   nprocs,  /* Input - number of processors for the symbolic
+		   factorization */  
+ Pslu_freeable_t *Pslu_freeable, /* Output - receives supno_loc */
+ Llu_symbfact_t *Llu_symbfact, /* Output - local L, U data structures */
+ vtcsInfo_symbfact_t *VInfo,   /* Input - local info on vertices
+				  distribution */
+ comm_symbfact_t *CS, /* Output - communication arrays are allocated
+			 and initialized here */
+ psymbfact_stat_t *PS /* Input -statistics */
+ )
+/*! \brief
+ *
+ * <pre>
+ * Allocate storage for the data structures common to symbolic factorization
+ * routines. For those unpredictable size, make a guess as FILL * nnz(A) + 1;
+ * when an allocation fails the guess is halved, multiplied by alpha, and
+ * tried again.
+ * Return value:
+ *     SUCCES_RET if enough memory was available;
+ *     otherwise PS->allocMem (converted to int).  In that case every
+ *     array obtained here has been released.
+ * </pre>
+ */
+{
+  int    nlvls, p;  /* no of levels in the separator tree */
+  int_t  lword, no_expand;
+  int_t  *supno;
+  int_t  *lsub, *xlsub;
+  int_t  *usub, *xusub;
+  int_t  nzlmax, nzumax, nnz_a_loc;
+  int_t  nvtcs_loc, *cntelt_vtcs;
+  float  alpha = 1.5;
+  int_t  FILL = sp_ienv_dist(6);
+  
+  nvtcs_loc = VInfo->nvtcs_loc;
+  nnz_a_loc = VInfo->nnz_ainf_loc + VInfo->nnz_asup_loc;
+  nlvls = (int) LOG2( nprocs ) + 1;
+  no_expand = 0;
+  lword     = sizeof(int_t);
+  
+  /* Guess for L\U factors */
+  nzlmax = nzumax = FILL * nnz_a_loc + 1;
+  
+  /* Integer pointers for L\U factors */
+  supno  = intMalloc_symbfact(nvtcs_loc+1);
+  xlsub  = intMalloc_symbfact(nvtcs_loc+1);
+  xusub  = intMalloc_symbfact(nvtcs_loc+1);
+  
+  lsub = (int_t *) SUPERLU_MALLOC(nzlmax * lword);
+  usub = (int_t *) SUPERLU_MALLOC(nzumax * lword);
+  
+  while ( !lsub || !usub ) {
+    /* Exactly one of the two may have succeeded: release that one
+       (the non-NULL pointer) before retrying with a smaller guess. */
+    if (lsub) SUPERLU_FREE(lsub); 
+    if (usub) SUPERLU_FREE(usub);
+    
+    nzlmax /= 2;     nzlmax = alpha * nzlmax;
+    nzumax /= 2;     nzumax = alpha * nzumax;
+    
+    if ( nzumax < nnz_a_loc/2 ) {
+      fprintf(stderr, "Not enough memory to perform factorization.\n");
+      /* nothing has been handed over to the output structures yet,
+         so the pointer arrays must be released here */
+      if (supno) SUPERLU_FREE(supno);
+      if (xlsub) SUPERLU_FREE(xlsub);
+      if (xusub) SUPERLU_FREE(xusub);
+      return (PS->allocMem);
+    }
+    lsub  = (int_t *) SUPERLU_MALLOC(nzlmax * lword);
+    usub  = (int_t *) SUPERLU_MALLOC(nzumax * lword);
+    ++no_expand;
+  }
+  
+  /* the per-vertex element counters are only allocated when more than
+     one processor takes part */
+  if (nprocs == 1)
+    cntelt_vtcs = NULL;
+  else 
+    cntelt_vtcs = intMalloc_symbfact (nvtcs_loc+1);
+  
+  /* allocate memory for communication data structures */
+  CS->rcv_interLvl = intMalloc_symbfact (2 * (int_t) nprocs + 1);
+  CS->snd_interLvl = intMalloc_symbfact (2 * (int_t) nprocs + 1);
+  CS->ptr_rcvBuf   = intMalloc_symbfact (2 * (int_t) nprocs );
+  CS->rcv_intraLvl = intMalloc_symbfact ((int_t) nprocs + 1);
+  CS->snd_intraLvl = intMalloc_symbfact ((int_t) nprocs + 1);
+  
+  CS->snd_interSz  = intMalloc_symbfact ((int_t) nlvls + 1);
+  CS->snd_LinterSz = intMalloc_symbfact ((int_t) nlvls + 1);  
+  CS->snd_vtxinter = intMalloc_symbfact ((int_t) nlvls + 1);  
+  CS->rcv_bufSz    = 0;
+  CS->rcv_buf      = NULL;
+  CS->snd_bufSz    = 0;
+  CS->snd_buf      = NULL;
+
+  /* only the first nprocs entries of the per-processor lists are
+     initialized here */
+  for (p = 0; p < nprocs; p++) {
+    CS->rcv_interLvl[p] = EMPTY;
+    CS->snd_interLvl[p] = EMPTY;
+    CS->rcv_intraLvl[p] = EMPTY;
+    CS->snd_intraLvl[p] = EMPTY;
+  }
+  
+  for (p = 0; p <= nlvls; p++) {
+    CS->snd_vtxinter[p] = EMPTY;
+    CS->snd_interSz[p]  = 0;
+    CS->snd_LinterSz[p] = 0;
+  }
+  
+  Pslu_freeable->supno_loc   = supno;
+  Llu_symbfact->lsub   = lsub;
+  Llu_symbfact->xlsub  = xlsub;
+  Llu_symbfact->usub   = usub;
+  Llu_symbfact->xusub  = xusub;
+  Llu_symbfact->szLsub = nzlmax;
+  Llu_symbfact->szUsub = nzumax;
+  Llu_symbfact->cntelt_vtcs = cntelt_vtcs;
+  
+  Llu_symbfact->no_expand = no_expand;  
+  
+  return SUCCES_RET;
+} /* SYMBFACT_ALLOC */
+
+/*
+ * Compute, in place, the structure of column L(:,vtx) (computeL true) or
+ * of row U(vtx,:) (computeL false) for one vertex.
+ *
+ * Steps, as performed below:
+ *   1. The entries already stored for the vertex in sub[] (the range
+ *      xsub[vtx_lid] .. xsub[vtx_lid+1], terminated early by EMPTY) are
+ *      stamped in marker[] with mark_vtx and compacted to position *p_next.
+ *   2. The list attached to the vertex in the pruned structure of the
+ *      opposite factor (U's list when computing L, L's list when computing
+ *      U) is walked; every source it names has its entries >= vtx that are
+ *      not yet stamped appended to sub[].  A source id >= VInfo->nvtcs_loc
+ *      designates data in sub_rcvd, any other id a local vertex.  When
+ *      vtx_bel_othSn == vtx, the current supernode representative
+ *      (snrep_lid) is merged first.
+ *   3. The result is compared with the structure of the supernode
+ *      representative to count matched entries and would-be zeros.
+ *
+ * Outputs through pointers: *p_neltsVtxInit (entries scanned in step 1),
+ * *p_neltsVtx (final number of entries), *p_neltsVtx_CSep (entries
+ * < lstVtx), *p_neltsZrVtx, *p_neltsMatched, *p_next (next free position
+ * in sub[]), *p_prval_curvtx (possibly lowered, U pass only) and
+ * *p_vtx_bel_mySn (set to vtx for L, vtx + 1 for U, when that index is
+ * found in the structure examined in step 3).
+ *
+ * Returns SUCCES_RET, or the non-zero code returned by
+ * psymbfact_LUXpandMem / psymbfact_LUXpand when storage cannot be grown.
+ * Aborts when computing L and the diagonal entry is absent.
+ */
+static int_t 
+symbfact_vtx
+(
+ int_t n,         /* Input - order of the matrix */
+ int   iam,       /* Input - my processor number */
+ int_t vtx,       /* Input - vertex number to perform symbolic factorization */
+ int_t vtx_lid,   /* Input - local vertex number */
+ int_t vtx_prid,  /* Input - index of the vertex in xsubPr[] */
+ int_t computeL,  /* Input - TRUE when compute column L(:,vtx)
+		             otherwise compute row U(vtx, :) */
+ int   domain_symb,  /* Input - if TRUE, computation corresponds to the independent
+			domain at the bottom of the separator tree */
+ int_t fstVtx,       /* Input - first vertex of current node */ 
+ int_t lstVtx,       /* Input - last vertex of current node */
+ int_t snrep_lid,    /* local index of current supernode representative */
+ int_t szSn,         /* size of supernode with snrep_lid representative */
+ int_t *p_next,      /* next element in sub structure */
+ int_t *marker,      
+ int_t *sub_rcvd,    /* elements of node */
+ int_t sub_rcvd_sz,  /* size of sub to be explored */
+ Pslu_freeable_t *Pslu_freeable,
+ Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
+ vtcsInfo_symbfact_t *VInfo,    /* Input/Output - local info on vertices distribution */
+ psymbfact_stat_t *PS,
+ int_t *p_neltsVtxInit,
+ int_t *p_neltsVtx,
+ int_t *p_neltsVtx_CSep,
+ int_t *p_neltsZrVtx,
+ int_t *p_neltsMatched,
+ int_t mark_vtx,
+ int_t *p_prval_curvtx,
+ int_t vtx_bel_othSn,
+ int_t *p_vtx_bel_mySn
+ )
+{ 
+  int_t x_aind_beg, x_aind_end;
+  int_t k, vtx_elt, ind, pr, pr_lid, mem_error, ii, jj, compRcvd;
+  int_t *xsub, *sub, *xsubPr, *subPr, *xsub_rcvd, *xsub_src, *sub_src;
+  int_t pr_elt, next, prval_curvtx, maxNvtcsPProc;
+  int_t  neltsVtx, neltsMatched, neltsZrVtx, neltsZrSn, neltsVtx_CSep;
+  int_t  neltsVtxInit, kk;
+  int   diagind, upd_lstSn;
+  
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  upd_lstSn     = FALSE;
+  diagind       = FALSE;
+  prval_curvtx  = *p_prval_curvtx;
+  neltsVtx_CSep = 0;
+  next = *p_next;
+  /* The structure being built lives in the factor selected by computeL;
+     the pruned structure that drives the merge is the one of the
+     opposite factor. */
+  if (computeL) {
+    xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub;
+    xsub_rcvd = Llu_symbfact->xlsub_rcvd;
+    xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr;
+  }
+  else {
+    xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub;
+    xsub_rcvd = Llu_symbfact->xusub_rcvd;
+    xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr;
+  }
+
+  /* Step 1: stamp and compact the entries already stored for the vertex.
+     From here on xsub[vtx_lid] is the start of the compacted structure. */
+  x_aind_beg = xsub[vtx_lid];
+  x_aind_end = xsub[vtx_lid + 1];
+  xsub[vtx_lid] = next;
+  k = x_aind_beg;
+  /* while (sub[k] != EMPTY && k < x_aind_end) { */
+  while (k < x_aind_end) {
+    if (sub[k] == EMPTY)
+      /* EMPTY ends the stored entries: leave the loop */
+      k = x_aind_end;
+    else {
+      vtx_elt = sub[k];
+      /* U pass: an element still stamped with mark_vtx - 2 is a candidate
+	 for the prune value; the smallest one is kept.  The caller
+	 (blk_symbfact) passes marks that differ by 2 for the L and U
+	 passes of the same vertex. */
+      if (!computeL)
+	if (marker[vtx_elt] == mark_vtx - 2)
+	  if (vtx_elt < prval_curvtx)
+	    prval_curvtx = vtx_elt;
+      marker[vtx_elt] = mark_vtx;
+      if (computeL && vtx_elt == vtx)
+	diagind = TRUE;
+      /* a diagonal entry in the U part is reported and dropped */
+      if (!computeL && vtx_elt == vtx)
+	printf ("Pe[%d] ERROR diag elt in U part vtx " IFMT " dom_s %d fstV "
+		IFMT " lstV " IFMT "\n", 
+		iam, vtx, domain_symb, fstVtx, lstVtx);
+      else {
+	sub[next] = vtx_elt; 
+	next ++;
+      }
+      if (vtx_elt < lstVtx) neltsVtx_CSep ++;
+      k++;
+    }
+  }
+  /* k was forced to x_aind_end when EMPTY was met, so this is the length
+     of the stored range either way */
+  neltsVtxInit = k - x_aind_beg;
+  PS->nops += neltsVtxInit;
+  
+  /* in the domain phase the consumed range is subtracted from the local
+     count of entries of A */
+  if (domain_symb) {
+    if (computeL)
+      VInfo->nnz_ainf_loc -= x_aind_end - x_aind_beg;
+    else    
+      VInfo->nnz_asup_loc -= x_aind_end - x_aind_beg;
+  }
+
+#ifdef TEST_SYMB
+  printf ("compL %d vtx %d vtx_lid %d vtx_prid %d vtx_bel_othSn %d\n", 
+	  computeL, vtx, vtx_lid, vtx_prid, vtx_bel_othSn);
+  PrintInt10 ("A(:, v)", x_aind_end - x_aind_beg, &(sub[xsub[vtx_lid]]));
+#endif
+
+  /* Step 2: walk the list of sources attached to the vertex.
+     subPr[ind] is a source id and subPr[ind - 1] the position of the
+     next list node; EMPTY ends the list. */
+  ind = xsubPr[vtx_prid];
+  if (vtx_bel_othSn == vtx)
+    upd_lstSn = TRUE;
+
+  while (ind != EMPTY || upd_lstSn) {
+    if (upd_lstSn ) {
+      /* one-shot: merge the current supernode representative first */
+      upd_lstSn = FALSE;
+      pr_lid = snrep_lid;
+    }
+    else {
+      pr_lid = subPr[ind];
+      ind = subPr[ind - 1];
+    }
+    
+    if (!computeL)
+      marker[vtx] = mark_vtx;
+    if (pr_lid >= VInfo->nvtcs_loc) {
+      /* source is in the received buffer; RCVD_IND is added to the
+	 record start -- presumably to skip the record header, confirm
+	 against the definition of RCVD_IND */
+      compRcvd = TRUE;
+      xsub_src = xsub_rcvd; sub_src = sub_rcvd;
+      pr_lid -= VInfo->nvtcs_loc;
+      k = xsub_src[pr_lid] + RCVD_IND;
+    }
+    else {
+      compRcvd = FALSE;
+      xsub_src = xsub; sub_src = sub;
+      k = xsub_src[pr_lid];
+    }
+
+    PS->nops += xsub_src[pr_lid+1] - xsub_src[pr_lid];
+    for (; k < xsub_src[pr_lid+1]; k++) {
+      pr_elt = sub_src[k];
+      if (pr_elt >= vtx && marker[pr_elt] != mark_vtx) {
+
+	/* TEST available memory */
+	if (next >= x_aind_end) {	
+	  if (domain_symb) {
+	    if (mem_error =
+		psymbfact_LUXpandMem (iam, n, vtx, next, 0,
+				      computeL, DOMAIN_SYMB, 1, 
+				      Pslu_freeable, Llu_symbfact, VInfo, PS))
+	      return (mem_error);
+	  } else if (mem_error =
+		     psymbfact_LUXpand (iam, n, EMPTY, vtx, &next, 0, 
+					computeL, LL_SYMB, 1, 
+					Pslu_freeable, Llu_symbfact, VInfo, PS))
+	    return (mem_error);
+
+	  /* the expansion may have replaced the array and moved the end
+	     of the vertex range: reload both, and the source alias when
+	     the source is local */
+	  x_aind_end = xsub[vtx_lid + 1];
+	  if (computeL)   sub = Llu_symbfact->lsub; 
+	  else   sub = Llu_symbfact->usub; 
+	  if (!compRcvd) 
+	    sub_src = sub;	  
+	}
+
+	sub[next] = pr_elt; next ++;
+
+	if (pr_elt < lstVtx) neltsVtx_CSep ++;
+	if (computeL && pr_elt == vtx)
+	  diagind = TRUE;
+	/* same prune-value candidate test as in step 1 */
+	if (!computeL)
+	  if (marker[pr_elt] == mark_vtx - 2)
+	    if (pr_elt < prval_curvtx)
+	      prval_curvtx = pr_elt;
+	marker[pr_elt] = mark_vtx;	
+      }
+    }
+  }
+
+  /* Abort if the diagonal element is zero */
+  if (computeL && diagind == FALSE) {
+    printf("Pe[%d] At column " IFMT ", ", iam, vtx);
+    ABORT("ParSymbFact() encounters zero diagonal");
+  } 
+
+  /* Step 3: compare with the structure of the supernode representative */
+  neltsVtx = next - xsub[vtx_lid];
+  neltsZrVtx = 0; /* number of zero elements which would
+		     be introduced in the vertex */
+  neltsZrSn = 0; /* -"- in the supernode */
+  neltsMatched = 0; 
+  if (vtx != fstVtx) {
+    for (k = xsub[snrep_lid]; k < xsub[snrep_lid+1]; k++) {
+      vtx_elt = sub[k];
+      if (vtx_elt >= vtx) {
+	/* the diagonal counts for L but not for U */
+	if ((vtx_elt > vtx && !computeL) || 
+	    (vtx_elt >= vtx && computeL)) {
+	  if (marker[vtx_elt] != mark_vtx)
+	    neltsZrVtx ++;
+	  else {
+	    neltsMatched ++;
+	  }
+	}
+	if (computeL && vtx_elt == vtx)
+	  *p_vtx_bel_mySn = vtx;
+	if (!computeL && vtx_elt == vtx + 1)
+	  *p_vtx_bel_mySn = vtx + 1;
+      }
+    }
+  }
+  else {
+    /* first vertex of the node: every entry is counted as matched */
+    neltsMatched = neltsVtx;
+    if (! computeL) 
+      for (k = xsub[vtx_lid]; k < next; k++) {
+	vtx_elt = sub[k];
+	if (vtx_elt == vtx + 1)
+	  *p_vtx_bel_mySn = vtx + 1;
+      }
+  }
+
+  *p_neltsVtxInit  = neltsVtxInit;
+  *p_neltsVtx      = neltsVtx;
+  *p_neltsVtx_CSep = neltsVtx_CSep;
+  *p_neltsZrVtx    = neltsZrVtx;
+  *p_neltsMatched  = neltsMatched;
+  *p_next          = next;
+  *p_prval_curvtx  = prval_curvtx;
+  return SUCCES_RET;
+}
+
+/*
+ * Rebuild the pruned structure (xlsubPr/lsubPr when computeL is true,
+ * xusubPr/usubPr otherwise) from the data held in sub_rcvd, for the
+ * vertices in [fstVtx_toUpd, lstVtx_toUpd).
+ *
+ * sub_rcvd is read as a sequence of records.  For a record starting at
+ * position i, sub_rcvd[i + DIAG_IND] is its vertex, sub_rcvd[i + NELTS_IND]
+ * its number of elements, and the elements start at i + RCVD_IND; the
+ * first element is used as the prune value and the scan of a record stops
+ * at the first element greater than it.
+ *
+ * The routine makes two passes over sub_rcvd: the first counts, per target
+ * vertex owned by this process, how many records reference it (and fills
+ * xsub_rcvd with the record bounds); the second writes, for every such
+ * reference, a two-entry list node into subPr[]: [link, source id], where
+ * the source id is (record vertex - first record vertex + VInfo->nvtcs_loc)
+ * and the link is the position of the source id of the next node, or EMPTY.
+ * xsubPr[] of a referenced vertex points at the source id of its first
+ * node.  marker[0 .. nvtcs_toUpd) is used as scratch space.
+ *
+ * Returns 0, or the non-zero code returned by psymbfact_prLUXpand when
+ * subPr[] cannot be grown.
+ */
+static int_t
+updateRcvd_prGraph
+(
+ int_t n,         /* Input - order of the matrix */
+ int   iam,       /* Input - my processor number */
+ int_t *sub_rcvd,      /* elements of node */
+ int_t sub_rcvd_sz,   /* Input - size of sub to be used in the update */
+ int_t fstVtx_toUpd,  /* Input - first vertex to update */
+ int_t lstVtx_toUpd,  /* Input - last vertex to update */
+ int_t pr_offset,
+ int   computeL,
+ int_t *marker,
+ Pslu_freeable_t *Pslu_freeable,
+ Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
+ vtcsInfo_symbfact_t *VInfo,   /* Input - local info on vertices distribution */
+ psymbfact_stat_t *PS
+ /*  marker: first elements of marker contain the nodes that will
+     be used in the updates */
+)
+{
+  int_t i, k, nelts, prVal, vtx_elt, vtx_elt_lid, ind;
+  int_t vtx, vtx_lid, fstVtx_toUpd_lid, fstVtx_srcUpd_lid;
+  int_t *xsub, *sub, *xsub_rcvd, *xsubPr, *subPr, szsubPr, *p_indsubPr;
+  int_t maxNvtcsPProc, *globToLoc, mem_error;
+  int_t nvtcs_toUpd, fstVtx_srcUpd, vtx_lid_p;
+  
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  globToLoc     = Pslu_freeable->globToLoc;
+  fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] );
+  nvtcs_toUpd = lstVtx_toUpd - fstVtx_toUpd;
+  
+  if (computeL) {
+    xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub;
+    xsub_rcvd = Llu_symbfact->xlsub_rcvd;
+    xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr;
+    p_indsubPr = &(Llu_symbfact->indLsubPr);
+    szsubPr = Llu_symbfact->szLsubPr;
+  }
+  else {
+    xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub;
+    xsub_rcvd = Llu_symbfact->xusub_rcvd;
+    xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr;
+    p_indsubPr = &(Llu_symbfact->indUsubPr);
+    szsubPr = Llu_symbfact->szUsubPr;
+  }
+  
+  /* count number of elements in transpose representation of sub_rcvd */
+  /* use marker to count those elements */
+  for (i = 0; i < nvtcs_toUpd; i++)
+    marker[i] = 0;
+  for (i = 0; i <= VInfo->maxSzBlk; i++)
+    xsub_rcvd[i] = 0;
+  
+  /* Pass 1: record the bounds of each record in xsub_rcvd and count the
+     references to each target vertex */
+  i = 0;
+  fstVtx_srcUpd = EMPTY;
+  while (i < sub_rcvd_sz) {
+    vtx   = sub_rcvd[i + DIAG_IND];
+    nelts = sub_rcvd[i + NELTS_IND];
+    i += RCVD_IND;
+    prVal = sub_rcvd[i];
+    /* the vertex of the first record is the origin of the xsub_rcvd
+       indexing */
+    if (fstVtx_srcUpd == EMPTY) fstVtx_srcUpd = vtx;
+    xsub_rcvd[vtx - fstVtx_srcUpd] = i - RCVD_IND;
+    xsub_rcvd[vtx-fstVtx_srcUpd+1] = i + nelts;
+    for (k = i; k < i + nelts; k++) {
+      vtx_elt = sub_rcvd[k];
+      if (vtx_elt > prVal)
+	/* past the prune value: leave the loop */
+	k = i + nelts;
+      else {
+	if (OWNER( globToLoc[vtx_elt] ) == iam) {
+	  if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) {
+	    vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - 
+	      fstVtx_toUpd_lid;
+	    marker[vtx_elt_lid] ++;
+	  }
+	}
+      }
+    }
+    i += nelts;
+  }
+
+  /* Turn the counts into positions: each referenced vertex gets
+     2 * count consecutive entries of subPr; xsubPr points at the second
+     entry of its first node and marker[] at the first entry. */
+  vtx_lid = fstVtx_toUpd_lid - pr_offset;
+  ind = 0;
+  for (i = 0; i < nvtcs_toUpd; i++) {
+    if (marker[i] != 0) {
+      xsubPr[vtx_lid] = ind + 1;
+      ind += 2* marker[i];
+      marker[i] = xsubPr[vtx_lid] - 1;
+    }
+    vtx_lid ++;
+  }
+  
+  if (ind == 0) 
+    /* quick return if no update */
+    return 0;
+
+  /* test if enough memory in usubPr array */
+  if (ind >= szsubPr) {
+    if (mem_error = 
+	psymbfact_prLUXpand (iam, ind, computeL, Llu_symbfact, PS))
+      return (mem_error);
+    /* the expansion may have replaced the array: reload the pointer */
+    if (computeL) 
+      subPr = Llu_symbfact->lsubPr;  
+    else 
+      subPr = Llu_symbfact->usubPr;
+  }
+  *p_indsubPr = ind;
+  
+  /* Pass 2: same traversal as pass 1, now writing the list nodes */
+  i = 0;
+  while (i < sub_rcvd_sz) {
+    vtx   = sub_rcvd[i + DIAG_IND];
+    nelts = sub_rcvd[i + NELTS_IND];
+    i += RCVD_IND;
+    prVal = sub_rcvd[i];
+    for (k = i; k < i + nelts; k++) {
+      vtx_elt = sub_rcvd[k];
+      if (vtx_elt > prVal)
+	k = i + nelts;
+      else {
+	if (OWNER( globToLoc[vtx_elt] ) == iam) {
+	  if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) {
+	    vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] );
+	    vtx_lid_p = vtx_elt_lid - pr_offset;
+	    vtx_elt_lid -= fstVtx_toUpd_lid;
+	    /* add vtx to structure of pruned graph */
+	    /* not the first node of this vertex: make the previous
+	       node's link point at the new node's source id */
+	    if (marker[vtx_elt_lid] != xsubPr[vtx_lid_p] - 1) 
+	      subPr[marker[vtx_elt_lid] - 2] = marker[vtx_elt_lid] + 1;
+	    subPr[marker[vtx_elt_lid] + 1] = vtx - fstVtx_srcUpd + VInfo->nvtcs_loc;
+	    subPr[marker[vtx_elt_lid]] = EMPTY;
+	    marker[vtx_elt_lid] += 2;
+	  }
+	}
+      }
+    }
+    i += nelts;
+  }  
+  
+  /* NOTE(review): the scratch range used above is marker[0 .. nvtcs_toUpd),
+     but this reset starts at fstVtx_toUpd rather than 0.  Presumably only
+     entries at indices >= fstVtx_toUpd are read again by the caller, which
+     would make this intentional -- confirm before changing. */
+  for (i = fstVtx_toUpd; i < nvtcs_toUpd; i++)
+    marker[i] = 0;
+  return 0;
+}
+
+/*
+ * Prune the structure of the current supernode representative and add the
+ * representative to the pruned graph.
+ *
+ * The entries sub[xsub[snrep_lid] .. xsub_snp1) are first rearranged:
+ * when prval_cursn != n they are partitioned so that all entries
+ * <= prval_cursn come first; when prval_cursn == n they are left in place
+ * and only their maximum is computed.  Then, for each leading entry
+ * <= prval_cursn that is < lstVtx_blk, a list node [previous head,
+ * snrep_lid] is pushed at the front of that entry's list in
+ * xsubPr[]/subPr[] (growing subPr[] when needed).  The entry equal to the
+ * prune value (or to the maximum, when prval_cursn == n) is moved to the
+ * first position of the representative's structure.
+ *
+ * Returns SUCCES_RET, or the non-zero code returned by
+ * psymbfact_prLUXpand when subPr[] cannot be grown.
+ */
+static int_t
+update_prGraph 
+(
+ int   iam, 
+ int_t n,           /* order of the matrix */
+ int_t fstVtx_blk,  /* first vertex in block to factorize */
+ int_t lstVtx_blk,  /* last vertex in block to factorize */
+ int_t snrep_lid,   /* local index of current supernode representative */
+ int_t pr_offset,   /* offset in the indexing of prune structure */
+ int_t prval_cursn, /* prune value of current supernode representative */
+ int_t xsub_snp1,   /* denotes xsub[snrep_lid + 1] */
+ int   computeL,    /* Input - if 1, compute column L(:,vtx)
+		               else compute row U(vtx, :) */
+ Pslu_freeable_t *Pslu_freeable,
+ Llu_symbfact_t *Llu_symbfact,   /* Input/Output - local L, U data structures */
+ psymbfact_stat_t *PS
+ )
+{
+  int_t k, mem_error;
+  int_t kmin, kmax, ktemp, maxElt;
+  int_t sn_elt, sn_elt_prid;
+  int_t *globToLoc, maxNvtcsPProc;
+  int_t *xsub, *sub, *xsubPr, *subPr;
+  int_t *p_indsubPr, szsubPr;
+  
+  globToLoc     = Pslu_freeable->globToLoc;
+  /* kept although not named below: presumably referenced by the
+     LOCAL_IND macro -- confirm against its definition */
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+
+  if (computeL) {
+    xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub;
+    xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr;
+    p_indsubPr = &(Llu_symbfact->indLsubPr);
+    szsubPr = Llu_symbfact->szLsubPr;
+  }
+  else {
+    xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub;
+    xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr;
+    p_indsubPr = &(Llu_symbfact->indUsubPr);
+    szsubPr = Llu_symbfact->szUsubPr;
+  }
+  
+  kmin = xsub[snrep_lid];
+  kmax = xsub_snp1 - 1;
+  if (prval_cursn != n)
+    maxElt = prval_cursn;
+  else
+    maxElt = EMPTY;
+  while (kmin <= kmax) {
+    if (prval_cursn == n) {
+      /* compute maximum element of L(:, vtx) */
+      if (sub[kmin] > maxElt)
+        maxElt = sub[kmin];
+      kmin ++;
+    }
+    else {
+      /* Do a quicksort-type partition. */    
+      if (sub[kmax] > prval_cursn) 
+        kmax--;
+      else if (sub[kmin] <= prval_cursn)
+        kmin++;
+      else { /* kmin doesn't belong to G^s(L), and kmax belongs: 
+              *    interchange the two subscripts
+              */
+        ktemp = sub[kmin];
+        sub[kmin] = sub[kmax];
+        sub[kmax] = ktemp;
+        kmin ++;
+        kmax --;
+      }
+    }
+  }
+
+  k = xsub[snrep_lid];
+  /* The bound must be tested before sub[k] is read: when every entry of
+     the range is <= prval_cursn, k reaches xsub_snp1 and sub[k] would be
+     one position past the representative's structure. */
+  while (k < xsub_snp1 && sub[k] <= prval_cursn) {
+    sn_elt = sub[k];
+    if (sn_elt < lstVtx_blk) {
+      sn_elt_prid = LOCAL_IND( globToLoc[sn_elt] ) - pr_offset;
+      if ((*p_indsubPr) + 2 >= szsubPr) {
+        mem_error = psymbfact_prLUXpand (iam, 0, computeL, Llu_symbfact, PS);
+        if (mem_error)
+          return (mem_error);
+        /* the expansion may have replaced the array: reload it and
+           its size */
+        if (computeL) {
+          subPr = Llu_symbfact->lsubPr;  szsubPr = Llu_symbfact->szLsubPr;
+        }
+        else {
+          subPr = Llu_symbfact->usubPr;  szsubPr = Llu_symbfact->szUsubPr;
+        }
+      }
+      /* add krow to structure of pruned graph: the new node becomes the
+         head of the list and links to the previous head */
+      subPr[(*p_indsubPr) + 1] = snrep_lid;
+      subPr[(*p_indsubPr)] = xsubPr[sn_elt_prid];
+      xsubPr[sn_elt_prid] = (*p_indsubPr) + 1;
+      (*p_indsubPr) += 2;
+    }
+    if (sn_elt == maxElt) {
+      /* move prune val in the first position */
+      sub[k] = sub[xsub[snrep_lid]];
+      sub[xsub[snrep_lid]] = sn_elt;
+    }
+    k ++; 
+  }
+  return SUCCES_RET;
+}
+
+static int_t
+blk_symbfact
+(SuperMatrix *A,
+ int   iam,
+ int   lvl, 
+ int   szSep,
+ int   ind_sizes1,
+ int   ind_sizes2, 
+ int_t *sizes,     /* Input - sizes of each node in the separator tree */
+ int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
+ int_t fstVtx_loc, /* Input - first vertex local of the level */
+ int_t fstVtx_blk,
+ int_t lstVtx_blk,
+ int_t *lsub_rcvd,      /* elements of node */
+ int_t lsub_rcvd_sz,    /* size of sub to be explored */
+ int_t *usub_rcvd,  
+ int_t usub_rcvd_sz,
+ Pslu_freeable_t *Pslu_freeable,   /* global LU data structures (modified) */
+ Llu_symbfact_t *Llu_symbfact,   /* Input/Output - local L, U data structures */  
+ vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
+ comm_symbfact_t *CS,
+ psymbfact_stat_t *PS,
+ int_t *marker,
+ int_t *p_mark,    /* marker used to merge elements of vertices */
+ int_t *p_nextl,   /* ptr to nextl in lsub structure */
+ int_t *p_nextu,   /* ptr to nextu in usub structure */
+ int_t *p_neltsZr, /* no of artificial zeros introduced so far */
+ int_t *p_neltsTotal, /* no of nonzeros (including artificials) 
+			 computed so far */
+ int_t *p_nsuper_loc
+ )
+{
+  int szSep_tmp, lvl_tmp, ii, jj;
+  int_t  *xlsubPr, *xusubPr; 
+  int_t  *xsup, *supno, *lsub, *xlsub, *usub, *xusub;
+  int_t  vtx_lid, vtx_prid, vtx, vtx_super, vtx_elt, maxNvtcsPProc;
+  int_t  ind, pr, pr_elt, newnext, k, vtx_elt_lid;
+  int_t  nextl, nextu, nsuper_loc, nvtcs, n, mem_error;
+  int_t  x_aind_beg, x_aind_end, i, szLp, xlsub_snp1, xusub_snp1;
+  int_t  snrep, snrep_lid, szsn, vtxp1, *globToLoc, domain_symb;
+  int_t lstVtx, neltsCurSep, maxNeltsVtx, fstVtx_loc_lid;
+  /* supernode relaxation parameters */
+  int_t  neltsVtx_L, neltsZrVtx_L, neltsMatched_L, neltsVtx_CSep_L;
+  int_t  neltsVtx_U, neltsZrVtx_U, neltsMatched_U, neltsVtx_CSep_U;
+  int_t  neltsZrSn_L, neltsZrSn_U, neltsZr, neltsTotal, 
+    neltsZr_tmp, neltsTotal_tmp, neltsZrSn, neltsVtxInit_l, neltsVtxInit_u;
+  /* next vertex belongs to current supernode pruned structure */
+  int_t  vtx_bel_snL, vtx_bel_snU;
+  /* marker variables */
+  int_t  markl1_vtx, markl2_vtx, marku1_vtx, marku2_vtx;
+  /* prune structure variables */
+  int_t prval_cursn, prval_curvtx, pr_offset;
+  /* variables for comms info */
+  int_t neltSn_L, neltSn_U, lstVtx_tmp, stat;
+  float relax_param, relax_seps;
+
+  if (fstVtx_blk >= lstVtx_blk)
+    return 0;
+  
+  /* Initializations */
+  supno   = Pslu_freeable->supno_loc;
+  lsub    = Llu_symbfact->lsub;   xlsub    = Llu_symbfact->xlsub;
+  usub    = Llu_symbfact->usub;   xusub    = Llu_symbfact->xusub;
+  xusubPr  = Llu_symbfact->xusubPr; 
+  xlsubPr  = Llu_symbfact->xlsubPr;   
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  globToLoc     = Pslu_freeable->globToLoc;
+  maxNeltsVtx   = VInfo->maxNeltsVtx;
+  
+  n          = A->ncol;
+  nextl      = *p_nextl;
+  nextu      = *p_nextu;
+  neltsZr    = *p_neltsZr;
+  neltsTotal = *p_neltsTotal;
+  nsuper_loc = *p_nsuper_loc;
+  marku2_vtx = *p_mark;
+  lstVtx     = fstVtxSep[ind_sizes2] + sizes[ind_sizes2];
+
+  snrep = fstVtx_blk; 
+  snrep_lid = LOCAL_IND( globToLoc[fstVtx_blk] );
+  szsn = 1;
+  nvtcs = lstVtx_blk - fstVtx_blk;
+  prval_cursn = n;
+  vtx_bel_snL = EMPTY; vtx_bel_snU = EMPTY;
+  
+  /* set up to EMPTY xlsubPr[], xusubPr[] */
+  if (PS->maxSzLPr < Llu_symbfact->indLsubPr)
+    PS->maxSzLPr = Llu_symbfact->indLsubPr;
+  if (PS->maxSzUPr < Llu_symbfact->indUsubPr)
+    PS->maxSzUPr = Llu_symbfact->indUsubPr;
+  for (i = 0; i < nvtcs; i++) {
+    xlsubPr[i] = EMPTY;
+    xusubPr[i] = EMPTY;
+  }
+  Llu_symbfact->indLsubPr = 0;
+  Llu_symbfact->indUsubPr = 0;
+
+  if (ind_sizes1 == 0) 
+    domain_symb = TRUE;
+  else {
+    domain_symb = FALSE;
+    fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] );
+  }
+  
+  vtx_prid = 0;
+  vtx_lid = LOCAL_IND( globToLoc[fstVtx_blk] );
+  pr_offset = vtx_lid;
+
+  if (lsub_rcvd != NULL) {
+    updateRcvd_prGraph (n, iam, lsub_rcvd, lsub_rcvd_sz,
+			fstVtx_blk, lstVtx_blk, pr_offset, 1, marker,
+			Pslu_freeable, Llu_symbfact, VInfo, PS);    
+    updateRcvd_prGraph (n, iam, usub_rcvd, usub_rcvd_sz,
+			fstVtx_blk, lstVtx_blk, pr_offset, 0, marker,
+			Pslu_freeable, Llu_symbfact, VInfo, PS);
+  }
+  
+  for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++, vtx_prid ++) {
+    vtxp1 = vtx + 1;
+    if (marku2_vtx +4 >= n) {
+      /* reset to EMPTY marker array */
+      for (i = 0; i < n; i++)
+	marker[i] = EMPTY;
+      marku2_vtx = EMPTY;
+    }
+    markl1_vtx = marku2_vtx + 1; markl2_vtx = markl1_vtx + 1;
+    marku1_vtx = markl2_vtx + 1; marku2_vtx = marku1_vtx + 1;    
+
+    prval_curvtx   = n;
+    /* Compute nonzero structure L(:,vtx) */
+    if (mem_error = 
+	symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 1, domain_symb, 
+		      fstVtx_blk,  lstVtx,
+		      snrep_lid, szsn, &nextl,
+		      marker, 
+		      lsub_rcvd, lsub_rcvd_sz,
+		      Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_l,
+		      &neltsVtx_L, &neltsVtx_CSep_L, &neltsZrVtx_L, 
+		      &neltsMatched_L, markl1_vtx, &prval_curvtx, 
+		      vtx_bel_snU, &vtx_bel_snL))
+      return (mem_error);
+    lsub = Llu_symbfact->lsub;
+
+#ifdef TEST_SYMB
+    PrintInt10 ("L(:, %d)", nextl - xlsub[vtx_lid], &(lsub[xlsub[vtx_lid]]));
+#endif
+    
+    /* Compute nonzero structure of U(vtx,:) */
+    if (mem_error = 
+	symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 0, domain_symb, 
+		      fstVtx_blk, lstVtx,
+		      snrep_lid, szsn, &nextu,
+		      marker, 
+		      usub_rcvd, usub_rcvd_sz,
+		      Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_u,
+		      &neltsVtx_U, &neltsVtx_CSep_U, &neltsZrVtx_U, 
+		      &neltsMatched_U, marku1_vtx, &prval_curvtx,
+		      vtx_bel_snL, &vtx_bel_snU))
+      return (mem_error);
+    usub = Llu_symbfact->usub;
+
+#ifdef TEST_SYMB
+    PrintInt10 ("U(%d, :)", nextu - xusub[vtx_lid], &(usub[xusub[vtx_lid]])); 
+#endif
+    
+    /* update statistics on fill-in */
+    if (!domain_symb) {
+      stat = CEILING( (neltsVtxInit_l + neltsVtxInit_u), 2);
+      if (Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] != stat) {
+	stat = CEILING(stat, Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid]);
+	PS->fill_pelt[0] += (float) stat;
+	if ((float) stat > PS->fill_pelt[1]) PS->fill_pelt[1] = (float) stat;
+	PS->fill_pelt[2] += 1.;
+      }
+      stat = CEILING( (neltsVtx_L + neltsVtx_U), 2);
+      stat = CEILING( stat, Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] );
+      PS->fill_pelt[3] += (float) stat;
+      if ((float) stat > PS->fill_pelt[4]) PS->fill_pelt[4] = (float) stat;
+      PS->fill_pelt[5] += 1.;
+    }     
+
+    /* compute number of artificial zeros */
+    neltsTotal = 0;
+    neltsZr = 0;
+    neltsZrSn_L    = neltsVtx_L - neltsMatched_L;
+    neltsZrSn_U    = neltsVtx_U - neltsMatched_U;
+    neltsZrSn      = neltsZrVtx_L + neltsZrVtx_U +
+      (neltsZrSn_L + neltsZrSn_U) * szsn;
+    neltsZr_tmp    = neltsZr + neltsZrSn;
+    neltsTotal_tmp = neltsTotal + neltsZrSn + neltsVtx_L + neltsVtx_U;
+    if (neltsTotal_tmp == 0)
+      neltsTotal_tmp = 1;
+    relax_param = (float) (neltsTotal_tmp - neltsZr_tmp) / neltsTotal_tmp;
+
+#ifdef TEST_SYMB
+    printf ("[%d] vtx %d pr %d szsn %d nVtx_L %d nZrSn_L %d nZrVtx_L %d\n",
+	    iam, vtx, prval_curvtx, szsn,neltsVtx_L, neltsZrSn_L, neltsZrVtx_L);
+    printf ("  [%d] nVtx_U %d, nZrSn_U %d nZrVtx_U %d nextl %d nextu %d\n",
+	    iam, neltsVtx_U, neltsZrSn_U, neltsZrVtx_U, nextl, nextu);
+    printf ("  [%d] nZr %d nZr_tmp %d nTot %d nTot_tmp %d rel %f test %d\n\n", 
+	    iam, neltsZr, neltsZr_tmp, neltsTotal, neltsTotal_tmp,
+	    relax_param, i);
+#endif
+
+    /* Check to see if vtx belongs in the same supernode as vtx-1 */
+    supno[vtx_lid] = nsuper_loc;
+    if (vtx == fstVtx_blk) {
+      prval_cursn = prval_curvtx;
+      neltsTotal += neltsVtx_L + neltsVtx_U;
+    }
+    else {
+      if (maxNeltsVtx > 0) {
+	relax_seps = (float) neltsVtx_L / (float) maxNeltsVtx;
+	relax_seps *= (float) (neltsVtx_U+1) / (float) maxNeltsVtx;
+      } 
+      else
+	relax_seps = 0.0;
+
+      /* check if all upper separators are dense */
+      if (relax_seps >= PS->relax_seps ) {
+	VInfo->filledSep = FILLED_SEPS; 
+	*p_nextl      = xlsub[vtx_lid];
+	*p_nextu      = xusub[vtx_lid];
+	nsuper_loc   += 1;
+	*p_nsuper_loc = nsuper_loc;
+	if (mem_error =
+	    dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, 
+				sizes, fstVtxSep, vtx,
+				Llu_symbfact, Pslu_freeable, VInfo, CS, PS,
+				p_nextl, p_nextu, p_nsuper_loc))
+	  return (mem_error);
+	/* set up neltsZr and neltsTotal */
+	vtx = lstVtx_blk;
+	return 0;
+      } /* if all upper separators are dense */
+      else {
+	if (relax_param >= PS->relax_gen) {
+	  /* vertex belongs to the same supernode */
+	  if (prval_cursn > prval_curvtx || prval_cursn <= vtx)
+	    prval_cursn = prval_curvtx;
+	  neltsZr    = neltsZr_tmp;
+	  neltsTotal = neltsTotal_tmp;
+	  szsn ++;
+	  /* add artificial zeros at the structure of current supernode */
+	  newnext = xlsub[snrep_lid+1];
+	  if (neltsZrSn_L != 0) {
+	    for (k = xlsub[snrep_lid]; k < xlsub[snrep_lid+1]; k++) {
+	      vtx_elt = lsub[k];
+	      if (vtx_elt >= vtx) 
+		marker[vtx_elt] = markl2_vtx;
+	    }
+	    for (k = xlsub[vtx_lid]; k < nextl; k++) {
+	      vtx_elt = lsub[k];
+	      if (marker[vtx_elt] != markl2_vtx) {
+		/* add vtx_elt to the structure of snrep */
+		lsub[newnext] = vtx_elt; newnext ++;
+		marker[vtx_elt] = markl2_vtx;
+	      }
+	    }
+	    xlsub[snrep_lid+1] = newnext;
+	  }
+	  xlsub[vtx_lid] = newnext;
+	  nextl = newnext;
+	  neltsVtx_L += neltsZrVtx_L;
+	  
+	  newnext = xusub[snrep_lid+1];
+	  if (neltsZrSn_U != 0) {
+	    for (k = xusub[snrep_lid]; k < xusub[snrep_lid+1]; k++) {
+	      vtx_elt = usub[k];
+	      if (vtx_elt >= vtx) {
+		if (marker[vtx_elt] == markl2_vtx)
+		  if (prval_cursn > vtx_elt && vtx_elt != vtx)
+		    prval_cursn = vtx_elt;
+		marker[vtx_elt] = marku2_vtx;
+	      }
+	    }
+	    for (k = xusub[vtx_lid]; k < nextu; k++) {
+	      vtx_elt = usub[k];
+	      if (marker[vtx_elt] != marku2_vtx) {
+		/* add vtx_elt to the structure of snrep */
+		usub[newnext] = vtx_elt; newnext ++;
+		if (marker[vtx_elt] == markl2_vtx)
+		  if (prval_cursn > vtx_elt && vtx_elt != vtx)
+		    prval_cursn = vtx_elt;
+		marker[vtx_elt] = marku2_vtx;
+	      }
+	    }
+	    if (marker[vtxp1] == marku2_vtx)
+	      vtx_bel_snU = vtxp1;
+	    xusub[snrep_lid+1] = newnext;
+	  }
+	  xusub[vtx_lid] = newnext;
+	  nextu = newnext;
+	  neltsVtx_U += neltsZrVtx_U;
+	}  /* if ( relax_param >= PS->relax_param) */
+      }  /* if (VInfo->filledSep != FILLED_SEPS) */
+    } /* if (vtx != fstVtx_blk) */
+
+    if ((relax_param < PS->relax_gen || vtx == lstVtx_blk-1) 
+	&& VInfo->filledSep != FILLED_SEPS) {
+      /* if a new supernode starts or is the last vertex */
+      /* vtx starts a new supernode. Note we only store the
+       * subscript set of the first column of a supernode.  */
+      
+      if (marker[vtxp1] == marku1_vtx)
+	vtx_bel_snU = vtxp1;
+      /* build the pruned structure */
+      if (relax_param < PS->relax_gen
+	  && vtx == lstVtx_blk - 1 && vtx != fstVtx_blk) 
+	szLp = 2;
+      else
+	szLp = 1;
+      if (vtx == fstVtx_blk) {
+	xlsub_snp1 = nextl;
+	xusub_snp1 = nextu;
+      }
+      else {
+	xlsub_snp1 = xlsub[snrep_lid+1];
+	xusub_snp1 = xusub[snrep_lid+1];	
+      }
+      while (szLp > 0) {
+	szLp --;
+#ifdef TEST_SYMB
+	printf ("End sn %d szsn %d\n", nsuper_loc, szsn);
+	printf ("BLD pr vtx %d snrep %d prval %d szLp %d\n",
+		vtx, snrep, prval_cursn, szLp);
+#endif
+	
+	update_prGraph (iam, n, fstVtx_blk, lstVtx_blk,
+			snrep_lid, pr_offset, prval_cursn,
+			xlsub_snp1, 1,
+			Pslu_freeable, Llu_symbfact, PS);
+	update_prGraph (iam, n, fstVtx_blk, lstVtx_blk,
+			snrep_lid, pr_offset, prval_cursn,
+			xusub_snp1, 0,
+			Pslu_freeable, Llu_symbfact, PS);
+
+#ifdef TEST_SYMB
+	printf ("Adr lsub %p usub %p lsub %p pos %d usub %p pos %d\n", 
+		&(lsub[xlsub[snrep_lid]]), &(usub[xusub[snrep_lid]]),
+		lsub, xlsub[snrep_lid], usub, xusub[snrep_lid]);
+	PrintInt10 ("Lsn", xlsub_snp1 - xlsub[snrep_lid],
+		    &(lsub[xlsub[snrep_lid]]));
+	PrintInt10 ("Usn", xusub_snp1 - xusub[snrep_lid],
+		    &(usub[xusub[snrep_lid]]));
+#endif
+
+	if (prval_cursn >= lstVtx_blk) {
+	  neltSn_L = xlsub_snp1 - xlsub[snrep_lid];
+	  neltSn_U = xusub_snp1 - xusub[snrep_lid];
+	  if (ind_sizes1 != 0) {
+	    CS->snd_intraSz += neltSn_L + neltSn_U + 4;
+	    CS->snd_LintraSz += neltSn_L + 2;
+	  }
+	  if (prval_cursn >= lstVtx) {
+	    /* this supernode will be send to next layers of the tree */
+	    lvl_tmp = lvl;
+	    ii = ind_sizes1;
+	    jj = ind_sizes2;
+	    szSep_tmp = szSep;
+	    lstVtx_tmp = lstVtx;
+	    while (prval_cursn >= lstVtx_tmp && szSep_tmp != 1) {
+	      jj = ii + szSep_tmp + (jj - ii) / 2;
+	      ii += szSep_tmp;
+	      lvl_tmp ++;
+	      szSep_tmp = szSep_tmp / 2;
+	      lstVtx_tmp = fstVtxSep[jj] + sizes[jj];
+	      CS->snd_interSz[lvl_tmp] += neltSn_L + neltSn_U + 4;
+	      CS->snd_LinterSz[lvl_tmp] += neltSn_L + 2;
+	      if (CS->snd_vtxinter[lvl_tmp] == EMPTY)
+		CS->snd_vtxinter[lvl_tmp] = snrep;
+	    }
+	  }
+	}
+	snrep = vtx;
+	snrep_lid = vtx_lid;
+	prval_cursn = prval_curvtx;
+	szsn        = 1;
+	xlsub_snp1  = nextl;
+	xusub_snp1  = nextu;
+      }
+      if (relax_param < PS->relax_gen) {
+	neltsTotal += neltsVtx_L + neltsVtx_U;
+	nsuper_loc ++;	
+	supno[vtx_lid] = nsuper_loc;
+	if (marker[vtxp1] == marku1_vtx)
+	  vtx_bel_snU = vtxp1;
+	else
+	  vtx_bel_snU = EMPTY;
+      }
+    }
+    if (vtx == lstVtx_blk - 1)
+      nsuper_loc ++;
+    
+    /* check if current separator is dense */
+    if (!VInfo->filledSep) {
+      relax_seps = (float) neltsVtx_CSep_L / (float) (lstVtx - vtx);
+      relax_seps *= (float) (neltsVtx_CSep_U+1) / (float) (lstVtx - vtx);
+      if (relax_seps >= PS->relax_curSep ) 
+	VInfo->filledSep = FILLED_SEP;
+    }
+    maxNeltsVtx --;
+  }
+    
+  *p_mark       = marku2_vtx + 1;
+  *p_nextl      = nextl;
+  *p_nextu      = nextu;
+  *p_neltsZr    = neltsZr;
+  *p_neltsTotal = neltsTotal;
+  *p_nsuper_loc = nsuper_loc;
+
+  return 0;
+}
+
+/*
+ * Symbolic factorization of the vertices [fstVtx, lstVtx) handled as a
+ * local domain.  The work itself is delegated to blk_symbfact, which is
+ * invoked with EMPTY / NULL for its received-buffer arguments.  Unless
+ * the upper separators were flagged as dense (FILLED_SEPS), the xlsub /
+ * xusub entries located right after the last local vertex are saved in
+ * VInfo->x[lu]sub_nextLvl and replaced by the current fill positions.
+ * Finally VInfo->maxNeltsVtx is decreased by the number of vertices.
+ */
+static void
+domain_symbfact
+(SuperMatrix *A,
+ int   iam,        /* Input - rank of the calling process */
+ int   lvl,        /* Input - level of this node in the separator tree */
+ int   szSep,      /* Input - size of the current separator (node) */
+ int   ind_sizes1,
+ int   ind_sizes2,
+ int_t *sizes,     /* Input - size of every node of the separator tree */
+ int_t *fstVtxSep, /* Input - first vertex of every node of the tree */
+ int_t fstVtx,     /* Input - first vertex of the current node */
+ int_t lstVtx,     /* Input - end vertex of the current node */
+ Pslu_freeable_t *Pslu_freeable,   /* global LU data structures (modified) */
+ Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
+ vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
+ comm_symbfact_t *CS,
+ psymbfact_stat_t *PS,
+ int_t *marker,
+ int_t *p_mark,    /* marker value used when merging vertex structures */
+ int_t *p_nextl,   /* ptr to the next free position in lsub */
+ int_t *p_nextu,   /* ptr to the next free position in usub */
+ int_t *p_neltsZr, /* number of artificial zeros introduced so far */
+ int_t *p_neltsTotal, /* number of nonzeros (artificial ones included)
+			 computed so far */
+ int_t *p_nsuper_loc
+ )
+{
+  /* maxNvtcsPProc is never named below; presumably the LOCAL_IND macro
+     expands to an expression that reads it -- keep the name as is. */
+  int_t maxNvtcsPProc;
+  int_t end_lid;
+  int_t *xl, *xu;
+
+  /* The status returned by blk_symbfact is deliberately dropped, exactly
+     as before: this routine has no way to report it (void). */
+  (void) blk_symbfact (A, iam, lvl,
+		       szSep, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
+		       EMPTY, fstVtx, lstVtx,
+		       NULL, EMPTY, NULL, EMPTY,
+		       Pslu_freeable, Llu_symbfact, VInfo, CS, PS,
+		       marker, p_mark,
+		       p_nextl, p_nextu, p_neltsZr, p_neltsTotal,
+		       p_nsuper_loc);
+
+  if (VInfo->filledSep != FILLED_SEPS) {
+    maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+
+    /* local index one past the last vertex; 0 for an empty range */
+    end_lid = (fstVtx < lstVtx)
+      ? LOCAL_IND( Pslu_freeable->globToLoc[lstVtx-1] ) + 1
+      : 0;
+
+    xl = Llu_symbfact->xlsub;
+    xu = Llu_symbfact->xusub;
+
+    /* remember the old boundary, then move it to the current fill position */
+    VInfo->xlsub_nextLvl = xl[end_lid];
+    xl[end_lid]          = *p_nextl;
+    VInfo->xusub_nextLvl = xu[end_lid];
+    xu[end_lid]          = *p_nextu;
+  }
+
+  VInfo->maxNeltsVtx -= lstVtx - fstVtx;
+}
+
+
+/*! \brief
+ *
+ * <pre>
+ * Compute counts of rows/columns of current separator.
+ * cntelt_vtcs[i] is 0 when i is nonzero before current separator
+ * and n when i is zero before current separator.
+ *
+ * Set up nvtcsLvl_loc.
+ *
+ * In detail, as far as this routine itself shows:
+ *  - VInfo->filledSep is reset to FALSE;
+ *  - VInfo->curblk_loc is advanced to the first local block whose start
+ *    is not smaller than fstVtx, and VInfo->nvtcsLvl_loc receives the
+ *    number of local vertices in blocks starting before lstVtx;
+ *  - lsub / usub are expanded when the reserved space (fill_par times
+ *    the current number of entries) does not fit;
+ *  - the entries of every local vertex of the level are copied to
+ *    position nextl / nextu, leaving fill_par times their count of
+ *    room; the first unused slot is tagged with EMPTY;
+ *  - xlsub / xusub, VInfo->x[lu]sub_nextLvl, VInfo->nnz_a{inf,sup}_loc,
+ *    PS->estim[LU]Sz and VInfo->fstVtx_nextLvl are updated.
+ *
+ * NOTE(review): the function returns void, so a failure reported by
+ * psymbfact_LUXpandMem (stored in mem_error) is never propagated.
+ * </pre>
+ */
+static void
+initLvl_symbfact
+(
+ int_t n,       /* Input - order of the matrix */
+ int   iam,     /* Input - my processor number */
+ int_t fstVtx,  /* Input - first vertex of current node */   
+ int_t lstVtx,  /* Input - last vertex of current node (used as an
+		   exclusive bound by the loops below) */   
+ Pslu_freeable_t *Pslu_freeable,
+ Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */
+ vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
+ psymbfact_stat_t *PS,
+ MPI_Comm ndComm,  /* not referenced in this routine */
+ int_t  *marker,   /* not referenced in this routine */
+ int_t  nextl,     /* next free position in lsub (local copy, by value) */
+ int_t  nextu      /* next free position in usub (local copy, by value) */
+ ) 
+{
+  int_t *cntelt_vtcs, x_aind_beg, x_aind_end, x_aind_beg_l, x_aind_beg_u,
+    nelts_asup, nelts_ainf;
+  int_t nvtcsLvl_loc, fstVtx_loc, fstVtx_loc_lid, fstVtx_nextLvl;
+  int_t curblk_loc, nblks_loc, ind_blk;
+  int_t *lsub, *xlsub, *usub, *xusub;
+  int_t *begEndBlks_loc, code_err, mem_error;
+  int_t i, j, k, vtx, vtx_lid, fstVtx_blk, lstVtx_blk, vtx_elt, p, fill;
+  int_t nelts, nelts_fill_l, nelts_fill_u, nelts_cnts, maxNvtcsPProc, *globToLoc;
+  int_t use_fillcnts, cntelt_vtx_l, cntelt_vtx_u;
+  MPI_Status status;
+  
+  fill = PS->fill_par;
+  VInfo->filledSep = FALSE;
+  
+  /* Initializations */
+  /* maxNvtcsPProc is not named again below; presumably the LOCAL_IND
+     macro reads it -- TODO confirm against the macro definition. */
+  maxNvtcsPProc  = Pslu_freeable->maxNvtcsPProc;
+  globToLoc      = Pslu_freeable->globToLoc;
+  curblk_loc     = VInfo->curblk_loc;
+  nblks_loc      = VInfo->nblks_loc;
+  begEndBlks_loc = VInfo->begEndBlks_loc;
+  cntelt_vtcs    = Llu_symbfact->cntelt_vtcs;
+  lsub    = Llu_symbfact->lsub;   xlsub    = Llu_symbfact->xlsub;
+  usub    = Llu_symbfact->usub;   xusub    = Llu_symbfact->xusub;
+  
+  /* compute nvtcsLvl_loc */
+  /* begEndBlks_loc holds (begin, end) pairs, hence the stride of 2.
+     NOTE(review): the array element is read before ind_blk is compared
+     with 2 * nblks_loc, and again right after the loop; this assumes a
+     readable entry at index 2 * nblks_loc -- confirm at the allocation. */
+  nvtcsLvl_loc = 0;  
+  ind_blk = curblk_loc;
+  while (fstVtx > begEndBlks_loc[ind_blk] && ind_blk < 2 * nblks_loc) {
+    ind_blk += 2;
+  }
+  curblk_loc = ind_blk;
+  fstVtx_loc = begEndBlks_loc[ind_blk];
+  /* accumulate the length of every local block that starts before lstVtx */
+  while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) {
+    nvtcsLvl_loc += begEndBlks_loc[ind_blk + 1] - 
+      begEndBlks_loc[ind_blk];
+    ind_blk += 2;
+  }
+  /* start of the first local block not belonging to this level */
+  fstVtx_nextLvl = begEndBlks_loc[ind_blk];
+  VInfo->nvtcsLvl_loc = nvtcsLvl_loc;
+  VInfo->curblk_loc = curblk_loc;
+  
+  fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] );
+  vtx_lid      = fstVtx_loc_lid;
+  x_aind_beg_l = VInfo->xlsub_nextLvl;
+  x_aind_beg_u = VInfo->xusub_nextLvl;
+  nelts_cnts   = 0;
+  nelts_fill_l = 0;
+  nelts_fill_u = 0;
+  ind_blk      = curblk_loc;
+  
+  /* First pass over the blocks of the level: sum the per-vertex counts
+     (nelts_cnts) and the fill-scaled number of stored entries
+     (nelts_fill_l / nelts_fill_u).  The start of the first block is taken
+     from VInfo->x[lu]sub_nextLvl instead of xlsub / xusub. */
+  while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) {
+    fstVtx_blk = begEndBlks_loc[ind_blk];
+    lstVtx_blk = begEndBlks_loc[ind_blk + 1];
+    ind_blk += 2;
+    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) 
+      nelts_cnts += cntelt_vtcs[vtx_lid];      
+    nelts_fill_l += fill * (xlsub[vtx_lid] - x_aind_beg_l);
+    nelts_fill_u += fill * (xusub[vtx_lid] - x_aind_beg_u);
+    x_aind_beg_l = xlsub[vtx_lid];
+    x_aind_beg_u = xusub[vtx_lid];
+  }
+
+  /* number of entries currently stored for this level (vtx_lid is now one
+     past the last local vertex of the level) */
+  if (nvtcsLvl_loc != 0) {
+    nelts_ainf = xlsub[vtx_lid] - VInfo->xlsub_nextLvl;
+    nelts_asup = xusub[vtx_lid] - VInfo->xusub_nextLvl;
+  }
+  else {
+    nelts_ainf = 0;
+    nelts_asup = 0;
+  }
+  
+  use_fillcnts = FALSE;
+  if (nextl + nelts_cnts >= Llu_symbfact->szLsub - nelts_ainf ||
+      nextu + nelts_cnts >= Llu_symbfact->szUsub - nelts_asup) { 
+    use_fillcnts = TRUE;
+  }
+ 
+  /* NOTE(review): this assignment overrides the test just above, so the
+     fill-based reservation is always used and the cntelt_vtcs branch of
+     the copy loop below is never taken. */
+  use_fillcnts = TRUE; 
+  
+  if (use_fillcnts) {
+    /* grow lsub / usub when the fill-based reservation does not fit;
+       mem_error is assigned here but never tested afterwards */
+    if (nextl + nelts_fill_l >= Llu_symbfact->szLsub - nelts_ainf)
+      mem_error = 
+	psymbfact_LUXpandMem (iam, n, fstVtx, nextl,
+			      nextl + nelts_fill_l, LSUB,
+			      RL_SYMB, 1, 
+			      Pslu_freeable, Llu_symbfact, VInfo, PS);
+    /* reload the base pointer from the structure after the possible
+       expansion */
+    lsub = Llu_symbfact->lsub;
+    if (nextu + nelts_fill_u >= Llu_symbfact->szUsub - nelts_asup) 
+      mem_error = 
+	psymbfact_LUXpandMem (iam, n, fstVtx, nextu,
+			      nextu + nelts_fill_u, USUB,
+			      RL_SYMB, 1, 
+			      Pslu_freeable, Llu_symbfact, VInfo, PS);      
+    usub = Llu_symbfact->usub;
+  }
+
+  /* init xlsub[fstVtx:lstVtx] and xusub[fstVtx:lstVtx] and
+     copy elements of A[fstVtx:lstVtx, fstVtx:lstVtx] in lsub and usub */
+  fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] );
+  x_aind_beg_l = VInfo->xlsub_nextLvl;
+  x_aind_beg_u = VInfo->xusub_nextLvl;
+  vtx_lid = fstVtx_loc_lid;
+  ind_blk = curblk_loc;
+
+  while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) {
+    fstVtx_blk = begEndBlks_loc[ind_blk];
+    lstVtx_blk = begEndBlks_loc[ind_blk + 1];
+    ind_blk += 2;
+
+    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) {
+      /* for the very first vertex the source range starts at the saved
+	 x[lu]sub_nextLvl; for all others it starts at xlsub / xusub, which
+	 has not been overwritten yet for this vertex */
+      if (vtx_lid != fstVtx_loc_lid) {
+	x_aind_beg_l = xlsub[vtx_lid];
+	x_aind_beg_u = xusub[vtx_lid];
+      }
+      /* number of slots to reserve for this vertex in lsub / usub */
+      if (use_fillcnts) {
+	cntelt_vtx_l = fill * (xlsub[vtx_lid+1] - x_aind_beg_l);
+	cntelt_vtx_u = fill * (xusub[vtx_lid+1] - x_aind_beg_u);
+      }
+      else {
+	cntelt_vtx_l = cntelt_vtcs[vtx_lid];
+	cntelt_vtx_u = cntelt_vtcs[vtx_lid];
+      }
+      x_aind_end = xlsub[vtx_lid + 1];
+      /* record CEILING(#L entries + #U entries, 2) of the original
+	 structure for this vertex, indexed relative to the level's first
+	 local vertex */
+      Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] = 
+	CEILING( (xlsub[vtx_lid+1]-x_aind_beg_l + xusub[vtx_lid+1]-x_aind_beg_u), 2);
+      
+      /* move the L entries of the vertex to their new position */
+      xlsub[vtx_lid] = nextl;
+      nelts = 0;
+      for (k = x_aind_beg_l; k < x_aind_end; k++) {
+	lsub[nextl] = lsub[k]; nextl ++;
+	nelts ++;
+      }
+      /* tag the first unused reserved slot; rl_update and expand_RL stop
+	 scanning a vertex's entries at the first EMPTY they meet */
+      if (nelts < cntelt_vtx_l) 
+	lsub[nextl] = EMPTY; 
+      /* skip the remainder of the reservation */
+      nextl += cntelt_vtx_l - nelts;
+      /* same treatment for the U entries */
+      x_aind_end = xusub[vtx_lid + 1];
+      xusub[vtx_lid] = nextu;
+      nelts = 0;
+      for (k = x_aind_beg_u; k < x_aind_end; k++) {
+	usub[nextu] = usub[k]; nextu ++;
+	nelts ++;
+      }
+      if (nelts < cntelt_vtx_u) 
+	usub[nextu] = EMPTY; 
+      nextu += cntelt_vtx_u - nelts;
+    }
+  }
+ 
+  if (nvtcsLvl_loc == 0) {
+    /* no local vertex in this level: only set the boundary entry that
+       follows the last vertex of the previous local block (index 0 when
+       there is no previous block or when that block ends at vertex 0) */
+    if (curblk_loc == 0)
+      vtx_lid = 0;
+    else {
+      if (begEndBlks_loc[curblk_loc-1] == 0)
+	vtx_lid = 0;
+      else
+	vtx_lid = LOCAL_IND( globToLoc[begEndBlks_loc[curblk_loc-1] - 1] ) + 1;
+    }
+
+    xlsub[vtx_lid] = nextl;
+    xusub[vtx_lid] = nextu;
+  }
+  else {
+    /* save the old boundary (start of the data belonging to the following
+       level) before overwriting it with the new end of this level */
+    VInfo->xlsub_nextLvl   = xlsub[vtx_lid];
+    xlsub[vtx_lid] = nextl;
+    VInfo->xusub_nextLvl   = xusub[vtx_lid];
+    xusub[vtx_lid] = nextu;
+    /* keep the running maxima of the space used */
+    if (PS->estimLSz < nextl)
+      PS->estimLSz = nextl;
+    if (PS->estimUSz < nextu)
+      PS->estimUSz = nextu;
+    
+    /* the entries of this level have been consumed */
+    VInfo->nnz_ainf_loc -= nelts_ainf;
+    VInfo->nnz_asup_loc -= nelts_asup;
+  }
+  VInfo->fstVtx_nextLvl = fstVtx_nextLvl;
+}
+
+
+/*
+ * Called by rl_update when the structure of vertex vtxXp no longer fits
+ * in the room reserved for it.  For every vertex still to be updated
+ * (pruned-structure indices vtx_upd_pr .. lstVtx_upd_pr-1) the routine
+ * counts how many entries its structure will hold after the pending
+ * updates, rounds that count up to a multiple of the currently reserved
+ * size, stores it in Llu_symbfact->cntelt_vtcs and finally asks
+ * psymbfact_LUXpand_RL to re-lay out the storage with the total length.
+ * The remaining vertices (up to nvtcs_toUpd) keep their current size.
+ *
+ * When computeL is zero the roles of the L and U arrays are swapped, so
+ * the local names lsub / xlsub then refer to the U structure.
+ *
+ * On exit *pmarkl holds the marker value in use, and the entries already
+ * stored for vtxXp are tagged with it in marker[].
+ *
+ * Returns 0 on success, otherwise the non-zero value returned by
+ * psymbfact_LUXpand_RL.
+ */
+static int_t
+expand_RL 
+(
+ int_t computeRcvd, /* if = 1, then update from receive buffer,
+		       else update from own data */
+ int_t n,
+ int   iam,       /* process number */
+ int_t *lsub_rcvd,      /* elements of node */
+ int_t lsub_rcvd_sz,    /* size of sub to be explored */
+ int_t *usub_rcvd,  
+ int_t usub_rcvd_sz,
+ int_t vtxXp,
+ int_t vtx_upd_pr,    /* ind in pruned structure of upd vertex which 
+			 doesn't fit into the alloc memory */
+ int_t lstVtx_upd_pr, /* ind in pruned structure of lst vtx to update */
+ int_t fstVtx_srcUpd, /* first vertex source of the updates */
+ int_t lstVtx_srcUpd, /* last vertex source of the updates */
+ int_t fstVtx_toUpd,  /* first vertex to update */
+ int_t lstVtx_toUpd,  /* last vertex to update */
+ int_t nvtcs_toUpd,   /* no of vertices to update */
+ int   computeL,
+ int_t *pmarkl,
+ int_t *marker,
+ Pslu_freeable_t *Pslu_freeable,
+ Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
+ vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
+ psymbfact_stat_t *PS
+ )
+{
+  int_t fstVtx_toUpd_lid, vtx_lid, vtx, vtx_elt, vtx_elt_lid, nextl, nelts_in;
+  int_t i, ii, j, nelts, nelts_vtx, mpnelts, lvtx_lid, elt, vtxXp_lid;
+  int_t *xusubPr, *usubPr, *xlsub, *lsub, *xusub, *usub;
+  int_t markl, *globToLoc, maxNvtcsPProc;
+  int_t mem_error, len_texp;
+  
+  /* maxNvtcsPProc is not named again below; presumably the LOCAL_IND
+     macro reads it -- keep the name unchanged. */
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  globToLoc     = Pslu_freeable->globToLoc;
+
+  /* the pruned structure built by rl_update lives in xlsubPr / lsubPr */
+  xusubPr = Llu_symbfact->xlsubPr; usubPr  = Llu_symbfact->lsubPr;
+  if (computeL) {
+    xlsub   = Llu_symbfact->xlsub;   lsub    = Llu_symbfact->lsub;
+    xusub   = Llu_symbfact->xusub;   usub    = Llu_symbfact->usub;
+  }
+  else {
+    xlsub   = Llu_symbfact->xusub;   lsub    = Llu_symbfact->usub;
+    xusub   = Llu_symbfact->xlsub;   usub    = Llu_symbfact->lsub;
+  }
+  /* use a fresh marker value so the tags set by the caller are ignored */
+  markl = *pmarkl + 1;
+  fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] );
+  vtxXp_lid = LOCAL_IND( globToLoc[vtxXp] );
+  nextl = xlsub[vtxXp_lid+1];
+    
+  lvtx_lid = EMPTY;
+  if (lstVtx_srcUpd != EMPTY)
+    lvtx_lid = LOCAL_IND( globToLoc[lstVtx_srcUpd - 1] );
+
+  /* count the number of new elements, and update Llu_symbfact->cntelt_vtcs */
+  vtx_lid = fstVtx_toUpd_lid;
+  vtx_lid += vtx_upd_pr;
+  len_texp = 0;
+  for (i = vtx_upd_pr; i < lstVtx_upd_pr; i++, vtx_lid ++) { 
+    nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid];
+    if (xusubPr[i] != xusubPr[i+1]) {
+      /* the first entry of a pruned row is the updated vertex itself,
+	 the following ones are the source vertices (see rl_update) */
+      j = xusubPr[i]; 
+      vtx = usubPr[j];
+      /* setup marker structure for already existing elements */
+      ii = xlsub[vtx_lid];
+      /* test the bound before dereferencing, so that lsub is never read
+	 past the range reserved for this vertex */
+      while (ii < xlsub[vtx_lid + 1] && lsub[ii] != EMPTY) {
+	marker[lsub[ii]] = markl;
+	ii ++;
+      }
+      nelts_vtx = ii - xlsub[vtx_lid];
+      for (j = xusubPr[i] + 1; j < xusubPr[i+1]; j++) {
+	vtx_elt = usubPr[j];
+	/* marker[source vertex] holds the offset of its data in lsub_rcvd */
+	ii = marker[vtx_elt];
+	if (computeRcvd) {
+	  /* received data: skip the RCVD_IND-long header of the segment */
+	  nelts = lsub_rcvd[ii + NELTS_IND];
+	  ii += RCVD_IND;
+	  mpnelts = marker[vtx_elt] + nelts + RCVD_IND;
+	}
+	else {
+	  vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] );
+	  if (vtx_elt_lid == lvtx_lid)
+	    nelts = lsub_rcvd_sz - ii;
+	  else
+	    nelts = xlsub[vtx_elt_lid+1] - xlsub[vtx_elt_lid];
+	  mpnelts = marker[vtx_elt] + nelts;
+	}
+	
+	if (!computeL)
+	  marker[vtx] = markl;
+	for (; ii < mpnelts; ii++) {
+	  elt = lsub_rcvd[ii];
+	  if (elt >= vtx) {
+	    if (marker[elt] != markl) {
+	      /* add elt to structure of vtx */
+	      marker[elt] = markl;
+	      nelts_vtx ++;
+	    }
+	  }
+	}
+      }
+      if (nelts_vtx != 0 && (nelts_vtx > xlsub[vtx_lid+1] - xlsub[vtx_lid])) {
+	/* round the needed size up to a multiple of the reserved size */
+	nelts_in = xlsub[vtx_lid+1] - xlsub[vtx_lid];
+	if (nelts_in == 0) nelts_in = 1;
+	j = nelts_vtx / nelts_in;
+	if (nelts_vtx % nelts_in != 0) j++;
+	nelts_vtx = j * nelts_in;
+      }
+      else
+	nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid];
+      markl ++;
+      if (markl == n) {
+	/* reset marker array */
+	for (j = fstVtx_toUpd; j < n; j++)
+	  marker[j] = EMPTY;
+	markl = 0;
+      }
+    }
+    Llu_symbfact->cntelt_vtcs[vtx_lid] = nelts_vtx;
+    len_texp += nelts_vtx;
+  }
+  /* vertices past the last updated one keep their current size */
+  for (; i < nvtcs_toUpd; i++, vtx_lid++) {
+    nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid];    
+    Llu_symbfact->cntelt_vtcs[vtx_lid] = nelts_vtx;
+    len_texp += nelts_vtx;
+  }
+
+  *pmarkl = markl;
+  /* mark elements array */
+  for (i = xlsub[vtxXp_lid]; i < nextl; i++) {
+    marker[lsub[i]] = markl;
+  }
+
+  nextl = xlsub[vtxXp_lid+1];  
+  if ((mem_error = 
+       psymbfact_LUXpand_RL (iam, n, vtxXp, nextl, len_texp, 
+			     computeL, Pslu_freeable, Llu_symbfact, VInfo, PS)) != 0)
+    return (mem_error);		
+
+  return 0;
+}
+
+
+/*
+ * Right-looking update of the structures of the local vertices
+ * [fstVtx_toUpd, lstVtx_toUpd) using the structures of a set of source
+ * vertices.  The source data is read either from a receive buffer
+ * (computeRcvd != 0; every segment starts with a header of RCVD_IND
+ * entries holding the vertex at DIAG_IND and its length at NELTS_IND) or
+ * from the local arrays (computeRcvd == 0; segments follow the local
+ * vertex order starting at fstVtx_srcUpd).
+ *
+ * Steps:
+ *  1. scan usub_rcvd and count, per vertex to update, how many source
+ *     vertices reference it (counts kept in marker[0 .. nvtcs_toUpd-1]);
+ *  2. turn the counts into row pointers xusubPr and fill usubPr, each
+ *     non-empty row starting with the updated vertex followed by its
+ *     source vertices;
+ *  3. record in marker[src] the offset of every source segment inside
+ *     lsub_rcvd;
+ *  4. for every vertex with a non-empty row, merge the source segments
+ *     into its structure, calling expand_RL when the reserved room is
+ *     exhausted.
+ *
+ * When computeL is zero the roles of the L and U arrays are swapped, so
+ * the local names lsub / xlsub then refer to the U structure.
+ *
+ * Returns 0 on success, otherwise the non-zero code returned by
+ * psymbfact_prLUXpand or expand_RL.  *pmarkl is updated only on the
+ * normal exit at the end of the routine.
+ */
+static int_t
+rl_update
+(
+ int   computeRcvd, /* if = 1, then update from receive buffer,
+		       else update from own data */
+ int_t n,
+ int   iam,       /* process number */
+ int_t *lsub_rcvd,      /* elements of node */
+ int_t lsub_rcvd_sz,    /* size of sub to be explored */
+ int_t *usub_rcvd,  
+ int_t usub_rcvd_sz,
+ int_t fstVtx_srcUpd, /* first vertex source of the updates */
+ int_t lstVtx_srcUpd, /* last vertex source of the updates */
+ int_t indBlk_srcUpd, /* block index of first vertex */
+ int_t fstVtx_toUpd,  /* first vertex to update */
+ int_t lstVtx_toUpd,  /* last vertex to update */
+ int_t nvtcs_toUpd,   /* no of vertices to update */
+ int   computeL,
+ int_t *pmarkl,
+ int_t *marker,
+ Pslu_freeable_t *Pslu_freeable,
+ Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
+ vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
+ psymbfact_stat_t *PS
+ /*  marker: first elements of marker contain the nodes that will
+     be used in the updates */
+ )
+{
+  int_t i, j, k, prVal, nelts, ind, nextl, ii, mpnelts, mem_error;
+  int_t vtx, vtx_lid, vtx_elt, vtx_elt_lid, lvtx_lid;
+  int_t fstVtx_toUpd_lid, markl, elt, vtx_loc, ind_blk;
+  int_t *xusubPr, *usubPr, *xlsub, *lsub, *xusub, *usub;
+  int_t fstVtx_upd, lstVtx_upd, maxNvtcsPProc, *globToLoc;
+  int_t fstVtx_srcUpd_lid, nelts_vtx, expand;
+  
+  /* quick return */
+  if (fstVtx_toUpd >= lstVtx_toUpd)
+    return 0;
+
+  /* maxNvtcsPProc is not named again below; presumably the OWNER and
+     LOCAL_IND macros read it -- keep the name unchanged. */
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  globToLoc     = Pslu_freeable->globToLoc;
+  
+  fstVtx_upd = EMPTY;
+  lstVtx_upd = EMPTY;
+  xusubPr = Llu_symbfact->xlsubPr; usubPr  = Llu_symbfact->lsubPr;
+  if (computeL) {
+    xlsub   = Llu_symbfact->xlsub;   lsub    = Llu_symbfact->lsub;
+    xusub   = Llu_symbfact->xusub;   usub    = Llu_symbfact->usub;
+  }
+  else {
+    xlsub   = Llu_symbfact->xusub;   lsub    = Llu_symbfact->usub;
+    xusub   = Llu_symbfact->xlsub;   usub    = Llu_symbfact->lsub;
+  }
+  markl = *pmarkl;
+  fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] );
+
+  /* count number of elements in transpose representation of usub_rcvd */
+  /* use marker to count those elements */
+  for (i = 0; i < nvtcs_toUpd; i++)
+    marker[i] = 0;
+  
+  i = 0;
+  /* NOTE(review): vtx_lid, vtx_loc, ind_blk and fstVtx_srcUpd_lid are only
+     initialised when fstVtx_srcUpd != EMPTY, yet the computeRcvd == 0
+     paths read them -- presumably callers never combine the two. */
+  if (fstVtx_srcUpd != EMPTY) {
+    fstVtx_srcUpd_lid = LOCAL_IND( globToLoc[fstVtx_srcUpd] );
+    vtx_lid = fstVtx_srcUpd_lid;
+  }
+  lvtx_lid = EMPTY;
+  if (lstVtx_srcUpd != EMPTY)
+    lvtx_lid = LOCAL_IND( globToLoc[lstVtx_srcUpd - 1] );
+  
+  while (i < usub_rcvd_sz) {
+    if (computeRcvd) {
+      vtx   = usub_rcvd[i + DIAG_IND];
+      nelts = usub_rcvd[i + NELTS_IND];
+      i += RCVD_IND;
+    }
+    else {
+      /* the last source vertex owns everything up to the end of the buffer */
+      if (vtx_lid == lvtx_lid)
+	nelts = usub_rcvd_sz - i;
+      else
+	nelts = xusub[vtx_lid + 1] - xusub[vtx_lid];
+      vtx_lid ++;
+    }
+    /* the first entry of the segment bounds the scan: the loop stops at
+       the first entry larger than it */
+    prVal = usub_rcvd[i];
+    for (k = i; k < i + nelts; k++) {
+      vtx_elt = usub_rcvd[k];
+      if (vtx_elt > prVal)
+	k = i + nelts;
+      else {
+	if (OWNER( globToLoc[vtx_elt] ) == iam) {
+	  if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) {
+	    vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - 
+	      fstVtx_toUpd_lid;
+	    marker[vtx_elt_lid] ++;
+	  }
+	}
+      }
+    }
+    i += nelts;
+  }
+
+  /* prefix sum of the counts; a non-empty row gets one extra slot for the
+     updated vertex itself.  marker[i] becomes the fill cursor of row i. */
+  ind = 0;
+  for (i = 0; i < nvtcs_toUpd; i++) {
+    if (marker[i] != 0) {
+      marker[i] ++;
+      if (fstVtx_upd == EMPTY)
+	fstVtx_upd = i;
+      lstVtx_upd = i;
+    }
+    xusubPr[i] = ind;
+    ind += marker[i];
+    marker[i] = xusubPr[i];
+  }
+  xusubPr[i] = ind;
+  /* make lstVtx_upd an exclusive bound */
+  lstVtx_upd ++;
+
+  if (ind == 0) 
+    /* quick return if no update */
+    return 0;
+
+  /* test if enough memory in usubPr array */
+  if (ind > Llu_symbfact->szLsubPr) {
+    if ((mem_error = 
+	 psymbfact_prLUXpand (iam, ind, LSUB_PR, Llu_symbfact, PS)) != 0)
+      return (mem_error);
+    usubPr  = Llu_symbfact->lsubPr;
+  }
+  
+  /* second scan of usub_rcvd: fill the pruned rows */
+  i = 0;
+  if (fstVtx_srcUpd != EMPTY) {
+    vtx_loc = fstVtx_srcUpd;
+    vtx_lid = LOCAL_IND( globToLoc[vtx_loc] );
+    ind_blk = indBlk_srcUpd;
+  }
+  while (i < usub_rcvd_sz) {
+    if (computeRcvd) {
+      vtx   = usub_rcvd[i + DIAG_IND];
+      nelts = usub_rcvd[i + NELTS_IND];
+      i += RCVD_IND;
+    }
+    else {
+      vtx = vtx_loc;
+      if (vtx_lid == lvtx_lid)
+	nelts = usub_rcvd_sz - i;
+      else
+	nelts = xusub[vtx_lid + 1] - xusub[vtx_lid];
+      vtx_lid ++;
+      vtx_loc ++;
+      /* jump to the start of the next local block when the current one
+	 is exhausted */
+      if (ind_blk != EMPTY)
+	if (vtx_loc == VInfo->begEndBlks_loc[ind_blk+1]) {
+	  ind_blk += 2;
+	  vtx_loc = VInfo->begEndBlks_loc[ind_blk];
+	}
+    }
+
+    prVal = usub_rcvd[i];
+    for (k = i; k < i + nelts; k++) {
+      vtx_elt = usub_rcvd[k];
+      if (vtx_elt > prVal)
+	k = i + nelts;
+      else {
+	if (OWNER( globToLoc[vtx_elt]) == iam) {
+	  if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) {
+	    vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_toUpd_lid;
+	    /* add vtx_elt to the pruned structure */
+	    /* first visit of the row: store the updated vertex up front */
+	    if (marker[vtx_elt_lid] == xusubPr[vtx_elt_lid]) {
+	      usubPr[marker[vtx_elt_lid]] = vtx_elt;
+	      marker[vtx_elt_lid] ++;
+	    }
+	    usubPr[marker[vtx_elt_lid]] = vtx;
+	    marker[vtx_elt_lid] ++;
+	  }
+	}
+      }
+    }
+    i += nelts;
+  }
+  /* reset marker array */
+  for (i = 0; i < nvtcs_toUpd; i++)
+    marker[i] = EMPTY;
+  if (fstVtx_srcUpd != EMPTY) {
+    vtx_loc = fstVtx_srcUpd;
+    vtx_lid = LOCAL_IND( globToLoc[vtx_loc] );
+    ind_blk = indBlk_srcUpd;
+  }
+  /* scan lsub_rcvd and remember, per source vertex, where its segment
+     starts: marker[vtx] = offset in lsub_rcvd */
+  i = 0;
+  while (i < lsub_rcvd_sz) {
+    if (computeRcvd) {
+      vtx   = lsub_rcvd[i + DIAG_IND];
+      nelts = lsub_rcvd[i + NELTS_IND];
+      marker[vtx] = i;
+      i += RCVD_IND;
+    }
+    else {
+      vtx = vtx_loc;
+      if (vtx_lid == lvtx_lid)
+	nelts = lsub_rcvd_sz - i;
+      else
+	nelts = xlsub[vtx_lid + 1] - xlsub[vtx_lid];
+      vtx_lid ++;
+      marker[vtx] = i;
+      vtx_loc ++;
+      if (ind_blk != EMPTY)
+	if (vtx_loc == VInfo->begEndBlks_loc[ind_blk+1]) {
+	  ind_blk += 2;
+	  vtx_loc = VInfo->begEndBlks_loc[ind_blk];
+	}
+    }
+    i += nelts;
+  }
+
+  /* use the pruned structure to update symbolic factorization */
+  vtx_lid = fstVtx_toUpd_lid;
+  vtx_lid += fstVtx_upd;
+  for (i = fstVtx_upd; i < lstVtx_upd; i++, vtx_lid ++) { 
+    if (xusubPr[i] != xusubPr[i+1]) {
+      j = xusubPr[i]; 
+      vtx = usubPr[j];
+      /* setup marker structure for already existing elements */
+      ii = xlsub[vtx_lid];
+      /* test the bound before dereferencing, so that lsub is never read
+	 past the range reserved for this vertex */
+      while (ii < xlsub[vtx_lid + 1] && lsub[ii] != EMPTY) {
+	marker[lsub[ii]] = markl;
+	ii ++;
+      }
+      PS->nops += ii - xlsub[vtx_lid];
+      /* new entries are appended right after the existing ones */
+      nextl = ii;
+      for (j = xusubPr[i] + 1; j < xusubPr[i+1]; j++) {
+	vtx_elt = usubPr[j];
+	ii = marker[vtx_elt];
+	if (computeRcvd) {
+	  nelts = lsub_rcvd[ii + NELTS_IND];
+	  ii += RCVD_IND;
+	  mpnelts = marker[vtx_elt] + nelts + RCVD_IND;
+	}
+	else {
+	  vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] );
+	  if (vtx_elt_lid == lvtx_lid)
+	    nelts = lsub_rcvd_sz - ii;
+	  else
+	    nelts = xlsub[vtx_elt_lid+1] - xlsub[vtx_elt_lid];
+	  mpnelts = marker[vtx_elt] + nelts;
+	}
+		
+	if (!computeL)
+	  marker[vtx] = markl;
+	PS->nops += mpnelts - ii;
+	for (; ii < mpnelts; ii++) {
+	  elt = lsub_rcvd[ii];
+	  if (elt >= vtx) {
+	    if (marker[elt] != markl) {
+	      /* add elt to structure of vtx */
+	      if (nextl >= xlsub[vtx_lid + 1]) {
+		/* reserved room exhausted: re-lay out the storage */
+		if ((mem_error = 
+		     expand_RL (computeRcvd, n, iam, lsub_rcvd, lsub_rcvd_sz,
+				usub_rcvd, usub_rcvd_sz, vtx, i,
+				lstVtx_upd, fstVtx_srcUpd, lstVtx_srcUpd,
+				fstVtx_toUpd, lstVtx_toUpd, nvtcs_toUpd, computeL,
+				&markl, marker, Pslu_freeable, Llu_symbfact, VInfo, PS)) != 0)
+		    return (mem_error);
+		/* the arrays may have been reallocated: reload the base
+		   pointers, and the source pointer too when it aliases
+		   the local arrays */
+		if (computeL) {
+		  lsub    = Llu_symbfact->lsub;
+		  if (!computeRcvd) 
+		    lsub_rcvd    = 
+		      &(Llu_symbfact->lsub[Llu_symbfact->xlsub[fstVtx_srcUpd_lid]]);
+		} else {
+		  marker[vtx] = markl;
+		  lsub    = Llu_symbfact->usub;
+		  if (!computeRcvd) 
+		    lsub_rcvd = 
+		      &(Llu_symbfact->usub[Llu_symbfact->xusub[fstVtx_srcUpd_lid]]);
+		}
+	      }
+	      lsub[nextl] = elt; nextl ++;
+	      marker[elt] = markl;
+	    }
+	  }
+	}
+      }
+      /* tag the first unused reserved slot, if any is left */
+      if (nextl < xlsub[vtx_lid+1])
+	lsub[nextl] = EMPTY;
+      markl ++;
+      if (markl == n) {
+	/* reset marker array */
+	for (j = fstVtx_toUpd; j < n; j++)
+	  marker[j] = EMPTY;
+	markl = 0;
+      }
+    }
+  }
+  *pmarkl = markl;
+
+  return 0;
+}
+
+/*
+ * Build the structure of the remaining local vertices under the
+ * assumption that all upper separators are dense.
+ *
+ * Starting from fstVtx_dns (or from the first vertex of the current
+ * local block when fstVtx_dns is EMPTY), the routine walks up the
+ * separator tree (szSep is halved at every step) and appends to lsub and
+ * usub every vertex of each visited node that is not smaller than the
+ * starting vertex.  The resulting list is stored for the first vertex;
+ * the first entry of the U list is then removed (the original comment
+ * calls it the diagonal element).  For every following local block the
+ * list is copied, restricted to entries >= (L) or > (U) the first vertex
+ * of that block.  All vertices of a block share the xlsub / xusub
+ * position and supernode number of the block.
+ *
+ * Side effects: VInfo->nnz_ainf_loc and VInfo->nnz_asup_loc are zeroed,
+ * PS->nDnsUpSeps is set to the number of tree levels visited, PS->nops
+ * is increased, Pslu_freeable->supno_loc is filled, and *p_nextl,
+ * *p_nextu, *p_nsuper_loc are updated on normal exit.
+ *
+ * Returns 0 on success (including the early exit when the first vertex
+ * equals n), otherwise the non-zero code of psymbfact_LUXpandMem.
+ */
+static int_t
+dnsUpSeps_symbfact
+(
+ int_t n,
+ int   iam,      /* my processor number */
+ int   szSep, 
+ int   ind_sizes1,
+ int   ind_sizes2, 
+ int_t *sizes,     /* Input - sizes of each node in the separator tree */
+ int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
+ int_t fstVtx_dns,
+ Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
+ Pslu_freeable_t *Pslu_freeable,
+ vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
+ comm_symbfact_t *CS,
+ psymbfact_stat_t *PS,
+ int_t *p_nextl,   /* ptr to nextl in lsub structure */
+ int_t *p_nextu,   /* ptr to nextu in usub structure */
+ int_t *p_nsuper_loc
+ )
+{
+  int_t nextl, nextu, nsuper_loc, curblk_loc, mem_error;
+  int_t vtx_elt, ind_blk, vtx, k;
+  int_t *xlsub, *xusub, *lsub, *usub;
+  int_t fstVtx_blk, fstVtx_blk_lid, vtx_lid, lstVtx_blk, fstVtx_lvl, lstVtx_lvl;
+  int_t *globToLoc, maxNvtcsPProc;
+  
+  /* Initialization */
+  xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub;
+  xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub;
+
+  globToLoc  = Pslu_freeable->globToLoc;
+  /* maxNvtcsPProc is not named again below; presumably the LOCAL_IND
+     macro reads it -- keep the name unchanged. */
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  nextl      = *p_nextl;
+  nextu      = *p_nextu;
+  nsuper_loc = *p_nsuper_loc;
+  curblk_loc = VInfo->curblk_loc;
+  VInfo->nnz_ainf_loc = 0;
+  VInfo->nnz_asup_loc = 0;
+
+  if (fstVtx_dns == EMPTY)
+    fstVtx_blk     = VInfo->begEndBlks_loc[curblk_loc];
+  else 
+    fstVtx_blk  = fstVtx_dns;
+  /* nothing local left to process */
+  if (fstVtx_blk == n)
+    return 0;
+  fstVtx_blk_lid = LOCAL_IND( globToLoc[fstVtx_blk] );
+  vtx_lid        = fstVtx_blk_lid;
+  xlsub[vtx_lid] = nextl;
+  xusub[vtx_lid] = nextu;
+  PS->nDnsUpSeps = 0; 
+  
+  /* walk up the separator tree, one node per level */
+  while (szSep >= 1) {
+    PS->nDnsUpSeps++; 
+    fstVtx_lvl = fstVtxSep[ind_sizes2];
+    lstVtx_lvl = fstVtxSep[ind_sizes2] + sizes[ind_sizes2];
+    /* first vertex of this node to be stored */
+    if (fstVtx_blk > fstVtx_lvl)
+      vtx_elt = fstVtx_blk;
+    else 
+      vtx_elt = fstVtx_lvl;
+    /* lstVtx_lvl - vtx_elt entries are about to be appended.  The minimum
+       length requested from psymbfact_LUXpandMem is based on lstVtx_lvl,
+       like the guard itself; the previous fstVtx_lvl-based request was
+       never larger than the current fill position. */
+    if (nextl + lstVtx_lvl - vtx_elt >= Llu_symbfact->szLsub) {
+      if ((mem_error =
+	   psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 
+				 nextl + lstVtx_lvl - vtx_elt,
+				 LSUB, DNS_UPSEPS, 1,
+				 Pslu_freeable, Llu_symbfact, VInfo, PS)) != 0)
+	return (mem_error);
+      lsub = Llu_symbfact->lsub;
+    }
+    if (nextu + lstVtx_lvl - vtx_elt >= Llu_symbfact->szUsub) {
+      /* it is usub that is short here, so USUB must be expanded
+	 (the code used to pass LSUB and left usub too small) */
+      if ((mem_error =
+	   psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 
+				 nextu + lstVtx_lvl - vtx_elt,
+				 USUB, DNS_UPSEPS, 1,
+				 Pslu_freeable, Llu_symbfact, VInfo, PS)) != 0)
+	return (mem_error);
+      usub = Llu_symbfact->usub;
+    }
+    PS->nops += 2 * (lstVtx_lvl - vtx_elt);
+    for (; vtx_elt < lstVtx_lvl; vtx_elt++) {
+      lsub[nextl] = vtx_elt; nextl++;
+      usub[nextu] = vtx_elt; nextu++;
+    }
+    /* move to the parent node in the sizes / fstVtxSep arrays */
+    ind_sizes2 = ind_sizes1 + szSep + (ind_sizes2 - ind_sizes1) / 2;
+    ind_sizes1 += szSep;
+    szSep = szSep / 2;
+  }
+  /* delete the diagonal element from the U structure */
+  /* (done by overwriting the first entry with the last one) */
+  usub[xusub[fstVtx_blk_lid]] = usub[nextu - 1];
+  nextu --;
+  xlsub[fstVtx_blk_lid+1] = nextl;
+  xusub[fstVtx_blk_lid+1] = nextu;
+
+  vtx_lid = fstVtx_blk_lid;
+  ind_blk = curblk_loc;
+  while (ind_blk < 2 * VInfo->nblks_loc) {
+    if (ind_blk != curblk_loc) {
+      /* later blocks: copy the part of the first list that lies at or
+	 beyond the first vertex of the block */
+      fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
+
+      xlsub[vtx_lid] = nextl;
+      xusub[vtx_lid] = nextu;
+
+      for (k = xlsub[fstVtx_blk_lid]; k < xlsub[fstVtx_blk_lid+1]; k++) 
+	if (lsub[k] >= fstVtx_blk) {
+	  lsub[nextl] = lsub[k]; nextl ++;
+	  if (nextl >= MEM_LSUB( Llu_symbfact, VInfo ))
+	    if ((mem_error =
+		 psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 0,
+				       LSUB, DNS_UPSEPS, 1,
+				       Pslu_freeable, Llu_symbfact, VInfo, PS)) != 0)
+	      return (mem_error);
+	  /* reload after a possible reallocation */
+	  lsub = Llu_symbfact->lsub;
+	}
+      for (k = xusub[fstVtx_blk_lid]; k < xusub[fstVtx_blk_lid+1]; k++)
+	if (usub[k] > fstVtx_blk) {
+	  usub[nextu] = usub[k]; nextu ++;
+	  if (nextu >= MEM_USUB( Llu_symbfact, VInfo ))
+	    if ((mem_error =
+		 psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 0,
+				       USUB, DNS_UPSEPS, 1,
+				       Pslu_freeable, Llu_symbfact, VInfo, PS)) != 0)
+	      return (mem_error);
+	  usub = Llu_symbfact->usub;
+	}
+      PS->nops += xlsub[fstVtx_blk_lid+1] - xlsub[fstVtx_blk_lid];
+      PS->nops += xusub[fstVtx_blk_lid+1] - xusub[fstVtx_blk_lid];
+    }
+    lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
+    /* every vertex of the block belongs to the same supernode; vertices
+       after the first point at the end of the block's list */
+    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
+      Pslu_freeable->supno_loc[vtx_lid] = nsuper_loc;
+      if (vtx > fstVtx_blk) {
+	xlsub[vtx_lid] = nextl;
+	xusub[vtx_lid] = nextu;
+      }
+    }
+    ind_blk += 2;
+    nsuper_loc ++;
+  }
+  
+  *p_nextl = nextl;
+  *p_nextu = nextu;
+  *p_nsuper_loc = nsuper_loc;
+/*   VInfo->curblk_loc = ind_blk; */
+  
+  return 0;
+}
+
+/*! \brief
+
+<pre>
+   Rebuild the local L and U structures of the vertices owned by iam in
+   the separator [fstVtx, lstVtx) described by ind_sizes2, treating the
+   separator as dense.
+
+   Outline, as implemented below:
+   1. For L, then for U: record for every element >= fstVtx_dns the first
+      local vertex whose current structure contains it (minElt_vtx),
+      combine the entries >= lstVtx with an MPI_MIN all-reduce over ndCom,
+      and bucket them by local vertex (x_newelts / newelts).
+   2. Rewrite lsub/usub for every local vertex of the separator: the
+      dense part [vtx, lstVtx) for L (or (vtx, lstVtx) for U) followed by
+      newelts[0 .. x_newelts[vtx_lid_x+1]).
+   3. If iam owns the block that ends at lstVtx, prune the last supernode
+      and add its size to the CS->snd_* counters of the upper levels.
+
+   Return value: 0 on success, otherwise the nonzero code returned by
+   psymbfact_LUXpandMem.
+</pre>
+*/
+static int_t
+dnsCurSep_symbfact
+(
+ int_t n,          /* Input - order of the matrix */
+ int   iam,        /* Input - my processor number */
+ int   ind_sizes1, /* Input - used together with szSep to step ind_sizes2
+		      to the next level when setting up snd_interSz */
+ int   ind_sizes2, /* Input - index in sizes[]/fstVtxSep[] of the current node */
+ int_t *sizes,     /* Input - sizes of each node in the separator tree */
+ int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
+ int   szSep,      /* Input - halved at each level step; the walk over the
+		      upper levels stops when it equals 1 */
+ int   npNode,     /* Input - only used to derive lvl = LOG2(npNode);
+		      denseSep_symbfact passes lstP - fstP */
+ int_t rcvd_dnsSep, /* Input - when 0, VInfo->curblk_loc is advanced by 2 first */
+ int_t *p_nextl,   /* Input/Output - next free position in lsub */
+ int_t *p_nextu,   /* Input/Output - next free position in usub */
+ int_t *p_mark,    /* Output - set to the local mark counter on normal exit */
+ int_t *p_nsuper_loc, /* Input/Output - local supernode counter */
+ int_t *marker,   /* temporary array of size n */
+ MPI_Comm ndCom,  /* Input - communicator used by the MPI_Allreduce below */
+ Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
+ Pslu_freeable_t *Pslu_freeable,
+ vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
+ comm_symbfact_t *CS,
+ psymbfact_stat_t *PS
+ )
+{
+  int_t fstVtx_blk, fstVtx_dns, fstVtx_dns_lid, lstVtx_blk, 
+    fstVtx, lstVtx, lstVtx_dns_lid;
+  int_t ind_blk, i, vtx, vtx_lid, vtx_lid_x, nvtcs_upd, save_cnt, mem_error;
+  int_t computeL, computeU, vtx_elt, j, cur_blk, snlid, snrep;
+  int_t *sub, *xsub, *minElt_vtx, *cntelt_vtcs;
+  int_t mark, next, *x_newelts, *x_newelts_L, *x_newelts_U;
+  int_t *newelts_L, *newelts_U, *newelts;
+  /* NOTE(review): maxNvtcsPProc is not referenced by name below; presumably
+     the LOCAL_IND / OWNER macros read it -- confirm before removing. */
+  int_t *globToLoc, maxNvtcsPProc, lvl;
+  int_t prval, kmin, kmax, maxElt, ktemp, prpos;
+  float mem_dnsCS;
+
+  if (!rcvd_dnsSep)
+    VInfo->curblk_loc += 2;
+  
+  computeL = TRUE; computeU = TRUE;
+  lstVtx_dns_lid = EMPTY;
+  globToLoc = Pslu_freeable->globToLoc;
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  fstVtx = fstVtxSep[ind_sizes2];
+  lstVtx = fstVtx + sizes[ind_sizes2];
+  cur_blk = VInfo->curblk_loc;
+  fstVtx_dns = VInfo->begEndBlks_loc[cur_blk];
+  fstVtx_dns_lid = LOCAL_IND( globToLoc[fstVtx_dns] );
+  lvl = (int_t) LOG2( npNode );
+  x_newelts_U = NULL;
+  newelts_L = NULL;
+  newelts_U = NULL;
+  mem_dnsCS = 0.;
+  
+  PS->nDnsCurSep ++;
+
+  /* minElt_vtx needs n - fstVtx_dns entries: reuse the receive buffer when
+     it is strictly larger than that, otherwise allocate a private array
+     (freed at the end of the function) */
+  if (CS->rcv_bufSz > n - fstVtx_dns)
+    minElt_vtx = CS->rcv_buf;
+  else {
+    if (!(minElt_vtx = intMalloc_symbfact(n - fstVtx_dns)))
+      ABORT("Malloc fails for minElt_vtx[].");
+    mem_dnsCS += n - fstVtx_dns;
+  }
+  
+  /* two passes: the first handles L (computeL == TRUE), the second U */
+  while (computeL || computeU) {
+    if (computeL) {
+      sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub;
+      /* the L pass uses cntelt_vtcs as its x_newelts work array */
+      x_newelts = Llu_symbfact->cntelt_vtcs;
+      x_newelts_L = x_newelts;
+    }
+    else {
+      sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub;
+    }
+
+    /* use minElt_vtx to determine starting vertex of each nonzero element */
+    /* n means "element not seen yet" */
+    for (i = 0; i < n - fstVtx_dns; i++)
+      minElt_vtx[i] = n;
+
+    ind_blk = cur_blk;
+    vtx_lid = fstVtx_dns_lid;
+    nvtcs_upd = 0;
+    /* scan the local blocks of this separator; begEndBlks_loc holds
+       (begin, end) pairs, hence the stride of 2 */
+    while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && 
+	   ind_blk < 2 * VInfo->nblks_loc) {	  
+      fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
+      lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
+      ind_blk += 2;
+      nvtcs_upd += lstVtx_blk - fstVtx_blk;
+      for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
+	j = xsub[vtx_lid];
+	/* an EMPTY entry ends the list of a vertex early */
+	while (j < xsub[vtx_lid+1] && sub[j] != EMPTY) {
+	  PS->nops ++;
+	  vtx_elt = sub[j] - fstVtx_dns;
+	  /* keep only the first local vertex that references the element */
+	  if (minElt_vtx[vtx_elt] == n) {
+	    minElt_vtx[vtx_elt] = vtx;
+	  }
+	  j ++;
+	}
+      }	  
+    }
+    if (!computeL) {
+      if (!(x_newelts_U = intMalloc_symbfact(nvtcs_upd + 1)))
+	ABORT("Malloc fails for x_newelts_U[].");
+      mem_dnsCS += nvtcs_upd + 1;
+      x_newelts = x_newelts_U;
+    }
+    else {
+      /* save the value in cntelt_vtcs[lstVtx_blk_lid] */
+      /* it is written back to cntelt_vtcs[lstVtx_dns_lid] before returning */
+      save_cnt = x_newelts[vtx_lid];
+      lstVtx_dns_lid = vtx_lid;
+    }
+    
+    /* elementwise minimum over ndCom for the elements in [lstVtx, n);
+       the result lands in marker[lstVtx .. n) */
+    MPI_Allreduce (&(minElt_vtx[lstVtx - fstVtx_dns]), &(marker[lstVtx]), 
+		   n - lstVtx, mpi_int_t, MPI_MIN, ndCom);
+
+#if ( PRNTlevel>=1 )
+    PS->no_msgsCol += (float) (2 * (int_t) LOG2( npNode ));
+    PS->sz_msgsCol += (float) (n - lstVtx);
+    if (PS->maxsz_msgCol < n - lstVtx) 
+      PS->maxsz_msgCol = n - lstVtx;      
+#endif
+    
+    /* use x_newelts to determine counts of elements starting in each vertex */
+    for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++)
+      x_newelts[vtx_lid] = 0;
+    
+    for (vtx = lstVtx; vtx < n; vtx++) {
+      if (marker[vtx] != n) {
+	vtx_elt = marker[vtx];
+	if (OWNER( globToLoc[vtx_elt] ) == iam) {
+	  x_newelts[ LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_dns_lid ] ++;
+	}
+	else {
+	  /* find the first vertex > vtx_elt which belongs to iam */
+	  ind_blk = cur_blk;
+	  vtx_lid = 0;
+	  while (vtx_elt > VInfo->begEndBlks_loc[ind_blk] &&
+		 ind_blk < 2 * VInfo->nblks_loc) {
+	    vtx_lid += VInfo->begEndBlks_loc[ind_blk+1] -
+	      VInfo->begEndBlks_loc[ind_blk];
+	    ind_blk += 2;
+	  }
+	  if (VInfo->begEndBlks_loc[ind_blk] < lstVtx) {
+	    /* charge the element to the first vertex of that local block */
+	    x_newelts[vtx_lid] ++;		
+	    marker[vtx] = VInfo->begEndBlks_loc[ind_blk];
+	  }			    
+	  else
+	    /* no later local block inside this separator: drop the element */
+	    marker[vtx] = n;
+	}
+      }
+    }
+    
+    /* set up beginning of new elements for each local vtx */
+    /* (exclusive prefix sum of the counts; the total goes in the last slot) */
+    i = 0;
+    for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++) {
+      j = x_newelts[vtx_lid];
+      x_newelts[vtx_lid] = i;
+      i += j;
+    }
+    x_newelts[vtx_lid] = i;
+    newelts = NULL;
+    if (i != 0) {
+      if (!(newelts = intMalloc_symbfact(x_newelts[vtx_lid])))
+	ABORT("Malloc fails for newelts[].");    
+      mem_dnsCS += x_newelts[vtx_lid];
+      
+      /* bucket fill: x_newelts[v] is used as the write cursor of vertex v */
+      for (vtx = lstVtx; vtx < n; vtx++) {
+	if (marker[vtx] != n) {
+	  vtx_elt = marker[vtx];
+	  vtx_lid = LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_dns_lid;	  
+	  newelts[x_newelts[vtx_lid]] = vtx;
+	  x_newelts[vtx_lid] ++;
+	}
+      }
+    }
+    /* reset beginning of new elements for each local vertex */
+    /* (after the fill each cursor sits at the end of its bucket, so shifting
+       the values by one position restores the bucket starts) */
+    i = 0;
+    for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++) {
+      j = x_newelts[vtx_lid];
+      x_newelts[vtx_lid] = i;
+      i = j;
+    }
+
+    if (computeL == TRUE) {
+      computeL = FALSE;
+      newelts_L = newelts;
+    }
+    else {
+      computeU = FALSE;
+      newelts_U = newelts;
+    }
+  }
+  
+  for (i = fstVtx_dns; i < n; i++)
+    marker[i] = EMPTY;
+  mark = 0;
+  
+  /* update vertices */
+  prval = n; 	    
+  ind_blk = cur_blk;
+  fstVtx_dns = VInfo->begEndBlks_loc[ind_blk];
+  vtx_lid = LOCAL_IND( globToLoc[fstVtx_dns] );
+  while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && 
+	 ind_blk < 2 * VInfo->nblks_loc) {	  
+    fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
+    lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
+    ind_blk += 2;
+    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
+      /* vtx_lid_x indexes x_newelts_L / x_newelts_U, which are relative to
+	 the first dense local vertex */
+      vtx_lid_x = vtx_lid - fstVtx_dns_lid;
+      Llu_symbfact->xlsub[vtx_lid] = *p_nextl;
+      Llu_symbfact->xusub[vtx_lid] = *p_nextu;
+      if (vtx == fstVtx_blk || x_newelts_L[vtx_lid_x+1] != x_newelts_L[vtx_lid_x] ||
+	  x_newelts_U[vtx_lid_x+1] != x_newelts_U[vtx_lid_x]) {
+	/* a new supernode starts */
+	snlid = vtx_lid;
+	snrep = vtx;
+	if (mark + 2 > n) {
+	  /* reset to EMPTY marker array */
+	  for (i = 0; i < n; i++)
+	    marker[i] = EMPTY;
+	  mark = 0;
+	}
+
+	/* write the L structure first, then the U structure */
+	computeL = TRUE;
+	computeU = FALSE;
+	while (computeL || computeU) {
+	  if (computeL) {
+	    sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub;
+	    x_newelts = x_newelts_L; newelts = newelts_L;
+	    next = *p_nextl;
+	  }
+	  else {
+	    sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub;
+	    x_newelts = x_newelts_U; newelts = newelts_U;
+	    next = *p_nextu;
+	  }
+	  xsub[vtx_lid] = next;
+
+	  /* TEST available memory */
+	  j = x_newelts[vtx_lid_x+1] + lstVtx - vtx;
+	  if ((computeL && next+j >= MEM_LSUB(Llu_symbfact, VInfo)) ||
+	      (computeU && next+j >= MEM_USUB(Llu_symbfact, VInfo))) {
+	    /* NOTE(review): computeL (TRUE/FALSE) is passed where other
+	       callers pass LSUB/USUB -- confirm the values coincide.
+	       NOTE(review): the early return below skips the frees and the
+	       cntelt_vtcs restore done at the end of this function. */
+	    if (mem_error =
+		psymbfact_LUXpandMem (iam, n, vtx, next, next + j,
+				      computeL, DNS_CURSEP, 1,
+				      Pslu_freeable, Llu_symbfact, VInfo, PS))
+	      return (mem_error);
+	    /* re-read the array pointer after the expansion call */
+	    if (computeL) sub = Llu_symbfact->lsub;
+	    else sub = Llu_symbfact->usub; 
+	  }
+	  
+	  /* dense part: L includes the diagonal vtx, U starts after it */
+	  if (computeL)  i = vtx;
+	  else           i = vtx+1;
+	  while (i < lstVtx) {
+	    sub[next] = i; next ++;
+	    i ++;
+	  }
+	  PS->nops += x_newelts[vtx_lid_x+1];
+	  /* append every new element bucketed for local vertices up to and
+	     including this one (indices 0 .. x_newelts[vtx_lid_x+1]-1) */
+	  for (i = 0; i < x_newelts[vtx_lid_x+1]; i++) {
+	    vtx_elt = newelts[i];
+	    sub[next] = vtx_elt; next ++;
+	    /* during the U pass marker[e] == mark-1 means e was just added
+	       to L as well; prval tracks the smallest such element */
+	    if (computeU && vtx_elt < prval 
+		&& marker[vtx_elt] == mark-1)
+	      prval = vtx_elt;
+	    marker[vtx_elt] = mark;
+	  }
+	  if (computeL) {
+	    computeL = FALSE; computeU = TRUE;
+	    *p_nextl = next;
+	  }
+	  else {
+	    computeU = FALSE;
+	    *p_nextu = next;
+	  }
+	  mark ++;	  
+	}	  
+	if (vtx != fstVtx_blk)
+	  (*p_nsuper_loc) ++;
+      } /* a new supernode starts */
+      /* vtx belongs to the current supernode */
+      Pslu_freeable->supno_loc[vtx_lid] = *p_nsuper_loc;
+    } 
+    (*p_nsuper_loc) ++;
+  }
+  
+  /* NOTE(review): snlid and snrep are only assigned inside the loop above;
+     if ind_blk > 0 but that loop ran zero times they are read
+     uninitialized here -- confirm callers rule this out. */
+  if (ind_blk > 0) {
+    /* if iam owns blocks of this level */
+    /* i, j: number of entries of the last supernode in lsub / usub */
+    i = *p_nextl - Llu_symbfact->xlsub[snlid];
+    j = *p_nextu - Llu_symbfact->xusub[snlid];
+    
+    if (VInfo->begEndBlks_loc[ind_blk - 1] == lstVtx && i > 1 && j > 0) {
+      /* if iam the last processor owning a block of this level */
+      computeL = TRUE; computeU = FALSE;
+      /* prune the structure */
+      while (computeL || computeU) {	   
+	if (computeL) {
+	  sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub;
+	  next = *p_nextl;
+	  computeL = FALSE; computeU = TRUE;
+	}
+	else {
+	  sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub;
+	  next = *p_nextu;
+	  computeU = FALSE;
+	}
+	
+	kmin = xsub[snlid];
+	kmax = next - 1;
+	if (prval != n) {
+	  maxElt = prval;
+	  /* move the entries <= prval to the front of the list and remember
+	     where prval itself ends up (prpos) */
+	  while (kmin <= kmax) {
+	    /* Do a quicksort-type partition. */    
+	    if (sub[kmax] > prval) 
+	      kmax--;
+	    else if (sub[kmin] <= prval) {
+	      kmin++;
+	    }
+	    else { /* kmin doesn't belong to G^s(L), and kmax belongs: 
+		    * 	   interchange the two subscripts
+		    */
+	      ktemp = sub[kmin];
+	      sub[kmin] = sub[kmax];
+	      sub[kmax] = ktemp;
+	      kmin ++;
+	      kmax --;
+	    }
+	    if (sub[kmin-1] == prval)
+	      prpos = kmin - 1;
+	  }
+	}
+	else {
+	  maxElt = EMPTY;
+	  while (kmin <= kmax) {
+	    /* compute maximum element of L(:, vtx) */
+	    if (sub[kmin] > maxElt) {
+	      maxElt = sub[kmin];
+	      prpos = kmin;
+	    }
+	    kmin ++;	      
+	  }
+	}
+	/* swap so that maxElt becomes the first entry of the supernode */
+	ktemp = sub[xsub[snlid]];
+	sub[xsub[snlid]] = maxElt;
+	sub[prpos] = ktemp;
+      }
+      
+      /* setup snd_interSz information */
+      prval = Llu_symbfact->lsub[Llu_symbfact->xlsub[snlid]];
+      if (prval >= lstVtx) {
+	/* this supernode will be sent to the next layers of the tree */
+	/* climb one level per iteration; lstVtx becomes the end of the
+	   node reached at that level */
+	while (prval >= lstVtx && szSep != 1) {
+	  ind_sizes2 = ind_sizes1 + szSep + (ind_sizes2 - ind_sizes1) / 2;
+	  ind_sizes1 += szSep;
+	  lvl ++;
+	  szSep = szSep / 2;
+	  lstVtx = fstVtxSep[ind_sizes2] + sizes[ind_sizes2];
+	  CS->snd_interSz[lvl] += i + j + 4;
+	  CS->snd_LinterSz[lvl] += i + 2;
+	  if (CS->snd_vtxinter[lvl] == EMPTY)
+	    CS->snd_vtxinter[lvl] = snrep;
+	}
+      }
+    }
+  }
+
+  /* restore value in cntelt_vtcs */
+  if (lstVtx_dns_lid != EMPTY)
+    Llu_symbfact->cntelt_vtcs[lstVtx_dns_lid] = save_cnt;
+  *p_mark = mark;
+  /* minElt_vtx is private only when it was not borrowed from rcv_buf */
+  if (minElt_vtx != CS->rcv_buf)
+    SUPERLU_FREE (minElt_vtx);  
+  SUPERLU_FREE (x_newelts_U);
+  if (newelts_L) SUPERLU_FREE (newelts_L);
+  if (newelts_U) SUPERLU_FREE (newelts_U);
+  /* keep the largest amount of temporary memory used by this routine */
+  if (PS->szDnsSep < mem_dnsCS)
+    PS->szDnsSep = mem_dnsCS;
+
+  return 0;
+}
+
+/*! \brief
+
+<pre>
+   Finish the point-to-point exchange of rcv_intraLvl[fstP .. lstP) along
+   the local blocks of the current separator, then compute the structure
+   of the dense separator(s).
+
+   Every processor assigned to the current node must call this routine
+   when VInfo->filledSep == FILLED_SEP.
+   This is necessary since subsequent routines called from here use
+   MPI_Allreduce among all processors assigned to the current node.
+
+   Return value: 0 on success, otherwise the nonzero code returned by
+   dnsCurSep_symbfact or dnsUpSeps_symbfact.
+</pre>
+*/
+
+static int_t
+denseSep_symbfact 
+(
+ int   rcvd_dnsSep, /* =1 if processor received info that the separator
+		       became dense,
+		       =0 if myPE determined that separator is full */
+ int_t n,           /* Input - order of the matrix */
+ int   iam,         /* Input - my processor number */
+ int   ind_sizes1,
+ int   ind_sizes2,
+ int_t *sizes,     /* Input - sizes of each separator in the separator tree */
+ int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
+ int   szSep, 
+ int   fstP,        /* first pe assigned to current node */
+ int   lstP,        /* last pe assigned to current node */
+ int_t fstVtx_blkCyc, 
+ int_t nblk_loc,    /* block number in the block cyclic distribution of current
+		       supernode */
+ int_t *p_nextl,
+ int_t *p_nextu,
+ int_t *p_mark,
+ int_t *p_nsuper_loc,
+ int_t *marker,
+ MPI_Comm ndCom,
+ MPI_Comm *symb_comm, /* Input - communicator for symbolic factorization */
+ Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
+ Pslu_freeable_t *Pslu_freeable,
+ vtcsInfo_symbfact_t *VInfo,  /* Input - local info on vertices distribution */
+ comm_symbfact_t *CS,
+ psymbfact_stat_t *PS
+) 
+{
+  int   nprocsLvl, dest, src, sndTag, rcvTag;
+  int_t sendsLeft, recvsLeft;
+  int_t blk, err;
+  int_t *rcv_intraLvl;
+  int_t lstVtx, endOfBlk, begOfNextBlk;
+  /* kept under their original names: presumably read by the OWNER macro */
+  int_t *globToLoc, maxNvtcsPProc;
+  MPI_Status status;
+
+  globToLoc     = Pslu_freeable->globToLoc;
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  rcv_intraLvl  = CS->rcv_intraLvl;
+  nprocsLvl     = lstP - fstP;
+  /* one past the last vertex of the current separator */
+  lstVtx        = fstVtxSep[ind_sizes2] + sizes[ind_sizes2];
+
+  /* number of messages still to be sent / received by this process */
+  sendsLeft = (nblk_loc == 0) ? 2 : 1;
+  recvsLeft = (nblk_loc == 0) ? 1 : (rcvd_dnsSep ? 0 : 1);
+  if (iam == fstP && rcvd_dnsSep && nblk_loc == 1)
+    recvsLeft ++;
+
+  /* first exchange msgs with all processors assigned to current node */
+  blk = VInfo->curblk_loc;
+  while ((sendsLeft || recvsLeft) && VInfo->begEndBlks_loc[blk] < lstVtx) {
+    if (sendsLeft) {
+      endOfBlk = VInfo->begEndBlks_loc[blk + 1];
+      if (endOfBlk != lstVtx) {
+        /* forward to the owner of the vertex that follows this block */
+        sndTag = (int) (tag_intraLvl + nblk_loc);
+        dest = OWNER( globToLoc[endOfBlk]);
+        MPI_Send (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, dest,
+                  sndTag, (*symb_comm));
+#if ( PRNTlevel>=1 )
+        PS->no_shmSnd += (float) 1;
+#endif
+      }
+      sendsLeft --;
+    }
+
+    /* move on to the next local block */
+    blk += 2;
+    nblk_loc ++;
+    begOfNextBlk = VInfo->begEndBlks_loc[blk];
+    if (recvsLeft && begOfNextBlk < lstVtx) {
+      rcvTag = (int) (tag_intraLvl + nblk_loc);
+      if (iam == fstP)
+        rcvTag --;
+      /* receive from the owner of the vertex that precedes the block */
+      src = OWNER( globToLoc[begOfNextBlk - 1]);
+      MPI_Recv (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, src,
+                rcvTag, (*symb_comm), &status);
+#if ( PRNTlevel>=1 )
+      PS->no_shmRcvd += (float) 1;
+#endif
+      recvsLeft --;
+    }
+  }
+
+  /* compute the dense structure; a nonzero code is handed to the caller */
+  if (VInfo->filledSep == FILLED_SEP)
+    err = dnsCurSep_symbfact (n, iam, ind_sizes1, ind_sizes2, sizes,
+                              fstVtxSep, szSep, nprocsLvl, rcvd_dnsSep,
+                              p_nextl, p_nextu, p_mark, p_nsuper_loc,
+                              marker, ndCom,
+                              Llu_symbfact, Pslu_freeable, VInfo, CS, PS);
+  else if (rcvd_dnsSep)
+    err = dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2,
+                              sizes, fstVtxSep, EMPTY,
+                              Llu_symbfact, Pslu_freeable, VInfo, CS, PS,
+                              p_nextl, p_nextu, p_nsuper_loc);
+  else
+    err = 0;
+
+  return err;
+}
+
+
+/*! \brief
+
+<pre>
+   Inter-level update for the current node of the separator tree.
+
+   The processes of ndComm first agree (MPI_MAX all-reduce) on the largest
+   send size and on the filledSep state.  Unless a quick return applies,
+   each process then packs the L and U structures of the vertices of its
+   earlier local blocks (those before VInfo->curblk_loc, starting at
+   CS->snd_vtxinter[lvl]) that reference a vertex of [fstVtx, lstVtx),
+   exchanges the message sizes with MPI_Alltoall, sends the packed data
+   with non-blocking sends, and applies its own as well as every received
+   structure through rl_update().
+
+   Return value: 0, or the nonzero code returned by dnsUpSeps_symbfact.
+   NOTE(review): the values returned by rl_update() are stored in
+   mem_error but never tested.
+</pre>
+*/
+static int_t
+interLvl_symbfact
+(
+ SuperMatrix *A, /* Input - input matrix A */
+ int   iam,      /* Input - my processor number */  
+ int   lvl,      /* Input - current level in the separator tree */ 
+ int   szSep,    /* Input - size of the current separator (node) */
+ int   fstP,     /* Input - first processor assigned to current node */
+ int   lstP,     /* Input - last processor assigned to current node */
+ int   ind_sizes1,
+ int   ind_sizes2, 
+ int_t *sizes,     /* Input - sizes of each node in the separator tree */
+ int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
+ int_t *p_nextl,
+ int_t *p_nextu,
+ int_t *p_nsuper_loc,
+ int_t *pmark,   /* mark for symbfact */
+ int_t *marker,  /* temp array used for marking */
+ Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
+ Pslu_freeable_t *Pslu_freeable,
+ comm_symbfact_t *CS,/* infos on communication data structures */
+ vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
+ psymbfact_stat_t *PS,
+ MPI_Comm ndComm,
+ MPI_Comm    *symb_comm /* Input - communicator for symbolic factorization */
+ )
+{
+  MPI_Status  *status; 
+  MPI_Request *request_snd, *request_rcv;
+  
+  int   nprocsLvl, rcvdP, p, filledSep_lvl;
+  int   toSend, toSendL, toSendU;
+  int_t *rcv_interLvl;
+  int_t *snd_interLvl, *snd_interLvl1, *snd_interLvl2,
+    snd_interLvlSz, snd_LinterLvlSz, snd_vtxLvl;
+  int_t  vtx_elt, update_loc, code_err;
+  int_t *lsub, *xlsub, *usub, *xusub;
+  int_t *lsub_rcvd, lsub_rcvd_sz, *usub_rcvd, usub_rcvd_sz;
+  int_t  n, mark, max_rcvSz; 
+  int_t nextl, nextu, ind_blk, vtx_lid, k, count, nelts, 
+    lstVtxLvl_loc, lstVtxLvl_loc_lid, mem_error;
+  int_t fstVtx_blk, lstVtx_blk, i, j, vtx, prElt_L, prElt_U, 
+    snd_indBlk, prElt_ind;
+  /* NOTE(review): maxNvtcsPProc is not referenced by name below; presumably
+     the LOCAL_IND / OWNER macros read it -- confirm before removing. */
+  int_t fstVtxLvl_loc, nvtcsLvl_loc, maxNvtcsPProc, *globToLoc, 
+    fstVtx, lstVtx;
+  int  ind1, nprocsToRcv, nprocsToSnd, ind2, ind_l, ind_u, ij, ik;
+  int_t req_ind, sent_msgs, req_ind_snd;
+  int_t initInfo_loc[2], initInfo_gl[2];
+
+  /* Initialization */
+  n = A->ncol;
+  fstVtx          = fstVtxSep[ind_sizes2];
+  lstVtx          = fstVtx + sizes[ind_sizes2];
+  maxNvtcsPProc   = Pslu_freeable->maxNvtcsPProc;
+  globToLoc       = Pslu_freeable->globToLoc;
+  nprocsLvl       = lstP - fstP;
+  rcv_interLvl    = CS->rcv_interLvl;
+  snd_interLvl    = CS->snd_interLvl;
+  snd_interLvlSz  = CS->snd_interSz[lvl];
+  snd_LinterLvlSz = CS->snd_LinterSz[lvl];
+  snd_vtxLvl      = CS->snd_vtxinter[lvl];
+  fstVtxLvl_loc   = VInfo->begEndBlks_loc[VInfo->curblk_loc];
+  nvtcsLvl_loc    = VInfo->nvtcsLvl_loc;
+  request_snd = NULL;
+  request_rcv = NULL;
+  status = NULL;
+  mark = *pmark;
+  
+  lsub    = Llu_symbfact->lsub;   xlsub    = Llu_symbfact->xlsub;
+  usub    = Llu_symbfact->usub;   xusub    = Llu_symbfact->xusub;
+
+  /* snd_vtxLvl denotes the first vertex from which iam needs
+     to send data.  
+     snd_interLvlSz denotes maximum size of the send data,
+     snd_LinterLvlSz denotes send data corresponding to L part */
+
+  /* determine maximum size of receive buffer and information
+   on filled sep */
+  /* nothing is sent when either the L part or the U part is empty */
+  if (snd_interLvlSz != 0) {
+    if (snd_LinterLvlSz == 0) 
+      snd_interLvlSz = 0;
+    if (snd_interLvlSz - snd_LinterLvlSz == 0)
+      snd_interLvlSz = 0;
+  }
+  
+  initInfo_loc[0] = snd_interLvlSz;
+  initInfo_loc[1] = (int_t) VInfo->filledSep;
+  MPI_Allreduce (initInfo_loc, initInfo_gl, 2, 
+		 mpi_int_t, MPI_MAX, ndComm);
+#if ( PRNTlevel>=1 )
+  PS->no_msgsCol += (float) (2 * (int_t) LOG2( nprocsLvl ));
+  PS->sz_msgsCol += 2;
+  if (PS->maxsz_msgCol < 2) 
+    PS->maxsz_msgCol = 2;      
+#endif  
+  max_rcvSz = initInfo_gl[0];
+  filledSep_lvl = (int) initInfo_gl[1];
+
+  if (filledSep_lvl == FILLED_SEPS) {
+    /* quick return if all upper separators are dense */
+    /* a process that had not reached that state locally adopts it and
+       builds the dense structure of the upper separators first */
+    if (VInfo->filledSep != FILLED_SEPS) {
+      VInfo->filledSep = FILLED_SEPS;
+      if (mem_error = 
+	  dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, sizes, 
+			      fstVtxSep,
+			      EMPTY, Llu_symbfact, Pslu_freeable, VInfo, CS, PS,
+			      p_nextl, p_nextu, p_nsuper_loc))
+	return (mem_error);
+    }
+    return 0;
+  }
+
+  if (max_rcvSz == 0)
+    /* quick return if no communication necessary */
+    return 0; 
+  
+  /* allocate data for the send buffer */  
+  /* (the buffer only grows: it is replaced when too small for this level) */
+  if (snd_interLvlSz)
+    if (CS->snd_bufSz < snd_interLvlSz) {
+      PS->maxSzBuf += snd_interLvlSz - CS->snd_bufSz;
+      if (CS->snd_bufSz != 0)
+	/* not first time allocate memory */
+	SUPERLU_FREE (CS->snd_buf);
+      CS->snd_bufSz = snd_interLvlSz;
+      if (!(CS->snd_buf = intMalloc_symbfact (snd_interLvlSz))) {
+	ABORT("Malloc fails for snd_buf[].");
+      }
+    }
+    
+  /* snd_interLvl : to which processors the data need to be sent 
+   * information setup during the copy of data to be sent in the buffer  
+   * rcv_interLvl : from which processors iam receives update data  */
+  /* two entries per process p: [2*p] for the L part, [2*p+1] for the U part */
+  for (p = 2*fstP; p < 2*lstP; p++)
+    snd_interLvl[p] = EMPTY;
+
+  if (snd_interLvlSz == 0 && nvtcsLvl_loc == 0) {
+    /* nothing to send and no local vertex to update: still take part in
+       the collective size exchange, then leave */
+    code_err = MPI_Alltoall (&(snd_interLvl[2*fstP]), 2, mpi_int_t,
+			     &(rcv_interLvl[2*fstP]), 2, mpi_int_t,
+			     ndComm);
+#if ( PRNTlevel>=1 )
+    PS->no_msgsCol += (float) (2 * (int_t) LOG2( nprocsLvl ));
+    PS->sz_msgsCol += 2;
+    if (PS->maxsz_msgCol < 2) 
+      PS->maxsz_msgCol = 2;      
+#endif  
+    return 0;
+  }
+  
+  /* in interLvlInfos, 
+   * obtain from which processors iam receives update information */
+  update_loc = FALSE;
+  /* snd_buf layout: L records start at 0, U records at snd_LinterLvlSz */
+  nextl = 0; 
+  nextu = snd_LinterLvlSz;
+  if (snd_interLvlSz != 0) {
+    /* copy data to be sent */
+    /* find index block from where to send data */
+    ind_blk = VInfo->curblk_loc;
+    while (snd_vtxLvl < VInfo->begEndBlks_loc[ind_blk]) {
+      ind_blk -= 2;
+    }
+    snd_indBlk = ind_blk;
+    vtx_lid = LOCAL_IND( globToLoc[snd_vtxLvl] );
+    for (; ind_blk < VInfo->curblk_loc; ind_blk += 2) {
+      fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
+      /* the first block is entered at snd_vtxLvl, not at its beginning */
+      if (ind_blk == snd_indBlk)
+	fstVtx_blk = snd_vtxLvl;
+      lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
+      for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) {
+	toSendL = FALSE; toSendU = FALSE;
+	/* only vertices with both a non-empty L and a non-empty U list */
+	if (xlsub[vtx_lid] != xlsub[vtx_lid+1] && 
+	    xusub[vtx_lid] != xusub[vtx_lid+1]) {
+	  k = xlsub[vtx_lid];
+	  prElt_L = lsub[k];
+	  j = xusub[vtx_lid];
+	  prElt_U = usub[j];
+	  /* the first entry of each list bounds the part that is scanned */
+	  if (prElt_L >= fstVtx || prElt_U >= fstVtx) {
+	    if (prElt_L >= fstVtx)
+	      while (lsub[k] <= prElt_L && k < xlsub[vtx_lid + 1]) {
+		vtx_elt = lsub[k];
+		if (vtx_elt >= fstVtx && vtx_elt < lstVtx) {
+		  p = OWNER( globToLoc[vtx_elt] );
+		  if (p != iam) {
+		    /* vtx will be sent to another processor */
+		    snd_interLvl[2*p] = TRUE;
+		    toSendL = TRUE;
+		  }
+		  else
+		    update_loc = TRUE;
+		}
+		k++;
+	      }
+	    if (prElt_U >= fstVtx)
+	      while (usub[j] <= prElt_U && j < xusub[vtx_lid + 1]) {
+		vtx_elt = usub[j];
+		if (vtx_elt >= fstVtx && vtx_elt < lstVtx) {
+		  p = OWNER( globToLoc[vtx_elt] );
+		  if (p != iam) {
+		    /* vtx will be sent to another processor */
+		    snd_interLvl[2*p+1] = TRUE;
+		    toSendU = TRUE;
+		  }
+		  else
+		    update_loc = TRUE;
+		}
+		j ++;
+	      }
+	    if (toSendL || toSendU) {
+	      /* L(:, vtx) and U(vtx, :) will be sent to processors */
+	      /* record = vertex at DIAG_IND, count at NELTS_IND, then the
+		 elements themselves */
+	      CS->snd_buf[nextu + DIAG_IND]  = vtx;
+	      nelts = xusub[vtx_lid+1] - xusub[vtx_lid];
+	      CS->snd_buf[nextu + NELTS_IND] = nelts;
+	      nextu += 2;
+	      for (j = xusub[vtx_lid]; j < xusub[vtx_lid+1]; j++, nextu ++) {
+		CS->snd_buf[nextu] = usub[j]; 
+	      }
+	      CS->snd_buf[nextl + DIAG_IND] = vtx;
+	      nelts = xlsub[vtx_lid+1] - xlsub[vtx_lid];
+	      CS->snd_buf[nextl + NELTS_IND] = nelts; 
+	      nextl += 2;
+	      for (j = xlsub[vtx_lid]; j < xlsub[vtx_lid+1]; j++, nextl ++) {
+		CS->snd_buf[nextl] = lsub[j];
+	      }
+	    }
+	  }
+	}
+      }
+    }
+    lstVtxLvl_loc = vtx;
+    lstVtxLvl_loc_lid = vtx_lid;
+  }
+  
+  /* if either part of the buffer stayed empty, cancel every send */
+  if (nextl == 0 || nextu - snd_LinterLvlSz == 0) {
+    for (p = 2*fstP; p < 2*lstP; p++)
+      snd_interLvl[p] = EMPTY;
+  }
+  
+  /* replace the TRUE flags by the sizes of the L and U messages; every
+     flagged process is sent the same, whole buffer */
+  nprocsToSnd = 0;
+  for (p = 2*fstP; p < 2*lstP; p +=2) {
+    if (snd_interLvl[p] != EMPTY || snd_interLvl[p+1] != EMPTY) {
+      snd_interLvl[p] = nextl;
+      snd_interLvl[p+1] = nextu - snd_LinterLvlSz;
+      nprocsToSnd ++;
+    }
+  }
+  
+  MPI_Alltoall (&(snd_interLvl[2*fstP]), 2, mpi_int_t,
+		&(rcv_interLvl[2*fstP]), 2, mpi_int_t, ndComm);    
+#if ( PRNTlevel>=1 )
+  PS->no_msgsCol += (float) (2 * (int_t) LOG2( nprocsLvl ));
+  PS->sz_msgsCol += 2 * nprocsLvl;
+  if (PS->maxsz_msgCol < 2 * nprocsLvl) 
+    PS->maxsz_msgCol = 2 * nprocsLvl;      
+#endif    
+
+  /* lay out the receive buffer: ptr_rcvBuf[2*p] / [2*p+1] are the offsets
+     of the L / U data coming from process p */
+  max_rcvSz = 0;
+  nprocsToRcv = 0;
+  for (p = 2*fstP; p < 2*lstP; p +=2) {
+    CS->ptr_rcvBuf[p] = max_rcvSz;
+    if (rcv_interLvl[p] != EMPTY) 
+      max_rcvSz += rcv_interLvl[p];
+    CS->ptr_rcvBuf[p+1] = max_rcvSz;
+    if (rcv_interLvl[p+1] != EMPTY) 
+      max_rcvSz += rcv_interLvl[p+1];
+    if (rcv_interLvl[p] != EMPTY || rcv_interLvl[p+1] != EMPTY) 
+      nprocsToRcv ++;
+  }
+
+  /* allocate data for the receive buffer */  
+  if (CS->rcv_bufSz < max_rcvSz) {
+    PS->maxSzBuf += max_rcvSz - CS->rcv_bufSz;
+    if (CS->rcv_bufSz != 0) /* not first time allocate memory */
+      SUPERLU_FREE (CS->rcv_buf);
+    CS->rcv_bufSz = max_rcvSz;
+    if (!(CS->rcv_buf = intMalloc_symbfact (max_rcvSz))) {
+      ABORT("Malloc fails for rcv_buf[].");
+    }
+  }
+  
+  /* allocate memory for status arrays */
+  /* two requests (L and U message) per peer */
+  if (nprocsToSnd)
+    if ( !(request_snd = (MPI_Request*) 
+	   SUPERLU_MALLOC(2 * nprocsToSnd * sizeof(MPI_Request))))
+      ABORT("Not enough memory when allocating MPI_Request");
+  if (nprocsToRcv)
+    if ( !(request_rcv = (MPI_Request*) 
+	   SUPERLU_MALLOC(2 * nprocsToRcv * sizeof(MPI_Request))))
+      ABORT("Not enough memory when allocating MPI_Request");
+  /* NOTE(review): the message below mentions MPI_Request although the
+     array being allocated holds MPI_Status entries */
+  if (nprocsToRcv || nprocsToSnd)
+    if ( !(status = (MPI_Status*) 
+	   SUPERLU_MALLOC(2 * (lstP-fstP) * sizeof(MPI_Status))))
+      ABORT("Not enough memory when allocating MPI_Request");
+  
+  /* determine if we have to send data */
+  i = 0;
+  for (toSend = fstP, p = 2*fstP; p < 2*lstP; toSend++, p+=2) 
+    if (snd_interLvl[p] != EMPTY && toSend != iam) {
+      MPI_Isend (CS->snd_buf, nextl, mpi_int_t, toSend,
+		 tag_interLvl_LData, (*symb_comm), &(request_snd[2*i]));
+      MPI_Isend (&(CS->snd_buf[snd_LinterLvlSz]), 
+		 nextu - snd_LinterLvlSz, mpi_int_t, toSend,
+		 tag_interLvl_UData, (*symb_comm), &(request_snd[2*i+1]));
+      i++;
+#if ( PRNTlevel>=1 )
+      PS->no_msgsSnd += (float) 2;
+      PS->sz_msgsSnd += (float) (nextl + nextu - snd_LinterLvlSz);
+      if (PS->maxsz_msgSnd < nextl) PS->maxsz_msgSnd = nextl;
+      if (PS->maxsz_msgSnd < nextu - snd_LinterLvlSz) 
+	PS->maxsz_msgSnd = nextu - snd_LinterLvlSz;      
+#endif
+    }
+  
+  if (update_loc) {
+    /* use own data to update symbolic factorization */
+    /* the "received" arrays point directly into the local lsub / usub,
+       from snd_vtxLvl up to the last vertex scanned above */
+    vtx_lid = LOCAL_IND( globToLoc[snd_vtxLvl] );
+    lsub_rcvd    = &(lsub[xlsub[vtx_lid]]);
+    lsub_rcvd_sz = xlsub[lstVtxLvl_loc_lid] - xlsub[vtx_lid];
+    usub_rcvd    = &(usub[xusub[vtx_lid]]);
+    usub_rcvd_sz = xusub[lstVtxLvl_loc_lid] - xusub[vtx_lid];
+    
+    mem_error = 
+      rl_update (0, n, iam, lsub_rcvd, lsub_rcvd_sz,
+		 usub_rcvd, usub_rcvd_sz, snd_vtxLvl, EMPTY, snd_indBlk,
+		 fstVtxLvl_loc, lstVtx, nvtcsLvl_loc,
+		 1, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
+
+    /* recompute the pointers from Llu_symbfact before the second call,
+       which is made with the L and U roles swapped */
+    lsub_rcvd    = &(Llu_symbfact->lsub[xlsub[vtx_lid]]);
+    lsub_rcvd_sz = xlsub[lstVtxLvl_loc_lid] - xlsub[vtx_lid];
+    usub_rcvd    = &(Llu_symbfact->usub[xusub[vtx_lid]]);
+    usub_rcvd_sz = xusub[lstVtxLvl_loc_lid] - xusub[vtx_lid];
+    lsub = Llu_symbfact->lsub; usub = Llu_symbfact->usub;
+    mem_error = 
+      rl_update (0, n, iam, usub_rcvd, usub_rcvd_sz,
+		 lsub_rcvd, lsub_rcvd_sz, snd_vtxLvl, EMPTY, snd_indBlk,
+		 fstVtxLvl_loc, lstVtx, nvtcsLvl_loc,
+		 0, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
+    lsub = Llu_symbfact->lsub; usub = Llu_symbfact->usub;
+  }
+
+  /* post non-blocking receives for all the incoming messages */
+  /* requests are stored in pairs: [i] is the L message, [i+1] the U message */
+  i = 0;
+  for (rcvdP = fstP, p = 2*fstP; p < 2*lstP; rcvdP++, p += 2) 
+    if (rcv_interLvl[p] != EMPTY) {
+      lsub_rcvd    = &(CS->rcv_buf[CS->ptr_rcvBuf[p]]);
+      MPI_Irecv (lsub_rcvd, rcv_interLvl[p], mpi_int_t, rcvdP,
+		 tag_interLvl_LData, (*symb_comm), &(request_rcv[i]));
+      usub_rcvd    = &(CS->rcv_buf[CS->ptr_rcvBuf[p+1]]);
+      MPI_Irecv (usub_rcvd, rcv_interLvl[p+1], mpi_int_t, rcvdP,
+		 tag_interLvl_UData, (*symb_comm), &(request_rcv[i+1]));
+      i += 2;
+#if ( PRNTlevel>=1 )
+      PS->no_msgsRcvd += (float) 2;
+      PS->sz_msgsRcvd += (float) (rcv_interLvl[p] + rcv_interLvl[p+1]);
+      if (PS->maxsz_msgRcvd < rcv_interLvl[p])
+	PS->maxsz_msgRcvd = rcv_interLvl[p];
+      if (PS->maxsz_msgRcvd < rcv_interLvl[p+1])
+	PS->maxsz_msgRcvd = rcv_interLvl[p+1];
+#endif
+    }
+  
+  /* wait until messages are received and update local data */
+  for (i = 0; i < nprocsToRcv; i++) {
+    /* ind1: index of whichever request completed */
+    MPI_Waitany (2*nprocsToRcv, request_rcv, &ind1, status);
+    /* map ind1 back to the sending process: walk the senders in the order
+       used when posting the receives, two request slots per sender */
+    ij = 0;
+    for (p = fstP; p < lstP; p++)
+      if (rcv_interLvl[2*p] != EMPTY) {
+	if (ij <= ind1 && ind1 < ij+2) {
+	  /* p = lstP ends the search */
+	  rcvdP = p; p = lstP;
+	  /* ind2: the other request of the same pair */
+	  if (ind1 == ij) ind2 = ij+1;
+	  else ind2 = ind1 - 1;
+	  ind_l = ij; ind_u = ij+1;
+	}
+	ij += 2;
+      }
+    /* ij: length of the message that completed first, ik: of its partner */
+    MPI_Get_count (status, mpi_int_t, &ij);
+    MPI_Wait (&(request_rcv[ind2]), status);
+    MPI_Get_count (status, mpi_int_t, &ik);    
+    if (ind1 == ind_l) {
+      lsub_rcvd_sz = ij;
+      usub_rcvd_sz = ik;
+    } else {
+      lsub_rcvd_sz = ik;
+      usub_rcvd_sz = ij;
+    }
+    lsub_rcvd    = &(CS->rcv_buf[CS->ptr_rcvBuf[2*rcvdP]]);
+    usub_rcvd    = &(CS->rcv_buf[CS->ptr_rcvBuf[2*rcvdP+1]]);
+    
+    /* use received data to update symbolic factorization information */
+    mem_error = 
+      rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz,
+		 usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY,
+		 fstVtxLvl_loc, lstVtx, nvtcsLvl_loc,
+		 1, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
+    lsub = Llu_symbfact->lsub;
+    mem_error = 
+      rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz,
+		 lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY,
+		 fstVtxLvl_loc, lstVtx, nvtcsLvl_loc,
+		 0, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
+    usub = Llu_symbfact->usub;      
+  }
+  
+  /* make sure every send has completed before the requests are released */
+  if (nprocsToSnd)
+    MPI_Waitall (2*nprocsToSnd, request_snd, status);
+
+  *pmark = mark;
+  if (request_snd != NULL) SUPERLU_FREE (request_snd);
+  if (request_rcv != NULL) SUPERLU_FREE (request_rcv);
+  if (status != NULL) SUPERLU_FREE (status);
+
+  return 0;
+}
+
+/*! \brief
+
+<pre>
+   Free the communicators stored in commLvls[] for the nodes of the
+   separator tree that contain iam.
+
+   Layout walked here: the root communicator is stored at index
+   2*nprocs-2; each following level has szSep = 2, 4, ... nodes and
+   occupies the szSep consecutive entries that start at index i, with i
+   decreasing by szSep per level while it stays positive.  At each level
+   only the entry of the node whose process range [fstP, fstP+npNode)
+   contains iam is freed.
+
+   Entries equal to MPI_COMM_NULL are skipped, and nothing is freed at a
+   level where no node contains iam.
+</pre>
+*/
+static void
+freeComm
+(
+ int   iam,          /* Input -my processor number */
+ int   nprocs,       /* Input -number of procs for the symbolic fact. */
+ MPI_Comm *commLvls, /* Input/Output -communicators for the nodes in the
+			sep tree; the freed entries are reset by MPI */
+ MPI_Comm *symb_comm /* Input - communicator for symbolic factorization
+			(not used by this routine) */
+ )
+{
+  int szSep, i, j;
+  int npNode, fstP, ind;
+
+  /* root of the separator tree */
+  i = 2 * nprocs - 2;
+  if (commLvls[i] != MPI_COMM_NULL)
+    MPI_Comm_free (&(commLvls[i]));
+
+  szSep = 2;
+  i -= szSep;
+
+  while (i > 0) {
+    /* for each level in the separator tree */
+    npNode = nprocs / szSep;
+    fstP = 0;
+    /* forget the node found at the previous level */
+    ind = -1;
+    /* for each node in the level */
+    for (j = i; j < i + szSep; j++) {
+      if (fstP <= iam && iam < fstP + npNode) {
+        ind = j;
+      }
+      fstP += npNode;
+    }
+    /* free only a communicator that was really created for iam */
+    if (ind >= 0 && commLvls[ind] != MPI_COMM_NULL)
+      MPI_Comm_free ( &(commLvls[ind]) );
+    szSep *= 2;
+    i -= szSep;
+  }
+}
+
+/*! \brief Create the communicators of the separator tree: commLvls[2*nprocs-2]
+ *  duplicates symb_comm; on each lower level symb_comm is split so that the
+ *  processors of one node share a communicator, stored at that node's index.
+ *  All other entries of commLvls are left as MPI_COMM_NULL. */
+static void
+createComm 
+(
+ int   iam,          /* Input -my processor number */
+ int   nprocs,       /* Input -number of procs for the symbolic factorization */
+ MPI_Comm *commLvls, /* Output -communicators for the nodes in the sep tree */
+ MPI_Comm *symb_comm /* Input - communicator for symbolic factorization */
+ )
+{
+  int szSep, i, j;
+  int npNode, fstP, lstP, ind, col, key;
+  
+  for (i=0; i < 2*nprocs; i++)
+    commLvls[i] = MPI_COMM_NULL;
+
+  /* the root node of the tree is shared by all the processors */
+  i = 2 * nprocs - 2;
+  MPI_Comm_dup ((*symb_comm), &(commLvls[i]));
+  szSep = 2;
+  i -= szSep;
+
+  while (i > 0) {
+    /* for each level in the separator tree */
+    npNode = nprocs / szSep; 
+    fstP = 0; 
+    /* for each node in the level */
+    for (j = i; j < i + szSep; j++) {
+      lstP = fstP + npNode;
+      if (fstP <= iam && iam < lstP) {
+	ind = j;          /* node of this level that owns iam */
+	key = iam - fstP; /* ordering key inside the new communicator */
+	col = fstP;       /* color: first processor of the node */
+      }
+      fstP += npNode;
+    }
+    MPI_Comm_split ((*symb_comm), col, key, &(commLvls[ind]) );
+    
+    szSep *= 2;
+    i -= szSep;
+  }
+}
+
+/*! \brief Symbolic factorization of the blocks of one separator (the vertices
+ *  fstVtx..lstVtx-1) that are stored on this processor.  For each local block:
+ *  wait for the message counts sent by the owner of the preceding block and for
+ *  the announced L/U data, apply rl_update, factor the block with blk_symbfact
+ *  (or switch to denseSep_symbfact when the separator is flagged as filled),
+ *  then send the needed columns/rows of the block to the other processors. */
+static void
+intraLvl_symbfact 
+(
+ SuperMatrix *A, /* Input - original matrix A  */
+ int   iam,      /* Input - my processor number */
+ int   lvl,      /* Input - current level in the separator tree */
+ int   szSep,    /* Input - size of the current separator(node) */
+ int   ind_sizes1, /* forwarded to blk_symbfact and denseSep_symbfact */
+ int   ind_sizes2, /* forwarded to blk_symbfact and denseSep_symbfact */
+ int_t *sizes,     /* Input - sizes of each node in the separator tree */
+ int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
+ int   fstP,     /* Input - first processor assigned to current node */
+ int   lstP,     /* Input - one past the last processor of current node */
+ int_t fstVtx,   /* Input - first vertex of current node */
+ int_t lstVtx,   /* Input - one past the last vertex of current node */
+ Pslu_freeable_t *Pslu_freeable,   /* global LU data structures (modified) */
+ Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
+ vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
+ comm_symbfact_t *CS,   /* Input/Output - communication buffers and counters */
+ psymbfact_stat_t *PS,  /* Input/Output - statistics */
+ int_t *marker,    /* passed on to rl_update, blk_symbfact, denseSep_symbfact */
+ int_t *p_mark,    /* marker used to merge elements of vertices */
+ int_t *p_nextl,   /* ptr to nextl in lsub structure */
+ int_t *p_nextu,   /* ptr to nextu in usub structure */
+ int_t *p_neltsZr, /* no of artificial zeros introduced so far */
+ int_t *p_neltsTotal, /* no of nonzeros (including artificials) 
+			 computed so far */
+ int_t *p_nsuper_loc, /* passed on to blk_symbfact and denseSep_symbfact */
+ MPI_Comm ndComm,     /* passed on to denseSep_symbfact */
+ MPI_Comm    *symb_comm /* Input - communicator for symbolic factorization */
+ )
+{
+  int nprocsLvl, p, prvP, rcvP, toSend, rcvd_prvP, index_req[2], intSzMsg;
+  int_t fstVtx_loc_lid, fstVtx_loc, vtx, denseSep, tag, n, r, k, j;
+  int_t fstVtx_blk, fstVtx_blk_lid, lstVtx_blk, lstVtx_blk_lid, fstVtx_toUpd;
+  int_t nvtcs_blk, nvtcs_toUpd, nvtcsLvl_loc, xusub_end, xlsub_end;
+  int_t prv_fstVtx_blk, fstVtx_blkCyc, maxNeltsVtx_in, maxNmsgsToRcv;
+  int_t *rcv_intraLvl, *snd_intraLvl, *xlsub, *xusub, *lsub, *usub;
+  int_t *lsub_rcvd, lsub_rcvd_sz, *usub_rcvd, usub_rcvd_sz;
+  int_t nmsgsRcvd, nmsgsTRcv, sz_msg, nextl, nextu, nelts, vtx_lid;
+  int_t mem_error, lstBlkRcvd, upd_myD, nblk_loc;
+  int_t *globToLoc, maxNvtcsPProc;
+  int_t prElt_L, prElt_U, vtx_elt;
+
+  MPI_Status status[4];
+  MPI_Request request[4];
+  
+  /* Initializations */
+  lsub    = Llu_symbfact->lsub;   xlsub    = Llu_symbfact->xlsub;
+  usub    = Llu_symbfact->usub;   xusub    = Llu_symbfact->xusub;
+  
+  /* max number of msgs this processor can receive during 
+     intraLvl_symbfact routine */
+  maxNmsgsToRcv  = (lstVtx - fstVtx) / VInfo->maxSzBlk + 1;
+  maxNeltsVtx_in = VInfo->maxNeltsVtx;
+  globToLoc      = Pslu_freeable->globToLoc;
+  maxNvtcsPProc  = Pslu_freeable->maxNvtcsPProc;
+  n = A->ncol;
+  nprocsLvl       = lstP - fstP;
+  rcv_intraLvl    = CS->rcv_intraLvl;
+  snd_intraLvl    = CS->snd_intraLvl;
+  nvtcsLvl_loc    = VInfo->nvtcsLvl_loc;
+  nmsgsTRcv       = 0;
+  nmsgsRcvd       = 0;
+  nblk_loc        = 0;
+  nvtcs_toUpd     = nvtcsLvl_loc;
+  fstVtx_blk      = fstVtx;
+  denseSep        = FALSE;
+
+  /* determine first vertex that belongs to fstP */
+  k = fstVtx;
+  fstVtx_blkCyc = n;
+  while (k < lstVtx && fstVtx_blkCyc == n) {
+    p = OWNER( globToLoc[k] );
+    if (p == fstP)
+      fstVtx_blkCyc = k;
+    k += VInfo->maxSzBlk;
+  }
+
+  for (p = fstP; p < lstP; p++)
+    rcv_intraLvl[p] = 0;
+
+  for (r = 0; r < 3; r++) 
+    request[r] = MPI_REQUEST_NULL;
+
+  fstVtx_loc = VInfo->begEndBlks_loc[VInfo->curblk_loc];
+  fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] ); 
+  vtx = fstVtx_loc;
+  if (fstVtx_loc >= fstVtx_blkCyc)
+    nblk_loc = 1;
+  while (VInfo->begEndBlks_loc[VInfo->curblk_loc] < lstVtx && !VInfo->filledSep) {
+    CS->snd_intraSz  = 0;
+    CS->snd_LintraSz = 0;
+
+    lstBlkRcvd     = FALSE;
+    prv_fstVtx_blk = fstVtx_blk;
+    fstVtx_blk     = VInfo->begEndBlks_loc[VInfo->curblk_loc];
+    lstVtx_blk     = VInfo->begEndBlks_loc[VInfo->curblk_loc + 1];
+    fstVtx_toUpd   = VInfo->begEndBlks_loc[VInfo->curblk_loc + 2];
+    fstVtx_blk_lid = LOCAL_IND( globToLoc[fstVtx_blk] );
+    lstVtx_blk_lid = LOCAL_IND( globToLoc[lstVtx_blk - 1] + 1);
+    nvtcs_blk      = lstVtx_blk - fstVtx_blk;
+    nvtcs_toUpd   -= nvtcs_blk;
+    nmsgsTRcv      = n;
+    VInfo->maxNeltsVtx -= fstVtx_blk - prv_fstVtx_blk;
+
+    index_req[0] = EMPTY;
+    for (r = 0; r < 3; r++) 
+      request[r] = MPI_REQUEST_NULL;
+    if (fstVtx_blk != fstVtx) {
+      /* if not the first vertex of the level */
+      prvP           = OWNER( globToLoc[fstVtx_blk - 1] );
+      rcvd_prvP      = FALSE;
+      /* receive info on number messages to receive */
+      tag = tag_intraLvl + nblk_loc;
+      if (iam == fstP)  tag --;
+      
+      MPI_Irecv (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, prvP,
+		 tag, (*symb_comm), &(request[1]));
+
+      while (!rcvd_prvP || nmsgsRcvd < nmsgsTRcv) {
+	if (index_req[0] != 1) { /* no size receive is pending: post one */
+	  MPI_Irecv (&sz_msg, 1, mpi_int_t, 
+		     MPI_ANY_SOURCE, tag_intraLvl_szMsg, 
+		     (*symb_comm), &(request[0]));  
+	}
+	MPI_Waitany (2, request, index_req, status);
+	if (index_req[0] == 1) {
+	  /* receive information on no msgs to receive */
+#if ( PRNTlevel>=1 )
+	  PS->no_shmRcvd ++;
+#endif
+	  rcvd_prvP = TRUE;
+	  nmsgsTRcv = rcv_intraLvl[iam];
+	  /* if dense separator was detected by one of the 
+	     previous processors ... */
+	  if (nmsgsTRcv > maxNmsgsToRcv) {
+	    VInfo->filledSep = (int) nmsgsTRcv / maxNmsgsToRcv;
+	    nmsgsTRcv = nmsgsTRcv % maxNmsgsToRcv;
+	  }
+	  
+	  if (nmsgsTRcv == nmsgsRcvd) {
+	    /* MPI_Cancel (&(request[0])); */
+	    MPI_Send (&r, 1, mpi_int_t, iam, 
+		      tag_intraLvl_szMsg, (*symb_comm));
+	    MPI_Wait (&(request[0]), status);	    
+	  }
+	}
+	if (index_req[0] == 0) { /* request[0] completed: sz_msg is now valid */
+	  if (sz_msg > INT_MAX)
+	    ABORT("ERROR in intraLvl_symbfact size to send > INT_MAX\n");
+	  nmsgsRcvd ++;
+	  if (nmsgsTRcv == nmsgsRcvd)  lstBlkRcvd = TRUE; 
+	  rcvP = status->MPI_SOURCE;
+
+	  /* allocate enough space to receive data */
+	  if (CS->rcv_bufSz < sz_msg) {
+	    PS->maxSzBuf += sz_msg - CS->rcv_bufSz;
+	    if (CS->rcv_bufSz != 0)
+	      /* not first time allocate memory */
+	      SUPERLU_FREE (CS->rcv_buf);
+	    CS->rcv_bufSz = sz_msg;
+	    if (!(CS->rcv_buf = intMalloc_symbfact (sz_msg))) {
+	      ABORT("Malloc fails for rcv_buf[].");
+	    }
+	  }
+	  
+	  /* use received data to update symbolic factorization */
+	  lsub_rcvd = CS->rcv_buf;
+	  MPI_Recv (lsub_rcvd, sz_msg, mpi_int_t, 
+		    rcvP, tag_intraLvl_LData, (*symb_comm), status);
+	  MPI_Get_count (status, mpi_int_t, &intSzMsg);
+	  lsub_rcvd_sz = intSzMsg;
+	  usub_rcvd    = &(CS->rcv_buf[lsub_rcvd_sz]);
+	  MPI_Recv (usub_rcvd, sz_msg - lsub_rcvd_sz, 
+		    mpi_int_t, rcvP,
+		    tag_intraLvl_UData, (*symb_comm), status);
+	  MPI_Get_count (status, mpi_int_t, &intSzMsg);
+	  usub_rcvd_sz = intSzMsg;
+#if ( PRNTlevel>=1 )
+	  PS->no_shmRcvd ++;
+	  PS->no_msgsRcvd += (float) 2;
+	  PS->sz_msgsRcvd += (float) sz_msg;
+	  if (PS->maxsz_msgRcvd < lsub_rcvd_sz) PS->maxsz_msgRcvd = lsub_rcvd_sz;
+	  if (PS->maxsz_msgRcvd < usub_rcvd_sz) PS->maxsz_msgRcvd = usub_rcvd_sz;
+#endif
+
+	  if (!lstBlkRcvd) {
+	    mem_error = 
+	      rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz,
+			 usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY,
+			 fstVtx_blk, lstVtx, nvtcs_blk + nvtcs_toUpd,
+			 1, p_mark,
+			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
+	    lsub = Llu_symbfact->lsub;
+	    mem_error = 
+	      rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz,
+			 lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY, 
+			 fstVtx_blk, lstVtx, nvtcs_blk + nvtcs_toUpd,
+			 0, p_mark,
+			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
+	    usub = Llu_symbfact->usub;
+	  }
+	}
+      }
+    }
+  
+    if (VInfo->filledSep) {
+      mem_error = 
+	denseSep_symbfact (1, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
+			   szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc,
+			   p_nextl, p_nextu, p_mark, p_nsuper_loc, marker,
+			   ndComm, symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS);
+    }
+    else {
+      /* compute symbolic factorization for this block */
+      if (!lstBlkRcvd) {
+	lsub_rcvd = NULL; usub_rcvd = NULL;
+      }
+
+      blk_symbfact (A, iam, lvl, 
+		    szSep, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
+		    fstVtx_loc, fstVtx_blk, lstVtx_blk, 
+		    lsub_rcvd, lsub_rcvd_sz, usub_rcvd, usub_rcvd_sz,
+		    Pslu_freeable, Llu_symbfact, VInfo, CS, PS,
+		    marker, p_mark,
+		    p_nextl, p_nextu, p_neltsZr, p_neltsTotal, 
+		    p_nsuper_loc);
+      lsub = Llu_symbfact->lsub;
+      usub = Llu_symbfact->usub; 	 
+      
+      if (lstVtx_blk != lstVtx) {
+	/* if this is not the last block of the level */
+	if (VInfo->filledSep == FILLED_SEPS ||
+	    ( VInfo->filledSep == FILLED_SEP && 
+	      ((lstVtx - lstVtx_blk > VInfo->maxSzBlk * nprocsLvl && nblk_loc > 0) ||
+	       (lstVtx - fstVtx_blkCyc > VInfo->maxSzBlk * nprocsLvl && nblk_loc == 0))))
+	  /* if current separator is dense and this is not the last block, 
+	     then ... */
+	  denseSep = TRUE;
+	else
+	  /* separator dense but not enough uncomputed blocks 
+	     in the separator to take advantage of it */
+	  VInfo->filledSep = FALSE;
+	
+	if (VInfo->filledSep == FILLED_SEPS) {
+	  for (p = fstP; p < lstP; p++)
+	    rcv_intraLvl[p] = maxNmsgsToRcv * VInfo->filledSep + rcv_intraLvl[p];
+	  denseSep_symbfact (0, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
+			     szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc,
+			     p_nextl, p_nextu, p_mark, p_nsuper_loc, marker, ndComm, 
+			     symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS);
+	}
+	else {
+	  /* send blk to next procs and update the rest of my own blocks */
+	  if (lstBlkRcvd) {
+	    mem_error = 
+	      rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz,
+			 usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY,
+			 fstVtx_toUpd, lstVtx, nvtcs_toUpd,
+			 1, p_mark,
+			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
+	    lsub = Llu_symbfact->lsub;
+	    mem_error = 
+	      rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz,
+			 lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY, 
+			 fstVtx_toUpd, lstVtx, nvtcs_toUpd,
+			 0, p_mark,
+			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
+	    usub = Llu_symbfact->usub;
+	  }
+
+	  upd_myD = FALSE;
+	  /* determine processors to which send this block
+	     and copy data to be sent */
+	  for (p = fstP; p < lstP; p++)
+	    snd_intraLvl[p] = FALSE;
+	  nextl = 0; 
+	  nextu = nextl + CS->snd_LintraSz;
+	  
+	  /* allocate enough space to receive data */
+	  if (CS->rcv_bufSz < CS->snd_intraSz) {
+	    PS->maxSzBuf += CS->snd_intraSz - CS->rcv_bufSz;
+	    if (CS->rcv_bufSz != 0)
+	      /* not first time allocate memory */
+	      SUPERLU_FREE (CS->rcv_buf);
+	    CS->rcv_bufSz = CS->snd_intraSz;
+	    if (!(CS->rcv_buf = intMalloc_symbfact (CS->snd_intraSz))) {
+	      ABORT("Malloc fails for rcv_buf[].");
+	    }
+	  }
+
+	  for (vtx = fstVtx_blk, vtx_lid = fstVtx_blk_lid; 
+	       vtx < lstVtx_blk; vtx++, vtx_lid ++) {
+	    toSend = FALSE;
+	    k = xlsub[vtx_lid];
+	    prElt_L = lsub[k];
+	    j = xusub[vtx_lid];
+	    prElt_U = usub[j];
+
+	    if (prElt_L >= lstVtx_blk || prElt_U >= lstVtx_blk) {
+	      if (vtx == lstVtx_blk - 1) {
+		xlsub_end = *p_nextl;
+		xusub_end = *p_nextu;
+	      }
+	      else {
+		xlsub_end = xlsub[vtx_lid + 1];
+		xusub_end = xusub[vtx_lid + 1];
+	      }
+	      if (prElt_L >= lstVtx_blk) {
+		while (lsub[k] <= prElt_L && k < xlsub_end) {
+		  vtx_elt = lsub[k];
+		  if (vtx_elt >= lstVtx_blk && vtx_elt < lstVtx) {
+		    p = OWNER( globToLoc[vtx_elt] );
+		    if (p != iam) {
+		      /* vtx will be send to another processor */
+		      snd_intraLvl[p] = TRUE;
+		      toSend = TRUE;
+		    }
+		    else {
+		      upd_myD = TRUE;
+		    }
+		  }
+		  k++;
+		}
+	      }
+	      if (prElt_U >= lstVtx_blk) {
+		while (usub[j] <= prElt_U && j < xusub_end) {
+		  vtx_elt = usub[j];
+		  if (vtx_elt >= lstVtx_blk && vtx_elt < lstVtx) {
+		    p = OWNER( globToLoc[vtx_elt] );
+		    if (p != iam) {
+		      /* vtx will be send to another processor */
+		      snd_intraLvl[p] = TRUE;
+		      toSend = TRUE;
+		    }
+		    else {
+		      upd_myD = TRUE;
+		    }
+		  }
+		  j ++;
+		}
+	      }
+	      if (toSend) {
+		/* L(:, vtx) and U(vtx, :) will be send to processors */
+		nelts = xusub_end - xusub[vtx_lid];
+		CS->rcv_buf[nextu + DIAG_IND]  = vtx;
+		CS->rcv_buf[nextu + NELTS_IND] = nelts;
+		nextu += 2;
+		for (j = xusub[vtx_lid]; j < xusub_end; j++) {
+		  CS->rcv_buf[nextu] = usub[j]; nextu ++;
+		}
+		
+		nelts = xlsub_end - xlsub[vtx_lid];
+		CS->rcv_buf[nextl + DIAG_IND] = vtx;
+		CS->rcv_buf[nextl + NELTS_IND] = nelts; 
+		nextl += 2;
+		for (j = xlsub[vtx_lid]; j < xlsub_end; j++) {
+		  CS->rcv_buf[nextl] = lsub[j]; nextl ++;
+		}
+	      }
+	    }
+	  }
+	  for (p = fstP; p < lstP; p++) 
+	    if (snd_intraLvl[p])
+	      rcv_intraLvl[p] ++;
+
+	  if (VInfo->filledSep == FILLED_SEP) {
+	    for (p = fstP; p < lstP; p++)
+	      rcv_intraLvl[p] = maxNmsgsToRcv * VInfo->filledSep + 
+		rcv_intraLvl[p];
+	  }
+	  else {
+	    /* send to the owner of the next block info on no of messages */
+	    p = OWNER( globToLoc[lstVtx_blk] );
+	    tag = tag_intraLvl + nblk_loc;
+	    
+	    MPI_Isend (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, p,
+		       tag, (*symb_comm), request);
+#if ( PRNTlevel>=1 )
+	    PS->no_shmSnd ++;
+#endif
+	  }
+
+	  /* there is data to be send */
+	  sz_msg = nextl + nextu - CS->snd_LintraSz;
+	  for (p = fstP; p < lstP; p++) {
+	    if (p != iam && snd_intraLvl[p]) {
+	      MPI_Isend (&sz_msg, 1, mpi_int_t, p,
+			 tag_intraLvl_szMsg, (*symb_comm), &(request[1]));
+	      MPI_Isend (CS->rcv_buf, nextl, mpi_int_t, p,
+			 tag_intraLvl_LData, (*symb_comm), &(request[2]));
+	      MPI_Isend (&(CS->rcv_buf[CS->snd_LintraSz]), 
+			 nextu - CS->snd_LintraSz, mpi_int_t, p,
+			 tag_intraLvl_UData, (*symb_comm), &(request[3]));
+	      MPI_Waitall(3, &(request[1]), &(status[1]));
+#if ( PRNTlevel>=1 )
+	      PS->no_shmSnd ++;
+	      PS->no_msgsSnd += (float) 2;
+	      PS->sz_msgsSnd += (float) sz_msg;
+	      if (PS->maxsz_msgSnd < nextl) PS->maxsz_msgSnd = nextl;
+	      if (PS->maxsz_msgSnd < nextu - CS->snd_LintraSz) 
+		PS->maxsz_msgSnd = nextu - CS->snd_LintraSz;
+#endif
+	    }
+	  }
+	  if (VInfo->filledSep != FILLED_SEP) {
+	    MPI_Wait (request, status);      
+	  }
+
+	  /* update rest of vertices */
+	  if (upd_myD) {
+	    lsub_rcvd_sz = (*p_nextl) - xlsub[fstVtx_blk_lid];
+	    lsub_rcvd    = &(lsub[xlsub[fstVtx_blk_lid]]);
+	    usub_rcvd_sz = (*p_nextu) - xusub[fstVtx_blk_lid];
+	    usub_rcvd    = &(usub[xusub[fstVtx_blk_lid]]);
+	    
+	    mem_error =
+	      rl_update (0, n, iam, lsub_rcvd, lsub_rcvd_sz,
+			 usub_rcvd, usub_rcvd_sz, fstVtx_blk, lstVtx_blk,
+			 EMPTY,
+			 fstVtx_toUpd, lstVtx, nvtcs_toUpd,
+			 1, p_mark,
+			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
+	    lsub = Llu_symbfact->lsub;
+	    lsub_rcvd    = &(lsub[xlsub[fstVtx_blk_lid]]);
+	    mem_error =
+	      rl_update (0, n, iam, usub_rcvd, usub_rcvd_sz,
+			 lsub_rcvd, lsub_rcvd_sz, fstVtx_blk, lstVtx_blk,
+			 EMPTY,
+			 fstVtx_toUpd, lstVtx, nvtcs_toUpd,
+			 0, p_mark,
+			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
+	    usub = Llu_symbfact->usub;
+	  }
+	  if (VInfo->filledSep == FILLED_SEP)
+	    denseSep_symbfact (0, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
+			       szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc,
+			       p_nextl, p_nextu, p_mark, p_nsuper_loc, marker, ndComm, 
+			       symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS);
+	}
+      }
+    }
+    VInfo->curblk_loc += 2;
+    nblk_loc ++;
+  }
+  
+  /* update maxNeltsVtx */
+  VInfo->maxNeltsVtx = maxNeltsVtx_in - lstVtx + fstVtx;
+  
+  /* if current separator dense, then reset value of filledSep */
+  if (VInfo->filledSep == FILLED_SEP)
+    VInfo->filledSep = FALSE;
+}
+
+/*! \brief Release the work storage of the parallel symbolic factorization.
+ *  lsubPr/usubPr and the receive/send buffers are freed only when their
+ *  recorded size is nonzero; the x*Pr, x*_rcvd and cntelt_* arrays only when
+ *  the pointer is not NULL; the remaining arrays are freed unconditionally.
+ *  iam and nprocs are not used.
+ */
+static void
+symbfact_free 
+(
+ int   iam,    /* Input - my processor number */
+ int   nprocs, /* Input - number of processors for the symbolic factorization */
+ Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
+ vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
+ comm_symbfact_t *CS
+ )
+{
+  Llu_symbfact_t *llu = Llu_symbfact;
+
+  /* pruned structures */
+  if (llu->szLsubPr != 0) SUPERLU_FREE( llu->lsubPr );
+  if (llu->szUsubPr != 0) SUPERLU_FREE( llu->usubPr );
+  if (llu->xlsubPr != NULL) SUPERLU_FREE( llu->xlsubPr );
+  if (llu->xusubPr != NULL) SUPERLU_FREE( llu->xusubPr );
+
+  /* auxiliary index and counter arrays */
+  if (llu->xlsub_rcvd != NULL) SUPERLU_FREE( llu->xlsub_rcvd );
+  if (llu->xusub_rcvd != NULL) SUPERLU_FREE( llu->xusub_rcvd );
+  if (llu->cntelt_vtcs != NULL) SUPERLU_FREE( llu->cntelt_vtcs );
+  if (llu->cntelt_vtcsA_lvl != NULL) SUPERLU_FREE( llu->cntelt_vtcsA_lvl );
+
+  /* communication buffers */
+  if (CS->rcv_bufSz != 0) SUPERLU_FREE( CS->rcv_buf );
+  if (CS->snd_bufSz != 0) SUPERLU_FREE( CS->snd_buf );
+
+  /* block boundaries and inter/intra level communication arrays */
+  SUPERLU_FREE( VInfo->begEndBlks_loc );
+  SUPERLU_FREE( CS->rcv_interLvl );
+  SUPERLU_FREE( CS->snd_interLvl );
+  SUPERLU_FREE( CS->ptr_rcvBuf );
+  SUPERLU_FREE( CS->rcv_intraLvl );
+  SUPERLU_FREE( CS->snd_intraLvl );
+  SUPERLU_FREE( CS->snd_interSz );
+  SUPERLU_FREE( CS->snd_LinterSz );
+  SUPERLU_FREE( CS->snd_vtxinter );
+}
+
+/*! \brief Estimate, in bytes, the memory used by the symbolic factorization.
+ *  Raises PS->estimLSz / PS->estimUSz to at least the final sizes of lsub /
+ *  usub and fills symb_mem_usage->for_lu and ->total.  iam, CS are not used. */
+static void
+estimate_memUsage
+(
+ int_t n,  /* Input - order of the matrix */
+ int iam,  /* Input - my processor number */
+ superlu_dist_mem_usage_t *symb_mem_usage, /* Output - for_lu and total are set */
+ float *p_totalMemLU,   /* Output -memory used for symbolic factorization */
+ float *p_overestimMem, /* Output -memory allocated due to the right looking 
+			   overestimation of memory usage */
+ Pslu_freeable_t *Pslu_freeable,   /* Input - global LU data structures */
+ Llu_symbfact_t *Llu_symbfact,  /* Input - local L, U data structures */
+ vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices distribution */
+ comm_symbfact_t *CS,
+ psymbfact_stat_t *PS
+ )
+{
+  int_t nvtcs_loc, lword, nsuper_loc;
+  float lu_mem, overestimMem;
+  
+  nvtcs_loc = VInfo->nvtcs_loc;
+  nsuper_loc = Pslu_freeable->supno_loc[nvtcs_loc];
+  lword     = sizeof(int_t);
+  /* memory for xlsub, xusub, supno_loc, cntelt_vtcs */
+  lu_mem = 4.0 * (float) nvtcs_loc * (float) lword;
+  /* memory for xlsubPr, xusubPr */
+  lu_mem += 2.0 * (float) VInfo->maxNvtcsNds_loc * (float) lword;
+  if (PS->estimLSz < Llu_symbfact->xlsub[nvtcs_loc])
+    PS->estimLSz = Llu_symbfact->xlsub[nvtcs_loc];
+  if (PS->estimUSz < Llu_symbfact->xusub[nvtcs_loc])
+    PS->estimUSz = Llu_symbfact->xusub[nvtcs_loc];
+  lu_mem += (float) PS->estimLSz * lword;
+  lu_mem += (float) PS->estimUSz * lword;
+  lu_mem += (float) PS->maxSzLPr * lword;
+  lu_mem += (float) PS->maxSzUPr * lword;
+  lu_mem += (float) PS->szDnsSep * lword;
+  /* memory for globToLoc, tempArray */
+  lu_mem += (float) 2* (float) n * lword;
+  lu_mem += (float) PS->maxSzBuf * lword;
+  
+  overestimMem  = (float) (PS->estimLSz - Llu_symbfact->xlsub[nvtcs_loc]) * lword;
+  overestimMem += (float) (PS->estimUSz - Llu_symbfact->xusub[nvtcs_loc]) * lword;
+  *p_totalMemLU = lu_mem;  
+  *p_overestimMem = overestimMem;
+  
+  /* convert to float before multiplying so the byte counts cannot overflow int_t */
+  symb_mem_usage->for_lu = (3.0 * (float) nvtcs_loc + 2.0 * (float) nsuper_loc) * (float) lword;
+  symb_mem_usage->for_lu += (float) Llu_symbfact->xlsub[nvtcs_loc] * (float) lword; 
+  symb_mem_usage->for_lu += (float) Llu_symbfact->xusub[nvtcs_loc] * (float) lword;   
+  symb_mem_usage->total = lu_mem;
+}
+
+
+/* Allocate an uninitialized array of n int_t.  Returns NULL when n is 0 (no
+   allocation is attempted); otherwise returns what SUPERLU_MALLOC returns. */
+static int_t *
+intMalloc_symbfact(int_t n)
+{
+  if (n == 0)
+    return NULL;
+
+  return (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
+}
+
+/* Allocate an array of n int_t and set every entry to 0.  Returns NULL when
+   n is 0 or when SUPERLU_MALLOC returns NULL. */
+static int_t *
+intCalloc_symbfact(int_t n)
+{
+  int_t *zeros, left;
+
+  if (n == 0)
+    return NULL;
+  if (!(zeros = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t))))
+    return NULL;
+  for (left = n; left > 0; left--) zeros[left - 1] = 0;
+  return zeros;
+}
+
diff --git a/SRC/psymbfact.h b/SRC/psymbfact.h
new file mode 100644
index 0000000..b65f382
--- /dev/null
+++ b/SRC/psymbfact.h
@@ -0,0 +1,302 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Definitions for parallel symbolic factorization routine
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+
+#ifndef __SUPERLU_DIST_PSYMBFACT /* allow multiple inclusions */
+#define __SUPERLU_DIST_PSYMBFACT
+
+/*
+ * File name:	psymbfact.h
+ * Purpose:     Definitions for parallel symbolic factorization routine
+ */
+
+/*! \brief
+ *
+ * <pre>
+ *-- Structure returned by the symbolic factorization routine
+ *
+ * Memory is allocated during parallel symbolic factorization
+ * symbfact_dist, and freed after dist_symbLU routine.
+ *
+ * (xlsub,lsub): lsub[*] contains the compressed subscript of
+ *	rectangular supernodes; xlsub[j] points to the starting
+ *	location of the j-th column in lsub[*]. Note that xlsub 
+ *	is indexed by column.
+ *	Storage: row subscripts
+ *
+ * (xusub,usub): usub[*] contains the compressed subscript of
+ *	rectangular supernodes; xusub[j] points to the starting
+ *	location of the j-th row in usub[*]. Note that xusub 
+ *	is indexed by rows.
+ *	Storage: column subscripts
+ *
+ * (xsup_beg_loc,xsup_end_loc, supno_loc) describes mapping between 
+ *      supernode and column, information local to each processor:
+ *	xsup_beg_loc[s] is the leading column of the local s-th supernode.
+ *	xsup_end_loc[s] is the last column of the local s-th supernode.
+ *      supno_loc[i] is the supernode no to which column i belongs;
+ * </pre>
+ */
+
+typedef struct {
+  int_t     *xlsub;  /* pointer to the beginning of each column of L */
+  int_t     *lsub;   /* compressed L subscripts, stored by columns */
+  int_t     szLsub;  /* current max size of lsub */
+  
+  int_t     *xusub;  /* pointer to the beginning of each row of U */
+  int_t     *usub;   /* compressed U subscripts, stored by rows */
+  int_t     szUsub;  /* current max size of usub */
+  
+  int_t     *supno_loc;    /* supernode number to which each column belongs */
+  int_t     *xsup_beg_loc; /* leading column of each local supernode */
+  int_t     *xsup_end_loc; /* last column of each local supernode */
+  int_t     nvtcs_loc;       /* number of local vertices */
+  int_t     *globToLoc;      /* global to local indexing */
+  int_t     maxNvtcsPProc;   /* max number of vertices on the processors */
+} Pslu_freeable_t;
+
+
+/*! \brief
+ * 
+ * <pre>
+ *-- The structures are determined by symbfact_dist and not used thereafter.
+ *
+ * (xlsub,lsub): lsub[*] contains the compressed subscript of L, as described above
+ *      for Pslu_freeable_t.  This structure is used internally in symbfact_dist.
+ * (xusub,usub): usub[*] contains the compressed subscript of U, as described above
+ *      for Pslu_freeable_t.  This structure is used internally in symbfact_dist.
+ *
+ * (xlsubPr,lsubPr): contains the pruned structure of the graph of
+ *      L, stored by rows as a linked list.
+ *	xlsubPr[j] points to the starting location of the j-th 
+ *      row in lsubPr[*].
+ *	Storage: original row subscripts.
+ *      It contains the structure corresponding to one node in the sep_tree.
+ *      In each independent domain formed by x vertices, xlsubPr is of size x.
+ *      Allocated and freed during domain_symbolic.
+ *      For the other nodes in the level tree, formed by a maximum of 
+ *      maxNvtcsNds_loc, xlsubPr is of size maxNvtcsNds_loc. 
+ *      Allocated after domain_symbolic, freed at the end of symbolic_dist
+ *      routine.
+ * (xusubPr,usubPr): contains the pruned structure of the graph of
+ *      U, stored by columns as a linked list.  Similar to (xlsubPr,lsubPr),
+ *      except that it is column oriented. 
+ *
+ * This is allocated during symbolic factorization symbfact_dist.
+ * </pre>
+ */
+
+typedef struct { /* local L, U work structures of the symbolic factorization */
+  int_t     *xlsubPr;  /* pointer to pruned structure of L */
+  int_t     *lsubPr;   /* pruned structure of L */
+  int_t     szLsubPr;  /* size of lsubPr array */
+  int_t     indLsubPr; /* current index in lsubPr */
+  int_t     *xusubPr;  /* pointer to pruned structure of U */
+  int_t     *usubPr;   /* pruned structure of U */
+  int_t     szUsubPr;  /* size of usubPr array */
+  int_t     indUsubPr; /* current index in usubPr */
+
+  int_t     *xlsub_rcvd; /* NOTE(review): role not documented here - confirm */
+  int_t     *xlsub;     /* pointer to structure of L, stored by columns */
+  int_t     *lsub;      /* structure of L, stored by columns */
+  int_t     szLsub;     /* current max size of lsub */
+  int_t     nextl;      /* pointer to current computation in lsub */
+  
+  int_t     *xusub_rcvd; /* NOTE(review): role not documented here - confirm */
+  int_t     *xusub;      /* pointer to structure of U, stored by rows */
+  int_t     *usub;       /* structure of U, stored by rows */
+  int_t     szUsub;      /* current max size of usub */
+  int_t     nextu;       /* pointer to current computation in usub */
+  
+  int_t     *cntelt_vtcs; /* size of column/row for each vertex */
+  int_t     *cntelt_vtcsA_lvl; /* size of column/row of A for each vertex at the
+				  current level */
+  
+  LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */
+  int_t  no_expand;    /* Number of memory expansions */
+  int_t  no_expand_pr; /* Number of memory expansions of the pruned structures */
+  int_t  no_expcp;     /* Number of memory expansions due to the right looking 
+			  overestimation approach */
+} Llu_symbfact_t;
+
+/*! \brief Local information on vertices distribution */
+typedef struct {
+  int_t  maxSzBlk;        /* Max no of vertices in a block */
+  int_t  maxNvtcsNds_loc; /* Max number of vertices of a node distributed on one
+			     processor.  The maximum is computed among all the nodes 
+			     of the separator tree and among all the processors */
+  int_t  maxNeltsVtx;     /* Max number of elements of a vertex,
+			     that is considering that the matrix is
+			     dense */
+  int_t  nblks_loc;       /* Number of local blocks */
+  int_t  *begEndBlks_loc; /* Begin and end vertex of each local block.
+			     Array of size 2 * nblks_loc */
+  int_t  curblk_loc;      /* Index of current block in the level under computation */
+  int_t  nvtcs_loc;       /* Number of local vertices distributed on a processor */
+  int_t  nvtcsLvl_loc;    /* Number of local vertices for current
+			     level under computation */
+  int    filledSep;       /* determines if current or all separators are filled */
+  int_t  nnz_asup_loc;    /* Number of nonzeros in asup not yet consumed.  Used during
+			     symbolic factorization routine to determine how much 
+			     of xusub, usub is still used to store the input matrix AS */
+  int_t  nnz_ainf_loc;    /* Number of nonzeros in ainf.  Similar to nnz_asup_loc. */
+  int_t  xusub_nextLvl;   /* Pointer to usub of the next level */
+  int_t  xlsub_nextLvl;   /* Pointer to lsub of the next level */
+  int_t  fstVtx_nextLvl;  /* First vertex of the next level */
+} vtcsInfo_symbfact_t;
+
+/*! \brief Structure used for redistributing A for the symbolic factorization algorithm */
+typedef struct {
+  int_t  *x_ainf;   /* pointers to columns of Ainf */
+  int_t  *ind_ainf; /* column indices of Ainf (NOTE(review): row indices? confirm) */
+  int_t  *x_asup;   /* pointers to rows of Asup */
+  int_t  *ind_asup; /* row indices of Asup (NOTE(review): column indices? confirm) */
+} matrix_symbfact_t;
+
+typedef struct { /* communication bookkeeping; inter level data structures first */
+  int_t  *rcv_interLvl; /* from which processors iam receives data */
+  int_t  *snd_interLvl; /* to which processors iam sends data */
+  int_t  *snd_interSz;  /* size of data to be sent */
+  int_t  *snd_LinterSz; /* size of data in L part to be sent */
+  int_t  *snd_vtxinter; /* first vertex from where to send data */
+
+  /* intra level data structures */
+  int_t  *snd_intraLvl; /* to which processors iam sends data */
+  int_t  snd_intraSz;   /* size of data to send */
+  int_t  snd_LintraSz;  /* size of the L part of the data to send */
+  int_t  *rcv_intraLvl; /* from which processors iam receives data */
+  int_t  *rcv_buf;      /* buffer to receive data */
+  int_t  rcv_bufSz;     /* size of the buffer to receive data */
+  int_t  *snd_buf;      /* buffer to send data */
+  int_t  snd_bufSz;     /* size of the buffer to send data */
+  int_t  *ptr_rcvBuf;   /* pointer to rcv_buf, the buffer to receive data */
+} comm_symbfact_t;
+
+/* relaxation parameters used in the algorithms - for future release */
+/*! \brief statistics collected during parallel symbolic factorization */
+typedef struct {
+  int_t  fill_par;     /* Estimation of fill.  It corresponds to sp_ienv_dist(6) */
+  float  relax_seps;   /* relaxation parameter -not used in this version */
+  float  relax_curSep; /* relaxation parameter -not used in this version */
+  float  relax_gen;    /* relaxation parameter -not used in this version */
+
+  /* number of operations performed during parallel symbolic factorization */
+  float  nops;
+  
+  /* no of dense current separators per proc */
+  int_t nDnsCurSep;
+  /* no of dense separators up per proc */
+  int_t  nDnsUpSeps;
+  
+  float  no_shmSnd;    /* Number of auxiliary messages for send data */
+  float  no_msgsSnd;   /* Number of messages sending data */
+  int_t  maxsz_msgSnd; /* Max size of messages sending data */
+  float  sz_msgsSnd;   /* Average size of messages sending data */
+  float  no_shmRcvd;   /* Number of auxiliary messages for rcvd data */
+  float  no_msgsRcvd;  /* Number of messages receiving data */
+  int_t  maxsz_msgRcvd;/* Max size of messages receiving data */
+  float  sz_msgsRcvd;  /* Average size of messages receiving data */
+  float  no_msgsCol;   /* Number of messages sent for estimating size
+			  of rows/columns, setup information
+			  interLvl_symbfact */
+  int_t  maxsz_msgCol; /* Max size of messages counted in
+			  no_msgsCol */
+  float  sz_msgsCol;   /* Average size of messages counted in no_msgsCol */
+
+  /* statistics on fill-in */
+  float  fill_pelt[6];
+  /* 
+     0 - average fill per elt added during right-looking factorization 
+     1 - max fill per elt added during right-looking factorization 
+     2 - number vertices modified during right-looking factorization 
+     3 - average fill per elt 
+     4 - max fill per elt 
+     5 - number vertices computed in upper levels of separator tree
+  */
+
+  /* Memory usage */
+  int_t  estimLSz; /* size of lsub due to right looking overestimation */
+  int_t  estimUSz; /* size of usub due to right looking overestimation */
+  int_t  maxSzLPr; /* maximum size of pruned L */
+  int_t  maxSzUPr; /* maximum size of pruned U */
+  int_t  maxSzBuf; /* maximum size of the send and receive buffers */
+  int_t  szDnsSep; /* size of memory used when there are dense separators */
+  float  allocMem; /* size of the total memory allocated (in bytes) */
+} psymbfact_stat_t;
+
+/* MACROS */
+
+/* 
+   Macros for computing the owner of a vertex and the local index
+   corresponding to a vertex; both expect a variable maxNvtcsPProc in scope
+*/
+#define OWNER(x)      ((x) / maxNvtcsPProc)
+#define LOCAL_IND(x)  ((x) % maxNvtcsPProc)
+
+/* Macros for computing the available memory in lsub, usub */
+#define MEM_LSUB(Llu, VInfo) (Llu->szLsub - VInfo->nnz_ainf_loc)
+#define MEM_USUB(Llu, VInfo) (Llu->szUsub - VInfo->nnz_asup_loc)
+
+#define tag_interLvl 2
+#define tag_interLvl_LData 0
+#define tag_interLvl_UData 1
+#define tag_intraLvl_szMsg 1000
+#define tag_intraLvl_LData 1001
+#define tag_intraLvl_UData 1002
+/* tag_intraLvl has to be the last tag number: a block count is added to it */
+#define tag_intraLvl 1003
+
+/* 
+ * Index of diagonal element, no of elements preceding each column/row
+ * of L/U sent to another processor 
+ */
+#define DIAG_IND 0
+#define NELTS_IND 1
+#define RCVD_IND 2
+
+#define SUCCES_RET 0  /* successful return from a routine */
+#define ERROR_RET 1   /* error return code from a routine */
+#define FILLED_SEP 2  /* the current separator is dense */
+#define FILLED_SEPS 3 /* all the separators situated on the path from the current 
+			 separator to the root separator are dense */
+
+/* Code for the type of the memory to expand */
+#define USUB_PR 0
+#define LSUB_PR 1
+#define USUB 0
+#define LSUB 1
+
+/* 
+ * Code for the type of computation - right looking (RL_SYMB); left
+ * looking (LL_SYMB); symbolic factorization of an independent domain
+ * (DOMAIN_SYMB); current separator is dense (DNS_CURSEP); all the
+ * separators from the current one to the root of the tree are dense
+ * (DNS_UPSEPS).
+ */
+#define RL_SYMB 0
+#define DOMAIN_SYMB 1
+#define LL_SYMB 2
+#define DNS_UPSEPS 3
+#define DNS_CURSEP 4
+
+
+#endif /* __SUPERLU_DIST_PSYMBFACT */
+
+
+
diff --git a/SRC/psymbfact_util.c b/SRC/psymbfact_util.c
new file mode 100644
index 0000000..40b9c86
--- /dev/null
+++ b/SRC/psymbfact_util.c
@@ -0,0 +1,552 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Utilities for parallel symbolic factorization routine
+ *
+ * <pre>
+ * -- Distributed symbolic factorization auxiliary routine  (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003
+ * INRIA France - January 2004
+ * Laura Grigori
+ *
+ * November 1, 2007
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+#include "psymbfact.h"
+
+static void
+copy_mem_int(int_t howmany, int_t* old, int_t* new)
+{ /* forward, element-wise copy of the first howmany entries of old[] into new[] */
+  int_t pos = 0;
+  while (pos < howmany) { new[pos] = old[pos]; ++pos; }
+}
+
+
+/*! \brief Expand the existing storage to accommodate more fill-ins.
+ *  Returns the new buffer (prev_mem is not freed here), or NULL on failure. */
+/************************************************************************/
+static int_t *expand
+/************************************************************************/
+(
+ int_t prev_len,    /* length used from previous call */
+ int_t min_new_len, /* minimum new length to allocate */
+ int_t *prev_mem,    /* pointer to the previous memory */
+ int_t *p_new_len,      /* length of the new memory allocated */
+ int_t len_tcopy_fbeg,  /* size of the memory to be copied to new store 
+			     starting from the beginning of the memory */
+ int_t len_tcopy_fend,  /* size of the memory to be copied to new store,
+			    starting from the end of the memory */
+ psymbfact_stat_t *PS
+ )
+{
+  float alpha = 2.0;   /* growth factor applied to prev_len */
+  int_t *new_mem;
+  int_t new_len, lword;
+  
+  lword = sizeof(int_t);
+  
+  /* Grow geometrically, but never below the caller's minimum (if given). */
+  new_len = alpha * prev_len;
+  if (min_new_len > 0 && new_len < min_new_len)
+    new_len = min_new_len;
+  
+  new_mem = (void *) SUPERLU_MALLOC(new_len * lword);
+  
+  if (new_mem) {
+    /* Account for the bytes only when the allocation actually succeeded. */
+    PS->allocMem += new_len * lword;
+    if (len_tcopy_fbeg != 0)
+      copy_mem_int(len_tcopy_fbeg, prev_mem, new_mem);
+    if (len_tcopy_fend != 0)  
+      copy_mem_int(len_tcopy_fend, &(prev_mem[prev_len-len_tcopy_fend]), 
+		   &(new_mem[new_len-len_tcopy_fend]));
+  }
+  *p_new_len = new_len;   /* reported even on failure, for the caller's diagnostics */
+  return new_mem;
+  
+} /* EXPAND */
+
+
+/*! \brief
+ *
+ * <pre>
+ * Expand the data structures for L and U during the factorization.
+ * Return value: SUCCES_RET - successful return
+ *               ERROR_RET  - the new buffer could not be allocated
+ * </pre>
+ */
+/************************************************************************/
+int_t psymbfact_LUXpandMem
+/************************************************************************/
+(
+ int_t iam,         /* process id, printed in the diagnostics below */
+ int_t n,           /* total number of columns */
+ int_t vtxXp,       /* current vertex */
+ int_t next,        /* number of elements currently in the factors */
+ int_t min_new_len, /* minimum new length to allocate */
+ int_t mem_type,    /* which type of memory to expand  */
+ int_t rout_type,   /* during which type of factorization */
+ int_t free_prev_mem, /* =1 if prev_mem has to be freed */
+ Pslu_freeable_t *Pslu_freeable,
+ Llu_symbfact_t *Llu_symbfact,  /* modified - global LU data structures */
+ vtcsInfo_symbfact_t *VInfo,
+ psymbfact_stat_t *PS
+ )
+{
+  int_t  *new_mem, *prev_mem, *xsub;
+  /* size of the memory to be copied to new store starting from the 
+     beginning/end of the memory */
+  int_t xsub_nextLvl;  
+  int_t exp, prev_xsub_nextLvl, vtxXp_lid;
+  int_t *globToLoc, maxNvtcsPProc, nvtcs_loc;
+  int_t fstVtx_nextLvl, fstVtx_nextLvl_lid, vtx_lid, i, j;
+  int_t len_tcopy_fbeg, len_tcopy_fend, new_len, prev_len;  
+
+  exp  = 2; /* NOTE(review): assigned but never read in this function */
+  globToLoc = Pslu_freeable->globToLoc;
+  nvtcs_loc = VInfo->nvtcs_loc;
+  maxNvtcsPProc  = Pslu_freeable->maxNvtcsPProc; /* read by the LOCAL_IND macro */
+  fstVtx_nextLvl = VInfo->fstVtx_nextLvl;
+  vtxXp_lid      = LOCAL_IND( globToLoc[vtxXp] );
+  len_tcopy_fbeg = next; /* the first `next` entries are carried over */
+  if (fstVtx_nextLvl == n)
+    fstVtx_nextLvl_lid = nvtcs_loc;
+  else
+    fstVtx_nextLvl_lid = LOCAL_IND( globToLoc[fstVtx_nextLvl] );  
+
+  if ( mem_type == LSUB ) {
+    prev_mem = Llu_symbfact->lsub;
+    prev_len = Llu_symbfact->szLsub;
+    xsub = Llu_symbfact->xlsub;
+    if (rout_type == DOMAIN_SYMB)
+      prev_xsub_nextLvl = xsub[vtxXp_lid+1];
+    else
+      prev_xsub_nextLvl = VInfo->xlsub_nextLvl;
+  } else if ( mem_type == USUB ) {
+    prev_mem = Llu_symbfact->usub;
+    prev_len = Llu_symbfact->szUsub;
+    xsub = Llu_symbfact->xusub;
+    if (rout_type == DOMAIN_SYMB)
+      prev_xsub_nextLvl = xsub[vtxXp_lid+1];
+    else
+      prev_xsub_nextLvl = VInfo->xusub_nextLvl;
+  } /* NOTE(review): no else branch - prev_mem/prev_len/xsub stay unset for any other mem_type */
+  
+  len_tcopy_fend = prev_len - prev_xsub_nextLvl; /* tail to carry over: from prev_xsub_nextLvl to the end */
+  /* if (rout_type == DNS_UPSEPS || rout_type == DNS_CURSEP)  { - bug corrected on Sept 1st, 2013 - */
+  if (rout_type == DNS_UPSEPS) { 
+    fstVtx_nextLvl = n;
+    fstVtx_nextLvl_lid = nvtcs_loc;
+    len_tcopy_fend = 0; /* nothing is copied from the end in this case */
+  }
+#ifdef TEST_SYMB
+  printf ("Pe[" IFMT "] LUXpand mem_t " IFMT " vtxXp " IFMT "\n",
+	  iam, mem_type, vtxXp); 
+#endif
+  new_mem = expand (prev_len, min_new_len, prev_mem,
+		    &new_len, len_tcopy_fbeg, len_tcopy_fend, PS);
+  if ( !new_mem ) {
+    fprintf(stderr, "Pe[" IFMT "] Can't exp MemType " IFMT ": prv_len " IFMT
+	    " min_new " IFMT " new_l " IFMT "\n",
+	   iam, mem_type, prev_len, min_new_len, new_len);
+    return ERROR_RET;
+  }
+  
+  xsub_nextLvl = new_len - len_tcopy_fend; /* start of the carried-over tail inside new_mem */
+  
+  /* reset xsub information pointing to A data */
+  if (fstVtx_nextLvl != n || rout_type == DOMAIN_SYMB) {
+    if (rout_type == DOMAIN_SYMB)
+      vtx_lid = vtxXp_lid + 1;
+    else {
+      vtx_lid = fstVtx_nextLvl_lid +1;
+    }
+    i = xsub_nextLvl + xsub[vtx_lid] - prev_xsub_nextLvl; /* old offset shifted by the tail displacement */
+    for (; vtx_lid < nvtcs_loc; vtx_lid ++) {
+      j = xsub[vtx_lid+1] - xsub[vtx_lid]; /* segment length, read before xsub[vtx_lid] is overwritten */
+      xsub[vtx_lid] = i;
+      i += j;
+    }
+    xsub[vtx_lid] = i; /* closing entry, vtx_lid == nvtcs_loc here */
+  }
+
+  if (free_prev_mem) {
+    SUPERLU_FREE (prev_mem);
+    PS->allocMem -= 0; /* NOTE(review): subtracts 0, so allocMem is not reduced by the freed buffer - confirm intent */
+  }
+  
+  if ( mem_type == LSUB ) {
+    Llu_symbfact->lsub   = new_mem;
+    Llu_symbfact->szLsub = new_len;
+    VInfo->xlsub_nextLvl = xsub_nextLvl;
+  } else if ( mem_type == USUB ) {
+    Llu_symbfact->usub   = new_mem;
+    Llu_symbfact->szUsub = new_len;
+    VInfo->xusub_nextLvl = xsub_nextLvl;
+  }
+  
+  Llu_symbfact->no_expand ++; /* counts successful reallocations */
+  return SUCCES_RET;
+}
+
+/*! \brief
+ *
+ * <pre>
+ * Expand the data structures for L and U during the factorization.
+ * Return value: SUCCES_RET - successful return
+ *               ERROR_RET - error due to a memory allocation failure
+ * </pre>
+ */
+/************************************************************************/
+int_t psymbfact_LUXpand
+/************************************************************************/
+(
+ int_t iam, 
+ int_t n,           /* total number of columns */
+ int_t fstVtxLvl_loc, /* first vertex in the level to update */
+ int_t vtxXp,         /* current vertex */
+ int_t *p_next,        /* number of elements currently in the factors */
+ int_t min_new_len, /* minimum new length to allocate */
+ int_t mem_type,   /* which type of memory to expand  */
+ int_t rout_type,  /* during which type of factorization */
+ int_t free_prev_mem, /* =1 if free prev_mem memory */
+ Pslu_freeable_t *Pslu_freeable, 
+ Llu_symbfact_t *Llu_symbfact,  /* modified - global LU data structures */
+ vtcsInfo_symbfact_t *VInfo,
+ psymbfact_stat_t *PS
+ )
+{
+  int mem_error;
+  int_t  *new_mem, *prev_mem, *xsub, sz_prev_mem;
+  /* size of the memory to be copied to new store starting from the 
+     beginning/end of the memory */
+  int_t exp, prev_xsub_nextLvl, vtxXp_lid, xsub_nextLvl;
+  int_t *globToLoc, nvtcs_loc, maxNvtcsPProc;
+  int_t fstVtx_nextLvl, fstVtx_nextLvl_lid;
+  int_t i, j, k, vtx_lid, len_texp, nelts, nel;
+  int_t fstVtxLvl_loc_lid, prev_len, next;
+  
+  exp  = 2;
+  next = *p_next;
+  globToLoc = Pslu_freeable->globToLoc;
+  nvtcs_loc = VInfo->nvtcs_loc;
+  maxNvtcsPProc  = Pslu_freeable->maxNvtcsPProc; /* read by the LOCAL_IND macro */
+  fstVtx_nextLvl = VInfo->fstVtx_nextLvl;
+  
+  vtxXp_lid = LOCAL_IND( globToLoc[vtxXp] );
+  if (fstVtx_nextLvl == n)
+    fstVtx_nextLvl_lid = VInfo->nvtcs_loc;
+  else
+    fstVtx_nextLvl_lid = LOCAL_IND( globToLoc[fstVtx_nextLvl] );  
+  if (rout_type == RL_SYMB) /* fstVtxLvl_loc_lid is only set, and only read, for RL_SYMB */
+    fstVtxLvl_loc_lid = LOCAL_IND( globToLoc[fstVtxLvl_loc] );
+
+  if ( mem_type == LSUB ) {
+    xsub = Llu_symbfact->xlsub;
+    prev_mem = Llu_symbfact->lsub;
+    prev_xsub_nextLvl = VInfo->xlsub_nextLvl;
+    sz_prev_mem = Llu_symbfact->szLsub;
+  } else if ( mem_type == USUB ) {
+    xsub = Llu_symbfact->xusub;
+    prev_mem = Llu_symbfact->usub;
+    prev_xsub_nextLvl = VInfo->xusub_nextLvl;
+    sz_prev_mem = Llu_symbfact->szUsub;
+  }
+#ifdef TEST_SYMB
+  printf ("Pe[" IFMT "] Expand LU mem_t " IFMT " vtxXp " IFMT "\n", 
+	  iam, mem_type, vtxXp); 
+#endif
+  /* Try to expand the size of xsub in the existing memory */
+  if (rout_type == RL_SYMB) {
+    len_texp = 0;
+    for (vtx_lid = fstVtxLvl_loc_lid; vtx_lid < fstVtx_nextLvl_lid; vtx_lid ++) {
+      nelts = xsub[vtx_lid+1] - xsub[vtx_lid];
+      if (nelts == 0) nelts = 1;
+      nelts = 2 * nelts;
+      if (nelts > Llu_symbfact->cntelt_vtcs[vtx_lid])
+	nelts = Llu_symbfact->cntelt_vtcs[vtx_lid];
+      len_texp += nelts;
+    }
+/*     len_texp = 2 * (xsub[fstVtx_nextLvl_lid] - xsub[fstVtxLvl_loc_lid]); */
+    prev_len = xsub[fstVtxLvl_loc_lid];
+    next = prev_len;
+  }
+  else {
+    nelts = xsub[vtxXp_lid+1] - xsub[vtxXp_lid];
+    if (nelts == 0) nelts = 1;
+    len_texp = xsub[fstVtx_nextLvl_lid] - xsub[vtxXp_lid+1] +
+      4 * nelts;
+    prev_len = xsub[vtxXp_lid];
+  }
+  
+  if (prev_len + len_texp >= prev_xsub_nextLvl) {
+    /* not enough memory */
+    min_new_len = prev_len + len_texp + (sz_prev_mem - prev_xsub_nextLvl);
+    if ((mem_error = 
+	psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, 
+			      mem_type, rout_type, 0, Pslu_freeable, Llu_symbfact,
+			      VInfo, PS)) != SUCCES_RET)
+      return (mem_error);
+    if ( mem_type == LSUB ) 
+      new_mem = Llu_symbfact->lsub;
+    else if ( mem_type == USUB ) 
+      new_mem = Llu_symbfact->usub;
+  }
+  else 
+    new_mem = prev_mem;
+
+  if (mem_type == LSUB && PS->estimLSz < (prev_len + len_texp))
+    PS->estimLSz = prev_len + len_texp;
+  if (mem_type == USUB && PS->estimUSz < (prev_len + len_texp))
+    PS->estimUSz = prev_len + len_texp; /* running maximum, same rule as estimLSz above */
+
+  /* expand the space */
+  if (rout_type == LL_SYMB) {
+    i = xsub[vtxXp_lid] + len_texp;
+    vtx_lid = fstVtx_nextLvl_lid - 1;
+    for (; vtx_lid > vtxXp_lid; vtx_lid --) { /* last vertex first */
+      j = xsub[vtx_lid];  
+      nel = 0;
+      while (j < xsub[vtx_lid+1] && prev_mem[j] != EMPTY) {
+	nel ++; j ++;
+      }
+      j = xsub[vtx_lid] + nel - 1;  
+      k = i - (xsub[vtx_lid+1] - xsub[vtx_lid]) + nel - 1;
+      if (k+1 < i)  new_mem[k+1] = EMPTY; 
+      while (j >= xsub[vtx_lid]) {
+	new_mem[k] = prev_mem[j]; k--; j--;
+      }
+      k = i;
+      i -= (xsub[vtx_lid+1] - xsub[vtx_lid]);
+      xsub[vtx_lid+1] = k;
+    }
+    xsub[vtx_lid+1] = i;
+    k = *p_next;
+    if (k < xsub[vtx_lid+1])
+      new_mem[k] = EMPTY;
+  }
+
+  if (rout_type == RL_SYMB) {
+    *p_next -= xsub[vtxXp_lid]; /* made relative here, rebased after the xsub updates below */
+    i = xsub[fstVtxLvl_loc_lid] + len_texp;
+    vtx_lid = fstVtx_nextLvl_lid - 1;
+    for (; vtx_lid >= fstVtxLvl_loc_lid; vtx_lid --) {
+      nelts = 2 * (xsub[vtx_lid+1] - xsub[vtx_lid]);
+      if (nelts == 0) nelts = 2;
+      if (nelts > Llu_symbfact->cntelt_vtcs[vtx_lid])
+	nelts = Llu_symbfact->cntelt_vtcs[vtx_lid];
+      j = xsub[vtx_lid];  
+      nel = 0;
+      while (j < xsub[vtx_lid+1] && prev_mem[j] != EMPTY) {
+	nel ++; j ++;
+      }
+      j = xsub[vtx_lid] + nel - 1;  
+      k = i - nelts + nel - 1;
+      if (k+1 < i) new_mem[k+1] = EMPTY; 
+      while (j >= xsub[vtx_lid]) {
+	new_mem[k] = prev_mem[j]; k--; j--;
+      }
+      k = i;
+      i -= nelts;
+      xsub[vtx_lid+1] = k;
+    }
+    *p_next += xsub[vtxXp_lid];
+  }  
+
+  /* LUXpandMem was called with free_prev_mem = 0, so the old buffer is released here */
+  if (free_prev_mem && new_mem != prev_mem)
+    SUPERLU_FREE (prev_mem);
+  Llu_symbfact->no_expcp ++;
+  
+  return SUCCES_RET;
+}
+
+/*! \brief
+ *
+ * <pre>
+ * Expand the data structures for L and U during the factorization.
+ * Return value: SUCCES_RET - successful return
+ *               ERROR_RET  - the enlarged buffer could not be allocated
+ * </pre>
+ */
+/************************************************************************/
+int_t psymbfact_LUXpand_RL
+/************************************************************************/
+(
+ int_t iam, 
+ int_t n,           /* total number of columns */
+ int_t vtxXp,       /* current vertex */
+ int_t next,        /* number of elements currently in the factors */
+ int_t len_texp,    /* length to expand */
+ int_t mem_type,    /* which type of memory to expand  */
+ Pslu_freeable_t *Pslu_freeable, 
+ Llu_symbfact_t *Llu_symbfact,  /* modified - global LU data structures */
+ vtcsInfo_symbfact_t *VInfo,
+ psymbfact_stat_t *PS
+ )
+{
+  int_t  *new_mem, *prev_mem, *xsub, mem_error, sz_prev_mem;
+  /* size of the memory to be copied to new store starting from the 
+     beginning/end of the memory */
+  int_t exp, prev_xsub_nextLvl, vtxXp_lid, xsub_nextLvl;
+  int_t *globToLoc, nvtcs_loc, maxNvtcsPProc;
+  int_t fstVtx_nextLvl, fstVtx_nextLvl_lid;
+  int_t i, j, k, vtx_lid, nel;
+  int_t fstVtxLvl_loc_lid, prev_len, min_new_len;
+
+#ifdef TEST_SYMB
+  printf ("Pe[" IFMT "] Expand LU_RL mem_t " IFMT " vtxXp " IFMT "\n", 
+	  iam, mem_type, vtxXp); 
+#endif
+  globToLoc = Pslu_freeable->globToLoc;
+  nvtcs_loc = VInfo->nvtcs_loc;
+  maxNvtcsPProc  = Pslu_freeable->maxNvtcsPProc; /* read by the LOCAL_IND macro */
+  fstVtx_nextLvl = VInfo->fstVtx_nextLvl;
+  
+  vtxXp_lid = LOCAL_IND( globToLoc[vtxXp] );
+  if (fstVtx_nextLvl == n)
+    fstVtx_nextLvl_lid = VInfo->nvtcs_loc;
+  else
+    fstVtx_nextLvl_lid = LOCAL_IND( globToLoc[fstVtx_nextLvl] );  
+
+  if ( mem_type == LSUB ) {
+    xsub = Llu_symbfact->xlsub;
+    prev_mem = Llu_symbfact->lsub;
+    prev_xsub_nextLvl = VInfo->xlsub_nextLvl;
+    sz_prev_mem = Llu_symbfact->szLsub;
+  } else if ( mem_type == USUB ) {
+    xsub = Llu_symbfact->xusub;
+    prev_mem = Llu_symbfact->usub;
+    prev_xsub_nextLvl = VInfo->xusub_nextLvl;
+    sz_prev_mem = Llu_symbfact->szUsub;
+  }
+  else ABORT("Tries to expand nonexisting memory type.\n");
+  
+  /* Try to expand the size of xsub in the existing memory */
+  prev_len = xsub[vtxXp_lid];
+  
+  if (prev_len + len_texp >= prev_xsub_nextLvl) {
+    /* not enough memory */
+    min_new_len = prev_len + len_texp + (sz_prev_mem - prev_xsub_nextLvl);
+    if ((mem_error = 
+	psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, 
+			      mem_type, RL_SYMB, 0, Pslu_freeable, Llu_symbfact,
+			      VInfo, PS)) != SUCCES_RET)
+      return (mem_error);
+    if ( mem_type == LSUB ) 
+      new_mem = Llu_symbfact->lsub;
+    else if ( mem_type == USUB ) 
+      new_mem = Llu_symbfact->usub;
+  }
+  else 
+    new_mem = prev_mem;
+
+  /* expand the space */
+  if (mem_type == LSUB && PS->estimLSz < (prev_len + len_texp))
+    PS->estimLSz = prev_len + len_texp;
+  if (mem_type == USUB && PS->estimUSz < (prev_len + len_texp))
+    PS->estimUSz = prev_len + len_texp; /* running maximum, same rule as estimLSz above */
+
+  i = xsub[vtxXp_lid] + len_texp;
+  vtx_lid = fstVtx_nextLvl_lid - 1;
+  for (; vtx_lid > vtxXp_lid; vtx_lid --) { /* last vertex first */
+    j = xsub[vtx_lid];  
+    nel = 0;
+    while (j < xsub[vtx_lid+1] && prev_mem[j] != EMPTY) {
+      nel ++; j++;
+    }
+    j = xsub[vtx_lid] + nel - 1;  
+    k = i - Llu_symbfact->cntelt_vtcs[vtx_lid] + nel - 1;
+    if (k+1 < i) 
+      new_mem[k+1] = EMPTY; 
+    while (j >= xsub[vtx_lid]) {
+      new_mem[k] = prev_mem[j];
+      k--; j--;
+    }
+    k = i;
+    i -= Llu_symbfact->cntelt_vtcs[vtx_lid];
+    xsub[vtx_lid+1] = k;
+  }
+  xsub[vtx_lid+1] = i;
+  k = next;
+  if (k < xsub[vtx_lid+1])
+    new_mem[k] = EMPTY;
+  
+  if (new_mem != prev_mem) /* LUXpandMem was called with free_prev_mem = 0 */
+    SUPERLU_FREE (prev_mem);
+  Llu_symbfact->no_expcp ++;
+  
+  return SUCCES_RET;
+}
+
+/*! \brief
+ *
+ * <pre>
+ * Expand the data structures for L and U pruned during the factorization.
+ * Return value: SUCCES_RET - successful return
+ *               ERROR_RET - error when run out of space
+ * </pre>
+ */
+/************************************************************************/
+int_t psymbfact_prLUXpand
+/************************************************************************/
+(
+ int_t iam, 
+ int_t min_new_len, /* minimum new length to allocate */ 
+#if 0
+ MemType mem_type,  /* which type of memory to expand  */
+#else /* Sherry */
+ int mem_type,  /* which type of memory to expand  */
+#endif
+ Llu_symbfact_t *Llu_symbfact, /* modified L/U pruned structures */
+ psymbfact_stat_t *PS
+ )
+{
+  int_t *prev_mem, *new_mem;
+  int_t prev_len, new_len, len_tcopy_fbeg;
+  
+  /* Select the pruned buffer to grow; only its first ind?subPr entries are carried over. */
+  if ( mem_type == LSUB_PR ) {
+    prev_len = Llu_symbfact->szLsubPr;
+    prev_mem = Llu_symbfact->lsubPr;
+    len_tcopy_fbeg = Llu_symbfact->indLsubPr;
+  } else if ( mem_type == USUB_PR ) {
+    prev_len = Llu_symbfact->szUsubPr;
+    prev_mem = Llu_symbfact->usubPr;
+    len_tcopy_fbeg = Llu_symbfact->indUsubPr;
+  } else ABORT("Tries to expand nonexisting memory type.\n");
+  
+#ifdef TEST_SYMB
+  printf ("Pe[" IFMT "] Expand prmem prev_len " IFMT " min_new_l " IFMT " len_tfbeg " IFMT "\n", 
+	  iam, prev_len, min_new_len, len_tcopy_fbeg);
+#endif
+  
+  new_mem = expand (prev_len, min_new_len, prev_mem, 
+		    &new_len, len_tcopy_fbeg, 0, PS);
+  
+  if ( !new_mem ) { /* the old buffer is left installed and untouched */
+    fprintf(stderr, "Can't expand MemType %d: \n", mem_type);
+    return (ERROR_RET);
+  }
+  
+  Llu_symbfact->no_expand_pr ++;
+  if ( mem_type == LSUB_PR ) {
+    Llu_symbfact->lsubPr  = new_mem;
+    Llu_symbfact->szLsubPr = new_len;
+  } else if ( mem_type == USUB_PR ) {
+    Llu_symbfact->usubPr  = new_mem;
+    Llu_symbfact->szUsubPr = new_len;
+  } else ABORT("Tries to expand nonexisting memory type.\n");
+  
+  SUPERLU_FREE (prev_mem); /* contents were copied by expand() */
+
+  return SUCCES_RET;
+}
diff --git a/SRC/pxerr_dist.c b/SRC/pxerr_dist.c
new file mode 100644
index 0000000..41fea2f
--- /dev/null
+++ b/SRC/pxerr_dist.c
@@ -0,0 +1,32 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Modified: November 21, 1999
+ *
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+/* pxerbla: prints which parameter number (info) of routine srname had an illegal value */
+void pxerr_dist(char *srname, gridinfo_t *grid, int_t info)
+{
+    printf("{" IFMT "," IFMT "}: On entry to %6s, parameter number " IFMT " had an illegal value\n",
+	   MYROW(grid->iam, grid), MYCOL(grid->iam, grid), srname, info);
+
+}
diff --git a/SRC/pzGetDiagU.c b/SRC/pzGetDiagU.c
new file mode 100644
index 0000000..8bc80a5
--- /dev/null
+++ b/SRC/pzGetDiagU.c
@@ -0,0 +1,120 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file pzGetDiagU.c
+ * \brief Extracts the main diagonal of matrix U 
+ *
+ * <pre>
+ * -- Auxiliary routine in distributed SuperLU (version 5.1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * Xiaoye S. Li
+ * Created:  April 16, 2002
+ * Modified: May 15, 2016
+ * </pre>
+ */
+
+
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * GetDiagU extracts the main diagonal of matrix U of the LU factorization.
+ *  
+ * Arguments
+ * =========
+ *
+ * n        (input) int_t
+ *          Dimension of the matrix.
+ *
+ * LUstruct (input) LUstruct_t*
+ *          The data structures to store the distributed L and U factors.
+ *          see superlu_zdefs.h for its definition.
+ *
+ * grid     (input) gridinfo_t*
+ *          The 2D process mesh. It contains the MPI communicator, the number
+ *          of process rows (NPROW), the number of process columns (NPCOL),
+ *          and my process rank. It is an input argument to all the
+ *          parallel routines.
+ *
+ * diagU    (output) doublecomplex*, dimension (n)
+ *          The main diagonal of matrix U.
+ *          On exit, it is available on all processes.
+ *
+ *
+ * Note
+ * ====
+ *
+ * The diagonal blocks of the L and U matrices are stored in the L
+ * data structures, and are on the diagonal processes of the
+ * 2D process grid.
+ *
+ * This routine is modified from gather_diag_to_all() in pzgstrs_Bglobal.c.
+ * </pre>
+ */
+void pzGetDiagU(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
+                  doublecomplex *diagU)
+{
+
+    int_t *xsup;
+    int iam, knsupc, pkk;
+    int nsupr; /* number of rows in the block L(:,k) (LDA) */
+    int_t i, j, jj, k, lk, lwork, nsupers, p;
+    int_t num_diag_procs, *diag_procs, *diag_len;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    doublecomplex *zblock, *zwork, *lusup;
+
+    iam = grid->iam;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+
+    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
+		   &diag_procs, &diag_len);
+    jj = diag_len[0]; /* jj becomes the largest diag_len[], i.e. the size needed for zwork[] */
+    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
+    if ( !(zwork = doublecomplexMalloc_dist(jj)) ) ABORT("Malloc fails for zwork[]");
+
+    for (p = 0; p < num_diag_procs; ++p) { /* one broadcast round per diagonal process */
+	pkk = diag_procs[p];
+	if ( iam == pkk ) {
+	    /* Copy diagonal into buffer zwork[]. */
+	    lwork = 0;
+	    for (k = p; k < nsupers; k += num_diag_procs) {
+		knsupc = SuperSize( k );
+		lk = LBj( k, grid );
+		nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
+		lusup = Llu->Lnzval_bc_ptr[lk];
+		for (i = 0; i < knsupc; ++i) /* Copy the diagonal. */
+		    zwork[lwork+i] = lusup[i*(nsupr+1)]; /* stride nsupr+1 walks the diagonal of the block */
+		lwork += knsupc;
+	    }
+	    MPI_Bcast( zwork, lwork, SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm );
+	} else {
+	    /* receivers use diag_len[p] as the count; presumably equal to the root's lwork - verify in get_diag_procs() */
+	    MPI_Bcast( zwork, diag_len[p], SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm );
+	}
+
+	/* Scatter zwork[] into global diagU vector. */
+	lwork = 0;
+	for (k = p; k < nsupers; k += num_diag_procs) {
+	    knsupc = SuperSize( k );
+	    zblock = &diagU[FstBlockC( k )];
+	    for (i = 0; i < knsupc; ++i) zblock[i] = zwork[lwork+i];
+	    lwork += knsupc;
+	}
+    } /* for p = ... */
+
+    SUPERLU_FREE(diag_procs);
+    SUPERLU_FREE(diag_len);
+    SUPERLU_FREE(zwork);
+}
diff --git a/SRC/pzdistribute.c b/SRC/pzdistribute.c
new file mode 100644
index 0000000..2bed7bc
--- /dev/null
+++ b/SRC/pzdistribute.c
@@ -0,0 +1,1070 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Re-distribute A on the 2D process mesh.
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * </pre>
+ */
+
+#include "superlu_zdefs.h"
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Re-distribute A on the 2D process mesh.
+ * 
+ * Arguments
+ * =========
+ * 
+ * A      (input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (input) ScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_freeable (input) *Glu_freeable_t
+ *        The global structure describing the graph of L and U.
+ * 
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * colptr (output) int*
+ *
+ * rowind (output) int*
+ *
+ * a      (output) doublecomplex*
+ *
+ * Return value
+ *   Always 0; every allocation failure is routed to ABORT instead.
+ * </pre>
+ */
+int_t
+zReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
+                Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno,
+                gridinfo_t *grid, int_t *colptr[], int_t *rowind[],
+                doublecomplex *a[])
+{
+    NRformat_loc *Astore;
+    int_t  *perm_r; /* row permutation vector */
+    int_t  *perm_c; /* column permutation vector */
+    int_t  i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize;
+    int_t  nnz_loc;    /* number of local nonzeros */
+    int_t  SendCnt; /* number of remote nonzeros to be sent */
+    int_t  RecvCnt; /* number of remote nonzeros to be received */
+    int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
+    int_t  *ia = NULL, *ja = NULL, **ia_send, *index = NULL, *itemp = NULL;
+    int_t  *ptr_to_send;
+    doublecomplex *aij = NULL, **aij_send, *nzval = NULL, *dtemp = NULL;
+    doublecomplex *nzval_a;
+    int    iam, it, p, procs;
+    MPI_Request *send_req;
+    MPI_Status  status;
+    
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    iam = grid->iam;
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter zReDistribute_A()");
+#endif
+    perm_r = ScalePermstruct->perm_r;
+    perm_c = ScalePermstruct->perm_c;
+    procs = grid->nprow * grid->npcol;
+    Astore = (NRformat_loc *) A->Store;
+    n = A->ncol;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    nnzToRecv = intCalloc_dist(2*procs);
+    nnzToSend = nnzToRecv + procs; /* second half of the same allocation */
+
+
+    /* ------------------------------------------------------------
+       COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
+       THEN ALLOCATE SPACE.
+       THIS ACCOUNTS FOR THE FIRST PASS OF A.
+       ------------------------------------------------------------*/
+    for (i = 0; i < m_loc; ++i) {
+        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+  	    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
+	    jcol = Astore->colind[j];
+	    gbi = BlockNum( irow );
+	    gbj = BlockNum( jcol );
+	    p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
+	    ++nnzToSend[p]; 
+	}
+    }
+
+    /* All-to-all communication */
+    MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
+		  grid->comm);
+
+    maxnnzToRecv = 0;
+    nnz_loc = SendCnt = RecvCnt = 0;
+
+    for (p = 0; p < procs; ++p) {
+	if ( p != iam ) {
+	    SendCnt += nnzToSend[p];
+	    RecvCnt += nnzToRecv[p];
+	    maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
+	} else {
+	    nnz_loc += nnzToRecv[p];
+	    /*assert(nnzToSend[p] == nnzToRecv[p]);*/
+	}
+    }
+    k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
+
+    /* Allocate space for storing the triplets after redistribution. */
+    if ( k ) { /* count can be zero. */
+        if ( !(ia = intMalloc_dist(2*k)) )
+            ABORT("Malloc fails for ia[].");
+        if ( !(aij = doublecomplexMalloc_dist(k)) )
+            ABORT("Malloc fails for aij[].");
+        ja = ia + k; /* column indices live in the second half of ia[]; only set when ia[] exists */
+    }
+
+    /* Allocate temporary storage for sending/receiving the A triplets. */
+    if ( procs > 1 ) {
+      if ( !(send_req = (MPI_Request *)
+	     SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) )
+	ABORT("Malloc fails for send_req[].");
+      if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) )
+        ABORT("Malloc fails for ia_send[].");
+      if ( !(aij_send = (doublecomplex **)SUPERLU_MALLOC(procs*sizeof(doublecomplex*))) )
+        ABORT("Malloc fails for aij_send[].");
+      if ( SendCnt ) { /* count can be zero */
+          if ( !(index = intMalloc_dist(2*SendCnt)) )
+              ABORT("Malloc fails for index[].");
+          if ( !(nzval = doublecomplexMalloc_dist(SendCnt)) )
+              ABORT("Malloc fails for nzval[].");
+      }
+      if ( !(ptr_to_send = intCalloc_dist(procs)) )
+        ABORT("Malloc fails for ptr_to_send[].");
+      if ( maxnnzToRecv ) { /* count can be zero */
+          if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) )
+              ABORT("Malloc fails for itemp[].");
+          if ( !(dtemp = doublecomplexMalloc_dist(maxnnzToRecv)) )
+              ABORT("Malloc fails for dtemp[].");
+      }
+
+      for (i = 0, j = 0, p = 0; p < procs; ++p) {
+          if ( p != iam ) {
+	      ia_send[p] = &index[i];
+	      i += 2 * nnzToSend[p]; /* nnzToSend[p] row indices, then nnzToSend[p] column indices */
+	      aij_send[p] = &nzval[j];
+	      j += nnzToSend[p];
+	  }
+      }
+    } /* if procs > 1 */
+      
+    if ( !(*colptr = intCalloc_dist(n+1)) )
+        ABORT("Malloc fails for *colptr[].");
+
+    /* ------------------------------------------------------------
+       LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
+       THIS ACCOUNTS FOR THE SECOND PASS OF A.
+       ------------------------------------------------------------*/
+    nnz_loc = 0; /* Reset the local nonzero count. */
+    nzval_a = Astore->nzval;
+    for (i = 0; i < m_loc; ++i) {
+        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+  	    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
+	    jcol = Astore->colind[j];
+	    gbi = BlockNum( irow );
+	    gbj = BlockNum( jcol );
+	    p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
+
+	    if ( p != iam ) { /* remote */
+	        k = ptr_to_send[p];
+	        ia_send[p][k] = irow;
+	        ia_send[p][k + nnzToSend[p]] = jcol;
+		aij_send[p][k] = nzval_a[j];
+		++ptr_to_send[p]; 
+	    } else {          /* local */
+	        ia[nnz_loc] = irow;
+	        ja[nnz_loc] = jcol;
+		aij[nnz_loc] = nzval_a[j];
+		++nnz_loc;
+		++(*colptr)[jcol]; /* Count nonzeros in each column */
+	    }
+	}
+    }
+
+    /* ------------------------------------------------------------
+       PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
+       NOTE: Can possibly use MPI_Alltoallv.
+       ------------------------------------------------------------*/
+    for (p = 0; p < procs; ++p) {
+        if ( p != iam ) {
+	    it = 2*nnzToSend[p];
+	    MPI_Isend( ia_send[p], it, mpi_int_t,
+		       p, iam, grid->comm, &send_req[p] );
+	    it = nnzToSend[p];
+	    MPI_Isend( aij_send[p], it, SuperLU_MPI_DOUBLE_COMPLEX,
+	               p, iam+procs, grid->comm, &send_req[procs+p] ); 
+	}
+    }
+
+    for (p = 0; p < procs; ++p) {
+        if ( p != iam ) {
+	    it = 2*nnzToRecv[p];
+	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
+	    it = nnzToRecv[p];
+            MPI_Recv( dtemp, it, SuperLU_MPI_DOUBLE_COMPLEX, p, p+procs,
+		      grid->comm, &status );
+	    for (i = 0; i < nnzToRecv[p]; ++i) {
+	        ia[nnz_loc] = itemp[i];
+		jcol = itemp[i + nnzToRecv[p]];
+		/*assert(jcol<n);*/
+	        ja[nnz_loc] = jcol;
+		aij[nnz_loc] = dtemp[i];
+		++nnz_loc;
+		++(*colptr)[jcol]; /* Count nonzeros in each column */ 
+	    }
+	}
+    }
+
+    for (p = 0; p < procs; ++p) {
+        if ( p != iam ) {
+	    MPI_Wait( &send_req[p], &status);
+	    MPI_Wait( &send_req[procs+p], &status);
+	}
+    }
+
+    /* ------------------------------------------------------------
+       DEALLOCATE TEMPORARY STORAGE
+       ------------------------------------------------------------*/
+
+    SUPERLU_FREE(nnzToRecv);
+
+    if ( procs > 1 ) {
+	SUPERLU_FREE(send_req);
+	SUPERLU_FREE(ia_send);
+	SUPERLU_FREE(aij_send);
+	if ( SendCnt ) {
+            SUPERLU_FREE(index);
+            SUPERLU_FREE(nzval);
+        }
+	SUPERLU_FREE(ptr_to_send);
+        if ( maxnnzToRecv ) {
+            SUPERLU_FREE(itemp);
+            SUPERLU_FREE(dtemp);
+        }
+    }
+
+    /* ------------------------------------------------------------
+       CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT.
+       ------------------------------------------------------------*/
+    if ( nnz_loc ) { /* nnz_loc can be zero */
+        if ( !(*rowind = intMalloc_dist(nnz_loc)) )
+            ABORT("Malloc fails for *rowind[].");
+        if ( !(*a = doublecomplexMalloc_dist(nnz_loc)) )
+            ABORT("Malloc fails for *a[].");
+    }
+
+    /* Initialize the array of column pointers */
+    k = 0;
+    jsize = (*colptr)[0];
+    (*colptr)[0] = 0;
+    for (j = 1; j < n; ++j) {
+	k += jsize;
+	jsize = (*colptr)[j];
+	(*colptr)[j] = k;
+    }
+    
+    /* Copy the triplets into the column oriented storage */
+    for (i = 0; i < nnz_loc; ++i) {
+	j = ja[i];
+	k = (*colptr)[j];
+	(*rowind)[k] = ia[i];
+	(*a)[k] = aij[i];
+	++(*colptr)[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column */
+    for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1];
+    (*colptr)[0] = 0;
+
+    if ( nnz_loc ) {
+        SUPERLU_FREE(ia);
+        SUPERLU_FREE(aij);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit zReDistribute_A()");
+#endif
+ 
+    return 0;
+} /* zReDistribute_A */
+
+float
+pzdistribute(fact_t fact, int_t n, SuperMatrix *A,
+	     ScalePermstruct_t *ScalePermstruct,
+	     Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct,
+	     gridinfo_t *grid)
+/*
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Purpose
+ * =======
+ *   Distribute the matrix onto the 2D process mesh.  A is first redistributed
+ *   by zReDistribute_A(); its values are then loaded into the L and U blocks.
+ * 
+ * Arguments
+ * =========
+ * 
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (input) int_t
+ *        Dimension of the matrix.
+ *
+ * A      (input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be:
+ *        Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (input) ScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_freeable (input) Glu_freeable_t*
+ *        The global structure describing the graph of L and U.
+ * 
+ * LUstruct (input/output) LUstruct_t*
+ *        Data structures for L and U factors; LUstruct->Llu is updated.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   Working storage required (in bytes).  It is accumulated only when the
+ *   code is compiled with PRNTlevel>=1; otherwise 0 is returned.
+ */
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, 
+          len, len1, nsupc;
+    int_t ljb;  /* local block column number */
+    int_t nrbl; /* number of L blocks in current block column */
+    int_t nrbu; /* number of U blocks in current block column */
+    int_t gb;   /* global block number; 0 <= gb < nsupers */
+    int_t lb;   /* local block number; 0 <= lb < ceil(NSUPERS/Pr) */
+    int iam, jbrow, kcol, mycol, myrow, pc, pr;
+    int_t mybufmax[NBUFFERS];
+    NRformat_loc *Astore;
+    doublecomplex *a;
+    int_t *asub, *xa;
+    int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
+    int_t *supno = Glu_persist->supno;   
+    int_t *lsub, *xlsub, *usub, *xusub;
+    int_t nsupers;
+    int_t next_lind;      /* next available position in index[*] */
+    int_t next_lval;      /* next available position in nzval[*] */
+    int_t *index;         /* indices consist of headers and row subscripts */
+    int   *index1;        /* temporary pointer to array of int */
+    doublecomplex *lusup, *uval; /* nonzero values in L and U */
+    doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
+    int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+    doublecomplex **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
+    int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
+
+    /*-- Counts to be used in factorization. --*/
+    int  *ToRecv, *ToSendD, **ToSendR;
+
+    /*-- Counts to be used in lower triangular solve. --*/
+    int_t  *fmod;          /* Modification count for L-solve.        */
+    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
+    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
+    int_t  nfsendx = 0;    /* Number of Xk I will send               */
+    int_t  kseen;
+
+    /*-- Counts to be used in upper triangular solve. --*/
+    int_t  *bmod;          /* Modification count for U-solve.        */
+    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
+    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
+    int_t  nbsendx = 0;    /* Number of Xk I will send               */
+    int_t  *ilsum;         /* starting position of each supernode in 
+			      the full array (local)                 */
+
+    /*-- Auxiliary arrays; freed on return --*/
+    int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
+    int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr)             */
+    int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr)      */
+    int_t *Urb_fstnz;  /* # of fstnz in a block row; size ceil(NSUPERS/Pr)  */
+    int_t *Ucbs;       /* number of column blocks in a block row            */
+    int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr)             */
+    int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr)        */
+    int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr)      */
+    int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr)      */
+    doublecomplex *dense, *dense_col; /* SPA */
+    doublecomplex zero = {0.0, 0.0};
+    int_t ldaspa;     /* LDA of SPA */
+    int_t iword, dword;
+    float mem_use = 0.0;
+
+#if ( PRNTlevel>=1 )
+    int_t nLblocks = 0, nUblocks = 0;
+#endif
+#if ( PROFlevel>=1 ) 
+    double t, t_u, t_l;
+    int_t u_blks;
+#endif
+
+    /* Initialization. */
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
+    nsupers  = supno[n-1] + 1; /* supno[] of the last column, plus one */
+    Astore   = (NRformat_loc *) A->Store;
+
+#if ( PRNTlevel>=1 )
+    iword = sizeof(int_t);
+    dword = sizeof(doublecomplex);
+#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pzdistribute()");
+#endif
+#if ( PROFlevel>=1 )
+    t = SuperLU_timer_();
+#endif
+
+    zReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno,
+		      grid, &xa, &asub, &a); /* outputs; freed before return */
+
+#if ( PROFlevel>=1 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf("--------\n"
+		       ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t);
+#endif
+
+    if ( fact == SamePattern_SameRowPerm ) {
+
+#if ( PROFlevel>=1 )
+	t_l = t_u = 0; u_blks = 0;
+#endif
+	/* We can propagate the new values of A into the existing
+	   L and U data structures.            */
+	ilsum = Llu->ilsum;
+	ldaspa = Llu->ldalsum; /* stride between SPA columns (see dense_col) */
+	if ( !(dense = doublecomplexCalloc_dist(ldaspa * sp_ienv_dist(3))) )
+	    ABORT("Calloc fails for SPA dense[].");
+	nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */
+	if ( !(Urb_length = intCalloc_dist(nrbu)) )
+	    ABORT("Calloc fails for Urb_length[].");
+	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
+	    ABORT("Malloc fails for Urb_indptr[].");
+	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	Unzval_br_ptr = Llu->Unzval_br_ptr;
+#if ( PRNTlevel>=1 )
+	mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
+#endif
+#if ( PROFlevel>=1 )
+	t = SuperLU_timer_();
+#endif
+
+	/* Initialize Uval to zero. */
+	for (lb = 0; lb < nrbu; ++lb) {
+	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
+	    index = Ufstnz_br_ptr[lb];
+	    if ( index ) {
+		uval = Unzval_br_ptr[lb];
+		len = index[1]; /* total length of nzval[] (set as index[1] below) */
+		for (i = 0; i < len; ++i) uval[i] = zero;
+	    } /* if index != NULL */
+	} /* for lb ... */
+
+	for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
+	    pc = PCOL( jb, grid );
+	    if ( mycol == pc ) { /* Block column jb in my process column */
+		fsupc = FstBlockC( jb );
+		nsupc = SuperSize( jb );
+
+ 		/* Scatter A into SPA (for L), or into U directly. */
+		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
+		    for (i = xa[j]; i < xa[j+1]; ++i) {
+			irow = asub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+ 			    if ( gb < jb ) { /* in U */
+ 				index = Ufstnz_br_ptr[lb];
+ 				uval = Unzval_br_ptr[lb];
+ 				while (  (k = index[Urb_indptr[lb]]) < jb ) {
+ 				    /* Skip nonzero values in this block */
+ 				    Urb_length[lb] += index[Urb_indptr[lb]+1];
+ 				    /* Move pointer to the next block */
+ 				    Urb_indptr[lb] += UB_DESCRIPTOR
+ 					+ SuperSize( k );
+ 				}
+ 				/*assert(k == jb);*/
+ 				/* start fstnz */
+ 				istart = Urb_indptr[lb] + UB_DESCRIPTOR;
+ 				len = Urb_length[lb];
+ 				fsupc1 = FstBlockC( gb+1 );
+ 				k = j - fsupc;
+ 				/* Sum the lengths of the leading columns */
+ 				for (jj = 0; jj < k; ++jj)
+				    len += fsupc1 - index[istart++];
+				/*assert(irow>=index[istart]);*/
+				uval[len + irow - index[istart]] = a[i];
+			    } else { /* in L; put in SPA first */
+  				irow = ilsum[lb] + irow - FstBlockC( gb );
+  				dense_col[irow] = a[i];
+  			    }
+  			}
+		    } /* for i ... */
+  		    dense_col += ldaspa;
+		} /* for j ... */
+
+#if ( PROFlevel>=1 )
+		t_u += SuperLU_timer_() - t;
+		t = SuperLU_timer_();
+#endif
+
+		/* Gather the values of A from SPA into Lnzval[]. */
+		ljb = LBj( jb, grid ); /* Local block number */
+		index = Lrowind_bc_ptr[ljb];
+		if ( index ) {
+		    nrbl = index[0];   /* Number of row blocks. */
+		    len = index[1];    /* LDA of lusup[]. */
+		    lusup = Lnzval_bc_ptr[ljb];
+		    next_lind = BC_HEADER;
+		    next_lval = 0;
+		    for (jj = 0; jj < nrbl; ++jj) {
+			gb = index[next_lind++];
+			len1 = index[next_lind++]; /* Rows in the block. */
+			lb = LBi( gb, grid );
+			for (bnnz = 0; bnnz < len1; ++bnnz) {
+			    irow = index[next_lind++]; /* Global index. */
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    k = next_lval++;
+			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
+				lusup[k] = dense_col[irow];
+				dense_col[irow] = zero; /* leave the SPA clean */
+				k += len;
+				dense_col += ldaspa;
+			    }
+			} /* for bnnz ... */
+		    } /* for jj ... */
+		} /* if index ... */
+#if ( PROFlevel>=1 )
+		t_l += SuperLU_timer_() - t;
+#endif
+	    } /* if mycol == pc */
+	} /* for jb ... */
+
+	SUPERLU_FREE(dense);
+	SUPERLU_FREE(Urb_length);
+	SUPERLU_FREE(Urb_indptr);
+#if ( PROFlevel>=1 )
+	if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n",
+			   t_l, t_u, u_blks, nrbu); /* NOTE(review): %d with int_t args */
+#endif
+
+    } else {
+        /* ------------------------------------------------------------
+	   FIRST TIME CREATING THE L AND U DATA STRUCTURES.
+	   ------------------------------------------------------------*/
+
+#if ( PROFlevel>=1 )
+	t_l = t_u = 0; u_blks = 0;
+#endif
+	/* We first need to set up the L and U data structures and then
+	 * propagate the values of A into them.
+	 */
+	lsub = Glu_freeable->lsub;    /* compressed L subscripts */
+	xlsub = Glu_freeable->xlsub;
+	usub = Glu_freeable->usub;    /* compressed U subscripts */
+	xusub = Glu_freeable->xusub;
+    
+	if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) )
+	    ABORT("Malloc fails for ToRecv[].");
+	for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+
+	k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
+	if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
+	    ABORT("Malloc fails for ToSendR[].");
+	j = k * grid->npcol;
+	if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) )
+	    ABORT("Malloc fails for index[].");
+#if ( PRNTlevel>=1 )
+	mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword;
+#endif
+	for (i = 0; i < j; ++i) index1[i] = EMPTY;
+	for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j];
+	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+
+	/* Pointers to the beginning of each block row of U. */
+	if ( !(Unzval_br_ptr = 
+              (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
+	    ABORT("Malloc fails for Unzval_br_ptr[].");
+	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
+	
+	if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
+	    ABORT("Malloc fails for ToSendD[].");
+	for (i = 0; i < k; ++i) ToSendD[i] = NO;
+	if ( !(ilsum = intMalloc_dist(k+1)) )
+	    ABORT("Malloc fails for ilsum[].");
+
+	/* Auxiliary arrays used to set up U block data structures.
+	   They are freed on return. */
+	if ( !(rb_marker = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for rb_marker[].");
+	if ( !(Urb_length = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Urb_length[].");
+	if ( !(Urb_indptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Urb_indptr[].");
+	if ( !(Urb_fstnz = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Urb_fstnz[].");
+	if ( !(Ucbs = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Ucbs[].");
+#if ( PRNTlevel>=1 )	
+	mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword;
+#endif
+	/* Compute ldaspa and ilsum[]. */
+	ldaspa = 0;
+	ilsum[0] = 0;
+	for (gb = 0; gb < nsupers; ++gb) {
+	    if ( myrow == PROW( gb, grid ) ) {
+		i = SuperSize( gb );
+		ldaspa += i;
+		lb = LBi( gb, grid );
+		ilsum[lb + 1] = ilsum[lb] + i;
+	    }
+	}
+	
+#if ( PROFlevel>=1 )
+	t = SuperLU_timer_();
+#endif
+	/* ------------------------------------------------------------
+	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
+	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
+	   ------------------------------------------------------------*/
+	
+	/* Loop through each supernode column. */
+	for (jb = 0; jb < nsupers; ++jb) {
+	    pc = PCOL( jb, grid );
+	    fsupc = FstBlockC( jb );
+	    nsupc = SuperSize( jb );
+	    /* Loop through each column in the block. */
+	    for (j = fsupc; j < fsupc + nsupc; ++j) {
+		/* usub[*] contains only "first nonzero" in each segment. */
+		for (i = xusub[j]; i < xusub[j+1]; ++i) {
+		    irow = usub[i]; /* First nonzero of the segment. */
+		    gb = BlockNum( irow );
+		    kcol = PCOL( gb, grid );
+		    ljb = LBj( gb, grid );
+		    if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
+		    pr = PROW( gb, grid );
+		    lb = LBi( gb, grid );
+		    if ( mycol == pc ) {
+			if  ( myrow == pr ) {
+			    ToSendD[lb] = YES;
+			    /* Count nonzeros in entire block row. */
+			    Urb_length[lb] += FstBlockC( gb+1 ) - irow;
+			    if (rb_marker[lb] <= jb) {/* First see the block */
+				rb_marker[lb] = jb + 1;
+				Urb_fstnz[lb] += nsupc;
+				++Ucbs[lb]; /* Number of column blocks
+					       in block row lb. */
+#if ( PRNTlevel>=1 )
+				++nUblocks;
+#endif
+			    }
+			    ToRecv[gb] = 1;
+			} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
+		    }
+		} /* for i ... */
+	    } /* for j ... */
+	} /* for jb ... */
+	
+	/* Set up the initial pointers for each block row in U. */
+	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+	for (lb = 0; lb < nrbu; ++lb) {
+	    len = Urb_length[lb];
+	    rb_marker[lb] = 0; /* Reset block marker. */
+	    if ( len ) {
+		/* Add room for descriptors */
+		len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
+		if ( !(index = intMalloc_dist(len1+1)) )
+		    ABORT("Malloc fails for Uindex[].");
+		Ufstnz_br_ptr[lb] = index;
+		if ( !(Unzval_br_ptr[lb] = doublecomplexMalloc_dist(len)) )
+		    ABORT("Malloc fails for Unzval_br_ptr[*][].");
+		mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
+		mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
+		index[0] = Ucbs[lb]; /* Number of column blocks */
+		index[1] = len;      /* Total length of nzval[] */
+		index[2] = len1;     /* Total length of index[] */
+		index[len1] = -1;    /* End marker */
+	    } else {
+		Ufstnz_br_ptr[lb] = NULL;
+		Unzval_br_ptr[lb] = NULL;
+	    }
+	    Urb_length[lb] = 0; /* Reset block length. */
+	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
+ 	    Urb_fstnz[lb] = BR_HEADER;
+	} /* for lb ... */
+
+	SUPERLU_FREE(Ucbs);
+
+#if ( PROFlevel>=1 )
+	t = SuperLU_timer_() - t;
+	if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t);
+#endif
+#if ( PRNTlevel>=1 )
+        mem_use -= 2.0*k * iword; /* NOTE(review): only Ucbs[] (k words) was freed -- confirm 2.0 */
+#endif
+	/* Auxiliary arrays used to set up L block data structures.
+	   They are freed on return.
+	   k is the number of local row blocks.   */
+	if ( !(Lrb_length = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Lrb_length[].");
+	if ( !(Lrb_number = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_number[].");
+	if ( !(Lrb_indptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_indptr[].");
+	if ( !(Lrb_valptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_valptr[].");
+	if ( !(dense = doublecomplexCalloc_dist(ldaspa * sp_ienv_dist(3))) )
+	    ABORT("Calloc fails for SPA dense[].");
+
+	/* These counts will be used for triangular solves. */
+	if ( !(fmod = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for fmod[].");
+	if ( !(bmod = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for bmod[].");
+
+	/* ------------------------------------------------ */
+#if ( PRNTlevel>=1 )	
+	mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword;
+#endif
+	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+
+	/* Pointers to the beginning of each block column of L. */
+	if ( !(Lnzval_bc_ptr = 
+              (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
+	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
+	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
+	Lrowind_bc_ptr[k-1] = NULL; /* NOTE(review): Lnzval_bc_ptr[k-1] is not cleared -- confirm */
+
+	/* These lists of processes will be used for triangular solves. */
+	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    ABORT("Malloc fails for fsendx_plist[].");
+	len = k * grid->nprow;
+	if ( !(index = intMalloc_dist(len)) )
+	    ABORT("Malloc fails for fsendx_plist[0]");
+	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
+	    fsendx_plist[i] = &index[j];
+	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    ABORT("Malloc fails for bsendx_plist[].");
+	if ( !(index = intMalloc_dist(len)) )
+	    ABORT("Malloc fails for bsendx_plist[0]");
+	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
+	    bsendx_plist[i] = &index[j];
+	/* -------------------------------------------------------------- */
+#if ( PRNTlevel>=1 )
+	mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
+#endif
+
+	/*------------------------------------------------------------
+	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
+	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
+	  ------------------------------------------------------------*/
+
+	for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
+	    pc = PCOL( jb, grid );
+	    if ( mycol == pc ) { /* Block column jb in my process column */
+		fsupc = FstBlockC( jb );
+		nsupc = SuperSize( jb );
+		ljb = LBj( jb, grid ); /* Local block number */
+		
+		/* Scatter A into SPA. */
+		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
+		    for (i = xa[j]; i < xa[j+1]; ++i) {
+			irow = asub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    dense_col[irow] = a[i];
+			}
+		    }
+		    dense_col += ldaspa;
+		} /* for j ... */
+
+		jbrow = PROW( jb, grid );
+
+		/*------------------------------------------------
+		 * SET UP U BLOCKS.
+		 *------------------------------------------------*/
+#if ( PROFlevel>=1 )
+		t = SuperLU_timer_();
+#endif
+		kseen = 0;
+		dense_col = dense;
+		/* Loop through each column in the block column. */
+		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
+		    istart = xusub[j];
+		    /* NOTE: Only the first nonzero index of the segment
+		       is stored in usub[]. */
+		    for (i = istart; i < xusub[j+1]; ++i) {
+			irow = usub[i]; /* First nonzero in the segment. */
+			gb = BlockNum( irow );
+			pr = PROW( gb, grid );
+			if ( pr != jbrow &&
+			     myrow == jbrow &&  /* diag. proc. owning jb */
+			     bsendx_plist[ljb][pr] == EMPTY ) {
+			    bsendx_plist[ljb][pr] = YES;
+			    ++nbsendx;
+                        }
+			if ( myrow == pr ) {
+			    lb = LBi( gb, grid ); /* Local block number */
+			    index = Ufstnz_br_ptr[lb];
+			    uval = Unzval_br_ptr[lb];
+			    fsupc1 = FstBlockC( gb+1 );
+			    if (rb_marker[lb] <= jb) { /* First time see 
+							  the block       */
+				rb_marker[lb] = jb + 1;
+				Urb_indptr[lb] = Urb_fstnz[lb];;
+				index[Urb_indptr[lb]] = jb; /* Descriptor */
+				Urb_indptr[lb] += UB_DESCRIPTOR;
+				/* Record the first location in index[] of the
+				   next block */
+				Urb_fstnz[lb] = Urb_indptr[lb] + nsupc;
+				len = Urb_indptr[lb];/* Start fstnz in index */
+				index[len-1] = 0;
+				for (k = 0; k < nsupc; ++k)
+				    index[len+k] = fsupc1;
+				if ( gb != jb )/* Exclude diagonal block. */
+				    ++bmod[lb];/* Mod. count for back solve */
+				if ( kseen == 0 && myrow != jbrow ) {
+				    ++nbrecvx;
+				    kseen = 1;
+				}
+			    } else { /* Already saw the block */
+				len = Urb_indptr[lb];/* Start fstnz in index */
+			    }
+			    jj = j - fsupc;
+			    index[len+jj] = irow;
+			    /* Load the numerical values */
+			    k = fsupc1 - irow; /* No. of nonzeros in segment */
+			    index[len-1] += k; /* Increment block length in
+						  Descriptor */
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    for (ii = 0; ii < k; ++ii) {
+				uval[Urb_length[lb]++] = dense_col[irow + ii];
+				dense_col[irow + ii] = zero;
+			    }
+			} /* if myrow == pr ... */
+		    } /* for i ... */
+                    dense_col += ldaspa;
+		} /* for j ... */
+
+#if ( PROFlevel>=1 )
+		t_u += SuperLU_timer_() - t;
+		t = SuperLU_timer_();
+#endif		
+		/*------------------------------------------------
+		 * SET UP L BLOCKS.
+		 *------------------------------------------------*/
+
+		/* Count number of blocks and length of each block. */
+		nrbl = 0;
+		len = 0; /* Number of row subscripts I own. */
+		kseen = 0;
+		istart = xlsub[fsupc];
+		for (i = istart; i < xlsub[fsupc+1]; ++i) {
+		    irow = lsub[i];
+		    gb = BlockNum( irow ); /* Global block number */
+		    pr = PROW( gb, grid ); /* Process row owning this block */
+		    if ( pr != jbrow &&
+			 myrow == jbrow &&  /* diag. proc. owning jb */
+			 fsendx_plist[ljb][pr] == EMPTY /* first time */ ) {
+			fsendx_plist[ljb][pr] = YES;
+			++nfsendx;
+                    }
+		    if ( myrow == pr ) {
+			lb = LBi( gb, grid );  /* Local block number */
+			if (rb_marker[lb] <= jb) { /* First see this block */
+			    rb_marker[lb] = jb + 1;
+			    Lrb_length[lb] = 1;
+			    Lrb_number[nrbl++] = gb;
+			    if ( gb != jb ) /* Exclude diagonal block. */
+				++fmod[lb]; /* Mod. count for forward solve */
+			    if ( kseen == 0 && myrow != jbrow ) {
+				++nfrecvx;
+				kseen = 1;
+			    }
+#if ( PRNTlevel>=1 )
+			    ++nLblocks;
+#endif
+			} else {
+			    ++Lrb_length[lb];
+			}
+			++len;
+		    }
+		} /* for i ... */
+
+		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
+		    /* Set up the initial pointers for each block in 
+		       index[] and nzval[]. */
+		    /* Add room for descriptors */
+		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+		    if ( !(index = intMalloc_dist(len1)) ) 
+			ABORT("Malloc fails for index[]");
+		    Lrowind_bc_ptr[ljb] = index;
+		    if (!(Lnzval_bc_ptr[ljb] = 
+                         doublecomplexMalloc_dist(len*nsupc))) {
+			fprintf(stderr, "col block " IFMT " ", jb);
+			ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
+		    }
+		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
+		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
+		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
+		    index[0] = nrbl;  /* Number of row blocks */
+		    index[1] = len;   /* LDA of the nzval[] */
+		    next_lind = BC_HEADER;
+		    next_lval = 0;
+		    for (k = 0; k < nrbl; ++k) {
+			gb = Lrb_number[k];
+			lb = LBi( gb, grid );
+			len = Lrb_length[lb];
+			Lrb_length[lb] = 0;  /* Reset vector of block length */
+			index[next_lind++] = gb; /* Descriptor */
+			index[next_lind++] = len; 
+			Lrb_indptr[lb] = next_lind;
+			Lrb_valptr[lb] = next_lval;
+			next_lind += len;
+			next_lval += len;
+		    }
+		    /* Propagate the compressed row subscripts to Lindex[],
+                       and the initial values of A from SPA into Lnzval[]. */
+		    lusup = Lnzval_bc_ptr[ljb];
+		    len = index[1];  /* LDA of lusup[] */
+		    for (i = istart; i < xlsub[fsupc+1]; ++i) {
+			irow = lsub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+			    k = Lrb_indptr[lb]++; /* Random access a block */
+			    index[k] = irow;
+			    k = Lrb_valptr[lb]++;
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
+				lusup[k] = dense_col[irow];
+				dense_col[irow] = zero;
+				k += len;
+				dense_col += ldaspa;
+			    }
+			}
+		    } /* for i ... */
+		} else {
+		    Lrowind_bc_ptr[ljb] = NULL;
+		    Lnzval_bc_ptr[ljb] = NULL;
+		} /* if nrbl ... */
+#if ( PROFlevel>=1 )
+		t_l += SuperLU_timer_() - t;
+#endif
+	    } /* if mycol == pc */
+
+	} /* for jb ... */
+
+	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+	Llu->Unzval_br_ptr = Unzval_br_ptr;
+	Llu->ToRecv = ToRecv;
+	Llu->ToSendD = ToSendD;
+	Llu->ToSendR = ToSendR;
+	Llu->fmod = fmod;
+	Llu->fsendx_plist = fsendx_plist;
+	Llu->nfrecvx = nfrecvx;
+	Llu->nfsendx = nfsendx;
+	Llu->bmod = bmod;
+	Llu->bsendx_plist = bsendx_plist;
+	Llu->nbrecvx = nbrecvx;
+	Llu->nbsendx = nbsendx;
+	Llu->ilsum = ilsum;
+	Llu->ldalsum = ldaspa;
+	
+#if ( PRNTlevel>=1 )
+	if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
+			   nLblocks, nUblocks);
+#endif
+
+	SUPERLU_FREE(rb_marker);
+	SUPERLU_FREE(Urb_fstnz);
+	SUPERLU_FREE(Urb_length);
+	SUPERLU_FREE(Urb_indptr);
+	SUPERLU_FREE(Lrb_length);
+	SUPERLU_FREE(Lrb_number);
+	SUPERLU_FREE(Lrb_indptr);
+	SUPERLU_FREE(Lrb_valptr);
+	SUPERLU_FREE(dense);
+
+	/* Find the maximum buffer size. */
+	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
+		      MPI_MAX, grid->comm);
+
+	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+	if ( !(Llu->mod_bit = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for mod_bit[].");
+
+#if ( PROFlevel>=1 )
+	if ( !iam ) printf(".. 1st distribute time:\n "
+			   "\tL\t%.2f\n\tU\t%.2f\n"
+			   "\tu_blks %d\tnrbu %d\n--------\n",
+  			   t_l, t_u, u_blks, nrbu); /* NOTE(review): %d with int_t args */
+#endif
+
+    } /* else fact != SamePattern_SameRowPerm */
+
+    if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */
+        SUPERLU_FREE(asub);
+        SUPERLU_FREE(a);
+    }
+    SUPERLU_FREE(xa);
+
+#if ( DEBUGlevel>=1 )
+    /* Memory allocated but not freed:
+       ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
+    CHECK_MALLOC(iam, "Exit pzdistribute()");
+#endif
+    
+    return (mem_use);
+} /* pzdistribute */
diff --git a/SRC/pzgsequ.c b/SRC/pzgsequ.c
new file mode 100644
index 0000000..00bce37
--- /dev/null
+++ b/SRC/pzgsequ.c
@@ -0,0 +1,243 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Computes row and column scalings
+ *
+ * File name:	pzgsequ.c
+ * History:     Modified from LAPACK routine ZGEEQU
+ */
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+
+ <pre>    
+    Purpose   
+    =======   
+
+    PZGSEQU computes row and column scalings intended to equilibrate an   
+    M-by-N sparse matrix A and reduce its condition number. R returns the row
+    scale factors and C the column scale factors, chosen to try to make   
+    the largest element in each row and column of the matrix B with   
+    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
+
+    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
+    number and BIGNUM = largest safe number.  Use of these scaling   
+    factors is not guaranteed to reduce the condition number of A but   
+    works well in practice.   
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+ 
+    Arguments   
+    =========   
+
+    A       (input) SuperMatrix*
+            The matrix of dimension (A->nrow, A->ncol) whose equilibration
+            factors are to be computed. The type of A can be:
+            Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
+	    
+    R       (output) double*, size A->nrow
+            If INFO = 0 or INFO > M, R contains the row scale factors   
+            for A.
+	    
+    C       (output) double*, size A->ncol
+            If INFO = 0,  C contains the column scale factors for A.
+	    
+    ROWCND  (output) double*
+            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
+            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
+            AMAX is neither too large nor too small, it is not worth   
+            scaling by R.
+	    
+    COLCND  (output) double*
+            If INFO = 0, COLCND contains the ratio of the smallest   
+            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
+            worth scaling by C.
+	    
+    AMAX    (output) double*
+            Absolute value of largest matrix element.  If AMAX is very   
+            close to overflow or very close to underflow, the matrix   
+            should be scaled.
+	    
+    INFO    (output) int_t*
+            = 0:  successful exit   
+            < 0:  if INFO = -i, the i-th argument had an illegal value   
+            > 0:  if INFO = i,  and i is   
+                  <= M:  the i-th row of A is exactly zero   
+                  >  M:  the (i-M)-th column of A is exactly zero   
+            A positive INFO is the same on every process of the grid.
+
+    GRID    (input) gridinfo_t*
+            The 2D process mesh.
+    ===================================================================== 
+</pre>
+*/
+
+void
+pzgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd,
+	double *colcnd, double *amax, int_t *info, gridinfo_t *grid)
+{
+    /* Local variables */
+    NRformat_loc *Astore;
+    doublecomplex *Aval;
+    int m_loc;              /* int: sent with MPI_INT in the gather of R */
+    int_t i, j, irow, jcol; /* int_t: j runs over the nonzeros of A */
+    double rcmin, rcmax, bignum, smlnum, tempmax, tempmin;
+    double *loc_max;
+    int *r_sizes, *displs;
+    double *loc_r;
+    int_t  procs, loc_info;
+    
+    /* Test the input parameters. */
+    *info = 0;
+    if ( A->nrow < 0 || A->ncol < 0 ||
+	 A->Stype != SLU_NR_loc || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
+	*info = -1;
+    if (*info != 0) {
+	i = -(*info);
+	pxerr_dist("pzgsequ", grid, i);
+	return;
+    }
+
+    /* Quick return if possible */
+    if ( A->nrow == 0 || A->ncol == 0 ) {
+	*rowcnd = 1.;
+	*colcnd = 1.;
+	*amax = 0.;
+	return;
+    }
+
+    Astore = A->Store;
+    Aval = Astore->nzval;
+    m_loc = Astore->m_loc;
+    
+    /* Get machine constants. */
+    smlnum = dmach_dist("S");
+    bignum = 1. / smlnum;
+
+    /* Compute row scale factors. */
+    for (i = 0; i < A->nrow; ++i) r[i] = 0.;
+
+    /* Find the maximum element in each row. */
+    irow = Astore->fst_row;
+    for (i = 0; i < m_loc; ++i) {
+	for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
+            r[irow] = SUPERLU_MAX( r[irow], slud_z_abs1(&Aval[j]) );
+	++irow;
+    }
+
+    /* Find the maximum and minimum scale factors. */
+    rcmin = bignum;
+    rcmax = 0.;
+    for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) {
+	rcmax = SUPERLU_MAX(rcmax, r[i]);
+	rcmin = SUPERLU_MIN(rcmin, r[i]);
+    }
+  
+    /* Get the global MAX and MIN for R */
+    tempmax = rcmax;
+    tempmin = rcmin;
+    MPI_Allreduce( &tempmax, &rcmax, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+    MPI_Allreduce( &tempmin, &rcmin, 1, MPI_DOUBLE, MPI_MIN, grid->comm);
+
+    *amax = rcmax;
+
+    if (rcmin == 0.) {
+	/* Some row is exactly zero.  Only the locally owned part of r[] has
+	   been filled in so far, so each process looks for its first local
+	   zero row and the minimum over the grid is returned; rcmin is a
+	   global value, hence every process takes this branch. */
+	loc_info = A->nrow + 1;
+	for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i)
+	    if (r[i] == 0.) {
+		loc_info = i + 1;
+		break;
+	    }
+	MPI_Allreduce( &loc_info, info, 1, mpi_int_t, MPI_MIN, grid->comm);
+	return;
+    } else {
+	/* Invert the scale factors. */
+	for (i = 0; i < A->nrow; ++i)
+	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
+	/* Compute ROWCND = min(R(I)) / max(R(I)) */
+	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
+    }
+
+    /* Compute column scale factors */
+    for (j = 0; j < A->ncol; ++j) c[j] = 0.;
+
+    /* Find the maximum element in each column, assuming the row
+       scalings computed above. */
+    irow = Astore->fst_row;
+    for (i = 0; i < m_loc; ++i) {
+        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+	    jcol = Astore->colind[j];
+	    c[jcol] = SUPERLU_MAX( c[jcol], slud_z_abs1(&Aval[j]) * r[irow] );
+	}
+	++irow;
+    }
+
+    /* Find the global maximum for c[j] */
+    if ( !(loc_max = doubleMalloc_dist(A->ncol)))
+      ABORT("Malloc fails for loc_max[].");
+    for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j];
+    MPI_Allreduce(loc_max, c, A->ncol, MPI_DOUBLE, MPI_MAX, grid->comm);
+    SUPERLU_FREE(loc_max);
+
+    /* Find the maximum and minimum scale factors. */
+    rcmin = bignum;
+    rcmax = 0.;
+    for (j = 0; j < A->ncol; ++j) {
+	rcmax = SUPERLU_MAX(rcmax, c[j]);
+	rcmin = SUPERLU_MIN(rcmin, c[j]);
+    }
+
+    if (rcmin == 0.) {
+	/* Record the first zero column as the error code, but do not return
+	   yet: R is documented to be valid when INFO > M, so it still has
+	   to be gathered below. */
+	for (j = 0; j < A->ncol; ++j)
+	    if ( c[j] == 0. ) {
+		*info = A->nrow + j + 1;
+		break;
+	    }
+    } else {
+	/* Invert the scale factors. */
+	for (j = 0; j < A->ncol; ++j)
+	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
+	/* Compute COLCND = min(C(J)) / max(C(J)) */
+	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
+    }
+
+    /* gather R from each process to get the global R.  */
+    procs = grid->nprow * grid->npcol;
+    if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int))))
+      ABORT("Malloc fails for r_sizes[].");
+    displs = r_sizes + procs;
+    if ( !(loc_r = doubleMalloc_dist(m_loc)))
+      ABORT("Malloc fails for loc_r[].");
+    j = Astore->fst_row;
+    for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++];
+
+    /* First gather the size of each piece. */
+    MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm);
+    /* Set up the displacements for allgatherv */
+    displs[0] = 0;
+    for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1];
+
+    /* Now gather the actual data */
+    MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs,
+                MPI_DOUBLE, grid->comm);
+      
+    SUPERLU_FREE(r_sizes);
+    SUPERLU_FREE(loc_r);
+} /* pzgsequ */
diff --git a/SRC/pzgsmv.c b/SRC/pzgsmv.c
new file mode 100644
index 0000000..5a69043
--- /dev/null
+++ b/SRC/pzgsmv.c
@@ -0,0 +1,385 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief  Parallel sparse matrix-vector multiplication
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+void pzgsmv_init /* One-time setup for pzgsmv(): reorders A in place and fills *gsmv_comm. */
+(
+ SuperMatrix *A,       /* Matrix A permuted by columns (input/output).
+			  On exit each row is reordered and colind[] is renumbered.
+			  Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. */
+ int_t *row_to_proc,   /* Input. Mapping between rows and processes; row_to_proc[jcol] != iam marks column jcol as external. */
+ gridinfo_t *grid,     /* Input */
+ pzgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */
+ )
+{
+    NRformat_loc *Astore;
+    int iam, p, procs;
+    int *SendCounts, *RecvCounts;
+    int_t i, j, k, l, m, m_loc, n, fst_row, jcol;
+    int_t TotalIndSend, TotalValSend;
+    int_t *colind, *rowptr;
+    int_t *ind_tosend = NULL, *ind_torecv = NULL; /* stay NULL when the matching total is 0 */
+    int_t *ptr_ind_tosend, *ptr_ind_torecv;
+    int_t *extern_start, *spa, *itemp; /* itemp is never used below */
+    doublecomplex *nzval, *val_tosend = NULL, *val_torecv = NULL, t;
+    MPI_Request *send_req, *recv_req;
+    MPI_Status status;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pzgsmv_init()");
+#endif
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    iam = grid->iam;
+    procs = grid->nprow * grid->npcol;
+    Astore = (NRformat_loc *) A->Store;
+    m = A->nrow; /* assigned but not used below */
+    n = A->ncol;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    colind = Astore->colind;
+    rowptr = Astore->rowptr;
+    nzval = Astore->nzval;
+    if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) )
+        ABORT("Malloc fails for SendCounts[]");
+    /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/
+    RecvCounts = SendCounts + procs; /* second half of the SendCounts allocation */
+    if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) )
+        ABORT("Malloc fails for ptr_ind_tosend[]");
+    ptr_ind_torecv = ptr_ind_tosend + procs + 1; /* second half of the same allocation */
+    if ( !(extern_start = intMalloc_dist(m_loc)) )
+        ABORT("Malloc fails for extern_start[]");
+    for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i];
+
+    /* ------------------------------------------------------------
+       COUNT THE NUMBER OF DISTINCT EXTERNAL X INDICES OWNED BY EACH
+       PROCESS (SendCounts[p]); THESE INDICES ARE LATER SENT TO p.
+       SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE
+       LOCAL PART OF X.
+       THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A.
+       ------------------------------------------------------------*/
+    if ( !(spa = intCalloc_dist(n)) ) /* Pass 1: 0/1 "already counted" flag per global column */
+        ABORT("Malloc fails for spa[]");
+    for (p = 0; p < procs; ++p) SendCounts[p] = 0;
+    for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+        k = extern_start[i]; /* next slot of the local-first prefix of row i */
+        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */
+	    jcol = colind[j];
+            p = row_to_proc[jcol];
+	    if ( p != iam ) { /* External */
+	        if ( spa[jcol] == 0 ) { /* First time see this index */
+		    ++SendCounts[p];
+		    spa[jcol] = 1;
+                }
+	    } else { /* Swap to beginning the part of A corresponding
+			to the local part of X */
+		l = colind[k];
+		t = nzval[k];
+		colind[k] = jcol;
+		nzval[k] = nzval[j];
+		colind[j] = l;
+		nzval[j] = t;
+		++k;
+	    }
+	}
+	extern_start[i] = k; /* position of the first external entry of row i */
+    }
+
+    /* ------------------------------------------------------------
+       LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES.
+       THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A.
+       ------------------------------------------------------------*/
+    /* Build pointers to ind_tosend[] (prefix sums of SendCounts[]). */
+    ptr_ind_tosend[0] = 0;
+    for (p = 0, TotalIndSend = 0; p < procs; ++p) {
+        TotalIndSend += SendCounts[p]; /* Total to send. */
+	ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p];
+    }
+#if 0
+    ptr_ind_tosend[iam] = 0; /* Local part of X */
+#endif
+    if ( TotalIndSend ) {
+        if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) )
+	    ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */
+    }
+
+    /* Build SPA to aid global to local translation; spa[] is reused with a new meaning. */
+    for (i = 0; i < n; ++i) spa[i] = EMPTY;
+    for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */
+        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+	    jcol = colind[j];
+	    if ( spa[jcol] == EMPTY ) { /* First time see this index */
+	        p = row_to_proc[jcol];
+		if ( p == iam ) { /* Local */
+		  /*assert(jcol>=fst_row);*/
+		  spa[jcol] = jcol - fst_row; /* Relative position in local X */
+		} else {          /* External */
+		  ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */
+		  spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */
+		  ++ptr_ind_tosend[p]; /* advanced here; rewound before the MPI_Isend below */
+		}
+	    }
+	}
+    }
+    
+    /* ------------------------------------------------------------
+       TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES.
+       THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A.
+       ------------------------------------------------------------*/
+    for (i = 0; i < m_loc; ++i) {
+        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+	    jcol = colind[j];
+	    colind[j] = spa[jcol]; /* local: offset in x[]; external: slot in ind_tosend[] */
+	}
+    }
+
+    /* ------------------------------------------------------------
+       COMMUNICATE THE EXTERNAL INDICES OF X.
+       ------------------------------------------------------------*/
+    MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT,
+		 grid->comm);
+
+    /* Build pointers to ind_torecv[] (prefix sums of RecvCounts[]). */
+    ptr_ind_torecv[0] = 0;
+    for (p = 0, TotalValSend = 0; p < procs; ++p) {
+        TotalValSend += RecvCounts[p]; /* Total to receive. */
+	ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p];
+    }
+    if ( TotalValSend ) {
+        if ( !(ind_torecv = intMalloc_dist(TotalValSend)) )
+	    ABORT("Malloc fails for ind_torecv[]");
+    }
+
+    if ( !(send_req = (MPI_Request *)
+	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))))
+        ABORT("Malloc fails for recv_req[].");
+    recv_req = send_req + procs; /* one allocation holds both request arrays */
+    for (p = 0; p < procs; ++p) {
+        ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */
+        if ( SendCounts[p] ) {
+	    MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p],
+		      mpi_int_t, p, iam, grid->comm, &send_req[p]);
+	}
+	if ( RecvCounts[p] ) {
+	    MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p],
+		      mpi_int_t, p, p, grid->comm, &recv_req[p]);
+	}
+    }
+    for (p = 0; p < procs; ++p) { /* wait only on requests that were actually posted */
+        if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status);
+	if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status);
+    }
+
+    /* Allocate storage for the X values to be transferred. */
+    if ( TotalIndSend &&
+         !(val_torecv = doublecomplexMalloc_dist(TotalIndSend)) )
+        ABORT("Malloc fails for val_torecv[].");
+    if ( TotalValSend &&
+         !(val_tosend = doublecomplexMalloc_dist(TotalValSend)) )
+        ABORT("Malloc fails for val_tosend[].");
+
+    gsmv_comm->extern_start = extern_start;
+    gsmv_comm->ind_tosend = ind_tosend;
+    gsmv_comm->ind_torecv = ind_torecv;
+    gsmv_comm->ptr_ind_tosend = ptr_ind_tosend;
+    gsmv_comm->ptr_ind_torecv = ptr_ind_torecv;
+    gsmv_comm->SendCounts = SendCounts;
+    gsmv_comm->RecvCounts = RecvCounts;
+    gsmv_comm->val_tosend = val_tosend;
+    gsmv_comm->val_torecv = val_torecv;
+    gsmv_comm->TotalIndSend = TotalIndSend;
+    gsmv_comm->TotalValSend = TotalValSend;
+    
+    SUPERLU_FREE(spa); /* only the scratch arrays are freed; the rest now belongs to gsmv_comm */
+    SUPERLU_FREE(send_req);
+
+#if ( DEBUGlevel>=2 )
+    PrintInt10("pzgsmv_init::rowptr", m_loc+1, rowptr);
+    PrintInt10("pzgsmv_init::extern_start", m_loc, extern_start);
+#endif
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pzgsmv_init()");
+#endif
+
+} /* PZGSMV_INIT */
+
+
+/*
+ * Performs sparse matrix-vector multiplication ax = A*x, or abs(A)*abs(x) (written as doubles into ax[]) when abs != 0.
+ */
+void
+pzgsmv
+(
+ int_t  abs,               /* Input. Do abs(A)*abs(x). */
+ SuperMatrix *A_internal,  /* Input. Matrix A permuted by columns.
+			      The column indices are translated into
+			      the relative positions in the gathered x-vector.
+			      The type of A can be:
+			      Stype = NR_loc; Dtype = SLU_Z; Mtype = GE. */
+ gridinfo_t *grid,         /* Input */
+ pzgsmv_comm_t *gsmv_comm, /* Input. The data structure for communication. */
+ doublecomplex x[],       /* Input. The distributed source vector */
+ doublecomplex ax[]       /* Output. The distributed destination vector */
+)
+{
+    NRformat_loc *Astore;
+    int iam, procs;
+    int_t i, j, p, m, m_loc, n, fst_row, jcol;
+    int_t *colind, *rowptr;
+    int   *SendCounts, *RecvCounts;
+    int_t *ind_tosend, *ind_torecv, *ptr_ind_tosend, *ptr_ind_torecv;
+    int_t *extern_start, TotalValSend;
+    doublecomplex *nzval, *val_tosend, *val_torecv;
+    doublecomplex zero = {0.0, 0.0}, temp;
+    double *ax_abs = (double *) ax; /* abs mode stores m_loc real values in the ax[] buffer */
+    MPI_Request *send_req, *recv_req;
+    MPI_Status status;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pzgsmv()");
+#endif
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    iam = grid->iam;
+    procs = grid->nprow * grid->npcol;
+    Astore = (NRformat_loc *) A_internal->Store;
+    m = A_internal->nrow;
+    n = A_internal->ncol;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    colind = Astore->colind;
+    rowptr = Astore->rowptr;
+    nzval = (doublecomplex *) Astore->nzval;
+    extern_start = gsmv_comm->extern_start;
+    ind_torecv = gsmv_comm->ind_torecv;
+    ptr_ind_tosend = gsmv_comm->ptr_ind_tosend;
+    ptr_ind_torecv = gsmv_comm->ptr_ind_torecv;
+    SendCounts = gsmv_comm->SendCounts;
+    RecvCounts = gsmv_comm->RecvCounts;
+    val_tosend = (doublecomplex *) gsmv_comm->val_tosend;
+    val_torecv = (doublecomplex *) gsmv_comm->val_torecv;
+    TotalValSend = gsmv_comm->TotalValSend;
+
+    /* ------------------------------------------------------------
+       COPY THE X VALUES INTO THE SEND BUFFER.
+       ------------------------------------------------------------*/
+    for (i = 0; i < TotalValSend; ++i) {
+        j = ind_torecv[i] - fst_row; /* Relative index in x[] */
+	val_tosend[i] = x[j];
+    }
+
+    /* ------------------------------------------------------------
+       COMMUNICATE THE X VALUES.
+       ------------------------------------------------------------*/
+    if ( !(send_req = (MPI_Request *)
+	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))))
+        ABORT("Malloc fails for recv_req[].");
+    recv_req = send_req + procs; /* one allocation holds both request arrays */
+    for (p = 0; p < procs; ++p) { /* the count roles are swapped: values go where indices came from */
+        if ( RecvCounts[p] ) {
+	    MPI_Isend(&val_tosend[ptr_ind_torecv[p]], RecvCounts[p],
+                      SuperLU_MPI_DOUBLE_COMPLEX, p, iam,
+                      grid->comm, &send_req[p]);
+	}
+	if ( SendCounts[p] ) {
+	    MPI_Irecv(&val_torecv[ptr_ind_tosend[p]], SendCounts[p],
+                      SuperLU_MPI_DOUBLE_COMPLEX, p, p,
+                      grid->comm, &recv_req[p]);
+	}
+    }
+    
+    /* ------------------------------------------------------------
+       PERFORM THE ACTUAL MULTIPLICATION.
+       ------------------------------------------------------------*/
+    if ( abs ) { /* Perform abs(A)*abs(x) */
+        /* Multiply the local part while the messages are in flight. */
+        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+            ax_abs[i] = 0.0;
+	    for (j = rowptr[i]; j < extern_start[i]; ++j) {
+	        jcol = colind[j];
+		ax_abs[i] += slud_z_abs1(&nzval[j]) * slud_z_abs1(&x[jcol]);
+	    }
+        }
+
+        for (p = 0; p < procs; ++p) {
+            if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status);
+	    if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status);
+        }
+
+        /* Multiply the external part; slud_z_abs1 on x here too, matching the local part. */
+        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+	    for (j = extern_start[i]; j < rowptr[i+1]; ++j) {
+	        jcol = colind[j];
+	        ax_abs[i] += slud_z_abs1(&nzval[j]) * slud_z_abs1(&val_torecv[jcol]);
+	    }
+	}
+    } else {
+        /* Multiply the local part while the messages are in flight. */
+        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+	    ax[i] = zero;
+	    for (j = rowptr[i]; j < extern_start[i]; ++j) {
+	        jcol = colind[j];
+                zz_mult(&temp, &nzval[j], &x[jcol]);
+                z_add(&ax[i], &ax[i], &temp);
+	    }
+        }
+
+        for (p = 0; p < procs; ++p) {
+            if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status);
+	    if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status);
+        }
+
+        /* Multiply the external part; colind[] indexes val_torecv[] here. */
+        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+	    for (j = extern_start[i]; j < rowptr[i+1]; ++j) {
+	        jcol = colind[j];
+                zz_mult(&temp, &nzval[j], &val_torecv[jcol]);
+                z_add(&ax[i], &ax[i], &temp);
+	    }
+	}
+    }
+
+    SUPERLU_FREE(send_req);
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pzgsmv()");
+#endif
+
+} /* PZGSMV */
+
+void pzgsmv_finalize(pzgsmv_comm_t *gsmv_comm)
+{
+    /* Free the buffers held in gsmv_comm.  The four index/value buffers
+       are freed only when non-NULL. */
+    SUPERLU_FREE(gsmv_comm->extern_start);
+    if ( gsmv_comm->ind_tosend != NULL ) SUPERLU_FREE(gsmv_comm->ind_tosend);
+    if ( gsmv_comm->ind_torecv != NULL ) SUPERLU_FREE(gsmv_comm->ind_torecv);
+    SUPERLU_FREE(gsmv_comm->ptr_ind_tosend);
+    SUPERLU_FREE(gsmv_comm->SendCounts);
+    if ( gsmv_comm->val_tosend != NULL ) SUPERLU_FREE(gsmv_comm->val_tosend);
+    if ( gsmv_comm->val_torecv != NULL ) SUPERLU_FREE(gsmv_comm->val_torecv);
+}
+
diff --git a/SRC/pzgsmv_AXglobal.c b/SRC/pzgsmv_AXglobal.c
new file mode 100644
index 0000000..67026a9
--- /dev/null
+++ b/SRC/pzgsmv_AXglobal.c
@@ -0,0 +1,327 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Performs sparse matrix-vector multiplication
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+
+static void zcreate_msr_matrix(SuperMatrix *, int_t [], int_t,
+			      doublecomplex **, int_t **);
+static void zPrintMSRmatrix(int, doublecomplex [], int_t [], gridinfo_t *);
+
+
+int pzgsmv_AXglobal_setup /* Assigns rows to processes and builds this process's MSR copy of its rows of A. Returns 0. */
+(
+ SuperMatrix *A,       /* Matrix A permuted by columns (input).
+			  The type of A can be:
+			  Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. */
+ Glu_persist_t *Glu_persist, /* input */
+ gridinfo_t *grid,     /* input */
+ int_t *m,             /* output: number of rows assigned to this process */
+ int_t *update[],      /* output: global indices of those rows (not set when *m == 0) */
+ doublecomplex *val[],        /* output: MSR values */
+ int_t *bindx[],       /* output: MSR pointers/column indices */
+ int_t *mv_sup_to_proc /* output: mv_sup_to_proc[s] = process assigned supernode s */
+ )
+{
+    int n;
+    int input_option;
+    int N_update;    /* Number of variables updated on this process (output) */
+    int iam = grid->iam;
+    int nprocs = grid->nprow * grid->npcol;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *supno = Glu_persist->supno;
+    int_t nsupers;
+    int i, nsup, p, t1, t2, t3;
+
+
+    /* Initialize the list of global indices.
+     * NOTE: the list of global indices must be in ascending order.
+     */
+    n = A->nrow;
+    input_option = SUPER_LINEAR; /* hard-wired: the SUPER_BLOCK branch below is never taken */
+    nsupers = supno[n-1] + 1;
+
+#if ( DEBUGlevel>=2 )
+    if ( !iam ) {
+	PrintInt10("xsup", supno[n-1]+1, xsup);
+	PrintInt10("supno", n, supno);
+    }
+#endif
+
+    if ( input_option == SUPER_LINEAR ) { /* Block partitioning based on
+					     individual rows.  */
+	/* Figure out mv_sup_to_proc[] on all processes. */
+	for (p = 0; p < nprocs; ++p) {
+	    t1 = n / nprocs;       /* Number of rows */
+	    t2 = n - t1 * nprocs;  /* left-over, which will be assigned
+				      to the first t2 processes.  */
+	    if ( p >= t2 ) t2 += (p * t1); /* Starting row number */
+	    else { /* First t2 processes will get one more row. */
+ 	        ++t1;              /* Number of rows. */
+		t2 = p * t1;       /* Starting row. */
+	    }
+	    /* Make sure the starting and ending rows are at the
+	       supernode boundaries. */
+	    t3 = t2 + t1;      /* Ending row (one past the last row). */
+	    nsup = supno[t2];
+	    if ( t2 > xsup[nsup] ) { /* Round up the starting row. */
+		t1 -= xsup[nsup+1] - t2;
+		t2 = xsup[nsup+1];
+	    }
+	    nsup = supno[t3]; /* NOTE(review): t3 == n for the last process, so this reads supno[n]; confirm supno[] has n+1 entries */
+	    if ( t3 > xsup[nsup] ) /* Round up the ending row. */
+		t1 += xsup[nsup+1] - t3;
+	    t3 = t2 + t1 - 1; /* now the last row (inclusive) */
+	    if ( t1 ) {
+		for (i = supno[t2]; i <= supno[t3]; ++i) {
+		    mv_sup_to_proc[i] = p;
+#if ( DEBUGlevel>=3 )
+		    if ( mv_sup_to_proc[i] == p-1 ) { /* NOTE(review): never true, the entry was just set to p above */
+			fprintf(stderr, 
+				"mv_sup_to_proc conflicts at supno %d\n", i);
+			exit(-1);
+		    }
+#endif
+		}
+	    }
+	    
+	    if ( iam == p ) { /* record this process's own share */
+		N_update = t1;
+		if ( N_update ) {
+		    if ( !(*update = intMalloc_dist(N_update)) )
+			ABORT("Malloc fails for update[]");
+		}
+		for (i = 0; i < N_update; ++i) (*update)[i] = t2 + i; /* contiguous, ascending */
+#if ( DEBUGlevel>=3 )
+		printf("(%2d) N_update = %4d\t"
+		       "supers %4d to %4d\trows %4d to %4d\n",
+		       iam, N_update, supno[t2], supno[t3], t2, t3);
+#endif
+	    }
+	} /* for p ... */
+    } else if ( input_option == SUPER_BLOCK ) { /* Block partitioning based on
+						   individual supernodes.  */
+	/* This may cause bad load balance, because the blocks are usually
+	   small in the beginning and large toward the end.   */
+	t1 = nsupers / nprocs;
+	t2 = nsupers - t1 * nprocs; /* left-over */
+	if ( iam >= t2 ) t2 += (iam * t1);
+	else {
+	    ++t1;          /* Number of blocks. */
+	    t2 = iam * t1; /* Starting block. */
+	}
+	N_update = xsup[t2+t1] - xsup[t2];
+	if ( !(*update = intMalloc_dist(N_update)) )
+	    ABORT("Malloc fails for update[]");
+	for (i = 0; i < N_update; ++i) (*update)[i] = xsup[t2] + i;
+    }
+
+
+    /* Create an MSR matrix in val/bindx to be used by pzgsmv_AXglobal(). */
+    zcreate_msr_matrix(A, *update, N_update, val, bindx);
+
+#if ( DEBUGlevel>=2 )
+    PrintInt10("mv_sup_to_proc", nsupers, mv_sup_to_proc);
+    zPrintMSRmatrix(N_update, *val, *bindx, grid);
+#endif
+
+    *m = N_update;
+    return 0;
+} /* PZGSMV_AXglobal_SETUP */
+
+
+/*! \brief
+ *
+ * <pre>
+ * Create the distributed modified sparse row (MSR) matrix: bindx/val.
+ * For a submatrix of size m-by-n, the MSR arrays are as follows:
+ *    bindx[0]      = m + 1
+ *    bindx[0..m]   = pointer to start of each row
+ *    bindx[ks..ke] = column indices of the off-diagonal nonzeros in row k,
+ *                    where, ks = bindx[k], ke = bindx[k+1]-1
+ *    val[k]        = A(k,k), k < m, diagonal elements (zero if not stored in A)
+ *    val[m]        = not used
+ *    val[ki]       = A(k, bindx[ki]), where ks <= ki <= ke
+ * Both arrays are of length nnz + 1; they are allocated here (nothing is allocated when m == 0).
+ * </pre> 
+*/
+static void zcreate_msr_matrix
+(
+ SuperMatrix *A,       /* Matrix A permuted by columns (input).
+			  The type of A can be:
+			  Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. */
+ int_t update[],       /* input (local) */
+ int_t N_update,       /* input (local) */
+ doublecomplex **val,         /* output */
+ int_t **bindx         /* output */
+)
+{
+    int hi, i, irow, j, k, lo, n, nnz_local, nnz_diag;
+    NCPformat *Astore;
+    doublecomplex *nzval;
+    int_t *rowcnt;
+    doublecomplex zero = {0.0, 0.0};
+    
+    if ( !N_update ) return; /* *val and *bindx are left untouched in this case */
+
+    n = A->ncol;
+    Astore = A->Store;
+    nzval = Astore->nzval;
+
+    /* One pass of original matrix A to count nonzeros of each row. */
+    if ( !(rowcnt = (int_t *) intCalloc_dist(N_update)) )
+	ABORT("Malloc fails for rowcnt[]");
+    lo = update[0]; /* rowcnt[irow - lo] below assumes update[] is the contiguous range lo..hi */
+    hi = update[N_update-1];
+    nnz_local = 0;
+    nnz_diag = 0;
+    for (j = 0; j < n; ++j) {
+	for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) {
+	    irow = Astore->rowind[i];
+	    if ( irow >= lo && irow <= hi ) { /* entry belongs to a local row */
+		if ( irow != j ) /* Exclude diagonal */
+		    ++rowcnt[irow - lo];
+		else ++nnz_diag; /* Count nonzero diagonal entries */
+		++nnz_local;
+	    }
+	}
+    }
+
+    /* Add room for the logical diagonal zeros which are not counted
+       in nnz_local. */
+    nnz_local += (N_update - nnz_diag);
+
+    /* Allocate storage for bindx[] and val[]. */
+    if ( !(*val = (doublecomplex *) doublecomplexMalloc_dist(nnz_local+1)) )
+	ABORT("Malloc fails for val[]");
+    for (i = 0; i < N_update; ++i) (*val)[i] = zero; /* Initialize diagonal */
+    if ( !(*bindx = (int_t *) SUPERLU_MALLOC((nnz_local+1) * sizeof(int_t))) )
+	ABORT("Malloc fails for bindx[]");
+
+    /* Set up row pointers; rowcnt[] is turned into each row's next insertion position. */
+    (*bindx)[0] = N_update + 1;
+    for (j = 1; j <= N_update; ++j) {
+	(*bindx)[j] = (*bindx)[j-1] + rowcnt[j-1];
+	rowcnt[j-1] = (*bindx)[j-1];
+    }
+
+    /* One pass of original matrix A to fill in matrix entries. */
+    for (j = 0; j < n; ++j) {
+	for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) {
+	    irow = Astore->rowind[i];
+	    if ( irow >= lo && irow <= hi ) {
+		if ( irow == j ) /* Diagonal */
+		    (*val)[irow - lo] = nzval[i];
+		else {
+		    irow -= lo; /* local row number */
+		    k = rowcnt[irow];
+		    (*bindx)[k] = j; /* column index stays global */
+		    (*val)[k] = nzval[i];
+		    ++rowcnt[irow];
+		}
+	    }
+	}
+    }
+
+    SUPERLU_FREE(rowcnt);
+}
+
+/*! \brief
+ *
+ * <pre>
+ * Computes the local rows of the product A*X for a matrix held in MSR form.
+ *   - val/bindx hold the m local rows of the distributed MSR matrix A
+ *   - X is global; update[i] is the X index used for row i's diagonal term
+ *   - ax receives one entry per local row (distributed the same way as A)
+ * </pre>
+ */
+int
+pzgsmv_AXglobal(int_t m, int_t update[], doublecomplex val[], int_t bindx[],
+                doublecomplex X[], doublecomplex ax[])
+{
+    int_t row, pos, col;
+    doublecomplex prod;
+    doublecomplex zero = {0.0, 0.0};
+
+    if ( m <= 0 ) return 0; /* no local rows: nothing to compute */
+
+    for (row = 0; row < m; ++row) {
+        ax[row] = zero;
+        /* Off-diagonal entries of this row sit at bindx[row] .. bindx[row+1]-1. */
+        for (pos = bindx[row]; pos < bindx[row+1]; ++pos) {
+            col = bindx[pos];
+            zz_mult(&prod, &val[pos], &X[col]);
+            z_add(&ax[row], &ax[row], &prod);
+        }
+        zz_mult(&prod, &val[row], &X[update[row]]); /* diagonal term, added last */
+        z_add(&ax[row], &ax[row], &prod);
+    }
+    return 0;
+} /* PZGSMV_AXglobal */
+ 
+/*
+ * Computes the local rows of abs(A)*abs(X), taking every absolute value with slud_z_abs1().
+ *   - val/bindx hold the local rows of the distributed MSR matrix A
+ *   - X is global
+ *   - ax (real) receives one entry per local row
+ */
+int
+pzgsmv_AXglobal_abs(int_t m, int_t update[], doublecomplex val[], int_t bindx[],
+	            doublecomplex X[], double ax[])
+{
+    int_t row, pos, col;
+
+    if ( m <= 0 ) return 0; /* no local rows: nothing to compute */
+
+    for (row = 0; row < m; ++row) {
+        ax[row] = 0.0;
+        for (pos = bindx[row]; pos < bindx[row+1]; ++pos) { /* off-diagonal entries */
+            col = bindx[pos];
+            ax[row] += slud_z_abs1(&val[pos]) * slud_z_abs1(&X[col]);
+        }
+        ax[row] += slud_z_abs1(&val[row]) * slud_z_abs1(&X[update[row]]); /* diagonal term, added last */
+    }
+
+    return 0;
+} /* PZGSMV_AXglobal_ABS */
+
+/*
+ * Debug helper: print the local MSR arrays val[] and bindx[].
+ */
+static void zPrintMSRmatrix
+(
+ int m,       /* Number of rows of the submatrix. */
+ doublecomplex val[],
+ int_t bindx[],
+ gridinfo_t *grid
+)
+{
+    int n_entries; /* number of leading entries of val[]/bindx[] to print */
+
+    if ( m == 0 ) return; /* no local rows: nothing to print */
+
+    /* bindx[m] is the end pointer of the last row. */
+    n_entries = bindx[m];
+    printf("(%2d) MSR submatrix has %d rows -->\n", grid->iam, m);
+    PrintDoublecomplex("val", n_entries, val);
+    PrintInt10("bindx", n_entries, bindx);
+}
diff --git a/SRC/pzgsrfs.c b/SRC/pzgsrfs.c
new file mode 100644
index 0000000..8a371d7
--- /dev/null
+++ b/SRC/pzgsrfs.c
@@ -0,0 +1,263 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Improves the computed solution to a system of linear equations and provides error bounds and backward error estimates
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Last modified:
+ * December 31, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief 
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PZGSRFS improves the computed solution to a system of linear   
+ * equations and provides error bounds and backward error estimates
+ * for the solution. 
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, or the scaled A if equilibration was done.
+ *        A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be:
+ *        Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from pzgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_zdefs.h for the definition of 'LUstruct_t'.
+ *
+ * ScalePermstruct (input) ScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input) doublecomplex* (local)
+ *        The m_loc-by-NRHS right-hand side matrix of the possibly
+ *        equilibrated system. That is, B may be overwritten by diag(R)*B.
+ *       
+ * ldb    (input) int (local)
+ *        Leading dimension of matrix B.
+ *
+ * X      (input/output) doublecomplex* (local)
+ *        On entry, the solution matrix Y, as computed by PZGSTRS, of the
+ *            transformed system A1*Y = Pc*Pr*B. where
+ *            A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X.
+ *        On exit, the improved solution matrix Y.
+ *
+ *        In order to obtain the solution X to the original system,
+ *        Y should be permuted by Pc^T, and premultiplied by diag(C)
+ *        if DiagScale = COL or BOTH.
+ *        This must be done after this routine is called.
+ *
+ * ldx    (input) int (local)
+ *        Leading dimension of matrix X.
+ *
+ * nrhs   (input) int
+ *        Number of right-hand sides.
+ *
+ * SOLVEstruct (output) SOLVEstruct_t* (global)
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * berr   (output) double*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution   
+ *         vector X(j) (i.e., the smallest relative change in   
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the refinement steps.
+ *        See util.h for the definition of SuperLUStat_t.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        
+ * Internal Parameters   
+ * ===================   
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.   
+ * </pre>
+ */
+void
+pzgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
+	ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid,
+	doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx, int nrhs, 
+	SOLVEstruct_t *SOLVEstruct,
+	double *berr, SuperLUStat_t *stat, int *info)
+{
+#define ITMAX 20
+    
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    doublecomplex *ax, *R, *dx, *temp, *work, *B_col, *X_col;
+    double *rtemp;
+    int_t count, i, j, lwork, nz;
+    int   iam;
+    double eps, lstres;
+    double s, safmin, safe1, safe2;
+
+    /* Data structures used by matrix-vector multiply routine. */
+    pzgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm;
+    NRformat_loc *Astore;
+    int_t        m_loc, fst_row;
+
+
+    /* Initialization. */
+    Astore = (NRformat_loc *) A->Store;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    iam = grid->iam;
+
+    /* Test the input parameters; -info is the position of the bad argument in the list above. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
+	      || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
+	*info = -2;
+    else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -8;
+    else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -10;
+    else if ( nrhs < 0 ) *info = -11;
+    if (*info != 0) {
+	i = -(*info);
+	pxerr_dist("PZGSRFS", grid, i);
+	return;
+    }
+
+    /* Quick return if possible. */
+    if ( n == 0 || nrhs == 0 ) {
+	return;
+    }
+
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pzgsrfs()");
+#endif
+
+    lwork = 2 * m_loc;  /* For ax/R/dx and temp */
+    if ( !(work = doublecomplexMalloc_dist(lwork)) )
+	ABORT("Malloc fails for work[]");
+    ax = R = dx = work; /* the three vectors share storage and are used one after another */
+    temp = ax + m_loc;
+    rtemp = (double *) temp; /* real view of temp[], filled by pzgsmv() in abs mode */
+
+    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
+    nz     = A->ncol + 1;
+    eps    = dmach_dist("Epsilon");
+    safmin = dmach_dist("Safe minimum");
+
+    /* Set SAFE1 essentially to be the underflow threshold times the
+       number of additions in each row. */
+    safe1  = nz * safmin;
+    safe2  = safe1 / eps;
+
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
+		       eps, anorm, safe1, safe2);
+#endif
+
+    /* Do for each right-hand side ... */
+    for (j = 0; j < nrhs; ++j) {
+	count = 0;
+	lstres = 3.;
+	B_col = &B[j*ldb];
+	X_col = &X[j*ldx];
+
+	while (1) { /* Loop until stopping criterion is satisfied. */
+
+	    /* Compute residual R = B - A * X for the current column. */
+
+
+	    /* Matrix-vector multiply. */
+	    pzgsmv(0, A, grid, gsmv_comm, X_col, ax);
+	    
+	    /* Compute residual, stored in R[] (overwrites ax[], same storage). */
+	    for (i = 0; i < m_loc; ++i) z_sub(&R[i], &B_col[i], &ax[i]);
+
+	    /* Compute abs(A)*abs(X) + abs(B), stored in temp[] as doubles. */
+	    pzgsmv(1, A, grid, gsmv_comm, X_col, temp);
+            /* NOTE: rtemp is aliased to temp */
+	    for (i = 0; i < m_loc; ++i) rtemp[i] += slud_z_abs1(&B_col[i]);
+	    
+	    s = 0.0; /* local maximum of the componentwise ratio */
+	    for (i = 0; i < m_loc; ++i) {
+		if ( rtemp[i] > safe2 ) {
+		    s = SUPERLU_MAX(s, slud_z_abs1(&R[i]) / rtemp[i]);
+		} else if ( rtemp[i] != 0.0 ) {
+		    s = SUPERLU_MAX(s, (safe1 + slud_z_abs1(&R[i])) / rtemp[i]);
+                }
+                /* If temp[i] is exactly 0.0 (computed by PxGSMV), then
+                   we know the true residual also must be exactly 0.0. */
+	    }
+	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
+		
+#if ( PRNTlevel>= 1 )
+	    if ( !iam )
+		printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]);
+#endif
+	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { /* refine only while berr at least halves */
+		/* Compute new dx (the residual in R[] is overwritten, same storage). */
+		pzgstrs(n, LUstruct, ScalePermstruct, grid,
+			dx, m_loc, fst_row, m_loc, 1, 
+			SOLVEstruct, stat, info);
+
+		/* Update solution. */
+		for (i = 0; i < m_loc; ++i)
+                    z_add(&X_col[i], &X_col[i], &dx[i]);
+
+		lstres = berr[j];
+		++count;
+	    } else {
+		break;
+	    }
+	} /* end while */
+
+	stat->RefineSteps = count; /* overwritten per column: ends up holding the last column's count */
+
+    } /* for j ... */
+
+    /* Deallocate storage. */
+    SUPERLU_FREE(work);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pzgsrfs()");
+#endif
+
+} /* PZGSRFS */
+
diff --git a/SRC/pzgsrfs_ABXglobal.c b/SRC/pzgsrfs_ABXglobal.c
new file mode 100644
index 0000000..36c0dc5
--- /dev/null
+++ b/SRC/pzgsrfs_ABXglobal.c
@@ -0,0 +1,470 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Improves the computed solution and provides error bounds
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Last modified:
+ * December 31, 2015  version 4.3
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*-- Function prototypes --*/
+static void gather_1rhs_diag_to_all(int_t, doublecomplex [], Glu_persist_t *,
+                                    LocalLU_t *, gridinfo_t *, int_t, int_t [],
+				    int_t [], doublecomplex [], doublecomplex []);
+static void redist_all_to_diag(int_t, doublecomplex [], Glu_persist_t *,
+                               LocalLU_t *, gridinfo_t *, int_t [], doublecomplex []);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * pzgsrfs_ABXglobal improves the computed solution to a system of linear   
+ * equations and provides error bounds and backward error estimates
+ * for the solution. 
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, or the scaled A if equilibration was done.
+ *        A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc
+ *        are permutation matrices. The type of A can be:
+ *        Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE.
+ *
+ *        NOTE: Currently, A must reside in all processes when calling
+ *              this routine.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from pzgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_zdefs.h for the definition of 'LUstruct_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input) doublecomplex* (global)
+ *        The N-by-NRHS right-hand side matrix of the possibly equilibrated
+ *        and row permuted system.
+ *       
+ *        NOTE: Currently, B must reside on all processes when calling
+ *              this routine.
+ *
+ * ldb    (input) int (global)
+ *        Leading dimension of matrix B.
+ *
+ * X      (input/output) doublecomplex* (global)
+ *        On entry, the solution matrix X, as computed by PZGSTRS.
+ *        On exit, the improved solution matrix X.
+ *        If DiagScale = COL or BOTH, X should be premultiplied by diag(C)
+ *        in order to obtain the solution to the original system.
+ *
+ *        NOTE: Currently, X must reside on all processes when calling
+ *              this routine.
+ *
+ * ldx    (input) int (global)
+ *        Leading dimension of matrix X.
+ *
+ * nrhs   (input) int
+ *        Number of right-hand sides.
+ *
+ * berr   (output) double*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution   
+ *         vector X(j) (i.e., the smallest relative change in   
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the refinement steps.
+ *        See util.h for the definition of SuperLUStat_t.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        
+ * Internal Parameters   
+ * ===================   
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.   
+ * </pre>
+ */
+
+void
+pzgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
+		  gridinfo_t *grid, doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx,
+		  int nrhs, double *berr, SuperLUStat_t *stat, int *info)
+{
+    /* Iterative refinement of X, one right-hand side at a time: compute the residual, */
+    /* reduce the backward error into berr[j], and apply a correction until the stopping test fails. */
+#define ITMAX 20 /* Upper bound on the number of refinement steps per right-hand side. */
+    
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    /* 
+     * Data structures used by matrix-vector multiply routine.
+     */
+    int_t  N_update; /* Number of variables updated on this process */
+    int_t  *update;  /* vector elements (global index) updated 
+			on this processor.                     */
+    int_t  *bindx;
+    doublecomplex *val;
+    int_t *mv_sup_to_proc;  /* Supernode to process mapping in
+			       matrix-vector multiply.  */
+    /*-- end data structures for matrix-vector multiply --*/
+    doublecomplex *b, *ax, *R, *B_col, *temp, *work, *X_col,
+           *x_trs, *dx_trs;
+    double *rwork;
+    int_t notran; /* Set to 1 below; not read anywhere else in this routine. */
+    int_t count, ii, j, jj, k, knsupc, lk, lwork,
+          nprow, nsupers, nz, p;
+    int   i, iam, pkk;
+    int_t *ilsum, *xsup;
+    double eps, lstres;
+    double s, safmin, safe1, safe2;
+
+    /* Bookkeeping for the diagonal processes (filled in by get_diag_procs). */
+    int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */
+    int_t *diag_len; /* Length of the X vector on diagonal processes. */
+
+    /*-- Function prototypes --*/
+    extern void pzgstrs1(int_t, LUstruct_t *, gridinfo_t *,
+			 doublecomplex *, int, SuperLUStat_t *, int *);
+    
+    /* Test the input parameters. NOTE(review): -10/-12/-13 do not match the positions of ldb/ldx/nrhs in this signature. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( A->nrow != A->ncol || A->nrow < 0 ||
+	      A->Stype != SLU_NCP || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
+	*info = -2;
+    else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10; /* ldb is the 7th parameter */
+    else if ( ldx < SUPERLU_MAX(0, n) )	*info = -12; /* ldx is the 9th parameter */
+    else if ( nrhs < 0 ) *info = -13; /* nrhs is the 10th parameter */
+    if (*info != 0) {
+	i = -(*info);
+	pxerr_dist("pzgsrfs_ABXglobal", grid, i);
+	return;
+    }
+
+    /* Quick return if possible. */
+    if ( n == 0 || nrhs == 0 ) {
+	return;
+    }
+
+    /* Initialization. */
+    iam = grid->iam;
+    nprow = grid->nprow;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    ilsum = Llu->ilsum;
+    notran = 1;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pzgsrfs_ABXglobal()");
+#endif
+
+    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
+		   &diag_procs, &diag_len);
+#if ( PRNTlevel>=1 )
+    if ( !iam ) {
+	printf(".. number of diag processes = " IFMT "\n", num_diag_procs);
+	PrintInt10("diag_procs", num_diag_procs, diag_procs);
+	PrintInt10("diag_len", num_diag_procs, diag_len);
+    }
+#endif
+
+    if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) )
+	ABORT("Calloc fails for mv_sup_to_proc[]");
+
+    pzgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update,
+		          &val, &bindx, mv_sup_to_proc);
+
+    i = CEILING( nsupers, nprow ); /* Number of local block rows */
+    ii = Llu->ldalsum + i * XK_H; /* Size reserved for each of dx_trs and x_trs. */
+    k = SUPERLU_MAX(N_update, sp_ienv_dist(3)); /* Size reserved for b. */
+    jj = diag_len[0];
+    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
+    jj = SUPERLU_MAX( jj, N_update ); /* Size reserved for temp: max over diag_len[] and N_update. */
+    lwork = N_update         /* For ax and R (they share storage) */
+	  + ii               /* For dx_trs */
+	  + ii               /* For x_trs */
+          + k                /* For b */
+	  + jj;              /* for temp */
+    if ( !(work = doublecomplexMalloc_dist(lwork)) )
+	ABORT("Malloc fails for work[]");
+    ax = R = work; /* R aliases ax: the residual is written over A*x. */
+    dx_trs = work + N_update;
+    x_trs  = dx_trs + ii;
+    b      = x_trs + ii;
+    temp   = b + k;
+    if ( !(rwork = SUPERLU_MALLOC(N_update * sizeof(double))) ) /* Real workspace for the backward-error denominator. */
+	ABORT("Malloc fails for rwork[]");
+
+#if ( DEBUGlevel>=2 )
+    {
+	doublecomplex *dwork = doublecomplexMalloc_dist(n);
+	for (i = 0; i < n; ++i) {
+	    if ( i & 1 ) dwork[i].r = 1.;
+	    else dwork[i].r = 2.;
+	    dwork[i].i = 0.;
+        }
+	/* Check correctness of matrix-vector multiply. */
+	pzgsmv_AXglobal(N_update, update, val, bindx, dwork, ax);
+	PrintDoublecomplex("Mult A*x", N_update, ax);
+	SUPERLU_FREE(dwork);
+    }
+#endif
+
+
+    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
+    nz     = A->ncol + 1;
+    eps    = dmach_dist("Epsilon");
+    safmin = dmach_dist("Safe minimum");
+
+    /* Set SAFE1 essentially to be the underflow threshold times the
+       number of additions in each row. */
+    safe1  = nz * safmin;
+    safe2  = safe1 / eps;
+
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
+		       eps, anorm, safe1, safe2);
+#endif
+
+    /* Do for each right-hand side ... */
+    for (j = 0; j < nrhs; ++j) {
+	count = 0;
+	lstres = 3.; /* Stands in for the previous berr; the loop continues only while berr*2 <= lstres. */
+
+	/* Copy X into x_trs on the diagonal processes. */
+	B_col = &B[j*ldb];
+	X_col = &X[j*ldx];
+	for (p = 0; p < num_diag_procs; ++p) {
+	    pkk = diag_procs[p];
+	    if ( iam == pkk ) {
+		for (k = p; k < nsupers; k += num_diag_procs) {
+		    knsupc = SuperSize( k );
+		    lk = LBi( k, grid );
+		    ii = ilsum[lk] + (lk+1)*XK_H;
+		    jj = FstBlockC( k );
+		    for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj];
+		    dx_trs[ii-XK_H].r = k;/* Block number prepended in header. */
+		}
+	    }
+	}
+	/* Copy B into b distributed the same way as matrix-vector product. */
+        if ( N_update ) ii = update[0]; /* Assumes update[] holds consecutive global indices -- TODO confirm. */
+	for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii];
+
+	while (1) { /* Loop until stopping criterion is satisfied. */
+
+	    /* Compute residual R = B - op(A) * X.
+	       NOTE(review): no TRANS argument exists here; presumably op(A) = A. */
+
+	    /* Matrix-vector multiply. */
+	    pzgsmv_AXglobal(N_update, update, val, bindx, X_col, ax);
+	    
+	    /* Compute residual, stored in R[] (which overwrites ax[]). */
+	    for (i = 0; i < N_update; ++i) z_sub(&R[i], &b[i], &ax[i]);
+
+	    /* Compute abs(op(A))*abs(X) + abs(B), stored in rwork[]. */
+	    pzgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, rwork);
+	    for (i = 0; i < N_update; ++i) rwork[i] += slud_z_abs1(&b[i]);
+	    
+	    s = 0.0;
+	    for (i = 0; i < N_update; ++i) {
+		if ( rwork[i] > safe2 ) {
+		    s = SUPERLU_MAX(s, slud_z_abs1(&R[i]) / rwork[i]);
+		} else if ( rwork[i] != 0.0 ) {
+		    s = SUPERLU_MAX(s, (safe1 + slud_z_abs1(&R[i])) / rwork[i]);
+                }
+                /* If rwork[i] is exactly 0.0 (computed by pzgsmv_AXglobal_abs), then
+                   we know the true residual also must be exactly 0.0. */
+	    }
+	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); /* Maximum of s over grid->comm. */
+		
+#if ( PRNTlevel>= 1 )
+	    if ( !iam )
+		printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]);
+#endif
+	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
+		/* Compute new dx: redistribute R into dx_trs, then run pzgstrs1 on it. */
+		redist_all_to_diag(n, R, Glu_persist, Llu, grid,
+				   mv_sup_to_proc, dx_trs);
+		pzgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info); /* NOTE(review): *info is not examined afterwards. */
+
+		/* Update solution. */
+		for (p = 0; p < num_diag_procs; ++p) 
+		    if ( iam == diag_procs[p] )
+			for (k = p; k < nsupers; k += num_diag_procs) {
+			    lk = LBi( k, grid );
+			    ii = ilsum[lk] + (lk+1)*XK_H;
+			    knsupc = SuperSize( k );
+			    for (i = 0; i < knsupc; ++i)
+				z_add(&x_trs[i + ii], &x_trs[i + ii], 
+				      &dx_trs[i + ii]);
+			}
+		lstres = berr[j];
+		++count;
+		/* Transfer x_trs (on diagonal processes) into X
+		   (on all processes). */
+		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, 
+					num_diag_procs, diag_procs, diag_len,
+					X_col, temp);
+	    } else {
+		break;
+	    }
+	} /* end while */
+
+	stat->RefineSteps = count; /* Overwritten for every j, so only the last right-hand side's count remains. */
+
+    } /* for j ... */
+
+
+    /* Deallocate storage used by matrix-vector multiplication. */
+    SUPERLU_FREE(diag_procs);
+    SUPERLU_FREE(diag_len);
+    if ( N_update ) { /* Presumably the setup routine allocates these only when N_update != 0 -- TODO confirm. */
+	SUPERLU_FREE(update);
+	SUPERLU_FREE(bindx);
+	SUPERLU_FREE(val);
+    }
+    SUPERLU_FREE(mv_sup_to_proc);
+    SUPERLU_FREE(work);
+    SUPERLU_FREE(rwork);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pzgsrfs_ABXglobal()");
+#endif
+
+} /* PZGSRFS_ABXGLOBAL */
+
+
+/*! \brief Move the residual r[] from the matrix-vector layout into work[] on the diagonal processes.
+ *
+ * <pre>
+ * r[] is the residual vector distributed the same way as
+ * matrix-vector product; block k travels from mv_sup_to_proc[k] to pkk.
+ * </pre>
+ */
+static void
+redist_all_to_diag(int_t n, doublecomplex r[], Glu_persist_t *Glu_persist,
+		   LocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[],
+		   doublecomplex work[])
+{
+    int_t i, ii, k, lk, lr, nsupers;
+    int_t *ilsum, *xsup;
+    int iam, knsupc, psrc, pkk;
+    MPI_Status status;
+    
+    iam = grid->iam;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    ilsum = Llu->ilsum;
+    lr = 0; /* Running offset into r[] of the next block held by this process. */
+
+    for (k = 0; k < nsupers; ++k) {
+	pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid ); /* Destination rank; presumably the owner of diagonal block k. */
+	psrc = mv_sup_to_proc[k]; /* Source rank: holds block k of r[]. */
+	knsupc = SuperSize( k );
+	lk = LBi( k, grid );
+	ii = ilsum[lk] + (lk+1)*XK_H; /* Offset of block k inside work[]. */
+	if ( iam == psrc ) {
+	    if ( iam != pkk ) { /* Send this block of r[] to pkk. */
+		MPI_Send( &r[lr], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, pkk, Xk,
+			 grid->comm );
+	    } else { /* Local copy. */
+		for (i = 0; i < knsupc; ++i)
+		    work[i + ii] = r[i + lr];
+	    }
+	    lr += knsupc;
+	} else {
+	    if ( iam == pkk ) { /* Recv block k of the residual into work[]. */
+		MPI_Recv( &work[ii], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, psrc, Xk,
+			 grid->comm, &status );
+	    }
+	}
+    }
+} /* REDIST_ALL_TO_DIAG */
+
+
+/*! \brief Broadcast x from each diagonal process in turn and assemble the global vector y.
+ *
+ * <pre>
+ * Gather the components of x vector on the diagonal processes
+ * onto all processes, and combine them into the global vector y.
+ * </pre>
+ */
+static void
+gather_1rhs_diag_to_all(int_t n, doublecomplex x[],
+			Glu_persist_t *Glu_persist, LocalLU_t *Llu,
+			gridinfo_t *grid, int_t num_diag_procs,
+			int_t diag_procs[], int_t diag_len[],
+			doublecomplex y[], doublecomplex work[])
+{
+    int_t i, ii, k, lk, lwork, nsupers, p;
+    int_t *ilsum, *xsup;
+    int iam, knsupc, pkk;
+    
+    iam = grid->iam;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    ilsum = Llu->ilsum;
+
+    for (p = 0; p < num_diag_procs; ++p) {
+	pkk = diag_procs[p]; /* Root of this round's broadcast. */
+	if ( iam == pkk ) {
+	    /* Copy x vector into a buffer. */
+	    lwork = 0; /* Number of entries packed into work[] so far. */
+	    for (k = p; k < nsupers; k += num_diag_procs) {
+		knsupc = SuperSize( k );
+		lk = LBi( k, grid );
+		ii = ilsum[lk] + (lk+1)*XK_H;
+		for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii];
+		lwork += knsupc;
+	    }
+	    MPI_Bcast( work, lwork, SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm );
+	} else { /* Receive side; assumes diag_len[p] equals the root's lwork -- TODO confirm. */
+	    MPI_Bcast( work, diag_len[p], SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm );
+	}
+	/* Scatter work[] into global y vector (done on every process, root included). */
+	lwork = 0;
+	for (k = p; k < nsupers; k += num_diag_procs) {
+	    knsupc = SuperSize( k );
+	    ii = FstBlockC( k ); /* Start position of block k in y[]. */
+	    for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork];
+	    lwork += knsupc;
+	}
+    }
+} /* GATHER_1RHS_DIAG_TO_ALL */
+
diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c
new file mode 100644
index 0000000..288e6eb
--- /dev/null
+++ b/SRC/pzgssvx.c
@@ -0,0 +1,1464 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Solves a system of linear equations A*X=B
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.1.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * October 22, 2012
+ * October  1, 2014
+ * April 5, 2015
+ * December 31, 2015  version 4.3
+ * December 31, 2016  version 5.1.3
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PZGSSVX solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ * 
+ * where, fst_row is the row number of the first row,
+ *        m_loc is the number of rows local to this processor
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using 
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously 
+ *      solved problem to save time by reusing part or all of 
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *         
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described 
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where 
+ *              Pc is the column permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *     
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply 
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *         
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply 
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the column scalings of the previous matrix, 
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous 
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *         
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous 
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm, 
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply 
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *         
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *         
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or 
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector 
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or 
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity	pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C,
+ *             both row and column permutation vectors perm_r and perm_c, and the
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or 
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *           
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *         
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input/output) SuperMatrix* (local)
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PZGSTRF can factorize rectangular matrices.
+ *         On exit, A may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) ScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was 
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the 
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (double*) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double*) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by 
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *         
+ * B       (input/output) doublecomplex* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_zdefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (LocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_zdefs.h for the definition of 'LocalLU_t'.
+ *
+ * SOLVEstruct (input/output) SOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_zdefs.h for the definition of 'SOLVEstruct_t'.
+ *
+ * berr    (output) double*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution   
+ *         vector X(j) (i.e., the smallest relative change in   
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_zdefs.h for the definitions of various data types.
+ * </pre>
+ */
+
+void
+pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, 
+	ScalePermstruct_t *ScalePermstruct,
+	doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid,
+	LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr,
+	SuperLUStat_t *stat, int *info)
+{
+    NRformat_loc *Astore;
+    SuperMatrix GA;      /* Global A in NC format */
+    NCformat *GAstore;
+    doublecomplex   *a_GA;
+    SuperMatrix GAC;      /* Global A in NCP format (add n end pointers) */
+    NCPformat *GACstore;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    Glu_freeable_t *Glu_freeable;
+            /* The nonzero structures of L and U factors, which are
+	       replicated on all processes.
+	           (lsub, xlsub) contains the compressed subscript of
+		                 supernodes in L.
+          	   (usub, xusub) contains the compressed subscript of
+		                 nonzero segments in U.
+	      If options->Fact != SamePattern_SameRowPerm, they are 
+	      computed by SYMBFACT routine, and then used by PZDISTRIBUTE
+	      routine. They will be freed after PZDISTRIBUTE routine.
+	      If options->Fact == SamePattern_SameRowPerm, these
+	      structures are not used.                                  */
+    fact_t   Fact;
+    doublecomplex   *a;
+    int_t    *colptr, *rowind;
+    int_t    *perm_r; /* row permutations from partial pivoting */
+    int_t    *perm_c; /* column permutation vector */
+    int_t    *etree;  /* elimination tree */
+    int_t    *rowptr, *colind;  /* Local A in NR*/
+    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
+    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec;
+    int_t    nnz_loc, m_loc, fst_row, icol;
+    int      iam;
+    int      ldx;  /* LDA for matrix X (local). */
+    char     equed[1], norm[1];
+    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
+    doublecomplex   *X, *b_col, *b_work, *x_col;
+    double   t;
+    float    GA_mem_use;    /* memory usage by global A */
+    float    dist_mem_use; /* memory usage during distribution */
+    superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage;
+#if ( PRNTlevel>= 2 )
+    double   dmin, dsum, dprod;
+#endif
+
+    /* Structures needed for parallel symbolic factorization */
+    int_t *sizes, *fstVtxSep, parSymbFact;
+    int   noDomains, nprocs_num;
+    MPI_Comm symb_comm; /* communicator for symbolic factorization */
+    int   col, key; /* parameters for creating a new communicator */
+    Pslu_freeable_t Pslu_freeable;
+    float  flinfo;
+
+    /* Initialization. */
+    m       = A->nrow;
+    n       = A->ncol;
+    Astore  = (NRformat_loc *) A->Store;
+    nnz_loc = Astore->nnz_loc;
+    m_loc   = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    a       = (doublecomplex *) Astore->nzval;
+    rowptr  = Astore->rowptr;
+    colind  = Astore->colind;
+    sizes   = NULL;
+    fstVtxSep = NULL;
+    symb_comm = MPI_COMM_NULL;
+
+    /* Test the input parameters. */
+    *info = 0;
+    Fact = options->Fact;
+    if ( Fact < 0 || Fact > FACTORED )
+	*info = -1;
+    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
+	*info = -1;
+    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
+	*info = -1;
+    else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA )
+	*info = -1;
+    else if ( options->IterRefine == SLU_EXTRA ) {
+	*info = -1;
+	printf("ERROR: Extra precise iterative refinement yet to support.\n");
+    } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
+		|| A->Dtype != SLU_Z || A->Mtype != SLU_GE )
+	*info = -2;
+    else if ( ldb < m_loc )
+	*info = -5;
+    else if ( nrhs < 0 )
+	*info = -6;
+    if ( sp_ienv_dist(2) > sp_ienv_dist(3) ) {
+        *info = 1;
+	printf("ERROR: Relaxation (NREL) cannot be larger than max. supernode size (NSUP).\n"
+	"\t-> Check parameter setting in sp_ienv_dist.c to correct error.\n");
+    }
+    if ( *info ) {
+	i = -(*info);
+	pxerr_dist("pzgssvx", grid, -*info);
+	return;
+    }
+
+    factored = (Fact == FACTORED);
+    Equil = (!factored && options->Equil == YES);
+    notran = (options->Trans == NOTRANS);
+    parSymbFact = options->ParSymbFact;
+	
+    iam = grid->iam;
+    job = 5;
+    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
+	rowequ = (ScalePermstruct->DiagScale == ROW) ||
+	         (ScalePermstruct->DiagScale == BOTH);
+	colequ = (ScalePermstruct->DiagScale == COL) ||
+	         (ScalePermstruct->DiagScale == BOTH);
+    } else rowequ = colequ = FALSE;
+
+    /* The following arrays are replicated on all processes. */
+    perm_r = ScalePermstruct->perm_r;
+    perm_c = ScalePermstruct->perm_c;
+    etree = LUstruct->etree;
+    R = ScalePermstruct->R;
+    C = ScalePermstruct->C;
+    /********/
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pzgssvx()");
+#endif
+
+    /* Not factored & ask for equilibration */
+    if ( Equil && Fact != SamePattern_SameRowPerm ) { 
+	/* Allocate storage if not done so before. */
+	switch ( ScalePermstruct->DiagScale ) {
+	    case NOEQUIL:
+		if ( !(R = (double *) doubleMalloc_dist(m)) )
+		    ABORT("Malloc fails for R[].");
+	        if ( !(C = (double *) doubleMalloc_dist(n)) )
+		    ABORT("Malloc fails for C[].");
+		ScalePermstruct->R = R;
+		ScalePermstruct->C = C;
+		break;
+	    case ROW: 
+	        if ( !(C = (double *) doubleMalloc_dist(n)) )
+		    ABORT("Malloc fails for C[].");
+		ScalePermstruct->C = C;
+		break;
+	    case COL: 
+		if ( !(R = (double *) doubleMalloc_dist(m)) )
+		    ABORT("Malloc fails for R[].");
+		ScalePermstruct->R = R;
+		break;
+	}
+    }
+
+    /* ------------------------------------------------------------
+       Diagonal scaling to equilibrate the matrix. (simple scheme)
+       ------------------------------------------------------------*/
+    if ( Equil ) {
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC(iam, "Enter equil");
+#endif
+	t = SuperLU_timer_();
+
+	if ( Fact == SamePattern_SameRowPerm ) {
+	    /* Reuse R and C. */
+	    switch ( ScalePermstruct->DiagScale ) {
+	      case NOEQUIL:
+		break;
+	      case ROW:
+		irow = fst_row;
+		for (j = 0; j < m_loc; ++j) {
+		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
+                        zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */
+		    }
+		    ++irow;
+		}
+		break;
+	      case COL:
+		for (j = 0; j < m_loc; ++j)
+		    for (i = rowptr[j]; i < rowptr[j+1]; ++i){
+		        icol = colind[i];
+                        zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */
+		    }
+		break;
+	      case BOTH:
+		irow = fst_row;
+		for (j = 0; j < m_loc; ++j) {
+		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
+			icol = colind[i];
+                        zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */
+                        zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */
+		    }
+		    ++irow;
+		}
+	        break;
+	    }
+	} else { /* Compute R & C from scratch */
+            /* Compute the row and column scalings. */
+	    pzgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);
+
+	    if ( iinfo > 0 ) {
+		if ( iinfo <= m ) {
+#if ( PRNTlevel>=1 )
+		    fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo);
+#endif
+		} else {
+#if ( PRNTlevel>=1 )
+                    fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n);
+#endif
+                }
+ 	    } else if ( iinfo < 0 ) return;
+
+	    /* Now iinfo == 0 */
+
+            /* Equilibrate matrix A if it is badly-scaled. 
+               A <-- diag(R)*A*diag(C)                     */
+	    pzlaqgs(A, R, C, rowcnd, colcnd, amax, equed);
+
+	    if ( strncmp(equed, "R", 1)==0 ) {
+		  ScalePermstruct->DiagScale = ROW;
+		  rowequ = ROW;
+	    } else if ( strncmp(equed, "C", 1)==0 ) {
+		  ScalePermstruct->DiagScale = COL;
+		  colequ = COL;
+	    } else if ( strncmp(equed, "B", 1)==0 ) {
+		  ScalePermstruct->DiagScale = BOTH;
+		  rowequ = ROW;
+		  colequ = COL;
+	    } else ScalePermstruct->DiagScale = NOEQUIL;
+
+#if ( PRNTlevel>=1 )
+	    if ( !iam ) {
+		printf(".. equilibrated? *equed = %c\n", *equed);
+		/*fflush(stdout);*/
+	    }
+#endif
+	} /* end if Fact ... */
+
+	stat->utime[EQUIL] = SuperLU_timer_() - t;
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC(iam, "Exit equil");
+#endif
+    } /* end if Equil ... LAPACK style, not involving MC64 */
+
+    if ( !factored ) { /* Skip this if already factored. */
+        /*
+         * For serial symbolic factorization, gather A from the distributed
+	 * compressed row format to global A in compressed column format.
+         * Numerical values are gathered only when a row permutation
+         * for large diagonal is sought after.
+         */
+	if ( Fact != SamePattern_SameRowPerm &&
+             (parSymbFact == NO || options->RowPerm != NO) ) {
+             /* Performs serial symbolic factorization and/or MC64 */
+
+            need_value = (options->RowPerm == LargeDiag);
+
+            pzCompRow_loc_to_CompCol_global(need_value, A, grid, &GA);
+
+            GAstore = (NCformat *) GA.Store;
+            colptr = GAstore->colptr;
+            rowind = GAstore->rowind;
+            nnz = GAstore->nnz;
+            GA_mem_use = (nnz + n + 1) * sizeof(int_t);
+
+            if ( need_value ) {
+                a_GA = (doublecomplex *) GAstore->nzval;
+                GA_mem_use += nnz * sizeof(doublecomplex);
+            } else assert(GAstore->nzval == NULL);
+	}
+
+        /* ------------------------------------------------------------
+           Find the row permutation Pr for A, and apply Pr*[GA].
+	   GA is overwritten by Pr*[GA].
+           ------------------------------------------------------------*/
+        if ( options->RowPerm != NO ) {
+	    t = SuperLU_timer_();
+	    if ( Fact != SamePattern_SameRowPerm ) {
+	        if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
+	            /* Permute the global matrix GA for symbfact() */
+	            for (i = 0; i < colptr[n]; ++i) {
+	            	irow = rowind[i]; 
+		    	rowind[i] = perm_r[irow];
+	            }
+	        } else { /* options->RowPerm == LargeDiag */
+	            /* Get a new perm_r[] */
+	            if ( job == 5 ) {
+		        /* Allocate storage for scaling factors. */
+		        if ( !(R1 = doubleMalloc_dist(m)) )
+		            ABORT("SUPERLU_MALLOC fails for R1[]");
+		    	if ( !(C1 = doubleMalloc_dist(n)) )
+		            ABORT("SUPERLU_MALLOC fails for C1[]");
+	            }
+
+	            if ( !iam ) { /* Process 0 finds a row permutation */
+		        iinfo = zldperm_dist(job, m, nnz, colptr, rowind, a_GA,
+		                perm_r, R1, C1);
+		
+                        MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
+		        if ( iinfo == 0 ) {
+		            MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
+		            if ( job == 5 && Equil ) {
+		                MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
+		                MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
+                            }
+		        }
+	            } else {
+		        MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
+			if ( iinfo == 0 ) {
+		            MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
+		            if ( job == 5 && Equil ) {
+		                MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
+		                MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
+                            }
+		        }
+	            }
+
+	    	    if ( iinfo && job == 5) { /* Error return */
+	                SUPERLU_FREE(R1);
+	        	SUPERLU_FREE(C1);
+   	            }
+#if ( PRNTlevel>=2 )
+	            dmin = dmach_dist("Overflow");
+	            dsum = 0.0;
+	            dprod = 1.0;
+#endif
+	            if ( iinfo == 0 ) {
+	              if ( job == 5 ) {
+		        if ( Equil ) {
+		            for (i = 0; i < n; ++i) {
+			        R1[i] = exp(R1[i]);
+			        C1[i] = exp(C1[i]);
+		            }
+
+		            /* Scale the distributed matrix further.
+			       A <-- diag(R1)*A*diag(C1)            */
+		            irow = fst_row;
+		            for (j = 0; j < m_loc; ++j) {
+			        for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
+			            icol = colind[i];
+                                    zd_mult(&a[i], &a[i], R1[irow]);
+                                    zd_mult(&a[i], &a[i], C1[icol]);
+#if ( PRNTlevel>=2 )
+			            if ( perm_r[irow] == icol ) { /* New diagonal */
+			              if ( job == 2 || job == 3 )
+			                dmin = SUPERLU_MIN(dmin, slud_z_abs1(&a[i]));
+			              else if ( job == 4 )
+				        dsum += slud_z_abs1(&a[i]);
+			              else if ( job == 5 )
+				        dprod *= slud_z_abs1(&a[i]);
+			            }
+#endif
+			        }
+			        ++irow;
+		            }
+
+		            /* Multiply together the scaling factors --
+			       R/C from simple scheme, R1/C1 from MC64. */
+		            if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
+		            else for (i = 0; i < m; ++i) R[i] = R1[i];
+		            if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
+		            else for (i = 0; i < n; ++i) C[i] = C1[i];
+		    
+		            ScalePermstruct->DiagScale = BOTH;
+		            rowequ = colequ = 1;
+
+		        } /* end Equil */
+
+                        /* Now permute global GA to prepare for symbfact() */
+                        for (j = 0; j < n; ++j) {
+		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	                        irow = rowind[i];
+		                rowind[i] = perm_r[irow];
+		            }
+		        }
+		        SUPERLU_FREE (R1);
+		        SUPERLU_FREE (C1);
+	              } else { /* job = 2,3,4 */
+		        for (j = 0; j < n; ++j) {
+		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
+			        irow = rowind[i];
+			        rowind[i] = perm_r[irow];
+		            } /* end for i ... */
+		        } /* end for j ... */
+	              } /* end else job ... */
+                    } else { /* if iinfo != 0 */
+			for (i = 0; i < m; ++i) perm_r[i] = i;
+		    }
+
+#if ( PRNTlevel>=2 )
+	            if ( job == 2 || job == 3 ) {
+		        if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
+	            } else if ( job == 4 ) {
+		        if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
+	            } else if ( job == 5 ) {
+		        if ( !iam ) printf("\t product of diagonal %e\n", dprod);
+	            }
+#endif
+                } /* end if options->RowPerm ... */
+
+	        t = SuperLU_timer_() - t;
+	        stat->utime[ROWPERM] = t;
+#if ( PRNTlevel>=1 )
+                if ( !iam ) printf(".. LDPERM job " IFMT "\t time: %.2f\n",
+	                            job, t);
+#endif
+            } /* end if Fact ... */
+
+        } else { /* options->RowPerm == NOROWPERM / NATURAL */
+            for (i = 0; i < m; ++i) perm_r[i] = i;
+        }
+
+#if ( DEBUGlevel>=2 )
+        if ( !iam ) PrintInt10("perm_r",  m, perm_r);
+#endif
+    } /* end if (!factored) */
+
+    if ( !factored || options->IterRefine ) {
+	/* Compute norm(A), which will be used to adjust small diagonal. */
+	if ( notran ) *(unsigned char *)norm = '1';
+	else *(unsigned char *)norm = 'I';
+	anorm = pzlangs(norm, A, grid);
+#if ( PRNTlevel>=1 )
+	if ( !iam ) printf(".. anorm %e\n", anorm);
+#endif
+    }
+
+    /* ------------------------------------------------------------
+       Perform the LU factorization: symbolic factorization, 
+       redistribution, and numerical factorization.
+       ------------------------------------------------------------*/
+    if ( !factored ) {
+	t = SuperLU_timer_();
+	/*
+	 * Get column permutation vector perm_c[], according to permc_spec:
+	 *   permc_spec = NATURAL:  natural ordering 
+	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
+	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
+	 *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
+	 *   permc_spec = PARMETIS: parallel METIS on structure of A'+A
+	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
+	 */
+	permc_spec = options->ColPerm;
+
+	if ( parSymbFact == YES || permc_spec == PARMETIS ) {
+	    nprocs_num = grid->nprow * grid->npcol;
+  	    noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num ))));
+
+	    /* create a new communicator for the first noDomains
+               processes in grid->comm */
+	    key = iam;
+    	    if (iam < noDomains) col = 0;
+	    else col = MPI_UNDEFINED;
+	    MPI_Comm_split (grid->comm, col, key, &symb_comm );
+
+	    if ( permc_spec == NATURAL || permc_spec == MY_PERMC ) {
+		if ( permc_spec == NATURAL ) {
+		     for (j = 0; j < n; ++j) perm_c[j] = j;
+                }
+		if ( !(sizes = intMalloc_dist(2 * noDomains)) ) 
+		     ABORT("SUPERLU_MALLOC fails for sizes.");
+		if ( !(fstVtxSep = intMalloc_dist(2 * noDomains)) )
+		    ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
+		for (i = 0; i < 2*noDomains - 2; ++i) {
+		    sizes[i] = 0;
+		    fstVtxSep[i] = 0;
+		}
+		sizes[2*noDomains - 2] = m;
+		fstVtxSep[2*noDomains - 2] = 0;
+	    } else if ( permc_spec != PARMETIS ) {   /* same as before */
+		printf("{" IFMT "," IFMT "}: pzgssvx: invalid ColPerm option when ParSymbfact is used\n",
+		       MYROW(grid->iam, grid), MYCOL(grid->iam, grid));
+	    }
+        }
+
+	if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
+          /* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */
+	  if ( permc_spec == PARMETIS ) {
+	      /* Get column permutation vector in perm_c.                    *
+	       * This routine takes as input the distributed input matrix A  *
+	       * and does not modify it.  It also allocates memory for       *
+	       * sizes[] and fstVtxSep[] arrays, that contain information    *
+	       * on the separator tree computed by ParMETIS.                 */
+	      flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num,
+                                  	   noDomains, &sizes, &fstVtxSep,
+                                           grid, &symb_comm);
+	      if (flinfo > 0) {
+#if ( PRNTlevel>=1 )
+	          fprintf(stderr, "Insufficient memory for get_perm_c parmetis\n");
+#endif
+		  *info = flinfo;
+		  return;
+     	      }
+	  } else {
+	      get_perm_c_dist(iam, permc_spec, &GA, perm_c);
+          }
+        }
+
+	stat->utime[COLPERM] = SuperLU_timer_() - t;
+
+	/* Compute the elimination tree of Pc*(A^T+A)*Pc^T or Pc*A^T*A*Pc^T
+	   (a.k.a. column etree), depending on the choice of ColPerm.
+	   Adjust perm_c[] to be consistent with a postorder of etree.
+	   Permute columns of A to form A*Pc'. */
+	if ( Fact != SamePattern_SameRowPerm ) {
+	    if ( parSymbFact == NO ) { /* Perform serial symbolic factorization */
+		/* GA = Pr*A, perm_r[] is already applied. */
+	        int_t *GACcolbeg, *GACcolend, *GACrowind;
+
+		/* After this routine, GAC = GA*Pc^T.  */
+	        sp_colorder(options, &GA, perm_c, etree, &GAC); 
+
+	        /* Form Pc*A*Pc^T to preserve the diagonal of the matrix GAC. */
+	        GACstore = (NCPformat *) GAC.Store;
+	        GACcolbeg = GACstore->colbeg;
+	        GACcolend = GACstore->colend;
+	        GACrowind = GACstore->rowind;
+	        for (j = 0; j < n; ++j) {
+	            for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) {
+		        irow = GACrowind[i];
+		        GACrowind[i] = perm_c[irow];
+	            }
+	        }
+
+	        /* Perform a symbolic factorization on Pc*Pr*A*Pc^T and set up
+                   the nonzero data structures for L & U. */
+#if ( PRNTlevel>=1 ) 
+                if ( !iam )
+		  printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
+		          sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
+#endif
+  	        t = SuperLU_timer_();
+	        if ( !(Glu_freeable = (Glu_freeable_t *)
+		      SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
+		    ABORT("Malloc fails for Glu_freeable.");
+
+	    	/* Every process does this. */
+	    	iinfo = symbfact(options, iam, &GAC, perm_c, etree, 
+			     	 Glu_persist, Glu_freeable);
+
+	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
+	    	if ( iinfo <= 0 ) { /* Successful return */
+		    QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
+#if ( PRNTlevel>=1 )
+		    if ( !iam ) {
+		    	printf("\tNo of supers " IFMT "\n", (long long) Glu_persist->supno[n-1]+1);
+		    	printf("\tSize of G(L) " IFMT "\n", (long long) Glu_freeable->xlsub[n]);
+		    	printf("\tSize of G(U) " IFMT "\n", (long long) Glu_freeable->xusub[n]);
+		    	printf("\tint %d, short %d, float %d, double %d\n", 
+			       (int) sizeof(int_t), (int) sizeof(short),
+        		       (int) sizeof(float), (int) sizeof(double));
+		    	printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n",
+			   	symb_mem_usage.for_lu*1e-6, 
+			   	symb_mem_usage.total*1e-6,
+			   	symb_mem_usage.expansions);
+		    }
+#endif
+	    	} else { /* symbfact out of memory */
+#if ( PRNTlevel>=1 )
+		    if ( !iam )
+		        fprintf(stderr,"symbfact() error returns " IFMT "\n",iinfo);
+#endif
+		    *info = iinfo;
+		    return;
+	        }
+	    } /* end serial symbolic factorization */
+	    else {  /* parallel symbolic factorization */
+	    	t = SuperLU_timer_();
+	    	flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r,
+				       sizes, fstVtxSep, &Pslu_freeable, 
+				       &(grid->comm), &symb_comm,
+				       &symb_mem_usage); 
+	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
+	    	if (flinfo > 0) {
+#if ( PRNTlevel>=1 )
+	      	    fprintf(stderr, "Insufficient memory for parallel symbolic factorization.");
+#endif
+		    *info = flinfo;
+		    return;
+                }
+	    }
+
+            /* Destroy global GA */
+            if ( parSymbFact == NO || options->RowPerm != NO )
+                Destroy_CompCol_Matrix_dist(&GA);
+            if ( parSymbFact == NO )
+ 	        Destroy_CompCol_Permuted_dist(&GAC);
+
+	} /* end if Fact ... */
+
+        if (sizes) SUPERLU_FREE (sizes);
+        if (fstVtxSep) SUPERLU_FREE (fstVtxSep);
+	if (symb_comm != MPI_COMM_NULL)
+	  MPI_Comm_free (&symb_comm); 
+
+	if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) {
+	    /* CASE OF SERIAL SYMBOLIC */
+  	    /* Apply column permutation to the original distributed A */
+	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
+
+	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc^T into L and U storage. 
+	       NOTE: the row permutation Pc*Pr is applied internally in the
+  	       distribution routine. */
+	    t = SuperLU_timer_();
+	    dist_mem_use = pzdistribute(Fact, n, A, ScalePermstruct,
+                                      Glu_freeable, LUstruct, grid);
+	    stat->utime[DIST] = SuperLU_timer_() - t;
+
+  	    /* Deallocate storage used in symbolic factorization. */
+	    if ( Fact != SamePattern_SameRowPerm ) {
+	        iinfo = symbfact_SubFree(Glu_freeable);
+	        SUPERLU_FREE(Glu_freeable);
+	    }
+	} else { /* CASE OF PARALLEL SYMBOLIC */
+	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
+	       NOTE: the row permutation Pc*Pr is applied internally in the
+	       distribution routine. */
+	    /* Apply column permutation to the original distributed A */
+	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
+
+    	    t = SuperLU_timer_();
+	    dist_mem_use = zdist_psymbtonum(Fact, n, A, ScalePermstruct,
+		  			   &Pslu_freeable, LUstruct, grid);
+	    if (dist_mem_use > 0)
+	        ABORT ("Not enough memory available for dist_psymbtonum\n");
+            
+	    stat->utime[DIST] = SuperLU_timer_() - t;
+	}
+
+	/*if (!iam) printf ("\tDISTRIBUTE time  %8.2f\n", stat->utime[DIST]);*/
+
+	/* Perform numerical factorization in parallel. */
+	t = SuperLU_timer_();
+	pzgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
+	stat->utime[FACT] = SuperLU_timer_() - t;
+
+#if 0
+
+// #ifdef GPU_PROF
+
+//  if(!iam )
+//  {
+//      char* ttemp;
+
+//      ttemp = getenv("IO_FILE");
+//      if(ttemp!=NULL)
+//      {   
+//          printf("File being opend is %s\n",ttemp );
+//          FILE* fp;
+//          fp = fopen(ttemp,"w");
+//          if(!fp)
+//          {
+//              fprintf(stderr," Couldn't open output file %s\n",ttemp);
+//          }
+
+//          int nsup=Glu_persist->supno[n-1]+1;
+//          int ii;
+//          for (ii = 0; ii < nsup; ++ii)
+//          {
+//                  fprintf(fp,"%d,%d,%d,%d,%d,%d\n",gs1.mnk_min_stats[ii],gs1.mnk_min_stats[ii+nsup],
+//                  gs1.mnk_min_stats[ii+2*nsup],
+//                  gs1.mnk_max_stats[ii],gs1.mnk_max_stats[ii+nsup],gs1.mnk_max_stats[ii+2*nsup]);
+//          }
+
+//          // lastly put the timeing stats that we need
+
+//          fprintf(fp,"Min %lf Max %lf totaltime %lf \n",gs1.osDgemmMin,gs1.osDgemmMax,stat->utime[FACT]);
+//          fclose(fp);
+//      }
+
+//  }
+// #endif
+
+#endif
+
+	if ( options->PrintStat ) {
+	    int_t TinyPivots;
+	    float for_lu, total, max, avg, temp;
+
+	    zQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
+
+	    if (parSymbFact == TRUE) {
+	        /* The memory used in the redistribution routine
+		   includes the memory used for storing the symbolic
+  		   structure and the memory allocated for numerical
+		   factorization */
+	        temp = SUPERLU_MAX(symb_mem_usage.total, -dist_mem_use);
+                if ( options->RowPerm != NO )
+                    temp = SUPERLU_MAX(temp, GA_mem_use);
+            } else {
+	        temp = SUPERLU_MAX (
+                         symb_mem_usage.total + GA_mem_use, /* symbfact step */
+		         symb_mem_usage.for_lu + dist_mem_use +
+                             num_mem_usage.for_lu  /* distribution step */
+                       );
+            }
+            
+	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
+
+	    MPI_Reduce( &temp, &max,
+		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+	    MPI_Reduce( &temp, &avg,
+		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
+			  MPI_SUM, grid->comm );
+	    stat->TinyPivots = TinyPivots;
+
+	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
+		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	    MPI_Reduce( &num_mem_usage.total, &total,
+		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+
+            if (!iam) {
+		printf("\n** Memory Usage **********************************\n");
+                printf("** NUMfact space (MB): (sum-of-all-processes)\n"
+		       "    L\\U :        %8.2f |  Total : %8.2f\n",
+		       for_lu * 1e-6, total * 1e-6);
+                printf("** Total highmark (MB):\n"
+		       "    Sum-of-all : %8.2f | Avg : %8.2f  | Max : %8.2f\n",
+		       avg * 1e-6,  
+		       avg / grid->nprow / grid->npcol * 1e-6,
+		       max * 1e-6);
+		printf("**************************************************\n");
+            }
+	} /* end printing stats */
+    
+    } /* end if (!factored) */
+
+    
+    if ( options->Fact == DOFACT || options->Fact == SamePattern ) {
+	/* Need to reset the solve's communication pattern,
+	   because perm_r[] and/or perm_c[] is changed.    */
+	if ( options->SolveInitialized == YES ) { /* Initialized before */
+	    zSolveFinalize(options, SOLVEstruct); /* Clean up structure */
+	    options->SolveInitialized = NO;   /* Reset the solve state */
+	}
+     }
+#if 0
+    /* Need to revisit: Why the following is not good enough for X-to-B
+       distribution -- inv_perm_c changed */
+	pxgstrs_finalize(SOLVEstruct->gstrs_comm);
+	pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, 
+	             LUstruct->Glu_persist, SOLVEstruct);
+#endif
+
+	
+    /* ------------------------------------------------------------
+       Compute the solution matrix X.
+       ------------------------------------------------------------*/
+    if ( nrhs && *info == 0 ) {
+
+	if ( !(b_work = doublecomplexMalloc_dist(n)) )
+	    ABORT("Malloc fails for b_work[]");
+
+	/* ------------------------------------------------------------
+	   Scale the right-hand side if equilibration was performed. 
+	   ------------------------------------------------------------*/
+	if ( notran ) {
+	    if ( rowequ ) {
+		b_col = B;
+		for (j = 0; j < nrhs; ++j) {
+		    irow = fst_row;
+		    for (i = 0; i < m_loc; ++i) {
+                        zd_mult(&b_col[i], &b_col[i], R[irow]);
+		        ++irow;
+		    }
+		    b_col += ldb;
+		}
+	    }
+	} else if ( colequ ) {
+	    b_col = B;
+	    for (j = 0; j < nrhs; ++j) {
+	        irow = fst_row;
+		for (i = 0; i < m_loc; ++i) {
+		    zd_mult(&b_col[i], &b_col[i], C[irow]);
+		    ++irow;
+		}
+		b_col += ldb;
+	    }
+	}
+
+	/* Save a copy of the right-hand side. */
+	ldx = ldb;
+	if ( !(X = doublecomplexMalloc_dist(((size_t)ldx) * nrhs)) )
+	    ABORT("Malloc fails for X[]");
+	x_col = X;  b_col = B;
+	for (j = 0; j < nrhs; ++j) {
+#if 0 /* Sherry */
+	    for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
+#endif
+            memcpy(x_col, b_col, m_loc * sizeof(doublecomplex));
+	    x_col += ldx;  b_col += ldb;
+	}
+
+	/* ------------------------------------------------------------
+	   Solve the linear system.
+	   ------------------------------------------------------------*/
+	if ( options->SolveInitialized == NO ) { /* First time */
+	    zSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid,
+		       SOLVEstruct);
+            /* Inside this routine, SolveInitialized is set to YES.
+	       For repeated call to pzgssvx(), no need to re-initialilze
+	       the Solve data & communication structures, unless a new
+	       factorization with Fact == DOFACT or SamePattern is asked for. */
+	} 
+
+	pzgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, 
+		fst_row, ldb, nrhs, SOLVEstruct, stat, info);
+
+	/* ------------------------------------------------------------
+	   Use iterative refinement to improve the computed solution and
+	   compute error bounds and backward error estimates for it.
+	   ------------------------------------------------------------*/
+	if ( options->IterRefine ) {
+	    /* Improve the solution by iterative refinement. */
+	    int_t *it;
+            int_t *colind_gsmv = SOLVEstruct->A_colind_gsmv;
+	          /* This was allocated and set to NULL in zSolveInit() */
+	    SOLVEstruct_t *SOLVEstruct1;  /* Used by refinement. */
+
+	    t = SuperLU_timer_();
+	    if ( options->RefineInitialized == NO || Fact == DOFACT ) {
+	        /* All these cases need to re-initialize gsmv structure */
+	        if ( options->RefineInitialized )
+		    pzgsmv_finalize(SOLVEstruct->gsmv_comm);
+	        pzgsmv_init(A, SOLVEstruct->row_to_proc, grid,
+			    SOLVEstruct->gsmv_comm);
+	       
+                /* Save a copy of the transformed local col indices
+		   in colind_gsmv[]. */
+	        if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv);
+	        if ( !(it = intMalloc_dist(nnz_loc)) )
+		    ABORT("Malloc fails for colind_gsmv[]");
+	        colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
+	        for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
+	        options->RefineInitialized = YES;
+	    } else if ( Fact == SamePattern ||
+			Fact == SamePattern_SameRowPerm ) {
+	        doublecomplex atemp;
+	        int_t k, jcol, p;
+	        /* Swap to beginning the part of A corresponding to the
+		   local part of X, as was done in pzgsmv_init() */
+	        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+		    k = rowptr[i];
+		    for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+		        jcol = colind[j];
+		        p = SOLVEstruct->row_to_proc[jcol];
+		        if ( p == iam ) { /* Local */
+		            atemp = a[k]; a[k] = a[j]; a[j] = atemp;
+		            ++k;
+		        }
+		    }
+	        }
+	      
+	        /* Re-use the local col indices of A obtained from the
+		   previous call to pzgsmv_init() */
+	        for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i];
+	    }
+
+	    if ( nrhs == 1 ) { /* Use the existing solve structure */
+	        SOLVEstruct1 = SOLVEstruct;
+	    } else { /* For nrhs > 1, since refinement is performed for RHS
+			one at a time, the communication structure for pdgstrs
+			is different than the solve with nrhs RHS. 
+			So we use SOLVEstruct1 for the refinement step.
+		     */
+	        if ( !(SOLVEstruct1 = (SOLVEstruct_t *) 
+		                       SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) )
+		    ABORT("Malloc fails for SOLVEstruct1");
+	        /* Copy the same stuff */
+	        SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
+	        SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
+	        SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
+	        SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
+	        SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
+	        SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
+	        SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
+		
+		/* Initialize the *gstrs_comm for 1 RHS. */
+		if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
+		       SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
+		    ABORT("Malloc fails for gstrs_comm[]");
+		pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, 
+			     Glu_persist, SOLVEstruct1);
+	    }
+
+	    pzgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid,
+		    B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);
+
+            /* Deallocate the storage associated with SOLVEstruct1 */
+	    if ( nrhs > 1 ) {
+	        pxgstrs_finalize(SOLVEstruct1->gstrs_comm);
+	        SUPERLU_FREE(SOLVEstruct1);
+	    }
+
+	    stat->utime[REFINE] = SuperLU_timer_() - t;
+	} /* end if IterRefine */
+
+	/* Permute the solution matrix B <= Pc'*X. */
+	pzPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc,
+			       SOLVEstruct->inv_perm_c,
+			       X, ldx, B, ldb, nrhs, grid);
+#if ( DEBUGlevel>=2 )
+	printf("\n (%d) .. After pzPermute_Dense_Matrix(): b =\n", iam);
+	for (i = 0; i < m_loc; ++i)
+	  printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]);
+#endif
+	
+	/* Transform the solution matrix X to a solution of the original
+	   system before equilibration. */
+	if ( notran ) {
+	    if ( colequ ) {
+		b_col = B;
+		for (j = 0; j < nrhs; ++j) {
+		    irow = fst_row;
+		    for (i = 0; i < m_loc; ++i) {
+                        zd_mult(&b_col[i], &b_col[i], C[irow]);
+		        ++irow;
+		    }
+		    b_col += ldb;
+		}
+	    }
+	} else if ( rowequ ) {
+	    b_col = B;
+	    for (j = 0; j < nrhs; ++j) {
+	        irow = fst_row;
+		for (i = 0; i < m_loc; ++i) {
+		    zd_mult(&b_col[i], &b_col[i], R[irow]);
+		    ++irow;
+		}
+		b_col += ldb;
+	    }
+	}
+
+	SUPERLU_FREE(b_work);
+	SUPERLU_FREE(X);
+
+    } /* end if nrhs != 0 && *info == 0 */
+
+#if ( PRNTlevel>=1 )
+    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
+#endif
+
+    /* Deallocate R and/or C if it was not used. */
+    if ( Equil && Fact != SamePattern_SameRowPerm ) {
+	switch ( ScalePermstruct->DiagScale ) {
+	    case NOEQUIL:
+	        SUPERLU_FREE(R);
+		SUPERLU_FREE(C);
+		break;
+	    case ROW: 
+		SUPERLU_FREE(C);
+		break;
+	    case COL: 
+		SUPERLU_FREE(R);
+		break;
+	}
+    }
+
+#if 0
+    if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
+ 	Destroy_CompCol_Permuted_dist(&GAC);
+#endif
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pzgssvx()");
+#endif
+
+}
diff --git a/SRC/pzgssvx_ABglobal.c b/SRC/pzgssvx_ABglobal.c
new file mode 100644
index 0000000..247f9e8
--- /dev/null
+++ b/SRC/pzgssvx_ABglobal.c
@@ -0,0 +1,1104 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Solves a system of linear equations A*X=B,
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Last modified:
+ * December 31, 2015   version 4.3
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * pzgssvx_ABglobal solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ *
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right hand sides, and its dimensions ldb and nrhs
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using 
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has several options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously 
+ *      solved problem to save time by reusing part or all of 
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *      -  A, the input matrix
+ *
+ *      as well as the following options, which are described in more 
+ *      detail below:
+ *
+ *      -  options->Equil,   to specify how to scale the rows and columns
+ *                           of A to "equilibrate" it (to try to reduce its
+ *                           condition number and so improve the
+ *                           accuracy of the computed solution)
+ *
+ *      -  options->RowPerm, to specify how to permute the rows of A
+ *                           (typically to control numerical stability)
+ *
+ *      -  options->ColPerm, to specify how to permute the columns of A
+ *                           (typically to control fill-in and enhance
+ *                           parallelism during factorization)
+ *
+ *      -  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                           pivots encountered during factorization
+ *                           (to control numerical stability)
+ *
+ *      The outputs returned include
+ *         
+ *      -  ScalePermstruct,  modified to describe how the input matrix A
+ *                           was equilibrated and permuted:
+ *         -  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                        columns of A were scaled
+ *         -  ScalePermstruct->R, array of row scale factors
+ *         -  ScalePermstruct->C, array of column scale factors
+ *         -  ScalePermstruct->perm_r, row permutation vector
+ *         -  ScalePermstruct->perm_c, column permutation vector
+ *
+ *            (part of ScalePermstruct may also need to be supplied on input,
+ *             depending on options->RowPerm and options->ColPerm as described 
+ *             later).
+ *
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *                Pc*Pr*diag(R)*A*diag(C)
+ *             where 
+ *                Pr and Pc are row and column permutation matrices determined
+ *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c, 
+ *                  respectively, and 
+ *                diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
+ *                  ScalePermstruct->C
+ *
+ *      -  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *              (Note that A1 = Aout * Pc^T, where Aout is the matrix stored
+ *               in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *     
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In this
+ *            case the algorithm saves time by reusing the previously computed
+ *            column permutation vector stored in ScalePermstruct->perm_c
+ *            and the "elimination tree" of A stored in LUstruct->etree.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *      -  options->Equil
+ *      -  options->RowPerm
+ *      -  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply 
+ *
+ *      -  A, the input matrix
+ *      -  ScalePermstruct->perm_c, the column permutation
+ *      -  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *         
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *            as described above
+ *      -  ScalePermstruct,  modified to describe how the input matrix A was
+ *                           equilibrated and row permuted
+ *      -  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *      -  options->Equil
+ *      -  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are ignored.
+ *      This is because the permutations from ScalePermstruct->perm_r and
+ *      ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply 
+ *
+ *      -  A, the input matrix
+ *      -  ScalePermstruct->DiagScale, how the previous matrix was row and/or
+ *                                     column scaled
+ *      -  ScalePermstruct->R, the row scalings of the previous matrix, if any
+ *      -  ScalePermstruct->C, the column scalings of the previous matrix, 
+ *                             if any
+ *      -  ScalePermstruct->perm_r, the row permutation of the previous matrix
+ *      -  ScalePermstruct->perm_c, the column permutation of the previous 
+ *                                  matrix
+ *      -  all of LUstruct, the previously computed information about L and U
+ *                (the actual numerical values of L and U stored in
+ *                 LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *         
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *            as described above
+ *      -  ScalePermstruct,  modified to describe how the input matrix A was
+ *                           equilibrated 
+ *                  (thus ScalePermstruct->DiagScale, R and C may be modified)
+ *      -  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous 
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm, 
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply 
+ *
+ *      -  A, the unfactored matrix, only in the case that iterative refinement
+ *            is to be done (specifically A must be the output A from 
+ *            the previous call, so that it has been scaled and permuted)
+ *      -  all of ScalePermstruct
+ *      -  all of LUstruct, including the actual numerical values of L and U
+ *
+ *      all of which are unmodified on output.
+ *         
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *         
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or 
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector 
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or 
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity	pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, both row and
+ *             column permutation vectors perm_r and perm_c, and the
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or 
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *           
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *         
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input/output) SuperMatrix*
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *         The number of linear equations is A->nrow. The type of A must be:
+ *         Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE. That is, A is stored in
+ *         compressed column format (also known as Harwell-Boeing format).
+ *         See supermatrix.h for the definition of 'SuperMatrix'.
+ *         This routine only handles square A, however, the LU factorization
+ *         routine pzgstrf can factorize rectangular matrices.
+ *         On exit, A may be overwritten by Pc*Pr*diag(R)*A*diag(C),
+ *         depending on ScalePermstruct->DiagScale, options->RowPerm and
+ *         options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->RowPerm != NATURAL, A is further overwritten by
+ *                Pr*diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                Pc*Pr*diag(R)*A*diag(C).
+ *         If all the above conditions are true, the LU decomposition is
+ *         performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ *         NOTE: Currently, A must reside in all processes when calling
+ *               this routine.
+ *
+ * ScalePermstruct (input/output) ScalePermstruct_t*
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was 
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the 
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (double*) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double*) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by 
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *         
+ * B       (input/output) doublecomplex*
+ *         On entry, the right-hand side matrix of dimension (A->nrow, nrhs).
+ *         On exit, the solution matrix if info = 0;
+ *
+ *         NOTE: Currently, B must reside in all processes when calling
+ *               this routine.
+ *
+ * ldb     (input) int (global)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t*
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_zdefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (LocalLU_t*)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_zdefs.h for the definition of 'LocalLU_t'.
+ *
+ * berr    (output) double*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution   
+ *         vector X(j) (i.e., the smallest relative change in   
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ *
+ * See superlu_zdefs.h for the definitions of various data types.
+ * </pre>
+ */
+void
+pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, 
+		 ScalePermstruct_t *ScalePermstruct,
+		 doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid,
+		 LUstruct_t *LUstruct, double *berr,
+		 SuperLUStat_t *stat, int *info)
+{
+    SuperMatrix AC;
+    NCformat *Astore;
+    NCPformat *ACstore;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    Glu_freeable_t *Glu_freeable;
+            /* The nonzero structures of L and U factors, which are
+	       replicated on all processes.
+	           (lsub, xlsub) contains the compressed subscript of
+		                 supernodes in L.
+          	   (usub, xusub) contains the compressed subscript of
+		                 nonzero segments in U.
+	      If options->Fact != SamePattern_SameRowPerm, they are 
+	      computed by SYMBFACT routine, and then used by ZDISTRIBUTE
+	      routine. They will be freed after ZDISTRIBUTE routine.
+	      If options->Fact == SamePattern_SameRowPerm, these
+	      structures are not used.                                  */
+    fact_t   Fact;
+    doublecomplex   *a;
+    int_t    *perm_r; /* row permutations from partial pivoting */
+    int_t    *perm_c; /* column permutation vector */
+    int_t    *etree;  /* elimination tree */
+    int_t    *colptr, *rowind;
+    int_t    Equil, factored, job, notran, colequ, rowequ;
+    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
+    int      iam;
+    int      ldx;  /* LDA for matrix X (global). */
+    char     equed[1], norm[1];
+    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
+    doublecomplex   *X, *b_col, *b_work, *x_col;
+    double   t;
+    static superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage;
+#if ( PRNTlevel>= 2 )
+    double   dmin, dsum, dprod;
+#endif
+
+    /* Test input parameters; the first failing check decides *info. */
+    *info = 0;
+    Fact = options->Fact;
+    if ( Fact < 0 || Fact > FACTORED )
+	*info = -1;
+    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
+	*info = -1;
+    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
+	*info = -1;
+    else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA )
+	*info = -1;
+    else if ( options->IterRefine == SLU_EXTRA ) {
+	*info = -1;
+	fprintf(stderr, "Extra precise iterative refinement yet to support.");
+    } else if ( A->nrow != A->ncol || A->nrow < 0 ||
+         A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
+	*info = -2;
+    else if ( ldb < A->nrow )
+	*info = -5;
+    else if ( nrhs < 0 )
+	*info = -6;
+    if ( *info ) {
+	i = -(*info);
+	pxerr_dist("pzgssvx_ABglobal", grid, -*info);
+	return;
+    }
+
+    /* Initialization */
+    factored = (Fact == FACTORED);
+    Equil = (!factored && options->Equil == YES);
+    notran = (options->Trans == NOTRANS);
+    iam = grid->iam;
+    job = 5; /* passed to zldperm_dist; with 5 the scaling vectors R1/C1 are used */
+    m = A->nrow;
+    n = A->ncol;
+    Astore = A->Store;
+    nnz = Astore->nnz;
+    a = Astore->nzval;
+    colptr = Astore->colptr;
+    rowind = Astore->rowind;
+    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
+	rowequ = (ScalePermstruct->DiagScale == ROW) ||
+	         (ScalePermstruct->DiagScale == BOTH);
+	colequ = (ScalePermstruct->DiagScale == COL) ||
+	         (ScalePermstruct->DiagScale == BOTH);
+    } else rowequ = colequ = FALSE;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pzgssvx_ABglobal()");
+#endif
+
+    perm_r = ScalePermstruct->perm_r;
+    perm_c = ScalePermstruct->perm_c;
+    etree = LUstruct->etree;
+    R = ScalePermstruct->R;
+    C = ScalePermstruct->C;
+    if ( Equil && Fact != SamePattern_SameRowPerm ) {
+	/* Allocate storage if not done so before. */
+	switch ( ScalePermstruct->DiagScale ) {
+	    case NOEQUIL:
+		if ( !(R = (double *) doubleMalloc_dist(m)) )
+		    ABORT("Malloc fails for R[].");
+	        if ( !(C = (double *) doubleMalloc_dist(n)) )
+		    ABORT("Malloc fails for C[].");
+		ScalePermstruct->R = R;
+		ScalePermstruct->C = C;
+		break;
+	    case ROW: 
+	        if ( !(C = (double *) doubleMalloc_dist(n)) )
+		    ABORT("Malloc fails for C[].");
+		ScalePermstruct->C = C;
+		break;
+	    case COL: 
+		if ( !(R = (double *) doubleMalloc_dist(m)) )
+		    ABORT("Malloc fails for R[].");
+		ScalePermstruct->R = R;
+		break;
+	}
+    }
+
+    /* ------------------------------------------------------------
+       Diagonal scaling to equilibrate the matrix.
+       ------------------------------------------------------------*/
+    if ( Equil ) {
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC(iam, "Enter equil");
+#endif
+	t = SuperLU_timer_();
+
+	if ( Fact == SamePattern_SameRowPerm ) {
+	    /* Reuse R and C. */
+	    switch ( ScalePermstruct->DiagScale ) {
+	      case NOEQUIL:
+		break;
+	      case ROW:
+		for (j = 0; j < n; ++j) {
+		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
+			irow = rowind[i]; /* R[] is indexed by row, not by nonzero position */
+			zd_mult(&a[i], &a[i], R[irow]); /* Scale rows. */
+		    }
+		}
+		break;
+	      case COL:
+		for (j = 0; j < n; ++j)
+		    for (i = colptr[j]; i < colptr[j+1]; ++i)
+			zd_mult(&a[i], &a[i], C[j]); /* Scale columns. */
+		break;
+	      case BOTH: 
+		for (j = 0; j < n; ++j) {
+		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
+			irow = rowind[i];
+			zd_mult(&a[i], &a[i], R[irow]); /* Scale rows. */
+			zd_mult(&a[i], &a[i], C[j]); /* Scale columns. */
+		    }
+		}
+	        break;
+	    }
+	} else {
+	    if ( !iam ) {
+		/* Compute row and column scalings to equilibrate matrix A. */
+		zgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo);
+	    
+		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
+		if ( iinfo == 0 ) {
+		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( C,       n, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
+		} else {
+		    if ( iinfo > 0 ) {
+			if ( iinfo <= m ) {
+#if ( PRNTlevel>=1 )
+			    fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", 
+				    iinfo);
+#endif
+			} else {
+#if ( PRNTlevel>=1 )
+                            fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", 
+				     iinfo-n);
+#endif
+                        }
+		    }
+		}
+	    } else {
+		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
+		if ( iinfo == 0 ) {
+		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( C,       n, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
+		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
+		} 
+	    }
+	
+            if ( iinfo == 0 ) {
+	        /* Equilibrate matrix A. */
+	        zlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed);
+	        if ( strncmp(equed, "R", 1)==0 ) {
+		    ScalePermstruct->DiagScale = ROW;
+		    rowequ = ROW;
+	        } else if ( strncmp(equed, "C", 1)==0 ) {
+		    ScalePermstruct->DiagScale = COL;
+		    colequ = COL;
+	        } else if ( strncmp(equed, "B", 1)==0 ) {
+		    ScalePermstruct->DiagScale = BOTH;
+		    rowequ = ROW;
+		    colequ = COL;
+	        } else ScalePermstruct->DiagScale = NOEQUIL;
+            }
+
+#if ( PRNTlevel>=1 )
+	    if ( !iam ) {
+		printf(".. equilibrated? *equed = %c\n", *equed);
+		/*fflush(stdout);*/
+	    }
+#endif
+	} /* if Fact ... */
+
+	stat->utime[EQUIL] = SuperLU_timer_() - t;
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC(iam, "Exit equil");
+#endif
+    } /* end if Equil ... */
+    
+    /* ------------------------------------------------------------
+       Permute rows of A. 
+       ------------------------------------------------------------*/
+    if ( options->RowPerm != NO ) {
+	t = SuperLU_timer_();
+
+	if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */
+	    || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */
+	    for (j = 0; j < n; ++j) {
+		for (i = colptr[j]; i < colptr[j+1]; ++i) {
+		    irow = rowind[i];
+		    rowind[i] = perm_r[irow];
+		}
+	    }
+	} else if ( !factored ) {
+	    if ( job == 5 ) {
+		/* Allocate storage for scaling factors. */
+		if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) 
+		    ABORT("SUPERLU_MALLOC fails for R1[]");
+		if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) )
+		    ABORT("SUPERLU_MALLOC fails for C1[]");
+	    }
+
+	    if ( !iam ) {
+		/* Process 0 finds a row permutation for large diagonal. */
+		iinfo = zldperm_dist(job, m, nnz, colptr, rowind, a,
+                                perm_r, R1, C1);
+
+                MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );		
+		if ( iinfo == 0 ) {
+		    MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
+		    if ( job == 5 && Equil ) {
+		       MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
+		       MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
+		   }
+		}
+	    } else {
+		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
+		if ( iinfo == 0 ) {
+		   MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
+		   if ( job == 5 && Equil ) {
+		      MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
+		      MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
+		   }
+		}
+	    }
+
+	    if ( iinfo && job == 5) {
+	        SUPERLU_FREE(R1);
+	        SUPERLU_FREE(C1);
+   	    }
+
+#if ( PRNTlevel>=2 )
+	    dmin = dmach_dist("Overflow");
+	    dsum = 0.0;
+	    dprod = 1.0;
+#endif
+	    if ( iinfo == 0 ) {
+	      if ( job == 5 ) {
+		if ( Equil ) {
+		    for (i = 0; i < n; ++i) {
+			R1[i] = exp(R1[i]);
+			C1[i] = exp(C1[i]);
+		    }
+		    for (j = 0; j < n; ++j) {
+			for (i = colptr[j]; i < colptr[j+1]; ++i) {
+			    irow = rowind[i];
+			    zd_mult(&a[i], &a[i], R1[irow]); /* Scale rows. */
+			    zd_mult(&a[i], &a[i], C1[j]); /* Scale columns. */
+			    rowind[i] = perm_r[irow];
+#if ( PRNTlevel>=2 )
+			    if ( rowind[i] == j ) /* New diagonal */
+				dprod *= slud_z_abs1(&a[i]);
+#endif
+			}
+		    }
+
+		    /* Multiply together the scaling factors. */
+		    if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
+		    else for (i = 0; i < m; ++i) R[i] = R1[i];
+		    if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
+		    else for (i = 0; i < n; ++i) C[i] = C1[i];
+		    
+		    ScalePermstruct->DiagScale = BOTH;
+		    rowequ = colequ = 1;
+		} else { /* No equilibration. */
+		    for (j = 0; j < n; ++j) {
+			for (i = colptr[j]; i < colptr[j+1]; ++i) {
+			    irow = rowind[i];
+			    rowind[i] = perm_r[irow];
+			}
+		    }
+		}
+		SUPERLU_FREE (R1);
+		SUPERLU_FREE (C1);
+	      } else { /* job = 2,3,4 */
+		for (j = 0; j < n; ++j) {
+		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
+			irow = rowind[i];
+			rowind[i] = perm_r[irow];
+#if ( PRNTlevel>=2 )
+			if ( rowind[i] == j ) { /* New diagonal */
+			    if ( job == 2 || job == 3 )
+				dmin = SUPERLU_MIN(dmin, slud_z_abs1(&a[i]));
+			    else if ( job == 4 )
+				dsum += slud_z_abs1(&a[i]);
+			    else if ( job == 5 )
+				dprod *= slud_z_abs1(&a[i]);
+			}
+#endif
+		    } /* end for i ... */
+		} /* end for j ... */
+              } /* end else */
+            } else { /* if iinfo != 0 */
+		for (i = 0; i < m; ++i) perm_r[i] = i;
+	    }
+
+#if ( PRNTlevel>=2 )
+	    if ( job == 2 || job == 3 ) {
+		if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
+	    } else if ( job == 4 ) {
+		if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
+	    } else if ( job == 5 ) {
+		if ( !iam ) printf("\t product of diagonal %e\n", dprod);
+	    }
+#endif
+	    
+        } /* else !factored */
+
+	t = SuperLU_timer_() - t;
+	stat->utime[ROWPERM] = t;
+    
+    } else { /* options->RowPerm == NO: use the identity row permutation */
+        for (i = 0; i < m; ++i) perm_r[i] = i;
+    }
+
+    if ( !factored || options->IterRefine ) {
+	/* Compute norm(A), which will be used to adjust small diagonal. */
+	if ( notran ) *(unsigned char *)norm = '1';
+	else *(unsigned char *)norm = 'I';
+	anorm = zlangs_dist(norm, A);
+    }
+
+    /* ------------------------------------------------------------
+       Perform the LU factorization.
+       ------------------------------------------------------------*/
+    if ( !factored ) {
+	t = SuperLU_timer_();
+	/*
+	 * Get column permutation vector perm_c[], according to permc_spec:
+	 *   permc_spec = NATURAL:  natural ordering 
+	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
+	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
+	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
+	 */
+	permc_spec = options->ColPerm;
+	if ( permc_spec != MY_PERMC && Fact == DOFACT )
+	    /* Use an ordering provided by SuperLU */
+	    get_perm_c_dist(iam, permc_spec, A, perm_c);
+
+	/* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
+	   (a.k.a. column etree), depending on the choice of ColPerm.
+	   Adjust perm_c[] to be consistent with a postorder of etree.
+	   Permute columns of A to form A*Pc'. */
+	sp_colorder(options, A, perm_c, etree, &AC);
+
+	/* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */
+	ACstore = AC.Store;
+	for (j = 0; j < n; ++j) 
+	    for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) {
+		irow = ACstore->rowind[i];
+		ACstore->rowind[i] = perm_c[irow];
+	    }
+	stat->utime[COLPERM] = SuperLU_timer_() - t;
+
+	/* Perform a symbolic factorization on matrix A and set up the
+	   nonzero data structures which are suitable for supernodal GENP. */
+	if ( Fact != SamePattern_SameRowPerm ) {
+#if ( PRNTlevel>=1 ) 
+	    if ( !iam ) 
+		printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
+		       sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
+#endif
+	    t = SuperLU_timer_();
+	    if ( !(Glu_freeable = (Glu_freeable_t *)
+		   SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
+		ABORT("Malloc fails for Glu_freeable.");
+
+	    iinfo = symbfact(options, iam, &AC, perm_c, etree, 
+			     Glu_persist, Glu_freeable);
+
+	    stat->utime[SYMBFAC] = SuperLU_timer_() - t;
+
+	    if ( iinfo <= 0 ) {
+		QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
+#if ( PRNTlevel>=1 ) 
+		if ( !iam ) {
+		    printf("\tNo of supers %lld\n", (long long)Glu_persist->supno[n-1]+1);
+		    printf("\tSize of G(L) %lld\n", (long long)Glu_freeable->xlsub[n]);
+		    printf("\tSize of G(U) %lld\n", (long long)Glu_freeable->xusub[n]);
+		    printf("\tint %d, short %d, float %d, double %d\n", 
+			   (int) sizeof(int_t), (int) sizeof(short), 
+ 			   (int) sizeof(float), (int) sizeof(double));
+		    printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n",
+			   symb_mem_usage.for_lu*1e-6, 
+			   symb_mem_usage.total*1e-6,
+			   symb_mem_usage.expansions);
+		}
+#endif
+	    } else { /* symbfact out of memory */
+#if ( PRNTlevel>=1 )
+		if ( !iam )
+		    fprintf(stderr, "symbfact() error returns " IFMT "\n", iinfo);
+#endif
+                *info = iinfo;  /* NOTE(review): returns without releasing AC or Glu_freeable */
+                return;
+	    }
+	}
+
+	/* Distribute the L and U factors onto the process grid. */
+	t = SuperLU_timer_();
+	dist_mem_use = zdistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid);
+	stat->utime[DIST] = SuperLU_timer_() - t;
+
+	/* Deallocate storage used in symbolic factor. */
+	if ( Fact != SamePattern_SameRowPerm ) {
+	    iinfo = symbfact_SubFree(Glu_freeable);
+	    SUPERLU_FREE(Glu_freeable);
+	}
+
+	/* Perform numerical factorization in parallel. */
+	t = SuperLU_timer_();
+	pzgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
+	stat->utime[FACT] = SuperLU_timer_() - t;
+
+#if ( PRNTlevel>=1 )
+	{
+	    int_t TinyPivots;
+	    float for_lu, total, max, avg, temp;
+	    zQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
+	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
+		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	    MPI_Reduce( &num_mem_usage.total, &total,
+		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	    temp = SUPERLU_MAX(symb_mem_usage.total,
+			       symb_mem_usage.for_lu +
+			       (float)dist_mem_use + num_mem_usage.for_lu);
+	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
+	    MPI_Reduce( &temp, &max,
+		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+	    MPI_Reduce( &temp, &avg,
+		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
+			  MPI_SUM, grid->comm );
+	    stat->TinyPivots = TinyPivots;
+	    if ( !iam ) {
+		printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
+		       for_lu*1e-6, total*1e-6);
+		printf("\tAll space (MB):"
+		       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
+		       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
+		printf("\tNumber of tiny pivots: %10lld\n", (long long) stat->TinyPivots);
+		printf(".. pzgstrf INFO = %d\n", *info);
+	    }
+	}
+#endif
+    
+    } else if ( options->IterRefine ) { /* options->Fact==FACTORED */
+	/* Permute columns of A to form A*Pc' using the existing perm_c.
+	 * NOTE: rows of A were previously permuted to Pc*A.
+	 */
+	sp_colorder(options, A, perm_c, NULL, &AC);
+    } /* if !factored ... */
+	
+    /* ------------------------------------------------------------
+       Compute the solution matrix X.
+       ------------------------------------------------------------*/
+    if ( nrhs && *info == 0 ) {
+
+	if ( !(b_work = doublecomplexMalloc_dist(n)) )
+	    ABORT("Malloc fails for b_work[]");
+
+	/* ------------------------------------------------------------
+	   Scale the right-hand side if equilibration was performed. 
+	   ------------------------------------------------------------*/
+	if ( notran ) {
+	    if ( rowequ ) {
+		b_col = B;
+		for (j = 0; j < nrhs; ++j) {
+		    for (i = 0; i < m; ++i) zd_mult(&b_col[i], &b_col[i], R[i]);
+		    b_col += ldb;
+		}
+	    }
+	} else if ( colequ ) {
+	    b_col = B;
+	    for (j = 0; j < nrhs; ++j) {
+		for (i = 0; i < m; ++i) zd_mult(&b_col[i], &b_col[i], C[i]);
+		b_col += ldb;
+	    }
+	}
+
+	/* ------------------------------------------------------------
+	   Permute the right-hand side to form Pr*B.
+	   ------------------------------------------------------------*/
+	if ( options->RowPerm != NO ) {
+	    if ( notran ) {
+		b_col = B;
+		for (j = 0; j < nrhs; ++j) {
+		    for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i];
+		    for (i = 0; i < m; ++i) b_col[i] = b_work[i];
+		    b_col += ldb;
+		}
+	    }
+	}
+
+
+	/* ------------------------------------------------------------
+	   Permute the right-hand side to form Pc*B.
+	   ------------------------------------------------------------*/
+	if ( notran ) {
+	    b_col = B;
+	    for (j = 0; j < nrhs; ++j) {
+		for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i];
+		for (i = 0; i < m; ++i) b_col[i] = b_work[i];
+		b_col += ldb;
+	    }
+	}
+
+	/* Save a copy of the right-hand side. */
+	ldx = ldb;
+	if ( !(X = doublecomplexMalloc_dist(((size_t)ldx) * nrhs)) )
+	    ABORT("Malloc fails for X[]");
+	x_col = X;  b_col = B;
+	for (j = 0; j < nrhs; ++j) {
+	    for (i = 0; i < ldb; ++i) x_col[i] = b_col[i];
+	    x_col += ldx;  b_col += ldb;
+	}
+
+	/* ------------------------------------------------------------
+	   Solve the linear system.
+	   ------------------------------------------------------------*/
+	pzgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info);
+
+	/* ------------------------------------------------------------
+	   Use iterative refinement to improve the computed solution and
+	   compute error bounds and backward error estimates for it.
+	   ------------------------------------------------------------*/
+	if ( options->IterRefine ) {
+	    /* Improve the solution by iterative refinement. */
+	    t = SuperLU_timer_();
+	    pzgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb,
+			      X, ldx, nrhs, berr, stat, info);
+	    stat->utime[REFINE] = SuperLU_timer_() - t;
+	}
+
+	/* Permute the solution matrix X <= Pc'*X. */
+	for (j = 0; j < nrhs; j++) {
+	    b_col = &B[j*ldb];
+	    x_col = &X[j*ldx];
+	    for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]];
+	}
+	
+	/* Transform the solution matrix X to a solution of the original system
+	   before the equilibration. */
+	if ( notran ) {
+	    if ( colequ ) {
+		b_col = B;
+		for (j = 0; j < nrhs; ++j) {
+		    for (i = 0; i < n; ++i) zd_mult(&b_col[i], &b_col[i], C[i]);
+		    b_col += ldb;
+		}
+	    }
+	} else if ( rowequ ) {
+	    b_col = B;
+	    for (j = 0; j < nrhs; ++j) {
+		for (i = 0; i < n; ++i) zd_mult(&b_col[i], &b_col[i], R[i]);
+		b_col += ldb;
+	    }
+	}
+
+	SUPERLU_FREE(b_work);
+	SUPERLU_FREE(X);
+
+    } /* end if nrhs != 0 */
+
+#if ( PRNTlevel>=1 )
+    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
+#endif
+
+    /* Deallocate R and/or C if it is not used. */
+    if ( Equil && Fact != SamePattern_SameRowPerm ) {
+	switch ( ScalePermstruct->DiagScale ) {
+	    case NOEQUIL:
+	        SUPERLU_FREE(R);
+		SUPERLU_FREE(C);
+		break;
+	    case ROW: 
+		SUPERLU_FREE(C);
+		break;
+	    case COL: 
+		SUPERLU_FREE(R);
+		break;
+	}
+    }
+    if ( !factored || (factored && options->IterRefine) )
+	Destroy_CompCol_Permuted_dist(&AC);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pzgssvx_ABglobal()");
+#endif
+}
+
diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c
new file mode 100644
index 0000000..61c3aa4
--- /dev/null
+++ b/SRC/pzgstrf.c
@@ -0,0 +1,1820 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Performs LU factorization in parallel
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *     September 1, 1999
+ *     February 7, 2001  use MPI_Isend/MPI_Irecv
+ *     October 15, 2008  latency-reducing panel factorization
+ *     July    12, 2011  static scheduling and arbitrary look-ahead
+ *     March   13, 2013  change NTAGS to MPI_TAG_UB value
+ *     September 24, 2015 replace xLAMCH by xMACH, using C99 standard.
+ *     December 31, 2015 rename xMACH to xMACH_DIST
+ *
+ * Sketch of the algorithm 
+ *
+ * ======================= 
+ *    
+ * The following relations hold:
+ *     * A_kk = L_kk * U_kk
+ *     * L_ik = Aik * U_kk^(-1)
+ *     * U_kj = L_kk^(-1) * A_kj
+ *
+ *              ----------------------------------
+ *              |   |                            |
+ *              ----|-----------------------------
+ *              |   | \ U_kk|                    |
+ *              |   |   \   |        U_kj        |
+ *              |   |L_kk \ |         ||         |
+ *              ----|-------|---------||----------
+ *              |   |       |         \/         |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   | L_ik ==>       A_ij        |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              ----------------------------------
+ *
+ * Handle the first block of columns separately.
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity. ( pzgstrf2(0), one column at a time )
+ *     * Compute block row of U
+ *     * Update trailing matrix
+ *
+ * Loop over the remaining blocks of columns.
+ *   mycol = MYCOL( iam, grid );
+ *   myrow = MYROW( iam, grid );
+ *   N = nsupers;
+ *   For (k = 1; k < N; ++k) {
+ *       krow = PROW( k, grid );
+ *       kcol = PCOL( k, grid );
+ *       Pkk = PNUM( krow, kcol, grid );
+ *
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity.
+ *       if ( mycol == kcol ) {
+ *           pzgstrf2(k), one column at a time
+ *       }
+ *
+ *     * Parallel triangular solve
+ *       if ( iam == Pkk ) multicast L_k,k to this process row;
+ *       if ( myrow == krow && mycol != kcol ) {
+ *          Recv L_k,k from process Pkk;
+ *          for (j = k+1; j < N; ++j)
+ *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
+ *                 U_k,j = L_k,k \ A_k,j;
+ *       }
+ *
+ *     * Parallel rank-k update
+ *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
+ *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
+ *       if ( myrow != krow ) {
+ *          Pkj = PNUM( krow, mycol, grid );
+ *          Recv U_k,k+1:N from process Pkj;
+ *       }
+ *       if ( mycol != kcol ) {
+ *          Pik = PNUM( myrow, kcol, grid );
+ *          Recv L_k+1:N,k from process Pik;
+ *       }
+ *       for (j = k+1; j < N; ++j) {
+ *          for (i = k+1; i < N; ++i)
+ *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+ *                   && L_i,k != 0 && U_k,j != 0 )
+ *                 A_i,j = A_i,j - L_i,k * U_k,j;
+ *       }
+ *  }
+ *
+ * </pre>
+ */
+
+#include <math.h>
+/*#include "mkl.h"*/
+#include "superlu_zdefs.h"
+
+#ifdef GPU_ACC
+#include "cublas_utils.h"
+/*#include "cublas_zgemm.h"*/
+// #define NUM_CUDA_STREAMS 16
+// #define NUM_CUDA_STREAMS 16
+#endif 
+
+/* Various definitions */
+/*
+    Name    : SUPERNODE_PROFILE
+    Purpose : For supernode-level profiling of various measurements such as the
+    gigaflop/sec obtained and the bandwidth achieved.
+    Overhead : Low
+*/
+// #define SUPERNODE_PROFILE   
+
+/*
+    Name    : BASELINE
+    Purpose : Baseline to compare performance against.
+    Overhead : NA : this won't be used for running experiments
+*/
+// #define BASELINE
+
+/*
+    Name    : PHI_FRAMEWORK
+    Purpose : To simulate and test the algorithm used for offloading to Phi.
+    Overhead : NA : this won't be used for running experiments
+*/
+#define PHI_FRAMEWORK
+
+#define PZGSTRF2 pzgstrf2_trsm /* the PZGSTRF2 prototype below therefore declares pzgstrf2_trsm */
+#define PZGSTRS2 pzgstrs2_omp /* the PZGSTRS2 prototypes below therefore declare pzgstrs2_omp */
+/* Prototypes of the two routines selected by the macros above; both are defined elsewhere. */
+extern void PZGSTRF2 (superlu_dist_options_t *, int_t, int_t, double,
+                        Glu_persist_t *, gridinfo_t *, LocalLU_t *,
+                        MPI_Request *, int, SuperLUStat_t *, int *);
+#ifdef _CRAY /* Cray build: the prototype takes three additional _fcd arguments */
+extern void PZGSTRS2 (int_t, int_t, Glu_persist_t *, gridinfo_t *,
+                      LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);
+#else
+extern void PZGSTRS2 (int_t, int_t, Glu_persist_t *, gridinfo_t *,
+                      LocalLU_t *, SuperLUStat_t *);
+#endif
+
+#ifdef ISORT
+extern void isort (int_t N, int_t * ARRAY1, int_t * ARRAY2);
+extern void isort1 (int_t N, int_t * ARRAY);
+
+#else
+
+int /* qsort()-style comparator: orders records ascending by their leading int_t key */
+superlu_sort_perm (const void *arg1, const void *arg2)
+{
+    const int_t *val1 = (const int_t *) arg1; /* only the first int_t of each record is compared */
+    const int_t *val2 = (const int_t *) arg2;
+    return (*val1 > *val2) - (*val1 < *val2); /* <0, 0, >0 as qsort requires; the old 0/1 result never reported "less than" */
+}
+#endif
+
+
+/************************************************************************/
+
+#include "zscatter.c"
+
+/************************************************************************/
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PZGSTRF performs the LU factorization in parallel.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *         xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) LocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_zdefs.h for the definition of 'LocalLU_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * </pre>
+ */
+int_t
+pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
+       LUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat, int *info)
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd ("N", strlen ("N"));
+    _fcd ftcs1 = _cptofcd ("L", strlen ("L"));
+    _fcd ftcs2 = _cptofcd ("N", strlen ("N"));
+    _fcd ftcs3 = _cptofcd ("U", strlen ("U"));
+#endif
+    doublecomplex zero = {0.0, 0.0};
+    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
+    int_t *xsup;
+    int_t *lsub, *lsub1, *usub, *Usub_buf;
+    int_t **Lsub_buf_2, **Usub_buf_2;
+    doublecomplex **Lval_buf_2, **Uval_buf_2;          /* pointers to starts of bufs */
+    doublecomplex *lusup, *lusup1, *uval, *Uval_buf;   /* pointer to current buf     */
+    int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
+        lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
+        nlb, nub, nsupc, rel, rukp, il, iu;
+    int_t Pc, Pr;
+    int iam, kcol, krow, yourcol, mycol, myrow, pi, pj;
+    int j, k, lk, nsupers;  /* k - current panel to work on */
+    int k0;        /* counter of the next supernode to be factored */
+    int kk, kk0, kk1, kk2, jj0; /* panels in the look-ahead window */
+    int iukp0, rukp0, flag0, flag1;
+    int nsupr, nbrow, segsize;
+    int msg0, msg2;
+    int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr;
+    doublecomplex **Unzval_br_ptr, **Lnzval_bc_ptr;
+    int_t *index;
+    doublecomplex *nzval;
+    int_t *iuip, *ruip; /* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
+    doublecomplex *ucol;
+    int *indirect, *indirect2;
+    doublecomplex *tempv, *tempv2d;
+    int iinfo;
+    int *ToRecv, *ToSendD, **ToSendR;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    superlu_scope_t *scp;
+    float s_eps;
+    double thresh;
+    doublecomplex *tempU2d, *tempu;
+    int full, ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks;
+    int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows,
+        *Lblock, *Lrows, *perm_u, *sf_block, *sf_block_l, *nnodes_l,
+        *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno;
+    float edag_supno_l_bytes;
+#ifdef ISORT
+    int_t *iperm_u;
+#endif
+    int *msgcnt;   /* Count the size of the message xfer'd in each buffer:
+		    *     0 : transferred in Lsub_buf[]
+		    *     1 : transferred in Lval_buf[]
+		    *     2 : transferred in Usub_buf[]
+		    *     3 : transferred in Uval_buf[]
+		    */
+    int **msgcnts, **msgcntsU; /* counts for each panel in the
+                                  look-ahead window */
+    int *factored;  /* factored[j]==0 : L col panel j is factorized */
+    int *factoredU; /* factoredU[i]==1 : U row panel i is factorized */
+    int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows;
+    etree_node *head, *tail, *ptr;
+    int *num_child;
+    int num_look_aheads, look_id, *look_ahead;
+    int_t *perm_c_supno, *iperm_c_supno;
+    MPI_Request *recv_req, **recv_reqs, **send_reqs, **send_reqs_u,
+        **recv_reqs_u;
+    MPI_Request *send_req, *U_diag_blk_send_req = NULL;
+    MPI_Status status;
+    void *attr_val;
+    int flag;
+
+    int iword = sizeof (int_t);
+    int dword = sizeof (doublecomplex);
+
+    /* For measuring load imbalance in omp threads */
+    double omp_load_imblc = 0.0;
+    double *omp_loop_time;
+
+    double CPUOffloadTimer      = 0;
+    double CPUOffloadFlop       = 0;
+    double CPUOffloadMop        = 0;
+    double schur_flop_timer     = 0.0;
+    double pdgstrf2_timer       = 0.0;
+    double pdgstrs2_timer       = 0.0;
+    double lookaheadupdatetimer = 0.0;
+    double InitTimer            = 0.0; /* including compute schedule, malloc */
+    double tt_start, tt_end;
+
+#if !defined( GPU_ACC )
+    /* Counters for counting memory operations */
+    double scatter_mem_op_counter  = 0.0;
+    double scatter_mem_op_timer    = 0.0;
+    double scatterL_mem_op_counter = 0.0;
+    double scatterL_mem_op_timer   = 0.0;
+    double scatterU_mem_op_counter = 0.0;
+    double scatterU_mem_op_timer   = 0.0;
+
+    double GatherLTimer            = 0.0;
+    double LookAheadRowSepMOP      = 0.0;
+    double GatherUTimer             = 0.0;
+    double GatherMOP               = 0.0;
+    double LookAheadGEMMTimer      = 0.0;
+    double LookAheadGEMMFlOp       = 0.0;
+    double LookAheadScatterTimer   = 0.0;
+    double LookAheadScatterMOP     = 0.0;
+    double RemainGEMMTimer         = 0.0;
+    double RemainScatterTimer      = 0.0;
+    double NetSchurUpTimer         = 0.0;
+    double schur_flop_counter      = 0.0;
+#endif
+
+#if ( PRNTlevel>= 1)
+    /* count GEMM max dimensions */
+    int gemm_max_m = 0, gemm_max_n = 0, gemm_max_k = 0;
+#endif
+
+#if ( DEBUGlevel>=2 )
+    int_t num_copy = 0, num_update = 0;
+#endif
+#if ( PRNTlevel==3 )
+    int zero_msg = 0, total_msg = 0;
+#endif
+#if ( PROFlevel>=1 )
+    double t1, t2;
+    float msg_vol = 0, msg_cnt = 0;
+#endif
+
+    /* Test the input parameters. */
+    *info = 0;
+    if (m < 0)
+        *info = -2;
+    else if (n < 0)
+        *info = -3;
+    if (*info) {
+        pxerr_dist ("pzgstrf", grid, -*info);
+        return (-1);
+    }
+
+    /* Quick return if possible. */
+    if (m == 0 || n == 0) return 0;
+ 
+    /* 
+     * Initialization.  
+     */
+    iam = grid->iam;
+    Pc = grid->npcol; 
+    Pr = grid->nprow;
+    myrow = MYROW (iam, grid);
+    mycol = MYCOL (iam, grid);
+    nsupers = Glu_persist->supno[n - 1] + 1;
+    xsup = Glu_persist->xsup;
+    s_eps = smach_dist("Epsilon");
+    thresh = s_eps * anorm;
+
+    MPI_Attr_get (MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag);
+    if (!flag) {
+        fprintf (stderr, "Could not get TAG_UB\n");
+        return (-1);
+    }
+    int tag_ub = *(int *) attr_val;
+
+#if ( PRNTlevel>=1 )
+    if (!iam)
+        printf ("MPI tag upper bound = %d\n", tag_ub);
+#endif
+
+#if ( DEBUGlevel>=1 )
+    if (s_eps == 0.0)
+        printf (" ***** warning s_eps = %e *****\n", s_eps);
+    CHECK_MALLOC (iam, "Enter pdgstrf()");
+#endif
+
+    stat->ops[FACT]      = 0.0;
+    stat->current_buffer = 0.0;
+    stat->peak_buffer    = 0.0;
+    stat->gpu_buffer     = 0.0;
+
+    /* make sure the range of look-ahead window [0, MAX_LOOKAHEADS-1] */
+    num_look_aheads = SUPERLU_MAX(0, SUPERLU_MIN(options->num_lookaheads, MAX_LOOKAHEADS - 1));
+
+    if (Pr * Pc > 1) {
+        if (!(U_diag_blk_send_req =
+              (MPI_Request *) SUPERLU_MALLOC (Pr * sizeof (MPI_Request))))
+            ABORT ("Malloc fails for U_diag_blk_send_req[].");
+	/* flag no outstanding Isend */
+        U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL; /* used 0 before */
+
+        /* allocating buffers for look-ahead */
+        i = Llu->bufmax[0];
+        if (i != 0) {
+            if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * ((size_t) i))) )
+                ABORT ("Malloc fails for Lsub_buf.");
+            for (jj = 0; jj < num_look_aheads; jj++)
+                Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i;
+        }
+        i = Llu->bufmax[1];
+        if (i != 0) {
+            if (!(Llu->Lval_buf_2[0] = doublecomplexMalloc_dist ((num_look_aheads + 1) * ((size_t) i))))
+                ABORT ("Malloc fails for Lval_buf[].");
+            for (jj = 0; jj < num_look_aheads; jj++)
+                Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i;
+        }
+        i = Llu->bufmax[2];
+        if (i != 0) {
+            if (!(Llu->Usub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * i)))
+                ABORT ("Malloc fails for Usub_buf_2[].");
+            for (jj = 0; jj < num_look_aheads; jj++)
+                Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i;
+        }
+        i = Llu->bufmax[3];
+        if (i != 0) {
+            if (!(Llu->Uval_buf_2[0] = doublecomplexMalloc_dist ((num_look_aheads + 1) * i)))
+                ABORT ("Malloc fails for Uval_buf_2[].");
+            for (jj = 0; jj < num_look_aheads; jj++)
+                Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i;
+        }
+    }
+
+    log_memory( (Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) 
+		* iword +
+		(Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) 
+		* dword, stat );
+
+    /* creating pointers to the look-ahead buffers */
+    if (! (Lsub_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int_t *))))
+        ABORT ("Malloc fails for Lsub_buf_2[].");
+    if (! (Lval_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (doublecomplex *))))
+        ABORT ("Malloc fails for Lval_buf_2[].");
+    if (! (Usub_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int_t *))))
+        ABORT ("Malloc fails for Uval_buf_2[].");
+    if (! (Uval_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (doublecomplex *))))
+        ABORT ("Malloc fails for buf_2[].");
+    for (i = 0; i <= num_look_aheads; i++) {
+        Lval_buf_2[i] = Llu->Lval_buf_2[i];
+        Lsub_buf_2[i] = Llu->Lsub_buf_2[i];
+        Uval_buf_2[i] = Llu->Uval_buf_2[i];
+        Usub_buf_2[i] = Llu->Usub_buf_2[i];
+    }
+
+    if (!(msgcnts = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int *))))
+        ABORT ("Malloc fails for msgcnts[].");
+    if (!(msgcntsU = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int *))))
+        ABORT ("Malloc fails for msgcntsU[].");
+    for (i = 0; i <= num_look_aheads; i++) {
+        if (!(msgcnts[i] = SUPERLU_MALLOC (4 * sizeof (int))))
+            ABORT ("Malloc fails for msgcnts[].");
+        if (!(msgcntsU[i] = SUPERLU_MALLOC (4 * sizeof (int))))
+            ABORT ("Malloc fails for msgcntsU[].");
+    }
+
+    if (! (recv_reqs_u = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *))))
+        ABORT ("Malloc fails for recv_reqs_u[].");
+    if (! (send_reqs_u = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *))))
+        ABORT ("Malloc fails for send_reqs_u[].");
+    if (! (send_reqs = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *))))
+        ABORT ("Malloc fails for send_reqs_u[].");
+    if (! (recv_reqs = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *))))
+        ABORT ("Malloc fails for recv_reqs[].");
+    for (i = 0; i <= num_look_aheads; i++) {
+        if (!(recv_reqs_u[i] = (MPI_Request *) SUPERLU_MALLOC (2 * sizeof (MPI_Request))))
+            ABORT ("Malloc fails for recv_req_u[i].");
+        if (!(send_reqs_u[i] = (MPI_Request *) SUPERLU_MALLOC (2 * Pr * sizeof (MPI_Request))))
+            ABORT ("Malloc fails for send_req_u[i].");
+        if (!(send_reqs[i] = (MPI_Request *) SUPERLU_MALLOC (2 * Pc * sizeof (MPI_Request))))
+            ABORT ("Malloc fails for send_reqs[i].");
+        if (!(recv_reqs[i] = (MPI_Request *) SUPERLU_MALLOC (4 * sizeof (MPI_Request))))
+            ABORT ("Malloc fails for recv_req[].");
+        send_reqs[i][0] = send_reqs[i][1] = MPI_REQUEST_NULL;
+        recv_reqs[i][0] = recv_reqs[i][1] = MPI_REQUEST_NULL;
+    }
+
+    if (!(factored = SUPERLU_MALLOC (nsupers * sizeof (int_t))))
+        ABORT ("Malloc fails for factored[].");
+    if (!(factoredU = SUPERLU_MALLOC (nsupers * sizeof (int_t))))
+        ABORT ("Malloc fails for factoredU[].");
+    for (i = 0; i < nsupers; i++) factored[i] = factoredU[i] = -1;
+    log_memory(2 * nsupers * iword, stat);
+
+    int num_threads = 1;
+#ifdef _OPENMP
+#pragma omp parallel default(shared)
+    {
+        if (omp_get_thread_num () == 0) {
+            num_threads = omp_get_num_threads ();
+        }
+    }
+#endif
+
+#if 0
+    omp_loop_time = (double *) _mm_malloc (sizeof (double) * num_threads,64);
+#else
+    omp_loop_time = (double *) doubleMalloc_dist(num_threads);
+#endif
+
+#if ( PRNTlevel>=1 )
+    if(!iam) printf(".. Starting with %d OpenMP threads \n", num_threads );
+#endif
+    double tt1 = SuperLU_timer_ ();
+
+    nblocks = 0;
+    ncb = nsupers / Pc; /* number of column blocks, horizontal */
+    nrb = nsupers / Pr; /* number of row blocks, vertical  */
+
+    /* in order to have dynamic scheduling */
+    int *full_u_cols;
+    int *blk_ldu;
+#if 0
+    full_u_cols = (int_t *) _mm_malloc (sizeof (int_t) * ncb,64);
+    blk_ldu = (int_t *) _mm_malloc (sizeof (int_t) * ncb,64);
+#else
+    full_u_cols = SUPERLU_MALLOC(ncb * sizeof(int));
+    blk_ldu = SUPERLU_MALLOC(ncb * sizeof(int));
+#endif
+    log_memory(2 * ncb * iword, stat);
+
+
+    /* insert a check condition here */
+
+#if 0  /* Sherry: not used? */
+    /* This bunch is used for static scheduling */
+    pair *full_col_count = (pair *) _mm_malloc (sizeof (pair) * ncb,64);
+    int_t *count_cols, *sum_cols, *partition;
+    count_cols = (int_t *) _mm_malloc (sizeof (int_t) * num_threads,64);
+    sum_cols = (int_t *) _mm_malloc (sizeof (int_t) * num_threads,64);
+    partition = (int_t *) _mm_malloc (sizeof (int_t) * num_threads * ncb,64);
+    int_t ldp = ncb;
+#endif
+
+    /* ##################################################################
+     *  Compute a good static schedule based on the factorization task graph.
+     * ################################################################## */
+    perm_c_supno = SUPERLU_MALLOC (2 * nsupers * sizeof (int_t));
+    iperm_c_supno = perm_c_supno + nsupers;
+
+    static_schedule(options, m, n, LUstruct, grid, stat,
+		    perm_c_supno, iperm_c_supno, info);
+
+#if ( DEBUGlevel >= 2 )
+    PrintInt10("schedule:perm_c_supno", nsupers, perm_c_supno);
+    
+    /* Turn off static schedule */
+    printf("[%d] .. Turn off static schedule for debugging ..\n", iam);
+    for (i = 0; i < nsupers; ++i) perm_c_supno[i] = iperm_c_supno[i] = i;
+#endif
+     /* ################################################################## */
+
+    /* constructing look-ahead table to indicate the last dependency */
+    int *look_ahead_l; /* Sherry: add comment on look_ahead_l[] */
+    stat->num_look_aheads = num_look_aheads;
+
+    look_ahead_l = SUPERLU_MALLOC (nsupers * sizeof (int));
+    look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int));
+    for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1;
+    log_memory(3 * nsupers * iword, stat);
+
+    /* go through U-factor */
+    for (lb = 0; lb < nrb; ++lb) {
+        ib = lb * Pr + myrow;
+        index = Llu->Ufstnz_br_ptr[lb];
+        if (index) { /* Not an empty row */
+            k = BR_HEADER;
+            for (j = 0; j < index[0]; ++j) {
+                jb = index[k]; /* global block number */
+                if (jb != ib)
+                    look_ahead_l[jb] =
+                        SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]);
+                k += UB_DESCRIPTOR + SuperSize (index[k]);
+            }
+        }
+    }
+    if (myrow < nsupers % grid->nprow) {
+        ib = nrb * Pr + myrow;
+        index = Llu->Ufstnz_br_ptr[nrb];
+        if (index) {             /* Not an empty row */
+            k = BR_HEADER;
+            for (j = 0; j < index[0]; ++j) {
+                jb = index[k];
+                if (jb != ib)
+                    look_ahead_l[jb] =
+                        SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]);
+                k += UB_DESCRIPTOR + SuperSize (index[k]);
+            }
+        }
+    }
+
+    if (options->SymPattern == NO) {
+        /* go through L-factor */
+        for (lb = 0; lb < ncb; lb++) {
+            ib = lb * Pc + mycol;
+            index = Llu->Lrowind_bc_ptr[lb];
+            if (index) {
+                k = BC_HEADER;
+                for (j = 0; j < index[0]; j++) {
+                    jb = index[k];
+                    if (jb != ib)
+                        look_ahead_l[jb] =
+                            SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]);
+                    k += LB_DESCRIPTOR + index[k + 1];
+                }
+            }
+        }
+        if (mycol < nsupers % grid->npcol) {
+            ib = ncb * Pc + mycol;
+            index = Llu->Lrowind_bc_ptr[ncb];
+            if (index) {
+                k = BC_HEADER;
+                for (j = 0; j < index[0]; j++) {
+                    jb = index[k];
+                    if (jb != ib)
+                        look_ahead_l[jb] =
+                            SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]);
+                    k += LB_DESCRIPTOR + index[k + 1];
+                }
+            }
+        }
+    }
+    MPI_Allreduce (look_ahead_l, look_ahead, nsupers, MPI_INT, MPI_MAX, grid->comm);
+    SUPERLU_FREE (look_ahead_l);
+
+#ifdef ISORT
+    iperm_u = SUPERLU_MALLOC (nsupers * sizeof (int_t));
+    perm_u = SUPERLU_MALLOC (nsupers * sizeof (int_t));
+#else
+    perm_u = SUPERLU_MALLOC (2 * nsupers * sizeof (int_t));
+#endif
+    log_memory(nsupers * iword, stat);
+
+    k = sp_ienv_dist (3);       /* max supernode size */
+#if 0
+    if ( !(Llu->ujrow = doubleMalloc_dist(k*(k+1)/2)) )
+         ABORT("Malloc fails for ujrow[].");
+#else
+    /* Instead of half storage, we'll do full storage */
+    if (!(Llu->ujrow = doublecomplexCalloc_dist (k * k)))
+        ABORT ("Malloc fails for ujrow[].");
+    log_memory(k * k * iword, stat);
+#endif
+
+#if ( PRNTlevel>=1 )
+    if (!iam) {
+        printf (".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm,
+                thresh);
+        printf
+            (".. Buffer size: Lsub %ld\tLval %ld\tUsub %ld\tUval %ld\tLDA %ld\n",
+             (long int) Llu->bufmax[0], (long int) Llu->bufmax[1],
+             (long int) Llu->bufmax[2], (long int) Llu->bufmax[3],
+             (long int) Llu->bufmax[4]);
+    }
+#endif
+   
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    Unzval_br_ptr = Llu->Unzval_br_ptr;
+    ToRecv = Llu->ToRecv; 
+    ToSendD = Llu->ToSendD;
+    ToSendR = Llu->ToSendR;
+
+    ldt = sp_ienv_dist (3);     /* Size of maximum supernode */
+    k = CEILING (nsupers, Pr);  /* Number of local block rows */
+
+    /* Following circuit is for finding maximum block size */
+    int local_max_row_size = 0;
+    int max_row_size;
+
+    for (int i = 0; i < nsupers; ++i) {
+        int tpc = PCOL (i, grid);
+        if (mycol == tpc) {
+            lk = LBj (i, grid);
+            lsub = Lrowind_bc_ptr[lk];
+            if (lsub != NULL) {
+                local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
+            }
+        }
+
+    }
+
+    /* Max row size is the maximum (MPI_MAX reduction) taken over the process row */
+    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, (grid->rscp.comm));
+
+    /* Buffer size is max of look ahead window */
+    /* int_t buffer_size =
+         SUPERLU_MAX (max_row_size * num_threads * ldt,
+                      get_max_buffer_size ());           */
+            
+#ifdef GPU_ACC
+    int cublas_nb = get_cublas_nb();
+    int nstreams = get_num_cuda_streams ();
+
+    int buffer_size  = SUPERLU_MAX(max_row_size*nstreams*cublas_nb,get_max_buffer_size());
+    /* array holding last column blk for each partition,
+       used in SchCompUdt--CUDA.c         */
+  #if 0
+    int *stream_end_col = (int_t *) _mm_malloc (sizeof (int_t) * nstreams,64);
+  #else
+    int *stream_end_col = SUPERLU_MALLOC( nstreams * sizeof(int) );
+  #endif
+
+#else /* not to use GPU */
+
+    int Threads_per_process = get_thread_per_process();
+    int buffer_size  = SUPERLU_MAX(max_row_size*Threads_per_process*ldt,get_max_buffer_size());
+#endif /* end ifdef GPU_ACC */
+
+#if 0
+    /* symmetric assumption -- using L's supernode to estimate. */
+    /* Note that in following expression 8 can be anything
+       as long as its not too big */
+    int bigu_size = 8 * sp_ienv_dist (3) * (max_row_size);
+#else
+    int_t bigu_size = estimate_bigu_size( nsupers, ldt, 
+					  Ufstnz_br_ptr,
+					  Glu_persist, grid, perm_u );
+#endif
+
+    /* bigU and bigV are either on CPU or on GPU, not both. */
+    doublecomplex* bigU; /* for storing entire U(k,:) panel, prepare for GEMM.
+                     bigU has the same size either on CPU or on GPU. */
+    doublecomplex* bigV; /* for GEMM output matrix, i.e. update matrix. 
+                     On CPU, bigV is small for block-by-block update.
+	             On GPU, bigV is large to hold the aggregate GEMM output.*/
+
+#if ( PRNTlevel>=1 )
+    if(!iam) printf("[%d] .. BIG U bigu_size " IFMT " (same either on CPU or GPU)\n", iam, bigu_size);
+#endif
+
+#ifdef GPU_ACC
+
+    if ( checkCuda(cudaHostAlloc((void**)&bigU,  bigu_size * sizeof(doublecomplex), cudaHostAllocDefault)) )
+        ABORT("Malloc fails for zgemm buffer U ");
+
+    int bigv_size = buffer_size;
+#if ( PRNTlevel>=1 )
+    if (!iam) printf("[%d] .. BIG V bigv_size %d, using buffer_size %d (on GPU)\n", iam, bigv_size, buffer_size);
+#endif
+    if ( checkCuda(cudaHostAlloc((void**)&bigV, bigv_size * sizeof(doublecomplex) ,cudaHostAllocDefault)) )
+        ABORT("Malloc fails for zgemm buffer V");
+ 
+    DisplayHeader();
+
+#if ( PRNTlevel>=1 )
+    printf(" Starting with %d Cuda Streams \n",nstreams );
+#endif
+
+    cublasHandle_t *handle;
+    handle = (cublasHandle_t *) SUPERLU_MALLOC(sizeof(cublasHandle_t)*nstreams);
+    for(int i = 0; i < nstreams; i++) handle[i] = create_handle();
+
+    // creating streams 
+    cudaStream_t *streams;
+    streams = (cudaStream_t *) SUPERLU_MALLOC(sizeof(cudaStream_t)*nstreams);
+    for (int i = 0; i < nstreams; ++i)
+        checkCuda( cudaStreamCreate(&streams[i]) );
+    
+    // allocating data in device 
+    doublecomplex *dA, *dB, *dC;
+    cudaError_t cudaStat;
+#if 0
+    // cudaStat = cudaMalloc( (void**)&dA, m*k*sizeof(double));
+    // HOw much should be the size of dA?
+    // for time being just making it 
+    // cudaStat = cudaMalloc( (void**)&dA, ((max_row_size*sp_ienv_dist(3)))* sizeof(double));
+#endif
+
+    cudaStat = cudaMalloc( (void**)&dA, max_row_size*sp_ienv_dist(3)* sizeof(doublecomplex));
+    if (cudaStat!= cudaSuccess) {
+        fprintf(stderr, "!!!! Error in allocating A in the device %ld \n",m*k*sizeof(doublecomplex) );
+        return 1;
+    }
+
+    // size of B should be max_supernode_size*buffer
+
+    cudaStat = cudaMalloc((void**)&dB, bigu_size * sizeof(doublecomplex));
+    if (cudaStat!= cudaSuccess) {
+        fprintf(stderr, "!!!! Error in allocating B in the device %ld \n",n*k*sizeof(doublecomplex));
+        return 1;
+    }
+
+    cudaStat = cudaMalloc((void**)&dC, buffer_size* sizeof(doublecomplex) );
+    if (cudaStat!= cudaSuccess) {
+        fprintf(stderr, "!!!! Error in allocating C in the device \n" );
+        return 1;
+    }
+
+    stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3) 
+			  + bigu_size + buffer_size ) * dword;
+
+#else  /* not to use GPU */
+    
+    if ( !(bigU = doublecomplexMalloc_dist(bigu_size)) )
+        ABORT ("Malloc fails for zgemm u buff U"); 
+          //Maximum size of bigU= sqrt(buffsize) ?
+
+    int bigv_size = 8 * ldt * ldt * num_threads;
+#if ( PRNTlevel>=1 )
+    if (!iam) printf("[%d] .. BIG V size (on CPU) %d\n", iam, bigv_size);
+#endif
+    if ( !(bigV = doublecomplexMalloc_dist(bigv_size)) )
+        ABORT ("Malloc failed for zgemm buffer V");
+
+#endif /* end ifdef GPU_ACC */
+
+    log_memory((bigv_size + bigu_size) * dword, stat);
+
+    // mlock(bigU,(bigu_size) * sizeof (double));   
+
+#if ( PRNTlevel>=1 )
+    if(!iam) {
+	printf ("  Max row size is %d \n", max_row_size);
+        printf ("  Threads per process %d \n", num_threads);
+	/* printf ("  Using buffer_size of %d \n", buffer_size); */
+    }
+#endif
+
+    if (!(tempv2d = doublecomplexCalloc_dist (2 * ((size_t) ldt) * ldt)))
+        ABORT ("Calloc fails for tempv2d[].");
+    tempU2d = tempv2d + ldt * ldt;
+    if (!(indirect = SUPERLU_MALLOC (ldt * num_threads * sizeof(int))))
+        ABORT ("Malloc fails for indirect[].");
+    if (!(indirect2 = SUPERLU_MALLOC (ldt * num_threads * sizeof(int))))
+        ABORT ("Malloc fails for indirect[].");
+    if (!(iuip = intMalloc_dist (k)))  ABORT ("Malloc fails for iuip[].");
+    if (!(ruip = intMalloc_dist (k)))  ABORT ("Malloc fails for ruip[].");
+
+    log_memory(2 * ldt *ldt * dword + 2 * ldt * num_threads * iword
+	       + 2 * k * iword, stat);
+
+    int_t *lookAheadFullRow,*lookAheadStRow,*lookAhead_lptr,*lookAhead_ib,
+        *RemainFullRow,*RemainStRow,*Remain_lptr,*Remain_ib;
+
+    lookAheadFullRow   = intMalloc_dist( (num_look_aheads+1) );
+    lookAheadStRow     = intMalloc_dist( (num_look_aheads+1) );
+    lookAhead_lptr     = intMalloc_dist( (num_look_aheads+1) );
+    lookAhead_ib       = intMalloc_dist( (num_look_aheads+1) );
+
+    int_t mrb=    (nsupers+Pr-1) / Pr;
+    int_t mcb=    (nsupers+Pc-1) / Pc;
+    
+    RemainFullRow   = intMalloc_dist(mrb); 
+    RemainStRow     = intMalloc_dist(mrb);
+#if 0
+    Remain_lptr     = (int *) _mm_malloc(sizeof(int)*mrb,1);
+#else
+    Remain_lptr     = intMalloc_dist(mrb);
+#endif
+    // mlock(Remain_lptr, sizeof(int)*mrb );
+    Remain_ib       = intMalloc_dist(mrb);
+    
+    Remain_info_t *Remain_info;
+#if 0
+    Remain_info = (Remain_info_t *) _mm_malloc(mrb*sizeof(Remain_info_t),64);
+#else
+    Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb*sizeof(Remain_info_t));
+#endif
+    log_memory(4 * mrb * iword + mrb * sizeof(Remain_info_t), stat);
+
+    doublecomplex *lookAhead_L_buff, *Remain_L_buff;
+    Ublock_info_t *Ublock_info;
+    ldt = sp_ienv_dist (3);       /* max supernode size */
+    lookAhead_L_buff = doublecomplexMalloc_dist(ldt*ldt* (num_look_aheads+1) );
+    log_memory(ldt * ldt * (num_look_aheads+1) * dword, stat);
+
+#if 0
+    Remain_L_buff = (doublecomplex *) _mm_malloc( sizeof(doublecomplex)*(Llu->bufmax[1]),64);
+    Ublock_info = (Ublock_info_t *) _mm_malloc(mcb*sizeof(Ublock_info_t),64);
+    int * Ublock_info_iukp = (int *) _mm_malloc(mcb*sizeof(int),64);
+    int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64);
+    int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64);
+#else
+    Remain_L_buff = doublecomplexMalloc_dist(Llu->bufmax[1]);
+    Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb*sizeof(Ublock_info_t));
+    int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
+    int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
+    int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
+#endif
+    log_memory(Llu->bufmax[1] * dword, stat);
+
+    InitTimer = SuperLU_timer_() - tt1;
+
+    double pxgstrfTimer = SuperLU_timer_();
+
+    /* ##################################################################
+       ** Handle first block column separately to start the pipeline. **
+       ################################################################## */
+    look_id = 0;
+    msgcnt = msgcnts[0]; /* First count in the window */
+    send_req = send_reqs[0];
+    recv_req = recv_reqs[0];
+
+    k0 = 0;
+    k = perm_c_supno[0];
+    kcol = PCOL (k, grid);
+    krow = PROW (k, grid);
+    if (mycol == kcol) {
+        double ttt1 = SuperLU_timer_();
+
+	/* panel factorization */
+        PZGSTRF2 (options, k0, k, thresh, Glu_persist, grid, Llu,
+                  U_diag_blk_send_req, tag_ub, stat, info);
+
+        pdgstrf2_timer += SuperLU_timer_()-ttt1; 
+
+        scp = &grid->rscp;      /* The scope of process row. */
+
+        /* Multicasts numeric values of L(:,0) to process rows. */
+        lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        if (lsub) {
+            msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+            msgcnt[1] = lsub[1] * SuperSize (k);
+        } else {
+            msgcnt[0] = msgcnt[1] = 0;
+        }
+
+        for (pj = 0; pj < Pc; ++pj) {
+            if (ToSendR[lk][pj] != EMPTY) {
+#if ( PROFlevel>=1 )
+                TIC (t1);
+#endif
+
+                MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj, SLU_MPI_TAG (0, 0) /* 0 */ ,
+                           scp->comm, &send_req[pj]);
+                MPI_Isend (lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj, SLU_MPI_TAG (1, 0) /* 1 */ ,
+                           scp->comm, &send_req[pj + Pc]);
+#if ( DEBUGlevel>=2 )
+                printf ("[%d] first block cloumn Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
+                        iam, 0, msgcnt[0], msgcnt[1], pj);
+#endif
+
+#if ( PROFlevel>=1 )
+                TOC (t2, t1);
+                stat->utime[COMM] += t2;
+                msg_cnt += 2;
+                msg_vol += msgcnt[0] * iword + msgcnt[1] * dword;
+#endif
+            } /* end if */
+        }  /* end for pj ... */
+    } else {  /* Post immediate receives. */
+        if (ToRecv[k] >= 1) {   /* Recv block column L(:,0). */
+            scp = &grid->rscp;  /* The scope of process row. */
+            MPI_Irecv (Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, kcol,
+                       SLU_MPI_TAG (0, 0) /* 0 */ ,
+                       scp->comm, &recv_req[0]);
+            MPI_Irecv (Lval_buf_2[0], Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, kcol,
+                       SLU_MPI_TAG (1, 0) /* 1 */ ,
+                       scp->comm, &recv_req[1]);
+        }
+    } /* end if mycol == 0 */
+
+    factored[k] = 0; /* flag column k as factored. */
+
+    /* post receive of first U-row */
+    if (myrow != krow) {
+        if (ToRecv[k] == 2) {   /* Recv block row U(k,:). */
+            scp = &grid->cscp;  /* The scope of process column. */
+            Usub_buf = Llu->Usub_buf_2[0];
+            Uval_buf = Llu->Uval_buf_2[0];
+            MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+                       SLU_MPI_TAG (2, 0) /* 2%tag_ub */ ,
+                       scp->comm, &recv_reqs_u[0][0]);
+            MPI_Irecv (Uval_buf, Llu->bufmax[3], SuperLU_MPI_DOUBLE_COMPLEX, krow,
+                       SLU_MPI_TAG (3, 0) /* 3%tag_ub */ ,
+                       scp->comm, &recv_reqs_u[0][1]);
+        }
+    }
+
+    /* ##################################################################
+       **** MAIN LOOP ****
+       ################################################################## */
+    for (k0 = 0; k0 < nsupers; ++k0) {
+        k = perm_c_supno[k0];
+
+        /* ============================================ *
+         * ======= look-ahead the new L columns ======= *
+         * ============================================ */
+        /* tt1 = SuperLU_timer_(); */
+        if (k0 == 0) { /* look-ahead all the columns in the window */
+            kk1 = k0 + 1;
+            kk2 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1);
+        } else {  /* look-ahead one new column after the current window */
+            kk1 = k0 + num_look_aheads;
+            kk2 = SUPERLU_MIN (kk1, nsupers - 1);
+        }
+
+        for (kk0 = kk1; kk0 <= kk2; kk0++) {
+	    /* loop through look-ahead window in L */
+
+            kk = perm_c_supno[kk0]; /* use the ordering from static schedule */
+            look_id = kk0 % (1 + num_look_aheads); /* which column in window */
+
+            if (look_ahead[kk] < k0) { /* does not depend on current column */
+                kcol = PCOL (kk, grid);
+                if (mycol == kcol) { /* I own this panel */
+
+                    /* Panel factorization -- Factor diagonal and subdiagonal
+                       L blocks and test for exact singularity.  */
+                    factored[kk] = 0; /* flag column kk as factored */
+                    double ttt1 = SuperLU_timer_();
+
+                    PZGSTRF2 (options, kk0, kk, thresh, Glu_persist,
+                              grid, Llu, U_diag_blk_send_req, tag_ub, stat, info);
+
+                     pdgstrf2_timer += SuperLU_timer_() - ttt1; 
+
+                    /* Multicasts numeric values of L(:,kk) to process rows. */
+                    /* ttt1 = SuperLU_timer_(); */
+                    msgcnt = msgcnts[look_id];  /* point to the proper count array */
+                    send_req = send_reqs[look_id];
+
+                    lk = LBj (kk, grid);    /* Local block number in L */
+                    lsub1 = Lrowind_bc_ptr[lk];
+                    if (lsub1) {
+                        msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR; /* size of metadata */
+                        msgcnt[1] = lsub1[1] * SuperSize (kk); /* Lval_buf[] size */
+                    } else {
+                        msgcnt[0] = 0;
+                        msgcnt[1] = 0;
+                    }
+                    scp = &grid->rscp;  /* The scope of process row. */
+                    for (pj = 0; pj < Pc; ++pj) {
+                        if (ToSendR[lk][pj] != EMPTY) {
+                            lusup1 = Lnzval_bc_ptr[lk];
+                            MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
+                                       SLU_MPI_TAG (0, kk0),  /* (4*kk0)%tag_ub */
+                                       scp->comm, &send_req[pj]);
+                            MPI_Isend (lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
+                                       SLU_MPI_TAG (1, kk0),  /* (4*kk0+1)%tag_ub */
+                                       scp->comm, &send_req[pj + Pc]);
+#if ( DEBUGlevel>=2 )
+			    printf ("[%d] -1- Send L(:,%4d): #lsub1 %4d, #lusup1 %4d right to Pj %2d\n",
+				    iam, kk, msgcnt[0], msgcnt[1], pj);
+#endif
+                        }
+                    }
+                    /* stat->time9 += SuperLU_timer_() - ttt1; */
+                } else {     /* Post Recv of block column L(:,kk). */
+                    /* double ttt1 = SuperLU_timer_(); */
+                    if (ToRecv[kk] >= 1) {
+                        scp = &grid->rscp;  /* The scope of process row. */
+                        recv_req = recv_reqs[look_id];
+
+                        MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0],
+                                   mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
+                                   scp->comm, &recv_req[0]);
+                        MPI_Irecv (Lval_buf_2[look_id], Llu->bufmax[1],
+                                   SuperLU_MPI_DOUBLE_COMPLEX, kcol,
+                                   SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
+                                   scp->comm, &recv_req[1]);
+                    }
+                    /* stat->time10 += SuperLU_timer_() - ttt1; */
+                }  /* end if mycol == Pc(kk) */
+            }  /* end if look-ahead in L supernodes */
+
+            /* post irecv for U-row look-ahead */
+            krow = PROW (kk, grid);
+            if (myrow != krow) {
+                if (ToRecv[kk] == 2) { /* post iRecv block row U(kk,:). */
+                    scp = &grid->cscp;  /* The scope of process column. */
+                    Usub_buf = Llu->Usub_buf_2[look_id];
+                    Uval_buf = Llu->Uval_buf_2[look_id];
+
+                    MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+                               SLU_MPI_TAG (2, kk0) /* (4*kk0+2)%tag_ub */ ,
+                               scp->comm, &recv_reqs_u[look_id][0]);
+                    MPI_Irecv (Uval_buf, Llu->bufmax[3], SuperLU_MPI_DOUBLE_COMPLEX, krow,
+                               SLU_MPI_TAG (3, kk0) /* (4*kk0+3)%tag_ub */ ,
+                               scp->comm, &recv_reqs_u[look_id][1]);
+                }
+            }
+
+        }  /* end for each column in look-ahead window for L supernodes */
+
+        /* stat->time4 += SuperLU_timer_()-tt1; */
+
+        /* ================================= *
+         * ==== look-ahead the U rows    === *
+         * ================================= */
+        kk1 = k0;
+        kk2 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1);
+        for (kk0 = kk1; kk0 < kk2; kk0++) {
+            kk = perm_c_supno[kk0]; /* order determined from static schedule */  
+            if (factoredU[kk0] != 1 && look_ahead[kk] < k0) {
+                kcol = PCOL (kk, grid);
+                krow = PROW (kk, grid);
+                lk = LBj (kk, grid);  /* Local block number across row. NOT USED?? -- Sherry */
+
+                look_id = kk0 % (1 + num_look_aheads);
+                msgcnt = msgcntsU[look_id];
+                recv_req = recv_reqs[look_id];
+
+                /* ================================================= *
+                 * Check if diagonal block has been received         *
+                 * for panel factorization of U in look-ahead window *
+                 * ================================================= */
+
+                if (mycol == kcol) {  /* I own this column panel, no need
+                                         to receive L  */
+                    flag0 = flag1 = 1;
+                    msgcnt[0] = msgcnt[1] = -1; /* No need to transfer Lsub, nor Lval */
+                } else { /* Check to receive L(:,kk) from the left */
+                    flag0 = flag1 = 0;
+                    if ( ToRecv[kk] >= 1 ) {
+                        if ( recv_req[0] != MPI_REQUEST_NULL ) {
+                            MPI_Test (&recv_req[0], &flag0, &status);
+                            if ( flag0 ) {
+                                MPI_Get_count (&status, mpi_int_t, &msgcnt[0]);
+                                recv_req[0] = MPI_REQUEST_NULL;
+                            }
+                        } else flag0 = 1;
+
+                        if ( recv_req[1] != MPI_REQUEST_NULL ) {
+                            MPI_Test (&recv_req[1], &flag1, &status);
+                            if ( flag1 ) {
+                                MPI_Get_count (&status, mpi_int_t, &msgcnt[1]);
+                                recv_req[1] = MPI_REQUEST_NULL;
+                            }
+                        } else flag1 = 1;
+                    } else msgcnt[0] = 0;
+                }
+
+                if (flag0 && flag1) { /* L(:,kk) is ready */
+                    /* tt1 = SuperLU_timer_(); */
+                    scp = &grid->cscp;  /* The scope of process column. */
+                    if (myrow == krow) {
+                        factoredU[kk0] = 1;
+                        /* Parallel triangular solve across process row *krow* --
+                           U(k,j) = L(k,k) \ A(k,j).  */
+                        /* double ttt2 = SuperLU_timer_(); */
+                        double ttt2 = SuperLU_timer_();
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+			{
+                            PZGSTRS2 (kk0, kk, Glu_persist, grid, Llu,
+                                      stat);
+                        }
+    
+                        pdgstrs2_timer += SuperLU_timer_()-ttt2;
+                        /* stat->time8 += SuperLU_timer_()-ttt2; */
+
+                        /* Multicasts U(kk,:) to process columns. */
+                        lk = LBi (kk, grid);
+                        usub = Ufstnz_br_ptr[lk];
+                        uval = Unzval_br_ptr[lk];
+                        if (usub) {
+                            msgcnt[2] = usub[2]; /* metadata size */
+                            msgcnt[3] = usub[1]; /* Uval[] size */
+                        } else {
+                            msgcnt[2] = msgcnt[3] = 0;
+                        }
+
+                        if (ToSendD[lk] == YES) {
+                            for (pi = 0; pi < Pr; ++pi) {
+                                if (pi != myrow) {
+#if ( PROFlevel>=1 )
+                                    TIC (t1);
+#endif
+
+                                    MPI_Isend (usub, msgcnt[2], mpi_int_t, pi,
+                                               SLU_MPI_TAG (2, kk0), /* (4*kk0+2)%tag_ub */
+                                               scp->comm, &send_reqs_u[look_id][pi]);
+                                    MPI_Isend (uval, msgcnt[3], SuperLU_MPI_DOUBLE_COMPLEX,
+                                               pi, SLU_MPI_TAG (3, kk0), /* (4*kk0+3)%tag_ub */
+                                               scp->comm, &send_reqs_u[look_id][pi + Pr]);
+
+#if ( PROFlevel>=1 )
+                                    TOC (t2, t1);
+                                    stat->utime[COMM] += t2;
+                                    msg_cnt += 2;
+                                    msg_vol += msgcnt[2] * iword + msgcnt[3] * dword;
+#endif
+#if ( DEBUGlevel>=2 )
+                                    printf ("[%d] Send U(%4d,:) to Pr %2d\n",
+                                            iam, k, pi);
+#endif
+                                }   /* if pi ... */
+                            }   /* for pi ... */
+                        }       /* if ToSendD ... */
+
+                        /* stat->time2 += SuperLU_timer_()-tt1; */
+
+                    } /* end if myrow == krow */
+                } /* end if flag0 ... */
+            } /* end if factoredU[] ... */
+        } /* end for kk0 ... */
+
+        /* ============================================== *
+         * == start processing the current row of U(k,:) *
+         * ============================================== */
+        knsupc = SuperSize (k);
+        krow = PROW (k, grid);
+        kcol = PCOL (k, grid);
+
+        /* tt1 = SuperLU_timer_(); */
+        look_id = k0 % (1 + num_look_aheads);
+        recv_req = recv_reqs[look_id];
+        send_req = send_reqs[look_id];
+        msgcnt = msgcnts[look_id];
+        Usub_buf = Llu->Usub_buf_2[look_id];
+        Uval_buf = Llu->Uval_buf_2[look_id];
+
+        if (mycol == kcol) {
+            lk = LBj (k, grid); /* Local block number in L */
+
+            for (pj = 0; pj < Pc; ++pj) {
+                /* Wait for Isend to complete before using lsub/lusup buffer */
+                if (ToSendR[lk][pj] != EMPTY) {
+                    MPI_Wait (&send_req[pj], &status);
+                    MPI_Wait (&send_req[pj + Pc], &status);
+                }
+            }
+            lsub = Lrowind_bc_ptr[lk];
+            lusup = Lnzval_bc_ptr[lk];
+        } else {
+            if (ToRecv[k] >= 1) { /* Recv block column L(:,k). */
+
+                scp = &grid->rscp;  /* The scope of process row. */
+
+                /* ============================================= *
+                 * Waiting for L(:,kk) for outer-product update  *
+                 * if iam in U(kk,:), then the diagonal block    *
+		 * did not reach in time for panel factorization *
+		 * of U(k,:)           	                         *
+                 * ============================================= */
+#if ( PROFlevel>=1 )
+                TIC (t1);
+#endif
+                if (recv_req[0] != MPI_REQUEST_NULL) {
+                    MPI_Wait (&recv_req[0], &status);
+                    MPI_Get_count (&status, mpi_int_t, &msgcnt[0]);
+                    recv_req[0] = MPI_REQUEST_NULL;
+                } else {
+                    msgcnt[0] = msgcntsU[look_id][0];
+#if (DEBUGlevel>=2)
+		    printf("\t[%d] k=%d, look_id=%d, recv_req[0] == MPI_REQUEST_NULL, msgcnt[0] = %d\n", 
+			   iam, k, look_id, msgcnt[0]);
+#endif
+                }
+
+                if (recv_req[1] != MPI_REQUEST_NULL) {
+                    MPI_Wait (&recv_req[1], &status);
+                    MPI_Get_count (&status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[1]);
+                    recv_req[1] = MPI_REQUEST_NULL;
+                } else {
+                    msgcnt[1] = msgcntsU[look_id][1];
+#if (DEBUGlevel>=2)
+		    printf("\t[%d] k=%d, look_id=%d, recv_req[1] == MPI_REQUEST_NULL, msgcnt[1] = %d\n", 
+			   iam, k, look_id, msgcnt[1]);
+#endif
+                }
+
+#if ( PROFlevel>=1 )
+                TOC (t2, t1);
+                stat->utime[COMM] += t2;
+#endif
+#if ( DEBUGlevel>=2 )
+                printf("[%d] Recv L(:,%4d): #lsub %4d, #lusup %4d from Pc %2d\n",
+                     iam, k, msgcnt[0], msgcnt[1], kcol);
+                fflush (stdout);
+#endif
+
+#if ( PRNTlevel==3 )
+                ++total_msg;
+                if (!msgcnt[0])  ++zero_msg;
+#endif
+            } else {
+                msgcnt[0] = 0;
+	    }
+
+            lsub = Lsub_buf_2[look_id];
+            lusup = Lval_buf_2[look_id];
+        }                       /* if mycol = Pc(k) */
+        /* stat->time1 += SuperLU_timer_()-tt1; */
+
+        scp = &grid->cscp;      /* The scope of process column. */
+
+        /* tt1 = SuperLU_timer_(); */
+        if (myrow == krow) { /* I own U(k,:) */
+            lk = LBi (k, grid);
+            usub = Ufstnz_br_ptr[lk];
+            uval = Unzval_br_ptr[lk];
+
+            if (factoredU[k0] == -1) {
+                /* Parallel triangular solve across process row *krow* --
+                   U(k,j) = L(k,k) \ A(k,j).  */
+                 double ttt2 = SuperLU_timer_(); 
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+                {
+                    PZGSTRS2 (k0, k, Glu_persist, grid, Llu, stat);
+                }
+                pdgstrs2_timer += SuperLU_timer_() - ttt2; 
+
+	        /* Sherry -- need to set factoredU[k0] = 1; ?? */
+
+                /* Multicasts U(k,:) along process columns. */
+                if ( usub ) {
+                    msgcnt[2] = usub[2]; /* metadata size */
+                    msgcnt[3] = usub[1]; /* Uval[] size */
+                } else {
+                    msgcnt[2] = msgcnt[3] = 0;
+                }
+
+                if (ToSendD[lk] == YES) {
+                    for (pi = 0; pi < Pr; ++pi) {
+                        if (pi != myrow) {
+#if ( PROFlevel>=1 )
+                            TIC (t1);
+#endif
+                            MPI_Send (usub, msgcnt[2], mpi_int_t, pi,
+                                      SLU_MPI_TAG (2, k0), /* (4*k0+2)%tag_ub */
+                                      scp->comm);
+                            MPI_Send (uval, msgcnt[3], SuperLU_MPI_DOUBLE_COMPLEX, pi,
+                                      SLU_MPI_TAG (3, k0), /* (4*k0+3)%tag_ub */ 
+                                      scp->comm);
+#if ( PROFlevel>=1 )
+                            TOC (t2, t1);
+                            stat->utime[COMM] += t2;
+                            msg_cnt += 2;
+                            msg_vol += msgcnt[2] * iword + msgcnt[3] * dword;
+#endif
+#if ( DEBUGlevel>=2 )
+                            printf ("[%d] Send U(%4d,:) down to Pr %2d\n", iam, k, pi);
+#endif
+                        } /* if pi ... */
+                    } /* for pi ... */
+                } /* if ToSendD ... */
+
+            } else { /* Panel U(k,:) already factorized */
+
+               /* ================================================ *
+                 * Wait for downward sending of U(k,:) to complete *
+		 * for outer-product update                        *
+                 * =============================================== */
+
+                if (ToSendD[lk] == YES) {
+                    for (pi = 0; pi < Pr; ++pi) {
+                        if (pi != myrow) {
+                            MPI_Wait (&send_reqs_u[look_id][pi], &status);
+                            MPI_Wait (&send_reqs_u[look_id][pi + Pr], &status);
+                        }
+                    }
+                }
+                msgcnt[2] = msgcntsU[look_id][2];
+                msgcnt[3] = msgcntsU[look_id][3];
+            }
+            /* stat->time2 += SuperLU_timer_()-tt1; */
+
+        } else {    /* myrow != krow */
+
+            /* ========================================= *
+             * wait for U(k,:) for outer-product updates *
+             * ========================================= */
+
+            if (ToRecv[k] == 2) { /* Recv block row U(k,:). */
+#if ( PROFlevel>=1 )
+                TIC (t1);
+#endif
+                MPI_Wait (&recv_reqs_u[look_id][0], &status);
+                MPI_Get_count (&status, mpi_int_t, &msgcnt[2]);
+                MPI_Wait (&recv_reqs_u[look_id][1], &status);
+                MPI_Get_count (&status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[3]);
+
+#if ( PROFlevel>=1 )
+                TOC (t2, t1);
+                stat->utime[COMM] += t2;
+#endif
+                usub = Usub_buf;
+                uval = Uval_buf;
+#if ( DEBUGlevel>=2 )
+                printf ("[%d] Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
+#endif
+#if ( PRNTlevel==3 )
+                ++total_msg;
+                if (!msgcnt[2])  ++zero_msg;
+#endif
+            } else {
+                msgcnt[2] = 0;
+	    }
+            /* stat->time6 += SuperLU_timer_()-tt1; */
+        } /* end if myrow == Pr(k) */
+
+        /*
+         * Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
+         *  for (j = k+1; j < N; ++j) {
+         *     for (i = k+1; i < N; ++i)
+         *         if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+         *              && L(i,k) != 0 && U(k,j) != 0 )
+         *             A(i,j) = A(i,j) - L(i,k) * U(k,j);
+         */
+        msg0 = msgcnt[0];
+        msg2 = msgcnt[2];
+        /* tt1 = SuperLU_timer_(); */
+        if (msg0 && msg2) {     /* L(:,k) and U(k,:) are not empty. */
+            nsupr = lsub[1];    /* LDA of lusup. */
+            if (myrow == krow) { /* Skip diagonal block L(k,k). */
+                lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+                luptr0 = knsupc;
+                nlb = lsub[0] - 1;
+            } else {
+                lptr0 = BC_HEADER;
+                luptr0 = 0;
+                nlb = lsub[0];
+            }
+            iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+            rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+            nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+            klst = FstBlockC (k + 1);
+
+            /* -------------------------------------------------------------
+               Update the look-ahead block columns A(:,k+1:k+num_look_ahead)
+               ------------------------------------------------------------- */
+            iukp0 = iukp;
+            rukp0 = rukp;
+            /* reorder the remaining columns in bottom-up */
+            /* TAU_STATIC_TIMER_START("LOOK_AHEAD_UPDATE"); */
+            for (jj = 0; jj < nub; jj++) {
+#ifdef ISORT
+                iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+                perm_u[jj] = jj;
+#else
+                perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+                perm_u[2 * jj + 1] = jj;
+#endif
+                jb = usub[iukp];    /* Global block number of block U(k,j). */
+                nsupc = SuperSize (jb);
+                iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+                iukp += nsupc;
+            }
+            iukp = iukp0;
+#ifdef ISORT
+            isort (nub, iperm_u, perm_u);
+#else
+            qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+                   &superlu_sort_perm);
+#endif
+            j = jj0 = 0;
+
+/************************************************************************/
+            double ttx =SuperLU_timer_();
+
+#include "zlook_ahead_update.c"
+
+            lookaheadupdatetimer += SuperLU_timer_() - ttx;
+/************************************************************************/
+
+            /*ifdef OMP_LOOK_AHEAD */
+            /* TAU_STATIC_TIMER_STOP("LOOK_AHEAD_UPDATE"); */
+        }                       /* if L(:,k) and U(k,:) not empty */
+
+        /* stat->time3 += SuperLU_timer_()-tt1; */
+
+        /* ================== */
+        /* == post receive == */
+        /* ================== */
+        kk1 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1);
+        for (kk0 = k0 + 1; kk0 <= kk1; kk0++) {
+            kk = perm_c_supno[kk0];
+            kcol = PCOL (kk, grid);
+
+            if (look_ahead[kk] == k0) {
+                if (mycol != kcol) {
+                    if (ToRecv[kk] >= 1) {
+                        scp = &grid->rscp;  /* The scope of process row. */
+
+                        look_id = kk0 % (1 + num_look_aheads);
+                        recv_req = recv_reqs[look_id];
+                        MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0],
+                                   mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
+                                   scp->comm, &recv_req[0]);
+                        MPI_Irecv (Lval_buf_2[look_id], Llu->bufmax[1],
+                                   SuperLU_MPI_DOUBLE_COMPLEX, kcol,
+                                   SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
+                                   scp->comm, &recv_req[1]);
+                    }
+                } else {
+                    lk = LBj (kk, grid);    /* Local block number. */
+                    lsub1 = Lrowind_bc_ptr[lk];
+                    lusup1 = Lnzval_bc_ptr[lk];
+                    if (factored[kk] == -1) {
+                        /* Factor diagonal and subdiagonal blocks and
+			   test for exact singularity.  */
+                        factored[kk] = 0; /* flag column kk as factored */
+                        double ttt1 = SuperLU_timer_(); 
+                        PZGSTRF2 (options, kk0, kk, thresh,
+                                  Glu_persist, grid, Llu, U_diag_blk_send_req,
+                                  tag_ub, stat, info);
+                        pdgstrf2_timer += SuperLU_timer_() - ttt1; 
+
+                        /* Process column *kcol+1* multicasts numeric
+			   values of L(:,k+1) to process rows. */
+                        look_id = kk0 % (1 + num_look_aheads);
+                        send_req = send_reqs[look_id];
+                        msgcnt = msgcnts[look_id];
+
+                        if (lsub1) {
+                            msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR;
+                            msgcnt[1] = lsub1[1] * SuperSize (kk);
+                        } else {
+                            msgcnt[0] = 0;
+                            msgcnt[1] = 0;
+                        }
+
+                        scp = &grid->rscp;  /* The scope of process row. */
+                        for (pj = 0; pj < Pc; ++pj) {
+                            if (ToSendR[lk][pj] != EMPTY) {
+                                MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
+                                           SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
+                                           scp->comm, &send_req[pj]);
+                                MPI_Isend (lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
+                                           SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
+                                           scp->comm, &send_req[pj + Pc]);
+                            }
+                        }
+                    }           /* for pj ... */
+                }
+            }
+        }
+
+        double tsch = SuperLU_timer_();
+
+	/*******************************************************************/
+
+#ifdef GPU_ACC
+
+#include "zSchCompUdt-cuda.c"
+
+#else 
+
+/*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/
+#include "zSchCompUdt-2Ddynamic.c"
+
+#endif 
+	/*uncomment following to compare against SuperLU 3.3 baseline*/
+        /* #include "SchCompUdt--baseline.c"  */
+	/************************************************************************/
+        
+        NetSchurUpTimer += SuperLU_timer_() - tsch;
+
+    }  /* for k0 = 0, ... */
+
+    /* ##################################################################
+       ** END MAIN LOOP: for k0 = ...
+       ################################################################## */
+    
+    pxgstrfTimer = SuperLU_timer_() - pxgstrfTimer;
+
+    /* updating total flops */
+#if ( PRNTlevel>=1 )
+    if ( iam==0 ) {
+	printf("\nInitialization time\t%8.2lf seconds\n"
+	       "\t Serial: compute static schedule, allocate storage\n", InitTimer);
+        printf("\n---- Time breakdown in factorization ----\n");
+	printf("Time in Look-ahead update \t %8.2lf seconds\n", lookaheadupdatetimer);
+        printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer);
+        printf(".. Time to Gather L buffer\t %8.2lf  (Separate L panel by Lookahead/Remain)\n", GatherLTimer);
+        printf(".. Time to Gather U buffer\t %8.2lf \n", GatherUTimer);
+	       
+        printf(".. Time in GEMM %8.2lf \n",
+	       LookAheadGEMMTimer + RemainGEMMTimer);
+        printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer);
+        printf("\t* Remain\t %8.2lf \n", RemainGEMMTimer);
+
+        printf(".. Time to Scatter %8.2lf \n", 
+	       LookAheadScatterTimer + RemainScatterTimer);
+        printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer);
+        printf("\t* Remain\t %8.2lf \n", RemainScatterTimer);
+
+        printf("Total Time in Factorization            \t: %8.2lf seconds, \n", pxgstrfTimer);
+        printf("Total time in Schur update with offload\t  %8.2lf seconds,\n",CPUOffloadTimer );
+        printf("--------\n");
+	printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n);
+    }
+#endif
+    
+#if ( DEBUGlevel>=2 )
+    for (i = 0; i < Pr * Pc; ++i) {
+        if (iam == i) {
+            zPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
+            zPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
+            printf ("(%d)\n", iam);
+            PrintInt10 ("Recv", nsupers, Llu->ToRecv);
+        }
+        MPI_Barrier (grid->comm);
+    }
+#endif
+
+    // printf("Debug : MPI buffers 1\n");
+
+    /********************************************************
+     * Free memory                                          *
+     ********************************************************/
+
+    if (Pr * Pc > 1) {
+        SUPERLU_FREE (Lsub_buf_2[0]);   /* also free Lsub_buf_2[1] */
+        SUPERLU_FREE (Lval_buf_2[0]);   /* also free Lval_buf_2[1] */
+        if (Llu->bufmax[2] != 0)
+            SUPERLU_FREE (Usub_buf_2[0]);
+        if (Llu->bufmax[3] != 0)
+            SUPERLU_FREE (Uval_buf_2[0]);
+        if (U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL) {
+            /* wait for last Isend requests to complete, deallocate objects */
+            for (krow = 0; krow < Pr; ++krow) {
+                if (krow != myrow)
+                    MPI_Wait (U_diag_blk_send_req + krow, &status);
+            }
+        }
+        SUPERLU_FREE (U_diag_blk_send_req);
+    }
+
+    log_memory( -((Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) * iword +
+		  (Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) * dword),
+		stat );
+    
+    SUPERLU_FREE (Lsub_buf_2);
+    SUPERLU_FREE (Lval_buf_2);
+    SUPERLU_FREE (Usub_buf_2);
+    SUPERLU_FREE (Uval_buf_2);
+    SUPERLU_FREE (perm_c_supno);
+    SUPERLU_FREE (perm_u);
+#ifdef ISORT
+    SUPERLU_FREE (iperm_u);
+#endif
+    SUPERLU_FREE (look_ahead);
+    SUPERLU_FREE (factoredU);
+    SUPERLU_FREE (factored);
+    log_memory(-(6 * nsupers * iword), stat);
+
+
+    for (i = 0; i <= num_look_aheads; i++) {
+        SUPERLU_FREE (msgcnts[i]);
+        SUPERLU_FREE (msgcntsU[i]);
+    }
+    SUPERLU_FREE (msgcnts);
+    SUPERLU_FREE (msgcntsU);
+
+    for (i = 0; i <= num_look_aheads; i++) {
+        SUPERLU_FREE (send_reqs_u[i]);
+        SUPERLU_FREE (recv_reqs_u[i]);
+        SUPERLU_FREE (send_reqs[i]);
+        SUPERLU_FREE (recv_reqs[i]);
+    }
+
+    SUPERLU_FREE (recv_reqs_u);
+    SUPERLU_FREE (send_reqs_u);
+    SUPERLU_FREE (recv_reqs);
+    SUPERLU_FREE (send_reqs);
+
+    // printf("Debug : MPI buffers 3\n");
+
+#ifdef GPU_ACC
+    checkCuda (cudaFreeHost (bigV));
+    checkCuda (cudaFreeHost (bigU));
+    cudaFree( (void*)dA ); /* Sherry added */
+    cudaFree( (void*)dB );
+    cudaFree( (void*)dC );
+    SUPERLU_FREE( handle );
+    SUPERLU_FREE( streams );
+    SUPERLU_FREE( stream_end_col );
+#else
+    SUPERLU_FREE (bigV);
+    SUPERLU_FREE (bigU);
+#endif
+
+    log_memory(-(bigv_size + bigu_size) * dword, stat);
+    // printf("Debug : MPI buffers 5\n");
+
+    SUPERLU_FREE (Llu->ujrow);
+    SUPERLU_FREE (tempv2d);
+    SUPERLU_FREE (indirect);
+    SUPERLU_FREE (indirect2); /* Sherry added */
+    SUPERLU_FREE (iuip);
+    SUPERLU_FREE (ruip);
+
+    ldt = sp_ienv_dist(3);
+    log_memory( -(3 * ldt *ldt * dword + 2 * ldt * num_threads * iword
+		  + 2 * k * iword), stat );
+
+    /* Sherry added */
+    SUPERLU_FREE(omp_loop_time);
+    SUPERLU_FREE(full_u_cols);
+    SUPERLU_FREE(blk_ldu);
+    log_memory(-2 * ncb * dword, stat);
+
+    SUPERLU_FREE(lookAheadFullRow);
+    SUPERLU_FREE(lookAheadStRow);
+    SUPERLU_FREE(lookAhead_lptr);
+    SUPERLU_FREE(lookAhead_ib);
+
+    SUPERLU_FREE(RemainFullRow);
+    SUPERLU_FREE(RemainStRow);
+    SUPERLU_FREE(Remain_lptr);
+    SUPERLU_FREE(Remain_ib);
+    SUPERLU_FREE(Remain_info);
+    SUPERLU_FREE(lookAhead_L_buff);
+    SUPERLU_FREE(Remain_L_buff);
+    log_memory( -(4 * mrb * iword + mrb * sizeof(Remain_info_t) + 
+		  ldt * ldt * (num_look_aheads + 1) * dword +
+		  Llu->bufmax[1] * dword), stat );
+
+    SUPERLU_FREE(Ublock_info);
+    SUPERLU_FREE(Ublock_info_iukp);
+    SUPERLU_FREE(Ublock_info_rukp);
+    SUPERLU_FREE(Ublock_info_jb);
+
+
+#if ( PROFlevel>=1 )
+    TIC (t1);
+#endif
+
+    /* Prepare error message - find the smallest index i such that U(i,i)==0 */
+    if ( *info == 0 ) *info = n + 1;
+    MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid->comm);
+    if ( iinfo == n + 1 ) *info = 0;
+    else *info = iinfo;
+
+    // printf("test out\n");
+
+#if ( PROFlevel>=1 )
+    TOC (t2, t1);
+    stat->utime[COMM] += t2;
+    {
+        float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
+
+        MPI_Reduce (&msg_cnt, &msg_cnt_sum,
+                    1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
+        MPI_Reduce (&msg_cnt, &msg_cnt_max,
+                    1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
+        MPI_Reduce (&msg_vol, &msg_vol_sum,
+                    1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
+        MPI_Reduce (&msg_vol, &msg_vol_max,
+                    1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
+        if (!iam) {
+            printf ("\tPZGSTRF comm stat:"
+                    "\tAvg\tMax\t\tAvg\tMax\n"
+                    "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
+                    msg_cnt_sum / Pr / Pc, msg_cnt_max,
+                    msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6);
+        }
+    }
+#endif
+
+#if ( PRNTlevel==3 )
+    MPI_Allreduce (&zero_msg, &iinfo, 1, MPI_INT, MPI_SUM, grid->comm);
+    if (!iam)
+        printf (".. # msg of zero size\t%d\n", iinfo);
+    MPI_Allreduce (&total_msg, &iinfo, 1, MPI_INT, MPI_SUM, grid->comm);
+    if (!iam)
+        printf (".. # total msg\t%d\n", iinfo);
+#endif
+
+#if ( DEBUGlevel>=2 )
+    for (i = 0; i < Pr * Pc; ++i) {
+        if (iam == i) {
+            zPrintLblocks (iam, nsupers, grid, Glu_persist, Llu);
+            zPrintUblocks (iam, nsupers, grid, Glu_persist, Llu);
+            printf ("(%d)\n", iam);
+            PrintInt10 ("Recv", nsupers, Llu->ToRecv);
+        }
+        MPI_Barrier (grid->comm);
+    }
+#endif
+
+#if ( DEBUGlevel>=3 )
+    printf ("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
+#endif
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit pzgstrf()");
+#endif
+
+    return 0;
+} /* PZGSTRF */
+
diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c
new file mode 100644
index 0000000..3f63915
--- /dev/null
+++ b/SRC/pzgstrf2.c
@@ -0,0 +1,376 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Performs panel LU factorization.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ *
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/* This pzgstrf2 is based on the TRSM function */
+void
+pzgstrf2_trsm
+    (superlu_dist_options_t * options, int_t k0, int_t k, double thresh,
+     Glu_persist_t * Glu_persist, gridinfo_t * grid, LocalLU_t * Llu,
+     MPI_Request * U_diag_blk_send_req, int tag_ub,
+     SuperLUStat_t * stat, int *info)
+{
+    /* Diagonal process: factor the diagonal block, send U_kk down the column, ztrsm its rows of L(:,k); others: receive U_kk, then ztrsm. */
+    int cols_left, iam, l, pkk, pr;
+    int incx = 1, incy = 1;
+
+    int nsupr;                  /* number of rows in the block (LDA) */
+    int nsupc;                /* number of columns in the block */
+    int luptr;
+    int_t i, myrow, krow, j, jfst, jlst, u_diag_cnt;
+    int_t *xsup = Glu_persist->xsup;
+    doublecomplex *lusup, temp;
+    doublecomplex *ujrow, *ublk_ptr;   /* pointer to the U block */
+    doublecomplex one = {1.0, 0.0}, alpha = {-1.0, 0.0};
+    int_t Pr;
+    MPI_Status status;
+    MPI_Comm comm = (grid->cscp).comm;
+
+    /* Initialization. */
+    iam = grid->iam;
+    Pr = grid->nprow;
+    myrow = MYROW (iam, grid);
+    krow = PROW (k, grid);
+    pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    j = LBj (k, grid);          /* Local block number */
+    jfst = FstBlockC (k);
+    jlst = FstBlockC (k + 1);
+    lusup = Llu->Lnzval_bc_ptr[j];
+    nsupc = SuperSize (k);
+    if (Llu->Lrowind_bc_ptr[j])
+        nsupr = Llu->Lrowind_bc_ptr[j][1];
+    else
+        nsupr = 0;              /* no local rows in this block column */
+#ifdef PI_DEBUG
+    printf ("rank %d  Iter %d  k=%d \t ztrsm nsuper %d \n",
+            iam, k0, k, nsupr);
+#endif
+    ublk_ptr = ujrow = Llu->ujrow;
+
+    luptr = 0;                  /* Point to the diagonal entries. */
+    cols_left = nsupc;          /* supernode size */
+    int ld_ujrow = nsupc;       /* leading dimension of ujrow */
+    u_diag_cnt = 0;
+    incy = ld_ujrow;
+
+    if ( U_diag_blk_send_req &&
+         U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) {
+        /* There are pending sends - wait for all Isend to complete */
+        for (pr = 0; pr < Pr; ++pr)
+            if (pr != myrow) {
+                MPI_Wait (U_diag_blk_send_req + pr, &status);
+            }
+
+        /* flag no more outstanding send request. */
+        U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL;
+    }
+
+    if (iam == pkk) {            /* diagonal process */
+        for (j = 0; j < jlst - jfst; ++j) {  /* for each column in panel */
+            /* Diagonal pivot */
+            i = luptr;
+            if ( options->ReplaceTinyPivot == YES ) {
+                if ( slud_z_abs1(&lusup[i]) < thresh &&
+                     lusup[i].r != 0.0 && lusup[i].i != 0.0 ) { /* NOTE(review): && skips tiny pivots with exactly one zero part -- confirm intent */
+
+#if ( PRNTlevel>=2 )
+                    printf ("(%d) .. col %d, tiny pivot (%e, %e)  ",
+                            iam, (int) (jfst + j), lusup[i].r, lusup[i].i);
+#endif
+                    /* Keep the new diagonal entry with the same sign. */
+                    if ( lusup[i].r < 0 ) lusup[i].r = -thresh;
+                    else lusup[i].r = thresh;
+                    lusup[i].i = 0.0;
+#if ( PRNTlevel>=2 )
+                    printf ("replaced by (%e, %e)\n", lusup[i].r, lusup[i].i);
+#endif
+                    ++(stat->TinyPivots);
+                }
+            }
+
+#if 0
+            for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt)
+                 ublk_ptr[u_diag_cnt] = lusup[i]; /* copy one row of U */
+#endif
+
+            /* storing U in full form  */
+            int st;
+            for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) {
+                st = j * ld_ujrow + j;
+                ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */
+            }
+
+            /* Test for singularity. */
+            if ( ujrow[0].r == 0.0 && ujrow[0].i == 0.0 ) {
+                *info = j + jfst + 1;
+            } else {              /* Scale the j-th column within diag. block. */
+                slud_z_div(&temp, &one, &ujrow[0]);
+                for (i = luptr + 1; i < luptr - j + nsupc; ++i)
+                    zz_mult(&lusup[i], &lusup[i], &temp);
+                stat->ops[FACT] += 6*(nsupc-j-1) + 10;
+            }
+
+            /* Rank-1 update of the trailing submatrix within diag. block. */
+            if (--cols_left) {
+                /* l = nsupr - j - 1;  */
+                l = nsupc - j - 1;  /* Piyush */
+                zgeru_(&l, &cols_left, &alpha, &lusup[luptr+1], &incx,
+                       &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1],
+                       &nsupr);
+                stat->ops[FACT] += 8 * l * cols_left;
+            }
+
+            /* ujrow = ublk_ptr + u_diag_cnt;  */
+            ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */
+            luptr += nsupr + 1; /* move to next column */
+
+        }                       /* for column j ...  first loop */
+
+        /* ++++++++++second step ====== */
+
+        ublk_ptr = ujrow = Llu->ujrow;
+
+        if (U_diag_blk_send_req && iam == pkk)  { /* Send the U block */
+            /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+            for (pr = 0; pr < Pr; ++pr)
+                if (pr != krow) {
+                    /* tag = ((k0<<2)+2) % tag_ub;        */
+                    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+                    MPI_Isend (ublk_ptr, nsupc * nsupc, SuperLU_MPI_DOUBLE_COMPLEX, pr,
+                               SLU_MPI_TAG (4, k0) /* tag */ ,
+                               comm, U_diag_blk_send_req + pr);
+
+                }
+
+            /* flag outstanding Isend */
+            U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* Sherry: sentinel, compared against MPI_REQUEST_NULL on the next entry */
+        }
+
+        /* Solve X * U_kk = L(k+1:,k) in place on the rows of lusup below the diagonal block. */
+
+        char uplo = 'u', side = 'r', transa = 'n', diag = 'n';
+
+        l = nsupr - nsupc;
+        // n = nsupc;
+        doublecomplex alpha = {1.0, 0.0};  /* shadows the -1 alpha used for zgeru_ above */
+#ifdef PI_DEBUG
+        printf ("calling ztrsm\n");
+        printf ("ztrsm diagonal param 11:  %d \n", nsupr);
+#endif
+
+#if defined (USE_VENDOR_BLAS)
+        ztrsm_ (&side, &uplo, &transa, &diag,
+                &l, &nsupc,
+                &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr,
+                1, 1, 1, 1);
+#else
+        ztrsm_ (&side, &uplo, &transa, &diag,
+                &l, &nsupc,
+                &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr);
+#endif
+
+    } else {  /* non-diagonal process */
+        /* ================================================ *
+         * Receive the diagonal block of U                  *
+         * for panel factorization of L(:,k)                *
+         * note: we block for panel factorization of L(:,k) *
+         * but not for panel factorization of U(:,k)        *
+         * ================================================ */
+
+        /* tag = ((k0<<2)+2) % tag_ub;        */
+        /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+        // printf("hello message receiving%d %d\n",(nsupc*(nsupc+1))>>1,SLU_MPI_TAG(4,k0));
+        MPI_Recv (ublk_ptr, (nsupc * nsupc), SuperLU_MPI_DOUBLE_COMPLEX, krow,
+                  SLU_MPI_TAG (4, k0) /* tag */ ,
+                  comm, &status);
+        if (nsupr > 0) {
+            char uplo = 'u', side = 'r', transa = 'n', diag = 'n';
+            doublecomplex alpha = {1.0, 0.0};
+
+#ifdef PI_DEBUG
+            printf ("ztrsm non diagonal param 11:  %d \n", nsupr);
+            if (!lusup)
+                printf (" Rank :%d \t Empty block column occured :\n", iam);
+#endif
+#if defined (USE_VENDOR_BLAS)
+            ztrsm_ (&side, &uplo, &transa, &diag,
+                    &nsupr, &nsupc,
+                    &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1);
+#else
+            ztrsm_ (&side, &uplo, &transa, &diag,
+                    &nsupr, &nsupc,
+                    &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr);
+#endif
+        }
+
+    }                           /* end if pkk ... */
+
+    /* printf("exiting pzgstrf2 %d \n", grid->iam);  */
+
+}  /* PZGSTRF2_trsm */
+
+
+/* Triangular solves for block row U(k,:): one unit-lower ztrsv_ per nonzero segment, split among threads by block. */
+void pzgstrs2_omp
+/************************************************************************/
+(int_t k0, int_t k, Glu_persist_t * Glu_persist,
+ gridinfo_t * grid, LocalLU_t * Llu, SuperLUStat_t * stat)
+{
+#ifdef PI_DEBUG
+    printf("====Entering pzgstrs2==== \n");
+#endif
+    int iam, pkk;
+    int incx = 1;
+    int nsupr;                /* number of rows in the block L(:,k) (LDA) */
+    int segsize;
+    int nsupc;                /* number of columns in the block */
+    int_t luptr, iukp, rukp;
+    int_t b, gb, j, klst, knsupc, lk, nb;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *usub;
+    doublecomplex *lusup, *uval;
+
+#ifdef _OPENMP
+    int thread_id = omp_get_thread_num ();
+    int num_thread = omp_get_num_threads ();
+#else
+    int thread_id = 0;
+    int num_thread = 1;
+#endif
+
+    /* Quick return. */
+    lk = LBi (k, grid);         /* Local block number */
+    if (!Llu->Unzval_br_ptr[lk]) return;
+
+    /* Initialization. */
+    iam = grid->iam;
+    pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int k_row_cycle = k / grid->nprow;  /* for which cycle k exist (to assign rowwise thread blocking) */
+    int gb_col_cycle;  /* cycle through block columns  */
+    klst = FstBlockC (k + 1);
+    knsupc = SuperSize (k);
+    usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+    uval = Llu->Unzval_br_ptr[lk];
+    nb = usub[0];
+    iukp = BR_HEADER;
+    rukp = 0;
+    if (iam == pkk) {           /* diagonal process: use the local L block */
+        lk = LBj (k, grid);
+        nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
+        lusup = Llu->Lnzval_bc_ptr[lk];
+    } else {                    /* otherwise use the look-ahead buffer selected by k0 */
+        nsupr = Llu->Lsub_buf_2[k0 % (1 + stat->num_look_aheads)][1];   /* LDA of lusup[] */
+        lusup = Llu->Lval_buf_2[k0 % (1 + stat->num_look_aheads)];
+    }
+
+    /* Loop through all the row blocks. */
+    for (b = 0; b < nb; ++b)  {
+        /* assuming column cyclic distribution of data among threads */
+        gb = usub[iukp];
+        gb_col_cycle = gb / grid->npcol;
+        nsupc = SuperSize (gb);
+        iukp += UB_DESCRIPTOR;
+
+        /* Loop through all the segments in the block. */
+        for (j = 0; j < nsupc; ++j) {
+            segsize = klst - usub[iukp++];
+#ifdef PI_DEBUG
+            printf("segsize %d klst %lld usub[%lld] : %lld", segsize, (long long) klst, (long long) (iukp - 1), (long long) usub[iukp - 1]);
+#endif
+            if (segsize) {    /* Nonzero segment. */
+                luptr = (knsupc - segsize) * (nsupr + 1);
+
+                /* Only the thread that block gb maps to performs the solve. */
+                if ((gb_col_cycle + k_row_cycle + 1) % num_thread == thread_id) {
+#ifdef PI_DEBUG
+                    printf ("ztrsv param 4 %d param 6 %d\n", segsize, nsupr);
+#endif
+#if defined (USE_VENDOR_BLAS)
+                    ztrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
+                            &uval[rukp], &incx, 1, 1, 1);
+#else
+                    ztrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
+                            &uval[rukp], &incx);
+#endif
+                }
+
+                if (thread_id == 0)
+                    stat->ops[FACT] += segsize * (segsize + 1); /* only thread 0 updates the op count */
+                rukp += segsize;
+            }
+        }
+    }                           /* for b ... */
+
+} /* PZGSTRS2_omp */
+
diff --git a/SRC/pzgstrf_irecv.c b/SRC/pzgstrf_irecv.c
new file mode 100644
index 0000000..b4d65b7
--- /dev/null
+++ b/SRC/pzgstrf_irecv.c
@@ -0,0 +1,1296 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Performs LU factorization in parallel
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *
+ *
+ * Sketch of the algorithm
+ * =======================
+ *
+ * The following relations hold:
+ *     * A_kk = L_kk * U_kk
+ *     * L_ik = Aik * U_kk^(-1)
+ *     * U_kj = L_kk^(-1) * A_kj
+ *
+ *              ----------------------------------
+ *              |   |                            |
+ *              ----|-----------------------------
+ *              |   | \ U_kk|                    |
+ *              |   |   \   |        U_kj        |
+ *              |   |L_kk \ |         ||         |
+ *              ----|-------|---------||----------
+ *              |   |       |         \/         |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   | L_ik ==>       A_ij        |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              ----------------------------------
+ *
+ * Handle the first block of columns separately.
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity. ( pzgstrf2(0), one column at a time )
+ *     * Compute block row of U
+ *     * Update trailing matrix
+ * 
+ * Loop over the remaining blocks of columns.
+ *   mycol = MYCOL( iam, grid );
+ *   myrow = MYROW( iam, grid );
+ *   N = nsupers;
+ *   For (k = 1; k < N; ++k) {
+ *       krow = PROW( k, grid );
+ *       kcol = PCOL( k, grid );
+ *       Pkk = PNUM( krow, kcol, grid );
+ *
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity.
+ *       if ( mycol == kcol ) {
+ *           pzgstrf2(k), one column at a time 
+ *       }
+ *
+ *     * Parallel triangular solve
+ *       if ( iam == Pkk ) multicast L_k,k to this process row;
+ *       if ( myrow == krow && mycol != kcol ) {
+ *          Recv L_k,k from process Pkk;
+ *          for (j = k+1; j < N; ++j) 
+ *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
+ *                 U_k,j = L_k,k \ A_k,j;
+ *       }
+ *
+ *     * Parallel rank-k update
+ *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
+ *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
+ *       if ( myrow != krow ) {
+ *          Pkj = PNUM( krow, mycol, grid );
+ *          Recv U_k,k+1:N from process Pkj;
+ *       }
+ *       if ( mycol != kcol ) {
+ *          Pik = PNUM( myrow, kcol, grid );
+ *          Recv L_k+1:N,k from process Pik;
+ *       }
+ *       for (j = k+1; j < N; ++j) {
+ *          for (i = k+1; i < N; ++i) 
+ *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+ *                   && L_i,k != 0 && U_k,j != 0 )
+ *                 A_i,j = A_i,j - L_i,k * U_k,j;
+ *       }
+ *  }
+ *
+ *
+ * Remaining issues
+ *   (1) Use local indices for L subscripts and SPA.  [DONE]
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*
+ * Internal prototypes
+ */
+static void pzgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *,
+		     gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *);
+#ifdef _CRAY
+static void pzgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
+		     LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);
+#else
+static void pzgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
+		     LocalLU_t *, SuperLUStat_t *);
+#endif
+
+/************************************************************************/
+
+/*! \brief
+ * 
+ * <pre>
+ * Purpose
+ * =======
+ *
+ *  PZGSTRF performs the LU factorization in parallel.
+ *
+ * Arguments
+ * =========
+ * 
+ * options (input) superlu_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) LocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_zdefs.h for the definition of 'LocalLU_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_zdefs.h for the definition of 'gridinfo_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * </pre>
+ */
+int_t pzgstrf
+/************************************************************************/
+(
+ superlu_options_t *options, int m, int n, double anorm,
+ LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info
+ )
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd("N", strlen("N"));
+    _fcd ftcs1 = _cptofcd("L", strlen("L"));
+    _fcd ftcs2 = _cptofcd("N", strlen("N"));
+    _fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+    doublecomplex zero = {0.0, 0.0};
+    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
+    int_t *xsup;
+    int_t *lsub, *lsub1, *usub, *Usub_buf,
+          *Lsub_buf_2[2];  /* Need 2 buffers to implement Irecv. */
+    doublecomplex *lusup, *lusup1, *uval, *Uval_buf,
+           *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */
+    int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
+          lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
+          nlb, nub, nsupc, rel, rukp;
+    int_t Pc, Pr;
+    int   iam, kcol, krow, mycol, myrow, pi, pj;
+    int   j, k, lk, nsupers;
+    int   nsupr, nbrow, segsize;
+    int   msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
+		      *     0 : transferred in Lsub_buf[]
+		      *     1 : transferred in Lval_buf[]
+		      *     2 : transferred in Usub_buf[] 
+		      *     3 : transferred in Uval_buf[]
+		      */
+    int_t  msg0, msg2;
+    int_t  **Ufstnz_br_ptr, **Lrowind_bc_ptr;
+    doublecomplex **Unzval_br_ptr, **Lnzval_bc_ptr;
+    int_t  *index;
+    doublecomplex *nzval;
+    int_t  *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
+    doublecomplex *ucol;
+    int_t  *indirect;
+    doublecomplex *tempv, *tempv2d;
+    int_t iinfo;
+    int_t *ToRecv, *ToSendD, **ToSendR;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    superlu_scope_t *scp;
+    float s_eps;
+    double thresh;
+    doublecomplex *tempU2d, *tempu;
+    int    full, ldt, ldu, lead_zero, ncols;
+    MPI_Request recv_req[4], *send_req;
+    MPI_Status status;
+#if ( DEBUGlevel>=2 ) 
+    int_t num_copy=0, num_update=0;
+#endif
+#if ( PRNTlevel==3 )
+    int_t  zero_msg = 0, total_msg = 0;
+#endif
+#if ( PROFlevel>=1 )
+    double t1, t2;
+    float msg_vol = 0, msg_cnt = 0;
+    int_t iword = sizeof(int_t), zword = sizeof(doublecomplex);
+#endif
+
+    /* Test the input parameters. */
+    *info = 0;
+    if ( m < 0 ) *info = -2;
+    else if ( n < 0 ) *info = -3;
+    if ( *info ) {
+	pxerbla("pzgstrf", grid, -*info);
+	return (-1);
+    }
+
+    /* Quick return if possible. */
+    if ( m == 0 || n == 0 ) return 0;
+
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    s_eps = slamch_("Epsilon");
+    thresh = s_eps * anorm;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pzgstrf()");
+#endif
+
+    stat->ops[FACT] = 0.0;
+
+    if ( Pr*Pc > 1 ) {
+	i = Llu->bufmax[0];
+	if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) )
+	    ABORT("Malloc fails for Lsub_buf.");
+	Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i;
+	i = Llu->bufmax[1];
+	if ( !(Llu->Lval_buf_2[0] = doublecomplexMalloc_dist(2 * ((size_t)i))) )
+	    ABORT("Malloc fails for Lval_buf[].");
+	Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i;
+	if ( Llu->bufmax[2] != 0 ) 
+	    if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) )
+		ABORT("Malloc fails for Usub_buf[].");
+	if ( Llu->bufmax[3] != 0 ) 
+	    if ( !(Llu->Uval_buf = doublecomplexMalloc_dist(Llu->bufmax[3])) )
+		ABORT("Malloc fails for Uval_buf[].");
+	if ( !(send_req =
+	       (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request))))
+	    ABORT("Malloc fails for send_req[].");
+    }
+    if ( !(Llu->ujrow = doublecomplexMalloc_dist(sp_ienv_dist(3))) )
+	ABORT("Malloc fails for ujrow[].");
+
+#if ( PRNTlevel>=1 )
+    if ( !iam ) {
+	printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh);
+	printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n",
+	       Llu->bufmax[0], Llu->bufmax[1], 
+	       Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]);
+    }
+#endif
+
+    Lsub_buf_2[0] = Llu->Lsub_buf_2[0];
+    Lsub_buf_2[1] = Llu->Lsub_buf_2[1];
+    Lval_buf_2[0] = Llu->Lval_buf_2[0];
+    Lval_buf_2[1] = Llu->Lval_buf_2[1];
+    Usub_buf = Llu->Usub_buf;
+    Uval_buf = Llu->Uval_buf;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    Unzval_br_ptr = Llu->Unzval_br_ptr;
+    ToRecv = Llu->ToRecv;
+    ToSendD = Llu->ToSendD;
+    ToSendR = Llu->ToSendR;
+
+    ldt = sp_ienv_dist(3); /* Size of maximum supernode */
+    if ( !(tempv2d = doublecomplexCalloc_dist(2*((size_t)ldt)*ldt)) )
+	ABORT("Calloc fails for tempv2d[].");
+    tempU2d = tempv2d + ldt*ldt;
+    if ( !(indirect = intMalloc_dist(ldt)) )
+	ABORT("Malloc fails for indirect[].");
+    k = CEILING( nsupers, Pr ); /* Number of local block rows */
+    if ( !(iuip = intMalloc_dist(k)) )
+	ABORT("Malloc fails for iuip[].");
+    if ( !(ruip = intMalloc_dist(k)) )
+	ABORT("Malloc fails for ruip[].");
+
+
+    /* ---------------------------------------------------------------
+       Handle the first block column separately to start the pipeline.
+       --------------------------------------------------------------- */
+    if ( mycol == 0 ) {
+	pzgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info);
+
+	scp = &grid->rscp; /* The scope of process row. */
+
+	/* Process column *kcol* multicasts numeric values of L(:,k) 
+	   to process rows. */
+	lsub = Lrowind_bc_ptr[0];
+	lusup = Lnzval_bc_ptr[0];
+	if ( lsub ) {
+	    msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR;
+	    msgcnt[1] = lsub[1] * SuperSize( 0 );
+	} else {
+	    msgcnt[0] = msgcnt[1] = 0;
+	}
+	
+	for (pj = 0; pj < Pc; ++pj) {
+	    if ( ToSendR[0][pj] != EMPTY ) {
+#if ( PROFlevel>=1 )
+		TIC(t1);
+#endif
+		MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm,
+			  &send_req[pj] );
+		MPI_Isend( lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj, 1, scp->comm,
+			  &send_req[pj+Pc] );
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
+		       iam, 0, msgcnt[0], msgcnt[1], pj);
+#endif
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[COMM] += t2;
+		msg_cnt += 2;
+		msg_vol += msgcnt[0]*iword + msgcnt[1]*zword;
+#endif
+	    }
+	} /* for pj ... */
+    } else { /* Post immediate receives. */
+	if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */
+	    scp = &grid->rscp; /* The scope of process row. */
+	    MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0,
+		      0, scp->comm, &recv_req[0] );
+	    MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, 0,
+		      1, scp->comm, &recv_req[1] );
+#if ( DEBUGlevel>=2 )
+	    printf("(%d) Post Irecv L(:,%4d)\n", iam, 0);
+#endif
+	}
+    } /* if mycol == 0 */
+
+    /* ------------------------------------------
+       MAIN LOOP: Loop through all block columns.
+       ------------------------------------------ */
+    for (k = 0; k < nsupers; ++k) {
+
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+
+	if ( mycol == kcol ) {
+	    lk = LBj( k, grid ); /* Local block number. */
+
+	    for (pj = 0; pj < Pc; ++pj) {
+                /* Wait for Isend to complete before using lsub/lusup. */
+		if ( ToSendR[lk][pj] != EMPTY ) {
+		    MPI_Wait( &send_req[pj], &status );
+		    MPI_Wait( &send_req[pj+Pc], &status );
+		}
+	    }
+	    lsub = Lrowind_bc_ptr[lk];
+	    lusup = Lnzval_bc_ptr[lk];
+	} else {
+	    if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */
+		scp = &grid->rscp; /* The scope of process row. */
+#if ( PROFlevel>=1 )
+		TIC(t1);
+#endif
+		/*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, 
+		  Llu->bufmax[0]);*/
+		/*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, 
+			 (4*k)%NTAGS, scp->comm, &status );*/
+		MPI_Wait( &recv_req[0], &status );
+		MPI_Get_count( &status, mpi_int_t, &msgcnt[0] );
+		/*probe_recv(iam, kcol, (4*k+1)%NTAGS, SuperLU_MPI_DOUBLE_COMPLEX, scp->comm, 
+		  Llu->bufmax[1]);*/
+		/*MPI_Recv( Lval_buf, Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, kcol, 
+			 (4*k+1)%NTAGS, scp->comm, &status );*/
+		MPI_Wait( &recv_req[1], &status );
+		MPI_Get_count( &status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[1] );
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[COMM] += t2;
+#endif
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n",
+		       iam, k, msgcnt[0], msgcnt[1], kcol);
+		fflush(stdout);
+#endif
+		lsub = Lsub_buf_2[k%2];
+		lusup = Lval_buf_2[k%2];
+#if ( PRNTlevel==3 )
+		++total_msg;
+		if ( !msgcnt[0] ) ++zero_msg;
+#endif
+	    } else msgcnt[0] = 0;
+	} /* if mycol = Pc(k) */
+
+	scp = &grid->cscp; /* The scope of process column. */
+
+	if ( myrow == krow ) {
+	    /* Parallel triangular solve across process row *krow* --
+	       U(k,j) = L(k,k) \ A(k,j).  */
+#ifdef _CRAY
+	    pzgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3);
+#else
+	    pzgstrs2(n, k, Glu_persist, grid, Llu, stat);
+#endif
+
+	    /* Multicasts U(k,:) to process columns. */
+	    lk = LBi( k, grid );
+	    usub = Ufstnz_br_ptr[lk];
+	    uval = Unzval_br_ptr[lk];
+	    if ( usub )	{
+		msgcnt[2] = usub[2];
+		msgcnt[3] = usub[1];
+	    } else {
+		msgcnt[2] = msgcnt[3] = 0;
+	    }
+
+	    if ( ToSendD[lk] == YES ) {
+		for (pi = 0; pi < Pr; ++pi) {
+		    if ( pi != myrow ) {
+#if ( PROFlevel>=1 )
+			TIC(t1);
+#endif
+			MPI_Send( usub, msgcnt[2], mpi_int_t, pi,
+				 (4*k+2)%NTAGS, scp->comm);
+			MPI_Send( uval, msgcnt[3], SuperLU_MPI_DOUBLE_COMPLEX, pi,
+				 (4*k+3)%NTAGS, scp->comm);
+#if ( PROFlevel>=1 )
+			TOC(t2, t1);
+			stat->utime[COMM] += t2;
+			msg_cnt += 2;
+			msg_vol += msgcnt[2]*iword + msgcnt[3]*zword;
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi);
+#endif
+		    } /* if pi ... */
+		} /* for pi ... */
+	    } /* if ToSendD ... */
+	} else { /* myrow != krow */
+	    if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */
+#if ( PROFlevel>=1 )
+		TIC(t1);
+#endif
+		/*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, 
+		  Llu->bufmax[2]);*/
+		MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+			 (4*k+2)%NTAGS, scp->comm, &status );
+		MPI_Get_count( &status, mpi_int_t, &msgcnt[2] );
+		/*probe_recv(iam, krow, (4*k+3)%NTAGS, SuperLU_MPI_DOUBLE_COMPLEX, scp->comm, 
+		  Llu->bufmax[3]);*/
+		MPI_Recv( Uval_buf, Llu->bufmax[3], SuperLU_MPI_DOUBLE_COMPLEX, krow, 
+			 (4*k+3)%NTAGS, scp->comm, &status );
+		MPI_Get_count( &status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[3] );
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[COMM] += t2;
+#endif
+		usub = Usub_buf;
+		uval = Uval_buf;
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
+#endif
+#if ( PRNTlevel==3 )
+		++total_msg;
+		if ( !msgcnt[2] ) ++zero_msg;
+#endif
+	    } else msgcnt[2] = 0;
+	} /* if myrow == Pr(k) */
+	  
+	/* 
+	 * Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
+	 *  for (j = k+1; k < N; ++k) {
+	 *     for (i = k+1; i < N; ++i) 
+	 *         if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+	 *              && L(i,k) != 0 && U(k,j) != 0 )
+	 *             A(i,j) = A(i,j) - L(i,k) * U(k,j);
+	 */
+	msg0 = msgcnt[0];
+	msg2 = msgcnt[2];
+	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+	    nsupr = lsub[1]; /* LDA of lusup. */
+	    if ( myrow == krow ) { /* Skip diagonal block L(k,k). */
+		lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1];
+		luptr0 = knsupc;
+		nlb = lsub[0] - 1;
+	    } else {
+		lptr0 = BC_HEADER;
+		luptr0 = 0;
+		nlb = lsub[0];
+	    }
+	    lptr = lptr0;
+	    for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */
+		ib = lsub[lptr];
+		lib = LBi( ib, grid );
+		iuip[lib] = BR_HEADER;
+		ruip[lib] = 0;
+		lptr += LB_DESCRIPTOR + lsub[lptr+1];
+	    }
+	    nub = usub[0];    /* Number of blocks in the block row U(k,:) */
+	    iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */
+	    rukp = 0;         /* Pointer to nzval[] of U(k,:) */
+	    klst = FstBlockC( k+1 );
+	    
+	    /* ---------------------------------------------------
+	       Update the first block column A(:,k+1).
+	       --------------------------------------------------- */
+	    jb = usub[iukp];   /* Global block number of block U(k,j). */
+	    if ( jb == k+1 ) { /* First update (k+1)-th block. */
+		--nub;
+		lptr = lptr0;
+		luptr = luptr0;
+		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
+		nsupc = SuperSize( jb );
+		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+
+		/* Prepare to call DGEMM. */
+		jj = iukp;
+		while ( usub[jj] == klst ) ++jj;
+		ldu = klst - usub[jj++];
+		ncols = 1;
+		full = 1;
+		for (; jj < iukp+nsupc; ++jj) {
+		    segsize = klst - usub[jj];
+		    if ( segsize ) {
+		        ++ncols;
+			if ( segsize != ldu ) full = 0;
+		        if ( segsize > ldu ) ldu = segsize;
+		    }
+		}
+#if ( DEBUGlevel>=3 )
+		++num_update;
+#endif
+		if ( full ) {
+		    tempu = &uval[rukp];
+		} else { /* Copy block U(k,j) into tempU2d. */
+#if ( DEBUGlevel>=3 )
+		  printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+			 iam, full, k, jb, ldu, ncols, nsupc);
+		  ++num_copy;
+#endif
+		    tempu = tempU2d;
+		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
+		        segsize = klst - usub[jj];
+			if ( segsize ) {
+			    lead_zero = ldu - segsize;
+			    for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+			    tempu += lead_zero;
+			    for (i = 0; i < segsize; ++i)
+				tempu[i] = uval[rukp+i];
+			    rukp += segsize;
+			    tempu += segsize;
+			}
+		    }
+		    tempu = tempU2d;
+		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+		} /* if full ... */
+
+		for (lb = 0; lb < nlb; ++lb) { 
+		    ib = lsub[lptr]; /* Row block L(i,k). */
+		    nbrow = lsub[lptr+1];  /* Number of full rows. */
+		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+		    tempv = tempv2d;
+#ifdef _CRAY
+		    CGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
+			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			  tempu, &ldu, &beta, tempv, &ldt);
+#elif defined (USE_VENDOR_BLAS)
+		    zgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			   tempu, &ldu, &beta, tempv, &ldt, 1, 1);
+#else
+		    zgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			   tempu, &ldu, &beta, tempv, &ldt);
+#endif
+		    stat->ops[FACT] += 8 * nbrow * ldu * ncols;
+
+		    /* Now gather the result into the destination block. */
+		    if ( ib < jb ) { /* A(i,j) is in U. */
+			ilst = FstBlockC( ib+1 );
+			lib = LBi( ib, grid );
+			index = Ufstnz_br_ptr[lib];
+			ijb = index[iuip[lib]];
+			while ( ijb < jb ) { /* Search for dest block. */
+			    ruip[lib] += index[iuip[lib]+1];
+			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
+			    ijb = index[iuip[lib]];
+			}
+			iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */
+
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    fnz = index[iuip[lib]++];
+			    if ( segsize ) { /* Nonzero segment in U(k.j). */
+				ucol = &Unzval_br_ptr[lib][ruip[lib]];
+				for (i = 0, it = 0; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    z_sub(&ucol[rel], &ucol[rel], &tempv[it]);
+				    ++it;
+				}
+				tempv += ldt;
+			    }
+			    ruip[lib] += ilst - fnz;
+			}
+		    } else { /* A(i,j) is in L. */
+			index = Lrowind_bc_ptr[ljb];
+			ldv = index[1];   /* LDA of the dest lusup. */
+			lptrj = BC_HEADER;
+			luptrj = 0;
+			ijb = index[lptrj];
+			while ( ijb != ib ) { /* Search for dest block -- 
+						 blocks are not ordered! */
+			    luptrj += index[lptrj+1];
+			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
+			    ijb = index[lptrj];
+			}
+			/*
+			 * Build indirect table. This is needed because the
+			 * indices are not sorted.
+			 */
+			fnz = FstBlockC( ib );
+			lptrj += LB_DESCRIPTOR;
+			for (i = 0; i < index[lptrj-1]; ++i) {
+			    rel = index[lptrj + i] - fnz;
+			    indirect[rel] = i;
+			}
+			nzval = Lnzval_bc_ptr[ljb] + luptrj;
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    if ( segsize ) {
+/*#pragma _CRI cache_bypass nzval,tempv*/
+				for (it = 0, i = 0; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    z_sub(&nzval[indirect[rel]],
+					  &nzval[indirect[rel]],
+					  &tempv[it]);
+				    ++it;
+				}
+				tempv += ldt;
+			    }
+			    nzval += ldv;
+			}
+		    } /* if ib < jb ... */
+		    lptr += nbrow;
+		    luptr += nbrow;
+		} /* for lb ... */
+		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
+		iukp += nsupc;
+	    }  /* if jb == k+1 */
+	} /* if L(:,k) and U(k,:) not empty */
+
+
+	if ( k+1 < nsupers ) {
+	  kcol = PCOL( k+1, grid );
+	  if ( mycol == kcol ) {
+	    /* Factor diagonal and subdiagonal blocks and test for exact
+	       singularity.  */
+	    pzgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info);
+
+	    /* Process column *kcol+1* multicasts numeric values of L(:,k+1) 
+	       to process rows. */
+	    lk = LBj( k+1, grid ); /* Local block number. */
+	    lsub1 = Lrowind_bc_ptr[lk];
+ 	    if ( lsub1 ) {
+		msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR;
+		msgcnt[1] = lsub1[1] * SuperSize( k+1 );
+	    } else {
+		msgcnt[0] = 0;
+		msgcnt[1] = 0;
+	    }
+	    scp = &grid->rscp; /* The scope of process row. */
+	    for (pj = 0; pj < Pc; ++pj) {
+		if ( ToSendR[lk][pj] != EMPTY ) {
+		    lusup1 = Lnzval_bc_ptr[lk];
+#if ( PROFlevel>=1 )
+		    TIC(t1);
+#endif
+		    MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj,
+			      (4*(k+1))%NTAGS, scp->comm, &send_req[pj] );
+		    MPI_Isend( lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
+			     (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] );
+#if ( PROFlevel>=1 )
+		    TOC(t2, t1);
+		    stat->utime[COMM] += t2;
+		    msg_cnt += 2;
+		    msg_vol += msgcnt[0]*iword + msgcnt[1]*zword;
+#endif
+#if ( DEBUGlevel>=2 )
+		    printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
+			   iam, k+1, msgcnt[0], msgcnt[1], pj);
+#endif
+		}
+	    } /* for pj ... */
+	  } else { /* Post Recv of block column L(:,k+1). */
+	    if ( ToRecv[k+1] >= 1 ) {
+		scp = &grid->rscp; /* The scope of process row. */
+		MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol,
+			  (4*(k+1))%NTAGS, scp->comm, &recv_req[0]);
+		MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, kcol, 
+			  (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]);
+#if ( DEBUGlevel>=2 )
+		printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1);
+#endif
+	    }
+	  } /* if mycol == Pc(k+1) */
+        } /* if k+1 < nsupers */
+
+	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+	    /* ---------------------------------------------------
+	       Update all other blocks using block row U(k,:)
+	       --------------------------------------------------- */
+	    for (j = 0; j < nub; ++j) { 
+		lptr = lptr0;
+		luptr = luptr0;
+		jb = usub[iukp];  /* Global block number of block U(k,j). */
+		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
+		nsupc = SuperSize( jb );
+		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+
+		/* Prepare to call DGEMM. */
+		jj = iukp;
+		while ( usub[jj] == klst ) ++jj;
+		ldu = klst - usub[jj++];
+		ncols = 1;
+		full = 1;
+		for (; jj < iukp+nsupc; ++jj) {
+		    segsize = klst - usub[jj];
+		    if ( segsize ) {
+		        ++ncols;
+			if ( segsize != ldu ) full = 0;
+		        if ( segsize > ldu ) ldu = segsize;
+		    }
+		}
+#if ( DEBUGlevel>=3 )
+		printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+		       iam, full, k, jb, ldu, ncols, nsupc);
+		++num_update;
+#endif
+		if ( full ) {
+		    tempu = &uval[rukp];
+		} else { /* Copy block U(k,j) into tempU2d. */
+#if ( DEBUGlevel>=3 )
+		    ++num_copy;
+#endif
+		    tempu = tempU2d;
+		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
+		        segsize = klst - usub[jj];
+			if ( segsize ) {
+			    lead_zero = ldu - segsize;
+			    for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+			    tempu += lead_zero;
+			    for (i = 0; i < segsize; ++i)
+			        tempu[i] = uval[rukp+i];
+			    rukp += segsize;
+			    tempu += segsize;
+			}
+		    }
+		    tempu = tempU2d;
+		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+		} /* if full ... */
+
+		for (lb = 0; lb < nlb; ++lb) { 
+		    ib = lsub[lptr];       /* Row block L(i,k). */
+		    nbrow = lsub[lptr+1];  /* Number of full rows. */
+		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+		    tempv = tempv2d;
+#ifdef _CRAY
+		    CGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
+			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			  tempu, &ldu, &beta, tempv, &ldt);
+#elif defined (USE_VENDOR_BLAS)
+		    zgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			   tempu, &ldu, &beta, tempv, &ldt, 1, 1);
+#else
+		    zgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
+			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
+			   tempu, &ldu, &beta, tempv, &ldt);
+#endif
+		    stat->ops[FACT] += 8 * nbrow * ldu * ncols;
+
+		    /* Now gather the result into the destination block. */
+		    if ( ib < jb ) { /* A(i,j) is in U. */
+			ilst = FstBlockC( ib+1 );
+			lib = LBi( ib, grid );
+			index = Ufstnz_br_ptr[lib];
+			ijb = index[iuip[lib]];
+			while ( ijb < jb ) { /* Search for dest block. */
+			    ruip[lib] += index[iuip[lib]+1];
+			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
+			    ijb = index[iuip[lib]];
+			}
+			/* Skip descriptor.  Now point to fstnz index of 
+			   block U(i,j). */
+			iuip[lib] += UB_DESCRIPTOR;
+
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    fnz = index[iuip[lib]++];
+			    if ( segsize ) { /* Nonzero segment in U(k.j). */
+				ucol = &Unzval_br_ptr[lib][ruip[lib]];
+				for (i = 0 ; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    z_sub(&ucol[rel], &ucol[rel], &tempv[i]);
+				}
+				tempv += ldt;
+			    }
+			    ruip[lib] += ilst - fnz;
+			}
+		    } else { /* A(i,j) is in L. */
+			index = Lrowind_bc_ptr[ljb];
+			ldv = index[1];   /* LDA of the dest lusup. */
+			lptrj = BC_HEADER;
+			luptrj = 0;
+			ijb = index[lptrj];
+			while ( ijb != ib ) { /* Search for dest block -- 
+						 blocks are not ordered! */
+			    luptrj += index[lptrj+1];
+			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
+			    ijb = index[lptrj];
+			}
+			/*
+			 * Build indirect table. This is needed because the
+			 * indices are not sorted for the L blocks.
+			 */
+			fnz = FstBlockC( ib );
+			lptrj += LB_DESCRIPTOR;
+			for (i = 0; i < index[lptrj-1]; ++i) {
+			    rel = index[lptrj + i] - fnz;
+			    indirect[rel] = i;
+			}
+			nzval = Lnzval_bc_ptr[ljb] + luptrj;
+			tempv = tempv2d;
+			for (jj = 0; jj < nsupc; ++jj) {
+			    segsize = klst - usub[iukp + jj];
+			    if ( segsize ) {
+/*#pragma _CRI cache_bypass nzval,tempv*/
+				for (i = 0; i < nbrow; ++i) {
+				    rel = lsub[lptr + i] - fnz;
+				    z_sub(&nzval[indirect[rel]], 
+					  &nzval[indirect[rel]],
+					  &tempv[i]);
+				}
+				tempv += ldt;
+			    }
+			    nzval += ldv;
+			}
+		    } /* if ib < jb ... */
+		    lptr += nbrow;
+		    luptr += nbrow;
+		} /* for lb ... */
+		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
+		iukp += nsupc;
+	    } /* for j ... */
+	} /* if  k L(:,k) and U(k,:) are not empty */
+
+    } 
+    /* ------------------------------------------
+       END MAIN LOOP: for k = ...
+       ------------------------------------------ */
+
+
+    if ( Pr*Pc > 1 ) {
+	SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */
+	SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */
+	if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf);
+	if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf);
+	SUPERLU_FREE(send_req);
+    }
+
+    SUPERLU_FREE(Llu->ujrow);
+    SUPERLU_FREE(tempv2d);
+    SUPERLU_FREE(indirect);
+    SUPERLU_FREE(iuip);
+    SUPERLU_FREE(ruip);
+
+    /* Prepare error message. */
+    if ( *info == 0 ) *info = n + 1;
+#if ( PROFlevel>=1 )
+    TIC(t1);
+#endif
+    MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm );
+#if ( PROFlevel>=1 )
+    TOC(t2, t1);
+    stat->utime[COMM] += t2;
+    {
+	float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
+	
+	MPI_Reduce( &msg_cnt, &msg_cnt_sum,
+		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	MPI_Reduce( &msg_cnt, &msg_cnt_max,
+		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+	MPI_Reduce( &msg_vol, &msg_vol_sum,
+		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+	MPI_Reduce( &msg_vol, &msg_vol_max,
+		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+	if ( !iam ) {
+	    printf("\tPZGSTRF comm stat:"
+		   "\tAvg\tMax\t\tAvg\tMax\n"
+		   "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
+		   msg_cnt_sum/Pr/Pc, msg_cnt_max,
+		   msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6);
+	}
+    }
+#endif
+    if ( iinfo == n + 1 ) *info = 0;
+    else *info = iinfo;
+
+
+#if ( PRNTlevel==3 )
+    MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
+    if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo);
+    MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
+    if ( !iam ) printf(".. # total msg\t%d\n", iinfo);
+#endif
+
+#if ( DEBUGlevel>=2 )
+    for (i = 0; i < Pr * Pc; ++i) {
+	if ( iam == i ) {
+	    zPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
+	    zPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
+	    printf("(%d)\n", iam);
+	    PrintInt10("Recv", nsupers, Llu->ToRecv);
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif
+
+#if ( DEBUGlevel>=3 )
+    printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
+#endif
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pzgstrf()");
+#endif
+} /* PZGSTRF */
+
+
+/************************************************************************/
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the process column that owns block column *k* participates
+ *   in the work.
+ * 
+ * Arguments
+ * =========
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * </pre>
+ */
+static void pzgstrf2
+/************************************************************************/
+(
+ superlu_options_t *options, /* only options->ReplaceTinyPivot is read here */
+ int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+ LocalLU_t *Llu, SuperLUStat_t *stat, int* info
+ )
+{
+    int    c, iam, l, pkk;
+    int    incx = 1, incy = 1;
+    int    nsupr; /* number of rows in the block (LDA); left unset when the local row-index array is NULL */
+    int    luptr;
+    int_t  i, krow, j, jfst, jlst;
+    int_t  nsupc; /* number of columns in the block */
+    int_t  *xsup = Glu_persist->xsup; /* presumably read by the FstBlockC()/SuperSize() macros -- confirm */
+    doublecomplex *lusup, temp;
+    doublecomplex *ujrow;
+    doublecomplex one = {1.0, 0.0}, alpha = {-1.0, 0.0};
+    *info = 0; /* reset on every call; set to a 1-based column number below if a zero pivot is seen */
+
+    /* Quick return: none is taken; every caller runs the broadcast loop below. */
+
+    /* Initialization. */
+    iam   = grid->iam;
+    krow  = PROW( k, grid ); /* root rank used for the column-scope broadcast */
+    pkk   = PNUM( PROW(k, grid), PCOL(k, grid), grid ); /* rank of the diagonal process */
+    j     = LBj( k, grid ); /* Local block number */
+    jfst  = FstBlockC( k );
+    jlst  = FstBlockC( k+1 );
+    lusup = Llu->Lnzval_bc_ptr[j]; /* NULL when this process holds no part of the block column */
+    nsupc = SuperSize( k );
+    if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1];
+    ujrow = Llu->ujrow; /* scratch buffer for the pivot row that is broadcast each step */
+
+    luptr = 0; /* Point to the diagonal entries. */
+    c = nsupc; /* number of columns still to be eliminated (length of ujrow) */
+    for (j = 0; j < jlst - jfst; ++j) {
+	/* Broadcast the j-th row (nsupc - j) elements to
+	   the process column. */
+	if ( iam == pkk ) { /* Diagonal process. */
+	    i = luptr;
+	    if ( options->ReplaceTinyPivot == YES ) {
+		if ( z_abs1(&lusup[i]) < thresh ) { /* Diagonal */
+#if ( PRNTlevel>=2 )
+		    printf("(%d) .. col %d, tiny pivot (%e,%e)  ",
+			   iam, (int) (jfst+j), lusup[i].r, lusup[i].i);
+#endif
+		    /* Keep the replaced diagonal with the same sign. */
+		    if ( lusup[i].r < 0 ) lusup[i].r = -thresh;
+		    else lusup[i].r = thresh;
+		    lusup[i].i = 0.0;
+#if ( PRNTlevel>=2 )
+		    printf("replaced by (%e,%e)\n", lusup[i].r, lusup[i].i);
+#endif
+		    ++(stat->TinyPivots);
+		}
+	    }
+	    for (l = 0; l < c; ++l, i += nsupr)	ujrow[l] = lusup[i]; /* gather row j, stride = LDA */
+	}
+#if 0
+	dbcast_col(ujrow, c, pkk, UjROW, grid, &c);
+#else
+	MPI_Bcast(ujrow, c, SuperLU_MPI_DOUBLE_COMPLEX, krow, (grid->cscp).comm);
+	/*bcast_tree(ujrow, c, SuperLU_MPI_DOUBLE_COMPLEX, krow, (24*k+j)%NTAGS,
+		   grid, COMM_COLUMN, &c);*/
+#endif
+
+#if ( DEBUGlevel>=2 )
+if ( k == 3329 && j == 2 ) {
+	/* Print both components; a struct must not be passed for a single %e. */
+	printf("..(%d) k %d, j %d: %s ujrow[0] (%e,%e)\n",
+	       iam, (int) k, (int) j, ( iam == pkk ) ? "Send" : "Recv",
+	       ujrow[0].r, ujrow[0].i);
+}
+#endif
+
+	if ( !lusup ) { /* Empty block column. */
+	    --c;
+	    if ( ujrow[0].r == 0.0 && ujrow[0].i == 0.0 ) *info = j+jfst+1;
+	    continue; /* still took part in the broadcast above; nothing local to update */
+	}
+
+	/* Test for singularity. */
+	if ( ujrow[0].r == 0.0 && ujrow[0].i == 0.0 ) {
+	    *info = j+jfst+1;
+	} else {
+	    /* Scale the j-th column of the matrix. */
+	    z_div(&temp, &one, &ujrow[0]); /* temp = 1 / pivot */
+	    if ( iam == pkk ) {
+		for (i = luptr+1; i < luptr-j+nsupr; ++i) /* entries below the diagonal of column j */
+		    zz_mult(&lusup[i], &lusup[i], &temp);
+		stat->ops[FACT] += 6*(nsupr-j-1) + 10;
+	    } else {
+		for (i = luptr; i < luptr+nsupr; ++i) /* the whole local column */
+		    zz_mult(&lusup[i], &lusup[i], &temp);
+		stat->ops[FACT] += 6*nsupr + 10;
+	    }
+	}
+	    
+	/* Rank-1 update of the trailing submatrix. */
+	if ( --c ) {
+	    if ( iam == pkk ) {
+		l = nsupr - j - 1; /* rows strictly below the diagonal */
+#ifdef _CRAY
+		CGERU(&l, &c, &alpha, &lusup[luptr+1], &incx,
+		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
+#else
+		zgeru_(&l, &c, &alpha, &lusup[luptr+1], &incx,
+		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
+#endif
+		stat->ops[FACT] += 8 * l * c;
+	    } else {
+#ifdef _CRAY
+		CGERU(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
+		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
+#else
+		zgeru_(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
+		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
+#endif
+		stat->ops[FACT] += 8 * nsupr * c;
+	    }
+	}
+	
+	/* Move to the next column. */
+	if ( iam == pkk ) luptr += nsupr + 1; /* next diagonal entry */
+	else luptr += nsupr;                  /* top of the next column */
+
+    } /* for j ... */
+
+} /* PZGSTRF2 */
+
+
+/************************************************************************/
+static void pzgstrs2
+/************************************************************************/
+#ifdef _CRAY
+(
+ int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+ LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3
+ )
+#else
+(
+ int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+ LocalLU_t *Llu, SuperLUStat_t *stat
+ )
+#endif
+/* 
+ * Purpose
+ * =======
+ *   Perform parallel triangular solves
+ *           U(k,:) := L(k,k) \ A(k,:)   (unit lower triangular solve, in place).
+ *   Only the process row that owns block row *k* participates
+ *   in the work.
+ * 
+ * Arguments
+ * =========
+ *
+ * m      (input) int (global)
+ *        Number of rows in the matrix. (Not referenced in this routine.)
+ *
+ * k      (input) int (global)
+ *        The row number of the block row to be factorized.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization; 
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ */
+{
+    int    iam, pkk;
+    int    incx = 1;
+    int    nsupr; /* number of rows in the block L(:,k) (LDA) */
+    int    segsize;
+    int_t  nsupc; /* number of columns in the block */
+    int_t  luptr, iukp, rukp;
+    int_t  b, gb, j, klst, knsupc, lk, nb;
+    int_t  *xsup = Glu_persist->xsup; /* presumably read by the FstBlockC()/SuperSize() macros -- confirm */
+    int_t  *usub;
+    doublecomplex *lusup, *uval;
+
+    /* Quick return. */
+    lk = LBi( k, grid ); /* Local block number */
+    if ( !Llu->Unzval_br_ptr[lk] ) return; /* no local values stored for block row k */
+
+    /* Initialization. */
+    iam  = grid->iam;
+    pkk  = PNUM( PROW(k, grid), PCOL(k, grid), grid ); /* rank of the diagonal process */
+    klst = FstBlockC( k+1 );
+    knsupc = SuperSize( k );
+    usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
+    uval = Llu->Unzval_br_ptr[lk];
+    nb = usub[0];      /* number of blocks in this block row */
+    iukp = BR_HEADER;  /* cursor into usub[], just past the header */
+    rukp = 0;          /* cursor into uval[] */
+    if ( iam == pkk ) { /* diagonal process: L(k,k) is in the local L storage */
+	lk = LBj( k, grid );
+	nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
+	lusup = Llu->Lnzval_bc_ptr[lk];
+    } else { /* otherwise use the k%2 buffers (presumably filled by the receives in pzgstrf -- confirm) */
+	nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */
+	lusup = Llu->Lval_buf_2[k%2];
+    }
+
+    /* Loop through all the blocks in block row U(k,:). */
+    for (b = 0; b < nb; ++b) {
+	gb = usub[iukp];
+	nsupc = SuperSize( gb );
+	iukp += UB_DESCRIPTOR;
+
+	/* Loop through all the segments in the block. */
+	for (j = 0; j < nsupc; ++j) {
+	    segsize = klst - usub[iukp++]; 
+	    if ( segsize ) { /* Nonzero segment. */
+		luptr = (knsupc - segsize) * (nsupr + 1); /* diagonal entry (knsupc-segsize) of the LDA=nsupr block */
+#ifdef _CRAY
+		CTRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, 
+		      &uval[rukp], &incx);
+#elif defined (USE_VENDOR_BLAS)
+		ztrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, 
+		       &uval[rukp], &incx, 1, 1, 1);
+#else
+		ztrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, 
+		       &uval[rukp], &incx);
+#endif
+		stat->ops[FACT] += 4 * segsize * (segsize + 1)
+		    + 10 * segsize;  /* complex division */
+		rukp += segsize; /* segments are stored back to back in uval[] */
+	    }
+	}
+    } /* for b ... */
+
+} /* PZGSTRS2 */
+
+static int /* Always 0; the process exits if the pending message would overflow. */
+probe_recv(int iam, int source, int tag, MPI_Datatype datatype, MPI_Comm comm,
+	   int buf_size)
+{
+    MPI_Status status;
+    int count; /* Number of `datatype` elements in the pending message. */
+    /* Check, without receiving, that the next matching message fits in buf_size elements. */
+    MPI_Probe( source, tag, comm, &status ); /* Blocks until a matching message is pending. */
+    MPI_Get_count( &status, datatype, &count );
+    if ( count > buf_size ) {
+        printf("(%d) Recv'ed count %d > buffer size %d\n",
+	       iam, count, buf_size);
+	exit(-1);
+    }
+    return 0;
+}
diff --git a/SRC/pzgstrs.c b/SRC/pzgstrs.c
new file mode 100644
index 0000000..5a11f84
--- /dev/null
+++ b/SRC/pzgstrs.c
@@ -0,0 +1,1350 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Solves a system of distributed linear equations A*X = B with a
+ * general N-by-N matrix A using the LU factors computed previously.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * </pre>
+ */
+
+#include "superlu_zdefs.h"
+
+/*
+ * Sketch of the algorithm for L-solve:
+ * =======================
+ *
+ * Self-scheduling loop:
+ *
+ *   while ( not finished ) { .. use message counter to control
+ *
+ *      receive a message;
+ * 	
+ * 	if ( message is Xk ) {
+ * 	    perform local block modifications into lsum[];
+ *                 lsum[i] -= L_i,k * X[k]
+ *          if all local updates done, Isend lsum[] to diagonal process;
+ *
+ *      } else if ( message is LSUM ) { .. this must be a diagonal process 
+ *          accumulate LSUM;
+ *          if ( all LSUM are received ) {
+ *              perform triangular solve for Xi;
+ *              Isend Xi down to the current process column;
+ *              perform local block modifications into lsum[];
+ *          }
+ *      }
+ *   }
+ *
+ * 
+ * Auxiliary data structures: lsum[] / ilsum (pointer to lsum array)
+ * =======================
+ *
+ * lsum[] array (local)
+ *   + lsum has "nrhs" columns, row-wise is partitioned by supernodes
+ *   + stored by row blocks, column wise storage within a row block
+ *   + prepend a header recording the global block number.
+ *
+ *         lsum[]                        ilsum[nsupers + 1]
+ *
+ *         -----
+ *         | | |  <- header of size 2     ---
+ *         --------- <--------------------| |
+ *         | | | | |			  ---
+ * 	   | | | | |	      |-----------| |		
+ *         | | | | | 	      |           ---
+ *	   ---------          |   |-------| |
+ *         | | |  <- header   |   |       ---
+ *         --------- <--------|   |  |----| |
+ *         | | | | |		  |  |    ---
+ * 	   | | | | |              |  |
+ *         | | | | |              |  |
+ *	   ---------              |  |
+ *         | | |  <- header       |  |
+ *         --------- <------------|  |
+ *         | | | | |                 |
+ * 	   | | | | |                 |
+ *         | | | | |                 |
+ *	   --------- <---------------|
+ */
+  
+/*#define ISEND_IRECV*/
+
+/*
+ * Function prototypes
+ */
+#ifdef _CRAY
+fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
+		   doublecomplex*, int*, doublecomplex*, int*);
+_fcd ftcs1;
+_fcd ftcs2;
+_fcd ftcs3;
+#endif
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Re-distribute B on the diagonal processes of the 2D process mesh.
+ * 
+ * Note
+ * ====
+ *   This routine can only be called after the routine pxgstrs_init(),
+ *   in which the structures of the send and receive buffers are set up.
+ *
+ * Arguments
+ * =========
+ * 
+ * B      (input) doublecomplex*
+ *        The distributed right-hand side matrix of the possibly
+ *        equilibrated system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * ldb    (input) int (local)
+ *        Leading dimension of matrix B.
+ *
+ * fst_row (input) int (global)
+ *        The row number of B's first row in the global matrix.
+ *
+ * ilsum  (input) int* (global)
+ *        Starting position of each supernode in a full array.
+ *
+ * x      (output) doublecomplex*
+ *        The solution vector. It is valid only on the diagonal processes.
+ *
+ * ScalePermstruct (input) ScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * SOLVEstruct (input) SOLVEstruct_t*
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * Return value
+ * ============
+ * </pre>
+ */
+
+int_t
+pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb,
+                      int_t fst_row, int_t *ilsum, doublecomplex *x,
+		      ScalePermstruct_t *ScalePermstruct,
+		      Glu_persist_t *Glu_persist, /* supplies xsup and supno */
+		      gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct)
+{
+    int  *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
+    int  *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
+    int  *ptr_to_ibuf, *ptr_to_dbuf;
+    int_t  *perm_r, *perm_c; /* row and column permutation vectors */
+    int_t  *send_ibuf, *recv_ibuf;
+    doublecomplex *send_dbuf, *recv_dbuf;
+    int_t  *xsup, *supno;
+    int_t  i, ii, irow, gbi, j, jj, k, knsupc, l, lk;
+    int    p, procs;
+    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pzReDistribute_B_to_X()");
+#endif
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    perm_r = ScalePermstruct->perm_r;
+    perm_c = ScalePermstruct->perm_c;
+    procs = grid->nprow * grid->npcol;
+    xsup = Glu_persist->xsup;   /* presumably read by the SuperSize()/FstBlockC() macros -- confirm */
+    supno = Glu_persist->supno; /* presumably read by the BlockNum() macro -- confirm */
+    SendCnt      = gstrs_comm->B_to_X_SendCnt; /* the eight arrays below are consecutive */
+    SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt +   procs; /* procs-long slices of this one array */
+    RecvCnt      = gstrs_comm->B_to_X_SendCnt + 2*procs;
+    RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs;
+    sdispls      = gstrs_comm->B_to_X_SendCnt + 4*procs;
+    sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs;
+    rdispls      = gstrs_comm->B_to_X_SendCnt + 6*procs;
+    rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs;
+    ptr_to_ibuf  = gstrs_comm->ptr_to_ibuf;
+    ptr_to_dbuf  = gstrs_comm->ptr_to_dbuf;
+
+    /* ------------------------------------------------------------
+       NOW COMMUNICATE THE ACTUAL DATA.
+       ------------------------------------------------------------*/
+    k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
+    l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
+    if ( !(send_ibuf = intMalloc_dist(k + l)) )
+        ABORT("Malloc fails for send_ibuf[].");
+    recv_ibuf = send_ibuf + k; /* receive half of the same allocation */
+    if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)* (size_t)nrhs)) )
+        ABORT("Malloc fails for send_dbuf[].");
+    recv_dbuf = send_dbuf + k * nrhs; /* receive half of the same allocation */
+    /* Start each destination's write cursor at its send displacement. */
+    for (p = 0; p < procs; ++p) {
+        ptr_to_ibuf[p] = sdispls[p];
+        ptr_to_dbuf[p] = sdispls[p] * nrhs;
+    }
+
+    /* Copy the row indices and values to the send buffer. */
+    for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { /* i: local row, l: global row */
+        irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */
+	gbi = BlockNum( irow );
+	p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */
+	k = ptr_to_ibuf[p];
+	send_ibuf[k] = irow;
+	k = ptr_to_dbuf[p];
+	RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
+	    send_dbuf[k++] = B[i + j*ldb]; /* B itself is indexed column-major with leading dimension ldb */
+	}
+	++ptr_to_ibuf[p];
+	ptr_to_dbuf[p] += nrhs;
+    }
+
+    /* Communicate the (permuted) row indices. */
+    MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
+		  recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
+
+    /* Communicate the numerical values. */
+    MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
+		  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
+		  grid->comm);
+    
+    /* ------------------------------------------------------------
+       Copy buffer into X on the diagonal processes.
+       ------------------------------------------------------------*/
+    ii = 0; /* NOTE(review): never reset to rdispls[p]; relies on contiguous receive displacements -- confirm */
+    for (p = 0; p < procs; ++p) {
+        jj = rdispls_nrhs[p]; /* first value received from process p */
+        for (i = 0; i < RecvCnt[p]; ++i) {
+	    /* Only the diagonal processes do this; the off-diagonal processes
+	       have 0 RecvCnt. */
+	    irow = recv_ibuf[ii]; /* The permuted row index. */
+	    k = BlockNum( irow );
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );  /* Local block number. */
+	    l = X_BLK( lk );
+            x[l - XK_H].r = k; /* Block number prepended in the header. */
+            x[l - XK_H].i = 0;
+	    irow = irow - FstBlockC(k); /* Relative row number in X-block */
+	    RHS_ITERATE(j) {
+	        x[l + irow + j*knsupc] = recv_dbuf[jj++]; /* column j of the X block, stride knsupc */
+	    }
+	    ++ii;
+	}
+    }
+
+    SUPERLU_FREE(send_ibuf); /* also releases recv_ibuf (same allocation) */
+    SUPERLU_FREE(send_dbuf); /* also releases recv_dbuf (same allocation) */
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Exit pzReDistribute_B_to_X()");
+#endif
+    return 0; /* the only return value */
+} /* pzReDistribute_B_to_X */
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Re-distribute X on the diagonal processes to B distributed on all
+ *   the processes.
+ *
+ * Arguments
+ * =========
+ *   n, Glu_persist : give xsup[] and the supernode count supno[n-1]+1.
+ *   B, ldb   : (output) entry (r, j) is written to B[r + j*ldb].
+ *   m_loc    : number of row indices read back from the receive buffer.
+ *   fst_row  : subtracted from each received row index to get r.
+ *   nrhs     : number of right-hand sides.
+ *   x        : (input) X blocks, leading dimension SuperSize(k) per block.
+ *   ilsum, ScalePermstruct : not named in the body (ilsum is presumably
+ *              read by the X_BLK macro -- TODO confirm).
+ *   grid, SOLVEstruct : process grid and the pxgstrs_init() count tables.
+ *   Returns 0; allocation failure is reported through ABORT().
+ *
+ * Note
+ * ====
+ *   This routine can only be called after the routine pxgstrs_init(),
+ *   in which the structures of the send and receive buffers are set up.
+ * </pre>
+ */
+
+int_t
+pzReDistribute_X_to_B(int_t n, doublecomplex *B, int_t m_loc, int_t ldb, int_t fst_row,
+		      int_t nrhs, doublecomplex *x, int_t *ilsum,
+		      ScalePermstruct_t *ScalePermstruct,
+		      Glu_persist_t *Glu_persist, gridinfo_t *grid,
+		      SOLVEstruct_t *SOLVEstruct)
+{
+    int_t  i, ii, irow, j, jj, k, knsupc, nsupers, l, lk;
+    int_t  *xsup, *supno;
+    int  *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
+    int  *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs;
+    int  *ptr_to_ibuf, *ptr_to_dbuf; /* per-destination write cursors */
+    int_t  *send_ibuf, *recv_ibuf;   /* row indices */
+    doublecomplex *send_dbuf, *recv_dbuf; /* numerical values */
+    int_t  *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */
+    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
+    int  iam, p, q, pkk, procs;
+    int_t  num_diag_procs, *diag_procs;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pzReDistribute_X_to_B()");
+#endif
+    /* ---- Initialization. ---- */
+    xsup = Glu_persist->xsup;
+    supno = Glu_persist->supno;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    iam = grid->iam;
+    procs = grid->nprow * grid->npcol;
+    /* X_to_B_SendCnt[] holds eight consecutive tables of procs entries. */
+    SendCnt      = gstrs_comm->X_to_B_SendCnt;
+    SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt +   procs;
+    RecvCnt      = gstrs_comm->X_to_B_SendCnt + 2*procs;
+    RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs;
+    sdispls      = gstrs_comm->X_to_B_SendCnt + 4*procs;
+    sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs;
+    rdispls      = gstrs_comm->X_to_B_SendCnt + 6*procs;
+    rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs;
+    ptr_to_ibuf  = gstrs_comm->ptr_to_ibuf;
+    ptr_to_dbuf  = gstrs_comm->ptr_to_dbuf;
+    /* Index and value buffers: send part first, receive part right after. */
+    k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
+    l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
+    if ( !(send_ibuf = intMalloc_dist(k + l)) )
+        ABORT("Malloc fails for send_ibuf[].");
+    recv_ibuf = send_ibuf + k;
+    /* Widen the product to size_t, as pzReDistribute_B_to_X does. */
+    if ( !(send_dbuf = doublecomplexMalloc_dist((k + l) * (size_t)nrhs)) )
+        ABORT("Malloc fails for send_dbuf[].");
+    recv_dbuf = send_dbuf + k * nrhs;
+    for (p = 0; p < procs; ++p) { /* Rewind the cursors to each slot's start. */
+        ptr_to_ibuf[p] = sdispls[p];
+        ptr_to_dbuf[p] = sdispls_nrhs[p];
+    }
+    num_diag_procs = SOLVEstruct->num_diag_procs;
+    diag_procs = SOLVEstruct->diag_procs;
+    /* Pack this process's X rows into the slot of each row's owner q. */
+    for (p = 0; p < num_diag_procs; ++p) {  /* For all diagonal processes. */
+	pkk = diag_procs[p];
+	if ( iam == pkk ) {
+	    for (k = p; k < nsupers; k += num_diag_procs) {
+		knsupc = SuperSize( k );
+		lk = LBi( k, grid ); /* Local block number */
+		irow = FstBlockC( k );
+		l = X_BLK( lk );
+		for (i = 0; i < knsupc; ++i) {
+		    ii = irow; /* Sent as-is: X <== Pc'*Y (inv_perm_c) is not applied. */
+		    q = row_to_proc[ii];
+		    jj = ptr_to_ibuf[q];
+		    send_ibuf[jj] = ii;
+		    jj = ptr_to_dbuf[q];
+		    RHS_ITERATE(j) { /* RHS stored in row major in buffer. */
+		        send_dbuf[jj++] = x[l + i + j*knsupc];
+		    }
+		    ++ptr_to_ibuf[q];
+		    ptr_to_dbuf[q] += nrhs;
+		    ++irow;
+		}
+	    }
+	}
+    }
+    /* Communicate the (permuted) row indices and numerical values. */
+    MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
+		  recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
+    MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, 
+		  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
+		  grid->comm);
+    /* Copy the buffer into B: m_loc received rows, nrhs values per row. */
+    for (i = 0, k = 0; i < m_loc; ++i) {
+	irow = recv_ibuf[i];
+	irow -= fst_row; /* Relative row number */
+	RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
+	    B[irow + j*ldb] = recv_dbuf[k++];
+	}
+    }
+
+    SUPERLU_FREE(send_ibuf);
+    SUPERLU_FREE(send_dbuf);
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Exit pzReDistribute_X_to_B()");
+#endif
+    return 0;
+} /* pzReDistribute_X_to_B */
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PZGSTRS solves a system of distributed linear equations
+ * A*X = B with a general N-by-N matrix A using the LU factorization
+ * computed by PZGSTRF.
+ * If equilibration and the row and column permutations were performed,
+ * the LU factorization was performed for A1 where
+ *     A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ * and the linear system solved is
+ *     A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and
+ * the permutation to B1 by Pc*Pr is applied internally in this routine.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from PZGSTRF for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_zdefs.h for the definition of 'LUstruct_t'.
+ *        A may be scaled and permuted into A1, so that
+ *        A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) doublecomplex*
+ *        On entry, the distributed right-hand side matrix of the possibly
+ *        equilibrated system. That is, B may be overwritten by diag(R)*B.
+ *        On exit, the distributed solution matrix Y of the possibly
+ *        equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X,
+ *        and X is the solution of the original system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of matrix B.
+ *
+ * fst_row (input) int (global)
+ *        The row number of B's first row in the global matrix.
+ *
+ * ldb    (input) int (local)
+ *        The leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ * 
+ * SOLVEstruct (input) SOLVEstruct_t* (global)
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * </pre>       
+ */
+
+void
+pzgstrs(int_t n, LUstruct_t *LUstruct, 
+	ScalePermstruct_t *ScalePermstruct,
+	gridinfo_t *grid, doublecomplex *B,
+	int_t m_loc, int_t fst_row, int_t ldb, int nrhs,
+	SOLVEstruct_t *SOLVEstruct,
+	SuperLUStat_t *stat, int *info)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    doublecomplex alpha = {1.0, 0.0};
+    doublecomplex zero = {0.0, 0.0};
+    doublecomplex *lsum;  /* Local running sum of the updates to B-components */
+    doublecomplex *x;     /* X component at step k. */
+		    /* NOTE: x and lsum are of same size. */
+    doublecomplex *lusup, *dest;
+    doublecomplex *recvbuf, *tempv;
+    doublecomplex *rtemp; /* Result of full matrix-vector multiply. */
+    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
+    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+    int_t  kcol, krow, mycol, myrow;
+    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
+    int_t  nb, nlb, nub, nsupers;
+    int_t  *xsup, *supno, *lsub, *usub;
+    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
+    int    Pc, Pr, iam;
+    int    knsupc, nsupr;
+    int    ldalsum;   /* Number of lsum entries locally owned. */
+    int    maxrecvsz, p, pi;
+    int_t  **Lrowind_bc_ptr;
+    doublecomplex **Lnzval_bc_ptr;
+    MPI_Status status;
+    MPI_Request *send_req, recv_req;
+    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
+
+    /*-- Counts used for L-solve --*/
+    int_t  *fmod;         /* Modification count for L-solve --
+                             Count the number of local block products to
+                             be summed into lsum[lk]. */
+    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int_t  *frecv;        /* Count of lsum[lk] contributions to be received
+                             from processes in this row. 
+                             It is only valid on the diagonal processes. */
+    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int_t  nleaf = 0, nroot = 0;
+
+    /*-- Counts used for U-solve --*/
+    int_t  *bmod;         /* Modification count for U-solve. */
+    int_t  **bsendx_plist = Llu->bsendx_plist;
+    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int_t  *brecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
+    double t;
+#if ( DEBUGlevel>=2 )
+    int_t Ublocks = 0;
+#endif
+
+    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+ 
+    t = SuperLU_timer_();
+
+    /* Test input parameters. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( nrhs < 0 ) *info = -9;
+    if ( *info ) {
+	pxerr_dist("PZGSTRS", grid, -*info);
+	return;
+    }
+	
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    xsup = Glu_persist->xsup;
+    supno = Glu_persist->supno;
+    nsupers = supno[n-1] + 1;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pzgstrs()");
+#endif
+
+    stat->ops[SOLVE] = 0.0;
+    Llu->SolveMsgSent = 0;
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PZGSTRS. */
+    if ( !(fmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for fmod[].");
+    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
+    if ( !(frecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for frecv[].");
+    Llu->frecv = frecv;
+
+    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
+    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
+	ABORT("Malloc fails for send_req[].");
+
+#ifdef _CRAY
+    ftcs1 = _cptofcd("L", strlen("L"));
+    ftcs2 = _cptofcd("N", strlen("N"));
+    ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+
+    /* Obtain ilsum[] and ldalsum for process column 0. */
+    ilsum = Llu->ilsum;
+    ldalsum = Llu->ldalsum;
+
+    /* Allocate working storage. */
+    knsupc = sp_ienv_dist(3);
+    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
+    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) )
+	ABORT("Calloc fails for lsum[].");
+    if ( !(x = doublecomplexCalloc_dist(ldalsum * nrhs + nlb * XK_H)) )
+	ABORT("Calloc fails for x[].");
+    if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for recvbuf[].");
+    if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for rtemp[].");
+
+    
+    /*---------------------------------------------------
+     * Forward solve Ly = b.
+     *---------------------------------------------------*/
+    /* Redistribute B into X on the diagonal processes. */
+    pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, 
+			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
+
+    /* Set up the headers in lsum[]. */
+    ii = 0;
+    for (k = 0; k < nsupers; ++k) {
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    lk = LBi( k, grid );   /* Local block number. */
+	    il = LSUM_BLK( lk );
+	    lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/
+	    lsum[il - LSUM_H].i = 0;
+	}
+	ii += knsupc;
+    }
+
+    /*
+     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+#if 1
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid );
+		if ( mycol != kcol && fmod[lk] )
+		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
+	    }
+	}
+	/*PrintInt10("mod_bit", nlb, mod_bit);*/
+	
+#if ( PROFlevel>=2 )
+	t_reduce_tmp = SuperLU_timer_();
+#endif
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+#if ( PROFlevel>=2 )
+	t_reduce += SuperLU_timer_() - t_reduce_tmp;
+#endif
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid );
+		if ( mycol == kcol ) { /* diagonal process */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+		}
+	    }
+	}
+
+#else /* old */
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && fmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
+		    assert( frecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+#endif
+    }
+
+    /* ---------------------------------------------------------
+       Solve the leaf nodes first by all the diagonal processes.
+       --------------------------------------------------------- */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nleaf %4d\n", iam, nleaf);
+#endif
+    for (k = 0; k < nsupers && nleaf; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
+		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
+		    + 10 * knsupc * nrhs; /* complex division */
+		--nleaf;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p) {
+		    if ( fsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+                                   &send_req[Llu->SolveMsgSent++]);
+#if 0
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				 SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		}
+		/*
+		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		 */
+		nb = lsub[0] - 1;
+		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		luptr = knsupc; /* Skip diagonal block L(k,k). */
+		
+		zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			   send_req, stat);
+	    }
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+    /* -----------------------------------------------------------
+       Compute the internal nodes asynchronously by all processes.
+       ----------------------------------------------------------- */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
+	   iam, nfrecvx, nfrecvmod, nleaf);
+#endif
+
+    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
+                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
+
+        k = (*recvbuf).r;
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+	
+	switch ( status.MPI_TAG ) {
+	  case Xk:
+	      --nfrecvx;
+	      lk = LBj( k, grid ); /* Local block number, column-wise. */
+	      lsub = Lrowind_bc_ptr[lk];
+	      lusup = Lnzval_bc_ptr[lk];
+	      if ( lsub ) {
+		  nb   = lsub[0];
+		  lptr = BC_HEADER;
+		  luptr = 0;
+		  knsupc = SuperSize( k );
+
+		  /*
+		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		   */
+		  zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			     send_req, stat);
+	      } /* if lsub */
+
+	      break;
+
+	  case LSUM: /* Receiver must be a diagonal process */
+	      --nfrecvmod;
+	      lk = LBi( k, grid ); /* Local block number, row-wise. */
+	      ii = X_BLK( lk );
+	      knsupc = SuperSize( k );
+	      tempv = &recvbuf[LSUM_H];
+	      RHS_ITERATE(j) {
+		  for (i = 0; i < knsupc; ++i)
+		      z_add(&x[i + ii + j*knsupc],
+			    &x[i + ii + j*knsupc],
+			    &tempv[i + j*knsupc]);
+	      }
+
+	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
+		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
+		  lk = LBj( k, grid ); /* Local block number, column-wise. */
+		  lsub = Lrowind_bc_ptr[lk];
+		  lusup = Lnzval_bc_ptr[lk];
+		  nsupr = lsub[1];
+#ifdef _CRAY
+		  CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+			lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		  stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
+		      + 10 * knsupc * nrhs; /* complex division */
+#if ( DEBUGlevel>=2 )
+		  printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		  /*
+		   * Send Xk to process column Pc[k].
+		   */
+		  kcol = PCOL( k, grid );
+		  for (p = 0; p < Pr; ++p) {
+		      if ( fsendx_plist[lk][p] != EMPTY ) {
+			  pi = PNUM( p, kcol, grid );
+
+			  MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H,
+                                     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+                                     &send_req[Llu->SolveMsgSent++]);
+#if 0
+			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				    SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				 iam, x[ii-XK_H], pi);
+#endif
+		      }
+                  }
+		  /*
+		   * Perform local block modifications.
+		   */
+		  nb = lsub[0] - 1;
+		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		  luptr = knsupc; /* Skip diagonal block L(k,k). */
+
+		  zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
+			     send_req, stat);
+	      } /* if */
+
+	      break;
+
+#if ( DEBUGlevel>=2 )
+	    default:
+	      printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+	      break;
+#endif
+	  } /* switch */
+
+    } /* while not finished ... */
+
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+#if ( DEBUGlevel==2 )
+    {
+      printf("(%d) .. After L-solve: y =\n", iam);
+      for (i = 0, k = 0; k < nsupers; ++k) {
+	  krow = PROW( k, grid );
+	  kcol = PCOL( k, grid );
+	  if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	      knsupc = SuperSize( k );
+	      lk = LBi( k, grid );
+	      ii = X_BLK( lk );
+	      for (j = 0; j < knsupc; ++j)
+		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
+	      fflush(stdout);
+	  }
+	  MPI_Barrier( grid->comm );
+      }
+    }
+#endif
+
+    SUPERLU_FREE(fmod);
+    SUPERLU_FREE(frecv);
+    SUPERLU_FREE(rtemp);
+
+    /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/
+
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status);
+    Llu->SolveMsgSent = 0;
+
+    MPI_Barrier( grid->comm );
+
+
+    /*---------------------------------------------------
+     * Back solve Ux = y.
+     *
+     * The Y components from the forward solve are already
+     * on the diagonal processes.
+     *---------------------------------------------------*/
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PZGSTRS. */
+    if ( !(bmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for bmod[].");
+    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
+    if ( !(brecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for brecv[].");
+    Llu->brecv = brecv;
+
+    /*
+     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+#if 1
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid ); /* root process in this row scope */
+		if ( mycol != kcol && bmod[lk] )
+		    mod_bit[lk] = 1;  /* Contribution from off-diagonal */
+	    }
+	}
+
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid ); /* root process in this row scope. */
+		if ( mycol == kcol ) { /* diagonal process */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+
+#else /* old */
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && bmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+#endif
+    }
+
+    /* Re-initialize lsum to zero. Each block header is already in place. */
+    for (k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    il = LSUM_BLK( lk );
+	    dest = &lsum[il];
+	    RHS_ITERATE(j) {
+		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero;
+	    }
+	}
+    }
+
+    /* Set up additional pointers for the index and value arrays of U.
+       nub is the number of local block columns. */
+    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
+    if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) )
+	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
+					     blocks in a block column. */
+    Urbs1 = Urbs + nub;
+    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
+        ABORT("Malloc fails for Ucb_indptr[]");
+    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
+        ABORT("Malloc fails for Ucb_valptr[]");
+
+    /* Count number of row blocks in a block column. 
+       One pass of the skeleton graph of U. */
+    for (lk = 0; lk < nlb; ++lk) {
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    /* usub[0] -- number of column blocks in this block row. */
+#if ( DEBUGlevel>=2 )
+	    Ublocks += usub[0];
+#endif
+	    i = BR_HEADER; /* Pointer in index array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];            /* Global block number */
+		++Urbs[LBj(k,grid)];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+    /* Set up the vertical linked lists for the row blocks.
+       One pass of the skeleton graph of U. */
+    for (lb = 0; lb < nub; ++lb) {
+	if ( Urbs[lb] ) { /* Not an empty block column. */
+	    if ( !(Ucb_indptr[lb]
+		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+		ABORT("Malloc fails for Ucb_indptr[lb][]");
+	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
+		ABORT("Malloc fails for Ucb_valptr[lb][]");
+	}
+    }
+    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    i = BR_HEADER; /* Pointer in index array. */
+	    j = 0;         /* Pointer in nzval array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];          /* Global block number, column-wise. */
+		ljb = LBj( k, grid ); /* Local block number, column-wise. */
+		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
+		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
+		Ucb_valptr[ljb][Urbs1[ljb]] = j;
+		++Urbs1[ljb];
+		j += usub[i+1];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+#if ( DEBUGlevel>=2 )
+    for (p = 0; p < Pr*Pc; ++p) {
+	if (iam == p) {
+	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("(%2d) Local col %2d: # row blocks %2d\n",
+		       iam, lb, Urbs[lb]);
+		if ( Urbs[lb] ) {
+		    for (i = 0; i < Urbs[lb]; ++i)
+			printf("(%2d) .. row blk %2d:\
+                               lbnum %d, indpos %d, valpos %d\n",
+			       iam, i, 
+			       Ucb_indptr[lb][i].lbnum,
+			       Ucb_indptr[lb][i].indpos,
+			       Ucb_valptr[lb][i]);
+		}
+	    }
+	}
+	MPI_Barrier( grid->comm );
+    }
+    for (p = 0; p < Pr*Pc; ++p) {
+	if ( iam == p ) {
+	    printf("\n(%d) bsendx_plist[][]", iam);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("\n(%d) .. local col %2d: ", iam, lb);
+		for (i = 0; i < Pr; ++i)
+		    printf("%4d", bsendx_plist[lb][i]);
+	    }
+	    printf("\n");
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif /* DEBUGlevel */
+
+
+#if ( PRNTlevel>=3 )
+    t = SuperLU_timer_() - t;
+    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+    /*
+     * Solve the roots first by all the diagonal processes.
+     */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nroot %4d\n", iam, nroot);
+#endif
+    for (k = nsupers-1; k >= 0 && nroot; --k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid ); /* Local block number, row-wise. */
+	    if ( brecv[lk]==0 && bmod[lk]==0 ) {
+		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
+		    + 10 * knsupc * nrhs; /* complex division */
+		--nroot;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p) {
+		    if ( bsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+                                   &send_req[Llu->SolveMsgSent++]);
+#if 0
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                  SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk,
+                                  grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		}
+		/*
+		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
+		 */
+		if ( Urbs[lk] ) 
+		    zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+			       send_req, stat);
+	    } /* if root ... */
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+
+    /*
+     * Compute the internal nodes asynchronously by all processes.
+     */
+    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
+                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
+        k = (*recvbuf).r;
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+
+	switch ( status.MPI_TAG ) {
+	    case Xk:
+	        --nbrecvx;
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		/*
+		 * Perform local block modifications:
+		 *         lsum[i] -= U_i,k * X[k]
+		 */
+		zlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
+			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
+			   send_req, stat);
+
+	        break;
+
+	    case LSUM: /* Receiver must be a diagonal process */
+		--nbrecvmod;
+		lk = LBi( k, grid ); /* Local block number, row-wise. */
+		ii = X_BLK( lk );
+		knsupc = SuperSize( k );
+		tempv = &recvbuf[LSUM_H];
+		RHS_ITERATE(j) {
+		    for (i = 0; i < knsupc; ++i)
+                        z_add(&x[i + ii + j*knsupc],
+			      &x[i + ii + j*knsupc],
+			      &tempv[i + j*knsupc]);
+		}
+
+		if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
+		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
+		    lk = LBj( k, grid ); /* Local block number, column-wise. */
+		    lsub = Lrowind_bc_ptr[lk];
+		    lusup = Lnzval_bc_ptr[lk];
+		    nsupr = lsub[1];
+#ifdef _CRAY
+		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+			  lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		    stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
+			+ 10 * knsupc * nrhs; /* complex division */
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    kcol = PCOL( k, grid );
+		    for (p = 0; p < Pr; ++p) {
+			if ( bsendx_plist[lk][p] != EMPTY ) {
+			    pi = PNUM( p, kcol, grid );
+
+			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+                                       &send_req[Llu->SolveMsgSent++] );
+#if 0
+			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                      SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk,
+                                      grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				   iam, x[ii - XK_H], pi);
+#endif
+			}
+		    }
+		    /*
+		     * Perform local block modifications: 
+		     *         lsum[i] -= U_i,k * X[k]
+		     */
+		    if ( Urbs[lk] )
+			zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+				   send_req, stat);
+		} /* if becomes solvable */
+		
+		break;
+
+#if ( DEBUGlevel>=2 )
+	      default:
+		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+		break;
+#endif		
+
+	} /* switch */
+
+    } /* while not finished ... */
+
+#if ( PRNTlevel>=3 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
+#endif
+
+#if ( DEBUGlevel>=2 )
+    {
+	doublecomplex *x_col;
+	int diag;
+	printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam);
+	ii = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    knsupc = SuperSize( k );
+	    krow = PROW( k, grid );
+	    kcol = PCOL( k, grid );
+	    diag = PNUM( krow, kcol, grid);
+	    if ( iam == diag ) { /* Diagonal process. */
+		lk = LBi( k, grid );
+		jj = X_BLK( lk );
+		x_col = &x[jj];
+		RHS_ITERATE(j) {
+		    for (i = 0; i < knsupc; ++i) { /* X stored in blocks */
+			printf("\t(%d)\t%4d\t%.10f\n",
+			       iam, xsup[k]+i, x_col[i]);
+		    }
+		    x_col += knsupc;
+		}
+	    }
+	    ii += knsupc;
+	} /* for k ... */
+    }
+#endif
+
+    pzReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum,
+			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
+
+
+    /* Deallocate storage. */
+    SUPERLU_FREE(lsum);
+    SUPERLU_FREE(x);
+    SUPERLU_FREE(recvbuf);
+    for (i = 0; i < nub; ++i) {
+	if ( Urbs[i] ) {
+	    SUPERLU_FREE(Ucb_indptr[i]);
+	    SUPERLU_FREE(Ucb_valptr[i]);
+	}
+    }
+    SUPERLU_FREE(Ucb_indptr);
+    SUPERLU_FREE(Ucb_valptr);
+    SUPERLU_FREE(Urbs);
+    SUPERLU_FREE(bmod);
+    SUPERLU_FREE(brecv);
+
+    /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/
+
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status);
+    SUPERLU_FREE(send_req);
+
+    MPI_Barrier( grid->comm );
+
+    stat->utime[SOLVE] = SuperLU_timer_() - t;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pzgstrs()");
+#endif
+
+    return;
+} /* PZGSTRS */
+
diff --git a/SRC/pzgstrs1.c b/SRC/pzgstrs1.c
new file mode 100644
index 0000000..8924de8
--- /dev/null
+++ b/SRC/pzgstrs1.c
@@ -0,0 +1,913 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Solves a system of distributed linear equations
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     October 15, 2008  use fewer MPI_Reduce
+ * </pre>
+ */
+
+#include "superlu_zdefs.h"
+
+#define ISEND_IRECV
+
+/*
+ * Function prototypes
+ */
+#ifdef _CRAY
+fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
+		   doublecomplex*, int*, doublecomplex*, int*);
+fortran void SGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, 
+		   int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*);
+_fcd ftcs1;
+_fcd ftcs2;
+_fcd ftcs3;
+#endif
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PZGSTRS1 solves a system of distributed linear equations
+ *
+ *                   op( sub(A) ) * X = sub( B )
+ *
+ * with a general N-by-N distributed matrix sub( A ) using the LU
+ * factorization computed by PZGSTRF.
+ *
+ * This routine is used only in the iterative refinement routine
+ * pzgsrfs_ABXglobal, assuming that the right-hand side is already
+ * distributed in the diagonal processes.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures to store L and U factors,
+ *        and the permutation vectors.
+ *        See superlu_zdefs.h for the definition of 'LUstruct_t' structure.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * x      (input/output) doublecomplex*
+ *        On entry, the right hand side matrix.
+ *        On exit, the solution matrix if info = 0;
+ *
+ *        NOTE: the right-hand side matrix is already distributed on
+ *              the diagonal processes.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves; 
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * </pre>      
+ */
+
+void pzgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
+	      doublecomplex *x, int nrhs, SuperLUStat_t *stat, int *info)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    doublecomplex alpha = {1.0, 0.0};
+    doublecomplex zero = {0.0, 0.0};
+    doublecomplex *lsum;  /* Local running sum of the updates to B-components */
+    doublecomplex *lusup, *dest;
+    doublecomplex *recvbuf, *tempv;
+    doublecomplex *rtemp; /* Result of full matrix-vector multiply. */
+    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
+    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+    int    iam, kcol, krow, mycol, myrow;
+    int_t  i, ii, il, j, k, lb, ljb, lk, lptr, luptr;
+    int_t  nb, nlb, nub, nsupers;
+    int_t  *xsup, *lsub, *usub;
+    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
+    int_t  Pc, Pr;
+    int    knsupc, nsupr;
+    int    ldalsum;   /* Number of lsum entries locally owned. */
+    int    maxrecvsz, p, pi;
+    int_t  **Lrowind_bc_ptr;
+    doublecomplex **Lnzval_bc_ptr;
+    MPI_Status status;
+#ifdef ISEND_IRECV
+    MPI_Request *send_req, recv_req;
+#endif
+
+    /*-- Counts used for L-solve --*/
+    int_t  *fmod;         /* Modification count for L-solve. */
+    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int_t  *frecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int_t  nleaf = 0, nroot = 0;
+
+    /*-- Counts used for U-solve --*/
+    int_t  *bmod;         /* Modification count for U-solve. */
+    int_t  **bsendx_plist = Llu->bsendx_plist;
+    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int_t  *brecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
+    double t;
+#if ( DEBUGlevel>=2 )
+    int_t Ublocks = 0;
+#endif
+
+    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+
+    t = SuperLU_timer_();
+
+    /* Test input parameters. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( nrhs < 0 ) *info = -8;
+    if ( *info ) {
+	pxerr_dist("PZGSTRS1", grid, -*info);
+	return;
+    }
+	
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
+    Llu->SolveMsgSent = 0;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pzgstrs1()");
+#endif
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PZGSTRS1. */
+    if ( !(fmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for fmod[].");
+    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
+    if ( !(frecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for frecv[].");
+    Llu->frecv = frecv;
+
+#ifdef ISEND_IRECV
+    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
+    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
+	ABORT("Malloc fails for send_req[].");
+#endif
+
+#ifdef _CRAY
+    ftcs1 = _cptofcd("L", strlen("L"));
+    ftcs2 = _cptofcd("N", strlen("N"));
+    ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+
+    /* Compute ilsum[] and ldalsum for process column 0. */
+    ilsum = Llu->ilsum;
+    ldalsum = Llu->ldalsum;
+
+    /* Allocate working storage. */
+    knsupc = sp_ienv_dist(3);
+    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs 
+        + nlb * LSUM_H)) )
+	ABORT("Calloc fails for lsum[].");
+    maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H);
+    if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for recvbuf[].");
+    if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for rtemp[].");
+
+    
+    /*---------------------------------------------------
+     * Forward solve Ly = b.
+     *---------------------------------------------------*/
+
+    /*
+     * Prepend the block number to the header of each lsum[] block.
+     */
+    for (k = 0; k < nsupers; ++k) {
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    lk = LBi( k, grid );   /* Local block number. */
+	    il = LSUM_BLK( lk );
+	    lsum[il - LSUM_H].r = k; 
+	    lsum[il - LSUM_H].i = 0; 
+	}
+    }
+
+    /*
+     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+#if 1
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid );
+		if ( mycol != kcol && fmod[lk] )
+		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
+	    }
+	}
+	/*PrintInt10("mod_bit", nlb, mod_bit);*/
+	
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* local block number */
+		kcol = PCOL( k, grid );
+		if ( mycol == kcol ) { /* diagonal process */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+		}
+	    }
+	}
+
+#else /* old */
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && fmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
+		    assert( frecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+#endif
+    }
+
+    /* ---------------------------------------------------------
+       Solve the leaf nodes first by all the diagonal processes.
+       --------------------------------------------------------- */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nleaf %4d\n", iam, nleaf);
+#endif
+    for (k = 0; k < nsupers && nleaf; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    if ( !frecv[lk] && !fmod[lk] ) {
+		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		/*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
+		--nleaf;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p)
+		    if ( fsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+                                   &send_req[Llu->SolveMsgSent++]);
+#else
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				 SuperLU_MPI_DOUBLE_COMPLEX, 
+                                 pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		
+		/*
+		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		 */
+		nb = lsub[0] - 1;
+		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		luptr = knsupc; /* Skip diagonal block L(k,k). */
+		
+		zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			   fmod, nb, lptr, luptr, xsup, grid, Llu,
+			   send_req, stat);
+	    }
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+    /*
+     * Compute the internal nodes asynchronously by all processes.
+     */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
+	   iam, nfrecvx, nfrecvmod, nleaf);
+#endif
+
+    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+#ifdef ISEND_IRECV
+	/* -MPI- FATAL: Remote protocol queue full */
+	MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE,
+		 MPI_ANY_TAG, grid->comm, &recv_req );
+	MPI_Wait( &recv_req, &status );
+#else
+	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE,
+		 MPI_ANY_TAG, grid->comm, &status );
+#endif
+
+	k = (*recvbuf).r;
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+	
+	switch ( status.MPI_TAG ) {
+	  case Xk:
+	      --nfrecvx;
+	      lk = LBj( k, grid ); /* Local block number, column-wise. */
+	      lsub = Lrowind_bc_ptr[lk];
+	      lusup = Lnzval_bc_ptr[lk];
+	      if ( lsub ) {
+		  nb   = lsub[0];
+		  lptr = BC_HEADER;
+		  luptr = 0;
+		  knsupc = SuperSize( k );
+
+		  /*
+		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		   */
+		  zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
+			     send_req, stat);
+	      } /* if lsub */
+
+	      break;
+
+	  case LSUM:
+	      --nfrecvmod;
+	      lk = LBi( k, grid ); /* Local block number, row-wise. */
+	      ii = X_BLK( lk );
+	      knsupc = SuperSize( k );
+	      tempv = &recvbuf[LSUM_H];
+	      RHS_ITERATE(j)
+		  for (i = 0; i < knsupc; ++i)
+		      z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc],
+			    &tempv[i + j*knsupc]);
+
+	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
+		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
+		  lk = LBj( k, grid ); /* Local block number, column-wise. */
+		  lsub = Lrowind_bc_ptr[lk];
+		  lusup = Lnzval_bc_ptr[lk];
+		  nsupr = lsub[1];
+#ifdef _CRAY
+		  CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+			lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		  /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
+#if ( DEBUGlevel>=2 )
+		  printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		  /*
+		   * Send Xk to process column Pc[k].
+		   */
+		  kcol = PCOL( k, grid );
+		  for (p = 0; p < Pr; ++p)
+		      if ( fsendx_plist[lk][p] != EMPTY ) {
+			  pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			  MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+				     &send_req[Llu->SolveMsgSent++] );
+#else
+			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				 iam, x[ii-XK_H], pi);
+#endif
+		      }
+
+		  /*
+		   * Perform local block modifications.
+		   */
+		  nb = lsub[0] - 1;
+		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		  luptr = knsupc; /* Skip diagonal block L(k,k). */
+
+		  zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
+			     send_req, stat);
+	      } /* if */
+
+	      break;
+
+#if ( DEBUGlevel>=2 )
+	    default:
+	      printf("(%2d) Recv'd wrong message tag %4d\n", iam,  status.MPI_TAG);
+	      break;
+#endif
+	  } /* switch */
+
+    } /* while not finished ... */
+
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+#if ( DEBUGlevel>=2 )
+    if ( !iam ) printf("\n.. After L-solve: y =\n");
+    for (i = 0, k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    ii = X_BLK( lk );
+	    for (j = 0; j < knsupc; ++j)
+		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif
+
+    SUPERLU_FREE(fmod);
+    SUPERLU_FREE(frecv);
+    SUPERLU_FREE(rtemp);
+
+#ifdef ISEND_IRECV
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
+    Llu->SolveMsgSent = 0;
+#endif
+
+
+    /*---------------------------------------------------
+     * Back solve Ux = y.
+     *
+     * The Y components from the forward solve are already
+     * on the diagonal processes.
+     *---------------------------------------------------*/
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PZGSTRS1. */
+    if ( !(bmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for bmod[].");
+    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
+    if ( !(brecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for brecv[].");
+    Llu->brecv = brecv;
+
+    /*
+     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+#if 1
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && bmod[lk] )
+		    mod_bit[lk] = 1;  /* Contribution from off-diagonal */
+	    }
+	}
+
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+
+#else
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && bmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+#endif
+    }
+
+    /* Re-initialize lsum to zero. Each block header is already in place. */
+    for (k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    il = LSUM_BLK( lk );
+	    dest = &lsum[il];
+	    RHS_ITERATE(j)
+		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero;
+	}
+    }
+
+    /* Set up additional pointers for the index and value arrays of U.
+       nlb is the number of local block rows. */
+    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
+    if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) )
+	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
+					     blocks in a block column. */
+    Urbs1 = Urbs + nub;
+    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
+        ABORT("Malloc fails for Ucb_indptr[]");
+    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
+        ABORT("Malloc fails for Ucb_valptr[]");
+
+    /* Count number of row blocks in a block column. 
+       One pass of the skeleton graph of U. */
+    for (lk = 0; lk < nlb; ++lk) {
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    /* usub[0] -- number of column blocks in this block row. */
+#if ( DEBUGlevel>=2 )
+	    Ublocks += usub[0];
+#endif
+	    i = BR_HEADER; /* Pointer in index array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];            /* Global block number */
+		++Urbs[LBj(k,grid)];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+    /* Set up the vertical linked lists for the row blocks.
+       One pass of the skeleton graph of U. */
+    for (lb = 0; lb < nub; ++lb)
+	if ( Urbs[lb] ) { /* Not an empty block column. */
+	    if ( !(Ucb_indptr[lb]
+		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+		ABORT("Malloc fails for Ucb_indptr[lb][]");
+	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
+		ABORT("Malloc fails for Ucb_valptr[lb][]");
+	}
+    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    i = BR_HEADER; /* Pointer in index array. */
+	    j = 0;         /* Pointer in nzval array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];          /* Global block number, column-wise. */
+		ljb = LBj( k, grid ); /* Local block number, column-wise. */
+		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
+		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
+		Ucb_valptr[ljb][Urbs1[ljb]] = j;
+		++Urbs1[ljb];
+		j += usub[i+1];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+#if ( DEBUGlevel>=2 )
+    for (p = 0; p < Pr*Pc; ++p) {
+	if (iam == p) {
+	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("(%2d) Local col %2d: # row blocks %2d\n",
+		       iam, lb, Urbs[lb]);
+		if ( Urbs[lb] ) {
+		    for (i = 0; i < Urbs[lb]; ++i)
+			printf("(%2d) .. row blk %2d:\
+                               lbnum %d, indpos %d, valpos %d\n",
+			       iam, i, 
+			       Ucb_indptr[lb][i].lbnum,
+			       Ucb_indptr[lb][i].indpos,
+			       Ucb_valptr[lb][i]);
+		}
+	    }
+	}
+	MPI_Barrier( grid->comm );
+    }
+    for (p = 0; p < Pr*Pc; ++p) {
+	if ( iam == p ) {
+	    printf("\n(%d) bsendx_plist[][]", iam);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("\n(%d) .. local col %2d: ", iam, lb);
+		for (i = 0; i < Pr; ++i)
+		    printf("%4d", bsendx_plist[lb][i]);
+	    }
+	    printf("\n");
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif /* DEBUGlevel */
+
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+    /*
+     * Solve the roots first by all the diagonal processes.
+     */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nroot %4d\n", iam, nroot);
+#endif
+    for (k = nsupers-1; k >= 0 && nroot; --k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid ); /* Local block number, row-wise. */
+	    if ( !brecv[lk] && !bmod[lk] ) {
+		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		/*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/
+		--nroot;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p)
+		    if ( bsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+				   &send_req[Llu->SolveMsgSent++] );
+#else
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				 SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H], pi);
+#endif
+		    }
+		
+		/*
+		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
+		 */
+		if ( Urbs[lk] ) 
+		    zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+			       send_req, stat);
+	    } /* if root ... */
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+
+    /*
+     * Compute the internal nodes asynchronously by all processes.
+     */
+    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, 
+                 MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
+	k = (*recvbuf).r;
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+
+	switch ( status.MPI_TAG ) {
+	    case Xk:
+	        --nbrecvx;
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		/*
+		 * Perform local block modifications:
+		 *         lsum[i] -= U_i,k * X[k]
+		 */
+		zlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
+			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+			   send_req, stat);
+
+	        break;
+
+	    case LSUM:
+		--nbrecvmod;
+		lk = LBi( k, grid ); /* Local block number, row-wise. */
+		ii = X_BLK( lk );
+		knsupc = SuperSize( k );
+		tempv = &recvbuf[LSUM_H];
+		RHS_ITERATE(j)
+		    for (i = 0; i < knsupc; ++i)
+			z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc],
+			      &tempv[i + j*knsupc]);
+
+		if ( !(--brecv[lk]) && !bmod[lk] ) {
+		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
+		    lk = LBj( k, grid ); /* Local block number, column-wise. */
+		    lsub = Lrowind_bc_ptr[lk];
+		    lusup = Lnzval_bc_ptr[lk];
+		    nsupr = lsub[1];
+#ifdef _CRAY
+		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+			  lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		    /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    kcol = PCOL( k, grid );
+		    for (p = 0; p < Pr; ++p)
+			if ( bsendx_plist[lk][p] != EMPTY ) {
+			    pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+				       &send_req[Llu->SolveMsgSent++] );
+#else
+			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				   iam, x[ii - XK_H], pi);
+#endif
+			}
+		
+		    /*
+		     * Perform local block modifications: 
+		     *         lsum[i] -= U_i,k * X[k]
+		     */
+		    if ( Urbs[lk] )
+			zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+				   send_req, stat);
+		} /* if becomes solvable */
+		
+		break;
+
+#if ( DEBUGlevel>=2 )
+	      default:
+		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+		break;
+#endif		
+
+	} /* switch */
+
+    } /* while not finished ... */
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
+#endif
+
+    stat->utime[SOLVE] = SuperLU_timer_() - t;
+
+    /* Deallocate storage. */
+
+    SUPERLU_FREE(lsum);
+    SUPERLU_FREE(recvbuf);
+    for (i = 0; i < nub; ++i)
+	if ( Urbs[i] ) {
+	    SUPERLU_FREE(Ucb_indptr[i]);
+	    SUPERLU_FREE(Ucb_valptr[i]);
+	}
+    SUPERLU_FREE(Ucb_indptr);
+    SUPERLU_FREE(Ucb_valptr);
+    SUPERLU_FREE(Urbs);
+    SUPERLU_FREE(bmod);
+    SUPERLU_FREE(brecv);
+#ifdef ISEND_IRECV
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
+    SUPERLU_FREE(send_req);
+#endif
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pzgstrs1()");
+#endif
+
+} /* PZGSTRS1 */
diff --git a/SRC/pzgstrs_Bglobal.c b/SRC/pzgstrs_Bglobal.c
new file mode 100644
index 0000000..e769f35
--- /dev/null
+++ b/SRC/pzgstrs_Bglobal.c
@@ -0,0 +1,1050 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Solves a system of distributed linear equations A*X = B with a general N-by-N matrix A using the LU factorization
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     October 15, 2008  use fewer MPI_Reduce
+ * </pre>
+ */
+
+#include "superlu_zdefs.h"
+
+#define ISEND_IRECV
+
+/*
+ * Function prototypes: Fortran-linkage routines and _fcd descriptors (declared only when _CRAY is defined), then the file-local helper gather_diag_to_all().
+ */
+#ifdef _CRAY
+fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
+		   doublecomplex*, int*, doublecomplex*, int*);
+fortran void CGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, 
+		   int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*);
+_fcd ftcs1;
+_fcd ftcs2;
+_fcd ftcs3;
+#endif
+static void gather_diag_to_all(int_t, int_t, doublecomplex [], Glu_persist_t *,
+                               LocalLU_t *, gridinfo_t *, int_t, int_t [],
+                               int_t [], doublecomplex [], int_t, doublecomplex []);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * pzgstrs_Bglobal solves a system of distributed linear equations
+ * A*X = B with a general N-by-N matrix A using the LU factorization
+ * computed by pzgstrf.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from pzgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_zdefs.h for the definition of 'LUstruct_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) doublecomplex*
+ *        On entry, the right-hand side matrix of the possibly equilibrated
+ *        and row permuted system.
+ *        On exit, the solution matrix of the possibly equilibrated
+ *        and row permuted system if info = 0;
+ *
+ *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all 
+ *              processes when calling this routine.
+ *
+ * ldb    (input) int (global)
+ *        Leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * </pre>    
+ */
+
+void
+pzgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, 
+                doublecomplex *B, int_t ldb, int nrhs, 
+                SuperLUStat_t *stat, int *info)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    doublecomplex alpha = {1.0, 0.0};
+    doublecomplex zero = {0.0, 0.0};
+    doublecomplex *lsum;  /* Local running sum of the updates to B-components */
+    doublecomplex *x;     /* X component at step k. */
+    doublecomplex *lusup, *dest;
+    doublecomplex *recvbuf, *tempv;
+    doublecomplex *rtemp; /* Result of full matrix-vector multiply. */
+    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
+    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+    int_t  kcol, krow, mycol, myrow;
+    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
+    int_t  nb, nlb, nub, nsupers;
+    int_t  *xsup, *lsub, *usub;
+    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
+    int    Pc, Pr, iam;
+    int    knsupc, nsupr;
+    int    ldalsum;   /* Number of lsum entries locally owned. */
+    int    maxrecvsz, p, pi;
+    int_t  **Lrowind_bc_ptr;
+    doublecomplex **Lnzval_bc_ptr;
+    MPI_Status status;
+#if defined (ISEND_IRECV) || defined (BSEND)
+    MPI_Request *send_req, recv_req;
+#endif
+
+    /*-- Counts used for L-solve --*/
+    int_t  *fmod;         /* Modification count for L-solve. */
+    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int_t  *frecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int_t  nleaf = 0, nroot = 0;
+
+    /*-- Counts used for U-solve --*/
+    int_t  *bmod;         /* Modification count for U-solve. */
+    int_t  **bsendx_plist = Llu->bsendx_plist;
+    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int_t  *brecv;        /* Count of modifications to be recv'd from
+			     processes in this row. */
+    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
+    double t, t_solve_start; /* t is reset per phase when PRNTlevel>=2 */
+#if ( DEBUGlevel>=2 )
+    int_t Ublocks = 0;
+#endif
+
+    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+ 
+    t_solve_start = t = SuperLU_timer_(); /* start of the whole solve */
+
+    /* Test input parameters. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( nrhs < 0 ) *info = -6; /* nrhs is the 6th argument */
+    if ( *info ) {
+	pxerr_dist("PZGSTRS_BGLOBAL", grid, -*info);
+	return;
+    }
+	
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
+    stat->ops[SOLVE] = 0.0;
+    Llu->SolveMsgSent = 0;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter pzgstrs_Bglobal()");
+#endif
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PZGSTRS_BGLOBAL. */
+    if ( !(fmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for fmod[].");
+    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
+    if ( !(frecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for frecv[].");
+    Llu->frecv = frecv;
+
+#if defined (ISEND_IRECV) || defined (BSEND)
+    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
+    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
+	ABORT("Malloc fails for send_req[].");
+#endif
+
+#ifdef _CRAY
+    ftcs1 = _cptofcd("L", strlen("L"));
+    ftcs2 = _cptofcd("N", strlen("N"));
+    ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+
+    /* Obtain ilsum[] and ldalsum for process column 0. */
+    ilsum = Llu->ilsum;
+    ldalsum = Llu->ldalsum;
+
+    /* Allocate working storage. */
+    knsupc = sp_ienv_dist(3);
+    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
+    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs 
+        + nlb * LSUM_H)) )
+	ABORT("Calloc fails for lsum[].");
+    if ( !(x = doublecomplexMalloc_dist(((size_t)ldalsum) * nrhs 
+        + nlb * XK_H)) )
+	ABORT("Malloc fails for x[].");
+    if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for recvbuf[].");
+    if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) )
+	ABORT("Malloc fails for rtemp[].");
+
+    
+    /*---------------------------------------------------
+     * Forward solve Ly = b.
+     *---------------------------------------------------*/
+
+    /*
+     * Copy B into X on the diagonal processes.
+     */
+    ii = 0;
+    for (k = 0; k < nsupers; ++k) {
+	knsupc = SuperSize( k );
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    lk = LBi( k, grid );   /* Local block number. */
+	    il = LSUM_BLK( lk );
+	    lsum[il - LSUM_H].r = k;/* Block number prepended in the header. */
+	    lsum[il - LSUM_H].i = 0;
+	    kcol = PCOL( k, grid );
+	    if ( mycol == kcol ) { /* Diagonal process. */
+		jj = X_BLK( lk );
+		x[jj - XK_H].r = k; /* Block number prepended in the header. */
+		x[jj - XK_H].i = 0;
+		RHS_ITERATE(j)
+		    for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */
+			x[i + jj + j*knsupc] = B[i + ii + j*ldb];
+	    }
+	}
+	ii += knsupc;
+    }
+
+    /*
+     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+#if 1
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid );
+		if ( mycol != kcol && fmod[lk] )
+		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
+	    }
+	}
+	
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+		}
+	    }
+	}
+
+#else /* old */
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && fmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nfrecvmod += frecv[lk];
+		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
+		    assert( frecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+#endif
+    }
+
+    /* ---------------------------------------------------------
+       Solve the leaf nodes first by all the diagonal processes.
+       --------------------------------------------------------- */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nleaf %4d\n", iam, nleaf);
+#endif
+    for (k = 0; k < nsupers && nleaf; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
+		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
+		    + 10 * knsupc * nrhs; /* complex division */
+		--nleaf;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p) {
+		    if ( fsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+                                   &send_req[Llu->SolveMsgSent++]);
+#else
+#ifdef BSEND
+			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#else
+
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				  SuperLU_MPI_DOUBLE_COMPLEX, 
+                                  pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H].r, pi);
+#endif
+		    }
+		}
+		/*
+		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		 */
+		nb = lsub[0] - 1;
+		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		luptr = knsupc; /* Skip diagonal block L(k,k). */
+		
+		zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			   send_req,stat);
+	    }
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+    /* -----------------------------------------------------------
+       Compute the internal nodes asynchronously by all processes.
+       ----------------------------------------------------------- */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
+	   iam, nfrecvx, nfrecvmod, nleaf);
+#endif
+
+    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+#if 1
+	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
+		 MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
+#else
+	/* -MPI- FATAL: Remote protocol queue full */
+	MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
+		  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &request );
+	MPI_Wait( &request, &status );
+#endif
+
+	k = (*recvbuf).r;
+
+
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+	
+	switch ( status.MPI_TAG ) {
+	  case Xk:
+	      --nfrecvx;
+	      lk = LBj( k, grid ); /* Local block number, column-wise. */
+	      lsub = Lrowind_bc_ptr[lk];
+	      lusup = Lnzval_bc_ptr[lk];
+	      if ( lsub ) {
+		  nb   = lsub[0];
+		  lptr = BC_HEADER;
+		  luptr = 0;
+		  knsupc = SuperSize( k );
+
+		  /*
+		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+		   */
+		  zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			     send_req, stat);
+	      } /* if lsub */
+
+	      break;
+
+	  case LSUM: /* Receiver must be a diagonal process */
+	      --nfrecvmod;
+	      lk = LBi( k, grid ); /* Local block number, row-wise. */
+	      ii = X_BLK( lk );
+	      knsupc = SuperSize( k );
+	      tempv = &recvbuf[LSUM_H];
+	      RHS_ITERATE(j)
+		  for (i = 0; i < knsupc; ++i)
+		      z_add(&x[i + ii + j*knsupc],
+			    &x[i + ii + j*knsupc],
+			    &tempv[i + j*knsupc]);
+
+	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
+		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
+		  lk = LBj( k, grid ); /* Local block number, column-wise. */
+		  lsub = Lrowind_bc_ptr[lk];
+		  lusup = Lnzval_bc_ptr[lk];
+		  nsupr = lsub[1];
+#ifdef _CRAY
+		  CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+			lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+			 lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		  stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
+		      + 10 * knsupc * nrhs; /* complex division */
+
+#if ( DEBUGlevel>=2 )
+		  printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		
+		  /*
+		   * Send Xk to process column Pc[k].
+		   */
+		  kcol = PCOL( k, grid );
+		  for (p = 0; p < Pr; ++p) {
+		      if ( fsendx_plist[lk][p] != EMPTY ) {
+			  pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			  MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, 
+				     &send_req[Llu->SolveMsgSent++]);
+#else
+#ifdef BSEND
+			  MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#else
+			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+				   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				 iam, x[ii-XK_H].r, pi);
+#endif
+		      }
+                  }
+		  /*
+		   * Perform local block modifications.
+		   */
+		  nb = lsub[0] - 1;
+		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
+		  luptr = knsupc; /* Skip diagonal block L(k,k). */
+
+		  zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
+			     send_req, stat);
+	      } /* if */
+
+	      break;
+
+#if ( DEBUGlevel>=2 )	      
+	    default:
+	      printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+	      break;
+#endif
+	  } /* switch */
+
+    } /* while not finished ... */
+
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+#if ( DEBUGlevel>=2 )
+    printf("\n(%d) .. After L-solve: y =\n", iam);
+    for (i = 0, k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    ii = X_BLK( lk );
+	    for (j = 0; j < knsupc; ++j)
+		printf("\t(%d)\t%4d\t%.10f\t%.10f\n", iam, xsup[k]+j, x[ii+j].r, x[ii+j].i);
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif
+
+    SUPERLU_FREE(fmod);
+    SUPERLU_FREE(frecv);
+    SUPERLU_FREE(rtemp);
+
+#ifdef ISEND_IRECV
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
+    Llu->SolveMsgSent = 0;
+#endif
+
+
+    /*---------------------------------------------------
+     * Back solve Ux = y.
+     *
+     * The Y components from the forward solve is already
+     * on the diagonal processes.
+     *---------------------------------------------------*/
+
+    /* Save the count to be altered so it can be used by
+       subsequent call to PZGSTRS_BGLOBAL. */
+    if ( !(bmod = intMalloc_dist(nlb)) )
+	ABORT("Calloc fails for bmod[].");
+    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
+    if ( !(brecv = intMalloc_dist(nlb)) )
+	ABORT("Malloc fails for brecv[].");
+    Llu->brecv = brecv;
+
+    /*
+     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
+     */
+    {
+	superlu_scope_t *scp = &grid->rscp;
+
+#if 1
+	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && bmod[lk] )
+		    mod_bit[lk] = 1;  /* Contribution from off-diagonal */
+	    }
+	}
+
+	/* Every process receives the count, but it is only useful on the
+	   diagonal processes.  */
+	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+
+#else /* old */
+
+	for (k = 0; k < nsupers; ++k) {
+	    krow = PROW( k, grid );
+	    if ( myrow == krow ) {
+		lk = LBi( k, grid );    /* Local block number. */
+		kcol = PCOL( k, grid ); /* Root process in this row scope. */
+		if ( mycol != kcol && bmod[lk] )
+		    i = 1;  /* Contribution from non-diagonal process. */
+		else i = 0;
+		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
+			   MPI_SUM, kcol, scp->comm );
+		if ( mycol == kcol ) { /* Diagonal process. */
+		    nbrecvmod += brecv[lk];
+		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
+		    assert( brecv[lk] < Pc );
+#endif
+		}
+	    }
+	}
+#endif
+    }
+
+    /* Re-initialize lsum to zero. Each block header is already in place. */
+    for (k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid );
+	    il = LSUM_BLK( lk );
+	    dest = &lsum[il];
+	    RHS_ITERATE(j)
+		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero;
+	}
+    }
+
+    /* Set up additional pointers for the index and value arrays of U.
+       nub is the number of local block columns. */
+    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
+    if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) )
+	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
+					     blocks in a block column. */
+    Urbs1 = Urbs + nub;
+    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
+        ABORT("Malloc fails for Ucb_indptr[]");
+    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
+        ABORT("Malloc fails for Ucb_valptr[]");
+
+    /* Count number of row blocks in a block column. 
+       One pass of the skeleton graph of U. */
+    for (lk = 0; lk < nlb; ++lk) {
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    /* usub[0] -- number of column blocks in this block row. */
+#if ( DEBUGlevel>=2 )
+	    Ublocks += usub[0];
+#endif
+	    i = BR_HEADER; /* Pointer in index array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];            /* Global block number */
+		++Urbs[LBj(k,grid)];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+    /* Set up the vertical linked lists for the row blocks.
+       One pass of the skeleton graph of U. */
+    for (lb = 0; lb < nub; ++lb) {
+	if ( Urbs[lb] ) { /* Not an empty block column. */
+	    if ( !(Ucb_indptr[lb]
+		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+		ABORT("Malloc fails for Ucb_indptr[lb][]");
+	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
+		ABORT("Malloc fails for Ucb_valptr[lb][]");
+	}
+    }
+    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
+	usub = Ufstnz_br_ptr[lk];
+	if ( usub ) { /* Not an empty block row. */
+	    i = BR_HEADER; /* Pointer in index array. */
+	    j = 0;         /* Pointer in nzval array. */
+	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
+		k = usub[i];          /* Global block number, column-wise. */
+		ljb = LBj( k, grid ); /* Local block number, column-wise. */
+		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
+		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
+		Ucb_valptr[ljb][Urbs1[ljb]] = j;
+		++Urbs1[ljb];
+		j += usub[i+1];
+		i += UB_DESCRIPTOR + SuperSize( k );
+	    }
+	}
+    }
+
+#if ( DEBUGlevel>=2 )
+    for (p = 0; p < Pr*Pc; ++p) {
+	if (iam == p) {
+	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("(%2d) Local col %2d: # row blocks %2d\n",
+		       iam, lb, Urbs[lb]);
+		if ( Urbs[lb] ) {
+		    for (i = 0; i < Urbs[lb]; ++i)
+			printf("(%2d) .. row blk %2d:\
+                               lbnum %d, indpos %d, valpos %d\n",
+			       iam, i, 
+			       Ucb_indptr[lb][i].lbnum,
+			       Ucb_indptr[lb][i].indpos,
+			       Ucb_valptr[lb][i]);
+		}
+	    }
+	}
+	MPI_Barrier( grid->comm );
+    }
+    for (p = 0; p < Pr*Pc; ++p) {
+	if ( iam == p ) {
+	    printf("\n(%d) bsendx_plist[][]", iam);
+	    for (lb = 0; lb < nub; ++lb) {
+		printf("\n(%d) .. local col %2d: ", iam, lb);
+		for (i = 0; i < Pr; ++i)
+		    printf("%4d", bsendx_plist[lb][i]);
+	    }
+	    printf("\n");
+	}
+	MPI_Barrier( grid->comm );
+    }
+#endif /* DEBUGlevel */
+
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
+    t = SuperLU_timer_();
+#endif
+
+    /*
+     * Solve the roots first by all the diagonal processes.
+     */
+#if ( DEBUGlevel>=2 )
+    printf("(%2d) nroot %4d\n", iam, nroot);
+#endif
+    for (k = nsupers-1; k >= 0 && nroot; --k) {
+	krow = PROW( k, grid );
+	kcol = PCOL( k, grid );
+	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
+	    knsupc = SuperSize( k );
+	    lk = LBi( k, grid ); /* Local block number, row-wise. */
+	    if ( brecv[lk]==0 && bmod[lk]==0 ) {
+		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
+		ii = X_BLK( lk );
+		lk = LBj( k, grid ); /* Local block number, column-wise */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		nsupr = lsub[1];
+#ifdef _CRAY
+		CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+		      lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		       lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
+		    + 10 * knsupc * nrhs; /* complex division */
+		--nroot;
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		/*
+		 * Send Xk to process column Pc[k].
+		 */
+		for (p = 0; p < Pr; ++p) {
+		    if ( bsendx_plist[lk][p] != EMPTY ) {
+			pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+                                   &send_req[Llu->SolveMsgSent++]);
+#else
+#ifdef BSEND
+			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#else
+			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                  SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Sent X[%2.0f] to P %2d\n",
+			       iam, x[ii-XK_H].r, pi);
+#endif
+		    }
+		}
+		/*
+		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
+		 */
+		if ( Urbs[lk] ) 
+		    zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+			       send_req, stat);
+	    } /* if root ... */
+	} /* if diagonal process ... */
+    } /* for k ... */
+
+
+    /*
+     * Compute the internal nodes asynchronously by all processes.
+     */
+    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
+
+	/* Receive a message. */
+	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE,
+		 MPI_ANY_TAG, grid->comm, &status );
+	
+	k = (*recvbuf).r;
+
+#if ( DEBUGlevel>=2 )
+	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+#endif
+
+	switch ( status.MPI_TAG ) {
+	    case Xk:
+	        --nbrecvx;
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		/*
+		 * Perform local block modifications:
+		 *         lsum[i] -= U_i,k * X[k]
+		 */
+		zlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
+			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
+			   send_req, stat);
+
+	        break;
+
+	    case LSUM: /* Receiver must be a diagonal process */
+		--nbrecvmod;
+		lk = LBi( k, grid ); /* Local block number, row-wise. */
+		ii = X_BLK( lk );
+		knsupc = SuperSize( k );
+		tempv = &recvbuf[LSUM_H];
+		RHS_ITERATE(j)
+		    for (i = 0; i < knsupc; ++i)
+			z_add(&x[i + ii + j*knsupc],
+			      &x[i + ii + j*knsupc],
+			      &tempv[i + j*knsupc]);
+
+		if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
+		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
+		    lk = LBj( k, grid ); /* Local block number, column-wise. */
+		    lsub = Lrowind_bc_ptr[lk];
+		    lusup = Lnzval_bc_ptr[lk];
+		    nsupr = lsub[1];
+#ifdef _CRAY
+		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+			  lusup, &nsupr, &x[ii], &knsupc);
+#elif defined (USE_VENDOR_BLAS)
+		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+#else
+		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &knsupc);
+#endif
+		    stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
+			+ 10 * knsupc * nrhs; /* complex division */
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, k);
+#endif
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    kcol = PCOL( k, grid );
+		    for (p = 0; p < Pr; ++p) {
+			if ( bsendx_plist[lk][p] != EMPTY ) {
+			    pi = PNUM( p, kcol, grid );
+#ifdef ISEND_IRECV
+			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+				       &send_req[Llu->SolveMsgSent++] );
+#else
+#ifdef BSEND
+			    MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#else
+			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
+                                      SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				   iam, x[ii - XK_H].r, pi);
+#endif
+			}
+		    }
+		    /*
+		     * Perform local block modifications: 
+		     *         lsum[i] -= U_i,k * X[k]
+		     */
+		    if ( Urbs[lk] )
+			zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
+				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+				   send_req, stat);
+		} /* if becomes solvable */
+		
+		break;
+
+#if ( DEBUGlevel>=2 )
+	      default:
+		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
+		break;
+#endif		
+
+	} /* switch */
+
+    } /* while not finished ... */
+
+#if ( PRNTlevel>=2 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
+#endif
+
+
+    /* Copy the solution X into B (on all processes). */
+    {
+	int_t num_diag_procs, *diag_procs, *diag_len;
+	doublecomplex *work;
+
+	get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
+		       &diag_procs, &diag_len);
+	jj = diag_len[0];
+	for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]);
+	if ( !(work = doublecomplexMalloc_dist(((size_t)jj)*nrhs)) )
+	    ABORT("Malloc fails for work[]");
+	gather_diag_to_all(n, nrhs, x, Glu_persist, Llu,
+			   grid, num_diag_procs, diag_procs, diag_len,
+			   B, ldb, work);
+	SUPERLU_FREE(diag_procs);
+	SUPERLU_FREE(diag_len);
+	SUPERLU_FREE(work);
+    }
+
+    /* Deallocate storage. */
+
+    SUPERLU_FREE(lsum);
+    SUPERLU_FREE(x);
+    SUPERLU_FREE(recvbuf);
+    for (i = 0; i < nub; ++i)
+	if ( Urbs[i] ) {
+	    SUPERLU_FREE(Ucb_indptr[i]);
+	    SUPERLU_FREE(Ucb_valptr[i]);
+	}
+    SUPERLU_FREE(Ucb_indptr);
+    SUPERLU_FREE(Ucb_valptr);
+    SUPERLU_FREE(Urbs);
+    SUPERLU_FREE(bmod);
+    SUPERLU_FREE(brecv);
+#ifdef ISEND_IRECV
+    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
+#endif
+#if defined (ISEND_IRECV) || defined (BSEND)
+    /* Allocated once under either macro, so release it exactly once. */
+    SUPERLU_FREE(send_req);
+#endif
+
+    stat->utime[SOLVE] = SuperLU_timer_() - t_solve_start;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit pzgstrs_Bglobal()");
+#endif
+
+} /* PZGSTRS_BGLOBAL */
+
+
+/*
+ * Gather the components of x vector on the diagonal processes
+ * onto all processes, and combine them into the global vector y.
+ */
+static void
+gather_diag_to_all(int_t n, int_t nrhs, doublecomplex x[],
+		   Glu_persist_t *Glu_persist, LocalLU_t *Llu,
+		   gridinfo_t *grid, int_t num_diag_procs,
+		   int_t diag_procs[], int_t diag_len[],
+		   doublecomplex y[], int_t ldy, doublecomplex work[])
+{
+    /* xsup/ilsum look unused but are presumably read by the block macros
+       (SuperSize, FstBlockC, X_BLK) -- keep both names in scope. */
+    int_t *xsup = Glu_persist->xsup;
+    int_t *ilsum = Llu->ilsum;
+    int_t nsupers = Glu_persist->supno[n-1] + 1;
+    int_t blk, col, e, lk, off, pos, count, p;
+    int iam = grid->iam;
+    int knsupc, root;
+
+    /* One broadcast round per diagonal process; round p carries the
+       supernodes p, p + num_diag_procs, p + 2*num_diag_procs, ... */
+    for (p = 0; p < num_diag_procs; ++p) {
+	root = diag_procs[p];
+	if ( iam != root ) {
+	    /* Receivers take the message length from diag_len[]. */
+	    count = diag_len[p] * nrhs;
+	} else {
+	    /* Owner: pack its X blocks back to back into work[].  Each block
+	       holds knsupc*nrhs contiguous entries starting at X_BLK(lk). */
+	    pos = 0;
+	    for (blk = p; blk < nsupers; blk += num_diag_procs) {
+		knsupc = SuperSize( blk );
+		lk = LBi( blk, grid );
+		off = X_BLK( lk );
+		for (e = 0; e < knsupc * nrhs; ++e) work[pos++] = x[off + e];
+	    }
+	    count = pos;
+	}
+	MPI_Bcast( work, count, SuperLU_MPI_DOUBLE_COMPLEX, root, grid->comm );
+
+	/* Every process unpacks work[] into y: block blk starts at row
+	   FstBlockC(blk), and consecutive right-hand sides are ldy apart. */
+	pos = 0;
+	for (blk = p; blk < nsupers; blk += num_diag_procs) {
+	    knsupc = SuperSize( blk );
+	    off = FstBlockC( blk );
+	    for (col = 0; col < nrhs; ++col) {
+		for (e = 0; e < knsupc; ++e)
+		    y[off + e + col*ldy] = work[pos++];
+	    }
+	}
+
+    }
+} /* GATHER_DIAG_TO_ALL */
+
diff --git a/SRC/pzgstrs_lsum.c b/SRC/pzgstrs_lsum.c
new file mode 100644
index 0000000..23bd35e
--- /dev/null
+++ b/SRC/pzgstrs_lsum.c
@@ -0,0 +1,385 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k]
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ * </pre>
+ */
+
+#include "superlu_zdefs.h"
+
+#define ISEND_IRECV
+
+/*
+ * Function prototypes
+ */
+#ifdef _CRAY
+fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
+		   doublecomplex*, int*, doublecomplex*, int*);
+fortran void CGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, 
+		   int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*);
+_fcd ftcs1;
+_fcd ftcs2;
+_fcd ftcs3;
+#endif
+
+/************************************************************************/
+/*! \brief Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * <pre>
+ * For each of the nlb blocks, rtemp = L_i,k * X[k] is subtracted from lsum[i]
+ * and fmod[i] is decremented.  At zero, lsum[i] is sent to the diagonal
+ * process; on the diagonal process it is added to X[i] and, if frecv is 0,
+ * X[i] is solved, sent along its process column, and the update recurses.
+ * </pre>
+ */
+void zlsum_fmod
+/************************************************************************/
+(
+ doublecomplex *lsum,    /* Sum of local modifications.                        */
+ doublecomplex *x,       /* X array (local)                                    */
+ doublecomplex *xk,      /* X[k].                                              */
+ doublecomplex *rtemp,   /* Result of full matrix-vector multiply.             */
+ int   nrhs,      /* Number of right-hand sides.                        */
+ int   knsupc,    /* Size of supernode k.                               */
+ int_t k,         /* The k-th component of X.                           */
+ int_t *fmod,     /* Modification count for L-solve.                    */
+ int_t nlb,       /* Number of L blocks.                                */
+ int_t lptr,      /* Starting position in lsub[*].                      */
+ int_t luptr,     /* Starting position in lusup[*].                     */
+ int_t *xsup,
+ gridinfo_t *grid,
+ LocalLU_t *Llu,
+ MPI_Request send_req[], /* input/output */
+ SuperLUStat_t *stat
+)
+{
+    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
+    doublecomplex *lusup, *lusup1;
+    doublecomplex *dest;
+    int    iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi;
+    int_t  i, ii, ik, il, ikcol, irow, j, lb, lk, rel;
+    int_t  *lsub, *lsub1, nlb1, lptr1, luptr1;
+    int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
+    int_t  *frecv = Llu->frecv;
+    int_t  **fsendx_plist = Llu->fsendx_plist;
+    MPI_Status status;
+    int test_flag;
+
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    lk = LBj( k, grid ); /* Local block number, column-wise. */
+    lsub = Llu->Lrowind_bc_ptr[lk];
+    lusup = Llu->Lnzval_bc_ptr[lk];
+    nsupr = lsub[1];
+
+    for (lb = 0; lb < nlb; ++lb) {
+	ik = lsub[lptr]; /* Global block number, row-wise. */
+	nbrow = lsub[lptr+1];
+#ifdef _CRAY
+	CGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc,
+	      &alpha, &lusup[luptr], &nsupr, xk,
+	      &knsupc, &beta, rtemp, &nbrow );
+#elif defined (USE_VENDOR_BLAS)
+	zgemm_( "N", "N", &nbrow, &nrhs, &knsupc,
+	       &alpha, &lusup[luptr], &nsupr, xk,
+	       &knsupc, &beta, rtemp, &nbrow, 1, 1 );
+#else
+	zgemm_( "N", "N", &nbrow, &nrhs, &knsupc,
+	       &alpha, &lusup[luptr], &nsupr, xk,
+	       &knsupc, &beta, rtemp, &nbrow );
+#endif
+	stat->ops[SOLVE] += 8 * nbrow * nrhs * knsupc + 2 * nbrow * nrhs;
+   
+	lk = LBi( ik, grid ); /* Local block number, row-wise. */
+	iknsupc = SuperSize( ik );
+	il = LSUM_BLK( lk );
+	dest = &lsum[il];
+	lptr += LB_DESCRIPTOR;
+	rel = xsup[ik]; /* Global row index of block ik. */
+	for (i = 0; i < nbrow; ++i) {
+	    irow = lsub[lptr++] - rel; /* Relative row. */
+	    RHS_ITERATE(j)
+		z_sub(&dest[irow + j*iknsupc],
+		      &dest[irow + j*iknsupc],
+		      &rtemp[i + j*nbrow]);
+	}
+	luptr += nbrow;
+		    
+	if ( (--fmod[lk])==0 ) { /* Local accumulation done. */
+	    ikcol = PCOL( ik, grid );
+	    p = PNUM( myrow, ikcol, grid );
+	    if ( iam != p ) {
+#ifdef ISEND_IRECV
+		MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			   SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm,
+                           &send_req[Llu->SolveMsgSent++] );
+#else
+#ifdef BSEND
+		MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			   SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm );
+#else
+		MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			 SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n",
+		       iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
+#endif
+	    } else { /* Diagonal process: X[i] += lsum[i]. */
+		ii = X_BLK( lk );
+		RHS_ITERATE(j)
+		    for (i = 0; i < iknsupc; ++i)
+			z_add(&x[i + ii + j*iknsupc],
+			      &x[i + ii + j*iknsupc],
+			      &lsum[i + il + j*iknsupc]);
+		if ( frecv[lk]==0 ) { /* Becomes a leaf node. */
+		    fmod[lk] = -1; /* Do not solve X[k] in the future. */
+		    lk = LBj( ik, grid );/* Local block number, column-wise. */
+		    lsub1 = Llu->Lrowind_bc_ptr[lk];
+		    lusup1 = Llu->Lnzval_bc_ptr[lk];
+		    nsupr1 = lsub1[1];
+#ifdef _CRAY
+		    CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
+			  lusup1, &nsupr1, &x[ii], &iknsupc);
+#elif defined (USE_VENDOR_BLAS)
+		    ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+			   lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);
+#else
+		    ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+			   lusup1, &nsupr1, &x[ii], &iknsupc);
+#endif
+		    stat->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs
+			+ 10 * iknsupc * nrhs; /* complex division; the solved block ik has iknsupc rows */
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, ik);
+#endif
+		
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    for (p = 0; p < grid->nprow; ++p) {
+			if ( fsendx_plist[lk][p] != EMPTY ) {
+			    pi = PNUM( p, ikcol, grid );
+#ifdef ISEND_IRECV
+			    MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+				       &send_req[Llu->SolveMsgSent++] );
+#else
+#ifdef BSEND
+			    MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#else
+			    MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 )
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				   iam, x[ii-XK_H], pi);
+#endif
+			}
+                    }
+		    /*
+		     * Perform local block modifications.
+		     */
+		    nlb1 = lsub1[0] - 1;
+		    lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc;
+		    luptr1 = iknsupc; /* Skip diagonal block L(I,I). */
+
+		    zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik,
+			       fmod, nlb1, lptr1, luptr1, xsup,
+			       grid, Llu, send_req, stat);
+		} /* if frecv[lk] == 0 */
+	    } /* if iam == p */
+	} /* if fmod[lk] == 0 */
+
+    } /* for lb ... */
+
+} /* zLSUM_FMOD */
+
+
+/************************************************************************/
+void zlsum_bmod
+/************************************************************************/
+(
+ doublecomplex *lsum,        /* Sum of local modifications.                    */
+ doublecomplex *x,           /* X array (local).                               */
+ doublecomplex *xk,          /* X[k].                                          */
+ int    nrhs,	      /* Number of right-hand sides.                    */
+ int_t  k,            /* The k-th component of X.                       */
+ int_t  *bmod,        /* Modification count for U-solve.                */
+ int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
+ Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
+ int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
+ int_t  *xsup,
+ gridinfo_t *grid,
+ LocalLU_t *Llu,
+ MPI_Request send_req[], /* input/output */
+ SuperLUStat_t *stat
+ )
+{
+/*
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= U_i,k * X[k].
+ */
+    doublecomplex alpha = {1.0, 0.0};
+    int    iam, iknsupc, knsupc, myrow, nsupr, p, pi;
+    int_t  fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow,
+           j, jj, lk, lk1, nub, ub, uptr;
+    int_t  *usub;
+    doublecomplex *uval, *dest, *y;
+    doublecomplex temp;
+    int_t  *lsub;
+    doublecomplex *lusup;
+    int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
+    int_t  *brecv = Llu->brecv;
+    int_t  **bsendx_plist = Llu->bsendx_plist;
+    MPI_Status status;
+    int test_flag;
+
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    knsupc = SuperSize( k );
+    lk = LBj( k, grid ); /* Local block number, column-wise. */
+    nub = Urbs[lk];      /* Number of U blocks in block column lk */
+
+    for (ub = 0; ub < nub; ++ub) {
+	ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
+	usub = Llu->Ufstnz_br_ptr[ik];
+	uval = Llu->Unzval_br_ptr[ik];
+	i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */
+	i += UB_DESCRIPTOR;
+	il = LSUM_BLK( ik );
+	gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */
+	iknsupc = SuperSize( gik );
+	ikfrow = FstBlockC( gik );
+	iklrow = FstBlockC( gik+1 ); /* Used as the exclusive upper row bound of block gik below. */
+
+	RHS_ITERATE(j) {
+	    dest = &lsum[il + j*iknsupc];
+	    y = &xk[j*knsupc];
+	    uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */
+	    for (jj = 0; jj < knsupc; ++jj) {
+		fnz = usub[i + jj]; /* Row at which column jj's stored segment starts. */
+		if ( fnz < iklrow ) { /* Nonzero segment. */
+		    /* AXPY */
+		    for (irow = fnz; irow < iklrow; ++irow) {
+			zz_mult(&temp, &uval[uptr], &y[jj]);
+			z_sub(&dest[irow - ikfrow], &dest[irow - ikfrow],
+			      &temp);
+			++uptr;
+		    }
+		    stat->ops[SOLVE] += 8 * (iklrow - fnz); /* 8 flops charged per multiply-subtract. */
+		}
+	    } /* for jj ... */
+	}
+
+	if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */
+	    gikcol = PCOL( gik, grid );
+	    p = PNUM( myrow, gikcol, grid );
+	    if ( iam != p ) {
+#ifdef ISEND_IRECV
+		MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			   SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm,
+                           &send_req[Llu->SolveMsgSent++] );
+#else
+#ifdef BSEND
+		MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			   SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm );
+#else
+		MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
+			  SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 ) /* NOTE(review): lsum[] elements are doublecomplex but printed with %2.0f -- confirm */
+		printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n",
+		       iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
+#endif
+	    } else { /* Diagonal process: X[i] += lsum[i]. */
+		ii = X_BLK( ik );
+		dest = &x[ii];
+		RHS_ITERATE(j)
+		    for (i = 0; i < iknsupc; ++i)
+			z_add(&dest[i + j*iknsupc], &dest[i + j*iknsupc],
+			      &lsum[i + il + j*iknsupc]);
+		if ( !brecv[ik] ) { /* Becomes a leaf node. */
+		    bmod[ik] = -1; /* Do not solve X[k] in the future. */
+		    lk1 = LBj( gik, grid ); /* Local block number. */
+		    lsub = Llu->Lrowind_bc_ptr[lk1];
+		    lusup = Llu->Lnzval_bc_ptr[lk1];
+		    nsupr = lsub[1];
+#ifdef _CRAY
+		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
+			  lusup, &nsupr, &x[ii], &iknsupc);
+#elif defined (USE_VENDOR_BLAS)
+		    ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);
+#else
+		    ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+			   lusup, &nsupr, &x[ii], &iknsupc);
+#endif
+		    stat->ops[SOLVE] += 4 * iknsupc * (iknsupc + 1) * nrhs
+			+ 10 * iknsupc * nrhs; /* complex division */
+#if ( DEBUGlevel>=2 )
+		    printf("(%2d) Solve X[%2d]\n", iam, gik);
+#endif
+
+		    /*
+		     * Send Xk to process column Pc[k].
+		     */
+		    for (p = 0; p < grid->nprow; ++p) {
+			if ( bsendx_plist[lk1][p] != EMPTY ) {
+			    pi = PNUM( p, gikcol, grid );
+#ifdef ISEND_IRECV
+			    MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
+				       &send_req[Llu->SolveMsgSent++] );
+#else
+#ifdef BSEND
+			    MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#else
+			    MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H,
+				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+#endif
+#endif
+#if ( DEBUGlevel>=2 ) /* NOTE(review): x[] elements are doublecomplex but printed with %2.0f -- confirm */
+			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
+				   iam, x[ii-XK_H], pi);
+#endif
+			}
+                     }
+		    /*
+		     * Perform local block modifications.
+		     */
+		    if ( Urbs[lk1] ) /* Recurse only when block column lk1 has U blocks. */
+			zlsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs,
+				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+				   send_req, stat);
+		} /* if brecv[ik] == 0 */
+	    } /* if iam == p */
+	} /* if bmod[ik] == 0 */
+
+    } /* for ub ... */
+
+} /* zlSUM_BMOD */
+
diff --git a/SRC/pzlangs.c b/SRC/pzlangs.c
new file mode 100644
index 0000000..0681e9c
--- /dev/null
+++ b/SRC/pzlangs.c
@@ -0,0 +1,144 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Returns the value of the one norm, or the Frobenius norm, or the infinity norm, or the element of largest value
+ *
+ * <pre>
+ * File name:	pzlangs.c
+ * History:     Modified from lapack routine ZLANGE
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+
+<pre> 
+    Purpose   
+    =======   
+
+    PZLANGS returns the value of the one norm, or the Frobenius norm, or 
+    the infinity norm, or the element of largest absolute value of a 
+    complex matrix A.   
+
+    Description   
+    ===========   
+
+    PZLANGS returns the value   
+
+       PZLANGS = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
+                 (   
+                 ( norm1(A),         NORM = '1', 'O' or 'o'   
+                 (   
+                 ( normI(A),         NORM = 'I' or 'i'   
+                 (   
+                 ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
+
+    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
+    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
+    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
+    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
+
+    Arguments   
+    =========   
+
+    NORM    (input) CHARACTER*1   
+            Specifies the value to be returned in PZLANGS as described above.   
+    A       (input) SuperMatrix*
+            The M by N sparse matrix A. 
+    GRID    (input) gridinfo_t*
+            The 2D process mesh.
+   ===================================================================== 
+</pre>
+*/
+
+double pzlangs(char *norm, SuperMatrix *A, gridinfo_t *grid)
+{   
+    /* Local variables */
+    NRformat_loc *Astore;
+    int_t    m_loc;
+    doublecomplex   *Aval;
+    int_t    i, j, jcol;
+    double   value=0., sum;
+    double   *rwork;      /* Local per-column sums for the one norm. */
+    double   tempvalue;   /* Receives the result of the MPI_MAX reductions. */
+    double   *temprwork;  /* Column sums reduced over grid->comm. */
+
+    Astore = (NRformat_loc *) A->Store;
+    m_loc = Astore->m_loc;
+    Aval   = (doublecomplex *) Astore->nzval;
+    
+    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
+	value = 0.;
+    } else if ( strncmp(norm, "M", 1)==0 ) {
+	/* Find max(abs(A(i,j))). */
+	value = 0.;
+	for (i = 0; i < m_loc; ++i) {
+	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
+		value = SUPERLU_MAX( value, slud_z_abs(&Aval[j]) );
+	}
+
+	MPI_Allreduce(&value, &tempvalue, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+	value = tempvalue;
+
+    } else if ( strncmp(norm, "O", 1)==0 || *(unsigned char *)norm == '1') {
+	/* Find norm1(A). */
+	value = 0.;
+#if 0
+	for (j = 0; j < A->ncol; ++j) {
+	    sum = 0.;
+	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
+		sum += fabs(Aval[i]);
+	    value = SUPERLU_MAX(value,sum);
+	}
+#else /* XSL ==> */
+	if ( !(rwork = (double *) doubleCalloc_dist(A->ncol)) )
+	    ABORT("doubleCalloc_dist fails for rwork.");
+	for (i = 0; i < m_loc; ++i) {
+	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+	        jcol = Astore->colind[j];
+		rwork[jcol] += slud_z_abs(&Aval[j]);
+	    }
+	}
+
+	if ( !(temprwork = (double *) doubleCalloc_dist(A->ncol)) )
+	    ABORT("doubleCalloc_dist fails for temprwork.");
+	MPI_Allreduce(rwork, temprwork, A->ncol, MPI_DOUBLE, MPI_SUM, grid->comm);
+	value = 0.;
+	for (j = 0; j < A->ncol; ++j) {
+	    value = SUPERLU_MAX(value, temprwork[j]);
+	}
+	SUPERLU_FREE (temprwork);
+	SUPERLU_FREE (rwork);
+#endif	
+    } else if ( strncmp(norm, "I", 1)==0 ) {
+	/* Find normI(A): the largest row sum of absolute values. */
+	value = 0.;
+	for (i = 0; i < m_loc; ++i) {
+	    sum = 0.; /* Restart for every row; a running total would sum all rows. */
+	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
+	        sum += slud_z_abs(&Aval[j]);
+	    value = SUPERLU_MAX(value, sum);
+	}
+	MPI_Allreduce(&value, &tempvalue, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+	value = tempvalue;
+
+    } else if ( strncmp(norm, "F", 1)==0 || strncmp(norm, "E", 1)==0 ) {
+	/* Find normF(A). */
+	ABORT("Not implemented.");
+    } else {
+	ABORT("Illegal norm specified.");
+    }
+    
+    return (value);
+
+} /* pzlangs */
diff --git a/SRC/pzlaqgs.c b/SRC/pzlaqgs.c
new file mode 100644
index 0000000..988fa60
--- /dev/null
+++ b/SRC/pzlaqgs.c
@@ -0,0 +1,152 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Equilibrates a general sparse M by N matrix
+ *
+ * <pre>
+ * File name:	pzlaqgs.c
+ * History:     Modified from LAPACK routine ZLAQGE
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+
+<pre>
+    Purpose   
+    =======   
+
+    PZLAQGS equilibrates a general sparse M by N matrix A using the row
+    and column scaling factors in the vectors R and C.   
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+
+    Arguments   
+    =========   
+
+    A       (input/output) SuperMatrix*
+            On exit, the equilibrated matrix.  See EQUED for the form of 
+            the equilibrated matrix. The type of A can be:
+	    Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
+	    
+    R       (input) double*, dimension (A->nrow)
+            The row scale factors for A.
+	    
+    C       (input) double*, dimension (A->ncol)
+            The column scale factors for A.
+	    
+    ROWCND  (input) double
+            Ratio of the smallest R(i) to the largest R(i).
+	    
+    COLCND  (input) double
+            Ratio of the smallest C(i) to the largest C(i).
+	    
+    AMAX    (input) double
+            Absolute value of largest matrix entry.
+	    
+    EQUED   (output) char*
+            Specifies the form of equilibration that was done.   
+            = 'N':  No equilibration   
+            = 'R':  Row equilibration, i.e., A has been premultiplied by  
+                    diag(R).   
+            = 'C':  Column equilibration, i.e., A has been postmultiplied  
+                    by diag(C).   
+            = 'B':  Both row and column equilibration, i.e., A has been
+                    replaced by diag(R) * A * diag(C).   
+
+    Internal Parameters   
+    ===================   
+
+    THRESH is a threshold value used to decide if row or column scaling   
+    should be done based on the ratio of the row or column scaling   
+    factors.  If ROWCND < THRESH, row scaling is done, and if   
+    COLCND < THRESH, column scaling is done.   
+
+    LARGE and SMALL are threshold values used to decide if row scaling   
+    should be done based on the absolute size of the largest matrix   
+    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
+
+    ===================================================================== 
+</pre>
+*/
+
+void
+pzlaqgs(SuperMatrix *A, double *r, double *c, 
+       double rowcnd, double colcnd, double amax, char *equed)
+{
+
+#define THRESH    (0.1)
+
+    /*
+     * Scale the locally stored rows of A in place by diag(R) and/or diag(C),
+     * as selected by ROWCND, COLCND and AMAX, and report the choice in EQUED.
+     */
+
+    /* Local variables */
+    NRformat_loc *store;
+    doublecomplex *nzval;
+    int_t row, pos, grow, col, nrow_loc;
+    int use_r, use_c;
+    double lo_bound, hi_bound, factor;
+
+    /* Nothing to scale in an empty matrix. */
+    if (A->nrow <= 0 || A->ncol <= 0) {
+	*(unsigned char *)equed = 'N';
+	return;
+    }
+
+    store = A->Store;
+    nzval = store->nzval;
+    nrow_loc = store->m_loc;
+
+    /* AMAX outside [lo_bound, hi_bound] forces row scaling. */
+    lo_bound = dmach_dist("Safe minimum") / dmach_dist("Precision");
+    hi_bound = 1. / lo_bound;
+
+    /* Rows are scaled unless ROWCND and AMAX are all acceptable;
+       columns are scaled unless COLCND is acceptable. */
+    use_r = !(rowcnd >= THRESH && amax >= lo_bound && amax <= hi_bound);
+    use_c = !(colcnd >= THRESH);
+
+    if (!use_r && !use_c) {
+	/* No equilibration */
+	*(unsigned char *)equed = 'N';
+	return;
+    }
+
+    /* Local row `row` corresponds to entry `grow` of r[]. */
+    grow = store->fst_row;
+    for (row = 0; row < nrow_loc; ++row, ++grow) {
+	for (pos = store->rowptr[row]; pos < store->rowptr[row+1]; ++pos) {
+	    if (use_c) {
+		col = store->colind[pos];
+		factor = use_r ? r[grow] * c[col] : c[col];
+	    } else {
+		factor = r[grow];
+	    }
+	    zd_mult(&nzval[pos], &nzval[pos], factor);
+	}
+    }
+
+    /* Report which scalings were applied. */
+    if (use_r && use_c)
+	*(unsigned char *)equed = 'B';
+    else if (use_r)
+	*(unsigned char *)equed = 'R';
+    else
+	*(unsigned char *)equed = 'C';
+
+    return;
+
+} /* pzlaqgs */
+
diff --git a/SRC/pzsymbfact_distdata.c b/SRC/pzsymbfact_distdata.c
new file mode 100644
index 0000000..34cf021
--- /dev/null
+++ b/SRC/pzsymbfact_distdata.c
@@ -0,0 +1,1973 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Redistribute the symbolic structure of L and U from the distribution
+ *
+ * <pre>
+ * -- Parallel symbolic factorization auxiliary routine (version 2.3) --
+ * -- Distributes the data from parallel symbolic factorization 
+ * -- to numeric factorization
+ * INRIA France -  July 1, 2004
+ * Laura Grigori
+ *
+ * November 1, 2007
+ * February 20, 2008
+ * October 15, 2008
+ * </pre>
+ */
+
+/* limits.h:  the largest positive integer (INT_MAX) */
+#include <limits.h>
+
+#include "superlu_zdefs.h"
+#include "psymbfact.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ * 
+ * Redistribute the symbolic structure of L and U from the distribution
+ * used in the parallel symbolic factorization step to the distribution
+ * used in the parallel numeric factorization step.  On exit, the L and U
+ * structure for the 2D distribution used in the numeric factorization step is
+ * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal 
+ * information is also computed and it is stored in Glu_persist->supno
+ * and Glu_persist->xsup.
+ *
+ * This routine allocates memory for storing the structure of L and U
+ * and the supernodes information.  This represents the arrays:
+ * p_xlsub, p_lsub, p_xusub, p_usub,
+ * Glu_persist->supno,  Glu_persist->xsup.
+ *
+ * This routine also deallocates memory allocated during symbolic 
+ * factorization routine.  That is, the following arrays are freed:
+ * Pslu_freeable->xlsub,  Pslu_freeable->lsub, 
+ * Pslu_freeable->xusub, Pslu_freeable->usub, 
+ * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, 
+ * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (Input) int_t
+ *        Order of the input matrix
+ * Pslu_freeable  (Input) Pslu_freeable_t *
+ *        Local L and U structure, 
+ *        global to local indexing information.
+ * 
+ * Glu_persist (Output) Glu_persist_t *
+ *        Stores on output the information on supernodes mapping.
+ * 
+ * p_xlsub (Output) int_t **
+ *         Pointer to structure of L distributed on a 2D grid 
+ *         of processors, stored by columns.
+ * 
+ * p_lsub  (Output) int_t **
+ *         Structure of L distributed on a 2D grid of processors, 
+ *         stored by columns.
+ *
+ * p_xusub (Output) int_t **
+ *         Pointer to structure of U distributed on a 2D grid 
+ *         of processors, stored by rows.
+ * 
+ * p_usub  (Output) int_t **
+ *         Structure of U distributed on a 2D grid of processors, 
+ *         stored by rows.
+ * 
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes allocated on return from the dist_symbLU.
+ *   > 0, number of bytes allocated in this routine when out of memory.
+ *        (an approximation).
+ * </pre>
+ */
+
+static float
+dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, 
+	     Glu_persist_t *Glu_persist, 
+	     int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub,
+	     gridinfo_t *grid
+	     )
+{
+  int   iam, nprocs, pc, pr, p, np, p_diag;
+  int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, 
+    *tmp_ptrToSend, *mem;
+  int_t *nnzToRecv_l, *nnzToRecv_u;
+  int_t *send_1, *send_2, nsend_1, nsend_2;
+  int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind;
+  int_t nsupers, nsupers_i, nsupers_j;
+  int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc;
+  int_t maxszsn, maxNvtcsPProc;
+  int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s;
+  int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s;
+  int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n;
+  int_t *xsub_s, *sub_s, *xsub_n, *sub_n;
+  int_t *globToLoc, nvtcs_loc;
+  int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc,
+    RecvCnt_l, RecvCnt_u, ind_loc;
+  int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc;
+  int_t nelts, isize;
+  float memAux;  /* Memory used during this routine and freed on return */
+  float memRet; /* Memory allocated and not freed on return */
+  int_t iword, dword;
+  
+  /* ------------------------------------------------------------
+     INITIALIZATION.
+     ------------------------------------------------------------*/
+  iam = grid->iam;
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Enter dist_symbLU()");
+#endif
+  nprocs = (int) grid->nprow * grid->npcol;
+  xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub;
+  xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub;
+  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
+  globToLoc     = Pslu_freeable->globToLoc;
+  nvtcs_loc     = Pslu_freeable->nvtcs_loc;
+  xsup_beg_s    = Pslu_freeable->xsup_beg_loc;
+  xsup_end_s    = Pslu_freeable->xsup_end_loc;
+  supno_s       = Pslu_freeable->supno_loc;
+  rcv_luind     = NULL;
+  iword = sizeof(int_t);
+  dword = sizeof(doublecomplex);
+  memAux = 0.; memRet = 0.;
+  
+  mem           = intCalloc_dist(12 * nprocs);
+  if (!mem)
+    return (ERROR_RET);
+  memAux     = (float) (12 * nprocs * sizeof(int_t));
+  nnzToRecv     = mem;
+  nnzToSend     = nnzToRecv + 2*nprocs;
+  nnzToSend_l   = nnzToSend + 2 * nprocs;
+  nnzToSend_u   = nnzToSend_l + nprocs;
+  send_1        = nnzToSend_u + nprocs;
+  send_2        = send_1 + nprocs;
+  tmp_ptrToSend = send_2 + nprocs;
+  nnzToRecv_l   = tmp_ptrToSend + nprocs;
+  nnzToRecv_u   = nnzToRecv_l + nprocs;
+  
+  ptrToSend = nnzToSend;
+  ptrToRecv = nnzToSend + nprocs;
+
+  nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int));
+  intBuf1 = nvtcs + nprocs;
+  intBuf2 = nvtcs + 2 * nprocs;
+  intBuf3 = nvtcs + 3 * nprocs;
+  intBuf4 = nvtcs + 4 * nprocs;
+  memAux += 5 * nprocs * sizeof(int);
+
+  maxszsn   = sp_ienv_dist(3);
+  
+  /* Allocate space for storing Glu_persist_n. */
+  if ( !(supno_n = intMalloc_dist(n+1)) ) {
+    fprintf (stderr, "Malloc fails for supno_n[].");
+    return (memAux);
+  }
+  memRet += (float) ((n+1) * sizeof(int_t));
+
+  /* ------------------------------------------------------------
+     DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION
+     ------------------------------------------------------------*/
+  
+  if (nvtcs_loc > INT_MAX)
+    ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n");
+  intNvtcs_loc = (int) nvtcs_loc;
+  MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT,
+	      0, grid->comm);
+
+  if (!iam) {
+    /* set ptrToRecv to point to the beginning of the data for
+       each processor */
+    for (k = 0, p = 0; p < nprocs; p++) {
+      ptrToRecv[p] = k;
+      k += nvtcs[p];
+    }
+  }
+  
+  if (nprocs > 1) {
+    temp = NULL;
+    if (!iam ) {
+      if ( !(temp = intMalloc_dist (n+1)) ) {
+	fprintf (stderr, "Malloc fails for temp[].");
+	return (memAux + memRet);
+      }
+      memAux += (float) (n+1) * iword;
+    }
+#if defined (_LONGINT)
+    for (p=0; p<nprocs; p++) {
+      if (ptrToRecv[p] > INT_MAX)
+	ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
+      intBuf1[p] = (int) ptrToRecv[p];
+    }
+#else  /* Default */
+    intBuf1 = ptrToRecv;
+#endif
+    MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, 
+		 temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm);
+  }
+  else
+    temp = supno_s;
+
+  if (!iam) {
+    nsupers = 0;
+    p = (int) OWNER( globToLoc[0] );
+    gb = temp[ptrToRecv[p]];
+    supno_n[0] = nsupers;
+    ptrToRecv[p] ++;
+    szsn = 1;
+    for (j = 1; j < n; j ++) {
+      if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) {
+	nsupers ++;
+	p  = (int) OWNER( globToLoc[j] );
+	gb = temp[ptrToRecv[p]];
+	szsn = 1;
+      }
+      else {
+	szsn ++;
+      }
+      ptrToRecv[p] ++;
+      supno_n[j] = nsupers;
+    }
+    nsupers++;
+    if (nprocs > 1) {
+      SUPERLU_FREE (temp);
+      memAux -= (float) (n+1) * iword;
+    }
+    supno_n[n] = nsupers;
+  }
+
+  /* reset to 0 nnzToSend */
+  for (p = 0; p < 2 *nprocs; p++)
+    nnzToSend[p] = 0;
+  
+  MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm);
+  nsupers = supno_n[n];
+  /* Allocate space for storing Glu_persist_n. */
+  if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) {
+    fprintf (stderr, "Malloc fails for xsup_n[].");
+    return (memAux + memRet);
+  }
+  memRet += (float) (nsupers+1) * iword;  
+
+  /* ------------------------------------------------------------
+     COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
+     THEN ALLOCATE SPACE.
+     THIS ACCOUNTS FOR THE FIRST PASS OF L and U.
+     ------------------------------------------------------------*/
+  gb = EMPTY;
+  for (i = 0; i < n; i++) {
+    if (gb != supno_n[i]) {
+      /* a new supernode starts */
+      gb = supno_n[i];
+      xsup_n[gb] = i;
+    }
+  }
+  xsup_n[nsupers] = n;
+  
+  for (p = 0; p < nprocs; p++) {
+    send_1[p] = FALSE;
+    send_2[p] = FALSE;
+  }
+  for (gb_n = 0; gb_n < nsupers; gb_n ++) {
+    i = xsup_n[gb_n];
+    if (iam == (int) OWNER( globToLoc[i] )) {
+      pc = PCOL( gb_n, grid );
+      pr = PROW( gb_n, grid );
+      p_diag = PNUM( pr, pc, grid);
+      
+      i_loc = LOCAL_IND( globToLoc[i] );
+      gb_s  = supno_s[i_loc];
+      fst_s = xsup_beg_s[gb_s];
+      lst_s = xsup_end_s[gb_s];
+      fst_s_l = LOCAL_IND( globToLoc[fst_s] );
+      for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) {
+	k = lsub_s[j];
+	if (k >= i) {
+	  gb = supno_n[k];
+	  p = (int) PNUM( PROW(gb, grid), pc, grid );
+	  nnzToSend[2*p] ++;
+	  send_1[p] = TRUE;
+	}
+      }
+      for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) {
+	k = usub_s[j];
+	if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) {
+	  gb = supno_n[k];
+	  p = PNUM( pr, PCOL(gb, grid), grid);
+	  nnzToSend[2*p+1] ++;	
+	  send_2[p] = TRUE;
+	}
+      }
+      
+      nsend_2 = 0;
+      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
+	nnzToSend[2*p+1] += 2;
+	if (send_2[p])  nsend_2 ++;	  
+      }
+      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) 
+	if (send_2[p] || p == p_diag) {
+	  if (p == p_diag && !send_2[p])
+	    nnzToSend[2*p+1] += nsend_2;
+	  else
+	    nnzToSend[2*p+1] += nsend_2-1;
+	  send_2[p] = FALSE;
+	}
+      nsend_1 = 0;
+      for (p = pc; p < nprocs; p += grid->npcol) {
+	nnzToSend[2*p] += 2;
+	if (send_1[p]) nsend_1 ++;
+      }
+      for (p = pc; p < nprocs; p += grid->npcol) 
+	if (send_1[p]) {
+	  nnzToSend[2*p] += nsend_1-1;
+	  send_1[p] = FALSE;
+	}
+	else
+	  nnzToSend[2*p] += nsend_1;
+    }
+  }
+  
+  /* All-to-all communication */
+  MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t,
+		grid->comm);
+  
+  nnz_loc_l = nnz_loc_u = 0;
+  SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0;  
+  for (p = 0; p < nprocs; p++) {
+    if ( p != iam ) {
+      SendCnt_l += nnzToSend[2*p];   nnzToSend_l[p] = nnzToSend[2*p];
+      SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; 
+      RecvCnt_l += nnzToRecv[2*p];   nnzToRecv_l[p] = nnzToRecv[2*p];
+      RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1];
+    } else {
+      nnz_loc_l += nnzToRecv[2*p];
+      nnz_loc_u += nnzToRecv[2*p+1];
+      nnzToSend_l[p] = 0; nnzToSend_u[p] = 0;
+      nnzToRecv_l[p] = nnzToRecv[2*p]; 
+      nnzToRecv_u[p] = nnzToRecv[2*p+1];
+    }
+  }
+  
+  /* Allocate space for storing the symbolic structure after redistribution. */
+  nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+  nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
+  if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) {
+    fprintf (stderr, "Malloc fails for xlsub_n[].");
+    return (memAux + memRet);
+  }
+  memRet += (float) (nsupers_j+1) * iword;
+
+  if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) {
+    fprintf (stderr, "Malloc fails for xusub_n[].");
+    return (memAux + memRet);
+  }
+  memRet += (float) (nsupers_i+1) * iword;  
+
+  /* Allocate temp storage for sending/receiving the L/U symbolic structure. */
+  if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) {
+    if (!(rcv_luind = 
+	  intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) {
+      fprintf (stderr, "Malloc fails for rcv_luind[].");
+      return (memAux + memRet);
+    }
+    memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) 
+      * iword;
+  }
+  if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
+    if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) {
+      fprintf (stderr, "Malloc fails for index[].");
+      return (memAux + memRet);
+    }
+    memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
+  } 
+  
+  /* ------------------------------------------------------------------
+     LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND.
+     THIS ACCOUNTS FOR THE SECOND PASS OF L and U.
+     ------------------------------------------------------------------*/
+  sendL = TRUE;
+  sendU = FALSE;
+  while (sendL || sendU) {
+    if (sendL) {
+      xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n;
+      nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l;
+    }
+    if (sendU) {
+      xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n;
+      nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u;
+    }
+    for (i = 0, j = 0, p = 0; p < nprocs; p++) {
+      if ( p != iam ) {
+	ptrToSend[p] = i;  i += nnzToSend[p];
+      }
+      ptrToRecv[p] = j;  j += nnzToRecv[p];
+    }
+    nnzToRecv[iam] = 0;
+    
+    ind_loc = ptrToRecv[iam];
+    for (gb_n = 0; gb_n < nsupers; gb_n++) {
+      nsend_2 = 0;    
+      i = xsup_n[gb_n];
+      if (iam == OWNER( globToLoc[i] )) {
+	pc = PCOL( gb_n, grid );
+	pr = PROW( gb_n, grid );
+	p_diag = PNUM( pr, pc, grid );
+	
+	i_loc = LOCAL_IND( globToLoc[i] );
+	gb_s  = supno_s[i_loc];
+	fst_s = xsup_beg_s[gb_s];
+	lst_s = xsup_end_s[gb_s];
+	fst_s_l = LOCAL_IND( globToLoc[fst_s] );
+
+	if (sendL) {
+	  p = pc;                np = grid->nprow;	  
+	} else {
+	  p = pr * grid->npcol;  np = grid->npcol;
+	}
+	for (j = 0; j < np; j++) {
+	  if (p == iam) {
+	    rcv_luind[ind_loc] = gb_n;
+	    rcv_luind[ind_loc+1] = 0;
+	    tmp_ptrToSend[p] = ind_loc + 1;
+	    ind_loc += 2;	 
+	  }
+	  else {
+	    snd_luind[ptrToSend[p]] = gb_n;
+	    snd_luind[ptrToSend[p]+1] = 0;
+	    tmp_ptrToSend[p] = ptrToSend[p] + 1;
+	    ptrToSend[p] += 2;	 
+	  }
+	  if (sendL) p += grid->npcol;
+	  if (sendU) p++;
+	}
+	for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) {
+	  k = sub_s[j];
+	  if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) {
+	    gb = supno_n[k];
+	    if (sendL)
+	      p = PNUM( PROW(gb, grid), pc, grid );
+	    else 
+	      p = PNUM( pr, PCOL(gb, grid), grid);
+	    if (send_1[p] == FALSE) {
+	      send_1[p] = TRUE;
+	      send_2[nsend_2] = k; nsend_2 ++;
+	    }
+	    if (p == iam) {
+	      rcv_luind[ind_loc] = k;  ind_loc++;
+	      if (sendL)
+		xsub_n[LBj( gb_n, grid )] ++;
+	      else
+		xsub_n[LBi( gb_n, grid )] ++;
+	    }
+	    else {
+	      snd_luind[ptrToSend[p]] = k;
+	      ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
+	    }
+	  }
+	}
+	if (sendL)
+	  for (p = pc; p < nprocs; p += grid->npcol) {
+	      for (k = 0; k < nsend_2; k++) {
+		gb = supno_n[send_2[k]];
+		if (PNUM(PROW(gb, grid), pc, grid) != p) {
+		  if (p == iam) {
+		    rcv_luind[ind_loc] = send_2[k];  ind_loc++;
+		    xsub_n[LBj( gb_n, grid )] ++;
+		  }
+		  else {
+		    snd_luind[ptrToSend[p]] = send_2[k];
+		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
+		  }
+		}
+	      }
+	      send_1[p] = FALSE;
+	  }  
+	if (sendU)
+	  for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
+	    if (send_1[p] || p == p_diag) {	      
+	      for (k = 0; k < nsend_2; k++) {
+		gb = supno_n[send_2[k]];
+		if(PNUM( pr, PCOL(gb, grid), grid) != p) {
+		  if (p == iam) {
+		    rcv_luind[ind_loc] = send_2[k];  ind_loc++;
+		    xsub_n[LBi( gb_n, grid )] ++;
+		  }
+		  else {
+		    snd_luind[ptrToSend[p]] = send_2[k];
+		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
+		  }	     
+		}
+	      } 
+	      send_1[p] = FALSE;
+	    }
+	  }
+      }
+    }
+    
+    /* reset ptrToSnd to point to the beginning of the data for
+       each processor (structure needed in MPI_Alltoallv) */
+    for (i = 0, p = 0; p < nprocs; p++) {
+      ptrToSend[p] = i;  i += nnzToSend[p];
+    }
+
+    /* ------------------------------------------------------------
+       PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
+       Note: it uses MPI_Alltoallv.
+       ------------------------------------------------------------*/
+    if (nprocs > 1) {
+#if defined (_LONGINT)
+      nnzToSend[iam] = 0;
+      for (p=0; p<nprocs; p++) {
+	if (nnzToSend[p] > INT_MAX || ptrToSend[p] > INT_MAX ||
+	    nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX)
+	  ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
+	intBuf1[p] = (int) nnzToSend[p];
+	intBuf2[p] = (int) ptrToSend[p];
+	intBuf3[p] = (int) nnzToRecv[p];
+	intBuf4[p] = (int) ptrToRecv[p];
+      }
+#else  /* Default */
+      intBuf1 = nnzToSend;  intBuf2 = ptrToSend;
+      intBuf3 = nnzToRecv;  intBuf4 = ptrToRecv;
+#endif
+
+      MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, 
+		     rcv_luind, intBuf3, intBuf4, mpi_int_t,
+		     grid->comm);
+    }
+    if (sendL)
+      nnzToRecv[iam] = nnz_loc_l;
+    else 
+      nnzToRecv[iam] = nnz_loc_u;
+    
+    /* ------------------------------------------------------------
+       DEALLOCATE TEMPORARY STORAGE.
+       -------------------------------------------------------------*/
+    if (sendU) 
+      if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
+	SUPERLU_FREE (snd_luind);
+	memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
+      }
+    
+    /* ------------------------------------------------------------
+       CONVERT THE FORMAT.
+       ------------------------------------------------------------*/
+    /* Initialize the array of column of L/ row of U pointers */
+    k = 0;
+    for (p = 0; p < nprocs; p ++) {
+      if (p != iam) {
+	i = k;
+	while (i < k + nnzToRecv[p]) {
+	  gb = rcv_luind[i];
+	  nelts = rcv_luind[i+1];
+	  if (sendL)
+	    xsub_n[LBj( gb, grid )] = nelts;
+	  else
+	    xsub_n[LBi( gb, grid )] = nelts;
+	  i += nelts + 2;
+	}
+      }
+      k += nnzToRecv[p];
+    }
+
+    if (sendL) j = nsupers_j;
+    else j = nsupers_i;
+    k = 0; 
+    isize = xsub_n[0];
+    xsub_n[0] = 0; 
+    for (gb_l = 1; gb_l < j; gb_l++) {
+      k += isize;
+      isize = xsub_n[gb_l];
+      xsub_n[gb_l] = k;
+    }
+    xsub_n[gb_l] = k + isize;
+    nnz_loc = xsub_n[gb_l];
+    if (sendL) {
+      lsub_n = NULL;
+      if (nnz_loc) {
+	if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) {
+	  fprintf (stderr, "Malloc fails for lsub_n[].");
+	  return (memAux + memRet);
+	}
+	memRet += (float) (nnz_loc * iword);
+      }
+      sub_n = lsub_n;
+    }
+    if (sendU) {
+      usub_n = NULL;
+      if (nnz_loc) {
+	if ( !(usub_n = intMalloc_dist(nnz_loc)) ) {
+	  fprintf (stderr, "Malloc fails for usub_n[].");
+	  return (memAux + memRet);
+	}
+	memRet += (float) (nnz_loc * iword);
+      }
+      sub_n = usub_n;
+    }
+    
+    /* Copy the data into the L column / U row oriented storage */
+    k = 0;
+    for (p = 0; p < nprocs; p++) {
+      i = k;
+      while (i < k + nnzToRecv[p]) {
+	gb = rcv_luind[i];
+	if (gb >= nsupers)
+	  printf ("Pe[%d] p %d gb " IFMT " nsupers " IFMT " i " IFMT " i-k " IFMT "\n",
+		  iam, p, gb, nsupers, i, i-k);
+	i += 2;
+	if (sendL) gb_l = LBj( gb, grid );
+	if (sendU) gb_l = LBi( gb, grid );
+	for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) {
+	  sub_n[j] = rcv_luind[i];
+	}
+      }      
+      k += nnzToRecv[p];
+    }
+    if (sendL) {
+      sendL = FALSE;  sendU = TRUE;
+    }
+    else
+      sendU = FALSE;
+  }
+
+  /* deallocate memory allocated during symbolic factorization routine */
+  if (rcv_luind != NULL) {
+    SUPERLU_FREE (rcv_luind);
+    memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword;
+  }
+  SUPERLU_FREE (mem);  
+  memAux -= (float) (12 * nprocs * iword);
+  SUPERLU_FREE(nvtcs);
+  memAux -= (float) (5 * nprocs * sizeof(int));
+  
+  if (xlsub_s != NULL) {
+    SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s);
+  }
+  if (xusub_s != NULL) {
+    SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s);
+  }
+  SUPERLU_FREE (globToLoc); 
+  if (supno_s != NULL) {
+    SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s);
+    SUPERLU_FREE (supno_s);
+  }
+  
+  Glu_persist->supno = supno_n;  Glu_persist->xsup  = xsup_n;
+  *p_xlsub = xlsub_n; *p_lsub = lsub_n;
+  *p_xusub = xusub_n; *p_usub = usub_n;
+
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Exit dist_symbLU()");
+#endif
+  
+  return (-memRet);
+}
+ 
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Re-distribute A on the 2D process mesh.  The lower part is
+ *   stored using a column format and the upper part
+ *   is stored using a row format.
+ * 
+ * Arguments
+ * =========
+ * 
+ * A      (Input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (Input) ScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_persist  (Input) Glu_persist_t *
+ *        Information on supernodes mapping.
+ * 
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * p_ainf_colptr (Output) int_t**
+ *         Pointer to the lower part of A distributed on a 2D grid 
+ *         of processors, stored by columns.
+ *
+ * p_ainf_rowind (Output) int_t**
+ *         Structure of the lower part of A distributed on a
+ *         2D grid of processors, stored by columns.
+ *
+ * p_ainf_val    (Output) doublecomplex**
+ *         Numerical values of the lower part of A, distributed on a 
+ *         2D grid of processors, stored by columns.
+ *
+ * p_asup_rowptr (Output) int_t**
+ *         Pointer to the upper part of A distributed on a 2D grid 
+ *         of processors, stored by rows.
+ *
+ * p_asup_colind (Output) int_t**
+ *         Structure of the upper part of A distributed on a
+ *         2D grid of processors, stored by rows.
+ *
+ * p_asup_val    (Output) doublecomplex**
+ *         Numerical values of the upper part of A, distributed on a 
+ *         2D grid of processors, stored by rows.
+ *
+ * ilsum_i  (Input) int_t *
+ *       Starting position of each supernode in 
+ *       the full array (local, block row wise).
+ *
+ * ilsum_j  (Input) int_t *
+ *       Starting position of each supernode in 
+ *       the full array (local, block column wise).
+ *
+ * Return value
+ * ============
+ *   < 0, minus the number of bytes allocated by zdist_A and kept on return.
+ *   > 0, number of bytes allocated when out of memory
+ *        (an approximation).
+ * </pre>
+ */
+ 
+static float
+zdist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
+        Glu_persist_t *Glu_persist, gridinfo_t *grid,
+        int_t **p_ainf_colptr, int_t **p_ainf_rowind, doublecomplex **p_ainf_val,
+        int_t **p_asup_rowptr, int_t **p_asup_colind, doublecomplex **p_asup_val,
+        int_t *ilsum_i, int_t *ilsum_j
+        )
+{
+  /*
+   * Outline of the routine:
+   *   1. count, per destination process, the local nonzeros of A;
+   *   2. exchange the counts (MPI_Alltoall) and allocate the buffers;
+   *   3. pack (row, column, value) triplets and exchange them with
+   *      MPI_Isend / MPI_Recv;
+   *   4. convert the triplets: entries with gbi >= gbj go to the
+   *      column-oriented "ainf" arrays, the others to the row-oriented
+   *      "asup" arrays.
+   *
+   * NOTE(review): the early returns taken when an allocation fails do not
+   * release the buffers obtained earlier in the routine.
+   */
+  int    iam, p, procs;
+  NRformat_loc *Astore;
+  int_t  *perm_r; /* row permutation vector */
+  int_t  *perm_c; /* column permutation vector */
+  int_t  i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize;
+  int_t  nsupers, nsupers_i, nsupers_j;
+  int_t  nnz_loc, nnz_loc_ainf, nnz_loc_asup;    /* number of local nonzeros */
+  int_t  SendCnt; /* number of remote nonzeros to be sent */
+  int_t  RecvCnt; /* number of remote nonzeros to be received */
+  int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
+  doublecomplex *asup_val, *ainf_val;
+  int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
+  int_t  *ia, *ja, **ia_send, *index, *itemp;
+  int_t  *ptr_to_send;
+  doublecomplex *aij, **aij_send, *nzval, *dtemp;
+  doublecomplex *nzval_a;
+  MPI_Request *send_req;
+  MPI_Status  status;
+  int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
+  int_t *supno = Glu_persist->supno;
+  float memAux;  /* Memory used during this routine and freed on return */
+  float memRet; /* Memory allocated and not freed on return */
+  int_t iword, dword, szbuf;
+
+  /* ------------------------------------------------------------
+     INITIALIZATION.
+     ------------------------------------------------------------*/
+  iam = grid->iam;
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Enter zdist_A()");
+#endif
+  iword = sizeof(int_t);
+  /* Every value buffer below holds doublecomplex entries, so the memory
+     accounting must use the size of a doublecomplex, not of a double. */
+  dword = sizeof(doublecomplex);
+
+  perm_r = ScalePermstruct->perm_r;
+  perm_c = ScalePermstruct->perm_c;
+  procs = grid->nprow * grid->npcol;
+  Astore = (NRformat_loc *) A->Store;
+  n = A->ncol;
+  m_loc = Astore->m_loc;
+  fst_row = Astore->fst_row;
+  /* One zero-initialized allocation of 2*procs entries holds both count
+     arrays: nnzToRecv is the first half, nnzToSend the second. */
+  if (!(nnzToRecv = intCalloc_dist(2*procs))) {
+    fprintf (stderr, "Malloc fails for nnzToRecv[].");
+    return (ERROR_RET);
+  }
+  memAux = (float) (2 * procs * iword);
+  memRet = 0.;
+  nnzToSend = nnzToRecv + procs;
+  nsupers  = supno[n-1] + 1;
+
+  /* ------------------------------------------------------------
+     COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
+     THEN ALLOCATE SPACE.
+     THIS ACCOUNTS FOR THE FIRST PASS OF A.
+     ------------------------------------------------------------*/
+  for (i = 0; i < m_loc; ++i) {
+    /* The permuted row index and its block are the same for every
+       nonzero of local row i, so compute them once per row. */
+    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
+    gbi = BlockNum( irow );
+    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+      jcol = Astore->colind[j];
+      gbj = BlockNum( jcol );
+      p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
+      ++nnzToSend[p];
+    }
+  }
+
+  /* All-to-all communication */
+  MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
+                grid->comm);
+
+  maxnnzToRecv = 0;
+  nnz_loc = SendCnt = RecvCnt = 0;
+
+  for (p = 0; p < procs; ++p) {
+    if ( p != iam ) {
+      SendCnt += nnzToSend[p];
+      RecvCnt += nnzToRecv[p];
+      maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
+    } else {
+      nnz_loc += nnzToRecv[p];
+      /*assert(nnzToSend[p] == nnzToRecv[p]);*/
+    }
+  }
+  k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
+  szbuf = k;             /* remembered for the final memAux adjustment */
+
+  /* Allocate space for storing the triplets after redistribution.
+     ia[] and ja[] share one allocation of 2*k entries. */
+  if ( !(ia = intMalloc_dist(2*k)) ) {
+    fprintf (stderr, "Malloc fails for ia[].");
+    return (memAux);
+  }
+  memAux += (float) (2*k*iword);
+  ja = ia + k;
+  if ( !(aij = doublecomplexMalloc_dist(k)) ) {
+    fprintf (stderr, "Malloc fails for aij[].");
+    return (memAux);
+  }
+  memAux += (float) (k*dword);
+
+  /* Allocate temporary storage for sending/receiving the A triplets. */
+  if ( procs > 1 ) {
+    if ( !(send_req = (MPI_Request *)
+           SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) {
+      fprintf (stderr, "Malloc fails for send_req[].");
+      return (memAux);
+    }
+    memAux += (float) (2*procs *sizeof(MPI_Request));
+    if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) {
+      fprintf(stderr, "Malloc fails for ia_send[].");
+      return (memAux);
+    }
+    memAux += (float) (procs*sizeof(int_t*));
+    if ( !(aij_send = (doublecomplex **)SUPERLU_MALLOC(procs*sizeof(doublecomplex*))) ) {
+      fprintf(stderr, "Malloc fails for aij_send[].");
+      return (memAux);
+    }
+    memAux += (float) (procs*sizeof(doublecomplex*));
+    if ( !(index = intMalloc_dist(2*SendCnt)) ) {
+      fprintf(stderr, "Malloc fails for index[].");
+      return (memAux);
+    }
+    memAux += (float) (2*SendCnt*iword);
+    if ( !(nzval = doublecomplexMalloc_dist(SendCnt)) ) {
+      fprintf(stderr, "Malloc fails for nzval[].");
+      return (memAux);
+    }
+    memAux += (float) (SendCnt * dword);
+    if ( !(ptr_to_send = intCalloc_dist(procs)) ) {
+      fprintf(stderr, "Malloc fails for ptr_to_send[].");
+      return (memAux);
+    }
+    memAux += (float) (procs * iword);
+    if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) {
+      fprintf(stderr, "Malloc fails for itemp[].");
+      return (memAux);
+    }
+    memAux += (float) (2*maxnnzToRecv*iword);
+    if ( !(dtemp = doublecomplexMalloc_dist(maxnnzToRecv)) ) {
+      fprintf(stderr, "Malloc fails for dtemp[].");
+      return (memAux);
+    }
+    memAux += (float) (maxnnzToRecv * dword);
+
+    /* Carve index[] and nzval[] into one slice per remote process. */
+    for (i = 0, j = 0, p = 0; p < procs; ++p) {
+      if ( p != iam ) {
+        ia_send[p] = &index[i];
+        /* Each slice holds nnzToSend[p] row indices followed by
+           nnzToSend[p] column indices. */
+        i += 2 * nnzToSend[p];
+        aij_send[p] = &nzval[j];
+        j += nnzToSend[p];
+      }
+    }
+  } /* if procs > 1 */
+
+  nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+  nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
+  if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) {
+    fprintf (stderr, "Malloc fails for *ainf_colptr[].");
+    return (memAux);
+  }
+  memRet += (float) (ilsum_j[nsupers_j] + 1) * iword;
+  if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) {
+    fprintf (stderr, "Malloc fails for *asup_rowptr[].");
+    return (memAux+memRet);
+  }
+  memRet += (float) (ilsum_i[nsupers_i] + 1) * iword;
+
+  /* ------------------------------------------------------------
+     LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
+     THIS ACCOUNTS FOR THE SECOND PASS OF A.
+     ------------------------------------------------------------*/
+  nnz_loc = 0; /* Reset the local nonzero count. */
+  nnz_loc_ainf = nnz_loc_asup = 0;
+  nzval_a = Astore->nzval;
+  for (i = 0; i < m_loc; ++i) {
+    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
+    gbi = BlockNum( irow );
+    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+      jcol = Astore->colind[j];
+      gbj = BlockNum( jcol );
+      p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
+
+      if ( p != iam ) { /* remote */
+        k = ptr_to_send[p];
+        ia_send[p][k] = irow;                  /* first half: rows     */
+        ia_send[p][k + nnzToSend[p]] = jcol;   /* second half: columns */
+        aij_send[p][k] = nzval_a[j];
+        ++ptr_to_send[p];
+      } else {          /* local */
+        ia[nnz_loc] = irow;
+        ja[nnz_loc] = jcol;
+        aij[nnz_loc] = nzval_a[j];
+        ++nnz_loc;
+        /* Count nonzeros in each column of L / row of U */
+        if (gbi >= gbj) {
+          ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
+          nnz_loc_ainf ++;
+        }
+        else {
+          asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
+          nnz_loc_asup ++;
+        }
+      }
+    }
+  }
+
+  /* ------------------------------------------------------------
+     PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
+     NOTE: Can possibly use MPI_Alltoallv.
+     ------------------------------------------------------------*/
+  /* Index messages are tagged with the sender's rank, value messages
+     with the sender's rank + procs. */
+  for (p = 0; p < procs; ++p) {
+    if ( p != iam ) {
+      it = 2*nnzToSend[p];
+      MPI_Isend( ia_send[p], it, mpi_int_t,
+                 p, iam, grid->comm, &send_req[p] );
+      it = nnzToSend[p];
+      MPI_Isend( aij_send[p], it, SuperLU_MPI_DOUBLE_COMPLEX,
+                 p, iam+procs, grid->comm, &send_req[procs+p] );
+    }
+  }
+
+  for (p = 0; p < procs; ++p) {
+    if ( p != iam ) {
+      it = 2*nnzToRecv[p];
+      MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status );
+      it = nnzToRecv[p];
+      MPI_Recv( dtemp, it, SuperLU_MPI_DOUBLE_COMPLEX, p, p+procs,
+                grid->comm, &status );
+      /* Append the received triplets after the ones already stored. */
+      for (i = 0; i < nnzToRecv[p]; ++i) {
+        irow = itemp[i];
+        jcol = itemp[i + nnzToRecv[p]];
+        /* assert(jcol<n); */
+        ia[nnz_loc] = irow;
+        ja[nnz_loc] = jcol;
+        aij[nnz_loc] = dtemp[i];
+        ++nnz_loc;
+
+        gbi = BlockNum( irow );
+        gbj = BlockNum( jcol );
+        /* Count nonzeros in each column of L / row of U */
+        if (gbi >= gbj) {
+          ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
+          nnz_loc_ainf ++;
+        }
+        else {
+          asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
+          nnz_loc_asup ++;
+        }
+      }
+    }
+  }
+
+  /* The send buffers may only be released once every send has completed. */
+  for (p = 0; p < procs; ++p) {
+    if ( p != iam ) {
+      MPI_Wait( &send_req[p], &status);
+      MPI_Wait( &send_req[procs+p], &status);
+    }
+  }
+
+  /* ------------------------------------------------------------
+     DEALLOCATE TEMPORARY STORAGE
+     ------------------------------------------------------------*/
+
+  SUPERLU_FREE(nnzToRecv);
+  memAux -= 2 * procs * iword;
+  if ( procs > 1 ) {
+    SUPERLU_FREE(send_req);
+    SUPERLU_FREE(ia_send);
+    SUPERLU_FREE(aij_send);
+    SUPERLU_FREE(index);
+    SUPERLU_FREE(nzval);
+    SUPERLU_FREE(ptr_to_send);
+    SUPERLU_FREE(itemp);
+    SUPERLU_FREE(dtemp);
+    memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) +
+      procs*sizeof(doublecomplex*) + 2*SendCnt * iword +
+      SendCnt* dword + procs*iword +
+      2*maxnnzToRecv*iword + maxnnzToRecv*dword;
+  }
+
+  /* ------------------------------------------------------------
+     CONVERT THE TRIPLET FORMAT.
+     ------------------------------------------------------------*/
+  if (nnz_loc_ainf != 0) {
+    if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) {
+      fprintf (stderr, "Malloc fails for *ainf_rowind[].");
+      return (memAux+memRet);
+    }
+    memRet += (float) (nnz_loc_ainf * iword);
+    if ( !(ainf_val = doublecomplexMalloc_dist(nnz_loc_ainf)) ) {
+      fprintf (stderr, "Malloc fails for *ainf_val[].");
+      return (memAux+memRet);
+    }
+    memRet += (float) (nnz_loc_ainf * dword);
+  }
+  else {
+    ainf_rowind = NULL;
+    ainf_val = NULL;
+  }
+  if (nnz_loc_asup != 0) {
+    if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) {
+      fprintf (stderr, "Malloc fails for *asup_colind[].");
+      return (memAux + memRet);
+    }
+    memRet += (float) (nnz_loc_asup * iword);
+    if ( !(asup_val = doublecomplexMalloc_dist(nnz_loc_asup)) ) {
+      fprintf (stderr, "Malloc fails for *asup_val[].");
+      return (memAux  + memRet);
+    }
+    memRet += (float) (nnz_loc_asup * dword);
+  }
+  else {
+    asup_colind = NULL;
+    asup_val = NULL;
+  }
+
+  /* Initialize the array of column pointers: turn the per-column counts
+     into starting offsets (exclusive prefix sum). */
+  k = 0;
+  jsize = ainf_colptr[0];  ainf_colptr[0] = 0;
+  for (j = 1; j < ilsum_j[nsupers_j]; j++) {
+    k += jsize;
+    jsize = ainf_colptr[j];
+    ainf_colptr[j] = k;
+  }
+  ainf_colptr[ilsum_j[nsupers_j]] = k + jsize;
+  /* Same conversion for the row pointers of the upper part. */
+  i = 0;
+  isize = asup_rowptr[0];  asup_rowptr[0] = 0;
+  for (j = 1; j < ilsum_i[nsupers_i]; j++) {
+    i += isize;
+    isize = asup_rowptr[j];
+    asup_rowptr[j] = i;
+  }
+  asup_rowptr[ilsum_i[nsupers_i]] = i + isize;
+
+  /* Copy the triplets into the column oriented storage */
+  for (i = 0; i < nnz_loc; ++i) {
+    jcol = ja[i];
+    irow = ia[i];
+    gbi = BlockNum( irow );
+    gbj = BlockNum( jcol );
+    /* Scatter the entry into its column of L / row of U; the pointer is
+       advanced so it always names the next free slot. */
+    if (gbi >= gbj) {
+      j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj );
+      k = ainf_colptr[j];
+      ainf_rowind[k] = irow;
+      ainf_val[k] = aij[i];
+      ainf_colptr[j] ++;
+    }
+    else {
+      j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi );
+      k = asup_rowptr[j];
+      asup_colind[k] = jcol;
+      asup_val[k] = aij[i];
+      asup_rowptr[j] ++;
+    }
+  }
+
+  /* Reset the column pointers to the beginning of each column: after the
+     scatter each pointer holds the end of its column, i.e. the start of
+     the next one, so shift everything up by one position. */
+  for (j = ilsum_j[nsupers_j]; j > 0; j--)
+    ainf_colptr[j] = ainf_colptr[j-1];
+  for (j = ilsum_i[nsupers_i]; j > 0; j--)
+    asup_rowptr[j] = asup_rowptr[j-1];
+  ainf_colptr[0] = 0;
+  asup_rowptr[0] = 0;
+
+  SUPERLU_FREE(ia);   /* also releases ja[], which lives in the same block */
+  SUPERLU_FREE(aij);
+  memAux -= 2*szbuf*iword + szbuf*dword;
+
+  *p_ainf_colptr = ainf_colptr;
+  *p_ainf_rowind = ainf_rowind;
+  *p_ainf_val    = ainf_val;
+  *p_asup_rowptr = asup_rowptr;
+  *p_asup_colind = asup_colind;
+  *p_asup_val    = asup_val;
+
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Exit zdist_A()");
+  fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6);
+#endif
+
+  return (-memRet);
+} /* zdist_A */
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Distribute the input matrix onto the 2D process mesh.
+ * 
+ * Arguments
+ * =========
+ * 
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *          This routine should not be called for this case, an error
+ *          is generated.  Instead, pddistribute routine should be called.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (Input) int_t
+ *        Dimension of the matrix.
+ *
+ * A      (Input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (Input) ScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Pslu_freeable (Input) Pslu_freeable_t*
+ *        The global structure describing the graph of L and U.
+ * 
+ * LUstruct (Input) LUstruct_t*
+ *        Data structures for L and U factors.
+ *
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes allocated on return from the dist_symbLU
+ *   > 0, number of bytes allocated for performing the distribution
+ *       of the data, when out of memory.
+ *        (an approximation).
+ * </pre>
+ */
+
+float
+zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
+		ScalePermstruct_t *ScalePermstruct,
+		Pslu_freeable_t *Pslu_freeable, 
+		LUstruct_t *LUstruct, gridinfo_t *grid)
+{
+  Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+  Glu_freeable_t Glu_freeable_n;
+  LocalLU_t *Llu = LUstruct->Llu;
+  int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, 
+    len, len1, nsupc, nsupc_gb, ii, nprocs;
+  int_t ljb;  /* local block column number */
+  int_t nrbl; /* number of L blocks in current block column */
+  int_t nrbu; /* number of U blocks in current block column */
+  int_t gb;   /* global block number; 0 < gb <= nsuper */
+  int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
+  int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p;
+  int_t mybufmax[NBUFFERS];
+  NRformat_loc *Astore;
+  doublecomplex *a;
+  int_t *asub, *xa;
+  int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
+  doublecomplex *asup_val, *ainf_val;
+  int_t *xsup, *supno;    /* supernode and column mapping */
+  int_t *lsub, *xlsub, *usub, *xusub;
+  int_t nsupers, nsupers_i, nsupers_j, nsupers_ij;
+  int_t next_ind;      /* next available position in index[*] */
+  int_t next_val;      /* next available position in nzval[*] */
+  int_t *index;        /* indices consist of headers and row subscripts */
+  int   *index1;       /* temporary pointer to array of int */
+  doublecomplex *lusup, *uval; /* nonzero values in L and U */
+  int_t *recvBuf;
+  int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
+  doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
+  int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+  doublecomplex **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
+  int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
+  
+  /*-- Counts to be used in factorization. --*/
+  int  *ToRecv, *ToSendD, **ToSendR;
+  
+  /*-- Counts to be used in lower triangular solve. --*/
+  int_t  *fmod;          /* Modification count for L-solve.        */
+  int_t  **fsendx_plist; /* Column process list to send down Xk.   */
+  int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
+  int_t  nfsendx = 0;    /* Number of Xk I will send               */
+  int_t  kseen;
+  
+  /*-- Counts to be used in upper triangular solve. --*/
+  int_t  *bmod;          /* Modification count for U-solve.        */
+  int_t  **bsendx_plist; /* Column process list to send down Xk.   */
+  int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
+  int_t  nbsendx = 0;    /* Number of Xk I will send               */  
+  int_t  *ilsum;         /* starting position of each supernode in 
+			    the full array (local)                 */  
+  int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in 
+				the full array (local, block column wise) */  
+  /*-- Auxiliary arrays; freed on return --*/
+  int_t *Urb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
+  int_t *LUb_length; /* L,U block length; size nsupers_ij */
+  int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */
+  int_t *LUb_number; /* global block number; size nsupers_ij */
+  int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc)      */
+  int_t *Lrb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
+  doublecomplex *dense, *dense_col; /* SPA */
+  doublecomplex zero = {0.0, 0.0};
+  int_t ldaspa;     /* LDA of SPA */
+  int_t iword, dword;
+  float memStrLU, memA,
+        memDist = 0.; /* memory used for redistributing the data, which does
+		         not include the memory for the numerical values
+                         of L and U (positive number)*/
+  float  memNLU = 0.; /* memory allocated for storing the numerical values of 
+		         L and U, that will be used in the numeric
+                         factorization (positive number) */
+
+#if ( PRNTlevel>=1 )
+  int_t nLblocks = 0, nUblocks = 0;
+#endif
+  
+  /* Initialization. */
+  iam = grid->iam;
+#if ( DEBUGlevel>=1 )
+  CHECK_MALLOC(iam, "Enter dist_psymbtonum()");
+#endif
+  myrow = MYROW( iam, grid );
+  mycol = MYCOL( iam, grid );
+  nprocs = grid->npcol * grid->nprow;
+  for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
+  Astore   = (NRformat_loc *) A->Store;
+  
+  iword = sizeof(int_t);
+  dword = sizeof(doublecomplex);
+
+  if (fact == SamePattern_SameRowPerm) {
+    ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm.");  
+  }
+
+  if ((memStrLU = 
+       dist_symbLU (n, Pslu_freeable, 
+		    Glu_persist, &xlsub, &lsub, &xusub, &usub,	grid)) > 0)
+    return (memStrLU);
+  memDist += (-memStrLU);
+  xsup  = Glu_persist->xsup;    /* supernode and column mapping */
+  supno = Glu_persist->supno;   
+  nsupers  = supno[n-1] + 1;
+  nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */
+  nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */
+  nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j);
+  if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) {
+    fprintf (stderr, "Malloc fails for ilsum[].");  
+    return (memDist + memNLU);
+  }
+  memNLU += (nsupers_i+1) * iword;
+  if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) {
+    fprintf (stderr, "Malloc fails for ilsum_j[].");
+    return (memDist + memNLU);
+  }
+  memDist += (nsupers_j+1) * iword;
+
+  /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */
+  ilsum[0] = 0;
+  ldaspa = 0;
+  for (gb = 0; gb < nsupers; gb++) 
+    if ( myrow == PROW( gb, grid ) ) {
+      i = SuperSize( gb );
+      ldaspa += i;
+      lb = LBi( gb, grid );
+      ilsum[lb + 1] = ilsum[lb] + i;
+    }
+  ilsum[nsupers_i] = ldaspa;
+
+  ldaspa_j = 0; ilsum_j[0] = 0;  
+  for (gb = 0; gb < nsupers; gb++) 
+    if (mycol == PCOL( gb, grid )) {
+      i = SuperSize( gb );
+      ldaspa_j += i;
+      lb = LBj( gb, grid );
+      ilsum_j[lb + 1] = ilsum_j[lb] + i;
+    }
+  ilsum_j[nsupers_j] = ldaspa_j;
+  
+  if ((memA = zdist_A(A, ScalePermstruct, Glu_persist,
+		      grid, &ainf_colptr, &ainf_rowind, &ainf_val,
+		      &asup_rowptr, &asup_colind, &asup_val,
+		      ilsum, ilsum_j)) > 0)
+    return (memDist + memA + memNLU);
+  memDist += (-memA);
+
+  /* ------------------------------------------------------------
+     FIRST TIME CREATING THE L AND U DATA STRUCTURES.
+     ------------------------------------------------------------*/
+  
+  /* We first need to set up the L and U data structures and then
+   * propagate the values of A into them.
+   */
+  if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) ) {
+    fprintf(stderr, "Calloc fails for ToRecv[].");
+    return (memDist + memNLU);
+  }
+  for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+  memNLU += nsupers * iword;
+  
+  k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
+  if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) {
+    fprintf(stderr, "Malloc fails for ToSendR[].");
+    return (memDist + memNLU);
+  }
+  memNLU += k*sizeof(int_t*);
+  j = k * grid->npcol;
+  if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) {
+    fprintf(stderr, "Malloc fails for index[].");
+    return (memDist + memNLU);
+  }
+  memNLU += j*iword;
+  
+  for (i = 0; i < j; ++i) index1[i] = EMPTY;
+  for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j];
+  
+  /* Auxiliary arrays used to set up L and U block data structures.
+     They are freed on return. */
+  if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) {
+    fprintf(stderr, "Calloc fails for LUb_length[].");
+    return (memDist + memNLU);
+  }
+  if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) {
+    fprintf(stderr, "Malloc fails for LUb_indptr[].");
+    return (memDist + memNLU);
+  }
+  if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) {
+    fprintf(stderr, "Calloc fails for LUb_number[].");
+    return (memDist + memNLU);
+  }    
+  if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) {
+    fprintf(stderr, "Calloc fails for LUb_valptr[].");
+    return (memDist + memNLU);
+  }
+  memDist += 4 * nsupers_ij * iword;
+  
+  k = CEILING( nsupers, grid->nprow ); 
+  /* Pointers to the beginning of each block row of U. */
+  if ( !(Unzval_br_ptr = 
+	 (doublecomplex**)SUPERLU_MALLOC(nsupers_i * sizeof(doublecomplex*))) ) {
+    fprintf(stderr, "Malloc fails for Unzval_br_ptr[].");
+    return (memDist + memNLU);
+  }
+  if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) {
+    fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[].");
+    return (memDist + memNLU);
+  }
+  memNLU += nsupers_i*sizeof(doublecomplex*) + nsupers_i*sizeof(int_t*);
+  Unzval_br_ptr[nsupers_i-1] = NULL;
+  Ufstnz_br_ptr[nsupers_i-1] = NULL;
+
+  if ( !(ToSendD = SUPERLU_MALLOC(nsupers_i * sizeof(int))) ) {
+    fprintf(stderr, "Malloc fails for ToSendD[].");
+    return (memDist + memNLU);
+  }
+  for (i = 0; i < nsupers_i; ++i) ToSendD[i] = NO;
+
+  memNLU += nsupers_i*iword;  
+  if ( !(Urb_marker = intCalloc_dist(nsupers_j))) {
+    fprintf(stderr, "Calloc fails for rb_marker[].");
+    return (memDist + memNLU);
+  }
+  if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) {
+    fprintf(stderr, "Calloc fails for rb_marker[].");
+    return (memDist + memNLU);
+  }
+  memDist += (nsupers_i + nsupers_j)*iword;
+  
+  /* Auxiliary arrays used to set up L, U block data structures.
+     They are freed on return.
+     k is the number of local row blocks.   */
+  if ( !(dense = doublecomplexCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) 
+				   * sp_ienv_dist(3))) ) {
+    fprintf(stderr, "Calloc fails for SPA dense[].");
+    return (memDist + memNLU);
+  }
+  /* These counts will be used for triangular solves. */
+  if ( !(fmod = intCalloc_dist(nsupers_i)) ) {
+    fprintf(stderr, "Calloc fails for fmod[].");
+    return (memDist + memNLU);
+  }
+  if ( !(bmod = intCalloc_dist(nsupers_i)) ) {
+    fprintf(stderr, "Calloc fails for bmod[].");
+    return (memDist + memNLU);
+  }
+  /* ------------------------------------------------ */
+  memNLU += 2*nsupers_i*iword + 
+    SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; 
+  
+  /* Pointers to the beginning of each block column of L. */
+  if ( !(Lnzval_bc_ptr = 
+	 (doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) ) {
+    fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[].");
+    return (memDist + memNLU);
+  }
+  if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) {
+    fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[].");
+    return (memDist + memNLU);
+  }
+  memNLU += nsupers_j * sizeof(doublecomplex*) + nsupers_j * sizeof(int_t*);
+  Lnzval_bc_ptr[nsupers_j-1] = NULL;
+  Lrowind_bc_ptr[nsupers_j-1] = NULL;
+  
+  /* These lists of processes will be used for triangular solves. */
+  if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
+    fprintf(stderr, "Malloc fails for fsendx_plist[].");
+    return (memDist + memNLU);
+  }
+  len = nsupers_j * grid->nprow;
+  if ( !(index = intMalloc_dist(len)) ) {
+    fprintf(stderr, "Malloc fails for fsendx_plist[0]");
+    return (memDist + memNLU);
+  }
+  for (i = 0; i < len; ++i) index[i] = EMPTY;
+  for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
+    fsendx_plist[i] = &index[j];
+  if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
+    fprintf(stderr, "Malloc fails for bsendx_plist[].");
+    return (memDist + memNLU);
+  }
+  if ( !(index = intMalloc_dist(len)) ) {
+    fprintf(stderr, "Malloc fails for bsendx_plist[0]");
+    return (memDist + memNLU);
+  }
+  for (i = 0; i < len; ++i) index[i] = EMPTY;
+  for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
+    bsendx_plist[i] = &index[j];
+  /* -------------------------------------------------------------- */
+  memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
+  
+  /*------------------------------------------------------------
+    PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
+    THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
+    ------------------------------------------------------------*/
+  for (jb = 0; jb < nsupers; jb++) {
+    jbcol = PCOL( jb, grid );
+    jbrow = PROW( jb, grid );
+    ljb_j = LBj( jb, grid ); /* Local block number column wise */
+    ljb_i = LBi( jb, grid);  /* Local block number row wise */
+    fsupc = FstBlockC( jb );
+    nsupc = SuperSize( jb );
+    
+    if ( myrow == jbrow ) { /* Block row jb in my process row */
+      /* Scatter A into SPA. */
+      for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) {
+	for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) {
+	  if (i >= asup_rowptr[ilsum[nsupers_i]]) 
+	    printf ("ERR7\n");
+	  jcol = asup_colind[i];
+	  if (jcol >= n)
+	    printf ("Pe[%d] ERR distsn jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n",
+		    iam, jb, gb, j, jcol);
+	  gb = BlockNum( jcol );
+	  lb = LBj( gb, grid );
+	  if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n");
+	  jcol = ilsum_j[lb] + jcol - FstBlockC( gb );
+	  if (jcol >= ldaspa_j)
+	    printf ("Pe[%d] ERR1 jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n",
+		    iam, jb, gb, j, jcol);
+	  dense_col[jcol] = asup_val[i];
+	}
+	dense_col += ldaspa_j;
+      }
+      
+      /*------------------------------------------------
+       * SET UP U BLOCKS.
+       *------------------------------------------------*/
+      /* Count number of blocks and length of each block. */
+      nrbu = 0;
+      len = 0; /* Number of column subscripts I own. */
+      len1 = 0; /* number of fstnz subscripts */
+      for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
+	if (i >= xusub[nsupers_i]) printf ("ERR10\n");
+	jcol = usub[i];
+	gb = BlockNum( jcol ); /* Global block number */
+	
+	/*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986)
+	  printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n",
+	  iam, jb, gb, jcol, jbcol, pc); */
+	
+	lb = LBj( gb, grid );  /* Local block number */
+	pc = PCOL( gb, grid ); /* Process col owning this block */
+	if (mycol == jbcol) ToSendR[ljb_j][pc] = YES;
+	/* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */
+	pr = PROW( gb, grid );
+	if ( pr != jbrow  && mycol == pc)
+	  bsendx_plist[lb][jbrow] = YES; 
+	if (mycol == pc) {
+	  len += nsupc;
+	  LUb_length[lb] += nsupc;
+	  ToSendD[ljb_i] = YES;
+	  if (Urb_marker[lb] <= jb) { /* First see this block */
+	    if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++;
+	    Urb_marker[lb] = jb + 1;
+	    LUb_number[nrbu] = gb;
+	    /* if (gb == 391825 && jb == 145361)
+	       printf ("Pe[%d] T1 [%d %d] nrbu %d \n",
+	       iam, jb, gb, nrbu); */
+	    nrbu ++;
+	    len1 += SuperSize( gb );
+	    if ( gb != jb )/* Exclude diagonal block. */
+	      ++bmod[ljb_i];/* Mod. count for back solve */
+#if ( PRNTlevel>=1 )
+	    ++nUblocks;
+#endif
+	  }
+	}
+      } /* for i ... */
+      
+      if ( nrbu ) { 
+	/* Sort the blocks of U in increasing block column index.
+	   SuperLU_DIST assumes this is true */
+	/* simple insert sort algorithm */
+	/* to be transformed in quick sort */
+	for (j = 1; j < nrbu; j++) {
+	  k = LUb_number[j];
+	  for (i=j-1; i>=0 && LUb_number[i] > k; i--) {
+	    LUb_number[i+1] = LUb_number[i];
+	  }
+	  LUb_number[i+1] = k;
+	} 
+	
+	/* Set up the initial pointers for each block in
+	   index[] and nzval[]. */
+	/* Add room for descriptors */
+	len1 += BR_HEADER + nrbu * UB_DESCRIPTOR;
+	if ( !(index = intMalloc_dist(len1+1)) ) {
+	  fprintf (stderr, "Malloc fails for Uindex[]");
+	  return (memDist + memNLU);
+	}
+	Ufstnz_br_ptr[ljb_i] = index;
+	if (!(Unzval_br_ptr[ljb_i] =
+	      doublecomplexMalloc_dist(len))) {
+	  fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]");
+	  return (memDist + memNLU);
+	}
+	memNLU += (len1+1)*iword + len*dword;
+	uval = Unzval_br_ptr[ljb_i];
+	mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
+	mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
+	index[0] = nrbu;  /* Number of column blocks */
+	index[1] = len;   /* Total length of nzval[] */
+	index[2] = len1;  /* Total length of index */
+	index[len1] = -1; /* End marker */
+	next_ind = BR_HEADER;
+	next_val = 0;
+	for (k = 0; k < nrbu; k++) {
+	  gb = LUb_number[k];
+	  lb = LBj( gb, grid );
+	  len = LUb_length[lb];
+	  LUb_length[lb] = 0;  /* Reset vector of block length */
+	  index[next_ind++] = gb; /* Descriptor */
+	  index[next_ind++] = len;
+	  LUb_indptr[lb] = next_ind;
+	  for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++)
+	    index[next_ind] = FstBlockC( jb + 1 );
+	  LUb_valptr[lb] = next_val;
+	  next_val += len;
+	}
+	/* Propagate the fstnz subscripts to Ufstnz_br_ptr[],
+	   and the initial values of A from SPA into Unzval_br_ptr[]. */
+	for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
+	  jcol = usub[i];
+	  gb = BlockNum( jcol );
+	  
+	  if ( mycol == PCOL( gb, grid ) ) {
+	    lb = LBj( gb, grid );
+	    k = LUb_indptr[lb]; /* Start fstnz in index */
+	    index[k + jcol - FstBlockC( gb )] = FstBlockC( jb );
+	  }
+	}  /* for i ... */
+	
+	for (i = 0; i < nrbu; i++) {
+	  gb = LUb_number[i];
+	  lb = LBj( gb, grid );   
+	  next_ind = LUb_indptr[lb];
+	  k = FstBlockC( jb + 1);
+	  jcol = ilsum_j[lb];
+	  for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) {
+	    dense_col = dense;
+	    j = index[next_ind+jj];
+	    for (ii = j; ii < k; ii++) {
+	      uval[LUb_valptr[lb]++] = dense_col[jcol];
+	      dense_col[jcol] = zero;
+	      dense_col += ldaspa_j;	      
+	    }
+	  }
+	}
+      } else {
+	Ufstnz_br_ptr[ljb_i] = NULL;
+	Unzval_br_ptr[ljb_i] = NULL;
+      } /* if nrbu ... */	
+    } /* if myrow == jbrow */
+    
+      /*------------------------------------------------
+       * SET UP L BLOCKS.
+       *------------------------------------------------*/
+    if (mycol == jbcol) {  /* Block column jb in my process column */
+      /* Scatter A_inf into SPA. */
+      for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) {
+	for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) {
+	  irow = ainf_rowind[i];
+	  if (irow >= n) printf ("Pe[%d] ERR1\n", iam);
+	  gb = BlockNum( irow );
+	  if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam);
+	  if ( myrow == PROW( gb, grid ) ) {
+	    lb = LBi( gb, grid );
+	    irow = ilsum[lb] + irow - FstBlockC( gb );
+	    if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam);
+	    dense_col[irow] = ainf_val[i];
+	  }
+	}
+	dense_col += ldaspa;
+      }      
+      
+      /* sort the indices of the diagonal block at the beginning of xlsub */
+      if (myrow == jbrow) {
+	k = xlsub[ljb_j];
+	for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
+	  irow = lsub[i];
+	  if (irow < nsupc + fsupc && i != k+irow-fsupc) {
+	    lsub[i] = lsub[k + irow - fsupc];
+	    lsub[k + irow - fsupc] = irow;
+	    i --;
+	  }
+	}
+      }
+      
+      /* Count number of blocks and length of each block. */
+      nrbl = 0;
+      len = 0; /* Number of row subscripts I own. */
+      kseen = 0;
+      for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
+	irow = lsub[i];
+	gb = BlockNum( irow ); /* Global block number */	  
+	pr = PROW( gb, grid ); /* Process row owning this block */
+	if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY &&
+	     myrow == jbrow) {
+	  fsendx_plist[ljb_j][pr] = YES;
+	  ++nfsendx;
+	}
+	if ( myrow == pr ) {
+	  lb = LBi( gb, grid );  /* Local block number */
+	  if (Lrb_marker[lb] <= jb) { /* First see this block */
+	    Lrb_marker[lb] = jb + 1;
+	    LUb_length[lb] = 1;
+	    LUb_number[nrbl++] = gb;
+	    if ( gb != jb ) /* Exclude diagonal block. */
+	      ++fmod[lb]; /* Mod. count for forward solve */
+	    if ( kseen == 0 && myrow != jbrow ) {
+	      ++nfrecvx;
+	      kseen = 1;
+	    }
+#if ( PRNTlevel>=1 )
+	    ++nLblocks;
+#endif
+	  } else 
+	    ++LUb_length[lb];	    
+	  ++len;
+	}
+      } /* for i ... */
+      
+      if ( nrbl ) { /* Do not ensure the blocks are sorted! */
+	/* Set up the initial pointers for each block in 
+	   index[] and nzval[]. */
+	/* If I am the owner of the diagonal block, order it first in LUb_number.
+	   Necessary for SuperLU_DIST routines */
+	kseen = EMPTY;
+	for (j = 0; j < nrbl; j++) {
+	  if (LUb_number[j] == jb)
+	    kseen = j;
+	}
+	if (kseen != EMPTY && kseen != 0) {
+	  LUb_number[kseen] = LUb_number[0];
+	  LUb_number[0] = jb;
+	}
+	
+	/* Add room for descriptors */
+	len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+	if ( !(index = intMalloc_dist(len1)) ) {
+	  fprintf (stderr, "Malloc fails for index[]");
+	  return (memDist + memNLU);
+	}
+	Lrowind_bc_ptr[ljb_j] = index;
+	if (!(Lnzval_bc_ptr[ljb_j] = 
+	      doublecomplexMalloc_dist(len*nsupc))) {
+	  fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block " IFMT, jb);
+	  return (memDist + memNLU);
+	}
+	memNLU += len1*iword + len*nsupc*dword;
+	
+	lusup = Lnzval_bc_ptr[ljb_j];
+	mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
+	mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
+	mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
+	index[0] = nrbl;  /* Number of row blocks */
+	index[1] = len;   /* LDA of the nzval[] */
+	next_ind = BC_HEADER;
+	next_val = 0;
+	for (k = 0; k < nrbl; ++k) {
+	  gb = LUb_number[k];
+	  lb = LBi( gb, grid );
+	  len = LUb_length[lb];
+	  LUb_length[lb] = 0;
+	  index[next_ind++] = gb; /* Descriptor */
+	  index[next_ind++] = len; 
+	  LUb_indptr[lb] = next_ind;
+	    LUb_valptr[lb] = next_val;
+	    next_ind += len;
+	    next_val += len;
+	  }
+	  /* Propagate the compressed row subscripts to Lindex[],
+	     and the initial values of A from SPA into Lnzval[]. */
+	  len = index[1];  /* LDA of lusup[] */
+	  for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
+	    irow = lsub[i];
+	    gb = BlockNum( irow );
+	    if ( myrow == PROW( gb, grid ) ) {
+	      lb = LBi( gb, grid );
+	      k = LUb_indptr[lb]++; /* Random access a block */
+	      index[k] = irow;
+	      k = LUb_valptr[lb]++;
+	      irow = ilsum[lb] + irow - FstBlockC( gb );
+	      for (j = 0, dense_col = dense; j < nsupc; ++j) {
+		lusup[k] = dense_col[irow];
+		dense_col[irow] = zero;
+		k += len;
+		dense_col += ldaspa;
+	      }
+	    }
+	  } /* for i ... */
+	} else {
+	  Lrowind_bc_ptr[ljb_j] = NULL;
+	  Lnzval_bc_ptr[ljb_j] = NULL;
+	} /* if nrbl ... */		  
+      } /* if mycol == pc */
+  } /* for jb ... */
+
+  SUPERLU_FREE(ilsum_j);
+  SUPERLU_FREE(Urb_marker);
+  SUPERLU_FREE(LUb_length);
+  SUPERLU_FREE(LUb_indptr);
+  SUPERLU_FREE(LUb_number);
+  SUPERLU_FREE(LUb_valptr);
+  SUPERLU_FREE(Lrb_marker);
+  SUPERLU_FREE(dense);
+  
+  /* Free the memory used for storing L and U */
+  SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub);
+  if (lsub != NULL)
+    SUPERLU_FREE(lsub);  
+  if (usub != NULL)
+    SUPERLU_FREE(usub);
+  
+  /* Free the memory used for storing A */
+  SUPERLU_FREE(ainf_colptr);
+  if (ainf_rowind != NULL) {
+    SUPERLU_FREE(ainf_rowind);
+    SUPERLU_FREE(ainf_val);
+  }
+  SUPERLU_FREE(asup_rowptr);
+  if (asup_colind != NULL) {
+    SUPERLU_FREE(asup_colind);	
+    SUPERLU_FREE(asup_val);	
+  }
+  
+  /* exchange information about bsendx_plist in between column of processors */
+  k = SUPERLU_MAX( grid->nprow, grid->npcol);
+  if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) {
+    fprintf (stderr, "Malloc fails for recvBuf[].");
+    return (memDist + memNLU);
+  }
+  if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
+    fprintf (stderr, "Malloc fails for nnzToRecv[].");
+    return (memDist + memNLU);
+  }
+  if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
+    fprintf (stderr, "Malloc fails for ptrToRecv[].");
+    return (memDist + memNLU);
+  }
+  if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
+    fprintf (stderr, "Malloc fails for nnzToRecv[].");
+    return (memDist + memNLU);
+  }
+  if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
+    fprintf (stderr, "Malloc fails for ptrToRecv[].");
+    return (memDist + memNLU);
+  }
+  
+  if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int)))
+    memDist = nsupers*k*iword +4*nprocs * sizeof(int);
+  
+  for (p = 0; p < nprocs; p++)
+    nnzToRecv[p] = 0;
+  
+  for (jb = 0; jb < nsupers; jb++) {
+    jbcol = PCOL( jb, grid );
+    jbrow = PROW( jb, grid );
+    p = PNUM(jbrow, jbcol, grid);
+    nnzToRecv[p] += grid->npcol;
+  }    
+  i = 0;
+  for (p = 0; p < nprocs; p++) {
+    ptrToRecv[p] = i;
+    i += nnzToRecv[p];
+    ptrToSend[p] = 0;
+    if (p != iam)
+      nnzToSend[p] = nnzToRecv[iam];
+    else
+      nnzToSend[p] = 0;
+  }
+  nnzToRecv[iam] = 0;
+  i = ptrToRecv[iam];
+  for (jb = 0; jb < nsupers; jb++) {
+    jbcol = PCOL( jb, grid );
+    jbrow = PROW( jb, grid );
+    p = PNUM(jbrow, jbcol, grid);
+    if (p == iam) {
+      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
+      for (j = 0; j < grid->npcol; j++, i++)
+	recvBuf[i] = ToSendR[ljb_j][j];
+    }
+  }   
+  
+  MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
+		 recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
+  
+  for (jb = 0; jb < nsupers; jb++) {
+    jbcol = PCOL( jb, grid );
+    jbrow = PROW( jb, grid );
+    p = PNUM(jbrow, jbcol, grid);
+    ljb_j = LBj( jb, grid ); /* Local block number column wise */	
+    ljb_i = LBi( jb, grid ); /* Local block number row wise */	
+    /* (myrow == jbrow) {
+       if (ToSendD[ljb_i] == YES)
+       ToRecv[jb] = 1;
+       }
+       else {
+       if (recvBuf[ptrToRecv[p] + mycol] == YES)
+       ToRecv[jb] = 2;
+       } */
+    if (recvBuf[ptrToRecv[p] + mycol] == YES) {
+      if (myrow == jbrow)
+	ToRecv[jb] = 1;
+      else
+	ToRecv[jb] = 2;
+    }
+    if (mycol == jbcol) {
+      for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) 
+	ToSendR[ljb_j][i] = recvBuf[j];  
+      ToSendR[ljb_j][mycol] = EMPTY;
+    }
+    ptrToRecv[p] += grid->npcol;
+  }   
+  
+  /* exchange information about bsendx_plist in between column of processors */
+  MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
+		 MPI_MAX, grid->cscp.comm);
+  
+  for (jb = 0; jb < nsupers; jb ++) {
+    jbcol = PCOL( jb, grid);
+    jbrow = PROW( jb, grid);
+    if (mycol == jbcol) {
+      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
+      if (myrow == jbrow ) {
+	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) {
+	  (*bsendx_plist)[k] = recvBuf[k];
+	  if ((*bsendx_plist)[k] != EMPTY)
+	    nbsendx ++;
+	}
+      }
+      else {
+	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) 
+	  (*bsendx_plist)[k] = EMPTY;
+      }
+    }
+  }
+  
+  SUPERLU_FREE(nnzToRecv);
+  SUPERLU_FREE(ptrToRecv);
+  SUPERLU_FREE(nnzToSend);
+  SUPERLU_FREE(ptrToSend);
+  SUPERLU_FREE(recvBuf);
+  
+  Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+  Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+  Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+  Llu->Unzval_br_ptr = Unzval_br_ptr;
+  Llu->ToRecv = ToRecv;
+  Llu->ToSendD = ToSendD;
+  Llu->ToSendR = ToSendR;
+  Llu->fmod = fmod;
+  Llu->fsendx_plist = fsendx_plist;
+  Llu->nfrecvx = nfrecvx;
+  Llu->nfsendx = nfsendx;
+  Llu->bmod = bmod;
+  Llu->bsendx_plist = bsendx_plist;
+  Llu->nbrecvx = nbrecvx;
+  Llu->nbsendx = nbsendx;
+  Llu->ilsum = ilsum;
+  Llu->ldalsum = ldaspa;
+  LUstruct->Glu_persist = Glu_persist;	
+#if ( PRNTlevel>=1 )
+  if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
+		     nLblocks, nUblocks);
+#endif
+  
+  k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+  if ( !(Llu->mod_bit = intMalloc_dist(k)) )
+      ABORT("Malloc fails for mod_bit[].");
+
+  /* Find the maximum buffer size. */
+  MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
+		MPI_MAX, grid->comm);
+  
+#if ( DEBUGlevel>=1 )
+  /* Memory allocated but not freed:
+     ilsum, fmod, fsendx_plist, bmod, bsendx_plist,
+     ToRecv, ToSendR, ToSendD, mod_bit
+  */
+  CHECK_MALLOC(iam, "Exit dist_psymbtonum()");
+#endif
+    
+  return (- (memDist+memNLU));
+} /* zdist_psymbtonum */
+
diff --git a/SRC/pzutil.c b/SRC/pzutil.c
new file mode 100644
index 0000000..2fc49a5
--- /dev/null
+++ b/SRC/pzutil.c
@@ -0,0 +1,539 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Several matrix utilities
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief Gather A from the distributed compressed row format to global A in compressed column format.
+ */
+int pzCompRow_loc_to_CompCol_global
+(
+ int_t need_value, /* Input. Whether need to gather numerical values */
+ SuperMatrix *A,   /* Input. Distributed matrix in NRformat_loc format. */
+ gridinfo_t *grid, /* Input */
+ SuperMatrix *GA   /* Output */
+)
+{
+    NRformat_loc *Astore;
+    NCformat *GAstore;
+    doublecomplex *a, *a_loc;
+    int_t *colind, *rowptr;
+    int_t *colptr_loc, *rowind_loc;
+    int_t m_loc, n, i, j, k, l;
+    int_t colnnz, fst_row, nnz_loc, nnz;
+    doublecomplex *a_recv;  /* Buffer to receive the blocks of values. */
+    doublecomplex *a_buf;   /* Buffer to merge blocks into block columns. */
+    int_t *itemp;
+    int_t *colptr_send; /* Buffer to redistribute the column pointers of the 
+			   local block rows.
+			   Use n_loc+1 pointers for each block. */
+    int_t *colptr_blk;  /* The column pointers for each block, after
+			   redistribution to the local block columns. 
+			   Use n_loc+1 pointers for each block. */
+    int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */
+    int_t *rowind_buf;  /* Buffer to merge blocks into block columns. */
+    int_t *fst_rows, *n_locs;
+    int   *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32;
+    int   it, n_loc, procs;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pzCompRow_loc_to_CompCol_global");
+#endif
+
+    /* Initialization. */
+    n = A->ncol;
+    Astore = (NRformat_loc *) A->Store;
+    nnz_loc = Astore->nnz_loc;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    a = Astore->nzval;
+    rowptr = Astore->rowptr;
+    colind = Astore->colind;
+    n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */
+
+    /* ------------------------------------------------------------
+       FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN.
+       ------------------------------------------------------------*/
+    zCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc,
+                             &rowind_loc, &colptr_loc);
+    /* Change local row index numbers to global numbers. */
+    for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row;
+
+#if ( DEBUGlevel>=2 )
+    printf("Proc %d\n", grid->iam);
+    PrintInt10("rowind_loc", nnz_loc, rowind_loc);
+    PrintInt10("colptr_loc", n+1, colptr_loc);
+#endif
+
+    procs = grid->nprow * grid->npcol;
+    if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) )
+	  ABORT("Malloc fails for fst_rows[]");
+    n_locs = fst_rows + procs;
+    MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t,
+		  grid->comm);
+    for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i];
+    n_locs[procs-1] = n - fst_rows[procs-1];
+    if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) )
+	  ABORT("Malloc fails for recvcnts[]");
+    sendcnts = recvcnts + procs;
+    rdispls = sendcnts + procs;
+    sdispls = rdispls + procs;
+    itemp_32 = sdispls + procs;
+
+    /* All-to-all transfer column pointers of each block.
+       Now the matrix view is P-by-P block-partition. */
+    /* n column starts for each column, and procs column ends for each block */
+    if ( !(colptr_send = intMalloc_dist(n + procs)) )
+	   ABORT("Malloc fails for colptr_send[]");
+    if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) )
+	   ABORT("Malloc fails for colptr_blk[]");
+    for (i = 0, j = 0; i < procs; ++i) {
+        for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k];
+	colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */
+	sendcnts[i] = n_locs[i] + 1;
+#if ( DEBUGlevel>=1 )
+	assert(j == fst_rows[i]);
+#endif
+	sdispls[i] = j + i;
+	recvcnts[i] = n_loc + 1;
+	rdispls[i] = i * (n_loc + 1);
+	j += n_locs[i]; /* First column of next block in colptr_loc[] */
+    }
+    MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t,
+		  colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm);
+
+    /* Adjust colptr_blk[] so that they contain the local indices of the
+       column pointers in the receive buffer. */
+    nnz = 0; /* The running sum of the nonzeros counted by far */
+    k = 0;
+    for (i = 0; i < procs; ++i) {
+	for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) {
+	    colnnz = colptr_blk[j+1] - colptr_blk[j];
+	    /*assert(k<=j);*/
+	    colptr_blk[k] = nnz;
+	    nnz += colnnz; /* Start of the next column */
+	    ++k;
+	}
+	colptr_blk[k++] = nnz; /* Add an END marker for each block */
+    }
+    /*assert(k == (n_loc+1)*procs);*/
+
+    /* Now prepare to transfer row indices and values. */
+    sdispls[0] = 0;
+    for (i = 0; i < procs-1; ++i) {
+        sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]];
+	sdispls[i+1] = sdispls[i] + sendcnts[i];
+    }
+    sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]];
+    for (i = 0; i < procs; ++i) {
+        j = rdispls[i]; /* Point to this block in colptr_blk[]. */
+	recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j];
+    }
+    rdispls[0] = 0; /* Recompute rdispls[] for row indices. */
+    for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i];
+
+    k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */
+    if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) )
+        ABORT("Malloc fails for rowind_recv[]");
+    rowind_buf = rowind_recv + k;
+    MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t,
+		  rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm);
+    if ( need_value ) {
+        if ( !(a_recv = (doublecomplex *) doublecomplexMalloc_dist(2*k)) )
+	    ABORT("Malloc fails for rowind_recv[]");
+	a_buf = a_recv + k;
+	MPI_Alltoallv(a_loc, sendcnts, sdispls, SuperLU_MPI_DOUBLE_COMPLEX,
+                      a_recv, recvcnts, rdispls, SuperLU_MPI_DOUBLE_COMPLEX,
+                      grid->comm);
+    }
+      
+    /* Reset colptr_loc[] to point to the n_loc global columns. */
+    colptr_loc[0] = 0;
+    itemp = colptr_send;
+    for (j = 0; j < n_loc; ++j) {
+        colnnz = 0;
+	for (i = 0; i < procs; ++i) {
+	    k = i * (n_loc + 1) + j; /* j-th column in i-th block */
+	    colnnz += colptr_blk[k+1] - colptr_blk[k];
+	}
+	colptr_loc[j+1] = colptr_loc[j] + colnnz;
+	itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */
+    }
+    itemp[n_loc] = colptr_loc[n_loc];
+      
+    /* Merge blocks of row indices into columns of row indices. */
+    for (i = 0; i < procs; ++i) {
+        k = i * (n_loc + 1);
+	for (j = 0; j < n_loc; ++j) { /* i-th block */
+	    for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) {
+	        rowind_buf[itemp[j]] = rowind_recv[l];
+		++itemp[j];
+	    }
+	}
+    }
+
+    if ( need_value ) {
+        for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j];
+        for (i = 0; i < procs; ++i) {
+	    k = i * (n_loc + 1);
+	    for (j = 0; j < n_loc; ++j) { /* i-th block */
+	        for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) {
+		    a_buf[itemp[j]] = a_recv[l];
+		    ++itemp[j];
+		}
+	    }
+	}
+    }
+
+    /* ------------------------------------------------------------
+       SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT.
+       ------------------------------------------------------------*/
+    GA->nrow  = A->nrow;
+    GA->ncol  = A->ncol;
+    GA->Stype = SLU_NC;
+    GA->Dtype = A->Dtype;
+    GA->Mtype = A->Mtype;
+    GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) );
+    if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore");
+
+    /* First gather the size of each piece. */
+    nnz_loc = colptr_loc[n_loc];
+    MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm);
+    for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i];
+    GAstore->nnz = nnz;
+    
+    if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) )
+        ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]");
+    if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) )
+        ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]");
+      
+    /* Allgatherv for row indices. */
+    rdispls[0] = 0;
+    for (i = 0; i < procs-1; ++i) {
+        rdispls[i+1] = rdispls[i] + itemp[i];
+        itemp_32[i] = itemp[i];
+    }
+    itemp_32[procs-1] = itemp[procs-1];
+    it = nnz_loc;
+    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, 
+		   itemp_32, rdispls, mpi_int_t, grid->comm);
+    if ( need_value ) {
+      if ( !(GAstore->nzval = (doublecomplex *) doublecomplexMalloc_dist (nnz)) )
+          ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]");
+      MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval, 
+		     itemp_32, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm);
+    } else GAstore->nzval = NULL;
+
+    /* Now gather the column pointers. */
+    rdispls[0] = 0;
+    for (i = 0; i < procs-1; ++i) {
+        rdispls[i+1] = rdispls[i] + n_locs[i];
+        itemp_32[i] = n_locs[i];
+    }
+    itemp_32[procs-1] = n_locs[procs-1];
+    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, 
+		   itemp_32, rdispls, mpi_int_t, grid->comm);
+
+    /* Recompute column pointers. */
+    for (i = 1; i < procs; ++i) {
+        k = rdispls[i];
+	for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1];
+	itemp[i] += itemp[i-1]; /* prefix sum */
+    }
+    GAstore->colptr[n] = nnz;
+
+#if ( DEBUGlevel>=2 )
+    if ( !grid->iam ) {
+        printf("After pdCompRow_loc_to_CompCol_global()\n");
+	zPrint_CompCol_Matrix_dist(GA);
+    }
+#endif
+
+    SUPERLU_FREE(a_loc);
+    SUPERLU_FREE(rowind_loc);
+    SUPERLU_FREE(colptr_loc);
+    SUPERLU_FREE(fst_rows);
+    SUPERLU_FREE(recvcnts);
+    SUPERLU_FREE(colptr_send);
+    SUPERLU_FREE(colptr_blk);
+    SUPERLU_FREE(rowind_recv);
+    if ( need_value) SUPERLU_FREE(a_recv);
+#if ( DEBUGlevel>=1 )
+    if ( !grid->iam ) printf("sizeof(NCformat) %lu\n", sizeof(NCformat));
+    CHECK_MALLOC(grid->iam, "Exit pzCompRow_loc_to_CompCol_global");
+#endif
+    return 0;
+} /* pzCompRow_loc_to_CompCol_global */
+
+
+/*! \brief Permute the distributed dense matrix: B <= perm(X). perm[i] = j means the i-th row of X is in the j-th row of B.
+ *  Rows are exchanged with MPI_Alltoallv over grid->comm; the function always returns 0. */
+int pzPermute_Dense_Matrix
+(
+ int_t fst_row,       /* global number of the first local row; X is indexed by (i - fst_row) */
+ int_t m_loc,         /* number of local rows */
+ int_t row_to_proc[], /* row_to_proc[j] = process that global row j is sent to */
+ int_t perm[],        /* the permutation, indexed by global row number */
+ doublecomplex X[], int ldx, /* input;  entry (r, k) is read from X[r + k*ldx] */
+ doublecomplex B[], int ldb, /* output; entry (r, k) is written to B[r + k*ldb] */
+ int nrhs,            /* number of values exchanged per row (message counts are scaled by it) */
+ gridinfo_t *grid     /* supplies nprow, npcol and the communicator comm */
+)
+{
+    int_t i, j, k, l;
+    int p, procs;
+    int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs;
+    int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
+    int *ptr_to_ibuf, *ptr_to_dbuf;
+    int_t *send_ibuf, *recv_ibuf;
+    doublecomplex *send_dbuf, *recv_dbuf;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter pzPermute_Dense_Matrix()");
+#endif
+
+    procs = grid->nprow * grid->npcol;
+    if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) ) /* one allocation, split into ten procs-sized arrays below */
+        ABORT("Malloc fails for sendcnts[].");
+    sendcnts_nrhs = sendcnts + procs;
+    recvcnts = sendcnts_nrhs + procs;
+    recvcnts_nrhs = recvcnts + procs;
+    sdispls = recvcnts_nrhs + procs;
+    sdispls_nrhs = sdispls + procs;
+    rdispls = sdispls_nrhs + procs;
+    rdispls_nrhs = rdispls + procs;
+    ptr_to_ibuf = rdispls_nrhs + procs;
+    ptr_to_dbuf = ptr_to_ibuf + procs;
+
+    for (i = 0; i < procs; ++i) sendcnts[i] = 0;
+
+    /* Count the number of X entries to be sent to each process.*/
+    for (i = fst_row; i < fst_row + m_loc; ++i) {
+        p = row_to_proc[perm[i]];
+	++sendcnts[p];
+    }
+    MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); /* recvcnts[q] = rows process q will send here */
+    sdispls[0] = rdispls[0] = 0;
+    sdispls_nrhs[0] = rdispls_nrhs[0] = 0;
+    sendcnts_nrhs[0] = sendcnts[0] * nrhs;
+    recvcnts_nrhs[0] = recvcnts[0] * nrhs;
+    for (i = 1; i < procs; ++i) { /* prefix sums; the *_nrhs arrays are the same quantities scaled by nrhs */
+        sdispls[i] = sdispls[i-1] + sendcnts[i-1];
+	sdispls_nrhs[i] = sdispls[i] * nrhs;
+	rdispls[i] = rdispls[i-1] + recvcnts[i-1];
+	rdispls_nrhs[i] = rdispls[i] * nrhs;
+	sendcnts_nrhs[i] = sendcnts[i] * nrhs;
+	recvcnts_nrhs[i] = recvcnts[i] * nrhs;
+    }
+    k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */
+    l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */
+    /*assert(k == m_loc);*/
+    /*assert(l == m_loc);*/
+    if ( !(send_ibuf = intMalloc_dist(k + l)) ) /* send part first, receive part after it */
+        ABORT("Malloc fails for send_ibuf[].");
+    recv_ibuf = send_ibuf + k;
+    if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)*nrhs)) ) /* same layout, nrhs values per row */
+        ABORT("Malloc fails for send_dbuf[].");
+    recv_dbuf = send_dbuf + k * nrhs;
+
+    for (i = 0; i < procs; ++i) { /* per-destination write cursors into the two send buffers */
+        ptr_to_ibuf[i] = sdispls[i];
+	ptr_to_dbuf[i] = sdispls_nrhs[i];
+    }
+
+    /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */
+    for (i = fst_row; i < fst_row + m_loc; ++i) {
+        j = perm[i];
+	p = row_to_proc[j];
+	send_ibuf[ptr_to_ibuf[p]] = j;
+	j = ptr_to_dbuf[p]; /* j is reused as the write position in send_dbuf[] */
+	RHS_ITERATE(k) { /* RHS stored in row major in the buffer */
+	    send_dbuf[j++] = X[i-fst_row + k*ldx];
+	}
+	++ptr_to_ibuf[p];
+	ptr_to_dbuf[p] += nrhs;
+    }
+	  
+    /* Transfer the (permuted) row indices and numerical values. */
+    MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t,
+		  recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm);
+    MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
+		  recv_dbuf, recvcnts_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
+		  grid->comm);
+
+    /* Copy the receive buffer into B. The loop bound assumes exactly m_loc rows arrived (cf. the disabled asserts above). */
+    for (i = 0, l = 0; i < m_loc; ++i) { /* l is reused as the read position in recv_dbuf[] */
+        j = recv_ibuf[i] - fst_row; /* Relative row number */
+	RHS_ITERATE(k) { /* RHS stored in row major in the buffer */
+	    B[j + k*ldb] = recv_dbuf[l++];
+	}
+    }
+
+    SUPERLU_FREE(sendcnts); /* also releases the nine arrays carved out of it */
+    SUPERLU_FREE(send_ibuf); /* recv_ibuf lives in the same allocation */
+    SUPERLU_FREE(send_dbuf); /* recv_dbuf lives in the same allocation */
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Exit pzPermute_Dense_Matrix()");
+#endif
+    return 0;
+} /* pzPermute_Dense_Matrix */
+
+
+/*! \brief Initialize the data structure for the solution phase.
+ */
+int zSolveInit(superlu_dist_options_t *options, SuperMatrix *A, 
+	       int_t perm_r[], int_t perm_c[], int_t nrhs,
+	       LUstruct_t *LUstruct, gridinfo_t *grid,
+	       SOLVEstruct_t *SOLVEstruct)
+{
+    int_t *row_to_proc, *inv_perm_c, *itemp;
+    NRformat_loc *Astore;
+    int_t        i, fst_row, m_loc, p;
+    int          procs;
+
+    Astore = (NRformat_loc *) A->Store;
+    fst_row = Astore->fst_row;
+    m_loc = Astore->m_loc;
+    procs = grid->nprow * grid->npcol;
+    
+    if ( !(row_to_proc = intMalloc_dist(A->nrow)) )
+	ABORT("Malloc fails for row_to_proc[]");
+    SOLVEstruct->row_to_proc = row_to_proc;
+    if ( !(inv_perm_c = intMalloc_dist(A->ncol)) )
+        ABORT("Malloc fails for inv_perm_c[].");
+    for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i;
+    SOLVEstruct->inv_perm_c = inv_perm_c;
+
+    /* ------------------------------------------------------------
+       EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION.
+       SET UP THE MAPPING BETWEEN ROWS AND PROCESSES.
+       
+       NOTE: For those processes that do not own any row, it must
+             must be set so that fst_row == A->nrow. 
+       ------------------------------------------------------------*/
+    if ( !(itemp = intMalloc_dist(procs+1)) )
+        ABORT("Malloc fails for itemp[]");
+    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
+		  grid->comm);
+    itemp[procs] = A->nrow;
+    for (p = 0; p < procs; ++p) {
+        for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p;
+    }
+#if ( DEBUGlevel>=2 )
+    if ( !grid->iam ) {
+      printf("fst_row = %d\n", fst_row);
+      PrintInt10("row_to_proc", A->nrow, row_to_proc);
+      PrintInt10("inv_perm_c", A->ncol, inv_perm_c);
+    }
+#endif
+    SUPERLU_FREE(itemp);
+
+#if 0
+    /* Compute the mapping between rows and processes. */
+    /* XSL NOTE: What happens if # of mapped processes is smaller
+       than total Procs?  For the processes without any row, let
+       fst_row be EMPTY (-1). Make sure this case works! */
+    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
+		  grid->comm);
+    itemp[procs] = n;
+    for (p = 0; p < procs; ++p) {
+        j = itemp[p];
+	if ( j != EMPTY ) {
+	    k = itemp[p+1];
+	    if ( k == EMPTY ) k = n;
+	    for (i = j ; i < k; ++i) row_to_proc[i] = p;
+	}
+    }
+#endif    
+
+    get_diag_procs(A->ncol, LUstruct->Glu_persist, grid,
+		   &SOLVEstruct->num_diag_procs,
+		   &SOLVEstruct->diag_procs,
+		   &SOLVEstruct->diag_len);
+
+    /* Setup communication pattern for redistribution of B and X. */
+    if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *)
+	   SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
+        ABORT("Malloc fails for gstrs_comm[]");
+    pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, 
+		 LUstruct->Glu_persist, SOLVEstruct);
+
+    if ( !(SOLVEstruct->gsmv_comm = (pzgsmv_comm_t *)
+           SUPERLU_MALLOC(sizeof(pzgsmv_comm_t))) )
+        ABORT("Malloc fails for gsmv_comm[]");
+    SOLVEstruct->A_colind_gsmv = NULL;
+    
+    options->SolveInitialized = YES;
+    return 0;
+} /* zSolveInit */
+
+/*! \brief Tear down the solve-phase state held in SOLVEstruct.
+ *  Clears options->SolveInitialized, and options->RefineInitialized when it was set. */
+void zSolveFinalize(superlu_dist_options_t *options, SOLVEstruct_t *SOLVEstruct)
+{
+    int_t *colind_gsmv;
+
+    pxgstrs_finalize(SOLVEstruct->gstrs_comm);
+
+    if ( options->RefineInitialized != 0 ) {
+        pzgsmv_finalize(SOLVEstruct->gsmv_comm);
+        options->RefineInitialized = NO;
+    }
+    SUPERLU_FREE(SOLVEstruct->gsmv_comm);
+    SUPERLU_FREE(SOLVEstruct->row_to_proc);
+    SUPERLU_FREE(SOLVEstruct->inv_perm_c);
+    SUPERLU_FREE(SOLVEstruct->diag_procs);
+    SUPERLU_FREE(SOLVEstruct->diag_len);
+    colind_gsmv = SOLVEstruct->A_colind_gsmv; /* optional buffer: freed only when non-NULL */
+    if ( colind_gsmv != NULL ) SUPERLU_FREE(colind_gsmv);
+    options->SolveInitialized = NO; } /* zSolveFinalize */
+
+/*! \brief Check the inf-norm of the error vector 
+ */
+void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx,
+		      doublecomplex xtrue[], int_t ldxtrue, gridinfo_t *grid) 
+{
+    double err, xnorm, temperr, tempxnorm;
+    doublecomplex *x_work, *xtrue_work;
+    doublecomplex temp;
+    int i, j;
+
+    for (j = 0; j < nrhs; j++) {
+      x_work = &x[j*ldx];
+      xtrue_work = &xtrue[j*ldxtrue];
+      err = xnorm = 0.0;
+      for (i = 0; i < n; i++) {
+        z_sub(&temp, &x_work[i], &xtrue_work[i]);
+	err = SUPERLU_MAX(err, slud_z_abs(&temp));
+	xnorm = SUPERLU_MAX(xnorm, slud_z_abs(&x_work[i]));
+      }
+
+      /* get the golbal max err & xnrom */
+      temperr = err;
+      tempxnorm = xnorm;
+      MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+      MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+
+      err = err / xnorm;
+      if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
+    }
+}
+
diff --git a/SRC/smach_dist.c b/SRC/smach_dist.c
new file mode 100644
index 0000000..394347e
--- /dev/null
+++ b/SRC/smach_dist.c
@@ -0,0 +1,94 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+float smach_dist(char *cmach)
+{
+/*  -- SuperLU auxiliary routine (version 5.0) --
+    This uses C99 standard constants, and is thread safe.
+
+    Must be compiled with "-std=c99" flag.
+
+    Purpose
+    =======
+
+    SMACH returns single precision machine parameters, or 0.0 when
+    CMACH is not one of the letters listed below.
+
+    Arguments
+    =========
+
+    CMACH   (input) CHARACTER*1
+            Specifies the value to be returned by SMACH; only the first character is examined:
+            = 'E' or 'e',   SMACH := eps
+            = 'S' or 's',   SMACH := sfmin
+            = 'B' or 'b',   SMACH := base
+            = 'P' or 'p',   SMACH := eps*base
+            = 'N' or 'n',   SMACH := t
+            = 'R' or 'r',   SMACH := rnd
+            = 'M' or 'm',   SMACH := emin
+            = 'U' or 'u',   SMACH := rmin
+            = 'L' or 'l',   SMACH := emax
+            = 'O' or 'o',   SMACH := rmax
+
+            where
+
+            eps   = relative machine precision
+            sfmin = safe minimum, such that 1/sfmin does not overflow
+            base  = base of the machine
+            prec  = eps*base
+            t     = number of (base) digits in the mantissa
+            rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise
+            emin  = minimum exponent before (gradual) underflow
+            rmin  = underflow threshold - base**(emin-1)
+            emax  = largest exponent before overflow
+            rmax  = overflow threshold  - (base**emax)*(1-eps)
+
+   =====================================================================
+*/
+
+    float sfmin, small, rmach = 0.0f;  /* 0.0 is what an unrecognised CMACH yields */
+    const char c = (*cmach >= 'a' && *cmach <= 'z') ? (char) (*cmach - 'a' + 'A') : *cmach;  /* fold to upper case */
+    if ( c == 'E' ) {
+        rmach = FLT_EPSILON * 0.5;
+    } else if ( c == 'S' ) {
+        sfmin = FLT_MIN;
+        small = 1. / FLT_MAX;
+        if (small >= sfmin) {
+            /* Use SMALL plus a bit, to avoid the possibility of rounding
+               causing overflow when computing  1/sfmin. */
+            sfmin = small * (FLT_EPSILON*0.5 + 1.);
+        }
+        rmach = sfmin;
+    } else if ( c == 'B' ) {
+        rmach = FLT_RADIX;
+    } else if ( c == 'P' ) {
+        rmach = FLT_EPSILON * 0.5 * FLT_RADIX;
+    } else if ( c == 'N' ) {
+        rmach = FLT_MANT_DIG;
+    } else if ( c == 'R' ) {
+        rmach = FLT_ROUNDS;
+    } else if ( c == 'M' ) {
+        rmach = FLT_MIN_EXP;
+    } else if ( c == 'U' ) {
+        rmach = FLT_MIN;
+    } else if ( c == 'L' ) {
+        rmach = FLT_MAX_EXP;
+    } else if ( c == 'O' ) {
+        rmach = FLT_MAX;
+    }
+
+    return rmach;
+
+} /* end smach_dist */
diff --git a/SRC/sp_colorder.c b/SRC/sp_colorder.c
new file mode 100644
index 0000000..27cbf93
--- /dev/null
+++ b/SRC/sp_colorder.c
@@ -0,0 +1,243 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Permutes the columns of the original matrix
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.1.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * December 31, 2016 version 5.1.3
+ * </pre>
+ */
+#include "superlu_ddefs.h"
+
+
+int check_perm_dist(char *, int_t, int_t *);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * sp_colorder() permutes the columns of the original matrix. It performs
+ * the following steps:
+ *
+ *    1. Apply column permutation perm_c[] to A's column pointers to form AC;
+ *
+ *    2. If options->Fact = DOFACT or SamePattern, then
+ *       (1) Compute column elimination tree etree[] of AC'AC;
+ *       (2) Post order etree[] to get a postordered elimination tree etree[],
+ *           and a postorder permutation post[];
+ *       (3) Apply post[] permutation to columns of AC;
+ *       (4) Overwrite perm_c[] with the product perm_c * post.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         Specifies whether or not the elimination tree will be re-used.
+ *         If options->Fact == DOFACT, this means first time factor A, 
+ *         etree is computed and output.
+ *         Otherwise, re-factor A, etree is input, unchanged on exit.
+ *
+ * A       (input) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *         of the linear equations is A->nrow. Currently, the type of A can be:
+ *         Stype = SLU_NC or SLU_NCP; Dtype = SLU__D; Mtype = SLU_GE.
+ *         In the future, more general A can be handled.
+ *
+ * perm_c  (input/output) int*
+ *	   Column permutation vector of size A->ncol, which defines the 
+ *         permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *         in position j in A*Pc.
+ *         If options->Fact == DOFACT, perm_c is both input and output.
+ *         On output, it is changed according to a postorder of etree.
+ *         Otherwise, perm_c is input.
+ *         
+ * etree   (input/output) int*
+ *         Elimination tree of Pc*(A'+A)*Pc', dimension A->ncol.
+ *         If options->Fact == DOFACT, etree is an output argument,
+ *         otherwise it is an input argument.
+ *         Note: etree is a vector of parent pointers for a forest whose
+ *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ * AC      (output) SuperMatrix*
+ *         The resulting matrix after applied the column permutation
+ *         perm_c[] to matrix A. The type of AC can be:
+ *         Stype = SLU_NCP; Dtype = A->Dtype; Mtype = SLU_GE.
+ * </pre>
+ */
+void
+sp_colorder(superlu_dist_options_t *options,  SuperMatrix *A, int_t *perm_c, 
+	    int_t *etree, SuperMatrix *AC)
+{
+    /* Builds AC as a column-permuted view of A; for DOFACT/SamePattern it also computes etree and postorders it. */
+    NCformat  *Astore;
+    NCPformat *ACstore;
+    int_t       *iwork, *post;
+    register  int_t n, i;
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter sp_colorder()");
+#endif
+
+    n = A->ncol;
+    
+    /* Apply column permutation perm_c to A's column pointers so to
+       obtain NCP format in AC = A*Pc.  */
+    AC->Stype       = SLU_NCP;
+    AC->Dtype       = A->Dtype;
+    AC->Mtype       = A->Mtype;
+    AC->nrow        = A->nrow;
+    AC->ncol        = A->ncol;
+    Astore          = A->Store; /* used as NCformat: nnz, nzval, rowind and colptr are read below */
+    ACstore = AC->Store = (void *) SUPERLU_MALLOC( sizeof(NCPformat) );
+    if ( !ACstore ) ABORT("SUPERLU_MALLOC fails for ACstore");
+    ACstore->nnz    = Astore->nnz;
+    ACstore->nzval  = Astore->nzval;  /* pointer shared with A, not copied */
+    ACstore->rowind = Astore->rowind; /* pointer shared with A, not copied */
+    ACstore->colbeg = (int_t*) SUPERLU_MALLOC(n*sizeof(int_t));
+    if ( !(ACstore->colbeg) ) ABORT("SUPERLU_MALLOC fails for ACstore->colbeg");
+    ACstore->colend = (int_t*) SUPERLU_MALLOC(n*sizeof(int_t));
+    if ( !(ACstore->colend) ) ABORT("SUPERLU_MALLOC fails for ACstore->colend");
+
+#if ( DEBUGlevel>=3 )
+    if ( !iam ) {
+	PrintInt10("pre_order:", n, perm_c);
+	check_perm_dist("Initial perm_c", n, perm_c);
+    }
+#endif      
+
+    for (i = 0; i < n; i++) { /* column i of A becomes column perm_c[i] of AC */
+	ACstore->colbeg[perm_c[i]] = Astore->colptr[i]; 
+	ACstore->colend[perm_c[i]] = Astore->colptr[i+1];
+    }
+	
+    if ( options->Fact == DOFACT 
+	 || options->Fact == SamePattern )
+	/* In this case, perm_r[] may be changed, etree(Pr*A + (Pr*A)')
+	   may be changed, so need to recompute etree.   */
+    { 
+	/* Factor A "from scratch" -- we also compute the etree, and
+	 * make perm_c consistent with the postorder of the etree.
+	 */
+
+	iwork = (int_t*) SUPERLU_MALLOC((n+1)*sizeof(int_t)); /* scratch array reused by every permutation step below */
+	if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]");
+
+	if ( A->nrow != A->ncol  /* Rectangular matrix */
+	     || options->ColPerm == MMD_ATA ) {
+	    /* Compute the column etree of A*Pc'. */
+	    sp_coletree_dist(ACstore->colbeg, ACstore->colend, ACstore->rowind,
+			     A->nrow, A->ncol, etree);
+	} else {
+	    /* Compute the etree of Pc*(A'+A)*Pc'. */
+	    int_t *b_colptr, *b_rowind, bnz, j;
+	    int_t *c_colbeg, *c_colend;
+
+	    /* Form B = A + A'. */
+	    at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind,
+			   &bnz, &b_colptr, &b_rowind);
+
+	    /* Form C = Pc*B*Pc'. */
+	    c_colbeg = (int_t*) SUPERLU_MALLOC(n*sizeof(int_t));
+	    c_colend = (int_t*) SUPERLU_MALLOC(n*sizeof(int_t));
+	    if (!(c_colbeg) || !(c_colend) )
+		ABORT("SUPERLU_MALLOC fails for c_colbeg/c_colend");
+	    for (i = 0; i < n; i++) { /* permute the columns of B ... */
+		c_colbeg[perm_c[i]] = b_colptr[i]; 
+		c_colend[perm_c[i]] = b_colptr[i+1];
+	    }
+	    for (j = 0; j < n; ++j) { /* ... then relabel the row indices in place with the same perm_c */
+		for (i = c_colbeg[j]; i < c_colend[j]; ++i) {
+		    b_rowind[i] = perm_c[b_rowind[i]];
+		}
+	    }
+
+	    /* Compute etree of C. */
+	    sp_symetree_dist(c_colbeg, c_colend, b_rowind, n, etree);
+
+	    SUPERLU_FREE(b_colptr);
+	    if ( bnz ) SUPERLU_FREE(b_rowind); /* presumably b_rowind is not allocated when bnz == 0 -- confirm in at_plus_a_dist */
+	    SUPERLU_FREE(c_colbeg);
+	    SUPERLU_FREE(c_colend);
+	}
+#if ( DEBUGlevel>=3 )
+	if ( !iam ) PrintInt10("etree:", n, etree);
+#endif	
+	
+	/* Post order etree */
+	post = (int_t *) TreePostorder_dist(n, etree); /* result is not checked for NULL; freed at the end of this branch */
+	/* for (i = 0; i < n+1; ++i) inv_post[post[i]] = i;
+	   iwork = post; */
+
+	/* Renumber etree in postorder */
+	for (i = 0; i < n; ++i) iwork[post[i]] = post[etree[i]]; /* etree[i] may be n (a root), so post[] presumably has n+1 entries -- confirm */
+	for (i = 0; i < n; ++i) etree[i] = iwork[i];
+
+#if ( DEBUGlevel>=3 )
+	if ( !iam ) PrintInt10("postorder etree:", n, etree);
+#endif
+	
+	/* Postmultiply A*Pc by post[] */
+	for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colbeg[i];
+	for (i = 0; i < n; ++i) ACstore->colbeg[i] = iwork[i];
+	for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colend[i];
+	for (i = 0; i < n; ++i) ACstore->colend[i] = iwork[i];
+
+	for (i = 0; i < n; ++i)
+	    iwork[i] = post[perm_c[i]];  /* product of perm_c and post */
+	for (i = 0; i < n; ++i) perm_c[i] = iwork[i]; /* perm_c is overwritten for the caller */
+
+#if ( DEBUGlevel>=3 )
+	if ( !iam ) {
+	    PrintInt10("Pc*post:", n, perm_c);
+	    check_perm_dist("final perm_c", n, perm_c);
+	}
+#endif
+
+	SUPERLU_FREE (post);
+	SUPERLU_FREE (iwork);
+
+    } /* end if options->Fact == DOFACT || options->Fact == SamePattern */
+
+
+#if ( DEBUGlevel>=1 )
+    /* Memory allocated but not freed:
+       ACstore, ACstore->colbeg, ACstore->colend  */
+    CHECK_MALLOC(iam, "Exit sp_colorder()");
+#endif
+
+} /* SP_COLORDER */
+
+/* Verify that perm[0..n-1] is a permutation of 0..n-1; print the first offending entry and ABORT otherwise. Returns 0. */
+int check_perm_dist(char *what, int_t n, int_t *perm)
+{
+    int_t i;
+    int_t *marker = (int_t *) intCalloc_dist(n); /* marker[v] == 1 once value v has been seen */
+
+    if ( n > 0 && !marker ) ABORT("Calloc fails for marker[] in check_perm_dist");
+    for (i = 0; i < n; ++i) {
+	/* Negative entries are rejected too: they would index marker[] out of bounds. */
+	if ( perm[i] < 0 || perm[i] >= n || marker[perm[i]] == 1 ) {
+	    printf("%s: Not a valid PERM[" IFMT "] = " IFMT "\n", 
+		   what, i, perm[i]);
+	    ABORT("check_perm_dist");
+	} else {
+	    marker[perm[i]] = 1;
+	}
+    }
+    SUPERLU_FREE(marker);
+    return 0;
+}
diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c
new file mode 100644
index 0000000..24386cb
--- /dev/null
+++ b/SRC/sp_ienv.c
@@ -0,0 +1,121 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Chooses machine-dependent parameters for the local environment
+ */
+/*
+ * File name:		sp_ienv.c
+ * History:             Modified from lapack routine ILAENV
+ */
+#include "superlu_ddefs.h"
+#include "machines.h"
+
+/*! \brief
+
+<pre>
+    Purpose   
+    =======   
+
+    sp_ienv_dist() is inquired to choose machine-dependent parameters for the
+    local environment. See ISPEC for a description of the parameters.   
+
+    This version provides a set of parameters which should give good,   
+    but not optimal, performance on many of the currently available   
+    computers.  Users are encouraged to modify this subroutine to set   
+    the tuning parameters for their particular machine using the option   
+    and problem size information in the arguments.   
+
+    Arguments   
+    =========   
+
+    ISPEC   (input) int
+            Specifies the parameter to be returned as the value of SP_IENV_DIST.   
+            = 1: the panel size w; a panel consists of w consecutive
+	         columns of matrix A in the process of Gaussian elimination.
+		 The best value depends on machine's cache characters.
+            = 2: the relaxation parameter relax; if the number of
+	         nodes (columns) in a subtree of the elimination tree is less
+		 than relax, this subtree is considered as one supernode,
+		 regardless of the their row structures.
+            = 3: the maximum size for a supernode, which must be greater
+                 than or equal to relaxation parameter (see case 2);
+	    = 4: the minimum row dimension for 2-D blocking to be used;
+	    = 5: the minimum column dimension for 2-D blocking to be used;
+	    = 6: the estimated fills factor for the adjacency structures 
+	         of L and U, compared with A;
+	    = 7: the minimum value of the product M*N*K for a GEMM call
+	         to be off-loaded to accelerator (e.g., GPU, Xeon Phi).
+	    
+   (SP_IENV_DIST) (output) int
+            >= 0: the value of the parameter specified by ISPEC   
+            < 0:  if SP_IENV_DIST = -k, the k-th argument had an illegal value.
+  
+    ===================================================================== 
+</pre>
+*/
+
+
+#include <stdlib.h>
+#include <stdio.h>
+
+
+/* Return the tuning parameter selected by ISPEC. When set, the environment
+ * variables NREL / NSUP (generic build only) and N_GEMM replace the built-in values. */
+int_t sp_ienv_dist(int_t ispec)
+{
+    /* Value of the overriding environment variable, or NULL when it is unset. */
+    const char *env_val;
+    /* Argument handed to xerr_dist() when ISPEC is not handled. */
+    int bad_arg = 1;
+
+    switch (ispec) {
+#if ( MACH==CRAY_T3E )
+    case 2:
+        return 6;
+    case 3:
+        return 30;
+
+#elif ( MACH==IBM )
+    case 2:
+        return 20;
+    case 3:
+        return 100;
+
+#else
+    /* ISPEC = 2: relaxation parameter */
+    case 2:
+        env_val = getenv("NREL");
+        return (env_val != NULL) ? atoi(env_val) : 20;
+
+    /* ISPEC = 3: maximum supernode size */
+    case 3:
+        env_val = getenv("NSUP");
+        return (env_val != NULL) ? atoi(env_val) : 128;
+
+#endif
+    /* ISPEC = 6: estimated fill factor */
+    case 6:
+        return 5;
+
+    /* ISPEC = 7: minimum M*N*K for off-loading a GEMM call */
+    case 7:
+        env_val = getenv("N_GEMM");
+        return (env_val != NULL) ? atoi(env_val) : 10000;
+
+    default:
+        break;
+    }
+
+    /* Invalid value for ISPEC */
+    xerr_dist("sp_ienv", &bad_arg);
+    return 0;
+} /* sp_ienv_dist */
+
diff --git a/SRC/static_schedule.c b/SRC/static_schedule.c
new file mode 100644
index 0000000..b653047
--- /dev/null
+++ b/SRC/static_schedule.c
@@ -0,0 +1,968 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Performs static scheduling for the look-ahead factorization algorithm.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ * Modified:
+ *
+ * Reference:
+ * 
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+#ifdef ISORT
+extern void isort (int_t N, int_t * ARRAY1, int_t * ARRAY2);
+extern void isort1 (int_t N, int_t * ARRAY);
+
+#else
+
+/* qsort() comparator: orders int_t values ascending; returns <0, 0 or >0. */
+int
+superlu_sort_perm (const void *arg1, const void *arg2)
+{
+    const int_t lhs = *(const int_t *) arg1, rhs = *(const int_t *) arg2;
+    return (lhs > rhs) - (lhs < rhs);  /* three-way result without overflow */
+}
+#endif
+
+int
+static_schedule(superlu_dist_options_t * options, int m, int n,
+                LUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat,
+                int_t *perm_c_supno, int_t *iperm_c_supno, int *info)
+{
+    /* Compute perm_c_supno[], an ordering of the supernodes in which every
+       node appears after all of its children (e-tree case) or predecessors
+       (DAG case), together with its inverse iperm_c_supno[].  Returns 0 on
+       success, or -1 after setting *info to -2 (m < 0) or -3 (n < 0).     */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup, *index, Pc, Pr;
+    int_t  i, ib, jb, lb, nlb, il, iu;
+    int iam, krow, yourcol, mycol, myrow;
+    int j, k, nsupers;  /* k - current panel to work on */
+    int ncb, nrb, p, pr, pc, nblocks, nnodes, *num_child;
+    int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows,
+        *Lblock, *Lrows, *sf_block, *sf_block_l, *nnodes_l,
+        *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno;
+    int *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows;
+    etree_node *head = NULL, *tail = NULL, *ptr;  /* FIFO queue of ready nodes */
+    float edag_supno_l_bytes;
+    int iword = sizeof (int_t);
+
+    /* Test the input parameters. */
+    *info = 0;
+    if (m < 0) *info = -2;
+    else if (n < 0) *info = -3;
+    if (*info) {
+        pxerr_dist ("static_schedule", grid, -*info);
+        return (-1);
+    }
+
+    /* Quick return if possible. */
+    if (m == 0 || n == 0) return 0;
+
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW (iam, grid);
+    mycol = MYCOL (iam, grid);
+    nsupers = Glu_persist->supno[n - 1] + 1;
+    xsup = Glu_persist->xsup;
+    nblocks = 0;
+    ncb = nsupers / Pc;
+    nrb = nsupers / Pr;
+
+#if ( DEBUGlevel >= 1 )
+    print_memorylog(stat, "before static schedule");
+#endif
+
+    /* ================================================== *
+     * static scheduling of j-th step of LU-factorization *
+     * ================================================== */
+    if (options->lookahead_etree == YES &&  /* use e-tree of symmetrized matrix and */
+        (options->ParSymbFact == NO ||  /* 1) symmetric fact with serial symbolic, or */
+         (options->SymPattern == YES && /* 2) symmetric pattern, and                  */
+          options->RowPerm == NOROWPERM))) { /* no rowperm to destroy symmetry */
+
+        /* if symmetric pattern or using e-tree of |A^T|+|A|,
+           then we can use a simple tree structure for static scheduling */
+
+        if (options->ParSymbFact == NO) {
+            /* Use the etree computed from serial symb. fact., and turn it
+               into supernodal tree.  */
+            int_t *etree = LUstruct->etree;
+#if ( PRNTlevel>=1 )
+            if (grid->iam == 0)
+                printf (" === using column e-tree ===\n");
+#endif
+
+            /* look for the first off-diagonal blocks */
+            if (!(etree_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t))))
+                ABORT ("Malloc fails for etree_supno[].");
+            log_memory(nsupers * iword, stat);
+            for (i = 0; i < nsupers; i++) etree_supno[i] = nsupers;
+
+            for (j = 0, lb = 0; lb < nsupers; lb++) {
+                for (k = 0; k < SuperSize (lb); k++) {
+                    jb = Glu_persist->supno[etree[j + k]];
+                    if (jb != lb)
+                        etree_supno[lb] = SUPERLU_MIN (etree_supno[lb], jb);
+                }
+                j += SuperSize (lb);
+            }
+        } else { /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */
+            /* Compute an "etree" based on struct(L),
+               assuming struct(U) = struct(L').   */
+#if ( PRNTlevel>=1 )
+            if (grid->iam == 0)
+                printf (" === using supernodal e-tree ===\n");
+#endif
+
+            /* find the first block in each supernodal-column of local L-factor */
+            if (!(etree_supno_l = SUPERLU_MALLOC (nsupers * sizeof (int_t))))
+                ABORT ("Malloc fails for etree_supno_l[].");
+            log_memory(nsupers * iword, stat);
+            for (i = 0; i < nsupers; i++) etree_supno_l[i] = nsupers;
+            for (lb = 0; lb < ncb; lb++) {
+                jb = lb * grid->npcol + mycol;
+                index = Llu->Lrowind_bc_ptr[lb];
+                if (index) {   /* Not an empty column */
+                    i = index[0];
+                    k = BC_HEADER;
+                    krow = PROW (jb, grid);
+                    if (krow == myrow) {  /* skip the diagonal block */
+                        k += LB_DESCRIPTOR + index[k + 1];
+                        i--;
+                    }
+                    if (i > 0)
+                    {
+                        etree_supno_l[jb] = index[k];
+                        k += LB_DESCRIPTOR + index[k + 1];
+                        i--;
+                    }
+
+                    for (j = 0; j < i; j++)
+                    {
+                        etree_supno_l[jb] =
+                            SUPERLU_MIN (etree_supno_l[jb], index[k]);
+                        k += LB_DESCRIPTOR + index[k + 1];
+                    }
+                }
+            }
+            if (mycol < nsupers % grid->npcol) {
+                jb = ncb * grid->npcol + mycol;
+                index = Llu->Lrowind_bc_ptr[ncb];
+                if (index) {     /* Not an empty column */
+                    i = index[0];
+                    k = BC_HEADER;
+                    krow = PROW (jb, grid);
+                    if (krow == myrow) { /* skip the diagonal block */
+                        k += LB_DESCRIPTOR + index[k + 1];
+                        i--;
+                    }
+                    if (i > 0) {
+                        etree_supno_l[jb] = index[k];
+                        k += LB_DESCRIPTOR + index[k + 1];
+                        i--;
+                    }
+                    for (j = 0; j < i; j++) {
+                        etree_supno_l[jb] =
+                            SUPERLU_MIN (etree_supno_l[jb], index[k]);
+                        k += LB_DESCRIPTOR + index[k + 1];
+                    }
+                }
+            }
+
+            /* form global e-tree */
+            if (!(etree_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t))))
+                ABORT ("Malloc fails for etree_supno[].");
+            MPI_Allreduce (etree_supno_l, etree_supno, nsupers, mpi_int_t,
+                           MPI_MIN, grid->comm);
+
+            SUPERLU_FREE (etree_supno_l);
+        }
+
+        /* initialize number of children for each node */
+        if (!(num_child = SUPERLU_MALLOC (nsupers * sizeof (int))))
+            ABORT ("Malloc fails for num_child[].");
+        for (i = 0; i < nsupers; i++) num_child[i] = 0;
+        for (i = 0; i < nsupers; i++)
+            if (etree_supno[i] != nsupers)  num_child[etree_supno[i]]++;
+        /* push initial leaves to the fifo queue */
+        nnodes = 0;
+        for (i = 0; i < nsupers; i++) {
+            if (num_child[i] == 0) {
+                if (!(ptr = SUPERLU_MALLOC (sizeof (etree_node))))
+                    ABORT ("Malloc fails for etree_node.");
+                ptr->id = i;
+                ptr->next = NULL;
+                nnodes++;
+
+                if (nnodes == 1) {
+                    head = ptr;
+                    tail = ptr;
+                } else {
+                    tail->next = ptr;
+                    tail = ptr;
+                }
+            }
+        }
+
+        /* process fifo queue, and compute the ordering */
+        i = 0;
+
+        while (nnodes > 0) {
+            ptr = head;
+            j = ptr->id;
+            head = ptr->next;
+            perm_c_supno[i] = j;
+            SUPERLU_FREE (ptr);
+            i++;
+            nnodes--;
+            /* queue the parent once all of its children are ordered */
+            if (etree_supno[j] != nsupers) {
+                num_child[etree_supno[j]]--;
+                if (num_child[etree_supno[j]] == 0) {
+                    nnodes++;
+
+                    if (!(ptr = SUPERLU_MALLOC (sizeof (etree_node))))
+                        ABORT ("Malloc fails for etree_node.");
+                    ptr->id = etree_supno[j];
+                    ptr->next = NULL;
+
+                    if (nnodes == 1) {
+                        head = ptr;
+                        tail = ptr;
+                    } else {
+                        tail->next = ptr;
+                        tail = ptr;
+                    }
+                }
+            }
+
+        }
+        SUPERLU_FREE (num_child);
+        SUPERLU_FREE (etree_supno);
+        log_memory(-2 * nsupers * iword, stat);
+
+    } else {         /* Unsymmetric pattern */
+
+        /* Need to process both L- and U-factors, use the symmetrically
+           pruned graph of L & U instead of tree (very naive implementation) */
+        int nrbp1 = nrb + 1;
+        float Ublock_bytes, Urows_bytes, Lblock_bytes, Lrows_bytes;
+
+        /* allocate some workspace */
+        if (! (sendcnts = SUPERLU_MALLOC ((4 + 2 * nrbp1) * Pr * Pc * sizeof (int))))
+            ABORT ("Malloc fails for sendcnts[].");
+        log_memory((4 + 2 * nrbp1) * Pr * Pc * sizeof (int), stat);
+
+        sdispls = &sendcnts[Pr * Pc];
+        recvcnts = &sdispls[Pr * Pc];
+        rdispls = &recvcnts[Pr * Pc];
+        srows = &rdispls[Pr * Pc];
+        rrows = &srows[Pr * Pc * nrbp1];
+
+        myrow = MYROW (iam, grid);
+#if ( PRNTlevel>=1 )
+        if (grid->iam == 0)
+            printf (" === using DAG ===\n");
+#endif
+
+        /* send supno block of local U-factor to a processor *
+         * who owns the corresponding block of L-factor      */
+
+        /* srows   : # of block to send to a processor from each supno row */
+        /* sendcnts: total # of blocks to send to a processor              */
+        for (p = 0; p < Pr * Pc * nrbp1; p++) srows[p] = 0;
+        for (p = 0; p < Pr * Pc; p++) sendcnts[p] = 0;
+
+        /* sending blocks of U-factors corresponding to L-factors */
+        /* count the number of blocks to send */
+        for (lb = 0; lb < nrb; ++lb) {
+            jb = lb * Pr + myrow;
+            pc = jb % Pc;
+            index = Llu->Ufstnz_br_ptr[lb];
+
+            if (index) {         /* Not an empty row */
+                k = BR_HEADER;
+                nblocks += index[0];
+                for (j = 0; j < index[0]; ++j) {
+                    ib = index[k];
+                    pr = ib % Pr;
+                    p = pr * Pc + pc;
+                    sendcnts[p]++;
+                    srows[p * nrbp1 + lb]++;
+
+                    k += UB_DESCRIPTOR + SuperSize (index[k]);
+                }
+            }
+        }
+
+        if (myrow < nsupers % grid->nprow) {
+            jb = nrb * Pr + myrow;
+            pc = jb % Pc;
+            index = Llu->Ufstnz_br_ptr[nrb];
+
+            if (index) {         /* Not an empty row */
+                k = BR_HEADER;
+                nblocks += index[0];
+                for (j = 0; j < index[0]; ++j) {
+                    ib = index[k];
+                    pr = ib % Pr;
+                    p = pr * Pc + pc;
+                    sendcnts[p]++;
+                    srows[p * nrbp1 + nrb]++;
+                    k += UB_DESCRIPTOR + SuperSize (index[k]);
+                }
+            }
+        }
+
+        /* insert blocks to send */
+        sdispls[0] = 0;
+        for (p = 1; p < Pr * Pc; p++) sdispls[p] = sdispls[p - 1] + sendcnts[p - 1];
+        if (!(blocks = intMalloc_dist (nblocks)))
+            ABORT ("Malloc fails for blocks[].");
+        log_memory( nblocks * iword, stat );
+
+        for (lb = 0; lb < nrb; ++lb) {
+            jb = lb * Pr + myrow;
+            pc = jb % Pc;
+            index = Llu->Ufstnz_br_ptr[lb];
+
+            if (index) {       /* Not an empty row */
+                k = BR_HEADER;
+                for (j = 0; j < index[0]; ++j) {
+                    ib = index[k];
+                    pr = ib % Pr;
+                    p = pr * Pc + pc;
+                    blocks[sdispls[p]] = ib;
+                    sdispls[p]++;
+
+                    k += UB_DESCRIPTOR + SuperSize (index[k]);
+                }
+            }
+        }
+
+        if (myrow < nsupers % grid->nprow) {
+            jb = nrb * Pr + myrow;
+            pc = jb % Pc;
+            index = Llu->Ufstnz_br_ptr[nrb];
+
+            if (index) {       /* Not an empty row */
+                k = BR_HEADER;
+                for (j = 0; j < index[0]; ++j) {
+                    ib = index[k];
+                    pr = ib % Pr;
+                    p = pr * Pc + pc;
+                    blocks[sdispls[p]] = ib;
+                    sdispls[p]++;
+
+                    k += UB_DESCRIPTOR + SuperSize (index[k]);
+                }
+            }
+        }
+
+        /* communication */
+        MPI_Alltoall (sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm);
+        MPI_Alltoall (srows, nrbp1, MPI_INT, rrows, nrbp1, MPI_INT, grid->comm);
+
+        log_memory( -(nblocks * iword), stat );  /* blocks[] to be freed soon */
+
+        nblocks = recvcnts[0];
+        rdispls[0] = sdispls[0] = 0;
+        for (p = 1; p < Pr * Pc; p++) {
+            rdispls[p] = rdispls[p - 1] + recvcnts[p - 1];
+            sdispls[p] = sdispls[p - 1] + sendcnts[p - 1];
+            nblocks += recvcnts[p];
+        }
+
+        if (!(blockr = intMalloc_dist (nblocks))) ABORT ("Malloc fails for blockr[].");
+        log_memory( nblocks * iword, stat );
+
+        MPI_Alltoallv (blocks, sendcnts, sdispls, mpi_int_t, blockr, recvcnts,
+                       rdispls, mpi_int_t, grid->comm);
+
+        SUPERLU_FREE (blocks); /* memory logged before */
+
+
+        /* store the received U-blocks by rows */
+        nlb = nsupers / Pc;
+        if (!(Ublock = intMalloc_dist (nblocks))) ABORT ("Malloc fails for Ublock[].");
+        if (!(Urows = intMalloc_dist (1 + nlb))) ABORT ("Malloc fails for Urows[].");
+
+        Ublock_bytes = nblocks * iword;
+        Urows_bytes = (1 + nlb) * iword;
+        log_memory( Ublock_bytes + Urows_bytes, stat );
+
+        k = 0;
+        for (jb = 0; jb < nlb; jb++) {
+            j = jb * Pc + mycol;
+            pr = j % Pr;
+            lb = j / Pr;
+            Urows[jb] = 0;
+
+            for (pc = 0; pc < Pc; pc++) {
+                p = pr * Pc + pc; /* the processor owning this block of U-factor */
+
+                for (i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb];
+                     i++) {
+                    Ublock[k] = blockr[i];
+                    k++;
+                    Urows[jb]++;
+                }
+                rdispls[p] += rrows[p * nrbp1 + lb];
+            }
+            /* sort by the column indices to make things easier for later on */
+
+#ifdef ISORT
+            isort1 (Urows[jb], &(Ublock[k - Urows[jb]]));
+#else
+            qsort (&(Ublock[k - Urows[jb]]), (size_t) (Urows[jb]),
+                   sizeof (int_t), &superlu_sort_perm);
+#endif
+        }
+        if (mycol < nsupers % grid->npcol) {
+            j = nlb * Pc + mycol;
+            pr = j % Pr;
+            lb = j / Pr;
+            Urows[nlb] = 0;
+
+            for (pc = 0; pc < Pc; pc++) {
+                p = pr * Pc + pc;
+                for (i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb];
+                     i++) {
+                    Ublock[k] = blockr[i];
+                    k++;
+                    Urows[nlb]++;
+                }
+                rdispls[p] += rrows[p * nrbp1 + lb];  /* rrows[] has row stride nrbp1 */
+            }
+#ifdef ISORT
+            isort1 (Urows[nlb], &(Ublock[k - Urows[nlb]]));
+#else
+            qsort (&(Ublock[k - Urows[nlb]]), (size_t) (Urows[nlb]),
+                   sizeof (int_t), &superlu_sort_perm);
+#endif
+        }
+        SUPERLU_FREE (blockr);
+        log_memory( -nblocks * iword, stat );
+
+        /* sort the block in L-factor */
+        nblocks = 0;
+        for (lb = 0; lb < ncb; lb++) {
+            jb = lb * Pc + mycol;
+            index = Llu->Lrowind_bc_ptr[lb];
+            if (index) {        /* Not an empty column */
+                nblocks += index[0];
+            }
+        }
+        if (mycol < nsupers % grid->npcol) {
+            jb = ncb * Pc + mycol;
+            index = Llu->Lrowind_bc_ptr[ncb];
+            if (index) {       /* Not an empty column */
+                nblocks += index[0];
+            }
+        }
+
+        if (!(Lblock = intMalloc_dist (nblocks))) ABORT ("Malloc fails for Lblock[].");
+        if (!(Lrows = intMalloc_dist (1 + ncb))) ABORT ("Malloc fails for Lrows[].");
+
+        Lblock_bytes = nblocks * iword;
+        Lrows_bytes = (1 + ncb) * iword;
+        log_memory(Lblock_bytes + Lrows_bytes, stat);
+
+        for (lb = 0; lb <= ncb; lb++) Lrows[lb] = 0;
+        nblocks = 0;
+        for (lb = 0; lb < ncb; lb++) {
+            Lrows[lb] = 0;
+
+            jb = lb * Pc + mycol;
+            index = Llu->Lrowind_bc_ptr[lb];
+            if (index) {      /* Not an empty column */
+                i = index[0];
+                k = BC_HEADER;
+                krow = PROW (jb, grid);
+                if (krow == myrow)  /* skip the diagonal block */
+                {
+                    k += LB_DESCRIPTOR + index[k + 1];
+                    i--;
+                }
+
+                for (j = 0; j < i; j++) {
+                    Lblock[nblocks] = index[k];
+                    Lrows[lb]++;
+                    nblocks++;
+
+                    k += LB_DESCRIPTOR + index[k + 1];
+                }
+            }
+#ifdef ISORT
+            isort1 (Lrows[lb], &(Lblock[nblocks - Lrows[lb]]));
+#else
+            qsort (&(Lblock[nblocks - Lrows[lb]]), (size_t) (Lrows[lb]),
+                   sizeof (int_t), &superlu_sort_perm);
+#endif
+        }
+        if (mycol < nsupers % grid->npcol) {
+            Lrows[ncb] = 0;
+            jb = ncb * Pc + mycol;
+            index = Llu->Lrowind_bc_ptr[ncb];
+            if (index) {       /* Not an empty column */
+                i = index[0];
+                k = BC_HEADER;
+                krow = PROW (jb, grid);
+                if (krow == myrow) { /* skip the diagonal block */
+                    k += LB_DESCRIPTOR + index[k + 1];
+                    i--;
+                }
+                for (j = 0; j < i; j++) {
+                    Lblock[nblocks] = index[k];
+                    Lrows[ncb]++;
+                    nblocks++;
+                    k += LB_DESCRIPTOR + index[k + 1];
+                }
+#ifdef ISORT
+                isort1 (Lrows[ncb], &(Lblock[nblocks - Lrows[ncb]]));
+#else
+                qsort (&(Lblock[nblocks - Lrows[ncb]]), (size_t) (Lrows[ncb]),
+                       sizeof (int_t), &superlu_sort_perm);
+#endif
+            }
+        }
+
+        /* look for the first local symmetric nonzero block match */
+        if (!(sf_block = intMalloc_dist (nsupers))) ABORT ("Malloc fails for sf_block[].");
+        if (!(sf_block_l = intMalloc_dist (nsupers))) ABORT ("Malloc fails for sf_block_l[].");
+
+        log_memory( 2 * nsupers * iword, stat );
+
+        for (lb = 0; lb < nsupers; lb++)
+            sf_block_l[lb] = nsupers;
+        i = 0;
+        j = 0;
+        for (jb = 0; jb < nlb; jb++) {
+            if (Urows[jb] > 0) {
+                ib = i + Urows[jb];
+                lb = jb * Pc + mycol;
+                for (k = 0; k < Lrows[jb]; k++) {
+                    while (Ublock[i] < Lblock[j] && i + 1 < ib)
+                        i++;
+
+                    if (Ublock[i] == Lblock[j]) {
+                        sf_block_l[lb] = Lblock[j];
+                        j += (Lrows[jb] - k);
+                        k = Lrows[jb];
+                    } else {
+                        j++;
+                    }
+                }
+                i = ib;
+            } else {
+                j += Lrows[jb];
+            }
+        }
+        if (mycol < nsupers % grid->npcol) {
+            if (Urows[nlb] > 0) {
+                ib = i + Urows[nlb];
+                lb = nlb * Pc + mycol;
+                for (k = 0; k < Lrows[nlb]; k++) {
+                    while (Ublock[i] < Lblock[j] && i + 1 < ib)
+                        i++;
+
+                    if (Ublock[i] == Lblock[j])
+                    {
+                        sf_block_l[lb] = Lblock[j];
+                        j += (Lrows[nlb] - k);
+                        k = Lrows[nlb];
+                    }
+                    else
+                    {
+                        j++;
+                    }
+                }
+                i = ib;
+            } else {
+                j += Lrows[nlb];
+            }
+        }
+
+        /* compute the first global symmetric matches */
+        MPI_Allreduce (sf_block_l, sf_block, nsupers, mpi_int_t, MPI_MIN,
+                       grid->comm);
+        SUPERLU_FREE (sf_block_l);
+        log_memory( -nsupers * iword, stat );
+
+        /* count number of nodes in DAG (i.e., the number of blocks on and above the first match) */
+        if (!(nnodes_l = intMalloc_dist (nsupers))) ABORT ("Malloc fails for nnodes_l[].");
+        if (!(nnodes_u = intMalloc_dist (nsupers))) ABORT ("Malloc fails for nnodes_u[].");
+        log_memory( 2 * nsupers * iword, stat );
+
+        for (lb = 0; lb < nsupers; lb++)  nnodes_l[lb] = 0;
+        for (lb = 0; lb < nsupers; lb++)  nnodes_u[lb] = 0;
+
+        nblocks = 0;
+        /* from U-factor */
+        for (i = 0, jb = 0; jb < nlb; jb++) {
+            lb = jb * Pc + mycol;
+            ib = i + Urows[jb];
+            while (i < ib) {
+                if (Ublock[i] <= sf_block[lb]) {
+                    nnodes_u[lb]++;
+                    i++;
+                    nblocks++;
+                } else {     /* get out */
+                    i = ib;
+                }
+            }
+            i = ib;
+        }
+        if (mycol < nsupers % grid->npcol) {
+            lb = nlb * Pc + mycol;
+            ib = i + Urows[nlb];
+            while (i < ib) {
+                if (Ublock[i] <= sf_block[lb]) {
+                    nnodes_u[lb]++;
+                    i++;
+                    nblocks++;
+                } else {         /* get out */
+                    i = ib;
+                }
+            }
+            i = ib;
+        }
+
+        /* from L-factor */
+        for (i = 0, jb = 0; jb < nlb; jb++) {
+            lb = jb * Pc + mycol;
+            ib = i + Lrows[jb];
+            while (i < ib) {
+                if (Lblock[i] < sf_block[lb]) {
+                    nnodes_l[lb]++;
+                    i++;
+                    nblocks++;
+                } else {
+                    i = ib;
+                }
+            }
+            i = ib;
+        }
+        if (mycol < nsupers % grid->npcol) {
+            lb = nlb * Pc + mycol;
+            ib = i + Lrows[nlb];
+            while (i < ib) {
+                if (Lblock[i] < sf_block[lb]) {
+                    nnodes_l[lb]++;
+                    i++;
+                    nblocks++;
+                } else {
+                    i = ib;
+                }
+            }
+            i = ib;
+        }
+
+#ifdef USE_ALLGATHER
+        /* insert local nodes in DAG */
+        if (!(edag_supno_l = intMalloc_dist (nsupers + nblocks)))
+            ABORT ("Malloc fails for edag_supno_l[].");
+        edag_supno_l_bytes = (nsupers + nblocks) * iword;
+        log_memory(edag_supno_l_bytes, stat);
+
+        iu = il = nblocks = 0;
+        for (lb = 0; lb < nsupers; lb++) {
+            j = lb / Pc;
+            pc = lb % Pc;
+
+            edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb];
+            nblocks++;
+            if (mycol == pc) {
+                /* from U-factor */
+                ib = iu + Urows[j];
+                for (jb = 0; jb < nnodes_u[lb]; jb++) {
+                    edag_supno_l[nblocks] = Ublock[iu];
+                    iu++;
+                    nblocks++;
+                }
+                iu = ib;
+
+                /* from L-factor */
+                ib = il + Lrows[j];
+                for (jb = 0; jb < nnodes_l[lb]; jb++) {
+                    edag_supno_l[nblocks] = Lblock[il];
+                    il++;
+                    nblocks++;
+                }
+                il = ib;
+            }
+        }
+        SUPERLU_FREE (nnodes_u);
+        log_memory(-nsupers * iword, stat);
+
+        /* form global DAG on each processor */
+        MPI_Allgather (&nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT,
+                       grid->comm);
+        nblocks = recvcnts[0];
+        rdispls[0] = 0;
+        for (lb = 1; lb < Pc * Pr; lb++) {
+            rdispls[lb] = nblocks;
+            nblocks += recvcnts[lb];
+        }
+        if (!(recvbuf = intMalloc_dist (nblocks))) ABORT ("Malloc fails for recvbuf[].");
+        log_memory(nblocks * iword, stat);
+
+        MPI_Allgatherv (edag_supno_l, recvcnts[iam], mpi_int_t,
+                        recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm);
+        SUPERLU_FREE (edag_supno_l);
+        log_memory(-edag_supno_l_bytes, stat);
+
+        if (!(edag_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t *))))
+            ABORT ("Malloc fails for edag_supno[].");
+        log_memory(nsupers * iword, stat);
+
+        k = 0;
+        for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0;
+        for (p = 0; p < Pc * Pr; p++) {
+            for (lb = 0; lb < nsupers; lb++) {
+                nnodes_l[lb] += recvbuf[k];
+                k += (1 + recvbuf[k]);
+            }
+        }
+        for (lb = 0; lb < nsupers; lb++) {
+            if (nnodes_l[lb] > 0)
+                if (!(edag_supno[lb] = intMalloc_dist (nnodes_l[lb])))
+                    ABORT ("Malloc fails for edag_supno[lb].");
+            nnodes_l[lb] = 0;
+        }
+        k = 0;
+        for (p = 0; p < Pc * Pr; p++) {
+            for (lb = 0; lb < nsupers; lb++) {
+                jb = k + recvbuf[k] + 1;
+                k++;
+                for (; k < jb; k++) {
+                    edag_supno[lb][nnodes_l[lb]] = recvbuf[k];
+                    nnodes_l[lb]++;
+                }
+            }
+        }
+        SUPERLU_FREE (recvbuf);
+        log_memory(-nblocks * iword, stat);
+
+#else   /* not USE_ALLGATHER */
+        int nlsupers = nsupers / Pc;
+        if (mycol < nsupers % Pc) nlsupers++;
+
+        /* insert local nodes in DAG */
+        if (!(edag_supno_l = intMalloc_dist (nlsupers + nblocks)))
+            ABORT ("Malloc fails for edag_supno_l[].");
+        edag_supno_l_bytes = (nlsupers + nblocks) * iword;
+        log_memory(edag_supno_l_bytes, stat);
+
+        iu = il = nblocks = 0;
+        for (lb = 0; lb < nsupers; lb++) {
+            j = lb / Pc;
+            pc = lb % Pc;
+            if (mycol == pc) {
+                edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb];
+                nblocks++;
+                /* from U-factor */
+                ib = iu + Urows[j];
+                for (jb = 0; jb < nnodes_u[lb]; jb++) {
+                    edag_supno_l[nblocks] = Ublock[iu];
+                    iu++;
+                    nblocks++;
+                }
+                iu = ib;
+
+                /* from L-factor */
+                ib = il + Lrows[j];
+                for (jb = 0; jb < nnodes_l[lb]; jb++) {
+                    edag_supno_l[nblocks] = Lblock[il];
+                    il++;
+                    nblocks++;
+                }
+                il = ib;
+            } else if (nnodes_l[lb] + nnodes_u[lb] != 0)
+                printf (" # %d: nnodes[" IFMT "]=" IFMT "+" IFMT "\n",
+                        grid->iam, lb, nnodes_l[lb], nnodes_u[lb]);
+        }
+        SUPERLU_FREE (nnodes_u);
+        log_memory(-nsupers * iword, stat);
+
+        /* form global DAG on each processor */
+        MPI_Allgather (&nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm);
+        nblocks = recvcnts[0];
+        rdispls[0] = 0;
+        for (lb = 1; lb < Pc * Pr; lb++) {
+            rdispls[lb] = nblocks;
+            nblocks += recvcnts[lb];
+        }
+        if (!(recvbuf = intMalloc_dist (nblocks))) ABORT ("Malloc fails for recvbuf[].");
+        log_memory(nblocks * iword, stat);
+
+        MPI_Allgatherv (edag_supno_l, recvcnts[iam], mpi_int_t,
+                        recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm);
+
+        SUPERLU_FREE (edag_supno_l);
+        log_memory(-edag_supno_l_bytes, stat);
+
+        if (!(edag_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t *))))
+            ABORT ("Malloc fails for edag_supno[].");
+        log_memory(nsupers * sizeof(int_t *), stat);
+
+        k = 0;
+        for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0;
+        for (p = 0; p < Pc * Pr; p++) {
+            yourcol = MYCOL (p, grid);
+
+            for (lb = 0; lb < nsupers; lb++) {
+                j = lb / Pc;
+                pc = lb % Pc;
+                if (yourcol == pc) {
+                    nnodes_l[lb] += recvbuf[k];
+                    k += (1 + recvbuf[k]);
+                }
+            }
+        }
+        for (lb = 0; lb < nsupers; lb++) {
+            if (nnodes_l[lb] > 0)
+                if (!(edag_supno[lb] = intMalloc_dist (nnodes_l[lb])))
+                    ABORT ("Malloc fails for edag_supno[lb].");
+            nnodes_l[lb] = 0;
+        }
+        k = 0;
+        for (p = 0; p < Pc * Pr; p++) {
+            yourcol = MYCOL (p, grid);
+
+            for (lb = 0; lb < nsupers; lb++) {
+                j = lb / Pc;
+                pc = lb % Pc;
+                if (yourcol == pc)
+                {
+                    jb = k + recvbuf[k] + 1;
+                    k++;
+                    for (; k < jb; k++)
+                    {
+                        edag_supno[lb][nnodes_l[lb]] = recvbuf[k];
+                        nnodes_l[lb]++;
+                    }
+                }
+            }
+        }
+        SUPERLU_FREE (recvbuf);
+        log_memory( -nblocks * iword , stat);
+
+#endif  /* end USE_ALLGATHER */
+
+        /* initialize the num of child for each node */
+        if (!(num_child = SUPERLU_MALLOC (nsupers * sizeof (int))))
+            ABORT ("Malloc fails for num_child[].");
+        for (i = 0; i < nsupers; i++) num_child[i] = 0;
+        for (i = 0; i < nsupers; i++) {
+            for (jb = 0; jb < nnodes_l[i]; jb++)
+                num_child[edag_supno[i][jb]]++;
+        }
+
+        /* push initial leaves to the fifo queue */
+        nnodes = 0;
+        for (i = 0; i < nsupers; i++) {
+            if (num_child[i] == 0) {
+                if (!(ptr = SUPERLU_MALLOC (sizeof (etree_node))))
+                    ABORT ("Malloc fails for etree_node.");
+                ptr->id = i;
+                ptr->next = NULL;
+                nnodes++;
+
+                if (nnodes == 1) {
+                    head = ptr;
+                    tail = ptr;
+                } else {
+                    tail->next = ptr;
+                    tail = ptr;
+                }
+            }
+        }
+
+        /* process fifo queue, and compute the ordering */
+        i = 0;
+
+        while (nnodes > 0) {
+            /* pop the node at the head of the queue and record it next */
+            ptr = head;
+            j = ptr->id;
+            head = ptr->next;
+
+            perm_c_supno[i] = j;
+            SUPERLU_FREE (ptr);
+            i++;
+            nnodes--;
+
+            for (jb = 0; jb < nnodes_l[j]; jb++) {
+                num_child[edag_supno[j][jb]]--;
+                if (num_child[edag_supno[j][jb]] == 0) {
+                    nnodes++;
+
+                    if (!(ptr = SUPERLU_MALLOC (sizeof (etree_node))))
+                        ABORT ("Malloc fails for etree_node.");
+                    ptr->id = edag_supno[j][jb];
+                    ptr->next = NULL;
+
+                    if (nnodes == 1) {
+                        head = ptr;
+                        tail = ptr;
+                    } else {
+                        tail->next = ptr;
+                        tail = ptr;
+                    }
+                }
+            }
+
+        }
+        for (lb = 0; lb < nsupers; lb++)
+            if (nnodes_l[lb] > 0)  SUPERLU_FREE (edag_supno[lb]);
+
+        SUPERLU_FREE (num_child);
+        SUPERLU_FREE (edag_supno);
+        SUPERLU_FREE (nnodes_l);
+        SUPERLU_FREE (sf_block);
+        SUPERLU_FREE (sendcnts);
+
+        log_memory(-(4 * nsupers + (4 + 2 * nrbp1)*Pr*Pc) * iword, stat);
+
+        SUPERLU_FREE (Ublock);
+        SUPERLU_FREE (Urows);
+        SUPERLU_FREE (Lblock);
+        SUPERLU_FREE (Lrows);
+        log_memory(-(Ublock_bytes + Urows_bytes + Lblock_bytes + Lrows_bytes), stat);
+    }
+    /* ======================== *
+     * end of static scheduling *
+     * ======================== */
+
+    for (lb = 0; lb < nsupers; lb++) iperm_c_supno[perm_c_supno[lb]] = lb;
+
+#if ( DEBUGlevel >= 1 )
+    print_memorylog(stat, "after static schedule");
+#endif
+
+    return 0;
+} /* STATIC_SCHEDULE */
+
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
new file mode 100644
index 0000000..007fbe3
--- /dev/null
+++ b/SRC/superlu_ddefs.h
@@ -0,0 +1,382 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief  Distributed SuperLU data types and function prototypes
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * April 5, 2015
+ * </pre>
+ */
+
+#ifndef __SUPERLU_dDEFS /* allow multiple inclusions */
+#define __SUPERLU_dDEFS
+
+/*
+ * File name:	superlu_ddefs.h
+ * Purpose:     Distributed SuperLU data types and function prototypes
+ * History:
+ */
+
+#include "superlu_defs.h"
+
+/*-- Auxiliary data type used in PxGSTRS/PxGSTRS1: a (row block, Uindex[] position) pair. */
+typedef struct {
+    int_t lbnum;  /* Row block number (local).      */
+    int_t indpos; /* Starting position in Uindex[]. */
+} Ucb_indptr_t;
+
+/* 
+ * On each processor, the blocks in L are stored in compressed block
+ * column format, the blocks in U are stored in compressed block row format.
+ */
+#define MAX_LOOKAHEADS 50  /* length of each *_buf_2[] buffer array below */
+typedef struct {
+    int_t   **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc)                 */
+    double  **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc)                 */
+    int_t   **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
+    double  **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
+#if 0
+    int_t   *Lsub_buf;        /* Buffer for the remote subscripts of L */
+    double  *Lval_buf;        /* Buffer for the remote nonzeros of L   */
+    int_t   *Usub_buf;        /* Buffer for the remote subscripts of U */
+    double  *Uval_buf;        /* Buffer for the remote nonzeros of U   */
+#endif
+    int_t   *Lsub_buf_2[MAX_LOOKAHEADS];   /* Buffers for the remote subscripts of L */
+    double  *Lval_buf_2[MAX_LOOKAHEADS];   /* Buffers for the remote nonzeros of L   */
+    int_t   *Usub_buf_2[MAX_LOOKAHEADS];   /* Buffers for the remote subscripts of U */
+    double  *Uval_buf_2[MAX_LOOKAHEADS];   /* Buffers for the remote nonzeros of U   */
+    double  *ujrow;           /* used in panel factorization.          */
+    int_t   bufmax[NBUFFERS]; /* Maximum buffer size across all MPI ranks:
+                               *  0 : maximum size of Lsub_buf[]
+                               *  1 : maximum size of Lval_buf[]
+                               *  2 : maximum size of Usub_buf[]
+                               *  3 : maximum size of Uval_buf[]
+                               *  4 : maximum size of tempv[LDA]
+                               */
+
+    /*-- Record communication schedule for factorization. --*/
+    int   *ToRecv;          /* Recv from no one (0), left (1), and up (2).*/
+    int   *ToSendD;         /* Whether need to send down block row.       */
+    int   **ToSendR;        /* List of processes to send right block col. */
+
+    /*-- Record communication schedule for forward/back solves. --*/
+    int_t   *fmod;            /* Modification count for L-solve            */
+    int_t   **fsendx_plist;   /* Column process list to send down Xk       */
+    int_t   *frecv;           /* Modifications to be recv'd in proc row    */
+    int_t   nfrecvx;          /* Number of Xk I will receive in L-solve    */
+    int_t   nfsendx;          /* Number of Xk I will send in L-solve       */
+    int_t   *bmod;            /* Modification count for U-solve            */
+    int_t   **bsendx_plist;   /* Column process list to send down Xk       */
+    int_t   *brecv;           /* Modifications to be recv'd in proc row    */
+    int_t   nbrecvx;          /* Number of Xk I will receive in U-solve    */
+    int_t   nbsendx;          /* Number of Xk I will send in U-solve       */
+    int_t   *mod_bit;         /* Flag contribution from each row blocks    */
+
+    /*-- Auxiliary arrays used for forward/back solves. --*/
+    int_t   *ilsum;           /* Starting position of each supernode in lsum
+                                 (local)  */
+    int_t   ldalsum;          /* LDA of lsum (local) */
+    int_t   SolveMsgSent;     /* Number of actual messages sent in LU-solve */
+    int_t   SolveMsgVol;      /* Volume of messages sent in the solve phase */
+
+
+    /*********************/	
+    /* The following variables are used in the hybrid solver */
+
+    /*-- Counts to be used in U^{-T} triangular solve. -- */
+    int_t UT_SOLVE;
+    int_t L_SOLVE;
+    int_t FRECV;
+    int_t ut_ldalsum;        /* LDA of lsum (local) */
+    int_t *ut_ilsum;         /* ilsum in column-wise                        */
+    int_t *utmod;            /* Modification count for Ut-solve.            */
+    int_t **ut_sendx_plist;  /* Row process list to send down Xk            */
+    int_t *utrecv;           /* Modifications to be recv'd in proc column.  */
+    int_t n_utsendx;         /* Number of Xk I will receive (NOTE(review): name says "send" -- confirm) */
+    int_t n_utrecvx;         /* Number of Xk I will send (NOTE(review): name says "recv" -- confirm)    */
+    int_t n_utrecvmod;
+    int_t nroot;
+    int_t *ut_modbit;
+    int_t *Urbs;
+    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+
+    /* some additional counters for L solve */
+    int_t n;
+    int_t nleaf;
+    int_t nfrecvmod;
+} LocalLU_t;
+
+
+typedef struct {
+    int_t *etree;               /* column elimination tree */
+    Glu_persist_t *Glu_persist; /* supernode partition (xsup, supno) */
+    LocalLU_t *Llu;             /* local L and U blocks plus communication schedules */
+} LUstruct_t;
+
+
+/*-- Data structure for communication during matrix-vector multiplication. */
+typedef struct {
+    int_t *extern_start;
+    int_t *ind_tosend;    /* X indices to be sent to other processes */
+    int_t *ind_torecv;    /* X indices to be received from other processes */
+    int_t *ptr_ind_tosend;/* Pointers to ind_tosend[] (Size procs)
+                             (also point to val_torecv) */
+    int_t *ptr_ind_torecv;/* Pointers to ind_torecv[] (Size procs)
+                             (also point to val_tosend) */
+    int   *SendCounts;    /* Numbers of X indices to be sent
+                             (also numbers of X values to be received) */
+    int   *RecvCounts;    /* Numbers of X indices to be received
+                             (also numbers of X values to be sent) */
+    double *val_tosend;   /* X values to be sent to other processes */
+    double *val_torecv;   /* X values to be received from other processes */
+    int_t TotalIndSend;   /* Total number of indices to be sent
+                             (also total number of values to be received) */
+    int_t TotalValSend;   /* Total number of values to be sent
+                             (also total number of indices to be received) */
+} pdgsmv_comm_t;
+
+/*-- Data structure holding the information for the solution phase --*/
+typedef struct {
+    int_t *row_to_proc;
+    int_t *inv_perm_c;  /* NOTE(review): presumably the inverse of perm_c -- confirm */
+    int_t num_diag_procs, *diag_procs, *diag_len;
+    pdgsmv_comm_t *gsmv_comm; /* communication metadata for SpMV,
+                                 required by IterRefine.          */
+    pxgstrs_comm_t *gstrs_comm;  /* communication metadata for SpTRSV. */
+    int_t *A_colind_gsmv; /* After pdgsmv_init(), the global column
+                             indices of A are translated into the relative
+                             positions in the gathered x-vector.
+                             This is re-used in repeated calls to pdgsmv() */
+    /*int_t *xrow_to_proc; Xiaoye: can be removed */
+} SOLVEstruct_t;
+
+
+/***********************************************************************
+ * Function prototypes
+ ***********************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Supernodal LU factor related */
+extern void
+dCreate_CompCol_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *,
+			    int_t *, int_t *, Stype_t, Dtype_t, Mtype_t);
+extern void
+dCreate_CompRowLoc_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, int_t,
+			       int_t, double *, int_t *, int_t *,
+			       Stype_t, Dtype_t, Mtype_t);
+extern void
+dCompRow_to_CompCol_dist(int_t, int_t, int_t, double *, int_t *, int_t *,
+                         double **, int_t **, int_t **);
+extern int
+pdCompRow_loc_to_CompCol_global(int_t, SuperMatrix *, gridinfo_t *,
+	 		        SuperMatrix *);
+extern void
+dCopy_CompCol_Matrix_dist(SuperMatrix *, SuperMatrix *);
+extern void
+dCreate_Dense_Matrix_dist(SuperMatrix *, int_t, int_t, double *, int_t,
+			  Stype_t, Dtype_t, Mtype_t);
+extern void
+dCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *, 
+			      int_t *, int_t *, int_t *, int_t *, int_t *,
+			      Stype_t, Dtype_t, Mtype_t);
+extern void
+dCopy_Dense_Matrix_dist(int_t, int_t, double *, int_t,
+                        double *, int_t);
+
+extern void    dallocateA_dist (int_t, int_t, double **, int_t **, int_t **);
+extern void    dGenXtrue_dist (int_t, int_t, double *, int_t);
+extern void    dFillRHS_dist (char *, int_t, double *, int_t,
+                              SuperMatrix *, double *, int_t);
+extern int     dcreate_matrix(SuperMatrix *, int, double **, int *, 
+			      double **, int *, FILE *, gridinfo_t *);
+extern int     dcreate_matrix_rb(SuperMatrix *, int, double **, int *, 
+			      double **, int *, FILE *, gridinfo_t *);
+extern int     dcreate_matrix_dat(SuperMatrix *, int, double **, int *, 
+			      double **, int *, FILE *, gridinfo_t *);
+
+/* Driver related */
+extern void    dgsequ_dist (SuperMatrix *, double *, double *, double *,
+			    double *, double *, int_t *);
+extern double  dlangs_dist (char *, SuperMatrix *);
+extern void    dlaqgs_dist (SuperMatrix *, double *, double *, double,
+			    double, double, char *);
+extern void    pdgsequ (SuperMatrix *, double *, double *, double *,
+			double *, double *, int_t *, gridinfo_t *);
+extern double  pdlangs (char *, SuperMatrix *, gridinfo_t *);
+extern void    pdlaqgs (SuperMatrix *, double *, double *, double,
+			double, double, char *);
+extern int     pdPermute_Dense_Matrix(int_t, int_t, int_t [], int_t[],
+				      double [], int, double [], int, int,
+				      gridinfo_t *);
+
+extern int     sp_dtrsv_dist (char *, char *, char *, SuperMatrix *,
+			      SuperMatrix *, double *, int *);
+extern int     sp_dgemv_dist (char *, double, SuperMatrix *, double *,
+			      int, double, double *, int);
+extern int     sp_dgemm_dist (char *, int, double, SuperMatrix *,
+                        double *, int, double, double *, int);
+
+extern float ddistribute(fact_t, int_t, SuperMatrix *, Glu_freeable_t *, 
+			 LUstruct_t *, gridinfo_t *);
+extern void  pdgssvx_ABglobal(superlu_dist_options_t *, SuperMatrix *, 
+			      ScalePermstruct_t *, double *,
+			      int, int, gridinfo_t *, LUstruct_t *, double *,
+			      SuperLUStat_t *, int *);
+extern float pddistribute(fact_t, int_t, SuperMatrix *, 
+			 ScalePermstruct_t *, Glu_freeable_t *, 
+			 LUstruct_t *, gridinfo_t *);
+extern void  pdgssvx(superlu_dist_options_t *, SuperMatrix *, 
+		     ScalePermstruct_t *, double *,
+		     int, int, gridinfo_t *, LUstruct_t *,
+		     SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
+extern int  dSolveInit(superlu_dist_options_t *, SuperMatrix *, int_t [], int_t [],
+		       int_t, LUstruct_t *, gridinfo_t *, SOLVEstruct_t *);
+extern void dSolveFinalize(superlu_dist_options_t *, SOLVEstruct_t *);
+extern int_t pxgstrs_init(int_t, int_t, int_t, int_t,
+                          int_t [], int_t [], gridinfo_t *grid,
+	                  Glu_persist_t *, SOLVEstruct_t *);
+extern void pxgstrs_finalize(pxgstrs_comm_t *);
+extern int  dldperm_dist(int_t, int_t, int_t, int_t [], int_t [],
+		    double [], int_t *, double [], double []);
+extern int  static_schedule(superlu_dist_options_t *, int, int, 
+		            LUstruct_t *, gridinfo_t *, SuperLUStat_t *,
+			    int_t *, int_t *, int *);
+extern void LUstructInit(const int_t, LUstruct_t *);
+extern void LUstructFree(LUstruct_t *);
+extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+
+/* #define GPU_PROF
+#define IPM_PROF */
+
+extern int_t pdgstrf(superlu_dist_options_t *, int, int, double,
+		    LUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*);
+extern void pdgstrs_Bglobal(int_t, LUstruct_t *, gridinfo_t *,
+			     double *, int_t, int, SuperLUStat_t *, int *);
+extern void pdgstrs(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *,
+		    double *, int_t, int_t, int_t, int, SOLVEstruct_t *,
+		    SuperLUStat_t *, int *);
+extern void dlsum_fmod(double *, double *, double *, double *,
+		       int, int, int_t , int_t *, int_t, int_t, int_t,
+		       int_t *, gridinfo_t *, LocalLU_t *, 
+		       MPI_Request [], SuperLUStat_t *);
+extern void dlsum_bmod(double *, double *, double *,
+                       int, int_t, int_t *, int_t *, Ucb_indptr_t **,
+                       int_t **, int_t *, gridinfo_t *, LocalLU_t *,
+		       MPI_Request [], SuperLUStat_t *);
+extern void pdgsrfs(int_t, SuperMatrix *, double, LUstruct_t *,
+		    ScalePermstruct_t *, gridinfo_t *,
+		    double [], int_t, double [], int_t, int,
+		    SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
+extern void pdgsrfs_ABXglobal(int_t, SuperMatrix *, double, LUstruct_t *,
+		  gridinfo_t *, double *, int_t, double *, int_t,
+		  int, double *, SuperLUStat_t *, int *);
+extern int   pdgsmv_AXglobal_setup(SuperMatrix *, Glu_persist_t *,
+				   gridinfo_t *, int_t *, int_t *[],
+				   double *[], int_t *[], int_t []);
+extern int  pdgsmv_AXglobal(int_t, int_t [], double [], int_t [],
+	                       double [], double []);
+extern int  pdgsmv_AXglobal_abs(int_t, int_t [], double [], int_t [],
+				 double [], double []);
+extern void pdgsmv_init(SuperMatrix *, int_t *, gridinfo_t *,
+			pdgsmv_comm_t *);
+extern void pdgsmv(int_t, SuperMatrix *, gridinfo_t *, pdgsmv_comm_t *,
+		   double x[], double ax[]);
+extern void pdgsmv_finalize(pdgsmv_comm_t *);
+
+/* Memory-related */
+extern double  *doubleMalloc_dist(int_t);
+extern double  *doubleCalloc_dist(int_t);
+extern void  *duser_malloc_dist (int_t, int_t);
+extern void  duser_free_dist (int_t, int_t);
+extern int_t dQuerySpace_dist(int_t, LUstruct_t *, gridinfo_t *,
+			      SuperLUStat_t *, superlu_dist_mem_usage_t *);
+
+/* Auxiliary routines */
+extern void    dfill_dist (double *, int_t, double);
+extern void    dinf_norm_error_dist (int_t, int_t, double*, int_t,
+                                     double*, int_t, gridinfo_t*);
+extern void    pdinf_norm_error(int, int_t, int_t, double [], int_t,
+				double [], int_t , gridinfo_t *);
+extern void  dreadhb_dist (int, FILE *, int_t *, int_t *, int_t *, 
+			   double **, int_t **, int_t **);
+extern void  dreadtriple_dist(FILE *, int_t *, int_t *, int_t *,
+			 double **, int_t **, int_t **);
+extern void  dreadrb_dist(int, FILE *, int_t *, int_t *, int_t *,
+		     double **, int_t **, int_t **);
+extern void  dreadMM_dist(FILE *, int_t *, int_t *, int_t *,
+	                  double **, int_t **, int_t **);
+
+/* Distribute the data for numerical factorization */
+extern float ddist_psymbtonum(fact_t, int_t, SuperMatrix *,
+                                ScalePermstruct_t *, Pslu_freeable_t *, 
+                                LUstruct_t *, gridinfo_t *);
+extern void pdGetDiagU(int_t, LUstruct_t *, gridinfo_t *, double *);
+
+
+/* Routines for debugging */
+extern void  dPrintLblocks(int, int_t, gridinfo_t *, Glu_persist_t *,
+		 	   LocalLU_t *);
+extern void  dPrintUblocks(int, int_t, gridinfo_t *, Glu_persist_t *,
+			   LocalLU_t *);
+extern void  dPrint_CompCol_Matrix_dist(SuperMatrix *);
+extern void  dPrint_Dense_Matrix_dist(SuperMatrix *);
+extern int   dPrint_CompRowLoc_Matrix_dist(SuperMatrix *);
+extern int   file_PrintDouble5(FILE *, char *, int_t, double *);
+
+
+/* BLAS */
+
+#ifdef USE_VENDOR_BLAS
+extern void dgemm_(const char*, const char*, const int*, const int*, const int*,
+                  const double*, const double*, const int*, const double*,
+                  const int*, const double*, double*, const int*, int, int);
+extern void dtrsv_(char*, char*, char*, int*, double*, int*,
+                  double*, int*, int, int, int);
+extern void dtrsm_(char*, char*, char*, char*, int*, int*, 
+                  double*, double*, int*, double*, 
+                  int*, int, int, int, int);
+extern void dgemv_(char *, int *, int *, double *, double *a, int *, 
+                  double *, int *, double *, double *, int *, int);
+extern void dger_(int*, int*, double*, double*, int*,
+                 double*, int*, double*, int*);
+
+#else
+extern int dgemm_(const char*, const char*, const int*, const int*, const int*,
+                   const double*,  const double*,  const int*,  const double*,
+                   const int*,  const double*, double*, const int*);
+extern int dtrsv_(char*, char*, char*, int*, double*, int*,
+                  double*, int*);
+extern int dtrsm_(char*, char*, char*, char*, int*, int*, 
+                  double*, double*, int*, double*, int*);
+extern int dgemv_(char *, int *, int *, double *, double *a, int *, 
+                  double *, int *, double *, double *, int *);
+extern void dger_(int*, int*, double*, double*, int*,
+                 double*, int*, double*, int*);
+
+#endif
+
+
+#ifdef __cplusplus
+  }
+#endif
+
+#endif /* __SUPERLU_dDEFS */
+
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
new file mode 100644
index 0000000..27c1bdf
--- /dev/null
+++ b/SRC/superlu_defs.h
@@ -0,0 +1,764 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Definitions which are precision-neutral
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ *
+ * Modified:
+ *     February 20, 2008
+ *     October 11, 2014
+ * </pre>
+ */
+
+#ifndef __SUPERLU_DEFS /* allow multiple inclusions */
+#define __SUPERLU_DEFS
+
+/*
+ * File name:	superlu_defs.h
+ * Purpose:     Definitions which are precision-neutral
+ */
+#ifdef _CRAY
+    #include <fortran.h>
+#endif
+
+#ifdef _OPENMP
+   #include <omp.h>
+#endif
+
+#include <mpi.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include <string.h>
+
+/*************************************************************************
+ * Constants
+ **************************************************************************/
+/*
+ * You can support older versions of SuperLU_DIST.
+ * At compile-time, you can catch the new release as:
+ *   #if SUPERLU_DIST_MAJOR_VERSION == 5
+ *       use the new interface
+ *   #else
+ *       use the old interface
+ *   #endif
+ * Versions 4.x and earlier do not include #define'd version numbers.
+ */
+#define SUPERLU_DIST_MAJOR_VERSION     5
+#define SUPERLU_DIST_MINOR_VERSION     1
+#define SUPERLU_DIST_PATCH_VERSION     3
+
+/* Define my integer size int_t, its MPI datatype mpi_int_t, and its format string IFMT */
+#ifdef _CRAY /* NOTE(review): this branch defines no IFMT */
+  typedef short int_t;
+  /*#undef int   Revert back to int of default size. */
+  #define mpi_int_t   MPI_SHORT
+#elif defined (_LONGINT)
+  typedef long long int int_t;
+  #define mpi_int_t   MPI_LONG_LONG_INT
+  #define IFMT "%lld"
+#else /* Default */
+  typedef int int_t;
+  #define mpi_int_t   MPI_INT
+  #define IFMT "%8d"
+#endif
+
+#include "superlu_enum_consts.h"
+#include "Cnames.h"
+#include "supermatrix.h"
+#include "util_dist.h"
+#include "psymbfact.h"
+
+#define ISORT     /* NOTE: qsort() has bug on Mac */
+
+/*********************************************************************** 
+ * Constants
+ ***********************************************************************/
+/* 
+ * For each block column of L, the index[] array contains both the row 
+ * subscripts and the integers describing the size of the blocks.
+ * The organization of index[] looks like:
+ *
+ *     [ BLOCK COLUMN HEADER (size BC_HEADER)
+ *           number of blocks 
+ *           number of row subscripts, i.e., LDA of nzval[]
+ *       BLOCK 0                                        <----
+ *           BLOCK DESCRIPTOR (of size LB_DESCRIPTOR)  |
+ *               block number (global)                      |
+ *               number of full rows in the block           |
+ *           actual row subscripts                          |
+ *       BLOCK 1                                            | Repeat ...
+ *           BLOCK DESCRIPTOR                               | number of blocks
+ *               block number (global)                      | 
+ *               number of full rows in the block           |
+ *           actual row subscripts                          |
+ *       .                                                  |
+ *       .                                                  |
+ *       .                                              <----
+ *     ]
+ *
+ * For each block row of U, the organization of index[] looks like:
+ *
+ *     [ BLOCK ROW HEADER (of size BR_HEADER)
+ *           number of blocks 
+ *           number of entries in nzval[]
+ *           number of entries in index[]
+ *       BLOCK 0                                        <----
+ *           BLOCK DESCRIPTOR (of size UB_DESCRIPTOR)  |
+ *               block number (global)                      |
+ *               number of nonzeros in the block            |
+ *           actual fstnz subscripts                        |
+ *       BLOCK 1                                            | Repeat ...
+ *           BLOCK DESCRIPTOR                               | number of blocks
+ *               block number (global)                      |
+ *               number of nonzeros in the block            |
+ *           actual fstnz subscripts                        |
+ *       .                                                  |
+ *       .                                                  |
+ *       .                                              <----
+ *     ]
+ *
+ */
+#define BC_HEADER      2
+#define LB_DESCRIPTOR  2
+#define BR_HEADER      3
+#define UB_DESCRIPTOR  2
+#define NBUFFERS       5
+
+/*
+ * Communication tags (SLU_MPI_TAG uses a variable named tag_ub from the caller's scope)
+ */
+/* Return the mpi_tag assuming 5 pairs of communications and MPI_TAG_UB >= 5 *
+ * for each supernodal column "num", the five communications are:            *
+ * 0,1: for sending L to "right"                                             *
+ * 2,3: for sending off-diagonal blocks of U "down"                          *
+ * 4  : for sending the diagonal block down (in pxgstrf2)                    */
+#define SLU_MPI_TAG(id,num) ( (5*(num)+(id)) % tag_ub )
+
+    /* For numeric factorization. */
+#if 0
+#define NTAGS    10000
+#else
+#define NTAGS    INT_MAX
+#endif
+#define UjROW    10
+#define UkSUB    11
+#define UkVAL    12
+#define LkSUB    13
+#define LkVAL    14
+#define LkkDIAG  15
+    /* For triangular solves. */
+#define XK_H     2  /* The header preceding each X block. */
+#define LSUM_H   2  /* The header preceding each MOD block. */
+#define GSUM     20 
+#define Xk       21
+#define Yk       22
+#define LSUM     23
+
+/*
+ * Communication scopes
+ */
+#define COMM_ALL      100
+#define COMM_COLUMN   101
+#define COMM_ROW      102
+
+/*
+ * Matrix distribution for sparse matrix-vector multiplication
+ */
+#define SUPER_LINEAR     11
+#define SUPER_BLOCK      12
+
+/*
+ * Number of marker arrays used in the symbolic factorization, each of size n
+ */
+#define NO_MARKER     3
+
+
+
+/***********************************************************************
+ * Macros -- several use caller-scope names (supno, xsup, ilsum, nrhs)
+ ***********************************************************************/
+#define IAM(comm)    { int rank; MPI_Comm_rank ( comm, &rank ); rank}; /* NOTE(review): a braced block, not an expression; "rank}" lacks a ';' */
+#define MYROW(iam,grid) ( (iam) / (grid)->npcol )
+#define MYCOL(iam,grid) ( (iam) % (grid)->npcol )
+#define BlockNum(i)     ( supno[(i)] )
+#define FstBlockC(bnum) ( xsup[(bnum)] )
+#define SuperSize(bnum) ( xsup[(bnum)+1]-xsup[(bnum)] )
+#define LBi(bnum,grid)  ( (bnum)/(grid)->nprow )/* Global to local block rowwise */
+#define LBj(bnum,grid)  ( (bnum)/(grid)->npcol )/* Global to local block columnwise*/
+#define PROW(bnum,grid) ( (bnum) % (grid)->nprow )
+#define PCOL(bnum,grid) ( (bnum) % (grid)->npcol )
+#define PNUM(i,j,grid)  ( (i)*(grid)->npcol + (j) ) /* Process number at coord(i,j) */
+#define CEILING(a,b)    ( ((a)%(b)) ? ((a)/(b) + 1) : ((a)/(b)) )
+    /* For triangular solves; the whole expansion is parenthesized so it is safe next to any operator */
+#define RHS_ITERATE(i)                    \
+        for (i = 0; i < nrhs; ++i)
+#define X_BLK(i)                          \
+        ( ilsum[(i)] * nrhs + ((i)+1) * XK_H )
+#define LSUM_BLK(i)                       \
+        ( ilsum[(i)] * nrhs + ((i)+1) * LSUM_H )
+
+#define SuperLU_timer_  SuperLU_timer_dist_
+#define LOG2(x)   (log10((double) (x)) / log10(2.0))
+
+
+#if ( VAMPIR>=1 ) /* tracing macros call VT_traceon()/VT_traceoff() only when VAMPIR>=1 */
+#define VT_TRACEON    VT_traceon()
+#define VT_TRACEOFF   VT_traceoff()
+#else /* otherwise both macros expand to nothing */
+#define VT_TRACEON 
+#define VT_TRACEOFF
+#endif
+
+
+/***********************************************************************
+ * New data types
+ ***********************************************************************/
+
+/* 
+ *   Define the 2D mapping of matrix blocks to process grid.
+ *
+ *   Process grid:
+ *     Processes are numbered (0 : P-1).
+ *     P = Pr x Pc, where Pr, Pc are the number of process rows and columns.
+ *     (pr,pc) is the coordinate of IAM; 0 <= pr < Pr, 0 <= pc < Pc.
+ *
+ *   Matrix blocks:
+ *     Matrix is partitioned according to supernode partitions, both
+ *     column and row-wise. 
+ *     The k-th block columns (rows) contains columns (rows) (s:t), where
+ *             s=xsup[k], t=xsup[k+1]-1.
+ *     Block A(I,J) contains
+ *             rows from (xsup[I]:xsup[I+1]-1) and
+ *             columns from (xsup[J]:xsup[J+1]-1)
+ *
+ *  Mapping of matrix entry (i,j) to matrix block (I,J):
+ *     (I,J) = ( supno[i], supno[j] )
+ *
+ *  Mapping of matrix block (I,J) to process grid (pr,pc):
+ *     (pr,pc) = ( MOD(I,NPROW), MOD(J,NPCOL) )
+ *  
+ *  (xsup[nsupers],supno[n]) are replicated on all processors.
+ *
+ */
+
+/*-- Communication subgroup: a communicator, its process count, and my process number */
+typedef struct {
+    MPI_Comm comm;        /* MPI communicator */
+    int Np;               /* number of processes */
+    int Iam;              /* my process number */
+} superlu_scope_t;
+
+/*-- Process grid definition */
+typedef struct {
+    MPI_Comm comm;        /* MPI communicator */
+    superlu_scope_t rscp; /* process scope in rowwise, horizontal direction */
+    superlu_scope_t cscp; /* process scope in columnwise, vertical direction */
+    int iam;              /* my process number in this scope */
+    int_t nprow;          /* number of process rows */
+    int_t npcol;          /* number of process columns */
+} gridinfo_t;
+
+
+/*
+ *-- The structures are determined by SYMBFACT and used thereafter.
+ *
+ * (xsup,supno) describes mapping between supernode and column:
+ *	xsup[s] is the leading column of the s-th supernode.
+ *      supno[i] is the supernode no to which column i belongs;
+ *	e.g.   supno 0 1 2 2 3 3 3 4 4 4 4 4   (n=12)
+ *	        xsup 0 1 2 4 7 12
+ *	Note: dfs will be performed on supernode rep. relative to the new 
+ *	      row pivoting ordering
+ *
+ * This is allocated during symbolic factorization SYMBFACT.
+ */
+typedef struct {
+    int_t     *xsup;   /* xsup[s]: leading column of the s-th supernode */
+    int_t     *supno;  /* supno[i]: supernode number to which column i belongs */
+} Glu_persist_t;
+
+/*
+ *-- The structures are determined by SYMBFACT and used by DDISTRIBUTE.
+ * 
+ * (xlsub,lsub): lsub[*] contains the compressed subscript of
+ *	rectangular supernodes; xlsub[j] points to the starting
+ *	location of the j-th column in lsub[*]. Note that xlsub 
+ *	is indexed by column.
+ *	Storage: original row subscripts
+ *
+ *      During the course of sparse LU factorization, we also use
+ *	(xlsub,lsub) for the purpose of symmetric pruning. For each
+ *	supernode {s,s+1,...,t=s+r} with first column s and last
+ *	column t, the subscript set
+ *		lsub[j], j=xlsub[s], .., xlsub[s+1]-1
+ *	is the structure of column s (i.e. structure of this supernode).
+ *	It is used for the storage of numerical values.
+ *	Furthermore,
+ *		lsub[j], j=xlsub[t], .., xlsub[t+1]-1
+ *	is the structure of the last column t of this supernode.
+ *	It is for the purpose of symmetric pruning. Therefore, the
+ *	structural subscripts can be rearranged without making physical
+ *	interchanges among the numerical values.
+ *
+ *	However, if the supernode has only one column, then we
+ *	only keep one set of subscripts. For any subscript interchange
+ *	performed, similar interchange must be done on the numerical
+ *	values.
+ *
+ *	The last column structures (for pruning) will be removed
+ *	after the numerical LU factorization phase.
+ *
+ * (xusub,usub): xusub[i] points to the starting location of column i
+ *      in usub[]. For each U-segment, only the row index of first nonzero
+ *      is stored in usub[].
+ *
+ *      Each U column consists of a number of full segments. Each full segment
+ *      starts from a leading nonzero, running up to the supernode (block)
+ *      boundary. (Recall that the column-wise supernode partition is also
+ *      imposed on the rows.) Because the segment is full, we don't store all
+ *      the row indices. Instead, only the leading nonzero index is stored.
+ *      The rest can be found together with xsup/supno pair.
+ *      For example, 
+ *          xusub[j+1] - xusub[j] = number of segments in column j.
+ *          for any i in usub[], 
+ *              supno[i]         = block number in which i belongs to
+ *  	        xsup[supno[i]+1] = first row of the next block
+ *              The nonzeros of this segment are: 
+ *                  i, i+1 ... xsup[supno[i]+1]-1 (only i is stored in usub[])
+ *
+ */
+typedef struct {
+    int_t     *lsub;     /* compressed L subscripts */
+    int_t     *xlsub;    /* xlsub[j]: start of column j in lsub[] */
+    int_t     *usub;     /* compressed U subscripts */
+    int_t     *xusub;    /* xusub[i]: start of column i in usub[] */
+    int_t     nzlmax;    /* current max size of lsub */
+    int_t     nzumax;    /* current max size of usub */
+    LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */
+    int_t     *llvl;     /* keep track of level in L for level-based ILU */
+    int_t     *ulvl;     /* keep track of level in U for level-based ILU */
+} Glu_freeable_t;
+
+
+/* 
+ *-- The structure used to store matrix A of the linear system and
+ *   several vectors describing the transformations done to matrix A.
+ *
+ * A      (SuperMatrix*)
+ *        Matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *        The number of linear equations is A->nrow. The type of A can be:
+ *        Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
+ *         
+ * DiagScale  (DiagScale_t)
+ *        Specifies the form of equilibration that was done.
+ *        = NOEQUIL: No equilibration.
+ *        = ROW:  Row equilibration, i.e., A was premultiplied by diag(R).
+ *        = COL:  Column equilibration, i.e., A was postmultiplied by diag(C).
+ *        = BOTH: Both row and column equilibration, i.e., A was replaced 
+ *                 by diag(R)*A*diag(C).
+ *
+ * R      double*, dimension (A->nrow)
+ *        The row scale factors for A.
+ *        If DiagScale = ROW or BOTH, A is multiplied on the left by diag(R).
+ *        If DiagScale = NOEQUIL or COL, R is not defined.
+ *
+ * C      double*, dimension (A->ncol)
+ *        The column scale factors for A.
+ *        If DiagScale = COL or BOTH, A is multiplied on the right by diag(C).
+ *        If DiagScale = NOEQUIL or ROW, C is not defined.
+ *         
+ * perm_r (int_t*) dimension (A->nrow)
+ *        Row permutation vector, which defines the permutation matrix Pr;
+ *        perm_r[i] = j means row i of A is in position j in Pr*A.
+ *
+ * perm_c (int_t*) dimension (A->ncol)
+ *        Column permutation vector, which defines the
+ *        permutation matrix Pc; perm_c[i] = j means column i of A is
+ *        in position j in A*Pc.
+ *
+ */
+typedef struct {
+    DiagScale_t DiagScale; /* form of equilibration that was done */
+    double *R;             /* row scale factors for A */
+    double *C;             /* column scale factors for A */
+    int_t  *perm_r;        /* row permutation vector */
+    int_t  *perm_c;        /* column permutation vector */
+} ScalePermstruct_t;
+
+/*-- Data structure for redistribution of B and X --*/
+typedef struct {
+    int  *B_to_X_SendCnt;
+    int  *X_to_B_SendCnt;
+    int  *ptr_to_ibuf, *ptr_to_dbuf;
+
+    /* the following are needed in the hybrid solver PDSLin */	
+    int *X_to_B_iSendCnt;
+    int *X_to_B_vSendCnt;
+    int    *disp_ibuf;
+    int_t  *send_ibuf;
+    void   *send_dbuf;
+
+    int_t  x2b, b2x;
+    int_t  *send_ibuf2;
+    int_t  *recv_ibuf2;
+    void   *send_dbuf2;
+    void   *recv_dbuf2;
+} pxgstrs_comm_t;
+
+/* 
+ *-- This contains the options used to control the solution process.
+ *
+ * Fact   (fact_t)
+ *        Specifies whether or not the factored form of the matrix
+ *        A is supplied on entry, and if not, how the matrix A should
+ *        be factorized.
+ *        = DOFACT: The matrix A will be factorized from scratch, and the
+ *             factors will be stored in L and U.
+ *        = SamePattern: The matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector 
+ *             ScalePermstruct->perm_c and the column elimination tree
+ *             LUstruct->etree.
+ *        = SamePattern_SameRowPerm: The matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity	pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, both row and
+ *             column permutation vectors perm_r and perm_c, and the
+ *             data structure set up from the previous symbolic factorization.
+ *        = FACTORED: On entry, L, U, perm_r and perm_c contain the 
+ *              factored form of A. If DiagScale is not NOEQUIL, the matrix
+ *              A has been equilibrated with scaling factors R and C.
+ *
+ * Equil  (yes_no_t)
+ *        Specifies whether to equilibrate the system (scale A's rows and
+ *        columns to have unit norm).
+ *
+ * ColPerm (colperm_t)
+ *        Specifies what type of column permutation to use to reduce fill.
+ *        = NATURAL: use the natural ordering 
+ *        = MMD_ATA: use minimum degree ordering on structure of A'*A
+ *        = MMD_AT_PLUS_A: use minimum degree ordering on structure of A'+A
+ *        = COLAMD: use approximate minimum degree column ordering
+ *        = MY_PERMC: use the ordering specified by the user
+ *         
+ * Trans  (trans_t)
+ *        Specifies the form of the system of equations:
+ *        = NOTRANS: A * X = B        (No transpose)
+ *        = TRANS:   A**T * X = B     (Transpose)
+ *        = CONJ:    A**H * X = B     (Conjugate transpose)
+ *
+ * IterRefine (IterRefine_t)
+ *        Specifies whether to perform iterative refinement.
+ *        = NOREFINE: no iterative refinement
+ *        = SLU_SINGLE: perform iterative refinement in single precision
+ *        = SLU_DOUBLE: perform iterative refinement in double precision
+ *        = SLU_EXTRA: perform iterative refinement in extra precision
+ *
+ * DiagPivotThresh (double, in [0.0, 1.0]) (only for serial SuperLU)
+ *        Specifies the threshold used for a diagonal entry to be an
+ *        acceptable pivot.
+ *
+ * SymmetricMode (yes_no_t) (only for serial SuperLU)
+ *        Specifies whether to use symmetric mode. Symmetric mode gives 
+ *        preference to diagonal pivots, and uses an (A'+A)-based column
+ *        permutation algorithm.
+ *
+ * PivotGrowth (yes_no_t)  (only for serial SuperLU)
+ *        Specifies whether to compute the reciprocal pivot growth.
+ *
+ * ConditionNumber (yes_no_t) (only for serial SuperLU)
+ *        Specifies whether to compute the reciprocal condition number.
+ *
+ * RowPerm (rowperm_t) (only for SuperLU_DIST or ILU in serial SuperLU)
+ *        Specifies whether to permute rows of the original matrix.
+ *        = NOROWPERM: do not permute the rows
+ *        = LargeDiag: make the diagonal large relative to the off-diagonal
+ *        = MY_PERMR: use the permutation given by the user
+ *
+ * ILU_DropRule (int)  (only for serial SuperLU)
+ *        Specifies the dropping rule:
+ *	  = DROP_BASIC:   Basic dropping rule, supernodal based ILUTP(tau).
+ *	  = DROP_PROWS:   Supernodal based ILUTP(p,tau), p = gamma * nnz(A)/n.
+ *	  = DROP_COLUMN:  Variant of ILUTP(p,tau), for j-th column,
+ *			      p = gamma * nnz(A(:,j)).
+ *	  = DROP_AREA:    Variation of ILUTP, for j-th column, use
+ *			      nnz(F(:,1:j)) / nnz(A(:,1:j)) to control memory.
+ *	  = DROP_DYNAMIC: Modify the threshold tau during factorization:
+ *			  If nnz(L(:,1:j)) / nnz(A(:,1:j)) > gamma
+ *				  tau_L(j) := MIN(tau_0, tau_L(j-1) * 2);
+ *			  Otherwise
+ *				  tau_L(j) := MAX(tau_0, tau_L(j-1) / 2);
+ *			  tau_U(j) uses the similar rule.
+ *			  NOTE: the thresholds used by L and U are separate.
+ *	  = DROP_INTERP:  Compute the second dropping threshold by
+ *	                  interpolation instead of sorting (default).
+ *  		          In this case, the actual fill ratio is not
+ *			  guaranteed to be smaller than gamma.
+ *   	  Note: DROP_PROWS, DROP_COLUMN and DROP_AREA are mutually exclusive.
+ *	  ( Default: DROP_BASIC | DROP_AREA )
+ *
+ * ILU_DropTol (double) (only for serial SuperLU)
+ *        numerical threshold for dropping.
+ *
+ * ILU_FillFactor (double) (only for serial SuperLU)
+ *        Gamma in the secondary dropping.
+ *
+ * ILU_Norm (norm_t)  (only for serial SuperLU)
+ *        Specify which norm to use to measure the row size in a
+ *        supernode: infinity-norm, 1-norm, or 2-norm.
+ *
+ * ILU_FillTol (double) (only for serial SuperLU)
+ *        numerical threshold for zero pivot perturbation.
+ *
+ * ILU_MILU (milu_t)  (only for serial SuperLU)
+ *        Specifies which version of MILU to use.
+ *
+ * ILU_MILU_Dim (double) 
+ *        Dimension of the PDE if available.
+ *
+ * ReplaceTinyPivot (yes_no_t) (only for SuperLU_DIST)
+ *        Specifies whether to replace the tiny diagonals by
+ *        sqrt(epsilon)*||A|| during LU factorization.
+ *
+ * SolveInitialized (yes_no_t) (only for SuperLU_DIST)
+ *        Specifies whether the initialization has been performed to the
+ *        triangular solve.
+ *
+ * RefineInitialized (yes_no_t) (only for SuperLU_DIST)
+ *        Specifies whether the initialization has been performed to the
+ *        sparse matrix-vector multiplication routine needed in iterative
+ *        refinement.
+ *
+ * num_lookaheads (int) (only for SuperLU_DIST)
+ *        Specifies the number of levels in the look-ahead factorization
+ *
+ * lookahead_etree (yes_no_t) (only for SuperLU_DIST)
+ *        Specifies whether to use the elimination tree computed from the 
+ *        serial symbolic factorization to perform scheduling.
+ *
+ * SymPattern (yes_no_t) (only for SuperLU_DIST)
+ *        Gives the scheduling algorithm a hint whether the matrix
+ *        would have symmetric pattern.
+ *
+ */
+typedef struct {
+    fact_t        Fact;            /* how the matrix A is to be factorized */
+    yes_no_t      Equil;           /* whether to equilibrate the system    */
+    colperm_t     ColPerm;         /* column permutation used to reduce fill */
+    trans_t       Trans;           /* form of the system of equations      */
+    IterRefine_t  IterRefine;      /* whether/how to do iterative refinement */
+    double        DiagPivotThresh; /* serial SuperLU only: threshold for an acceptable diagonal pivot */
+    yes_no_t      SymmetricMode;   /* serial SuperLU only: prefer diagonal pivots */
+    yes_no_t      PivotGrowth;     /* serial SuperLU only: compute reciprocal pivot growth */
+    yes_no_t      ConditionNumber; /* serial SuperLU only: compute reciprocal condition number */
+    rowperm_t     RowPerm;         /* whether to permute rows of the original matrix */
+    int 	  ILU_DropRule;   /* serial SuperLU only: dropping rule (DROP_* flags) */
+    double	  ILU_DropTol;    /* threshold for dropping */
+    double	  ILU_FillFactor; /* gamma in the secondary dropping */
+    norm_t	  ILU_Norm;       /* infinity-norm, 1-norm, or 2-norm */
+    double	  ILU_FillTol;    /* threshold for zero pivot perturbation */
+    milu_t	  ILU_MILU;       /* serial SuperLU only: which version of MILU to use */
+    double	  ILU_MILU_Dim;   /* Dimension of PDE (if available) */
+    yes_no_t      ParSymbFact;     /* NOTE(review): not described above; presumably enables the parallel symbolic factorization -- confirm */
+    yes_no_t      ReplaceTinyPivot; /* used in SuperLU_DIST */
+    yes_no_t      SolveInitialized;  /* SuperLU_DIST only: triangular solve already initialized  */
+    yes_no_t      RefineInitialized; /* SuperLU_DIST only: refinement SpMV already initialized   */
+    yes_no_t      PrintStat;       /* NOTE(review): not described above; presumably controls printing of statistics -- confirm */
+    int           nnzL, nnzU;      /* used to store nnzs for now       */
+    int           num_lookaheads;  /* num of levels in look-ahead      */
+    yes_no_t      lookahead_etree; /* use etree computed from the
+				      serial symbolic factorization */
+    yes_no_t      SymPattern;      /* symmetric factorization          */
+} superlu_dist_options_t;
+
+typedef struct {
+    float for_lu;
+    float total;
+    int_t expansions;
+    long long int nnzL, nnzU;
+} superlu_dist_mem_usage_t;
+
+/* 
+ *-- The new structures added in the hybrid CUDA + OpenMP + MPI code.
+ */
+typedef struct {
+    int_t rukp;
+    int_t iukp;
+    int_t jb;
+    int_t full_u_cols;
+
+} Ublock_info_t;
+
+typedef struct {
+    int_t lptr;
+    int_t ib;
+    int_t FullRow;
+} Remain_info_t;
+
+typedef struct
+{
+    int id, key;
+    void *next;
+} etree_node;
+
+struct superlu_pair
+{
+    int ind;
+    int val;
+};
+
+/**--------**/
+
+
+/***********************************************************************
+ * Function prototypes
+ ***********************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void   set_default_options_dist(superlu_dist_options_t *);
+extern void   superlu_gridinit(MPI_Comm, int_t, int_t, gridinfo_t *);
+extern void   superlu_gridmap(MPI_Comm, int_t, int_t, int_t [], int_t,
+			      gridinfo_t *);
+extern void   superlu_gridexit(gridinfo_t *);
+extern void   print_options_dist(superlu_dist_options_t *);
+extern void   print_sp_ienv_dist(superlu_dist_options_t *);
+extern void   Destroy_CompCol_Matrix_dist(SuperMatrix *);
+extern void   Destroy_SuperNode_Matrix_dist(SuperMatrix *);
+extern void   Destroy_SuperMatrix_Store_dist(SuperMatrix *);
+extern void   Destroy_CompCol_Permuted_dist(SuperMatrix *);
+extern void   Destroy_CompRowLoc_Matrix_dist(SuperMatrix *);
+extern void   Destroy_CompRow_Matrix_dist(SuperMatrix *);
+extern void   sp_colorder (superlu_dist_options_t*, SuperMatrix*, int_t*, int_t*,
+			   SuperMatrix*);
+extern int    sp_symetree_dist(int_t *, int_t *, int_t *, int_t, int_t *);
+extern int    sp_coletree_dist (int_t *, int_t *, int_t *, int_t, int_t, int_t *);
+extern void   get_perm_c_dist(int_t, int_t, SuperMatrix *, int_t *);
+extern void   at_plus_a_dist(const int_t, const int_t, int_t *, int_t *,
+			     int_t *, int_t **, int_t **);
+extern int    genmmd_dist_(int_t *, int_t *, int_t *a, 
+			   int_t *, int_t *, int_t *, int_t *, 
+			   int_t *, int_t *, int_t *, int_t *, int_t *);
+extern void  bcast_tree(void *, int, MPI_Datatype, int, int,
+			gridinfo_t *, int, int *);
+extern int_t symbfact(superlu_dist_options_t *, int, SuperMatrix *, int_t *,
+                      int_t *, Glu_persist_t *, Glu_freeable_t *);
+extern int_t symbfact_SubInit(fact_t, void *, int_t, int_t, int_t, int_t,
+			      Glu_persist_t *, Glu_freeable_t *);
+extern int_t symbfact_SubXpand(int_t, int_t, int_t, MemType, int_t *,
+			       Glu_freeable_t *);
+extern int_t symbfact_SubFree(Glu_freeable_t *);
+extern void    countnz_dist (const int_t, int_t *, 
+			     long long int *, long long int *,
+			     Glu_persist_t *, Glu_freeable_t *);
+extern long long int fixupL_dist (const int_t, const int_t *, Glu_persist_t *,
+				  Glu_freeable_t *);
+extern int_t   *TreePostorder_dist (int_t, int_t *);
+extern float   smach_dist(char *);
+extern double  dmach_dist(char *);
+extern void    *superlu_malloc_dist (size_t);
+extern void    superlu_free_dist (void*);
+extern int_t   *intMalloc_dist (int_t);
+extern int_t   *intCalloc_dist (int_t);
+extern int_t   mc64id_dist(int_t *);
+extern void  arrive_at_ublock (int_t, int_t *, int_t *, int_t *,
+			       int_t *, int_t *, int_t, int_t, 
+			       int_t *, int_t *, int_t *, gridinfo_t *);
+extern int_t estimate_bigu_size (int_t, int_t, int_t **, Glu_persist_t *,
+				 gridinfo_t *, int_t *);
+
+/* Auxiliary routines */
+extern double SuperLU_timer_ ();
+extern void   superlu_abort_and_exit_dist(char *);
+extern int_t  sp_ienv_dist (int_t);
+extern void   ifill_dist (int_t *, int_t, int_t);
+extern void   super_stats_dist (int_t, int_t *);
+extern void   ScalePermstructInit(const int_t, const int_t, 
+				   ScalePermstruct_t *);
+extern void   ScalePermstructFree(ScalePermstruct_t *);
+extern void  get_diag_procs(int_t, Glu_persist_t *, gridinfo_t *, int_t *,
+			    int_t **, int_t **);
+extern int_t QuerySpace_dist(int_t, int_t, Glu_freeable_t *, superlu_dist_mem_usage_t *);
+extern int   xerr_dist (char *, int *);
+extern void  pxerr_dist (char *, gridinfo_t *, int_t);
+extern void  PStatInit(SuperLUStat_t *);
+extern void  PStatFree(SuperLUStat_t *);
+extern void  PStatPrint(superlu_dist_options_t *, SuperLUStat_t *, gridinfo_t *);
+extern void  log_memory(long long, SuperLUStat_t *);
+extern void  print_memorylog(SuperLUStat_t *, char *);
+
+/* Prototypes for parallel symbolic factorization */
+extern float symbfact_dist
+(int,  int, SuperMatrix *, int_t *, int_t *,  int_t *, int_t *,
+ Pslu_freeable_t *, MPI_Comm *, MPI_Comm *,  superlu_dist_mem_usage_t *);
+
+/* Get the column permutation using parmetis */
+extern float get_perm_c_parmetis 
+(SuperMatrix *, int_t *, int_t *, int, int, 
+ int_t **, int_t **, gridinfo_t *, MPI_Comm *);
+
+/* Auxiliary routines for memory expansions used during
+   the parallel symbolic factorization routine */
+
+extern int_t psymbfact_LUXpandMem
+(int_t, int_t, int_t, int_t, int_t, int_t, int_t, int_t, 
+ Pslu_freeable_t *, Llu_symbfact_t *,  vtcsInfo_symbfact_t *, psymbfact_stat_t *);
+
+extern int_t psymbfact_LUXpand
+(int_t, int_t, int_t, int_t, int_t *, int_t, int_t, int_t, int_t, 
+ Pslu_freeable_t *, Llu_symbfact_t *,  vtcsInfo_symbfact_t *, psymbfact_stat_t *);
+
+extern int_t psymbfact_LUXpand_RL
+(int_t, int_t, int_t, int_t, int_t, int_t,
+ Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *);
+
+extern int_t psymbfact_prLUXpand
+(int_t,  int_t, int, Llu_symbfact_t *, psymbfact_stat_t *);
+
+#ifdef GPU_ACC   /* GPU related */
+extern void gemm_division_cpu_gpu (int *, int *, int *, int,
+				   int, int, int *, int);
+extern int_t get_cublas_nb ();
+extern int_t get_num_cuda_streams ();
+#endif
+
+extern int get_thread_per_process();
+extern int_t get_max_buffer_size ();
+extern int_t get_min (int_t *, int_t);
+extern int compare_pair (const void *, const void *);
+extern int_t static_partition (struct superlu_pair *, int_t, int_t *, int_t,
+			       int_t *, int_t *, int);
+
+/* Routines for debugging */
+extern void  print_panel_seg_dist(int_t, int_t, int_t, int_t, int_t *, int_t *);
+extern void  check_repfnz_dist(int_t, int_t, int_t, int_t *);
+extern int_t CheckZeroDiagonal(int_t, int_t *, int_t *, int_t *);
+extern void  PrintDouble5(char *, int_t, double *);
+extern void  PrintInt10(char *, int_t, int_t *);
+extern void  PrintInt32(char *, int, int *);
+extern int   file_PrintInt10(FILE *, char *, int_t, int_t *);
+extern int   file_PrintInt32(FILE *, char *, int, int *);
+extern int   file_PrintLong10(FILE *, char *, int_t, int_t *);
+
+#ifdef __cplusplus
+  }
+#endif
+
+#endif /* __SUPERLU_DEFS */
diff --git a/SRC/superlu_enum_consts.h b/SRC/superlu_enum_consts.h
new file mode 100644
index 0000000..07fb1a4
--- /dev/null
+++ b/SRC/superlu_enum_consts.h
@@ -0,0 +1,81 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/** @file superlu_enum_consts.h
+ * \brief enum constants header file 
+ *
+ * -- SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley, 
+ * October 1, 2010
+ *
+ */
+
+#ifndef __SUPERLU_ENUM_CONSTS /* allow multiple inclusions */
+#define __SUPERLU_ENUM_CONSTS
+
+/***********************************************************************
+ * Enumerated types
+ ***********************************************************************/
+typedef enum {NO, YES}                                          yes_no_t;
+typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t;
+typedef enum {NOROWPERM, LargeDiag, MY_PERMR}                   rowperm_t;
+typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD,
+	      METIS_AT_PLUS_A, PARMETIS, ZOLTAN, MY_PERMC}      colperm_t;
+typedef enum {NOTRANS, TRANS, CONJ}                             trans_t;
+typedef enum {NOEQUIL, ROW, COL, BOTH}                          DiagScale_t;
+typedef enum {NOREFINE, SLU_SINGLE=1, SLU_DOUBLE, SLU_EXTRA}    IterRefine_t;
+typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL}              MemType;
+typedef enum {HEAD, TAIL}                                       stack_end_t;
+typedef enum {SYSTEM, USER}                                     LU_space_t;
+typedef enum {ONE_NORM, TWO_NORM, INF_NORM}			norm_t;
+typedef enum {SILU, SMILU_1, SMILU_2, SMILU_3}			milu_t;
+#if 0
+typedef enum {NODROP		= 0x0000,
+	      DROP_BASIC	= 0x0001, /* ILU(tau) */
+	      DROP_PROWS	= 0x0002, /* ILUTP: keep p maximum rows */
+	      DROP_COLUMN	= 0x0004, /* ILUTP: for j-th column, 
+					     p = gamma * nnz(A(:,j)) */
+	      DROP_AREA 	= 0x0008, /* ILUTP: for j-th column, use
+					     nnz(F(:,1:j)) / nnz(A(:,1:j))
+					     to limit memory growth  */
+	      DROP_SECONDARY	= 0x000E, /* PROWS | COLUMN | AREA */
+	      DROP_DYNAMIC	= 0x0010,
+	      DROP_INTERP	= 0x0100}			rule_t;
+#endif
+
+
+/* 
+ * The following enumerated type is used by the statistics variable
+ * to keep track of flop count and time spent at various stages.
+ *
+ * Note that not all of the fields are disjoint.
+ */
+typedef enum {  /* one entry per stage tracked by the statistics variable */
+    COLPERM, /* find a column ordering that minimizes fills */
+    ROWPERM, /* find a row ordering that maximizes the diagonal. */
+    RELAX,   /* find artificial supernodes */
+    ETREE,   /* compute column etree */
+    EQUIL,   /* equilibrate the original matrix */
+    SYMBFAC, /* symbolic factorization. */
+    DIST,    /* distribute matrix. */
+    FACT,    /* perform LU factorization */
+    COMM,    /* communication for factorization */
+    SOL_COMM,/* communication for solve */
+    RCOND,   /* estimate reciprocal condition number */
+    SOLVE,   /* forward and back solves */
+    REFINE,  /* perform iterative refinement */
+    TRSV,    /* fraction of FACT spent in xTRSV */
+    GEMV,    /* fraction of FACT spent in xGEMV */
+    FERR,    /* estimate error bounds after iterative refinement */
+    NPHASES  /* total number of phases */
+} PhaseType;
+
+
+#endif /* __SUPERLU_ENUM_CONSTS */
diff --git a/SRC/superlu_grid.c b/SRC/superlu_grid.c
new file mode 100644
index 0000000..1213d27
--- /dev/null
+++ b/SRC/superlu_grid.c
@@ -0,0 +1,178 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief SuperLU grid utilities
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+#include "superlu_ddefs.h"
+
+/* Define global variables */
+MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX = MPI_DATATYPE_NULL;
+
+/*! \brief Form an nprow x npcol grid from ranks 0..nprow*npcol-1 of Bcomm.
+ *  All processes in the MPI communicator must call this routine. */
+void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which
+					 the new grid is formed. */
+		      int_t nprow, int_t npcol, gridinfo_t *grid)
+{
+    int Np = nprow * npcol;
+    int_t *usermap;
+    int i, j, info;
+
+    /* Check MPI environment initialization before doing any other work. */
+    MPI_Initialized( &info );
+    if ( !info )
+	ABORT("C main program must explicitly call MPI_Init()");
+    MPI_Comm_size( Bcomm, &info );
+    if ( info < Np )
+	ABORT("Number of processes is smaller than NPROW * NPCOL");
+
+    /* usermap is column-major (ld = nprow): grid slot (i,j) gets rank i*npcol+j. */
+    usermap = (int_t *) SUPERLU_MALLOC(Np*sizeof(int_t));
+    if ( Np > 0 && !usermap ) ABORT("Malloc fails for usermap[]");
+    for (j = 0; j < npcol; ++j)
+	for (i = 0; i < nprow; ++i) usermap[j*nprow+i] = i*npcol+j;
+
+    superlu_gridmap(Bcomm, nprow, npcol, usermap, nprow, grid);
+    
+    SUPERLU_FREE(usermap);
+}
+
+
+/*! \brief Form the process grid described by usermap[] on top of Bcomm.
+ *  All processes in the MPI communicator must call this routine. */
+void superlu_gridmap(
+		     MPI_Comm Bcomm, /* The base communicator upon which
+					the new grid is formed. */
+		     int_t nprow,
+		     int_t npcol,
+		     int_t usermap[], /* usermap(i,j) holds the process
+					 number to be placed in {i,j} of
+					 the process grid.  */
+		     int_t ldumap,    /* The leading dimension of the
+					 2D array usermap[].  */
+		     gridinfo_t *grid)
+{
+    MPI_Group mpi_base_group, superlu_grp;
+    int Np = nprow * npcol, mycol, myrow;
+    int *pranks;
+    int i, j, info;
+    
+    /* Check MPI environment initialization before any other MPI call. */
+    MPI_Initialized( &info );
+    if ( !info )
+	ABORT("C main program must explicitly call MPI_Init()");
+
+    /* Create datatype in C for MPI complex. */
+    if ( SuperLU_MPI_DOUBLE_COMPLEX == MPI_DATATYPE_NULL ) {
+	MPI_Type_contiguous( 2, MPI_DOUBLE, &SuperLU_MPI_DOUBLE_COMPLEX );
+	MPI_Type_commit( &SuperLU_MPI_DOUBLE_COMPLEX );
+    }
+
+    grid->nprow = nprow;
+    grid->npcol = npcol;
+
+    /* Make a list of the processes in the new communicator. */
+    pranks = (int *) SUPERLU_MALLOC(Np*sizeof(int));
+    for (j = 0; j < npcol; ++j)
+	for (i = 0; i < nprow; ++i)
+	    pranks[i*npcol+j] = usermap[j*ldumap+i];
+    
+    /* Form MPI communicator for all. */
+    /* Get the group underlying Bcomm. */
+    MPI_Comm_group( Bcomm, &mpi_base_group );
+    /* Create the new group. */
+    MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp );
+    /* Create the new communicator. */
+    /* NOTE: The call is to be executed by all processes in Bcomm,
+       even if they do not belong in the new group -- superlu_grp. */
+    MPI_Comm_create( Bcomm, superlu_grp, &grid->comm );
+
+    /* Bail out if I am not in the group, superlu_group.  Such a process
+       keeps Bcomm and its rank in Bcomm; pranks and both group handles
+       must still be released, so leave through the common cleanup. */
+    if ( grid->comm == MPI_COMM_NULL ) {
+	grid->comm = Bcomm;
+	MPI_Comm_rank( Bcomm, &i );
+	grid->iam = i;
+	/*grid->iam = -1;*/
+	goto gridmap_out;
+    }
+
+    MPI_Comm_rank( grid->comm, &(grid->iam) );
+    myrow = grid->iam / npcol;
+    mycol = grid->iam % npcol;
+
+    /*
+     * Form MPI communicator for myrow, scope = COMM_ROW.
+     */
+#if 0
+    for (i = 0; i < npcol; ++i) pranks[i] = myrow*npcol + i;
+    MPI_Comm_group( grid->comm, &superlu_grp );          /* Find all's group */
+    MPI_Group_incl( superlu_grp, npcol, pranks, &grp );  /* Form new group */
+    MPI_Comm_create( grid->comm, grp, &grid->rscp.comm );/* Create new comm */
+#else
+    MPI_Comm_split(grid->comm, myrow, mycol, &(grid->rscp.comm));
+#endif
+
+    /*
+     * Form MPI communicator for mycol, scope = COMM_COLUMN.
+     */
+#if 0
+    for (i = 0; i < nprow; ++i) pranks[i] = i*npcol + mycol;
+    MPI_Group_incl( superlu_grp, nprow, pranks, &grp );  /* Form new group */
+    MPI_Comm_create( grid->comm, grp, &grid->cscp.comm );/* Create new comm */
+#else
+    MPI_Comm_split(grid->comm, mycol, myrow, &(grid->cscp.comm));
+#endif
+
+    grid->rscp.Np = npcol;
+    grid->rscp.Iam = mycol;
+    grid->cscp.Np = nprow;
+    grid->cscp.Iam = myrow;
+
+#if 0
+    {
+	int tag_ub;
+	if ( !grid->iam ) {
+	    MPI_Attr_get(Bcomm, MPI_TAG_UB, &tag_ub, &info);
+	    printf("MPI_TAG_UB %d\n", tag_ub);
+	    /* returns 4295677672
+	       In reality it is restricted to no greater than 16384. */
+	}
+	exit(0);
+    }
+#endif
+
+ gridmap_out:  /* cleanup shared by grid members and non-members */
+    SUPERLU_FREE(pranks);
+    MPI_Group_free(&superlu_grp);
+    MPI_Group_free(&mpi_base_group);
+}
+
+void superlu_gridexit(gridinfo_t *grid)
+{
+    /* Free the grid's communicators unless comm is null or MPI_COMM_WORLD. */
+    if ( !(grid->comm == MPI_COMM_NULL || grid->comm == MPI_COMM_WORLD) ) {
+	MPI_Comm_free( &grid->rscp.comm );
+	MPI_Comm_free( &grid->cscp.comm );
+	MPI_Comm_free( &grid->comm );
+    }
+    /* Free the shared complex datatype if it has been created. */
+    if ( SuperLU_MPI_DOUBLE_COMPLEX != MPI_DATATYPE_NULL )
+	MPI_Type_free( &SuperLU_MPI_DOUBLE_COMPLEX );
+}
diff --git a/SRC/superlu_timer.c b/SRC/superlu_timer.c
new file mode 100644
index 0000000..6f6e682
--- /dev/null
+++ b/SRC/superlu_timer.c
@@ -0,0 +1,78 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Returns the time in seconds used by the process
+ *
+ * <pre>
+ * Purpose
+ * ======= 
+ *	Returns the time in seconds used by the process.
+ *
+ * Note: the timer function call is machine dependent. Use conditional
+ *       compilation to choose the appropriate function.
+ * </pre>
+ */
+
+#include "superlu_defs.h"
+
+#ifdef SUN 
+/*
+ * 	It uses the system call gethrtime(3C), which is accurate to 
+ *	nanoseconds. 
+*/
+#include <sys/time.h>
+ 
+double SuperLU_timer_() {
+    return ( (double)gethrtime() / 1e9 );
+}
+
+#elif defined ( UNIX_TIMER )
+
+#include <sys/types.h>
+#include <sys/times.h>
+#include <time.h>
+#include <sys/time.h>
+#include <unistd.h>   /* declares sysconf() and _SC_CLK_TCK */
+
+/* User + system CPU time of the calling process, in seconds. */
+double SuperLU_timer_()
+{
+    struct tms use;
+    double tmp;
+    int clocks_per_sec = sysconf(_SC_CLK_TCK);
+
+    times(&use);
+    tmp = (double) use.tms_utime + (double) use.tms_stime;
+    return tmp / clocks_per_sec;
+}
+
+#elif _WIN32
+
+#include <time.h>
+
+/* Processor time reported by clock(), in seconds. */
+double SuperLU_timer_()
+{
+    clock_t t = clock();
+    return ((double)t)/CLOCKS_PER_SEC;
+}
+
+#else   /* default: time reported by MPI_Wtime(), in seconds */
+
+#include <mpi.h>
+
+double SuperLU_timer_()
+{
+    return MPI_Wtime();
+}
+
+#endif
+
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
new file mode 100644
index 0000000..dc918b2
--- /dev/null
+++ b/SRC/superlu_zdefs.h
@@ -0,0 +1,385 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief  Distributed SuperLU data types and function prototypes
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * April 5, 2015
+ * </pre>
+ */
+
+#ifndef __SUPERLU_zDEFS /* allow multiple inclusions */
+#define __SUPERLU_zDEFS
+
+/*
+ * File name:	superlu_zdefs.h
+ * Purpose:     Distributed SuperLU data types and function prototypes
+ * History:
+ */
+
+#include "superlu_defs.h"
+#include "dcomplex.h"
+
+/*-- Auxiliary data type used in PxGSTRS/PxGSTRS1. */
+typedef struct {
+    int_t lbnum;  /* Row block number (local).      */
+    int_t indpos; /* Starting position in Uindex[]. */
+} Ucb_indptr_t;
+
+/* 
+ * On each processor, the blocks in L are stored in compressed block
+ * column format, the blocks in U are stored in compressed block row format.
+ */
+#define MAX_LOOKAHEADS 50
+typedef struct {
+    int_t   **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc)                 */
+    doublecomplex  **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc)                 */
+    int_t   **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
+    doublecomplex  **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
+#if 0
+    int_t   *Lsub_buf;        /* Buffer for the remote subscripts of L */
+    double  *Lval_buf;        /* Buffer for the remote nonzeros of L   */
+    int_t   *Usub_buf;        /* Buffer for the remote subscripts of U */
+    doublecomplex  *Uval_buf;        /* Buffer for the remote nonzeros of U   */
+#endif
+    int_t   *Lsub_buf_2[MAX_LOOKAHEADS];   /* Buffers for the remote subscripts of L*/
+    doublecomplex  *Lval_buf_2[MAX_LOOKAHEADS];   /* Buffers for the remote nonzeros of L  */
+    int_t   *Usub_buf_2[MAX_LOOKAHEADS];   /* Buffer for the remote subscripts of U */
+    doublecomplex  *Uval_buf_2[MAX_LOOKAHEADS];   /* Buffer for the remote nonzeros of U   */
+    doublecomplex  *ujrow;           /* used in panel factorization.          */
+    int_t   bufmax[NBUFFERS]; /* Maximum buffer size across all MPI ranks:
+			       *  0 : maximum size of Lsub_buf[]
+			       *  1 : maximum size of Lval_buf[]
+			       *  2 : maximum size of Usub_buf[] 
+			       *  3 : maximum size of Uval_buf[]
+			       *  4 : maximum size of tempv[LDA]
+			       */
+
+    /*-- Record communication schedule for factorization. --*/
+    int   *ToRecv;          /* Recv from no one (0), left (1), and up (2).*/
+    int   *ToSendD;         /* Whether need to send down block row.       */
+    int   **ToSendR;        /* List of processes to send right block col. */
+
+    /*-- Record communication schedule for forward/back solves. --*/
+    int_t   *fmod;            /* Modification count for L-solve            */
+    int_t   **fsendx_plist;   /* Column process list to send down Xk       */
+    int_t   *frecv;           /* Modifications to be recv'd in proc row    */
+    int_t   nfrecvx;          /* Number of Xk I will receive in L-solve    */
+    int_t   nfsendx;          /* Number of Xk I will send in L-solve       */
+    int_t   *bmod;            /* Modification count for U-solve            */
+    int_t   **bsendx_plist;   /* Column process list to send down Xk       */
+    int_t   *brecv;           /* Modifications to be recv'd in proc row    */
+    int_t   nbrecvx;          /* Number of Xk I will receive in U-solve    */
+    int_t   nbsendx;          /* Number of Xk I will send in U-solve       */
+    int_t   *mod_bit;         /* Flag contribution from each row block     */
+
+    /*-- Auxiliary arrays used for forward/back solves. --*/
+    int_t   *ilsum;           /* Starting position of each supernode in lsum
+				 (local)  */
+    int_t   ldalsum;          /* LDA of lsum (local) */
+    int_t   SolveMsgSent;     /* Number of actual messages sent in LU-solve */
+    int_t   SolveMsgVol;      /* Volume of messages sent in the solve phase */
+
+
+    /*********************/	
+    /* The following variables are used in the hybrid solver */
+
+    /*-- Counts to be used in U^{-T} triangular solve. -- */
+    int_t UT_SOLVE;
+    int_t L_SOLVE;
+    int_t FRECV;
+    int_t ut_ldalsum;        /* LDA of lsum (local) */
+    int_t *ut_ilsum;         /* ilsum in column-wise                        */
+    int_t *utmod;            /* Modification count for Ut-solve.            */
+    int_t **ut_sendx_plist;  /* Row process list to send down Xk            */
+    int_t *utrecv;           /* Modifications to be recv'd in proc column.  */
+    int_t n_utsendx;         /* Number of Xk I will send                    */
+    int_t n_utrecvx;         /* Number of Xk I will receive                 */
+    int_t n_utrecvmod;
+    int_t nroot;
+    int_t *ut_modbit;
+    int_t *Urbs;
+    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+
+    /* some additional counters for L solve */
+    int_t n;
+    int_t nleaf;
+    int_t nfrecvmod;
+} LocalLU_t;
+
+
+typedef struct {
+    int_t *etree;
+    Glu_persist_t *Glu_persist;
+    LocalLU_t *Llu;
+} LUstruct_t;
+
+
+/*-- Data structure for communication during matrix-vector multiplication. */
+typedef struct {
+    int_t *extern_start;
+    int_t *ind_tosend;    /* X indices to be sent to other processes */
+    int_t *ind_torecv;    /* X indices to be received from other processes */
+    int_t *ptr_ind_tosend;/* Pointers to ind_tosend[] (Size procs)
+			     (also point to val_torecv) */
+    int_t *ptr_ind_torecv;/* Pointers to ind_torecv[] (Size procs)
+			     (also point to val_tosend) */
+    int   *SendCounts;    /* Numbers of X indices to be sent
+			     (also numbers of X values to be received) */
+    int   *RecvCounts;    /* Numbers of X indices to be received
+			     (also numbers of X values to be sent) */
+    doublecomplex *val_tosend;   /* X values to be sent to other processes */
+    doublecomplex *val_torecv;   /* X values to be received from other processes */
+    int_t TotalIndSend;   /* Total number of indices to be sent
+			     (also total number of values to be received) */
+    int_t TotalValSend;   /* Total number of values to be sent.
+			     (also total number of indices to be received) */
+} pzgsmv_comm_t;
+
+/*-- Data structure holding the information for the solution phase --*/
+typedef struct {
+    int_t *row_to_proc;
+    int_t *inv_perm_c;
+    int_t num_diag_procs, *diag_procs, *diag_len;
+    pzgsmv_comm_t *gsmv_comm; /* communication metadata for SpMV, 
+         	       		      required by IterRefine.          */
+    pxgstrs_comm_t *gstrs_comm;  /* communication metadata for SpTRSV. */
+    int_t *A_colind_gsmv; /* After pzgsmv_init(), the global column
+                             indices of A are translated into the relative
+                             positions in the gathered x-vector.
+                             This is re-used in repeated calls to pzgsmv() */
+    /*int_t *xrow_to_proc; Xiaoye: can be removed */
+} SOLVEstruct_t;
+
+
+/***********************************************************************
+ * Function prototypes
+ ***********************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Supernodal LU factor related */
+extern void
+zCreate_CompCol_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, doublecomplex *,
+			    int_t *, int_t *, Stype_t, Dtype_t, Mtype_t);
+extern void
+zCreate_CompRowLoc_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, int_t,
+			       int_t, doublecomplex *, int_t *, int_t *,
+			       Stype_t, Dtype_t, Mtype_t);
+extern void
+zCompRow_to_CompCol_dist(int_t, int_t, int_t, doublecomplex *, int_t *, int_t *,
+                         doublecomplex **, int_t **, int_t **);
+extern int
+pzCompRow_loc_to_CompCol_global(int_t, SuperMatrix *, gridinfo_t *,
+	 		        SuperMatrix *);
+extern void
+zCopy_CompCol_Matrix_dist(SuperMatrix *, SuperMatrix *);
+extern void
+zCreate_Dense_Matrix_dist(SuperMatrix *, int_t, int_t, doublecomplex *, int_t,
+			  Stype_t, Dtype_t, Mtype_t);
+extern void
+zCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, doublecomplex *, 
+			      int_t *, int_t *, int_t *, int_t *, int_t *,
+			      Stype_t, Dtype_t, Mtype_t);
+extern void
+zCopy_Dense_Matrix_dist(int_t, int_t, doublecomplex *, int_t,
+                        doublecomplex *, int_t);
+
+extern void    zallocateA_dist (int_t, int_t, doublecomplex **, int_t **, int_t **);
+extern void    zGenXtrue_dist (int_t, int_t, doublecomplex *, int_t);
+extern void    zFillRHS_dist (char *, int_t, doublecomplex *, int_t,
+                              SuperMatrix *, doublecomplex *, int_t);
+extern int     zcreate_matrix(SuperMatrix *, int, doublecomplex **, int *, 
+			      doublecomplex **, int *, FILE *, gridinfo_t *);
+extern int     zcreate_matrix_rb(SuperMatrix *, int, doublecomplex **, int *, 
+			      doublecomplex **, int *, FILE *, gridinfo_t *);
+extern int     zcreate_matrix_dat(SuperMatrix *, int, doublecomplex **, int *, 
+			      doublecomplex **, int *, FILE *, gridinfo_t *);
+
+/* Driver related */
+extern void    zgsequ_dist (SuperMatrix *, double *, double *, double *,
+			    double *, double *, int_t *);
+extern double  zlangs_dist (char *, SuperMatrix *);
+extern void    zlaqgs_dist (SuperMatrix *, double *, double *, double,
+			    double, double, char *);
+extern void    pzgsequ (SuperMatrix *, double *, double *, double *,
+			double *, double *, int_t *, gridinfo_t *);
+extern double  pzlangs (char *, SuperMatrix *, gridinfo_t *);
+extern void    pzlaqgs (SuperMatrix *, double *, double *, double,
+			double, double, char *);
+extern int     pzPermute_Dense_Matrix(int_t, int_t, int_t [], int_t[],
+				      doublecomplex [], int, doublecomplex [], int, int,
+				      gridinfo_t *);
+
+extern int     sp_ztrsv_dist (char *, char *, char *, SuperMatrix *,
+			      SuperMatrix *, doublecomplex *, int *);
+extern int     sp_zgemv_dist (char *, doublecomplex, SuperMatrix *, doublecomplex *,
+			      int, doublecomplex, doublecomplex *, int);
+extern int     sp_zgemm_dist (char *, int, doublecomplex, SuperMatrix *,
+                        doublecomplex *, int, doublecomplex, doublecomplex *, int);
+
+extern float zdistribute(fact_t, int_t, SuperMatrix *, Glu_freeable_t *, 
+			 LUstruct_t *, gridinfo_t *);
+extern void  pzgssvx_ABglobal(superlu_dist_options_t *, SuperMatrix *, 
+			      ScalePermstruct_t *, doublecomplex *,
+			      int, int, gridinfo_t *, LUstruct_t *, double *,
+			      SuperLUStat_t *, int *);
+extern float pzdistribute(fact_t, int_t, SuperMatrix *, 
+			 ScalePermstruct_t *, Glu_freeable_t *, 
+			 LUstruct_t *, gridinfo_t *);
+extern void  pzgssvx(superlu_dist_options_t *, SuperMatrix *, 
+		     ScalePermstruct_t *, doublecomplex *,
+		     int, int, gridinfo_t *, LUstruct_t *,
+		     SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
+extern int  zSolveInit(superlu_dist_options_t *, SuperMatrix *, int_t [], int_t [],
+		       int_t, LUstruct_t *, gridinfo_t *, SOLVEstruct_t *);
+extern void zSolveFinalize(superlu_dist_options_t *, SOLVEstruct_t *);
+extern int_t pxgstrs_init(int_t, int_t, int_t, int_t,
+                          int_t [], int_t [], gridinfo_t *grid,
+	                  Glu_persist_t *, SOLVEstruct_t *);
+extern void pxgstrs_finalize(pxgstrs_comm_t *);
+extern int  zldperm_dist(int_t, int_t, int_t, int_t [], int_t [],
+		    doublecomplex [], int_t *, double [], double []);
+extern int  static_schedule(superlu_dist_options_t *, int, int, 
+		            LUstruct_t *, gridinfo_t *, SuperLUStat_t *,
+			    int_t *, int_t *, int *);
+extern void LUstructInit(const int_t, LUstruct_t *);
+extern void LUstructFree(LUstruct_t *);
+extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+
+/* #define GPU_PROF
+#define IPM_PROF */
+
+extern int_t pzgstrf(superlu_dist_options_t *, int, int, double,
+		    LUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*);
+extern void pzgstrs_Bglobal(int_t, LUstruct_t *, gridinfo_t *,
+			     doublecomplex *, int_t, int, SuperLUStat_t *, int *);
+extern void pzgstrs(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *,
+		    doublecomplex *, int_t, int_t, int_t, int, SOLVEstruct_t *,
+		    SuperLUStat_t *, int *);
+extern void zlsum_fmod(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *,
+		       int, int, int_t , int_t *, int_t, int_t, int_t,
+		       int_t *, gridinfo_t *, LocalLU_t *, 
+		       MPI_Request [], SuperLUStat_t *);
+extern void zlsum_bmod(doublecomplex *, doublecomplex *, doublecomplex *,
+                       int, int_t, int_t *, int_t *, Ucb_indptr_t **,
+                       int_t **, int_t *, gridinfo_t *, LocalLU_t *,
+		       MPI_Request [], SuperLUStat_t *);
+extern void pzgsrfs(int_t, SuperMatrix *, double, LUstruct_t *,
+		    ScalePermstruct_t *, gridinfo_t *,
+		    doublecomplex [], int_t, doublecomplex [], int_t, int,
+		    SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
+extern void pzgsrfs_ABXglobal(int_t, SuperMatrix *, double, LUstruct_t *,
+		  gridinfo_t *, doublecomplex *, int_t, doublecomplex *, int_t,
+		  int, double *, SuperLUStat_t *, int *);
+extern int   pzgsmv_AXglobal_setup(SuperMatrix *, Glu_persist_t *,
+				   gridinfo_t *, int_t *, int_t *[],
+				   doublecomplex *[], int_t *[], int_t []);
+extern int  pzgsmv_AXglobal(int_t, int_t [], doublecomplex [], int_t [],
+	                       doublecomplex [], doublecomplex []);
+extern int  pzgsmv_AXglobal_abs(int_t, int_t [], doublecomplex [], int_t [],
+				 doublecomplex [], double []);
+extern void pzgsmv_init(SuperMatrix *, int_t *, gridinfo_t *,
+			pzgsmv_comm_t *);
+extern void pzgsmv(int_t, SuperMatrix *, gridinfo_t *, pzgsmv_comm_t *,
+		   doublecomplex x[], doublecomplex ax[]);
+extern void pzgsmv_finalize(pzgsmv_comm_t *);
+
+/* Memory-related */
+extern doublecomplex  *doublecomplexMalloc_dist(int_t);
+extern doublecomplex  *doublecomplexCalloc_dist(int_t);
+extern double  *doubleMalloc_dist(int_t);
+extern double  *doubleCalloc_dist(int_t);
+extern void  *duser_malloc_dist (int_t, int_t);
+extern void  duser_free_dist (int_t, int_t);
+extern int_t zQuerySpace_dist(int_t, LUstruct_t *, gridinfo_t *,
+			      SuperLUStat_t *, superlu_dist_mem_usage_t *);
+
+/* Auxiliary routines */
+extern void    zfill_dist (doublecomplex *, int_t, doublecomplex);
+extern void    zinf_norm_error_dist (int_t, int_t, doublecomplex*, int_t,
+                                     doublecomplex*, int_t, gridinfo_t*);
+extern void    pzinf_norm_error(int, int_t, int_t, doublecomplex [], int_t,
+				doublecomplex [], int_t , gridinfo_t *);
+extern void  zreadhb_dist (int, FILE *, int_t *, int_t *, int_t *, 
+			   doublecomplex **, int_t **, int_t **);
+extern void  zreadtriple_dist(FILE *, int_t *, int_t *, int_t *,
+			 doublecomplex **, int_t **, int_t **);
+extern void  zreadrb_dist(int, FILE *, int_t *, int_t *, int_t *,
+		     doublecomplex **, int_t **, int_t **);
+extern void  zreadMM_dist(FILE *, int_t *, int_t *, int_t *,
+	                  doublecomplex **, int_t **, int_t **);
+
+/* Distribute the data for numerical factorization */
+extern float zdist_psymbtonum(fact_t, int_t, SuperMatrix *,
+                                ScalePermstruct_t *, Pslu_freeable_t *, 
+                                LUstruct_t *, gridinfo_t *);
+extern void pzGetDiagU(int_t, LUstruct_t *, gridinfo_t *, doublecomplex *);
+
+
+/* Routines for debugging */
+extern void  zPrintLblocks(int, int_t, gridinfo_t *, Glu_persist_t *,
+		 	   LocalLU_t *);
+extern void  zPrintUblocks(int, int_t, gridinfo_t *, Glu_persist_t *,
+			   LocalLU_t *);
+extern void  zPrint_CompCol_Matrix_dist(SuperMatrix *);
+extern void  zPrint_Dense_Matrix_dist(SuperMatrix *);
+extern int   zPrint_CompRowLoc_Matrix_dist(SuperMatrix *);
+extern void  PrintDoublecomplex(char *, int_t, doublecomplex *);
+extern int   file_PrintDoublecomplex(FILE *fp, char *, int_t, doublecomplex *);
+
+
+/* BLAS */
+
+#ifdef USE_VENDOR_BLAS
+extern void zgemm_(const char*, const char*, const int*, const int*, const int*,
+                  const doublecomplex*, const doublecomplex*, const int*, const doublecomplex*,
+                  const int*, const doublecomplex*, doublecomplex*, const int*, int, int);
+extern void ztrsv_(char*, char*, char*, int*, doublecomplex*, int*,
+                  doublecomplex*, int*, int, int, int);
+extern void ztrsm_(char*, char*, char*, char*, int*, int*, 
+                  doublecomplex*, doublecomplex*, int*, doublecomplex*, 
+                  int*, int, int, int, int);
+extern void zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *, 
+                  doublecomplex *, int *, doublecomplex *, doublecomplex *, int *, int);
+extern void zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*,
+                 doublecomplex*, int*, doublecomplex*, int*);
+
+#else
+extern int zgemm_(const char*, const char*, const int*, const int*, const int*,
+                   const doublecomplex*,  const doublecomplex*,  const int*,  const doublecomplex*,
+                   const int*,  const doublecomplex*, doublecomplex*, const int*);
+extern int ztrsv_(char*, char*, char*, int*, doublecomplex*, int*,
+                  doublecomplex*, int*);
+extern int ztrsm_(char*, char*, char*, char*, int*, int*, 
+                  doublecomplex*, doublecomplex*, int*, doublecomplex*, int*);
+extern int zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *, 
+                  doublecomplex *, int *, doublecomplex *, doublecomplex *, int *);
+extern int zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*,
+                 doublecomplex*, int*, doublecomplex*, int*);
+
+#endif
+
+
+#ifdef __cplusplus
+  }
+#endif
+
+#endif /* __SUPERLU_zDEFS */
+
diff --git a/SRC/supermatrix.h b/SRC/supermatrix.h
new file mode 100644
index 0000000..1c29653
--- /dev/null
+++ b/SRC/supermatrix.h
@@ -0,0 +1,191 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Matrix type definitions
+ */
+
+#ifndef __SUPERLU_SUPERMATRIX /* allow multiple inclusions */
+#define __SUPERLU_SUPERMATRIX
+
+
+/********************************************
+ * The matrix types are defined as follows. *
+ ********************************************/
+typedef enum {
+    SLU_NC,    /* column-wise, no supernode */
+    SLU_NCP,   /* column-wise, column-permuted, no supernode 
+                  (The consecutive columns of nonzeros, after permutation,
+		   may not be stored  contiguously.) */
+    SLU_NR,    /* row-wise, no supernode */
+    SLU_SC,    /* column-wise, supernode */
+    SLU_SCP,   /* supernode, column-wise, permuted */    
+    SLU_SR,    /* row-wise, supernode */
+    SLU_DN,     /* Fortran style column-wise storage for dense matrix */
+    SLU_NR_loc  /* distributed compressed row format  */ 
+} Stype_t;
+
+typedef enum {
+    SLU_S,     /* single */
+    SLU_D,     /* double */
+    SLU_C,     /* single complex */
+    SLU_Z      /* double complex */
+} Dtype_t;
+
+typedef enum {
+    SLU_GE,    /* general */
+    SLU_TRLU,  /* lower triangular, unit diagonal */
+    SLU_TRUU,  /* upper triangular, unit diagonal */
+    SLU_TRL,   /* lower triangular */
+    SLU_TRU,   /* upper triangular */
+    SLU_SYL,   /* symmetric, store lower half */
+    SLU_SYU,   /* symmetric, store upper half */
+    SLU_HEL,   /* Hermitian, store lower half */
+    SLU_HEU    /* Hermitian, store upper half */
+} Mtype_t;
+
+typedef struct {
+	Stype_t Stype; /* Storage type: interprets the storage structure 
+		   	  pointed to by *Store. */
+	Dtype_t Dtype; /* Data type. */
+	Mtype_t Mtype; /* Matrix type: describes the mathematical property of 
+			  the matrix. */
+	int_t  nrow;   /* number of rows */
+	int_t  ncol;   /* number of columns */
+	void *Store;   /* pointer to the actual storage of the matrix */
+} SuperMatrix;
+
+/***********************************************
+ * The storage schemes are defined as follows. *
+ ***********************************************/
+
+/* Stype == SLU_NC (Also known as Harwell-Boeing sparse matrix format) */
+typedef struct {
+    int_t  nnz;	    /* number of nonzeros in the matrix */
+    void *nzval;    /* pointer to array of nonzero values, packed by column */
+    int_t  *rowind; /* pointer to array of row indices of the nonzeros */
+    int_t  *colptr; /* pointer to array of beginning of columns in nzval[] 
+		       and rowind[]  */
+                    /* Note:
+		       Zero-based indexing is used;
+		       colptr[] has ncol+1 entries, the last one pointing
+		       beyond the last column, so that colptr[ncol] = nnz. */
+} NCformat;
+
+/* Stype == SLU_NR */
+typedef struct {
+    int_t  nnz;	    /* number of nonzeros in the matrix */
+    void *nzval;    /* pointer to array of nonzero values, packed by row */
+    int_t  *colind; /* pointer to array of column indices of the nonzeros */
+    int_t  *rowptr; /* pointer to array of beginning of rows in nzval[] 
+		       and colind[]  */
+                    /* Note:
+		       Zero-based indexing is used;
+		       rowptr[] has nrow+1 entries, the last one pointing
+		       beyond the last row, so that rowptr[nrow] = nnz. */
+} NRformat;
+
+/* Stype == SLU_SC */
+typedef struct {
+  int_t  nnz;	     /* number of nonzeros in the matrix */
+  int_t  nsuper;     /* number of supernodes, minus 1 */
+  void *nzval;       /* pointer to array of nonzero values, packed by column */
+  int_t *nzval_colptr;/* pointer to array of beginning of columns in nzval[] */
+  int_t *rowind;     /* pointer to array of compressed row indices of 
+			rectangular supernodes */
+  int_t *rowind_colptr;/* pointer to array of beginning of columns in rowind[] */
+  int_t *col_to_sup;   /* col_to_sup[j] is the supernode number to which column 
+			j belongs; mapping from column to supernode number. */
+  int_t *sup_to_col;   /* sup_to_col[s] points to the start of the s-th 
+			supernode; mapping from supernode number to column.
+		        e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12)
+		              sup_to_col: 0 1 2 4 7 12           (nsuper=4) */
+                     /* Note:
+		        Zero-based indexing is used;
+		        nzval_colptr[], rowind_colptr[], col_to_sup and
+		        sup_to_col[] have ncol+1 entries, the last one
+		        pointing beyond the last column.
+		        For col_to_sup[], only the first ncol entries are
+		        defined. For sup_to_col[], only the first nsuper+2
+		        entries are defined. */
+} SCformat;
+
+/* Stype == SLU_SCP */
+typedef struct {
+  int_t  nnz;	     /* number of nonzeros in the matrix */
+  int_t  nsuper;     /* number of supernodes, minus 1 (see example below) */
+  void *nzval;       /* pointer to array of nonzero values, packed by column */
+  int_t  *nzval_colbeg;/* nzval_colbeg[j] points to beginning of column j
+			  in nzval[] */
+  int_t  *nzval_colend;/* nzval_colend[j] points to one past the last element
+			  of column j in nzval[] */
+  int_t  *rowind;      /* pointer to array of compressed row indices of 
+			  rectangular supernodes */
+  int_t *rowind_colbeg;/* rowind_colbeg[j] points to beginning of column j
+			  in rowind[] */
+  int_t *rowind_colend;/* rowind_colend[j] points to one past the last element
+			  of column j in rowind[] */
+  int_t *col_to_sup;   /* col_to_sup[j] is the supernode number to which column
+			  j belongs; mapping from column to supernode. */
+  int_t *sup_to_colbeg; /* sup_to_colbeg[s] points to the start of the s-th 
+			   supernode; mapping from supernode to column.*/
+  int_t *sup_to_colend; /* sup_to_colend[s] points to one past the end of the
+			   s-th supernode; mapping from supernode number to
+			   column.
+		        e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12)
+		              sup_to_colbeg: 0 1 2 4 7              (nsuper=4)
+			      sup_to_colend: 1 2 4 7 12                    */
+                     /* Note:
+		        Zero-based indexing is used;
+		        nzval_colptr[], rowind_colptr[], col_to_sup and
+		        sup_to_col[] have ncol+1 entries, the last one
+		        pointing beyond the last column.         */
+} SCPformat;
+
+/* Stype == SLU_NCP */
+typedef struct {
+    int_t nnz;	  /* number of nonzeros in the matrix */
+    void *nzval;  /* pointer to array of nonzero values, packed by column */
+    int_t *rowind;/* pointer to array of row indices of the nonzeros */
+		  /* Note: nzval[]/rowind[] always have the same length */
+    int_t *colbeg;/* colbeg[j] points to the beginning of column j in nzval[] 
+                     and rowind[]  */
+    int_t *colend;/* colend[j] points to one past the last element of column
+		     j in nzval[] and rowind[]  */
+		  /* Note:
+		     Zero-based indexing is used;
+		     The consecutive columns of the nonzeros may not be 
+		     contiguous in storage, because the matrix has been 
+		     postmultiplied by a column permutation matrix. */
+} NCPformat;
+
+/* Stype == SLU_DN */
+typedef struct {
+    int_t lda;    /* leading dimension */
+    void *nzval;  /* array of size lda*ncol to represent a dense matrix */
+} DNformat;
+
+/* Stype == SLU_NR_loc (Distributed Compressed Row Format) */
+typedef struct {
+    int_t nnz_loc;   /* number of nonzeros in the local submatrix */
+    int_t m_loc;     /* number of rows local to this processor */
+    int_t fst_row;   /* global index of the first row */
+    void  *nzval;    /* pointer to array of nonzero values, packed by row */
+    int_t *rowptr;   /* pointer to array of beginning of rows in nzval[] 
+			and colind[]  */
+    int_t *colind;   /* pointer to array of column indices of the nonzeros */
+                     /* Note:
+			Zero-based indexing is used;
+			rowptr[] has m_loc + 1 entries, the last one pointing
+			beyond the last row, so that rowptr[m_loc] = nnz_loc.*/
+} NRformat_loc;
+
+
+#endif  /* __SUPERLU_SUPERMATRIX */
diff --git a/SRC/symbfact.c b/SRC/symbfact.c
new file mode 100644
index 0000000..abd7e0c
--- /dev/null
+++ b/SRC/symbfact.c
@@ -0,0 +1,901 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Performs a symbolic factorization
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+ * </pre>
+ */
+
+/*
+ * Modified by X. S. Li.
+ */
+
+#include "superlu_ddefs.h"
+
+/* What type of supernodes we want */
+#define T2_SUPER
+
+
+/*
+ * Internal prototypes
+ */
+static void  relax_snode(int_t, int_t *, int_t, int_t *, int_t *); /* find the initial relaxed supernodes */
+static int_t snode_dfs(SuperMatrix *, const int_t, const int_t, int_t *,
+		       int_t *,	Glu_persist_t *, Glu_freeable_t *); /* row structure of one relaxed supernode */
+static int_t column_dfs(SuperMatrix *, const int_t, int_t *, int_t *, int_t *,
+			int_t *, int_t *, int_t *, int_t *, int_t *,
+			Glu_persist_t *, Glu_freeable_t *); /* symbolic factorization of one column */
+static int_t pivotL(const int_t, int_t *, int_t *,
+		    Glu_persist_t *, Glu_freeable_t *); /* moves the diagonal subscript into pivot position */
+static int_t set_usub(const int_t, const int_t, const int_t, int_t *, int_t *,
+		      Glu_persist_t *, Glu_freeable_t *); /* stores the U-segments of one column */
+static void  pruneL(const int_t, const int_t *, const int_t, const int_t,
+		    const int_t *, const int_t *, int_t *,
+		    Glu_persist_t *, Glu_freeable_t *); /* symmetric pruning using the current column */
+
+
+/************************************************************************/
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   symbfact() performs a symbolic factorization on matrix A and sets up 
+ *   the nonzero data structures which are suitable for supernodal Gaussian
+ *   elimination with no pivoting (GENP). This routine features:
+ *        o depth-first search (DFS)
+ *        o supernodes
+ *        o symmetric structure pruning
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes needed for LSUB.
+ *   = 0, matrix dimension is 1.
+ *   > 0, number of bytes allocated when out of memory.
+ * </pre>
+ */
+int_t symbfact
+/************************************************************************/
+(
+ superlu_dist_options_t *options, /* input options; only PrintStat is read here */
+ int         pnum,     /* process number */
+ SuperMatrix *A,       /* original matrix A permuted by columns (input) */
+ int_t       *perm_c,  /* column permutation vector (input; not referenced here) */
+ int_t       *etree,   /* column elimination tree (input) */
+ Glu_persist_t *Glu_persist,  /* output */
+ Glu_freeable_t *Glu_freeable /* output */
+ )
+{
+
+    int_t m, n, min_mn, j, i, k, irep, nseg, pivrow, info;
+    int_t *iwork, *perm_r, *segrep, *repfnz;
+    int_t *xprune, *marker, *parent, *xplore;
+    int_t relax, *desc, *relax_end;
+    long long int nnzL, nnzU, nnzLU, nnzLSUB;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(pnum, "Enter symbfact()");
+#endif
+
+    m = A->nrow;
+    n = A->ncol;
+    min_mn = SUPERLU_MIN(m, n);
+    /* Allocate storage common to the symbolic factor routines. */
+    /* NOTE(review): info is not checked here; confirm what symbfact_SubInit() returns on failure. */
+    info = symbfact_SubInit(DOFACT, NULL, 0, m, n, ((NCPformat*)A->Store)->nnz,
+			    Glu_persist, Glu_freeable);
+    if ( !(iwork = (int_t *) intMalloc_dist(6*m+2*n)) ) /* one workspace: 6 arrays of m, 2 of n */
+	ABORT("Malloc fails for iwork[]");
+    perm_r = iwork;
+    segrep = iwork + m;
+    repfnz = segrep + m;
+    marker = repfnz + m;
+    parent = marker + m;
+    xplore = parent + m;
+    xprune = xplore + m;
+    relax_end = xprune + n;
+    relax = sp_ienv_dist(2);
+    ifill_dist(perm_r, m, EMPTY);
+    ifill_dist(repfnz, m, EMPTY);
+    ifill_dist(marker, m, EMPTY);
+    Glu_persist->supno[0] = -1;
+    Glu_persist->xsup[0] = 0;
+    Glu_freeable->xlsub[0] = 0;
+    Glu_freeable->xusub[0] = 0;
+
+    /*for (j = 0; j < n; ++j) iperm_c[perm_c[j]] = j;*/
+
+    /* Identify relaxed supernodes. */
+    if ( !(desc = intMalloc_dist(n+1)) )
+	ABORT("Malloc fails for desc[]");
+    relax_snode(n, etree, relax, desc, relax_end);
+    SUPERLU_FREE(desc);
+    
+    for (j = 0; j < min_mn; ) {
+	if ( relax_end[j] != EMPTY ) { /* beginning of a relaxed snode */
+   	    k = relax_end[j];          /* end of the relaxed snode */
+	 
+	    /* Determine union of the row structure of supernode (j:k). */
+	    if ( (info = snode_dfs(A, j, k, xprune, marker,
+				   Glu_persist, Glu_freeable)) != 0 )
+		{ SUPERLU_FREE(iwork); return info; } /* do not leak the workspace */
+
+	    for (i = j; i <= k; ++i)
+		pivotL(i, perm_r, &pivrow, Glu_persist, Glu_freeable); 
+
+	    j = k+1;
+	} else {
+	    /* Perform a symbolic factorization on column j, and detects
+	       whether column j starts a new supernode. */
+	    if ((info = column_dfs(A, j, perm_r, &nseg, segrep, repfnz,
+				   xprune, marker, parent, xplore,
+				   Glu_persist, Glu_freeable)) != 0)
+		{ SUPERLU_FREE(iwork); return info; } /* do not leak the workspace */
+	    
+	    /* Copy the U-segments to usub[*]. */
+	    if ((info = set_usub(min_mn, j, nseg, segrep, repfnz,
+				 Glu_persist, Glu_freeable)) != 0)
+		{ SUPERLU_FREE(iwork); return info; } /* do not leak the workspace */
+
+	    pivotL(j, perm_r, &pivrow, Glu_persist, Glu_freeable); 
+
+	    /* Prune columns [0:j-1] using column j. */
+	    pruneL(j, perm_r, pivrow, nseg, segrep, repfnz, xprune,
+		   Glu_persist, Glu_freeable);
+
+	    /* Reset repfnz[*] to prepare for the next column. */
+	    for (i = 0; i < nseg; i++) {
+		irep = segrep[i];
+		repfnz[irep] = EMPTY;
+	    }
+
+	    ++j;
+	} /* else */
+    } /* for j ... */
+
+    countnz_dist(min_mn, xprune, &nnzL, &nnzU, Glu_persist, Glu_freeable);
+
+    /* Apply perm_r to L; Compress LSUB array. */
+    nnzLSUB = fixupL_dist(min_mn, perm_r, Glu_persist, Glu_freeable);
+
+    if ( !pnum && (options->PrintStat == YES)) {
+	nnzLU = nnzL + nnzU - min_mn;
+	printf("\tNonzeros in L       %lld\n", nnzL);
+	printf("\tNonzeros in U       %lld\n", nnzU);
+	printf("\tnonzeros in L+U     %lld\n", nnzLU);
+	printf("\tnonzeros in LSUB    %lld\n", nnzLSUB);
+    }
+#if ( PRNTlevel>=3 )
+    PrintInt10("lsub", Glu_freeable->xlsub[n], Glu_freeable->lsub);
+    PrintInt10("xlsub", n+1, Glu_freeable->xlsub);
+    PrintInt10("xprune", n, xprune);
+    PrintInt10("usub", Glu_freeable->xusub[n], Glu_freeable->usub);
+    PrintInt10("xusub", n+1, Glu_freeable->xusub);
+    PrintInt10("supno", n, Glu_persist->supno);
+    PrintInt10("xsup", (Glu_persist->supno[n])+2, Glu_persist->xsup);
+#endif
+
+    SUPERLU_FREE(iwork); /* released only after the dump above: xprune lives inside iwork */
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(pnum, "Exit symbfact()");
+#endif
+
+    /* return (-i); */
+    return (-nnzLSUB);
+
+} /* SYMBFACT */
+
+/************************************************************************/
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   relax_snode() identifies the initial relaxed supernodes, assuming that 
+ *   the matrix has been reordered according to a postorder of the etree.
+ * </pre>
+ */ 
+static void relax_snode
+/************************************************************************/
+(
+ const int_t n, /* number of columns in the matrix (input) */
+ int_t       *et,   /* column elimination tree (input) */
+ const int_t relax, /* max no of columns allowed in a relaxed snode (input) */
+ int_t       *desc, /* number of descendants of each etree node. */
+ int_t       *relax_end /* last column in a supernode (output) */
+ )
+{
+    int_t col;         /* column cursor */
+    int_t up;          /* etree parent of col; the value n is the dummy root */
+    int_t first;       /* first column of the snode being formed */
+    int_t count = 0;   /* relaxed snodes found so far */
+
+    ifill_dist(relax_end, n, EMPTY);
+    ifill_dist(desc, n+1, 0);
+
+    /* Accumulate descendant counts: each column adds itself plus its own
+       descendants to its parent, unless the parent is the dummy root. */
+    for (col = 0; col < n; ++col) {
+	up = et[col];
+	if ( up == n ) continue;
+	desc[up] += desc[col] + 1;
+    }
+
+    /* From each leaf, climb the etree while the parent still has fewer than
+       relax descendants; the column reached is the last one of the snode. */
+    col = 0;
+    while ( col < n ) {
+	first = col;
+	for (up = et[col]; up != n && desc[up] < relax; up = et[col])
+	    col = up;
+	relax_end[first] = col;  /* snode spans columns first..col */
+	count++;
+
+	/* Move on to the next leaf, i.e. a column without descendants. */
+	for (++col; col < n && desc[col] != 0; ++col)
+	    ;
+    }
+
+#if ( DEBUGlevel>=1 )
+    printf(".. No of relaxed snodes: " IFMT "\trelax: " IFMT "\n", count, relax);
+#endif
+} /* RELAX_SNODE */
+
+
+/************************************************************************/
+/*! \brief
+ *
+ * <pre> 
+ * Purpose
+ * =======
+ *    snode_dfs() determines the union of the row structures of those 
+ *    columns within the relaxed snode.
+ *    Note: The relaxed snodes are leaves of the supernodal etree, therefore, 
+ *    the part outside the rectangular supernode must be zero.
+ *
+ * Return value
+ * ============
+ *    0   success;
+ *   >0   number of bytes allocated when run out of memory.
+ * </pre>
+ */
+static int_t snode_dfs
+/************************************************************************/
+(
+ SuperMatrix *A,       /* original matrix A permuted by columns (input) */
+ const int_t jcol,      /* beginning of the supernode (input) */
+ const int_t kcol,      /* end of the supernode (input) */
+ int_t       *xprune,   /* pruned location in each adjacency list (output) */
+ int_t       *marker,   /* working array of size m */
+ Glu_persist_t *Glu_persist,   /* global LU data structures (modified) */
+ Glu_freeable_t *Glu_freeable
+ )
+{
+
+    NCPformat *Astore;
+    int_t  *asub, *xa_begin, *xa_end;
+    register int_t i, k, ifrom, ito, nextl, new_next;
+    int_t  nsuper, krow, kmark, mem_error;
+    int_t  *xsup, *supno;
+    int_t  *lsub, *xlsub;
+    int_t  nzlmax, nextu;
+    
+    Astore   = A->Store;
+    asub     = Astore->rowind;
+    xa_begin = Astore->colbeg;
+    xa_end   = Astore->colend;
+    xsup     = Glu_persist->xsup;
+    supno    = Glu_persist->supno;
+    lsub     = Glu_freeable->lsub;
+    xlsub    = Glu_freeable->xlsub;
+    nzlmax   = Glu_freeable->nzlmax;
+    nsuper   = ++supno[jcol]; /* Next available supernode number */
+    nextl    = xlsub[jcol];
+    nextu    = Glu_freeable->xusub[jcol];
+
+    for (i = jcol; i <= kcol; i++) {
+	/* For each nonzero in A[*,i] */
+	for (k = xa_begin[i]; k < xa_end[i]; ++k) {
+	    krow = asub[k];
+	    kmark = marker[krow];
+	    if ( kmark != kcol ) { /* First time visit krow */
+		marker[krow] = kcol;
+		lsub[nextl++] = krow;
+		if ( nextl >= nzlmax ) { /* grow after the write so a free slot always remains */
+		    if ( (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl,
+						      (MemType) LSUB, &nzlmax,
+						      Glu_freeable)) != 0 )
+			return (mem_error);
+		    lsub = Glu_freeable->lsub; /* re-read: the expansion may have moved lsub[] */
+		}
+	    }
+    	}
+	supno[i] = nsuper;
+	Glu_freeable->xusub[i+1] = nextu; /* Tidy up the pointers in usub[*]. */
+    }
+
+    /* Supernode > 1, then make a copy of the subscripts for pruning */
+    if ( jcol < kcol ) {
+	new_next = nextl + (nextl - xlsub[jcol]);
+	while ( new_next >= nzlmax ) { /* >=: later code writes lsub[nextl++] before checking */
+	    if ( (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, (MemType) LSUB,
+					      &nzlmax, Glu_freeable)) != 0 )
+		return (mem_error);
+	    lsub = Glu_freeable->lsub;
+	}
+	ito = nextl;
+	for (ifrom = xlsub[jcol]; ifrom < nextl; )
+	    lsub[ito++] = lsub[ifrom++];	
+        for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl;
+	nextl = ito;
+    }
+
+    xsup[nsuper+1] = kcol + 1;
+    supno[kcol+1]  = nsuper;
+    xprune[kcol]   = nextl;
+    xlsub[kcol+1]  = nextl;
+#if ( PRNTlevel>=3 )
+    printf(".. snode_dfs(): (" IFMT ":" IFMT ") nextl " IFMT "\n", jcol, kcol, nextl);
+#endif
+    return 0;
+} /* SNODE_DFS */
+
+
+
+/************************************************************************/
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   column_dfs() performs a symbolic factorization on column jcol, and
+ *   detects the supernode boundary. This routine uses the row indices of
+ *   A[*,jcol] to start the depth-first search (DFS).
+ *
+ * Output
+ * ======
+ *   A supernode representative is the last column of a supernode.
+ *   The nonzeros in U[*,j] are segments that end at supernodal
+ *   representatives. The routine returns a list of such supernodal 
+ *   representatives ( segrep[*] ) in topological order of the DFS that 
+ *   generates them. The location of the first nonzero in each such 
+ *   supernodal segment is also returned ( repfnz[*] ).
+ *
+ * Data structure
+ * ==============
+ *   (lsub, xlsub):
+ *      lsub[*] contains the compressed subscripts of the supernodes;
+ *      xlsub[j] points to the starting location of the j-th column in
+ *               lsub[*]; 
+ *	Storage: original row subscripts in A.
+ *
+ *      During the course of symbolic factorization, we also use
+ *	(lsub, xlsub, xprune) for the purpose of symmetric pruning.
+ *      For each supernode {s,s+1,...,t=s+r} with first column s and last
+ *	column t, there are two subscript sets,  the last column
+ *      structures (for pruning) will be removed in the end.
+ *        o lsub[j], j = xlsub[s], ..., xlsub[s+1]-1
+ *          is the structure of column s (i.e. structure of this supernode).
+ *          It is used for the storage of numerical values.
+ *	  o lsub[j], j = xlsub[t], ..., xlsub[t+1]-1
+ *	    is the structure of the last column t of this supernode.
+ *	    It is for the purpose of symmetric pruning. Therefore, the
+ *	    structural subscripts can be rearranged without making physical
+ *	    interchanges among the numerical values.
+ *
+ *      (1) if t > s, only the subscript sets for column s and column t
+ *          are stored. Column t represents pruned adjacency structure.
+ *
+ *                  --------------------------------------------
+ *          lsub[*]    ... |   col s    |   col t   | ...
+ *                  --------------------------------------------
+ *                          ^            ^           ^
+ *                       xlsub[s]    xlsub[s+1]  xlsub[t+1]
+ *                                       :           :
+ *                                       :         xprune[t]
+ *                                   xlsub[t]      
+ *                                   xprune[s]    
+ *
+ *      (2) if t == s, i.e., a singleton supernode, the same subscript set
+ *          is used for both G(L) and pruned graph:
+ *
+ *                  --------------------------------------
+ *          lsub[*]    ... |      s     | ...
+ *                  --------------------------------------
+ *                          ^            ^   
+ *                       xlsub[s]   xlsub[s+1]  
+ *                                  xprune[s]
+ *
+ *       DFS will traverse the second subscript list, i.e., the part of the
+ *       pruned graph.
+ *
+ * Local parameters
+ * ================
+ *   nseg: no of segments in current U[*,j]
+ *   jsuper: jsuper=EMPTY if column j does not belong to the same
+ *	supernode as j-1. Otherwise, jsuper=nsuper.
+ *
+ *   marker: A-row --> A-row/col (0/1)
+ *   repfnz: SuperA-col --> PA-row
+ *   parent: SuperA-col --> SuperA-col
+ *   xplore: SuperA-col --> index to L-structure
+ *
+ * Return value
+ * ============
+ *     0  success;
+ *   > 0  number of bytes allocated when run out of space.
+ * </pre>
+ */
+static int_t column_dfs
+/************************************************************************/
+(
+ SuperMatrix *A,        /* original matrix A permuted by columns (input) */
+ const int_t jcol,      /* current column number (input) */
+ int_t       *perm_r,   /* row permutation vector (input) */
+ int_t       *nseg,     /* number of U-segments in column jcol (output) */
+ int_t       *segrep,   /* list of U-segment representatives (output) */
+ int_t       *repfnz,   /* list of first nonzeros in the U-segments (output) */
+ int_t       *xprune,   /* pruned location in each adjacency list (output) */
+ int_t       *marker,   /* working array of size m */
+ int_t       *parent,   /* working array of size m */
+ int_t       *xplore,   /* working array of size m */
+ Glu_persist_t *Glu_persist,   /* global LU data structures (modified) */
+ Glu_freeable_t *Glu_freeable
+ )
+{
+
+    NCPformat *Astore;
+    int_t     *asub, *xa_begin, *xa_end;
+    int_t     jcolp1, jcolm1, jsuper, nsuper, nextl;
+    int_t     k, krep, krow, kmark, kperm;
+    int_t     fsupc; /* first column of a supernode */
+    int_t     myfnz; /* first nonzero column of a U-segment */
+    int_t     chperm, chmark, chrep, kchild;
+    int_t     xdfs, maxdfs, kpar, oldrep;
+    int_t     jptr, jm1ptr;
+    int_t     ito, ifrom, istop;	/* used to compress row subscripts */
+    int_t     *xsup, *supno, *lsub, *xlsub;
+    int_t     nzlmax;
+    static int_t first = 1, maxsuper; /* maxsuper is fetched once, on the first call */
+    int_t     mem_error;
+    
+    /* Initializations */
+    Astore   = A->Store;
+    asub     = Astore->rowind;
+    xa_begin = Astore->colbeg;
+    xa_end   = Astore->colend;
+    xsup     = Glu_persist->xsup;
+    supno    = Glu_persist->supno;
+    lsub     = Glu_freeable->lsub;
+    xlsub    = Glu_freeable->xlsub;
+    nzlmax   = Glu_freeable->nzlmax;
+    jcolp1   = jcol + 1;
+    jcolm1   = jcol - 1;
+    jsuper   = nsuper = supno[jcol];
+    nextl    = xlsub[jcol];
+    if ( first ) {
+	maxsuper = sp_ienv_dist(3);
+	first = 0;
+    }
+    
+    *nseg = 0;
+
+    /* For each nonzero in A[*,jcol] perform depth-first search. */
+    for (k = xa_begin[jcol]; k < xa_end[jcol]; ++k) {
+	krow = asub[k];
+	kmark = marker[krow];
+
+	/* krow was visited before, go to the next nonzero. */
+	if ( kmark == jcol ) continue; 
+	
+	/* 
+	 * For each unmarked neighbor krow of jcol ...
+	 */
+	marker[krow] = jcol; /* mark as "visited" */
+	kperm = perm_r[krow];
+
+	if ( kperm == EMPTY ) {
+	    /* ---------------
+	     *  krow is in L
+	     * ---------------
+	     * place it in structure of L[*,jcol].
+	     */
+	    lsub[nextl++] = krow; 	/* krow is indexed into A */
+	    if ( nextl >= nzlmax ) {
+		if ( (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, (MemType) LSUB,
+						   &nzlmax, Glu_freeable)) != 0 )
+		    return (mem_error);
+		lsub = Glu_freeable->lsub; /* re-read: the expansion may have moved lsub[] */
+	    }
+	    if ( kmark != jcolm1 ) jsuper = EMPTY; /* Row index subset test */
+	} else {
+	    /* ---------------
+	     *  krow is in U
+	     * ---------------
+	     * If its supernode krep has been explored, update repfnz[*].
+	     */
+	    krep = xsup[supno[kperm]+1] - 1;
+	    myfnz = repfnz[krep];
+	    
+	    if ( myfnz != EMPTY ) { /* krep was visited before */
+		if ( kperm < myfnz ) repfnz[krep] = kperm;
+		/* continue; */
+	    } else {
+		/* Otherwise perform DFS, starting at krep */
+		oldrep = EMPTY;
+		parent[krep] = oldrep;
+		repfnz[krep] = kperm;
+		xdfs = xlsub[krep];
+		maxdfs = xprune[krep];
+		
+		do {
+		    /* 
+		     * For each unmarked kchild of krep 
+		     */
+		    while ( xdfs < maxdfs ) {
+			kchild = lsub[xdfs++];
+			chmark = marker[kchild];
+			
+			if ( chmark != jcol ) { /* Not reached yet */
+			    marker[kchild] = jcol;
+			    chperm = perm_r[kchild];
+			    
+			    /* Case kchild is in L: place it in L[*,k] */
+			    if ( chperm == EMPTY ) {
+				lsub[nextl++] = kchild;
+				if ( nextl >= nzlmax ) {
+				    if ( (mem_error =
+					symbfact_SubXpand(A->ncol, jcol, nextl,
+							  (MemType) LSUB, &nzlmax,
+							  Glu_freeable)) != 0 )
+					return (mem_error);
+				    lsub = Glu_freeable->lsub;
+				}
+				if ( chmark != jcolm1 ) jsuper = EMPTY;
+			    } else {
+				/* Case kchild is in U: 
+				 * chrep = its supernode-rep. If its rep 
+				 * has been explored, update its repfnz[*].
+				 */
+				chrep = xsup[supno[chperm]+1] - 1;
+				myfnz = repfnz[chrep];
+				if ( myfnz != EMPTY ) {/* Visited before */
+				    if (chperm < myfnz) repfnz[chrep] = chperm;
+				} else {
+				    /* Continue DFS at sup-rep of kchild */
+				    xplore[krep] = xdfs;
+				    oldrep = krep;
+				    krep = chrep; /* Go deeper down G(L') */
+				    parent[krep] = oldrep;
+				    repfnz[krep] = chperm;
+				    xdfs = xlsub[krep];     
+				    maxdfs = xprune[krep];
+				} /* else */
+			    } /* else */
+			} /* if chmark != jcol */
+			
+		    } /* while */
+		    
+		    /* krow has no more unexplored neighbors:
+		     *    place supernode-rep krep in postorder DFS;
+		     *    backtrack DFS to its parent.
+		     */
+		    segrep[*nseg] = krep;
+		    ++(*nseg);
+		    kpar = parent[krep]; /* Pop from stack; recurse */
+		    if ( kpar == EMPTY ) break; /* DFS done */
+		    krep = kpar;
+		    xdfs = xplore[krep];
+		    maxdfs = xprune[krep];
+		} while ( kpar != EMPTY ); /* Until empty stack */
+	    } /* else */
+	} /* else: krow is in U */
+    } /* for each nonzero in A[*, jcol] */
+    
+    /* Check to see if jcol belongs in the same supernode as jcol-1 */
+    if ( jcol == 0 ) { /* Do nothing for column 0 */
+	nsuper = supno[0] = 0;
+    } else {
+	fsupc = xsup[nsuper];
+	jptr = xlsub[jcol];	/* Not compressed yet */
+	jm1ptr = xlsub[jcolm1];
+	
+#ifdef T2_SUPER
+	if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY;
+#endif
+	/* Make sure the number of columns in a supernode doesn't
+	   exceed threshold. */
+	if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY;
+	
+	/* If jcol starts a new supernode, reclaim storage space in
+	 * lsub[*] from the previous supernode. Note we only store
+	 * the subscript set of the first and last columns of
+	 * a supernode. (first for G(L'), last for pruned graph)
+	 */
+	if ( jsuper ==EMPTY ) { /* Starts a new supernode */
+	    if ( (fsupc < jcolm1-1) ) { /* >= 3 columns in nsuper */
+#ifdef CHK_COMPRESS
+		printf("  Compress lsub[] at super " IFMT "-" IFMT "\n",fsupc,jcolm1);
+#endif
+		ito = xlsub[fsupc+1];
+		xlsub[jcolm1] = ito;
+		istop = ito + jptr - jm1ptr;
+		xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */
+		xlsub[jcol] = istop;
+		for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
+		    lsub[ito] = lsub[ifrom];
+		nextl = ito;            /* = istop + length(jcol) */
+	    }
+	    ++nsuper;
+	    supno[jcol] = nsuper;
+	} /* if a new supernode */
+	
+    } /* else: jcol > 0 */ 
+    
+    /* Tidy up the pointers before exit */
+    xsup[nsuper+1] = jcolp1;
+    supno[jcolp1]  = nsuper;
+    xprune[jcol]   = nextl; /* Initialize an upper bound for pruning. */
+    xlsub[jcolp1]  = nextl;
+    return 0;
+} /* COLUMN_DFS */
+
+/************************************************************************/
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   pivotL() interchanges row subscripts so that each diagonal block of a
+ *   supernode in L has the row subscripts sorted in order of pivots.
+ *   The row subscripts in the off-diagonal block are not sorted.
+ * </pre>
+ */
+static int_t pivotL
+/************************************************************************/
+(
+ const int_t jcol,     /* current column number     (input)    */
+ int_t       *perm_r,  /* row permutation vector    (output)   */
+ int_t       *pivrow,  /* the pivot row index       (output)   */
+ Glu_persist_t *Glu_persist,   /* global LU data structures (modified) */
+ Glu_freeable_t *Glu_freeable
+ )
+{
+    int_t *xlsub;     /* column pointers into lsub[] */
+    int_t *rows;      /* row subscripts of the supernode that holds jcol */
+    int_t first_col;  /* first column in that supernode */
+    int_t offset;     /* columns of the supernode before jcol; >= 0 */
+    int_t nrows;      /* number of rows in the supernode */
+    int_t hit;        /* position of the diagonal subscript, or EMPTY */
+    int_t pos, saved;
+
+    /* Locate the supernode of jcol and the start of its subscript list. */
+    xlsub     = Glu_freeable->xlsub;
+    first_col = Glu_persist->xsup[Glu_persist->supno[jcol]];
+    offset    = jcol - first_col;
+    rows      = Glu_freeable->lsub + xlsub[first_col];
+    nrows     = xlsub[first_col+1] - xlsub[first_col];
+
+    /* Look for the row subscript equal to jcol (the diagonal entry),
+       starting at jcol's own position inside the supernode.  The search
+       key is jcol itself; no inverse column permutation is applied. */
+    hit = EMPTY;
+    pos = offset;
+    while ( pos < nrows ) {
+	if ( rows[pos] == jcol ) {
+	    hit = pos;
+	    break;
+	}
+	++pos;
+    }
+
+    /* Without a diagonal subscript the column is reported and ABORT()
+       is invoked. */
+    if ( hit == EMPTY ) {
+	printf("At column " IFMT ", ", jcol);
+	ABORT("pivotL() encounters zero diagonal");
+    }
+
+    /* Record the pivot row. */
+    *pivrow = rows[hit];
+    perm_r[*pivrow] = jcol;  /* perm_r[] should be Identity. */
+    /*assert(*pivrow==jcol);*/
+
+    /* Bring the pivot subscript to jcol's slot if it is not already there. */
+    if ( hit != offset ) {
+	saved = rows[hit];
+	rows[hit] = rows[offset];
+	rows[offset] = saved;
+    }
+
+    return 0;
+} /* PIVOTL */
+
+
+/************************************************************************/
+/*! \brief
+ *
+ * <pre> 
+ * Purpose
+ * =======
+ *   set_usub() sets up data structure to store supernodal segments in U.
+ *   The supernodal segments in each column are stored in topological order.
+ *   
+ * NOTE
+ * ====
+ *   For each supernodal segment, we only store the index of the first
+ *   nonzero index, rather than the indices of the whole segment, because
+ *   those indices can be generated from the first nonzero and the supernodal
+ *   representative.
+ *   Therefore, for G(U), we store the "skeleton" of it.
+ * </pre>
+ */
+static int_t set_usub
+/************************************************************************/
+(
+ const int_t n,       /* total number of columns (input) */
+ const int_t jcol,    /* current column number (input) */
+ const int_t nseg,    /* number of supernodal segments in U[*,jcol] (input) */
+ int_t       *segrep, /* list of U-segment representatives (output) */
+ int_t       *repfnz, /* list of first nonzeros in the U-segments (output) */
+ Glu_persist_t *Glu_persist,   /* global LU data structures (modified) */
+ Glu_freeable_t *Glu_freeable
+ )
+{
+    /*
+     * Steps:
+     *   1. make sure usub[] can take up to nseg more entries;
+     *   2. walk segrep[] from its last entry to its first and append the
+     *      first nonzero of every segment that lies outside jcol's own
+     *      supernode and is not EMPTY;
+     *   3. close column jcol in xusub[].
+     */
+    int_t *supno;
+    int_t *usub, *xusub;
+    int_t nzumax;      /* current capacity of usub[] */
+    int_t own_snode;   /* supernode number of jcol */
+    int_t nextu;       /* next free position in usub[] */
+    int_t needed;      /* upper bound on nextu after this column */
+    int_t pos, rep, fnz, mem_error;
+
+    supno     = Glu_persist->supno;
+    usub      = Glu_freeable->usub;
+    xusub     = Glu_freeable->xusub;
+    nzumax    = Glu_freeable->nzumax;
+    own_snode = supno[jcol];
+    nextu     = xusub[jcol];
+    needed    = nextu + nseg;
+
+    /* Step 1: expand until the worst case fits, re-reading usub each time. */
+    while ( needed > nzumax ) {
+	mem_error = symbfact_SubXpand(n, jcol, nextu, (MemType) USUB, &nzumax,
+				      Glu_freeable);
+	if ( mem_error ) return (mem_error);
+	usub = Glu_freeable->usub;
+    }
+
+    /* Step 2: reversing segrep[] stores the U-segments in topological order. */
+    for (pos = nseg - 1; pos >= 0; --pos) {
+	rep = segrep[pos];
+	if ( supno[rep] == own_snode ) continue; /* same supernode as jcol: not stored */
+
+	fnz = repfnz[rep];
+	if ( fnz == EMPTY ) continue;            /* zero U-segment */
+	usub[nextu++] = fnz;
+    } /* for each segment... */
+
+    /* Step 3. */
+    xusub[jcol + 1] = nextu; /* Close U[*,jcol] */
+    return 0;
+} /* SET_USUB */
+
+
+/************************************************************************/
+static void pruneL
+/************************************************************************/
+(
+ const int_t  jcol,    /* in */
+ const int_t  *perm_r, /* in */
+ const int_t  pivrow,  /* in */
+ const int_t  nseg,    /* in */
+ const int_t  *segrep, /* in */
+ const int_t  *repfnz, /* in */
+ int_t  *xprune,       /* out */
+ Glu_persist_t *Glu_persist,   /* global LU data structures (modified) */
+ Glu_freeable_t *Glu_freeable
+ )
+{
+/*
+ * Purpose
+ * =======
+ *   pruneL() prunes the L-structure of supernodes whose L-structure
+ *   contains the current pivot row "pivrow".
+ *
+ */
+    int_t  jsupno, irep, irep1, kmin, kmax, krow;
+    int_t  i, ktemp;
+    int_t  do_prune; /* logical variable */
+    int_t  *supno;
+    int_t  *lsub, *xlsub;
+
+    supno  = Glu_persist->supno;
+    lsub   = Glu_freeable->lsub;
+    xlsub  = Glu_freeable->xlsub;
+    
+    /*
+     * For each supernode-rep irep in U[*,j]
+     */
+    jsupno = supno[jcol];
+    for (i = 0; i < nseg; i++) {
+	irep = segrep[i];
+	irep1 = irep + 1;
+
+	/* Do not prune with a zero U-segment */
+ 	if ( repfnz[irep] == EMPTY ) continue;
+
+	/*
+	 * If irep has not been pruned & it has a nonzero in row L[pivrow,i]
+	 */
+	do_prune = FALSE;
+	if ( supno[irep] != jsupno ) {
+	    if ( xprune[irep] >= xlsub[irep1] ) { /* xprune still at its initial upper bound */
+		kmin = xlsub[irep];
+		kmax = xlsub[irep1] - 1;
+		for (krow = kmin; krow <= kmax; ++krow) 
+		    if ( lsub[krow] == pivrow ) {
+			do_prune = TRUE;
+			break;
+		    }
+	    }
+	    
+    	    if ( do_prune ) { /* kmin and kmax were set above whenever do_prune is TRUE */
+	     	/* Do a quicksort-type partition. */
+	        while ( kmin <= kmax ) {
+	    	    if ( perm_r[lsub[kmax]] == EMPTY ) 
+			kmax--;
+		    else if ( perm_r[lsub[kmin]] != EMPTY )
+			kmin++;
+		    else { /* kmin below pivrow, and kmax above pivrow: 
+		            * 	   interchange the two subscripts
+			    */
+		        ktemp = lsub[kmin];
+		        lsub[kmin] = lsub[kmax];
+		        lsub[kmax] = ktemp;
+		        kmin++;
+		        kmax--;
+		    }
+	        } /* while */
+	        xprune[irep] = kmin; /* Pruning */
+#if ( DEBUGlevel>=3 )
+		printf(".. pruneL(): use col " IFMT ": xprune[" IFMT "] = " IFMT "\n",
+		       jcol, irep, kmin);
+#endif
+	    } /* if do_prune */
+	} /* if */
+    } /* for each U-segment ... */
+} /* PRUNEL */
+
diff --git a/SRC/util.c b/SRC/util.c
new file mode 100644
index 0000000..2ae3ccd
--- /dev/null
+++ b/SRC/util.c
@@ -0,0 +1,1181 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Utility functions
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * February 1, 2003
+ * Modified: March 31, 2013
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+
+/*! \brief Deallocate the structure pointing to the actual storage of the matrix. */
+void
+Destroy_SuperMatrix_Store_dist(SuperMatrix *A)
+{
+    SUPERLU_FREE ( A->Store );
+}
+
+void
+Destroy_CompCol_Matrix_dist(SuperMatrix *A)
+{
+    NCformat *Astore = A->Store;
+    SUPERLU_FREE( Astore->rowind );
+    SUPERLU_FREE( Astore->colptr );
+    if ( Astore->nzval ) SUPERLU_FREE( Astore->nzval );
+    SUPERLU_FREE( Astore );
+}
+
+void
+Destroy_CompRowLoc_Matrix_dist(SuperMatrix *A)
+{
+    NRformat_loc *Astore = A->Store;
+    SUPERLU_FREE( Astore->rowptr );
+    SUPERLU_FREE( Astore->colind );
+    SUPERLU_FREE( Astore->nzval );
+    SUPERLU_FREE( Astore );
+}
+
+void
+Destroy_CompRow_Matrix_dist(SuperMatrix *A)
+{
+    SUPERLU_FREE( ((NRformat *)A->Store)->rowptr );
+    SUPERLU_FREE( ((NRformat *)A->Store)->colind );
+    SUPERLU_FREE( ((NRformat *)A->Store)->nzval );
+    SUPERLU_FREE( A->Store );
+}
+
+void
+Destroy_SuperNode_Matrix_dist(SuperMatrix *A)
+{
+    SUPERLU_FREE ( ((SCformat *)A->Store)->rowind );
+    SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr );
+    SUPERLU_FREE ( ((SCformat *)A->Store)->nzval );
+    SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr );
+    SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup );
+    SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col );
+    SUPERLU_FREE ( A->Store );
+}
+
+/*! \brief A is of type Stype==NCP */
+void
+Destroy_CompCol_Permuted_dist(SuperMatrix *A)
+{
+    SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg );
+    SUPERLU_FREE ( ((NCPformat *)A->Store)->colend );
+    SUPERLU_FREE ( A->Store );
+}
+
+/*! \brief A is of type Stype==DN */
+void
+Destroy_Dense_Matrix_dist(SuperMatrix *A)
+{
+    DNformat* Astore = A->Store;
+    SUPERLU_FREE (Astore->nzval);
+    SUPERLU_FREE ( A->Store );
+}
+
+/*! \brief Destroy distributed L & U matrices. */
+void
+Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct)
+{
+    int_t i, nb, nsupers;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter Destroy_LU()");
+#endif
+
+    nsupers = Glu_persist->supno[n-1] + 1;
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i) 
+	if ( Llu->Lrowind_bc_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+#ifdef GPU_ACC
+	    checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i]));
+#else
+	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+#endif
+	}
+    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
+
+    nb = CEILING(nsupers, grid->nprow);
+    for (i = 0; i < nb; ++i)
+	if ( Llu->Ufstnz_br_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+    SUPERLU_FREE (Llu->Unzval_br_ptr);
+
+    /* The following can be freed after factorization. */
+    SUPERLU_FREE(Llu->ToRecv);
+    SUPERLU_FREE(Llu->ToSendD);
+    SUPERLU_FREE(Llu->ToSendR[0]);
+    SUPERLU_FREE(Llu->ToSendR);
+
+    /* The following can be freed only after iterative refinement. */
+    SUPERLU_FREE(Llu->ilsum);
+    SUPERLU_FREE(Llu->fmod);
+    SUPERLU_FREE(Llu->fsendx_plist[0]);
+    SUPERLU_FREE(Llu->fsendx_plist);
+    SUPERLU_FREE(Llu->bmod);
+    SUPERLU_FREE(Llu->bsendx_plist[0]);
+    SUPERLU_FREE(Llu->bsendx_plist);
+    SUPERLU_FREE(Llu->mod_bit);
+
+    SUPERLU_FREE(Glu_persist->xsup);
+    SUPERLU_FREE(Glu_persist->supno);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit Destroy_LU()");
+#endif
+}
+
+/*! \brief Allocate storage in ScalePermstruct */
+void ScalePermstructInit(const int_t m, const int_t n,
+			 ScalePermstruct_t *ScalePermstruct)
+{
+    ScalePermstruct->DiagScale = NOEQUIL;
+    if ( !(ScalePermstruct->perm_r = intMalloc_dist(m)) )
+	ABORT("Malloc fails for perm_r[].");
+    if ( !(ScalePermstruct->perm_c = intMalloc_dist(n)) )
+	ABORT("Malloc fails for perm_c[].");
+}
+
+/*! \brief Deallocate ScalePermstruct */
+void ScalePermstructFree(ScalePermstruct_t *ScalePermstruct)
+{
+    SUPERLU_FREE(ScalePermstruct->perm_r);
+    SUPERLU_FREE(ScalePermstruct->perm_c);
+    switch ( ScalePermstruct->DiagScale ) {
+      case ROW:
+	SUPERLU_FREE(ScalePermstruct->R);
+	break;
+      case COL:
+	SUPERLU_FREE(ScalePermstruct->C);
+	break;
+      case BOTH:
+	SUPERLU_FREE(ScalePermstruct->R);
+	SUPERLU_FREE(ScalePermstruct->C);
+	break;
+    }
+}
+
+/*! \brief Allocate storage in LUstruct */
+void LUstructInit(const int_t n, LUstruct_t *LUstruct)
+{
+    if ( !(LUstruct->etree = intMalloc_dist(n)) )
+	ABORT("Malloc fails for etree[].");
+    if ( !(LUstruct->Glu_persist = (Glu_persist_t *)
+	   SUPERLU_MALLOC(sizeof(Glu_persist_t))) )
+	ABORT("Malloc fails for Glu_persist_t.");
+    if ( !(LUstruct->Llu = (LocalLU_t *)
+	   SUPERLU_MALLOC(sizeof(LocalLU_t))) )
+	ABORT("Malloc fails for LocalLU_t.");
+}
+
+/*! \brief Deallocate LUstruct */
+void LUstructFree(LUstruct_t *LUstruct)
+{
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter LUstructFree()");
+#endif
+
+    SUPERLU_FREE(LUstruct->etree);
+    SUPERLU_FREE(LUstruct->Glu_persist);
+    SUPERLU_FREE(LUstruct->Llu);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit LUstructFree()");
+#endif
+}
+
+/*! \brief
+ *
+ * <pre>
+ * Count the total number of nonzeros in factors L and U,  and in the 
+ * symmetrically reduced L. 
+ * </pre>
+ */
+void
+countnz_dist(const int_t n, int_t *xprune,
+	     long long int *nnzL, long long int *nnzU, 
+	     Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable)
+{
+    int_t  fnz, fsupc, i, j, nsuper;
+    int_t  jlen, irep;
+    long long int nnzL0; /* pruned-L tally; only the disabled printf below reads it */
+    int_t  *supno, *xsup, *xlsub, *xusub, *usub;
+
+    supno  = Glu_persist->supno;
+    xsup   = Glu_persist->xsup;
+    xlsub  = Glu_freeable->xlsub;
+    xusub  = Glu_freeable->xusub;
+    usub   = Glu_freeable->usub;
+    *nnzL  = 0;
+    *nnzU  = 0;
+    nnzL0  = 0;
+
+    if ( n <= 0 ) return; /* Outputs are already zero; never index supno[n]. */
+    nsuper = supno[n];    /* Inclusive upper bound of the supernode loop. */
+
+    /* 
+     * For each supernode in L.
+     */
+    for (i = 0; i <= nsuper; i++) {
+	fsupc = xsup[i];
+	jlen = xlsub[fsupc+1] - xlsub[fsupc]; /* lsub entries stored for column fsupc */
+
+	for (j = fsupc; j < xsup[i+1]; j++) {
+	    *nnzL += jlen;           /* this column's share of L */
+	    *nnzU += j - fsupc + 1;  /* columns fsupc..j of the supernode, counted in U */
+	    jlen--;                  /* the next column contributes one entry fewer */
+	}
+	irep = xsup[i+1] - 1;        /* last column of supernode i */
+	nnzL0 += xprune[irep] - xlsub[irep];
+    }
+    
+    /* printf("\tNo of nonzeros in symm-reduced L = %ld\n", nnzL0);*/
+    
+    /* For each column in U. */
+    for (j = 0; j < n; ++j) {
+	for (i = xusub[j]; i < xusub[j+1]; ++i) {
+	    fnz = usub[i];
+	    fsupc = xsup[supno[fnz]+1]; /* first column of the supernode after fnz's */
+	    *nnzU += fsupc - fnz;       /* rows fnz .. fsupc-1 of this segment */
+	}
+    }
+}
+
+
+/*! \brief
+ *
+ * <pre>
+ * Fix up the data storage lsub for L-subscripts. It removes the subscript
+ * sets for structural pruning, and applies the permutation to the remaining
+ * subscripts.
+ * </pre>
+ */
+long long int
+fixupL_dist(const int_t n, const int_t *perm_r, 
+	    Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable)
+{
+    register int_t nsuper, fsupc, nextl, i, j, k, jstrt;
+    register long long int lsub_size;
+    int_t          *xsup, *lsub, *xlsub;
+
+    if ( n <= 1 ) return 0;
+
+    xsup   = Glu_persist->xsup;
+    lsub   = Glu_freeable->lsub;
+    xlsub  = Glu_freeable->xlsub;
+    nextl  = 0;
+    nsuper = (Glu_persist->supno)[n];
+    lsub_size = xlsub[n];
+    
+    /* 
+     * For each supernode ...
+     */
+    for (i = 0; i <= nsuper; i++) {
+	fsupc = xsup[i];
+	jstrt = xlsub[fsupc];
+	xlsub[fsupc] = nextl;
+	for (j = jstrt; j < xlsub[fsupc+1]; j++) {
+	    lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */
+	    nextl++;
+  	}
+	for (k = fsupc+1; k < xsup[i+1]; k++) 
+	    	xlsub[k] = nextl;	/* Other columns in supernode i */
+
+    }
+
+    xlsub[n] = nextl;
+    return lsub_size;
+}
+
+/*! \brief Set the default values for the options argument.
+ */
+void set_default_options_dist(superlu_dist_options_t *options)
+{
+    options->Fact              = DOFACT;
+    options->Equil             = YES;
+    options->ParSymbFact       = NO;
+    options->ColPerm           = METIS_AT_PLUS_A;
+    options->RowPerm           = LargeDiag;
+    options->ReplaceTinyPivot  = YES;
+    options->IterRefine        = SLU_DOUBLE;
+    options->Trans             = NOTRANS;
+    options->SolveInitialized  = NO;
+    options->RefineInitialized = NO;
+    options->PrintStat         = YES;
+    options->num_lookaheads    = 10;
+    options->lookahead_etree   = NO;
+    options->SymPattern        = NO;
+}
+
+/*! \brief Print the options setting.
+ */
+void print_options_dist(superlu_dist_options_t *options)
+{
+    if ( options->PrintStat == NO ) return;
+
+    printf("**************************************************\n");
+    printf(".. options:\n");
+    printf("**    Fact             : %4d\n", options->Fact);
+    printf("**    Equil            : %4d\n", options->Equil);
+    printf("**    ParSymbFact      : %4d\n", options->ParSymbFact);
+    printf("**    ColPerm          : %4d\n", options->ColPerm);
+    printf("**    RowPerm          : %4d\n", options->RowPerm);
+    printf("**    ReplaceTinyPivot : %4d\n", options->ReplaceTinyPivot);
+    printf("**    IterRefine       : %4d\n", options->IterRefine);
+    printf("**    Trans            : %4d\n", options->Trans);
+    printf("**    num_lookaheads   : %4d\n", options->num_lookaheads);
+    printf("**    SymPattern       : %4d\n", options->SymPattern);
+    printf("**    lookahead_etree  : %4d\n", options->lookahead_etree);
+    printf("**************************************************\n");
+}
+
+/*! \brief Print the blocking parameters.
+ */
+void print_sp_ienv_dist(superlu_dist_options_t *options)
+{
+    if ( options->PrintStat == NO ) return;
+
+    printf("**************************************************\n");
+    printf(".. blocking parameters from sp_ienv():\n");
+    printf("**    relaxation           : " IFMT "\n", sp_ienv_dist(2));
+    printf("**    max supernode        : " IFMT "\n", sp_ienv_dist(3));
+    printf("**    estimated fill ratio : " IFMT "\n", sp_ienv_dist(6));
+    printf("**************************************************\n");
+}
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Set up the communication pattern for redistribution between B and X
+ *   in the triangular solution.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The dimension of the linear system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of the distributed input matrix.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * fst_row (input) int (global)
+ *        The row number of matrix B's first row in the global matrix.
+ *
+ * perm_r (input) int* (global)
+ *        The row permutation vector.
+ *
+ * perm_c (input) int* (global)
+ *        The column permutation vector.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ * </pre>
+ */
+int_t
+pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row,
+	     int_t perm_r[], int_t perm_c[], gridinfo_t *grid,
+	     Glu_persist_t *Glu_persist, SOLVEstruct_t *SOLVEstruct)
+{
+
+    int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
+    int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
+    int *itemp, *ptr_to_ibuf, *ptr_to_dbuf;
+    int_t *row_to_proc;
+    int_t i, gbi, k, l, num_diag_procs, *diag_procs;
+    int_t irow, q, knsupc, nsupers, *xsup, *supno;
+    int   iam, p, pkk, procs;
+    pxgstrs_comm_t *gstrs_comm;
+
+    procs = grid->nprow * grid->npcol;
+    iam = grid->iam;
+    gstrs_comm = SOLVEstruct->gstrs_comm;
+    xsup = Glu_persist->xsup;
+    supno = Glu_persist->supno;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    row_to_proc = SOLVEstruct->row_to_proc;
+
+    /* ------------------------------------------------------------
+       SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X.
+       ------------------------------------------------------------*/
+    if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) )
+        ABORT("Malloc fails for B_to_X_itemp[].");
+    SendCnt      = itemp;
+    SendCnt_nrhs = itemp +   procs;
+    RecvCnt      = itemp + 2*procs;
+    RecvCnt_nrhs = itemp + 3*procs;
+    sdispls      = itemp + 4*procs;
+    sdispls_nrhs = itemp + 5*procs;
+    rdispls      = itemp + 6*procs;
+    rdispls_nrhs = itemp + 7*procs;
+
+    /* Count the number of elements to be sent to each diagonal process.*/
+    for (p = 0; p < procs; ++p) SendCnt[p] = 0;
+    for (i = 0, l = fst_row; i < m_loc; ++i, ++l) {
+        irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */
+	gbi = BlockNum( irow );
+	p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */
+	++SendCnt[p];
+    }
+  
+    /* Set up the displacements for alltoall. */
+    MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm);
+    sdispls[0] = rdispls[0] = 0;
+    for (p = 1; p < procs; ++p) {
+        sdispls[p] = sdispls[p-1] + SendCnt[p-1];
+        rdispls[p] = rdispls[p-1] + RecvCnt[p-1];
+    }
+    for (p = 0; p < procs; ++p) {
+        SendCnt_nrhs[p] = SendCnt[p] * nrhs;
+	sdispls_nrhs[p] = sdispls[p] * nrhs;
+        RecvCnt_nrhs[p] = RecvCnt[p] * nrhs;
+	rdispls_nrhs[p] = rdispls[p] * nrhs;
+    }
+
+    /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/
+    gstrs_comm->B_to_X_SendCnt = SendCnt;
+
+    /* ------------------------------------------------------------
+       SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B.
+       ------------------------------------------------------------*/
+    /* This is freed in pxgstrs_finalize(). */
+    if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) )
+        ABORT("Malloc fails for X_to_B_itemp[].");
+    SendCnt      = itemp;
+    SendCnt_nrhs = itemp +   procs;
+    RecvCnt      = itemp + 2*procs;
+    RecvCnt_nrhs = itemp + 3*procs;
+    sdispls      = itemp + 4*procs;
+    sdispls_nrhs = itemp + 5*procs;
+    rdispls      = itemp + 6*procs;
+    rdispls_nrhs = itemp + 7*procs;
+
+    /* Count the number of X entries to be sent to each process.*/
+    for (p = 0; p < procs; ++p) SendCnt[p] = 0;
+    num_diag_procs = SOLVEstruct->num_diag_procs;
+    diag_procs = SOLVEstruct->diag_procs;
+
+    for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */
+	pkk = diag_procs[p];
+	if ( iam == pkk ) {
+	    for (k = p; k < nsupers; k += num_diag_procs) {
+		knsupc = SuperSize( k );
+		irow = FstBlockC( k );
+		for (i = 0; i < knsupc; ++i) {
+#if 0
+		    q = row_to_proc[inv_perm_c[irow]];
+#else
+		    q = row_to_proc[irow];
+#endif
+		    ++SendCnt[q];
+		    ++irow;
+		}
+	    }
+	}
+    }
+
+    MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm);
+    sdispls[0] = rdispls[0] = 0;
+    sdispls_nrhs[0] = rdispls_nrhs[0] = 0;
+    SendCnt_nrhs[0] = SendCnt[0] * nrhs;
+    RecvCnt_nrhs[0] = RecvCnt[0] * nrhs;
+    for (p = 1; p < procs; ++p) {
+        sdispls[p] = sdispls[p-1] + SendCnt[p-1];
+        rdispls[p] = rdispls[p-1] + RecvCnt[p-1];
+        sdispls_nrhs[p] = sdispls[p] * nrhs;
+        rdispls_nrhs[p] = rdispls[p] * nrhs;
+	SendCnt_nrhs[p] = SendCnt[p] * nrhs;
+	RecvCnt_nrhs[p] = RecvCnt[p] * nrhs;
+    }
+
+    /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/
+    gstrs_comm->X_to_B_SendCnt = SendCnt;
+
+    if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) )
+        ABORT("Malloc fails for ptr_to_ibuf[].");
+    gstrs_comm->ptr_to_ibuf = ptr_to_ibuf;
+    gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs;
+
+    return 0;
+} /* PXGSTRS_INIT */
+
+
+void pxgstrs_finalize(pxgstrs_comm_t *gstrs_comm)
+{
+    SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt);
+    SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt);
+    SUPERLU_FREE(gstrs_comm->ptr_to_ibuf);
+    SUPERLU_FREE(gstrs_comm);
+}
+
+
+/*! \brief Diagnostic print of segment info after panel_dfs().
+ */
+void print_panel_seg_dist(int_t n, int_t w, int_t jcol, int_t nseg, 
+			  int_t *segrep, int_t *repfnz)
+{
+    int_t j, k;
+    
+    for (j = jcol; j < jcol+w; j++) {
+	printf("\tcol " IFMT ":\n", j);
+	for (k = 0; k < nseg; k++)
+	    printf("\t\tseg " IFMT ", segrep " IFMT ", repfnz " IFMT "\n", k, 
+			segrep[k], repfnz[(j-jcol)*n + segrep[k]]);
+    }
+
+}
+
+void
+PStatInit(SuperLUStat_t *stat)
+{
+    register int_t i;
+
+    if ( !(stat->utime = SUPERLU_MALLOC(NPHASES*sizeof(double))) )
+	ABORT("Malloc fails for stat->utime[]");
+    if ( !(stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t))) )
+	ABORT("SUPERLU_MALLOC fails for stat->ops[]");
+    for (i = 0; i < NPHASES; ++i) {
+        stat->utime[i] = 0.;
+        stat->ops[i] = 0.;
+    }
+    stat->TinyPivots = stat->RefineSteps = 0;
+}
+
+void
+PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid)
+{
+    double  *utime = stat->utime;
+    flops_t *ops = stat->ops;
+    int_t   iam = grid->iam;
+    flops_t flopcnt, factflop, solveflop;
+
+    if ( options->PrintStat == NO ) return;
+
+    if ( !iam && options->Fact != FACTORED ) {
+	printf("**************************************************\n");
+	printf("**** Time (seconds) ****\n");
+
+        if ( options->Equil != NO )
+	    printf("\tEQUIL time         %8.2f\n", utime[EQUIL]);
+	if ( options->RowPerm != NOROWPERM )
+	    printf("\tROWPERM time       %8.2f\n", utime[ROWPERM]);
+	if ( options->ColPerm != NATURAL )
+	    printf("\tCOLPERM time       %8.2f\n", utime[COLPERM]);
+        printf("\tSYMBFACT time      %8.2f\n", utime[SYMBFAC]);
+	printf("\tDISTRIBUTE time    %8.2f\n", utime[DIST]);
+
+    }
+
+    MPI_Reduce(&ops[FACT], &flopcnt, 1, MPI_FLOAT, MPI_SUM,
+	       0, grid->comm);
+    factflop = flopcnt;
+    if ( !iam && options->Fact != FACTORED ) {
+	printf("\tFACTOR time        %8.2f\n", utime[FACT]);
+	if ( utime[FACT] != 0.0 )
+	    printf("\tFactor flops\t%e\tMflops \t%8.2f\n",
+		   flopcnt,
+		   flopcnt*1e-6/utime[FACT]);
+    }
+	
+    MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, 
+	       0, grid->comm);
+    solveflop = flopcnt;
+    if ( !iam ) {
+	printf("\tSOLVE time         %8.2f\n", utime[SOLVE]);
+	if ( utime[SOLVE] != 0.0 )
+	    printf("\tSolve flops\t%e\tMflops \t%8.2f\n",
+		   flopcnt,
+		   flopcnt*1e-6/utime[SOLVE]);
+	if ( options->IterRefine != NOREFINE ) {
+	    printf("\tREFINEMENT time    %8.2f\tSteps%8d\n\n",
+		   utime[REFINE], stat->RefineSteps);
+	}
+	printf("**************************************************\n");
+    }
+
+#if ( PROFlevel>=1 )
+    fflush(stdout);
+    MPI_Barrier( grid->comm );
+
+    {
+	int_t i, P = grid->nprow*grid->npcol;
+	flops_t b, maxflop;
+	if ( !iam ) printf("\n.. FACT time breakdown:\tcomm\ttotal\n");
+	for (i = 0; i < P; ++i) {
+	    if ( iam == i) {
+		printf("\t\t(%d)%8.2f%8.2f\n", iam, utime[COMM], utime[FACT]);
+		fflush(stdout);
+	    }
+	    MPI_Barrier( grid->comm );
+	}
+	if ( !iam ) printf("\n.. FACT ops distribution:\n");
+	for (i = 0; i < P; ++i) {
+	    if ( iam == i ) {
+		printf("\t\t(%d)\t%e\n", iam, ops[FACT]);
+		fflush(stdout);
+	    }
+	    MPI_Barrier( grid->comm );
+	}
+	MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
+	if ( !iam ) {
+	    b = factflop/P/maxflop;
+	    printf("\tFACT load balance: %.2f\n", b);
+	}
+	if ( !iam ) printf("\n.. SOLVE ops distribution:\n");
+	for (i = 0; i < P; ++i) {
+	    if ( iam == i ) {
+		printf("\t\t%d\t%e\n", iam, ops[SOLVE]);
+		fflush(stdout);
+	    }
+	    MPI_Barrier( grid->comm );
+	}
+	MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0,grid->comm);
+	if ( !iam ) {
+	    b = solveflop/P/maxflop;
+	    printf("\tSOLVE load balance: %.2f\n", b);
+	}
+    }
+#endif
+
+/*  if ( !iam ) fflush(stdout);  CRASH THE SYSTEM pierre.  */
+}
+
+void
+PStatFree(SuperLUStat_t *stat)
+{
+    SUPERLU_FREE(stat->utime);
+    SUPERLU_FREE(stat->ops);
+}
+
+/*! \brief Fills an integer array with a given value.
+ */
+void ifill_dist(int_t *a, int_t alen, int_t ival)
+{
+    int_t left;   /* slots of a[] still to be written */
+    for (left = alen; left > 0; --left) *a++ = ival;   /* front to back */
+}
+
+
+void
+get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+	       int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len)
+{
+    int_t i, j, k, knsupc, nprow, npcol, nsupers, pkk;
+    int_t *xsup;   /* not named below; presumably consumed by the SuperSize() macro */
+    /* Step (i,j) diagonally over the process grid until PNUM() yields 0 again. */
+    i = j = *num_diag_procs = pkk = 0;
+    nprow = grid->nprow;
+    npcol = grid->npcol;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+
+    do {
+	++(*num_diag_procs);
+	i = (i + 1) % nprow; /* was "i = (++i) % nprow": two unsequenced writes to i */
+	j = (j + 1) % npcol;
+	pkk = PNUM( i, j, grid );
+    } while ( pkk != 0 ); /* Until wrap back to process 0 */
+    if ( !(*diag_procs = intMalloc_dist(*num_diag_procs)) )
+	ABORT("Malloc fails for diag_procs[]");
+    if ( !(*diag_len = intCalloc_dist(*num_diag_procs)) )
+	ABORT("Calloc fails for diag_len[]");
+    for (i = j = k = 0; k < *num_diag_procs; ++k) { /* replay the walk, recording each PNUM */
+	pkk = PNUM( i, j, grid );
+	(*diag_procs)[k] = pkk;
+	i = (i + 1) % nprow;
+	j = (j + 1) % npcol;
+    }
+    for (k = 0; k < nsupers; ++k) { /* supernode k is credited to slot k mod num_diag_procs */
+	knsupc = SuperSize( k );
+	i = k % *num_diag_procs;
+	(*diag_len)[i] += knsupc;
+    }
+}
+
+
+/*! \brief Get the statistics of the supernodes 
+ */
+#define NBUCKS 10
+static 	int_t	max_sup_size;
+
+void super_stats_dist(int_t nsuper, int_t *xsup)
+{
+    register int_t nsup1 = 0;
+    int_t          i, isize, whichb, bl, bh;
+    int_t          bucket[NBUCKS];
+
+    max_sup_size = 0;
+
+    for (i = 0; i <= nsuper; i++) {
+	isize = xsup[i+1] - xsup[i];
+	if ( isize == 1 ) nsup1++;
+	if ( max_sup_size < isize ) max_sup_size = isize;	
+    }
+
+    printf("    Supernode statistics:\n\tno of super = " IFMT "\n", nsuper+1);
+    printf("\tmax supernode size = " IFMT "\n", max_sup_size);
+    printf("\tno of size 1 supernodes = " IFMT "\n", nsup1);
+
+    /* Histogram of the supernode sizes */
+    ifill_dist (bucket, NBUCKS, 0);
+
+    for (i = 0; i <= nsuper; i++) {
+        isize = xsup[i+1] - xsup[i];
+        whichb = (float) isize / max_sup_size * NBUCKS;
+        if (whichb >= NBUCKS) whichb = NBUCKS - 1;
+        bucket[whichb]++;
+    }
+    
+    printf("\tHistogram of supernode sizes:\n");
+    for (i = 0; i < NBUCKS; i++) {
+        bl = (float) i * max_sup_size / NBUCKS;
+        bh = (float) (i+1) * max_sup_size / NBUCKS;
+        printf("\tsnode: " IFMT "-" IFMT "\t\t" IFMT "\n", bl+1, bh, bucket[i]);
+    }
+
+}
+
+
+/*! \brief Check whether repfnz[] == EMPTY after reset.
+ */
+void check_repfnz_dist(int_t n, int_t w, int_t jcol, int_t *repfnz)
+{
+    int_t jj, k;
+
+    for (jj = jcol; jj < jcol+w; jj++) 
+	for (k = 0; k < n; k++)
+	    if ( repfnz[(jj-jcol)*n + k] != EMPTY ) {
+		fprintf(stderr, "col " IFMT ", repfnz_col[" IFMT "] = " IFMT "\n",
+			jj, k, repfnz[(jj-jcol)*n + k]);
+		ABORT("check_repfnz_dist");
+	    }
+}
+
+void PrintInt10(char *name, int_t len, int_t *x)
+{
+    register int_t i;
+    
+    printf("%10s:", name);
+    for (i = 0; i < len; ++i) {
+	if ( i % 10 == 0 ) printf("\n\t[" IFMT "-" IFMT "]", i, i+9);
+	printf(IFMT, x[i]);
+    }
+    printf("\n");
+}
+
+void PrintInt32(char *name, int len, int *x)
+{
+    register int i;
+    
+    printf("%10s:", name);
+    for (i = 0; i < len; ++i) {
+	if ( i % 10 == 0 ) printf("\n\t[%2d-%2d]", i, i+9);
+	printf("%6d", x[i]);
+    }
+    printf("\n");
+}
+
+int file_PrintInt10(FILE *fp, char *name, int_t len, int_t *x)
+{
+    register int_t i;
+    
+    fprintf(fp, "%10s:", name);
+    for (i = 0; i < len; ++i) {
+	if ( i % 10 == 0 ) fprintf(fp, "\n\t[" IFMT "-" IFMT "]", i, i+9);
+	fprintf(fp, IFMT, x[i]);
+    }
+    fprintf(fp, "\n");
+    return 0;
+}
+
+int file_PrintInt32(FILE *fp, char *name, int len, int *x)
+{
+    register int i;
+    
+    fprintf(fp, "%10s:", name);
+    for (i = 0; i < len; ++i) {
+	if ( i % 10 == 0 ) fprintf(fp, "\n\t[%2d-%2d]", i, i+9);
+	fprintf(fp, "%6d", x[i]);
+    }
+    fprintf(fp, "\n");
+    return 0;
+}
+
+int_t
+CheckZeroDiagonal(int_t n, int_t *rowind, int_t *colbeg, int_t *colcnt)
+{
+    register int_t i, j, zd, numzd = 0; /* zd becomes 1 once row j is found in column j */
+    /* Returns the number of columns j holding no stored entry with row index j. */
+    for (j = 0; j < n; ++j) {
+	zd = 0;
+	for (i = colbeg[j]; i < colbeg[j]+colcnt[j]; ++i) {
+	    /*if ( iperm[rowind[i]] == j ) zd = 1;*/
+	    if ( rowind[i] == j ) { zd = 1; break; }
+	}
+	if ( zd == 0 ) {
+#if ( PRNTlevel>=2 )
+	    printf(".. Diagonal of column " IFMT " is zero.\n", j); /* j is int_t, so IFMT rather than %d */
+#endif
+	    ++numzd;
+	}
+    }
+
+    return numzd;
+}
+
+
+/* --------------------------------------------------------------------------- */
+void isort(int_t N, int_t *ARRAY1, int_t *ARRAY2)
+{
+/*
+ * Purpose
+ * =======
+ * Use the Shell sort algorithm (gap-halving insertion sort) to sort ARRAY1
+ * in increasing order, applying every swap to ARRAY2 as well.
+ *
+ * Arguments
+ * =========
+ * N       (input) int_t
+ *          On entry, specifies the size of the arrays.
+ *
+ * ARRAY1  (input/output) int_t array of length N
+ *          On entry, contains the keys to be sorted.
+ *          On exit, contains the keys in increasing order.
+ *
+ * ARRAY2  (input/output) int_t array of length N
+ *          On entry, contains the values paired with ARRAY1.
+ *          On exit, permuted by the same swaps as ARRAY1.
+ */
+  int_t IGAP, I, J;
+  int_t TEMP;
+  IGAP = N / 2;
+  while (IGAP > 0) {
+    for (I = IGAP; I < N; I++) {
+    J = I - IGAP;
+    while (J >= 0) {
+      if (ARRAY1[J] > ARRAY1[J + IGAP]) {
+        TEMP = ARRAY1[J];
+        ARRAY1[J] = ARRAY1[J + IGAP];
+        ARRAY1[J + IGAP] = TEMP;
+        TEMP = ARRAY2[J];
+        ARRAY2[J] = ARRAY2[J + IGAP];
+        ARRAY2[J + IGAP] = TEMP;
+        J = J - IGAP;
+      } else {
+        break;
+      }
+    }
+  }
+    IGAP = IGAP / 2;
+  }
+}
+
+
+void isort1(int_t N, int_t *ARRAY)
+{
+/*
+ * Purpose
+ * =======
+ * Use the Shell sort algorithm (gap-halving insertion sort) to sort ARRAY
+ * in increasing order, in place.
+ *
+ * Arguments
+ * =========
+ * N       (input) int_t
+ *          On entry, specifies the size of the array.
+ *
+ * ARRAY   (input/output) int_t array of length N
+ *          On entry, contains the array to be sorted.
+ *          On exit, contains the sorted array.
+ *
+ * Only ARRAY is touched; this routine has no companion array.
+ *
+ * The gap starts at N/2 and is halved after every pass until it is 0.
+ */
+  int_t IGAP, I, J;
+  int_t TEMP;
+  IGAP = N / 2;
+  while (IGAP > 0) {
+  for (I = IGAP; I < N; I++) {
+    J = I - IGAP;
+    while (J >= 0) {
+      if (ARRAY[J] > ARRAY[J + IGAP]) {
+        TEMP = ARRAY[J];
+        ARRAY[J] = ARRAY[J + IGAP];
+        ARRAY[J + IGAP] = TEMP;
+        J = J - IGAP;
+      } else {
+        break;
+      }
+    }
+  }
+    IGAP = IGAP / 2;
+  }
+}
+
+void log_memory(long long cur_bytes, SuperLUStat_t *stat) {
+    /* Apply the signed byte delta to the running total. */
+    stat->current_buffer += (float) cur_bytes;
+    /* Only a positive delta may move the recorded peak. */
+    if (cur_bytes <= 0) return;
+    if ( !(stat->peak_buffer > stat->current_buffer) ) stat->peak_buffer = stat->current_buffer;
+}
+
+void print_memorylog(SuperLUStat_t *stat, char *msg) {
+    printf("__ %s (MB):\n\tcurrent_buffer : %8.2f\tpeak_buffer : %8.2f\n",
+	   msg, stat->current_buffer, stat->peak_buffer);
+}
+
+int compare_pair (const void *a, const void *b)
+{   const struct superlu_pair *x = a, *y = b; /* qsort comparator ordering by .val */
+    return (x->val > y->val) - (x->val < y->val); /* -1/0/+1; the old subtraction could overflow or truncate */
+}
+
+int get_thread_per_process()
+{   
+    char* ttemp; 
+    ttemp = getenv("THREAD_PER_PROCESS");
+
+    if(ttemp) return atoi(ttemp);
+    else return 1;
+}
+
+int_t
+get_max_buffer_size ()
+{
+    /* Use $MAX_BUFFER_SIZE, parsed with atoi(), when the variable exists
+       in the environment; otherwise fall back to 5000000. */
+    const char *env_value = getenv ("MAX_BUFFER_SIZE");
+
+    if (env_value == NULL) return 5000000;
+    return atoi (env_value);
+}
+
+int_t
+get_cublas_nb ()
+{
+    char *ttemp;
+    ttemp = getenv ("CUBLAS_NB");
+    if (ttemp)
+        return atoi (ttemp);
+    else
+        return 64;
+}
+
+int_t
+get_num_cuda_streams ()
+{
+    char *ttemp;
+    ttemp = getenv ("NUM_CUDA_STREAMS");
+    if (ttemp)
+        return atoi (ttemp);
+    else
+        return 8;
+}
+
+int_t
+get_min (int_t * sums, int_t nprocs)
+{
+    /* Index of the first smallest entry of sums[0..nprocs-1]; 0 when
+       nprocs <= 0.  The running minimum is seeded from sums[] itself
+       rather than from the literal 2147483647, so entries at or above
+       that value (e.g. with a 64-bit int_t) are still compared. */
+
+    int_t min_ind = 0;
+    for (int_t i = 1; i < nprocs; i++)
+    {
+        if (sums[i] < sums[min_ind])
+            min_ind = i;   /* strict '<' keeps the earliest index on ties */
+    }
+
+    return min_ind;
+}
+
+int_t
+static_partition (struct superlu_pair *work_load, int_t nwl, int_t *partition,
+		  int_t ldp, int_t * sums, int_t * counts, int nprocs)
+{
+    //initialization loop
+    for (int i = 0; i < nprocs; ++i)
+    {
+        counts[i] = 0;
+        sums[i] = 0;
+    }
+    qsort (work_load, nwl, sizeof (struct superlu_pair), compare_pair);
+    // for(int i=0;i<nwl;i++)
+    for (int i = nwl - 1; i >= 0; i--)
+    {
+        int_t ind = get_min (sums, nprocs);
+        // printf("ind %d\n",ind );
+        partition[ldp * ind + counts[ind]] = work_load[i].ind;
+        counts[ind]++;
+        sums[ind] += work_load[i].val;
+
+    }
+
+    return 0;
+}
+
+/*
+ * Search for the metadata of the j-th block in a U panel.
+ */
+void
+arrive_at_ublock (int_t j,      /* j-th block in a U panel */
+                  int_t * iukp, /* output : point to index[] of j-th block */
+                  int_t * rukp, /* output : point to nzval[] of j-th block */ 
+		  int_t * jb,   /* Global block number of block U(k,j). */
+                  int_t * ljb,  /* Local block number of U(k,j). */
+                  int_t * nsupc,/* supernode size of destination block */
+                  int_t iukp0,  /* input : search starting point */
+                  int_t rukp0, 
+		  int_t * usub, /* usub scripts */
+                  int_t * perm_u, /* permutation vector from static schedule */
+                  int_t * xsup, /* for SuperSize and LBj */
+                  gridinfo_t * grid)
+{
+    int_t jj;
+    *iukp = iukp0; /* point to the first block in index[] */
+    *rukp = rukp0; /* point to the start of nzval[] */
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++) /* perm_u[j] == j */
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++) /* == j */
+#endif
+    {
+        // printf("iukp %d \n",*iukp );
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        // printf("jb %d \n",*jb );
+        *nsupc = SuperSize (*jb);
+        // printf("nsupc %d \n",*nsupc );
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+
+        *rukp += usub[*iukp - 1]; /* Jump # of nonzeros in block U(k,jj);
+				     Move to block U(k,jj+1) in nzval[] */ 
+        *iukp += *nsupc;
+    }
+
+    /* Set the pointers to the begining of U block U(k,j) */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+
+
+/*
+ * Count the maximum size of U(k,:) across all the MPI processes.
+ * September 28, 2016
+ */
+static int_t num_full_cols_U
+(
+ int_t kk,  int_t **Ufstnz_br_ptr, int_t *xsup,
+ gridinfo_t *grid, int_t *perm_u,
+ int_t *ldu /* max. size of nonzero columns in U(kk,:) */
+)
+{
+    int_t lk = LBi (kk, grid);
+    int_t *usub = Ufstnz_br_ptr[lk];
+
+    if (usub == NULL) return 0; /* code */
+
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+    
+    int_t klst = FstBlockC (kk + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb,ljb;
+    int_t nsupc;
+    int_t full = 1;
+    int_t full_Phi = 1;
+    int_t temp_ncols = 0;
+    int_t segsize;
+
+    for (int_t j = 0; j < nub; ++j) {
+        
+	/* Sherry -- no need to search from beginning ?? */
+        arrive_at_ublock(
+			 j, &iukp, &rukp, &jb, &ljb, &nsupc,
+			 iukp0, rukp0, usub, perm_u, xsup, grid
+			 );
+
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj) {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+            if ( segsize > *ldu ) *ldu = segsize;
+        }
+    }
+    return temp_ncols;
+}
+
+int_t estimate_bigu_size(int_t nsupers,
+			 int_t ldt, /* Largest segment of all U(k,:) columns */
+			 int_t**Ufstnz_br_ptr, /* point to U index[] array */
+			 Glu_persist_t *Glu_persist,
+			 gridinfo_t* grid, int_t* perm_u)
+{
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    /* Returns max_ldu * max_ncols; ldt is only echoed by the diagnostic print. */
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t ncols = 0; /* Count local number of nonzero columns */
+    int_t ldu = 0;   /* Count local max. size of nonzero columns */
+
+    /* Initialize perm_u to the identity (perm_u[i] = i). */
+    for (int i = 0; i < nsupers; ++i) perm_u[i] = i;
+
+    for (int lk = myrow; lk < nsupers; lk += Pr ) {
+        int_t nc = num_full_cols_U(lk, Ufstnz_br_ptr, xsup, grid, perm_u, &ldu);
+        if ( nc > ncols ) ncols = nc; /* SUPERLU_MAX(ncols, call) could expand the call twice */
+    }
+
+    int_t max_ncols = 0;
+    int_t max_ldu = 0;
+    /* Reduce both local maxima with MPI_MAX over grid->cscp.comm. */
+    MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm);
+    MPI_Allreduce(&ldu, &max_ldu, 1, mpi_int_t, MPI_MAX, grid->cscp.comm);
+
+#if ( PRNTlevel>=1 )
+    printf("max_ncols " IFMT ", max_ldu " IFMT ", ldt " IFMT ", bigu_size=" IFMT "\n",
+	   max_ncols, max_ldu, ldt, max_ldu*max_ncols); /* all int_t, hence IFMT */
+#endif
+    return(max_ldu * max_ncols);
+}
diff --git a/SRC/util_dist.h b/SRC/util_dist.h
new file mode 100644
index 0000000..b369037
--- /dev/null
+++ b/SRC/util_dist.h
@@ -0,0 +1,147 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Header for utilities
+ */
+
+#ifndef __SUPERLU_UTIL /* allow multiple inclusions */
+#define __SUPERLU_UTIL
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "superlu_enum_consts.h"
+
+/*
+ * Macros
+ */
+#ifndef USER_ABORT
+#define USER_ABORT(msg) superlu_abort_and_exit_dist(msg)
+#endif
+
+/* Format "<err_msg> at line N in file F" into msg[] (truncated to fit) and pass it to USER_ABORT. */
+#define ABORT(err_msg) \
+ { char msg[256];\
+   snprintf(msg,sizeof(msg),"%s at line %d in file %s\n",err_msg,__LINE__, __FILE__);\
+   USER_ABORT(msg); }
+
+#ifndef USER_MALLOC
+#define USER_MALLOC(size) superlu_malloc_dist(size)
+#endif
+
+#define SUPERLU_MALLOC(size) USER_MALLOC(size)
+
+#ifndef USER_FREE
+#define USER_FREE(addr) superlu_free_dist(addr)
+#endif
+
+#define SUPERLU_FREE(addr) USER_FREE(addr)
+
+#define CHECK_MALLOC(pnum, where) {                 \
+    extern long int superlu_malloc_total;        \
+    printf("(%d) %s: superlu_malloc_total (MB) %.6f\n", \
+	   pnum, where, superlu_malloc_total*1e-6); \
+}
+
+#define SUPERLU_MAX(x, y) 	( (x) > (y) ? (x) : (y) )
+#define SUPERLU_MIN(x, y) 	( (x) < (y) ? (x) : (y) )
+
+    
+/* 
+ * Constants 
+ */
+#define EMPTY	(-1)
+#ifndef FALSE
+#define FALSE	(0)
+#endif
+#ifndef TRUE
+#define TRUE	(1)
+#endif
+
+/*
+ * Type definitions
+ */
+typedef float    flops_t;
+typedef unsigned char Logical;
+
+/*
+#ifdef _CRAY
+#define int short
+#endif
+*/
+
+typedef struct {
+    int     *panel_histo; /* histogram of panel size distribution */
+    double  *utime;       /* running time at various phases */
+    flops_t *ops;         /* operation count at various phases */
+    int     TinyPivots;   /* number of tiny pivots */
+    int     RefineSteps;  /* number of iterative refinement steps */
+    int     num_look_aheads; /* number of look ahead */
+    /*-- new --*/
+    float   current_buffer; /* bytes allocated for buffer in numerical factorization */
+    float   peak_buffer;    /* monitor the peak buffer size (bytes) */
+    float   gpu_buffer;     /* monitor the buffer allocated on GPU (bytes) */
+} SuperLUStat_t;
+
+/* Headers for 2 types of dynamically managed memory */
+typedef struct e_node {
+    int size;      /* length of the memory that has been used */
+    void *mem;     /* pointer to the new malloc'd store */
+} ExpHeader;
+
+typedef struct {
+    int  size;
+    int  used;
+    int  top1;  /* grow upward, relative to &array[0] */
+    int  top2;  /* grow downward */
+    void *array;
+} LU_stack_t;
+
+/* Constants */
+#define GluIntArray(n)   (5 * (n) + 5)
+#define NO_MEMTYPE  6      /* 0: lusup;
+			      1: ucol;
+			      2: lsub;
+			      3: usub
+			      4: llvl; level number in L for ILU(k)
+			      5: ulvl; level number in U for ILU(k)
+                           */
+
+/* Macros to manipulate stack (StackFull reads a variable named stack; TempSpace reads m and NO_MARKER at the expansion site) */
+#define StackFull(x)         ( (x) + stack.used >= stack.size )
+#define NotDoubleAlign(addr) ( (long)(addr) & 7 )
+#define DoubleAlign(addr)    ( ((long)(addr) + 7) & ~7L )
+#define TempSpace(n, w)      ( (2*(w) + 4 + NO_MARKER)*m*sizeof(int) + \
+			      ((w) + 1)*(n)*sizeof(double) )
+#define Reduce(alpha)        (((alpha) + 1) / 2)  /* i.e. (alpha-1)/2 + 1 */
+
+#define FIRSTCOL_OF_SNODE(i)	(xsup[i])
+
+#if ( PROFlevel>=1 )
+#define TIC(t)          t = SuperLU_timer_()
+#define TOC(t2, t1)     t2 = SuperLU_timer_() - t1
+#else
+#define TIC(t)
+#define TOC(t2, t1)
+#endif
+
+/*********************************************************
+ * Macros used for easy access of sparse matrix entries. *
+ *********************************************************/
+#define L_SUB_START(col)     ( Lstore->rowind_colptr[col] )
+#define L_SUB(ptr)           ( Lstore->rowind[ptr] )
+#define L_NZ_START(col)      ( Lstore->nzval_colptr[col] )
+#define L_FST_SUPC(superno)  ( Lstore->sup_to_col[superno] )
+#define U_NZ_START(col)      ( Ustore->colptr[col] )
+#define U_SUB(ptr)           ( Ustore->rowind[ptr] )
+
+#endif /* __SUPERLU_UTIL */
diff --git a/SRC/xerr_dist.c b/SRC/xerr_dist.c
new file mode 100644
index 0000000..c09ef76
--- /dev/null
+++ b/SRC/xerr_dist.c
@@ -0,0 +1,33 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief
+
+<pre>
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Modified: November 21, 1999
+ *
+</pre> 
+*/
+#include <stdio.h>
+#include "Cnames.h"
+
+/* xerbla-style reporter: prints the routine name and the number of the illegal parameter; always returns 0. */
+int xerr_dist(char *srname, int *info)
+{
+    const int bad_param = *info;  /* parameter number supplied by the caller */
+    printf("** On entry to %6s, parameter number %2d had an illegal value\n", srname, bad_param);
+    return 0;
+}
+
diff --git a/SRC/zSchCompUdt-2Ddynamic.c b/SRC/zSchCompUdt-2Ddynamic.c
new file mode 100644
index 0000000..46fa613
--- /dev/null
+++ b/SRC/zSchCompUdt-2Ddynamic.c
@@ -0,0 +1,524 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief This file contains the main loop of pzgstrf which involves rank k
+ *        update of the Schur complement.
+ *        Uses 2D partitioning for the scatter phase.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+
+#define SCHEDULE_STRATEGY guided 
+double tt_start;
+double tt_end;
+
+if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+    int cum_nrow = 0; /* cumulative number of nonzero rows in L(:,k) */
+    int temp_nbrow;   /* nonzero rows in current block L(i,k) */
+    lptr  = lptr0;
+    luptr = luptr0;
+    /**
+     * Separating L blocks into the top part within look-ahead window
+     * and the remaining ones.
+     */
+     int lookAheadBlk=0, RemainBlk=0;
+
+     tt_start = SuperLU_timer_();
+
+     /* Loop through all blocks in L(:,k) to set up pointers to the start 
+      * of each block in the data arrays.
+      *   - lookAheadFullRow[i] := number of nonzero rows from block 0 to i
+      *   - lookAheadStRow[i] := number of nonzero rows before block i
+      *   - lookAhead_lptr[i] := point to the start of block i in L's index[] 
+      *   - (ditto Remain_Info[i])
+      */
+     for (int i = 0; i < nlb; ++i) {
+	 ib = lsub[lptr];            /* block number of L(i,k). */
+	 temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+        
+	 int look_up_flag = 1; /* assume ib is outside look-up window */
+	 for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers ); ++j)
+	     {
+		 if(ib == perm_c_supno[j]) {
+		     look_up_flag=0; /* flag ib is within look-up window */
+                     break; /* Sherry -- can exit the loop?? */
+                 }
+	     }
+	 
+	 if( look_up_flag == 0 ) { /* ib is within look up window */
+	     if (lookAheadBlk==0) {
+		 lookAheadFullRow[lookAheadBlk] = temp_nbrow;
+	     } else {
+		 lookAheadFullRow[lookAheadBlk] = temp_nbrow+lookAheadFullRow[lookAheadBlk-1];   
+	     }
+	     lookAheadStRow[lookAheadBlk] = cum_nrow;
+	     lookAhead_lptr[lookAheadBlk] = lptr;
+	     lookAhead_ib[lookAheadBlk] = ib; 
+	     lookAheadBlk++;
+	 } else { /* ib is not in look up window */
+
+	     if (RemainBlk==0) {
+		 Remain_info[RemainBlk].FullRow = temp_nbrow;
+	     } else {
+		 Remain_info[RemainBlk].FullRow = temp_nbrow+Remain_info[RemainBlk-1].FullRow;   
+	     }
+
+             RemainStRow[RemainBlk] = cum_nrow;
+             // Remain_lptr[RemainBlk] = lptr;
+	     Remain_info[RemainBlk].lptr = lptr;
+	     // Remain_ib[RemainBlk] = ib; 
+	     Remain_info[RemainBlk].ib = ib; 
+	     RemainBlk++;
+	 }
+	 
+         cum_nrow +=temp_nbrow;
+	 
+	 lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+	 lptr += temp_nbrow;     /* Move to next block */
+	 luptr += temp_nbrow;
+     }  /* for i ... all blocks in L(:,k) */
+
+     lptr = lptr0;
+     luptr = luptr0;
+
+     /* leading dimension of L buffer */
+#if 0
+     int LDlookAhead_LBuff = lookAheadFullRow[lookAheadBlk-1]; /* may go negative.*/
+#else /* Piyush fix */
+     int LDlookAhead_LBuff = lookAheadBlk==0? 0 :lookAheadFullRow[lookAheadBlk-1];
+#endif
+
+     /* Loop through the look-ahead blocks to copy Lval into the buffer */
+#ifdef _OPENMP /* standard OpenMP macro; the pragma below is still disabled */
+     /* #pragma omp parallel for -- why not?? Sherry */
+#endif
+     for (int i = 0; i < lookAheadBlk; ++i) {
+	 int StRowDest  = 0;
+	 int temp_nbrow;
+	 if (i==0) {
+	     temp_nbrow = lookAheadFullRow[0];
+	 } else {
+	     StRowDest   = lookAheadFullRow[i-1];
+	     temp_nbrow  = lookAheadFullRow[i]-lookAheadFullRow[i-1];
+	 }
+	 
+	 int StRowSource=lookAheadStRow[i];
+	 
+	 /* Now copying the matrix*/
+	 // #pragma omp parallel for (gives slow down)
+	 for (int j = 0; j < knsupc; ++j) {
+	     memcpy(&lookAhead_L_buff[StRowDest+j*LDlookAhead_LBuff],
+		    &lusup[luptr+j*nsupr+StRowSource],
+		    temp_nbrow * sizeof(doublecomplex) );
+	 }
+     }
+
+     int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+
+    /* Loop through the remaining blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for 
+#endif
+     for (int i = 0; i < RemainBlk; ++i) {
+	 int StRowDest  = 0;
+	 int temp_nbrow;
+         if (i==0)  {
+	     temp_nbrow = Remain_info[0].FullRow;
+	 } else  {
+	     StRowDest   = Remain_info[i-1].FullRow;
+	     temp_nbrow  = Remain_info[i].FullRow-Remain_info[i-1].FullRow;
+	 }
+
+	 int StRowSource=RemainStRow[i];
+
+	 /* Now copying the matrix*/
+	 // #pragma omp parallel for (gives slow down)
+	 for (int j = 0; j < knsupc; ++j) {
+	     // printf("StRowDest %d LDRemain_LBuff %d StRowSource %d \n", StRowDest ,LDRemain_LBuff ,StRowSource );
+	     memcpy(&Remain_L_buff[StRowDest+j*LDRemain_LBuff],
+		    &lusup[luptr+j*nsupr+StRowSource],
+                    temp_nbrow * sizeof(doublecomplex) );
+	 }
+     } /* parallel for i ... */
+
+#if ( PRNTlevel>=1 )
+     tt_end = SuperLU_timer_();
+     GatherLTimer += tt_end - tt_start;
+#endif
+#if 0
+     LookAheadRowSepMOP  +=  2*knsupc*(lookAheadFullRow[lookAheadBlk-1]+Remain_info[RemainBlk-1].FullRow );
+#else
+     int_t lnbrow, rnbrow; /* number of nonzero rows in look-ahead window
+                              or remaining part.  */
+     lnbrow = lookAheadBlk==0 ? 0  : lookAheadFullRow[lookAheadBlk-1];
+     rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+     nbrow = lnbrow + rnbrow; /* total number of rows in L */
+     LookAheadRowSepMOP += 2*knsupc*(nbrow);
+#endif     
+     
+     /**********************
+      * Gather U blocks *
+      **********************/
+
+     tt_start = SuperLU_timer_();
+#if 0     
+     nbrow = lookAheadFullRow[lookAheadBlk-1]+Remain_info[RemainBlk-1].FullRow;
+#endif
+
+     if ( nbrow > 0 ) { /* L(:,k) is not empty */
+	 /*
+	  * Counting U blocks
+	  */
+	 ncols = 0; /* total number of nonzero columns in U(k,:) */
+	 ldu   = 0;
+	 full  = 1; /* flag the U block is indeed 'full', containing segments
+	               of same length. No need padding 0 */
+	 int temp_ncols=0;
+
+         /* Loop through all blocks in U(k,:) to set up pointers to the start
+          * of each block in the data arrays, store them in Ublock_info[j]
+          * for block U(k,j).
+  	  */
+	 for (j = jj0; j < nub; ++j) { /* jj0 was set to 0 */
+	     temp_ncols = 0;
+	     arrive_at_ublock(
+			      j, &iukp, &rukp, &jb, &ljb, &nsupc,
+			      iukp0, rukp0, usub, perm_u, xsup, grid
+			      );
+	     Ublock_info[j].iukp = iukp;
+	     Ublock_info[j].rukp = rukp;
+	     Ublock_info[j].jb = jb;
+	     
+	     /* Prepare to call GEMM. */
+	     jj = iukp;
+	     
+	     for (; jj < iukp+nsupc; ++jj) {
+		 segsize = klst - usub[jj];
+		 if ( segsize ) {
+                    ++temp_ncols;
+                    if ( segsize != ldu ) full = 0; /* need padding 0 */
+                    if ( segsize > ldu ) ldu = segsize;
+		 }
+	     }
+
+	     Ublock_info[j].full_u_cols = temp_ncols;
+	     ncols += temp_ncols;
+	 }
+
+	 /* Now doing prefix sum on full_u_cols.
+	  * After this, full_u_cols is the number of nonzero columns
+          * from block 0 to block j.
+          */
+	 for ( j = jj0+1; j < nub; ++j) {
+	     Ublock_info[j].full_u_cols += Ublock_info[j-1].full_u_cols;
+	 }
+            
+	 tempu = bigU; /* buffer the entire row block U(k,:) */
+
+         /* Gather U(k,:) into buffer bigU[] to prepare for GEMM */
+#ifdef _OPENMP        
+#pragma omp parallel for private(j,iukp,rukp,tempu, jb, nsupc,ljb,segsize,\
+	lead_zero, jj, i) \
+        default (shared) schedule(SCHEDULE_STRATEGY)
+#endif
+        for (j = jj0; j < nub; ++j) { /* jj0 was set to 0 */
+
+            if(j==jj0) tempu = bigU;
+            else tempu = bigU + ldu*Ublock_info[j-1].full_u_cols;
+
+            /* == processing each of the remaining columns == */
+            arrive_at_ublock(j, &iukp, &rukp, &jb, &ljb, &nsupc,
+			     iukp0, rukp0, usub,perm_u, xsup, grid);
+
+            /* Copy from U(k,:) to tempu[], padding zeros.  */            
+            for (jj = iukp; jj < iukp+nsupc; ++jj) {
+                segsize = klst - usub[jj];
+                if ( segsize ) {
+                    lead_zero = ldu - segsize;
+                    for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+                    tempu += lead_zero;
+                    for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i];
+                    rukp += segsize;
+                    tempu += segsize;
+                }
+            }
+
+            rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+
+        }   /* parallel for j:jjj_st..jjj */
+
+        tempu = bigU;  /* setting to the start of padded U(k,:) */
+
+    }  /* end if (nbrow>0) */
+
+#if ( PRNTlevel>=1 )
+    GatherUTimer += SuperLU_timer_() - tt_start;
+#endif
+    GatherMOP += 2*ldu*ncols;
+
+    int Lnbrow   = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
+    int Rnbrow   = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+    int jj_cpu=nub;       /*limit between CPU and GPU */
+    int thread_id;
+    tempv = bigV;
+
+    /**************************************
+     * Perform GEMM followed by Scatter *
+     **************************************/
+
+    if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
+        /* Perform a large GEMM call */
+        ncols = Ublock_info[nub-1].full_u_cols;
+        schur_flop_counter += 2 * (double)Lnbrow * (double)ldu * (double)ncols;
+        stat->ops[FACT]    += 2 * (double)Lnbrow * (double)ldu * (double)ncols;
+
+        /***************************************************************
+         * Updating look-ahead blocks in both L and U look-ahead windows.
+         ***************************************************************/
+#ifdef _OPENMP
+#pragma omp parallel default (shared) private(thread_id,tt_start,tt_end)
+     {
+ 	thread_id = omp_get_thread_num();
+ 
+ 	/* Ideally, should organize the loop as:
+                for (j = 0; j < nub; ++j) {
+                    for (lb = 0; lb < lookAheadBlk; ++lb) {
+ 	               L(lb,k) X U(k,j) -> tempv[]
+                    }
+                }
+ 	   But now, we use collapsed loop to achieve more parallelism.
+ 	   Total number of block updates is:
+ 	      (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
+ 	*/
+#pragma omp for \
+    private (j,i,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    schedule(dynamic)
+#else /* not use _OPENMP */
+ 	thread_id = 0;
+#endif
+ 	/* Each thread is assigned one loop index ij, responsible for 
+ 	   block update L(lb,k) * U(k,j) -> tempv[]. */
+        for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
+	    if ( thread_id == 0 ) tt_start = SuperLU_timer_();
+
+            int j   = ij/lookAheadBlk + jj0; /* jj0 was set to 0 */
+            int lb  = ij%lookAheadBlk;
+
+            int* indirect_thread    = indirect + ldt*thread_id;
+            int* indirect2_thread   = indirect2 + ldt*thread_id;
+            doublecomplex* tempv1 = bigV + thread_id*ldt*ldt; 
+
+            /* Getting U block U(k,j) information */
+            /* unsigned long long ut_start, ut_end; */
+            int_t rukp =  Ublock_info[j].rukp;
+            int_t iukp =  Ublock_info[j].iukp;
+            int jb   =  Ublock_info[j].jb;
+            int nsupc = SuperSize(jb);
+            int ljb = LBj (jb, grid);  /* destination column block */
+            int st_col;
+            int ncols;
+            if ( j>jj0 ) { /* jj0 was set to 0 */
+                ncols  = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
+                st_col = Ublock_info[j-1].full_u_cols;
+            } else {
+                ncols  = Ublock_info[j].full_u_cols;
+                st_col = 0;   
+            }
+
+            /* Getting L block L(i,k) information */
+            int_t lptr = lookAhead_lptr[lb];
+            int ib   = lookAhead_ib[lb];
+            int temp_nbrow = lsub[lptr+1];
+            lptr += LB_DESCRIPTOR;
+            int cum_nrow = (lb==0 ? 0 : lookAheadFullRow[lb-1]);
+
+#if ( PRNTlevel>= 1)
+	    gemm_max_m = SUPERLU_MAX(gemm_max_m, temp_nbrow);
+	    gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
+	    gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
+#endif
+
+#if defined (USE_VENDOR_BLAS)            
+            zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                  &lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+                  &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+#else
+            zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                  &lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+                  &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#endif
+#if ( PRNTlevel>=1 )
+	    if (thread_id == 0) {
+		tt_end = SuperLU_timer_();
+		LookAheadGEMMTimer += tt_end - tt_start;
+		tt_start = tt_end;
+	    }
+#endif
+            if ( ib < jb ) {
+                zscatter_u (
+				 ib, jb,
+				 nsupc, iukp, xsup,
+				 klst, temp_nbrow,
+				 lptr, temp_nbrow, lsub,
+				 usub, tempv1,
+				 Ufstnz_br_ptr, Unzval_br_ptr,
+				 grid
+			        );
+            } else {
+                zscatter_l (
+				 ib, ljb, 
+				 nsupc, iukp, xsup,
+ 				 klst, temp_nbrow,
+				 lptr, temp_nbrow,
+				 usub, lsub, tempv1,
+				 indirect_thread, indirect2_thread,
+				 Lrowind_bc_ptr, Lnzval_bc_ptr,
+				 grid
+				);
+            }
+
+#if ( PRNTlevel>=1 )
+	    if (thread_id == 0)
+		LookAheadScatterTimer += SuperLU_timer_() - tt_start;
+#endif
+        } /* end omp for ij = ... */
+#ifdef _OPENMP
+    } /* end omp parallel */
+#endif
+        LookAheadGEMMFlOp  += 2*(double)Lnbrow * (double)ldu * (double)ncols;
+        /* stat->ops[FACT] was already credited with these flops before the update loop; adding again would double-count. */
+        LookAheadScatterMOP += 3*Lnbrow*ncols;
+    } /* end if Lnbrow < ... */
+    
+    /***************************************************************
+     * Updating remaining rows and columns on CPU.
+     ***************************************************************/
+    Rnbrow  = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+    ncols   = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+
+    schur_flop_counter  += 2 * (double)Rnbrow * (double)ldu * (double)ncols;
+    stat->ops[FACT]     += 2 * (double)Rnbrow * (double)ldu * (double)ncols;
+
+#ifdef _OPENMP
+#pragma omp parallel default(shared) private(thread_id,tt_start,tt_end)
+    {
+	thread_id = omp_get_thread_num();
+ 
+	/* Ideally, should organize the loop as:
+               for (j = 0; j < jj_cpu; ++j) {
+                   for (lb = 0; lb < RemainBlk; ++lb) {
+	               L(lb,k) X U(k,j) -> tempv[]
+                   }
+               }
+	   But now, we use collapsed loop to achieve more parallelism.
+	   Total number of block updates is:
+	      (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
+	*/
+#pragma omp for \
+    private (j,i,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    schedule(dynamic)
+#else /* not use _OPENMP */
+    thread_id = 0;
+#endif
+	/* Each thread is assigned one loop index ij, responsible for 
+	   block update L(lb,k) * U(k,j) -> tempv[]. */
+    for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) { /* jj_cpu := nub */
+	int j   = ij / RemainBlk + jj0; 
+	int lb  = ij % RemainBlk;
+
+	int* indirect_thread = indirect + ldt*thread_id;
+	int* indirect2_thread = indirect2 + ldt*thread_id;
+	doublecomplex* tempv1 = bigV + thread_id*ldt*ldt; 
+
+	/* Getting U block U(k,j) information */
+	/* unsigned long long ut_start, ut_end; */
+	int_t rukp =  Ublock_info[j].rukp;
+	int_t iukp =  Ublock_info[j].iukp;
+	int jb   =  Ublock_info[j].jb;
+	int nsupc = SuperSize(jb);
+	int ljb = LBj (jb, grid);
+	int st_col;
+	int ncols;
+	if ( j>jj0 ) {
+	    ncols  = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
+	    st_col = Ublock_info[j-1].full_u_cols;
+	} else {
+	    ncols  = Ublock_info[j].full_u_cols;
+	    st_col = 0;   
+	}
+
+	/* Getting L block L(i,k) information */
+	int_t lptr = Remain_info[lb].lptr;
+	int ib   = Remain_info[lb].ib;
+	int temp_nbrow = lsub[lptr+1];
+	lptr += LB_DESCRIPTOR;
+	int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
+
+#if ( PRNTlevel>=1 )
+	if ( thread_id==0 ) tt_start = SuperLU_timer_();
+#endif
+
+	/* calling GEMM */
+#if defined (USE_VENDOR_BLAS)
+	zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+	      &Remain_L_buff[(knsupc-ldu)*Rnbrow+cum_nrow], &Rnbrow,
+	      &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+#else
+	zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+	      &Remain_L_buff[(knsupc-ldu)*Rnbrow+cum_nrow], &Rnbrow,
+	      &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#endif
+
+#if ( PRNTlevel>=1 )
+	if (thread_id==0) {
+	    tt_end = SuperLU_timer_();
+	    RemainGEMMTimer += tt_end - tt_start;
+	    tt_start = tt_end;
+	}
+#endif
+
+	/* Now scattering the block */
+	if ( ib<jb ) {
+	    zscatter_u(
+			    ib, jb,
+			    nsupc, iukp, xsup,
+			    klst, temp_nbrow,
+			    lptr, temp_nbrow,lsub,
+			    usub, tempv1,
+			    Ufstnz_br_ptr, Unzval_br_ptr,
+			    grid
+		           );
+	} else {
+	    zscatter_l(
+			    ib, ljb,
+			    nsupc, iukp, xsup,
+			    klst, temp_nbrow,
+			    lptr, temp_nbrow,
+			    usub, lsub, tempv1,
+			    indirect_thread, indirect2_thread,
+			    Lrowind_bc_ptr,Lnzval_bc_ptr,
+			    grid
+			   );
+	}
+
+#if ( PRNTlevel>=1 )
+	if (thread_id==0) RemainScatterTimer += SuperLU_timer_() - tt_start;
+#endif
+    } /* end omp for (int ij =...) */
+#ifdef _OPENMP
+    } /* end omp parallel region */
+#endif
+}  /* end if L(:,k) and U(k,:) are not empty */
diff --git a/SRC/zSchCompUdt-cuda.c b/SRC/zSchCompUdt-cuda.c
new file mode 100644
index 0000000..22bea9b
--- /dev/null
+++ b/SRC/zSchCompUdt-cuda.c
@@ -0,0 +1,553 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief This file contains the main loop of pzgstrf which involves
+ *        rank k update of the Schur complement.
+ *        Uses CUDA GPU.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+
+#define SCHEDULE_STRATEGY dynamic
+
+/* Evaluate a cuBLAS call once; on any status other than CUBLAS_STATUS_SUCCESS print it to stderr and exit(1). */
+#define cublasCheckErrors(fn) \
+    do { \
+        cublasStatus_t cublas_status_ = (fn); \
+        if (cublas_status_ != CUBLAS_STATUS_SUCCESS) { \
+            fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
+                (int)(cublas_status_), __FILE__, __LINE__); \
+            fprintf(stderr, "*** FAILED - ABORTING\n"); \
+            exit(1); \
+        } \
+    } while(0)
+
+
+if ( msg0 && msg2 ) {  /* L(:,k) and U(k,:) are not empty. */
+    ldu   =0;
+    full  =1;
+    int cum_nrow;
+    int temp_nbrow;
+
+    lptr = lptr0;
+    luptr = luptr0;
+    
+    nbrow= lsub[1];
+    if (myrow==krow) nbrow = lsub[1]-lsub[3];
+
+    if (nbrow>0) {
+        
+        int ncol_max = SUPERLU_MIN(buffer_size/nbrow,bigu_size/ldt);
+        int num_streams_used,        /*number of streams that will be used*/
+        ncpu_blks;                     /*Number of CPU dgemm blks*/
+
+        int jjj, jjj_st,jjj_global;        
+        for (j = jj0; j < nub; ++j) {
+            arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+	    		      iukp0,rukp0,usub,perm_u,xsup,grid );
+
+            ncols =0 ;  //initialize at 0 
+            jj = iukp;
+            int temp_ldu=0; 
+            for (; jj < iukp+nsupc; ++jj) {
+                segsize = klst - usub[jj];
+                if ( segsize ) {
+		    ++ncols;
+		}
+                temp_ldu = SUPERLU_MAX(temp_ldu, segsize);
+            }
+
+            full_u_cols[j] = ncols;
+            blk_ldu[j] = temp_ldu;
+        } /* end for j = jj0..nub */
+
+        jjj = jj0; /* initialization */
+            
+        // #pragma omp barrier 
+        while ( jjj < nub ) {
+            jjj_st=jjj;
+#ifdef _OPENMP
+#pragma omp single
+#endif
+            {
+                ldu = blk_ldu[jjj_st];
+                for (j = jjj_st; j < nub ; ++j) {
+                    
+                    /* prefix sum */
+                    if (j != jjj_st) full_u_cols[j] += full_u_cols[j-1];
+
+                    ldu = SUPERLU_MAX(ldu, blk_ldu[j]);   
+
+                    /* break condition */
+                    /* the number of columns that can be processed is limited by buffer size*/
+                    if (full_u_cols[j]+((j+1==nub)?0:full_u_cols[j+1]) > ncol_max) {
+                        break;
+                    }
+                } /* end for j=jjj_st to nub */  
+
+                jjj_global = SUPERLU_MIN(nub, j+1); /* Maximum value of jjj will be nub */
+                
+                // TAU_STATIC_TIMER_START("work_divison");
+                /* Divide CPU-GPU gemm here */
+                gemm_division_cpu_gpu(
+		       &num_streams_used, /*number of streams that will be used*/
+		       stream_end_col,    /*array holding last column blk for each partition*/
+		       &ncpu_blks,        /*Number of CPU gemm blks*/
+		       			  /*input*/
+		       nbrow,             /*number of row in A matrix*/
+		       ldu,               /*number of k in dgemm*/
+		       nstreams,
+		       full_u_cols + jjj_st, /*array containing prefix sum of work load*/
+		       jjj_global-jjj_st     /*Number of work load */
+                );
+                // TAU_STATIC_TIMER_STOP("work_divison");
+
+            } /* pragma omp single */
+
+            jjj = jjj_global;
+            // printf("thread_id %d, jjj %d \n",thread_id,jjj );
+            if (jjj == jjj_st+1 && full_u_cols[jjj_st] > ncol_max) {
+                printf("allocate more memory for buffer !!!!\n");
+                if(nbrow * full_u_cols[jjj_st] > buffer_size)
+                    printf("%d buffer_size %d\n",nbrow*full_u_cols[jjj_st],buffer_size );
+            }
+            
+            // #pragma omp barrier 
+            /* gathering circuit */
+            assert(jjj_st<nub);
+            assert(jjj-1<nub);
+            // TAU_STATIC_TIMER_START("GATHER_U");
+#ifdef _OPENMP
+#pragma omp for schedule( SCHEDULE_STRATEGY )
+#endif
+            for (j = jjj_st; j < jjj; ++j) {
+                if (j==jjj_st) tempu = bigU;
+                else tempu = bigU + ldu*full_u_cols[j-1];
+
+                /* == processing each of the remaining columns == */
+                arrive_at_ublock(j,&iukp,&rukp,&jb,&ljb,&nsupc,
+				 iukp0,rukp0,usub,perm_u,xsup,grid);
+
+                // tempu = tempU2d;
+                for (jj = iukp; jj < iukp+nsupc; ++jj) {
+                    segsize = klst - usub[jj];
+                    if ( segsize ) {
+                        lead_zero = ldu - segsize;
+                        for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+                        tempu += lead_zero;
+                        for (i = 0; i < segsize; ++i)
+                            tempu[i] = uval[rukp+i];
+                        rukp += segsize;
+                        tempu += segsize;
+                    }
+                }
+
+                rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+
+            } /* end for j=jjj_st to jjj */  
+
+	    if ( num_streams_used > 0 ) {
+#ifdef PI_DEBUG
+		printf("nbrow %d *ldu %d  =%d < ldt %d * max_row_size %d =%d \n",nbrow,ldu,nbrow*ldu,ldt,max_row_size,ldt*max_row_size );
+		assert(nbrow*ldu<=ldt*max_row_size);
+#endif 
+		cudaMemcpy2DAsync(dA, nbrow*sizeof(doublecomplex),
+				  &lusup[luptr+(knsupc-ldu)*nsupr],
+				  nsupr*sizeof(doublecomplex), nbrow*sizeof(doublecomplex),
+				  ldu, cudaMemcpyHostToDevice, streams[0]);
+	    }
+                
+	    for (int i = 0; i < num_streams_used; ++i) {
+		int st = (i==0) ? ncpu_blks+jjj_st : jjj_st+stream_end_col[i-1]; 
+		int st_col = full_u_cols[st-1];
+		int num_col_stream = full_u_cols[jjj_st+stream_end_col[i]-1]-full_u_cols[st-1];
+		tempu = bigU;
+                    
+		doublecomplex *tempv1 = bigV + full_u_cols[st-1]*nbrow;
+
+		/* Following is for testing purpose */
+#ifdef GPU_ACC
+		int stream_id = i;
+		int b_offset  = ldu * st_col;
+		int c_offset  = st_col * nbrow;
+		size_t B_stream_size = ldu * num_col_stream * sizeof(doublecomplex);
+		size_t C_stream_size = nbrow * num_col_stream * sizeof(doublecomplex);
+		
+		assert(ldu*(st_col+num_col_stream) < bigu_size);
+		assert(nbrow*(st_col+num_col_stream) < buffer_size);
+		
+		cudaMemcpyAsync(dB+b_offset, tempu+b_offset, B_stream_size,
+				cudaMemcpyHostToDevice, streams[stream_id]);
+		
+		cublasCheckErrors(
+				  cublasSetStream(handle[stream_id],
+						  streams[stream_id])
+				  );
+		
+		cublasCheckErrors(
+				  cublasZgemm(handle[stream_id],
+					      CUBLAS_OP_N, CUBLAS_OP_N,
+					      nbrow, num_col_stream, ldu,
+ 					      (const cuDoubleComplex*) &alpha,
+					      (const cuDoubleComplex*) dA,
+					      nbrow,
+					      (const cuDoubleComplex*) &dB[b_offset], 
+					      ldu,
+					      (const cuDoubleComplex*) &beta,
+					      (cuDoubleComplex*)&dC[c_offset],
+                                              nbrow)
+				  );
+		
+		checkCuda( cudaMemcpyAsync(tempv1, dC+c_offset,
+					   C_stream_size,
+					   cudaMemcpyDeviceToHost,
+					   streams[stream_id]) );
+#else 
+		if ( num_col_stream > 0 ) {   
+		    my_zgemm_("N", "N", &nbrow, &num_col_stream, &ldu,
+			      &alpha, &lusup[luptr+(knsupc-ldu)*nsupr],
+			      &nsupr, tempu+ldu*st_col, &ldu, &beta,
+			      tempv1, &nbrow, 1, 1);
+		}
+		
+#endif 
+		
+	    } /* end for i = 1 to num_streams used */
+	    
+	    int num_col = full_u_cols[jjj_st+ncpu_blks-1];
+	    int st_col = 0;        /*special case for cpu */
+	    tempv = bigV + nbrow * st_col;
+	    tempu = bigU;
+	    
+	    double tstart = SuperLU_timer_();
+#if defined (USE_VENDOR_BLAS)            
+	    zgemm_("N", "N", &nbrow, &num_col, &ldu, &alpha,
+		  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+		  tempu+ldu*st_col, &ldu, &beta, tempv, &nbrow, 1, 1);
+#else
+	    zgemm_("N", "N", &nbrow, &num_col, &ldu, &alpha,
+		  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+		  tempu+ldu*st_col, &ldu, &beta, tempv, &nbrow);
+#endif
+	    gemm_timer += SuperLU_timer_() -tstart;
+	    stat->ops[FACT] += 2 * nbrow * ldu * full_u_cols[jjj-1];
+	    
+	    // printf("after zgemm \n");
+	    
+            /* Now scattering blocks handled by cpu */
+            int temp_ncol;
+	    
+            /* scatter the first blocks, which the CPU has computed */
+            tstart = SuperLU_timer_();
+
+#ifdef _OPENMP
+#pragma omp parallel  \
+    private(j,iukp,rukp, tempu, tempv, cum_nrow, jb, nsupc,ljb,	\
+	    segsize,lead_zero,					\
+	    ib, temp_nbrow,ilst,lib,index,			\
+	    ijb,fnz,ucol,rel,ldv,lptrj,luptrj,			\
+	    nzval,     lb ,                     jj, i)		\
+    firstprivate(luptr,lptr) default (shared)
+#endif
+            {
+                int thread_id = omp_get_thread_num();
+        
+                int* indirect_thread = indirect + ldt*thread_id;
+                int* indirect2_thread = indirect2 + ldt*thread_id;
+                doublecomplex* tempv1;
+                
+                if (ncpu_blks< omp_get_num_threads()) {
+                    // TAU_STATIC_TIMER_START("SPECIAL_CPU_SCATTER");
+                    
+                    for (j = jjj_st; j < jjj_st+ncpu_blks; ++j) {
+                        /* code */
+                        #ifdef PI_DEBUG
+                            printf("scattering %d  block column\n",j);
+                        #endif
+
+                        /* == processing each of the remaining columns == */
+
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+
+#ifdef _OPENMP
+#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait
+#endif
+                        for (lb = 0; lb < nlb; lb++ ) {
+                            int cum_nrow = 0;
+                            int temp_nbrow;
+                            lptr = lptr0;
+                            luptr = luptr0;
+                            for (int i = 0; i < lb; ++i) {
+                                ib = lsub[lptr];        /* Row block L(i,k). */
+                                temp_nbrow = lsub[lptr+1];   /* Number of full rows. */
+                                lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+                                lptr += temp_nbrow;
+                                luptr += temp_nbrow;
+                                cum_nrow +=temp_nbrow;
+                            }
+
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+
+                            /* Now gather the result into the destination block. */
+                            if ( ib < jb ) {  /* A(i,j) is in U. */
+                                #ifdef PI_DEBUG
+                                    printf("cpu scatter \n");
+                                    printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+                                #endif
+
+                                tempv = tempv1+cum_nrow;
+                                zscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+                            } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("cpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+#endif
+                                
+                                tempv = tempv1+cum_nrow;
+
+                                zscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+                            } /* if ib < jb ... */
+
+                            lptr += temp_nbrow;
+                            luptr += temp_nbrow;
+                            cum_nrow += temp_nbrow;
+
+                        } /* for lb ... */
+
+                        luptr=luptr0;
+                    } /* for j = jjj_st ... */
+
+                    // TAU_STATIC_TIMER_STOP("SPECIAL_CPU_SCATTER");
+                } else {
+#ifdef _OPENMP
+#pragma omp for schedule(SCHEDULE_STRATEGY) nowait
+#endif
+                    for (j = jjj_st; j < jjj_st+ncpu_blks; ++j) {
+                        /* code */
+                        #ifdef PI_DEBUG
+                            printf("scattering %d  block column\n",j);
+                        #endif 
+
+                        /* == processing each of the remaining columns == */
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+
+                        for (lb = 0; lb < nlb; lb++ ) {
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+#ifdef DGEMM_STAT
+			    if(j==jjj_st) {
+				temp_ncol = full_u_cols[j];
+			    } else {
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];  
+			    }
+			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
+#endif
+
+			    /* Now gather the result into the destination block. */
+			    if ( ib < jb ) {  /* A(i,j) is in U. */
+#ifdef PI_DEBUG
+				printf("cpu scatter \n");
+				printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+#endif
+
+				tempv = tempv1+cum_nrow;
+                                zscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+			    } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("cpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+#endif
+                                tempv = tempv1+cum_nrow;
+
+                                zscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+			    } /* if ib < jb ... */
+
+			    lptr += temp_nbrow;
+			    luptr += temp_nbrow;
+			    cum_nrow += temp_nbrow;
+			
+			} /* for lb ... */
+
+			luptr=luptr0;
+		    } /* for j = jjj_st ... */
+		}     /* else if (ncpu_blks >= omp_get_num_threads()) */
+	    }         /* parallel region */
+
+	    scatter_timer += SuperLU_timer_() - tstart; 
+#ifdef _OPENMP
+#pragma omp parallel							\
+    private(j,iukp,rukp, tempu, tempv, cum_nrow, jb, nsupc,ljb,		\
+	    segsize,lead_zero,						\
+	    ib, temp_nbrow,ilst,lib,index,				\
+	    ijb,fnz,ucol,rel,ldv,lptrj,luptrj,				\
+	    nzval,     lb ,                     jj, i)			\
+    firstprivate(luptr,lptr) default (shared)
+#endif
+            {
+                int thread_id = omp_get_thread_num();
+        
+                int* indirect_thread = indirect + ldt*thread_id;
+                int* indirect2_thread = indirect2 + ldt*thread_id;
+                doublecomplex* tempv1;
+                for(i = 0; i < num_streams_used; i++) { /* i is private variable */
+                    checkCuda(cudaStreamSynchronize (streams[i]));
+                    int jjj_st1 = (i==0) ? jjj_st + ncpu_blks : jjj_st + stream_end_col[i-1];
+                    int jjj_end = jjj_st + stream_end_col[i];
+                    assert(jjj_end-1<nub);
+                    assert(jjj_st1>jjj_st) ;
+
+                    /* now scatter it */
+#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait 
+                    for (j = jjj_st1; j < jjj_end; ++j) {
+                        /* code */
+#ifdef PI_DEBUG
+			printf("scattering %d  block column\n",j);
+#endif 
+                        /* == processing each of the remaining columns == */
+
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+                        for (lb = 0; lb < nlb; lb++) {
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+#ifdef DGEMM_STAT
+			    if(j==jjj_st) {
+				temp_ncol = full_u_cols[j];
+			    } else {
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];  
+			    }
+			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
+#endif
+
+                            /* Now gather the result into the destination block. */
+                            if ( ib < jb ) { /* A(i,j) is in U. */
+#ifdef PI_DEBUG
+				printf("gpu scatter \n");
+				printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+#endif
+                                tempv = tempv1+cum_nrow;
+                                zscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+                            } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("gpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+#endif
+                                tempv = tempv1+cum_nrow;
+
+                                zscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+                            } /* if ib < jb ... */
+
+                            lptr += temp_nbrow;
+                            luptr += temp_nbrow;
+                            cum_nrow += temp_nbrow;
+			    
+                        } /* for lb ... */
+
+                        luptr=luptr0;
+                    } /* for j = jjj_st ... */
+                    
+                } /* end for i = 0 to nstreams */
+                // TAU_STATIC_TIMER_STOP("GPU_SCATTER");
+                // TAU_STATIC_TIMER_STOP("INSIDE_OMP");
+            } /* end pragma omp parallel */
+            // TAU_STATIC_TIMER_STOP("OUTSIDE_OMP");
+        }  /* end while(jjj<nub) */
+ 
+    } /* if nbrow>0 */
+
+ }   /* if msg1 and msg 2 */
+
+
+
diff --git a/SRC/zdistribute.c b/SRC/zdistribute.c
new file mode 100644
index 0000000..4938ca3
--- /dev/null
+++ b/SRC/zdistribute.c
@@ -0,0 +1,749 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Distribute the matrix onto the 2D process mesh.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * </pre>
+ */
+#include "superlu_zdefs.h"
+
+/*! \brief Distribute the matrix onto the 2D process mesh.
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Distribute the matrix onto the 2D process mesh.
+ * 
+ * Arguments
+ * =========
+ * 
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (input) int
+ *        Dimension of the matrix.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, permuted by columns, of dimension
+ *        (A->nrow, A->ncol). The type of A can be:
+ *        Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        Data structures for L and U factors.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   > 0, working storage required (in bytes).
+ * </pre>
+ */
+
+float
+zdistribute(fact_t fact, int_t n, SuperMatrix *A, 
+            Glu_freeable_t *Glu_freeable,
+	    LUstruct_t *LUstruct, gridinfo_t *grid)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, 
+          len, len1, nsupc;
+    int_t ljb;  /* local block column number */
+    int_t nrbl; /* number of L blocks in current block column */
+    int_t nrbu; /* number of U blocks in current block column */
+    int_t gb;   /* global block number; 0 < gb <= nsuper */
+    int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
+    int iam, jbrow, kcol, mycol, myrow, pc, pr;
+    int_t mybufmax[NBUFFERS];
+    NCPformat *Astore;
+    doublecomplex *a;
+    int_t *asub;
+    int_t *xa_begin, *xa_end;
+    int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
+    int_t *supno = Glu_persist->supno;   
+    int_t *lsub, *xlsub, *usub, *xusub;
+    int_t nsupers;
+    int_t next_lind;      /* next available position in index[*] */
+    int_t next_lval;      /* next available position in nzval[*] */
+    int_t *index;         /* indices consist of headers and row subscripts */
+    int   *index1;        /* temporary pointer to array of int */
+    doublecomplex *lusup, *uval; /* nonzero values in L and U */
+    doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
+    int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+    doublecomplex **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
+    int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
+
+    /*-- Counts to be used in factorization. --*/
+    int  *ToRecv, *ToSendD, **ToSendR;
+
+    /*-- Counts to be used in lower triangular solve. --*/
+    int_t  *fmod;          /* Modification count for L-solve.        */
+    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
+    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
+    int_t  nfsendx = 0;    /* Number of Xk I will send               */
+    int_t  kseen;
+
+    /*-- Counts to be used in upper triangular solve. --*/
+    int_t  *bmod;          /* Modification count for U-solve.        */
+    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
+    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
+    int_t  nbsendx = 0;    /* Number of Xk I will send               */
+    int_t  *ilsum;         /* starting position of each supernode in 
+			      the full array (local)                 */
+
+    /*-- Auxiliary arrays; freed on return --*/
+    int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
+    int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr)             */
+    int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr)      */
+    int_t *Urb_fstnz;  /* # of fstnz in a block row; size ceil(NSUPERS/Pr)  */
+    int_t *Ucbs;       /* number of column blocks in a block row            */
+    int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr)             */
+    int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr)        */
+    int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr)      */
+    int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr)      */
+    doublecomplex *dense, *dense_col; /* SPA */
+    doublecomplex zero = {0.0, 0.0};
+    int_t  ldaspa;     /* LDA of SPA */
+    int_t iword, zword;
+    float mem_use = 0.0;
+
+#if ( PRNTlevel>=1 )
+    int_t nLblocks = 0, nUblocks = 0;
+#endif
+#if ( PROFlevel>=1 ) 
+    double t, t_u, t_l;
+    int_t u_blks;
+#endif
+
+    /* Initialization. */
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
+    nsupers  = supno[n-1] + 1;
+    Astore   = A->Store;
+    a        = Astore->nzval;
+    asub     = Astore->rowind;
+    xa_begin = Astore->colbeg;
+    xa_end   = Astore->colend;
+#if ( PRNTlevel>=1 )
+    iword = sizeof(int_t);
+    zword = sizeof(doublecomplex);
+#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter zdistribute()");
+#endif
+
+    if ( fact == SamePattern_SameRowPerm ) {
+        /* ---------------------------------------------------------------
+         * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION.
+         * --------------------------------------------------------------- */
+
+#if ( PROFlevel>=1 )
+	t_l = t_u = 0; u_blks = 0;
+#endif
+	/* We can propagate the new values of A into the existing
+	   L and U data structures.            */
+	ilsum = Llu->ilsum;
+	ldaspa = Llu->ldalsum;
+	if ( !(dense = doublecomplexCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) )
+	    ABORT("Calloc fails for SPA dense[].");
+	nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */
+	if ( !(Urb_length = intCalloc_dist(nrbu)) )
+	    ABORT("Calloc fails for Urb_length[].");
+	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
+	    ABORT("Malloc fails for Urb_indptr[].");
+	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	Unzval_br_ptr = Llu->Unzval_br_ptr;
+#if ( PRNTlevel>=1 )
+	mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*zword;
+#endif
+#if ( PROFlevel>=1 )
+	t = SuperLU_timer_();
+#endif
+
+	/* Initialize Uval to zero. */
+	for (lb = 0; lb < nrbu; ++lb) {
+	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
+	    index = Ufstnz_br_ptr[lb];
+	    if ( index ) {
+		uval = Unzval_br_ptr[lb];
+		len = index[1];
+		for (i = 0; i < len; ++i) uval[i] = zero;
+	    } /* if index != NULL */
+	} /* for lb ... */
+
+	for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
+	    pc = PCOL( jb, grid );
+	    if ( mycol == pc ) { /* Block column jb in my process column */
+		fsupc = FstBlockC( jb );
+		nsupc = SuperSize( jb );
+
+ 		/* Scatter A into SPA (for L), or into U directly. */
+		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
+		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
+			irow = asub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+ 			    if ( gb < jb ) { /* in U */
+ 				index = Ufstnz_br_ptr[lb];
+ 				uval = Unzval_br_ptr[lb];
+ 				while (  (k = index[Urb_indptr[lb]]) < jb ) {
+ 				    /* Skip nonzero values in this block */
+ 				    Urb_length[lb] += index[Urb_indptr[lb]+1];
+ 				    /* Move pointer to the next block */
+ 				    Urb_indptr[lb] += UB_DESCRIPTOR
+ 					+ SuperSize( k );
+ 				}
+ 				/*assert(k == jb);*/
+ 				/* start fstnz */
+ 				istart = Urb_indptr[lb] + UB_DESCRIPTOR;
+ 				len = Urb_length[lb];
+ 				fsupc1 = FstBlockC( gb+1 );
+ 				k = j - fsupc;
+ 				/* Sum the lengths of the leading columns */
+ 				for (jj = 0; jj < k; ++jj)
+				    len += fsupc1 - index[istart++];
+				/*assert(irow>=index[istart]);*/
+				uval[len + irow - index[istart]] = a[i];
+			    } else { /* in L; put in SPA first */
+  				irow = ilsum[lb] + irow - FstBlockC( gb );
+  				dense_col[irow] = a[i];
+  			    }
+  			}
+		    } /* for i ... */
+  		    dense_col += ldaspa;
+		} /* for j ... */
+
+#if ( PROFlevel>=1 )
+		t_u += SuperLU_timer_() - t;
+		t = SuperLU_timer_();
+#endif
+
+		/* Gather the values of A from SPA into Lnzval[]. */
+		ljb = LBj( jb, grid ); /* Local block number */
+		index = Lrowind_bc_ptr[ljb];
+		if ( index ) {
+		    nrbl = index[0];   /* Number of row blocks. */
+		    len = index[1];    /* LDA of lusup[]. */
+		    lusup = Lnzval_bc_ptr[ljb];
+		    next_lind = BC_HEADER;
+		    next_lval = 0;
+		    for (jj = 0; jj < nrbl; ++jj) {
+			gb = index[next_lind++];
+			len1 = index[next_lind++]; /* Rows in the block. */
+			lb = LBi( gb, grid );
+			for (bnnz = 0; bnnz < len1; ++bnnz) {
+			    irow = index[next_lind++]; /* Global index. */
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    k = next_lval++;
+			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
+				lusup[k] = dense_col[irow];
+				dense_col[irow] = zero;
+				k += len;
+				dense_col += ldaspa;
+			    }
+			} /* for bnnz ... */
+		    } /* for jj ... */
+		} /* if index ... */
+#if ( PROFlevel>=1 )
+		t_l += SuperLU_timer_() - t;
+#endif
+	    } /* if mycol == pc */
+	} /* for jb ... */
+
+	SUPERLU_FREE(dense);
+	SUPERLU_FREE(Urb_length);
+	SUPERLU_FREE(Urb_indptr);
+#if ( PROFlevel>=1 )
+	if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n",
+			   t_l, t_u, u_blks, nrbu);
+#endif
+
+    } else { 
+        /* --------------------------------------------------
+         * FIRST TIME CREATING THE L AND U DATA STRUCTURE. 
+         * -------------------------------------------------- */
+
+#if ( PROFlevel>=1 )
+	t_l = t_u = 0; u_blks = 0;
+#endif
+	/* No L and U data structures are available yet.
+	   We need to set up the L and U data structures and propagate
+	   the values of A into them.          */
+	lsub = Glu_freeable->lsub;    /* compressed L subscripts */
+	xlsub = Glu_freeable->xlsub;
+	usub = Glu_freeable->usub;    /* compressed U subscripts */
+	xusub = Glu_freeable->xusub;
+    
+	if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) )
+	    ABORT("Malloc fails for ToRecv[].");
+	for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+
+	k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
+	if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
+	    ABORT("Malloc fails for ToSendR[].");
+	j = k * grid->npcol;
+	if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) )
+	    ABORT("Malloc fails for index[].");
+#if ( PRNTlevel>=1 )
+	mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword;
+#endif
+	for (i = 0; i < j; ++i) index1[i] = EMPTY;
+	for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j];
+	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+
+	/* Pointers to the beginning of each block row of U. */
+	if ( !(Unzval_br_ptr = 
+               (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
+	    ABORT("Malloc fails for Unzval_br_ptr[].");
+	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
+	
+	if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
+	    ABORT("Malloc fails for ToSendD[].");
+	for (i = 0; i < k; ++i) ToSendD[i] = NO;
+	if ( !(ilsum = intMalloc_dist(k+1)) )
+	    ABORT("Malloc fails for ilsum[].");
+
+	/* Auxiliary arrays used to set up U block data structures.
+	   They are freed on return. */
+	if ( !(rb_marker = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for rb_marker[].");
+	if ( !(Urb_length = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Urb_length[].");
+	if ( !(Urb_indptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Urb_indptr[].");
+	if ( !(Urb_fstnz = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Urb_fstnz[].");
+	if ( !(Ucbs = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Ucbs[].");
+#if ( PRNTlevel>=1 )	
+	mem_use += 2.0*k*sizeof(int_t*) + (7.0*k+1)*iword;
+#endif
+	/* Compute ldaspa and ilsum[]. */
+	ldaspa = 0;
+	ilsum[0] = 0;
+	for (gb = 0; gb < nsupers; ++gb) {
+	    if ( myrow == PROW( gb, grid ) ) {
+		i = SuperSize( gb );
+		ldaspa += i;
+		lb = LBi( gb, grid );
+		ilsum[lb + 1] = ilsum[lb] + i;
+	    }
+	}
+	
+            
+	/* ------------------------------------------------------------
+	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
+	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
+	   ------------------------------------------------------------*/
+	
+	/* Loop through each supernode column. */
+	for (jb = 0; jb < nsupers; ++jb) {
+	    pc = PCOL( jb, grid );
+	    fsupc = FstBlockC( jb );
+	    nsupc = SuperSize( jb );
+	    /* Loop through each column in the block. */
+	    for (j = fsupc; j < fsupc + nsupc; ++j) {
+		/* usub[*] contains only "first nonzero" in each segment. */
+		for (i = xusub[j]; i < xusub[j+1]; ++i) {
+		    irow = usub[i]; /* First nonzero of the segment. */
+		    gb = BlockNum( irow );
+		    kcol = PCOL( gb, grid );
+		    ljb = LBj( gb, grid );
+		    if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
+		    pr = PROW( gb, grid );
+		    lb = LBi( gb, grid );
+		    if ( mycol == pc ) {
+			if  ( myrow == pr ) {
+			    ToSendD[lb] = YES;
+			    /* Count nonzeros in entire block row. */
+			    Urb_length[lb] += FstBlockC( gb+1 ) - irow;
+			    if (rb_marker[lb] <= jb) {/* First see the block */
+				rb_marker[lb] = jb + 1;
+				Urb_fstnz[lb] += nsupc;
+				++Ucbs[lb]; /* Number of column blocks
+					       in block row lb. */
+#if ( PRNTlevel>=1 )
+				++nUblocks;
+#endif
+			    }
+			    ToRecv[gb] = 1;
+			} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
+		    }
+		} /* for i ... */
+	    } /* for j ... */
+	} /* for jb ... */
+	
+	/* Set up the initial pointers for each block row in U. */
+	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+	for (lb = 0; lb < nrbu; ++lb) {
+	    len = Urb_length[lb];
+	    rb_marker[lb] = 0; /* Reset block marker. */
+	    if ( len ) {
+		/* Add room for descriptors */
+		len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
+		if ( !(index = intMalloc_dist(len1+1)) )
+		    ABORT("Malloc fails for Uindex[].");
+		Ufstnz_br_ptr[lb] = index;
+		if ( !(Unzval_br_ptr[lb] = doublecomplexMalloc_dist(len)) )
+		    ABORT("Malloc fails for Unzval_br_ptr[*][].");
+		mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
+		mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
+		index[0] = Ucbs[lb]; /* Number of column blocks */
+		index[1] = len;      /* Total length of nzval[] */
+		index[2] = len1;     /* Total length of index[] */
+		index[len1] = -1;    /* End marker */
+	    } else {
+		Ufstnz_br_ptr[lb] = NULL;
+		Unzval_br_ptr[lb] = NULL;
+	    }
+	    Urb_length[lb] = 0; /* Reset block length. */
+	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
+ 	    Urb_fstnz[lb] = BR_HEADER;
+	} /* for lb ... */
+
+	SUPERLU_FREE(Ucbs);
+
+#if ( PROFlevel>=1 )
+	t = SuperLU_timer_() - t;
+	if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t);
+#endif
+#if ( PRNTlevel>=1 )
+        mem_use -= 2.0*k * iword;
+#endif
+	/* Auxiliary arrays used to set up L block data structures.
+	   They are freed on return.
+	   k is the number of local row blocks.   */
+	if ( !(Lrb_length = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Lrb_length[].");
+	if ( !(Lrb_number = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_number[].");
+	if ( !(Lrb_indptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_indptr[].");
+	if ( !(Lrb_valptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_valptr[].");
+	if (!(dense=doublecomplexCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa)
+              *sp_ienv_dist(3)))))
+	    ABORT("Calloc fails for SPA dense[].");
+
+	/* These counts will be used for triangular solves. */
+	if ( !(fmod = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for fmod[].");
+	if ( !(bmod = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for bmod[].");
+#if ( PRNTlevel>=1 )	
+	mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*zword;
+#endif
+	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+
+	/* Pointers to the beginning of each block column of L. */
+	if ( !(Lnzval_bc_ptr = (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
+	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
+	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
+	Lrowind_bc_ptr[k-1] = NULL;
+
+	/* These lists of processes will be used for triangular solves. */
+	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    ABORT("Malloc fails for fsendx_plist[].");
+	len = k * grid->nprow;
+	if ( !(index = intMalloc_dist(len)) )
+	    ABORT("Malloc fails for fsendx_plist[0]");
+	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
+	    fsendx_plist[i] = &index[j];
+	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    ABORT("Malloc fails for bsendx_plist[].");
+	if ( !(index = intMalloc_dist(len)) )
+	    ABORT("Malloc fails for bsendx_plist[0]");
+	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
+	    bsendx_plist[i] = &index[j];
+#if ( PRNTlevel>=1 )
+	mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
+#endif
+	/*------------------------------------------------------------
+	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
+	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
+	  ------------------------------------------------------------*/
+
+	for (jb = 0; jb < nsupers; ++jb) {
+	    pc = PCOL( jb, grid );
+	    if ( mycol == pc ) { /* Block column jb in my process column */
+		fsupc = FstBlockC( jb );
+		nsupc = SuperSize( jb );
+		ljb = LBj( jb, grid ); /* Local block number */
+		
+		/* Scatter A into SPA. */
+		for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){
+		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
+			irow = asub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    dense_col[irow] = a[i];
+			}
+		    }
+		    dense_col += ldaspa;
+		}
+
+		jbrow = PROW( jb, grid );
+
+#if ( PROFlevel>=1 )
+		t = SuperLU_timer_();
+#endif
+		/*------------------------------------------------
+		 * SET UP U BLOCKS.
+		 *------------------------------------------------*/
+		kseen = 0;
+		dense_col = dense;
+		/* Loop through each column in the block column. */
+		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
+		    istart = xusub[j];
+		    /* NOTE: Only the first nonzero index of the segment
+		       is stored in usub[]. */
+		    for (i = istart; i < xusub[j+1]; ++i) {
+			irow = usub[i]; /* First nonzero in the segment. */
+			gb = BlockNum( irow );
+			pr = PROW( gb, grid );
+			if ( pr != jbrow &&
+			     myrow == jbrow &&  /* diag. proc. owning jb */
+			     bsendx_plist[ljb][pr] == EMPTY ) {
+			    bsendx_plist[ljb][pr] = YES;
+			    ++nbsendx;
+                        }
+			if ( myrow == pr ) {
+			    lb = LBi( gb, grid ); /* Local block number */
+			    index = Ufstnz_br_ptr[lb];
+			    uval = Unzval_br_ptr[lb];
+			    fsupc1 = FstBlockC( gb+1 );
+			    if (rb_marker[lb] <= jb) { /* First time see 
+							  the block       */
+				rb_marker[lb] = jb + 1;
+				Urb_indptr[lb] = Urb_fstnz[lb];;
+				index[Urb_indptr[lb]] = jb; /* Descriptor */
+				Urb_indptr[lb] += UB_DESCRIPTOR;
+				/* Record the first location in index[] of the
+				   next block */
+				Urb_fstnz[lb] = Urb_indptr[lb] + nsupc;
+				len = Urb_indptr[lb];/* Start fstnz in index */
+				index[len-1] = 0;
+				for (k = 0; k < nsupc; ++k)
+				    index[len+k] = fsupc1;
+				if ( gb != jb )/* Exclude diagonal block. */
+				    ++bmod[lb];/* Mod. count for back solve */
+				if ( kseen == 0 && myrow != jbrow ) {
+				    ++nbrecvx;
+				    kseen = 1;
+				}
+			    } else { /* Already saw the block */
+				len = Urb_indptr[lb];/* Start fstnz in index */
+			    }
+			    jj = j - fsupc;
+			    index[len+jj] = irow;
+			    /* Load the numerical values */
+			    k = fsupc1 - irow; /* No. of nonzeros in segment */
+			    index[len-1] += k; /* Increment block length in
+						  Descriptor */
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    for (ii = 0; ii < k; ++ii) {
+				uval[Urb_length[lb]++] = dense_col[irow + ii];
+				dense_col[irow + ii] = zero;
+			    }
+			} /* if myrow == pr ... */
+		    } /* for i ... */
+                    dense_col += ldaspa;
+		} /* for j ... */
+
+#if ( PROFlevel>=1 )
+		t_u += SuperLU_timer_() - t;
+		t = SuperLU_timer_();
+#endif
+
+		/*------------------------------------------------
+		 * SET UP L BLOCKS.
+		 *------------------------------------------------*/
+
+		/* Count number of blocks and length of each block. */
+		nrbl = 0;
+		len = 0; /* Number of row subscripts I own. */
+		kseen = 0;
+		istart = xlsub[fsupc];
+		for (i = istart; i < xlsub[fsupc+1]; ++i) {
+		    irow = lsub[i];
+		    gb = BlockNum( irow ); /* Global block number */
+		    pr = PROW( gb, grid ); /* Process row owning this block */
+		    if ( pr != jbrow &&
+			 myrow == jbrow &&  /* diag. proc. owning jb */
+			 fsendx_plist[ljb][pr] == EMPTY /* first time */ ) {
+			fsendx_plist[ljb][pr] = YES;
+			++nfsendx;
+                    }
+		    if ( myrow == pr ) {
+			lb = LBi( gb, grid );  /* Local block number */
+			if (rb_marker[lb] <= jb) { /* First see this block */
+			    rb_marker[lb] = jb + 1;
+			    Lrb_length[lb] = 1;
+			    Lrb_number[nrbl++] = gb;
+			    if ( gb != jb ) /* Exclude diagonal block. */
+				++fmod[lb]; /* Mod. count for forward solve */
+			    if ( kseen == 0 && myrow != jbrow ) {
+				++nfrecvx;
+				kseen = 1;
+			    }
+#if ( PRNTlevel>=1 )
+			    ++nLblocks;
+#endif
+			} else {
+			    ++Lrb_length[lb];
+			}
+			++len;
+		    }
+		} /* for i ... */
+
+		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
+		    /* Set up the initial pointers for each block in 
+		       index[] and nzval[]. */
+		    /* Add room for descriptors */
+		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+		    if ( !(index = intMalloc_dist(len1)) ) 
+			ABORT("Malloc fails for index[]");
+		    Lrowind_bc_ptr[ljb] = index;
+		    if (!(Lnzval_bc_ptr[ljb] = doublecomplexMalloc_dist(((size_t)len)*nsupc))) {
+			fprintf(stderr, "col block " IFMT " ", jb);
+			ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
+		    }
+		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
+		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
+		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
+		    index[0] = nrbl;  /* Number of row blocks */
+		    index[1] = len;   /* LDA of the nzval[] */
+		    next_lind = BC_HEADER;
+		    next_lval = 0;
+		    for (k = 0; k < nrbl; ++k) {
+			gb = Lrb_number[k];
+			lb = LBi( gb, grid );
+			len = Lrb_length[lb];
+			Lrb_length[lb] = 0;  /* Reset vector of block length */
+			index[next_lind++] = gb; /* Descriptor */
+			index[next_lind++] = len; 
+			Lrb_indptr[lb] = next_lind;
+			Lrb_valptr[lb] = next_lval;
+			next_lind += len;
+			next_lval += len;
+		    }
+		    /* Propagate the compressed row subscripts to Lindex[], and
+		       the initial values of A from SPA into Lnzval[]. */
+		    lusup = Lnzval_bc_ptr[ljb];
+		    len = index[1];  /* LDA of lusup[] */
+		    for (i = istart; i < xlsub[fsupc+1]; ++i) {
+			irow = lsub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+			    k = Lrb_indptr[lb]++; /* Random access a block */
+			    index[k] = irow;
+			    k = Lrb_valptr[lb]++;
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
+				lusup[k] = dense_col[irow];
+				dense_col[irow] = zero;
+				k += len;
+				dense_col += ldaspa;
+			    }
+			}
+		    } /* for i ... */
+		} else {
+		    Lrowind_bc_ptr[ljb] = NULL;
+		    Lnzval_bc_ptr[ljb] = NULL;
+		} /* if nrbl ... */
+#if ( PROFlevel>=1 )
+		t_l += SuperLU_timer_() - t;
+#endif
+	    } /* if mycol == pc */
+
+	} /* for jb ... */
+
+	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+	Llu->Unzval_br_ptr = Unzval_br_ptr;
+	Llu->ToRecv = ToRecv;
+	Llu->ToSendD = ToSendD;
+	Llu->ToSendR = ToSendR;
+	Llu->fmod = fmod;
+	Llu->fsendx_plist = fsendx_plist;
+	Llu->nfrecvx = nfrecvx;
+	Llu->nfsendx = nfsendx;
+	Llu->bmod = bmod;
+	Llu->bsendx_plist = bsendx_plist;
+	Llu->nbrecvx = nbrecvx;
+	Llu->nbsendx = nbsendx;
+	Llu->ilsum = ilsum;
+	Llu->ldalsum = ldaspa;
+	
+#if ( PRNTlevel>=1 )
+	if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
+			   nLblocks, nUblocks);
+#endif
+
+	SUPERLU_FREE(rb_marker);
+	SUPERLU_FREE(Urb_fstnz);
+	SUPERLU_FREE(Urb_length);
+	SUPERLU_FREE(Urb_indptr);
+	SUPERLU_FREE(Lrb_length);
+	SUPERLU_FREE(Lrb_number);
+	SUPERLU_FREE(Lrb_indptr);
+	SUPERLU_FREE(Lrb_valptr);
+	SUPERLU_FREE(dense);
+
+	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+	if ( !(Llu->mod_bit = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for mod_bit[].");
+
+	/* Find the maximum buffer size. */
+	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
+		      MPI_MAX, grid->comm);
+
+#if ( PROFlevel>=1 )
+	if ( !iam ) printf(".. 1st distribute time:\n "
+			   "\tL\t%.2f\n\tU\t%.2f\n"
+			   "\tu_blks %d\tnrbu %d\n--------\n",
+  			   t_l, t_u, u_blks, nrbu);
+#endif
+
+    } /* else fact != SamePattern_SameRowPerm */
+
+#if ( DEBUGlevel>=1 )
+    /* Memory allocated but not freed:
+       ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
+    CHECK_MALLOC(iam, "Exit zdistribute()");
+#endif
+
+    return (mem_use);
+} /* ZDISTRIBUTE */
+
diff --git a/SRC/zdistribute_mark.c b/SRC/zdistribute_mark.c
new file mode 100644
index 0000000..ae927cb
--- /dev/null
+++ b/SRC/zdistribute_mark.c
@@ -0,0 +1,711 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Distribute the matrix onto the 2D process mesh
+ *
+ * <pre>
+ * NOTE zdistribute_mark.c
+ * ====
+ * This version is faster for Mark Baertschy's matrices, but remains to be
+ * tested on other matrices.
+ *
+ * Main difference: there is no dense SPA involved when distributing A into
+ * the U structure. That is, the entries in upper triangle of A are loaded
+ * directly into U.
+ * 
+ * The locations of modifications have XSL comments.
+ *
+ * Date: Apr 23 09:54:15 PDT 2001
+ * </pre>
+ */
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ *
+ * Purpose
+ * =======
+ *   Distribute the matrix onto the 2D process mesh.
+ * 
+ * Arguments
+ * =========
+ * 
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (input) int_t
+ *        Dimension of the matrix.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, permuted by columns, of dimension
+ *        (A->nrow, A->ncol). The type of A can be:
+ *        Stype = NCP; Dtype = Z; Mtype = GE.
+ *
+ * LUstruct (input) LUstruct_t*
+ *        Data structures for L and U factors.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ * </pre>
+ */
+int_t
+zdistribute(fact_t fact, int_t n, SuperMatrix *A, Glu_freeable_t *Glu_freeable,
+	    LUstruct_t *LUstruct, gridinfo_t *grid)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, len, len1, nsupc;
+    int_t ljb;  /* local block column number */
+    int_t nrbl; /* number of L blocks in current block column */
+    int_t nrbu; /* number of U blocks in current block column */
+    int_t gb;   /* global block number; 0 <= gb < nsupers */
+    int_t lb;   /* local block number; 0 <= lb < ceil(NSUPERS/Pr) */
+    int iam, jbrow, kcol, mycol, myrow, pc, pr;
+    int_t mybufmax[NBUFFERS];
+    NCPformat *Astore;
+    doublecomplex *a;
+    int_t *asub;
+    int_t *xa_begin, *xa_end;
+    int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
+    int_t *supno = Glu_persist->supno;   
+    int_t *lsub, *xlsub, *usub, *xusub;
+    int_t nsupers;
+    int_t next_lind;      /* next available position in index[*] */
+    int_t next_lval;      /* next available position in nzval[*] */
+    int_t *index;         /* indices consist of headers and row subscripts */
+    doublecomplex *lusup, *uval; /* nonzero values in L and U */
+    doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
+    int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+    doublecomplex **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
+    int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
+
+    /*-- Counts to be used in factorization. --*/
+    int_t  *ToRecv, *ToSendD, **ToSendR;
+
+    /*-- Counts to be used in lower triangular solve. --*/
+    int_t  *fmod;          /* Modification count for L-solve.        */
+    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
+    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
+    int_t  kseen;
+
+    /*-- Counts to be used in upper triangular solve. --*/
+    int_t  *bmod;          /* Modification count for U-solve.        */
+    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
+    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
+    int_t  *ilsum;         /* starting position of each supernode in 
+			      the full array (local)                 */
+
+    /*-- Auxiliary arrays; freed on return --*/
+    int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
+    int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr)             */
+    int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr)      */
+    int_t *Urb_fstnz;  /* # of fstnz in a block row; size ceil(NSUPERS/Pr)  */
+    int_t *Ucbs;       /* number of column blocks in a block row            */
+    int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr)             */
+    int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr)        */
+    int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr)      */
+    int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr)      */
+    doublecomplex *dense, *dense_col; /* SPA */
+    doublecomplex zero = {0.0, 0.0};
+    int_t  ldaspa;     /* LDA of SPA */
+    int_t mem_use = 0, iword, zword;
+#if ( PRNTlevel>=1 )
+    int_t nLblocks = 0, nUblocks = 0;
+#endif
+
+    /* Initialization. */
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
+    nsupers  = supno[n-1] + 1;
+    Astore   = A->Store;
+    a        = Astore->nzval;
+    asub     = Astore->rowind;
+    xa_begin = Astore->colbeg;
+    xa_end   = Astore->colend;
+#if ( PRNTlevel>=1 )
+    iword = sizeof(int_t);
+    zword = sizeof(doublecomplex);
+#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter zdistribute()");
+#endif
+
+    if ( fact == SamePattern_SameRowPerm ) {
+	/* We can propagate the new values of A into the existing
+	   L and U data structures.            */
+	ilsum = Llu->ilsum;
+	ldaspa = Llu->ldalsum;
+	if ( !(dense = doublecomplexCalloc_dist(ldaspa * sp_ienv_dist(3))) )
+	    ABORT("Calloc fails for SPA dense[].");
+	nrbu = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+	if ( !(Urb_length = intCalloc_dist(nrbu)) )
+	    ABORT("Calloc fails for Urb_length[].");
+	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
+	    ABORT("Malloc fails for Urb_indptr[].");
+	for (lb = 0; lb < nrbu; ++lb) 
+	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
+	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	Unzval_br_ptr = Llu->Unzval_br_ptr;
+#if ( PRNTlevel>=1 )
+	mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*zword;
+#endif
+	for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
+	    pc = PCOL( jb, grid );
+	    if ( mycol == pc ) { /* Block column jb in my process column */
+		fsupc = FstBlockC( jb );
+		nsupc = SuperSize( jb );
+
+		/* Scatter A into SPA. */
+		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
+		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
+			irow = asub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    dense_col[irow] = a[i];
+			}
+		    }
+		    dense_col += ldaspa;
+		}
+		
+		/* Gather the values of A from SPA into Unzval[]. */
+		for (lb = 0; lb < nrbu; ++lb) {
+		    index = Ufstnz_br_ptr[lb];
+		    if ( index && index[Urb_indptr[lb]] == jb ) {
+			uval = Unzval_br_ptr[lb];
+			len = Urb_indptr[lb] + UB_DESCRIPTOR;
+			gb = lb * grid->nprow + myrow;/* Global block number */
+			k = FstBlockC( gb+1 );
+			irow = ilsum[lb] - FstBlockC( gb );
+			for (jj = 0, dense_col = dense; jj < nsupc; ++jj) {
+			    j = index[len+jj];
+			    for (i = j; i < k; ++i) {
+				uval[Urb_length[lb]++] = dense_col[irow+i];
+				dense_col[irow+i] = zero;
+			    }
+			    dense_col += ldaspa;
+			}
+			Urb_indptr[lb] += UB_DESCRIPTOR + nsupc;
+		    }
+		} /* for lb ... */
+
+		/* Gather the values of A from SPA into Lnzval[]. */
+		ljb = LBj( jb, grid ); /* Local block number */
+		index = Lrowind_bc_ptr[ljb];
+		if ( index ) {
+		    nrbl = index[0];   /* Number of row blocks. */
+		    len = index[1];    /* LDA of lusup[]. */
+		    lusup = Lnzval_bc_ptr[ljb];
+		    next_lind = BC_HEADER;
+		    next_lval = 0;
+		    for (jj = 0; jj < nrbl; ++jj) {
+			gb = index[next_lind++];
+			len1 = index[next_lind++]; /* Rows in the block. */
+			lb = LBi( gb, grid );
+			for (bnnz = 0; bnnz < len1; ++bnnz) {
+			    irow = index[next_lind++]; /* Global index. */
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    k = next_lval++;
+			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
+				lusup[k] = dense_col[irow];
+				dense_col[irow] = zero;
+				k += len;
+				dense_col += ldaspa;
+			    }
+			} /* for bnnz ... */
+		    } /* for jj ... */
+		} /* if index ... */
+
+	    } /* if mycol == pc */
+	} /* for jb ... */
+
+	SUPERLU_FREE(dense);
+	SUPERLU_FREE(Urb_length);
+	SUPERLU_FREE(Urb_indptr);
+
+    } else {
+	/* No L and U data structures are available yet.
+	   We need to set up the L and U data structures and propagate
+	   the values of A into them.          */
+	lsub = Glu_freeable->lsub;    /* compressed L subscripts */
+	xlsub = Glu_freeable->xlsub;
+	usub = Glu_freeable->usub;    /* compressed U subscripts */
+	xusub = Glu_freeable->xusub;
+    
+	if ( !(ToRecv = intCalloc_dist(nsupers)) )
+	    ABORT("Calloc fails for ToRecv[].");
+
+	k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
+	if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    ABORT("Malloc fails for ToSendR[].");
+	j = k * grid->npcol;
+	if ( !(index = intMalloc_dist(j)) )
+	    ABORT("Malloc fails for index[].");
+#if ( PRNTlevel>=1 )
+	mem_use = k*sizeof(int_t*) + (j + nsupers)*iword;
+#endif
+	for (i = 0; i < j; ++i) index[i] = EMPTY;
+	for (i = 0, j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
+
+	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+
+	/* Pointers to the beginning of each block row of U. */
+	if ( !(Unzval_br_ptr =
+	       (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
+	    ABORT("Malloc fails for Unzval_br_ptr[].");
+	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
+	
+	if ( !(ToSendD = intCalloc_dist(k)) )
+	    ABORT("Malloc fails for ToSendD[].");
+	if ( !(ilsum = intMalloc_dist(k+1)) )
+        ABORT("Malloc fails for ilsum[].");
+
+	/* Auxiliary arrays used to set up U block data structures.
+	   They are freed on return. */
+	if ( !(rb_marker = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for rb_marker[].");
+	if ( !(Urb_length = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Urb_length[].");
+	if ( !(Urb_indptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Urb_indptr[].");
+	if ( !(Urb_fstnz = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Urb_fstnz[].");
+	if ( !(Ucbs = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Ucbs[].");
+#if ( PRNTlevel>=1 )	
+	mem_use = 2*k*sizeof(int_t*) + (7*k+1)*iword;
+#endif
+	/* Compute ldaspa and ilsum[]. */
+	ldaspa = 0;
+	ilsum[0] = 0;
+	for (gb = 0; gb < nsupers; ++gb) {
+	    if ( myrow == PROW( gb, grid ) ) {
+		i = SuperSize( gb );
+		ldaspa += i;
+		lb = LBi( gb, grid );
+		ilsum[lb + 1] = ilsum[lb] + i;
+	    }
+	}
+	
+            
+	/* ------------------------------------------------------------
+	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
+	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
+	   ------------------------------------------------------------*/
+	
+	/* Loop through each supernode column. */
+	for (jb = 0; jb < nsupers; ++jb) {
+	    pc = PCOL( jb, grid );
+	    fsupc = FstBlockC( jb );
+	    nsupc = SuperSize( jb );
+	    /* Loop through each column in the block. */
+	    for (j = fsupc; j < fsupc + nsupc; ++j) {
+		/* usub[*] contains only "first nonzero" in each segment. */
+		for (i = xusub[j]; i < xusub[j+1]; ++i) {
+		    irow = usub[i]; /* First nonzero of the segment. */
+		    gb = BlockNum( irow );
+		    kcol = PCOL( gb, grid );
+		    ljb = LBj( gb, grid );
+		    if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
+		    pr = PROW( gb, grid );
+		    lb = LBi( gb, grid );
+		    if ( mycol == pc ) {
+			if  ( myrow == pr ) {
+			    ToSendD[lb] = YES;
+			    /* Count nonzeros in entire block row. */
+			    Urb_length[lb] += FstBlockC( gb+1 ) - irow;
+			    if (rb_marker[lb] <= jb) {/* First see the block */
+				rb_marker[lb] = jb + 1;
+				Urb_fstnz[lb] += nsupc;
+				++Ucbs[lb]; /* Number of column blocks
+					       in block row lb. */
+#if ( PRNTlevel>=1 )
+				++nUblocks;
+#endif
+			    }
+			    ToRecv[gb] = 1;
+			} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
+		    }
+		} /* for i ... */
+	    } /* for j ... */
+	} /* for jb ... */
+	
+	/* Set up the initial pointers for each block row in U. */
+	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+	for (lb = 0; lb < nrbu; ++lb) {
+	    len = Urb_length[lb];
+	    rb_marker[lb] = 0; /* Reset block marker. */
+	    if ( len ) {
+		/* Add room for descriptors */
+		len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
+		if ( !(index = intMalloc_dist(len1+1)) )
+		    ABORT("Malloc fails for Uindex[].");
+		Ufstnz_br_ptr[lb] = index;
+		/* XSL 4-23-01 */
+		if ( !(Unzval_br_ptr[lb] = doublecomplexCalloc_dist(len)) )
+		    ABORT("Calloc fails for Unzval_br_ptr[*][].");
+		mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
+		mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
+		index[0] = Ucbs[lb]; /* Number of column blocks */
+		index[1] = len;      /* Total length of nzval[] */
+		index[2] = len1;     /* Total length of index[] */
+		index[len1] = -1;    /* End marker */
+	    } else {
+		Ufstnz_br_ptr[lb] = NULL;
+		Unzval_br_ptr[lb] = NULL;
+	    }
+	    Urb_length[lb] = 0; /* Reset block length. */
+	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
+	} /* for lb ... */
+
+	SUPERLU_FREE(Urb_fstnz);
+	SUPERLU_FREE(Ucbs);
+#if ( PRNTlevel>=1 )
+        mem_use -= 2*k * iword;
+#endif
+	/* Auxiliary arrays used to set up L block data structures.
+	   They are freed on return.
+	   k is the number of local row blocks.   */
+	if ( !(Lrb_length = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for Lrb_length[].");
+	if ( !(Lrb_number = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_number[].");
+	if ( !(Lrb_indptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_indptr[].");
+	if ( !(Lrb_valptr = intMalloc_dist(k)) )
+	    ABORT("Malloc fails for Lrb_valptr[].");
+	if ( !(dense = doublecomplexCalloc_dist(ldaspa * sp_ienv_dist(3))) )
+	    ABORT("Calloc fails for SPA dense[].");
+
+	/* These counts will be used for triangular solves. */
+	if ( !(fmod = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for fmod[].");
+	if ( !(bmod = intCalloc_dist(k)) )
+	    ABORT("Calloc fails for bmod[].");
+#if ( PRNTlevel>=1 )	
+	mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*zword;
+#endif
+	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+
+	/* Pointers to the beginning of each block column of L. */
+	if ( !(Lnzval_bc_ptr =
+	       (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
+	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
+	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
+	Lrowind_bc_ptr[k-1] = NULL;
+
+	/* These lists of processes will be used for triangular solves. */
+	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    ABORT("Malloc fails for fsendx_plist[].");
+	len = k * grid->nprow;
+	if ( !(index = intMalloc_dist(len)) )
+	    ABORT("Malloc fails for fsendx_plist[0]");
+	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
+	    fsendx_plist[i] = &index[j];
+	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    ABORT("Malloc fails for bsendx_plist[].");
+	if ( !(index = intMalloc_dist(len)) )
+	    ABORT("Malloc fails for bsendx_plist[0]");
+	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
+	    bsendx_plist[i] = &index[j];
+#if ( PRNTlevel>=1 )
+	mem_use += 4*k*sizeof(int_t*) + 2*len*iword;
+#endif
+
+	/*------------------------------------------------------------
+	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
+	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
+	  ------------------------------------------------------------*/
+
+	for (jb = 0; jb < nsupers; ++jb) {
+	    pc = PCOL( jb, grid );
+	    if ( mycol == pc ) { /* Block column jb in my process column */
+		fsupc = FstBlockC( jb );
+		nsupc = SuperSize( jb );
+		ljb = LBj( jb, grid ); /* Local block number */
+		
+		/* Scatter A into SPA. */
+		for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){
+		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
+			irow = asub[i];
+			if ( irow < fsupc ) continue; /* Skip U. XSL 4-23-01 */
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    dense_col[irow] = a[i];
+			}
+		    }
+		    dense_col += ldaspa;
+		}
+
+		jbrow = PROW( jb, grid );
+
+		/*------------------------------------------------
+		 * SET UP U BLOCKS.
+		 *------------------------------------------------*/
+
+		kseen = 0;
+		/* Loop through each column in the block column. */
+		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
+		    istart = xusub[j];
+		    for (i = istart; i < xusub[j+1]; ++i) {
+			irow = usub[i]; /* First nonzero in the segment. */
+			gb = BlockNum( irow );
+			pr = PROW( gb, grid );
+			if ( pr != jbrow ) 
+			    bsendx_plist[ljb][pr] = YES;
+			if ( myrow == pr ) {
+			    lb = LBi( gb, grid ); /* Local block number */
+			    index = Ufstnz_br_ptr[lb];
+			    if (rb_marker[lb] <= jb) {/* First see the block */
+				rb_marker[lb] = jb + 1;
+				index[Urb_indptr[lb]] = jb; /* Descriptor */
+				/* Initialize block length to 0. XSL 4-23-01 */
+				index[Urb_indptr[lb]+1] = 0;
+				Urb_indptr[lb] += UB_DESCRIPTOR;
+				len = Urb_indptr[lb];
+				for (k = 0; k < nsupc; ++k)
+				    index[len+k] = FstBlockC( gb+1 );
+				if ( gb != jb )/* Exclude diagonal block. */
+				    ++bmod[lb];/* Mod. count for back solve */
+				if ( kseen == 0 && myrow != jbrow ) {
+				    ++nbrecvx;
+				    kseen = 1;
+				}
+			    } else {
+				len = Urb_indptr[lb];/* Start fstnz in index */
+			    }
+			    jj = j - fsupc;
+			    index[len+jj] = irow;
+			} /* if myrow == pr ... */
+		    } /* for i ... */
+		} /* for j ... */
+#if 1
+		/* XSL 4-23-01 */
+		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
+		    /* Gather the initial values of A directly into Uval.
+		       (No SPA is involved.)    */
+		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
+			irow = asub[i]; 
+			if ( irow >= fsupc ) continue; /* Skip L */
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+			    index = Ufstnz_br_ptr[lb];
+			    uval = Unzval_br_ptr[lb];
+			    len = Urb_indptr[lb];
+			    jj = index[len]; /* First nonzero in segment */
+			    uval[Urb_length[lb] + irow - jj] = a[i];
+			}
+		    }
+		    /* Now increment the index pointer for each row block */
+		    for (lb = 0; lb < nrbu; ++lb) {
+			if ( rb_marker[lb] == jb+1 ) { /* Not an empty block */
+			    gb = lb*grid->nprow + myrow; /* Global block # */
+			    index = Ufstnz_br_ptr[lb];
+			    jj = index[Urb_indptr[lb]];
+			    k = FstBlockC( gb+1 ) - jj;
+			    Urb_length[lb] += k;
+			    /* Increment the block length */
+			    index[Urb_indptr[lb]+fsupc-j-1] += k;
+			    Urb_indptr[lb] += 1;
+			}
+		    }
+		} /* for j = fsupc ... */
+#else
+		/* Figure out how many nonzeros in each block, and gather
+		   the initial values of A from SPA into Uval. */
+		for (lb = 0; lb < nrbu; ++lb) {
+		    if ( rb_marker[lb] == jb + 1 ) { /* Not an empty block. */
+			index = Ufstnz_br_ptr[lb];
+			uval = Unzval_br_ptr[lb];
+			len = Urb_indptr[lb];
+			gb = lb * grid->nprow + myrow;/* Global block number */
+			k = FstBlockC( gb+1 );
+			irow = ilsum[lb] - FstBlockC( gb );
+			for (jj=0, bnnz=0, dense_col=dense; jj < nsupc; ++jj) {
+			    j = index[len+jj];  /* First nonzero in segment. */
+			    bnnz += k - j;
+			    for (i = j; i < k; ++i) {
+				uval[Urb_length[lb]++] = dense_col[irow + i];
+				dense_col[irow + i] = zero;
+			    }
+			    dense_col += ldaspa;
+			}
+			index[len-1] = bnnz; /* Set block length in Descriptor */
+			Urb_indptr[lb] += nsupc;
+		    }
+		} /* for lb ... */
+#endif
+
+		/*------------------------------------------------
+		 * SET UP L BLOCKS.
+		 *------------------------------------------------*/
+
+		/* Count number of blocks and length of each block. */
+		nrbl = 0;
+		len = 0; /* Number of row subscripts I own. */
+		kseen = 0;
+		istart = xlsub[fsupc];
+		for (i = istart; i < xlsub[fsupc+1]; ++i) {
+		    irow = lsub[i];
+		    gb = BlockNum( irow ); /* Global block number */
+		    pr = PROW( gb, grid ); /* Process row owning this block */
+		    if ( pr != jbrow )
+			fsendx_plist[ljb][pr] = YES;
+		    if ( myrow == pr ) {
+			lb = LBi( gb, grid );  /* Local block number */
+			if (rb_marker[lb] <= jb) { /* First see this block */
+			    rb_marker[lb] = jb + 1;
+			    Lrb_length[lb] = 1;
+			    Lrb_number[nrbl++] = gb;
+			    if ( gb != jb ) /* Exclude diagonal block. */
+				++fmod[lb]; /* Mod. count for forward solve */
+			    if ( kseen == 0 && myrow != jbrow ) {
+				++nfrecvx;
+				kseen = 1;
+			    }
+#if ( PRNTlevel>=1 )
+			    ++nLblocks;
+#endif
+			} else {
+			    ++Lrb_length[lb];
+			}
+			++len;
+		    }
+		} /* for i ... */
+
+		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
+		    /* Set up the initial pointers for each block in 
+		       index[] and nzval[]. */
+		    /* Add room for descriptors */
+		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+		    if ( !(index = intMalloc_dist(len1)) ) 
+			ABORT("Malloc fails for index[]");
+		    Lrowind_bc_ptr[ljb] = index;
+		    if ( !(Lnzval_bc_ptr[ljb] = 
+			   doublecomplexMalloc_dist(len*nsupc)) ) {
+			fprintf(stderr, "col block %d ", jb);
+			ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
+		    }
+		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
+		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
+		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
+		    index[0] = nrbl;  /* Number of row blocks */
+		    index[1] = len;   /* LDA of the nzval[] */
+		    next_lind = BC_HEADER;
+		    next_lval = 0;
+		    for (k = 0; k < nrbl; ++k) {
+			gb = Lrb_number[k];
+			lb = LBi( gb, grid );
+			len = Lrb_length[lb];
+			Lrb_length[lb] = 0;  /* Reset vector of block length */
+			index[next_lind++] = gb; /* Descriptor */
+			index[next_lind++] = len; 
+			Lrb_indptr[lb] = next_lind;
+			Lrb_valptr[lb] = next_lval;
+			next_lind += len;
+			next_lval += len;
+		    }
+		    /* Propagate the compressed row subscripts to Lindex[], and
+		       the initial values of A from SPA into Lnzval[]. */
+		    lusup = Lnzval_bc_ptr[ljb];
+		    len = index[1];  /* LDA of lusup[] */
+		    for (i = istart; i < xlsub[fsupc+1]; ++i) {
+			irow = lsub[i];
+			gb = BlockNum( irow );
+			if ( myrow == PROW( gb, grid ) ) {
+			    lb = LBi( gb, grid );
+			    k = Lrb_indptr[lb]++; /* Random access a block */
+			    index[k] = irow;
+			    k = Lrb_valptr[lb]++;
+			    irow = ilsum[lb] + irow - FstBlockC( gb );
+			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
+				lusup[k] = dense_col[irow];
+				dense_col[irow] = zero;
+				k += len;
+				dense_col += ldaspa;
+			    }
+			}
+		    } /* for i ... */
+		} else {
+		    Lrowind_bc_ptr[ljb] = NULL;
+		    Lnzval_bc_ptr[ljb] = NULL;
+		} /* if nrbl ... */
+
+	    } /* if mycol == pc */
+
+	} /* for jb ... */
+
+	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+	Llu->Unzval_br_ptr = Unzval_br_ptr;
+	Llu->ToRecv = ToRecv;
+	Llu->ToSendD = ToSendD;
+	Llu->ToSendR = ToSendR;
+	Llu->fmod = fmod;
+	Llu->fsendx_plist = fsendx_plist;
+	Llu->nfrecvx = nfrecvx;
+	Llu->bmod = bmod;
+	Llu->bsendx_plist = bsendx_plist;
+	Llu->nbrecvx = nbrecvx;
+	Llu->ilsum = ilsum;
+	Llu->ldalsum = ldaspa;
+	
+#if ( PRNTlevel>=1 )
+	if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
+			   nLblocks, nUblocks);
+#endif
+
+	SUPERLU_FREE(rb_marker);
+	SUPERLU_FREE(Urb_length);
+	SUPERLU_FREE(Urb_indptr);
+	SUPERLU_FREE(Lrb_length);
+	SUPERLU_FREE(Lrb_number);
+	SUPERLU_FREE(Lrb_indptr);
+	SUPERLU_FREE(Lrb_valptr);
+	SUPERLU_FREE(dense);
+
+	/* Find the maximum buffer size. */
+	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
+		      MPI_MAX, grid->comm);
+
+    } /* else fact != SamePattern_SameRowPerm */
+
+#if ( DEBUGlevel>=1 )
+    /* Memory allocated but not freed:
+       ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
+    CHECK_MALLOC(iam, "Exit zdistribute()");
+#endif
+
+    return (mem_use);
+} /* ZDISTRIBUTE */
+
diff --git a/SRC/zgsequ_dist.c b/SRC/zgsequ_dist.c
new file mode 100644
index 0000000..3a3df5b
--- /dev/null
+++ b/SRC/zgsequ_dist.c
@@ -0,0 +1,193 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Computes row and column scalings
+ */
+
+/*
+ * File name:	zgsequ_dist.c
+ * History:     Modified from LAPACK routine ZGEEQU
+ */
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+
+<pre>   
+    Purpose   
+    =======   
+
+    ZGSEQU_DIST computes row and column scalings intended to equilibrate an   
+    M-by-N sparse matrix A and reduce its condition number. R returns the row
+    scale factors and C the column scale factors, chosen to try to make   
+    the largest element in each row and column of the matrix B with   
+    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
+
+    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
+    number and BIGNUM = largest safe number.  Use of these scaling   
+    factors is not guaranteed to reduce the condition number of A but   
+    works well in practice.   
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+ 
+    Arguments   
+    =========   
+
+    A       (input) SuperMatrix*
+            The matrix of dimension (A->nrow, A->ncol) whose equilibration
+            factors are to be computed. The type of A can be:
+            Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE.
+	    
+    R       (output) double*, size A->nrow
+            If INFO = 0 or INFO > M, R contains the row scale factors   
+            for A.
+	    
+    C       (output) double*, size A->ncol
+            If INFO = 0,  C contains the column scale factors for A.
+	    
+    ROWCND  (output) double*
+            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
+            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
+            AMAX is neither too large nor too small, it is not worth   
+            scaling by R.
+	    
+    COLCND  (output) double*
+            If INFO = 0, COLCND contains the ratio of the smallest   
+            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
+            worth scaling by C.
+	    
+    AMAX    (output) double*
+            Absolute value of largest matrix element.  If AMAX is very   
+            close to overflow or very close to underflow, the matrix   
+            should be scaled.
+	    
+    INFO    (output) int*
+            = 0:  successful exit   
+            < 0:  if INFO = -i, the i-th argument had an illegal value   
+            > 0:  if INFO = i,  and i is   
+                  <= M:  the i-th row of A is exactly zero   
+                  >  M:  the (i-M)-th column of A is exactly zero   
+
+    ===================================================================== 
+</pre>
+*/
+
+void
+zgsequ_dist(SuperMatrix *A, double *r, double *c, double *rowcnd,
+	    double *colcnd, double *amax, int_t *info)
+{
+
+    /* Local variables */
+    NCformat *Astore;
+    doublecomplex *Aval;
+    int_t i, j, irow; /* int_t (not int): no narrowing of colptr[]/rowind[] values when int_t is wider */
+    double rcmin, rcmax;
+    double bignum, smlnum;
+    
+    /* Test the input parameters. */
+    *info = 0;
+    if ( A->nrow < 0 || A->ncol < 0 ||
+	 A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
+	*info = -1;
+    if (*info != 0) {
+	int ierr = (int) -(*info); /* xerr_dist() is handed a plain int */
+	xerr_dist("zgsequ_dist", &ierr);
+	return;
+    }
+
+    /* Quick return if possible */
+    if ( A->nrow == 0 || A->ncol == 0 ) {
+	*rowcnd = 1.;
+	*colcnd = 1.;
+	*amax = 0.;
+	return;
+    }
+
+    Astore = (NCformat *) A->Store;
+    Aval = (doublecomplex *) Astore->nzval;
+    
+    /* Get machine constants. */
+    smlnum = dmach_dist("S");
+    bignum = 1. / smlnum;
+
+    /* Compute row scale factors. */
+    for (i = 0; i < A->nrow; ++i) r[i] = 0.;
+
+    /* Find the maximum element in each row. */
+    for (j = 0; j < A->ncol; ++j)
+	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+	    irow = Astore->rowind[i];
+	    r[irow] = SUPERLU_MAX( r[irow], slud_z_abs1(&Aval[i]) );
+	}
+
+    /* Find the maximum and minimum scale factors. */
+    rcmin = bignum;
+    rcmax = 0.;
+    for (i = 0; i < A->nrow; ++i) {
+	rcmax = SUPERLU_MAX(rcmax, r[i]);
+	rcmin = SUPERLU_MIN(rcmin, r[i]);
+    }
+    *amax = rcmax;
+
+    if (rcmin == 0.) {
+	/* Find the first zero scale factor and return an error code. */
+	for (i = 0; i < A->nrow; ++i)
+	    if (r[i] == 0.) {
+		*info = i + 1; /* 1-based row number; rowcnd, c and colcnd are not set on this path */
+		return;
+	    }
+    } else {
+	/* Invert the scale factors. */
+	for (i = 0; i < A->nrow; ++i)
+	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
+	/* Compute ROWCND = min(R(I)) / max(R(I)) */
+	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
+    }
+
+    /* Compute column scale factors */
+    for (j = 0; j < A->ncol; ++j) c[j] = 0.;
+
+    /* Find the maximum element in each column, assuming the row
+       scalings computed above. */
+    for (j = 0; j < A->ncol; ++j)
+	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+	    irow = Astore->rowind[i];
+	    c[j] = SUPERLU_MAX( c[j], slud_z_abs1(&Aval[i]) * r[irow] );
+	}
+
+    /* Find the maximum and minimum scale factors. */
+    rcmin = bignum;
+    rcmax = 0.;
+    for (j = 0; j < A->ncol; ++j) {
+	rcmax = SUPERLU_MAX(rcmax, c[j]);
+	rcmin = SUPERLU_MIN(rcmin, c[j]);
+    }
+
+    if (rcmin == 0.) {
+	/* Find the first zero scale factor and return an error code. */
+	for (j = 0; j < A->ncol; ++j)
+	    if ( c[j] == 0. ) {
+		*info = A->nrow + j + 1; /* offset by nrow so a zero column is distinguishable from a zero row */
+		return;
+	    }
+    } else {
+	/* Invert the scale factors. */
+	for (j = 0; j < A->ncol; ++j)
+	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
+	/* Compute COLCND = min(C(J)) / max(C(J)) */
+	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
+    }
+
+    return;
+
+} /* zgsequ_dist */
+
+
diff --git a/SRC/zlangs_dist.c b/SRC/zlangs_dist.c
new file mode 100644
index 0000000..d921d79
--- /dev/null
+++ b/SRC/zlangs_dist.c
@@ -0,0 +1,118 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Returns the one norm, or the Frobenius norm, or the infinity norm, or the element of largest value
+ */
+
+/*
+ * File name:	zlangs.c
+ * History:     Modified from lapack routine ZLANGE
+ */
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+
+<pre> 
+    Purpose   
+    =======   
+
+    ZLANGS_DIST returns the value of the one norm, or the Frobenius norm, or 
+    the infinity norm, or the element of largest absolute value of a 
+    complex matrix A.   
+
+    Description   
+    ===========   
+
+    ZLANGE returns the value   
+
+       ZLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
+                (   
+                ( norm1(A),         NORM = '1', 'O' or 'o'   
+                (   
+                ( normI(A),         NORM = 'I' or 'i'   
+                (   
+                ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
+
+    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
+    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
+    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
+    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
+
+    Arguments   
+    =========   
+
+    NORM    (input) CHARACTER*1   
+            Specifies the value to be returned in ZLANGE as described above.   
+    A       (input) SuperMatrix*
+            The M by N sparse matrix A. 
+
+   ===================================================================== 
+</pre>
+*/
+double zlangs_dist(char *norm, SuperMatrix *A)
+{
+    /* Local variables */
+    NCformat *Astore;
+    doublecomplex   *Aval;
+    int_t    i, j, irow; /* int_t (not int): no narrowing of colptr[]/rowind[] values when int_t is wider */
+    double   value=0., sum;
+    double   *rwork;
+
+    Astore = A->Store;
+    Aval   = Astore->nzval;
+    /* Only the first character of norm is examined, and the match is case-sensitive. */
+    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
+	value = 0.;
+	
+    } else if ( strncmp(norm, "M", 1)==0 ) {
+	/* Find max(abs(A(i,j))). */
+	value = 0.;
+	for (j = 0; j < A->ncol; ++j)
+	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
+		value = SUPERLU_MAX( value, slud_z_abs( &Aval[i]) );
+	
+    } else if ( strncmp(norm, "O", 1)==0 || *(unsigned char *)norm == '1') {
+	/* Find norm1(A). */
+	value = 0.;
+	for (j = 0; j < A->ncol; ++j) {
+	    sum = 0.;
+	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
+		sum += slud_z_abs( &Aval[i] );
+	    value = SUPERLU_MAX(value,sum);
+	}
+	
+    } else if ( strncmp(norm, "I", 1)==0 ) {
+	/* Find normI(A). */
+	if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) )
+	    ABORT("SUPERLU_MALLOC fails for rwork.");
+	for (i = 0; i < A->nrow; ++i) rwork[i] = 0.;
+	for (j = 0; j < A->ncol; ++j) /* accumulate per-row sums while sweeping the columns */
+	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) {
+		irow = Astore->rowind[i];
+		rwork[irow] += slud_z_abs( &Aval[i] );
+	    }
+	value = 0.;
+	for (i = 0; i < A->nrow; ++i)
+	    value = SUPERLU_MAX(value, rwork[i]);
+	
+	SUPERLU_FREE (rwork);
+	
+    } else if ( strncmp(norm, "F", 1)==0 || strncmp(norm, "E", 1)==0 ) {
+	/* Find normF(A). */
+	ABORT("Not implemented.");
+    } else
+	ABORT("Illegal norm specified.");
+
+    return (value);
+
+} /* zlangs_dist */
+
diff --git a/SRC/zlaqgs_dist.c b/SRC/zlaqgs_dist.c
new file mode 100644
index 0000000..1dd9163
--- /dev/null
+++ b/SRC/zlaqgs_dist.c
@@ -0,0 +1,145 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Equilibrates a general sparse M by N matrix A
+ */
+
+/*
+ * File name:	zlaqgs.c
+ * History:     Modified from LAPACK routine ZLAQGE
+ */
+#include <math.h>
+#include "superlu_zdefs.h"
+
+/*! \brief
+
+<pre>
+    Purpose   
+    =======   
+
+    ZLAQGS_DIST equilibrates a general sparse M by N matrix A using the row
+    and column scaling factors in the vectors R and C.   
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+
+    Arguments   
+    =========   
+
+    A       (input/output) SuperMatrix*
+            On exit, the equilibrated matrix.  See EQUED for the form of 
+            the equilibrated matrix. The type of A can be:
+	    Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE.
+	    
+    R       (input) double*, dimension (A->nrow)
+            The row scale factors for A.
+	    
+    C       (input) double*, dimension (A->ncol)
+            The column scale factors for A.
+	    
+    ROWCND  (input) double
+            Ratio of the smallest R(i) to the largest R(i).
+	    
+    COLCND  (input) double
+            Ratio of the smallest C(i) to the largest C(i).
+	    
+    AMAX    (input) double
+            Absolute value of largest matrix entry.
+	    
+    EQUED   (output) char*
+            Specifies the form of equilibration that was done.   
+            = 'N':  No equilibration   
+            = 'R':  Row equilibration, i.e., A has been premultiplied by  
+                    diag(R).   
+            = 'C':  Column equilibration, i.e., A has been postmultiplied  
+                    by diag(C).   
+            = 'B':  Both row and column equilibration, i.e., A has been
+                    replaced by diag(R) * A * diag(C).   
+
+    Internal Parameters   
+    ===================   
+
+    THRESH is a threshold value used to decide if row or column scaling   
+    should be done based on the ratio of the row or column scaling   
+    factors.  If ROWCND < THRESH, row scaling is done, and if   
+    COLCND < THRESH, column scaling is done.   
+
+    LARGE and SMALL are threshold values used to decide if row scaling   
+    should be done based on the absolute size of the largest matrix   
+    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
+
+    ===================================================================== 
+</pre>
+*/
+void
+zlaqgs_dist(SuperMatrix *A, double *r, double *c, 
+	    double rowcnd, double colcnd, double amax, char *equed)
+{
+#define THRESH    (0.1)
+    
+    /* Local variables */
+    NCformat *Astore;
+    doublecomplex   *Aval;
+    int_t i, j, irow; /* int_t (not int): no narrowing of colptr[]/rowind[] values when int_t is wider */
+    double large, small, cj;
+    double temp;
+
+    /* The nonzero values of A are scaled in place; *equed records which scaling was applied. */
+    /* Quick return if possible */
+    if (A->nrow <= 0 || A->ncol <= 0) {
+	*(unsigned char *)equed = 'N';
+	return;
+    }
+
+    Astore = (NCformat *) A->Store;
+    Aval = (doublecomplex *) Astore->nzval;
+    
+    /* Initialize LARGE and SMALL. */
+    small = dmach_dist("Safe minimum") / dmach_dist("Precision");
+    large = 1. / small;
+
+    if (rowcnd >= THRESH && amax >= small && amax <= large) {
+	if (colcnd >= THRESH)
+	    *(unsigned char *)equed = 'N';
+	else {
+	    /* Column scaling */
+	    for (j = 0; j < A->ncol; ++j) {
+		cj = c[j];
+		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+		    zd_mult(&Aval[i], &Aval[i], cj);
+                }
+	    }
+	    *(unsigned char *)equed = 'C';
+	}
+    } else if (colcnd >= THRESH) {
+	/* Row scaling, no column scaling */
+	for (j = 0; j < A->ncol; ++j)
+	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+		irow = Astore->rowind[i];
+		zd_mult(&Aval[i], &Aval[i], r[irow]);
+	    }
+	*(unsigned char *)equed = 'R';
+    } else {
+	/* Row and column scaling */
+	for (j = 0; j < A->ncol; ++j) {
+	    cj = c[j];
+	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+		irow = Astore->rowind[i];
+		temp = cj * r[irow]; /* combined factor for entry (irow, j) */
+		zd_mult(&Aval[i], &Aval[i], temp);
+	    }
+	}
+	*(unsigned char *)equed = 'B';
+    }
+
+    return;
+
+} /* zlaqgs_dist */
+
diff --git a/SRC/zldperm_dist.c b/SRC/zldperm_dist.c
new file mode 100644
index 0000000..8df8360
--- /dev/null
+++ b/SRC/zldperm_dist.c
@@ -0,0 +1,174 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Finds a row permutation so that the matrix has large entries on the diagonal
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+#include "superlu_zdefs.h"
+
+extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [],
+		    int_t*, int_t [], int_t*, int_t[], int_t*, double [],
+		    int_t [], int_t []);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ *   ZLDPERM finds a row permutation so that the matrix has large
+ *   entries on the diagonal.
+ *
+ * Arguments
+ * =========
+ *
+ * job    (input) int
+ *        Control the action. Possible values for JOB are:
+ *        = 1 : Compute a row permutation of the matrix so that the
+ *              permuted matrix has as many entries on its diagonal as
+ *              possible. The values on the diagonal are of arbitrary size.
+ *              HSL subroutine MC21A/AD is used for this.
+ *        = 2 : Compute a row permutation of the matrix so that the smallest 
+ *              value on the diagonal of the permuted matrix is maximized.
+ *        = 3 : Compute a row permutation of the matrix so that the smallest
+ *              value on the diagonal of the permuted matrix is maximized.
+ *              The algorithm differs from the one used for JOB = 2 and may
+ *              have quite a different performance.
+ *        = 4 : Compute a row permutation of the matrix so that the sum
+ *              of the diagonal entries of the permuted matrix is maximized.
+ *        = 5 : Compute a row permutation of the matrix so that the product
+ *              of the diagonal entries of the permuted matrix is maximized
+ *              and vectors to scale the matrix so that the nonzero diagonal 
+ *              entries of the permuted matrix are one in absolute value and 
+ *              all the off-diagonal entries are less than or equal to one in 
+ *              absolute value.
+ *        Restriction: 1 <= JOB <= 5.
+ *
+ * n      (input) int
+ *        The order of the matrix.
+ *
+ * nnz    (input) int
+ *        The number of nonzeros in the matrix.
+ *
+ * adjncy (input) int*, of size nnz
+ *        The adjacency structure of the matrix, which contains the row
+ *        indices of the nonzeros.
+ *
+ * colptr (input) int*, of size n+1
+ *        The pointers to the beginning of each column in ADJNCY.
+ *
+ * nzval  (input) doublecomplex*, of size nnz
+ *        The nonzero values of the matrix. nzval[k] is the value of
+ *        the entry corresponding to adjncy[k].
+ *        It is not used if job = 1.
+ *
+ * perm   (output) int*, of size n
+ *        The permutation vector. perm[i] = j means row i in the
+ *        original matrix is in row j of the permuted matrix.
+ *
+ * u      (output) double*, of size n
+ *        If job = 5, the natural logarithms of the row scaling factors. 
+ *
+ * v      (output) double*, of size n
+ *        If job = 5, the natural logarithms of the column scaling factors. 
+ *        The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j).
+ * </pre>
+ */
+
+int
+zldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[],
+	doublecomplex nzval[], int_t *perm, double u[], double v[])
+{ 
+    int_t i, liw, ldw, num;
+    int_t *iw, icntl[10], info[10];
+    double *dw;
+    double *nzval_abs = doubleMalloc_dist(nnz); /* slud_z_abs1() of each entry, passed to MC64AD */
+    if ( !nzval_abs && nnz > 0 ) ABORT("Malloc fails for nzval_abs[]");
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(0, "Enter zldperm_dist()");
+#endif
+    liw = 5*n;
+    if ( job == 3 ) liw = 10*n + nnz;
+    if ( !(iw = intMalloc_dist(liw)) ) ABORT("Malloc fails for iw[]");
+    ldw = 3*n + nnz;
+    if ( !(dw = doubleMalloc_dist(ldw)) ) ABORT("Malloc fails for dw[]");
+	    
+    /* Increment one to get 1-based indexing (undone again before returning). */
+    for (i = 0; i <= n; ++i) ++colptr[i];
+    for (i = 0; i < nnz; ++i) ++adjncy[i];
+#if ( DEBUGlevel>=2 )
+    printf("LDPERM(): n " IFMT ", nnz " IFMT "\n", n, nnz);
+    PrintInt10("colptr", n+1, colptr);
+    PrintInt10("adjncy", nnz, adjncy);
+#endif
+	
+    /* 
+     * NOTE:
+     * =====
+     *
+     * MC64AD assumes that column permutation vector is defined as:
+     * perm(i) = j means column i of permuted A is in column j of original A.
+     *
+     * Since a symmetric permutation preserves the diagonal entries, the
+     * relation
+     *     P'(A*P')P = P'A
+     * shows we can apply inverse(perm) to rows of A to get large diagonal
+     * entries. But since 'perm' as defined in MC64AD happens to be the
+     * reverse of SuperLU's definition of a permutation vector, it is already
+     * an inverse for our purpose. We will thus use it directly.
+     *
+     */
+    mc64id_dist(icntl);
+    /* Suppress error and warning messages. */
+    icntl[0] = -1;
+    icntl[1] = -1;
+
+    for (i = 0; i < nnz; ++i) nzval_abs[i] = slud_z_abs1(&nzval[i]);
+    mc64ad_dist(&job, &n, &nnz, colptr, adjncy, nzval_abs, &num, perm,
+	        &liw, iw, &ldw, dw, icntl, info);
+
+#if ( DEBUGlevel>=2 )
+    PrintInt10("perm", n, perm);
+    printf(".. After MC64AD info " IFMT "\tsize of matching " IFMT "\n", info[0], num);
+#endif
+    if ( info[0] == 1 ) { /* Structurally singular */
+        printf(".. The last " IFMT " permutations:\n", n-num);
+	PrintInt10("perm", n-num, &perm[num]);
+    }
+
+    /* Restore to 0-based indexing. */
+    for (i = 0; i <= n; ++i) --colptr[i];
+    for (i = 0; i < nnz; ++i) --adjncy[i];
+    for (i = 0; i < n; ++i) --perm[i];
+
+    if ( job == 5 ) /* scaling vectors are read from the first 2*n entries of dw[] */
+        for (i = 0; i < n; ++i) {
+	    u[i] = dw[i];
+	    v[i] = dw[n+i];
+	}
+
+    SUPERLU_FREE(iw);
+    SUPERLU_FREE(dw);
+    SUPERLU_FREE(nzval_abs);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(0, "Exit zldperm_dist()");
+#endif
+   return (info[0]); /* MC64AD status, returned to the caller unchanged */
+}
+
diff --git a/SRC/zlook_ahead_update.c b/SRC/zlook_ahead_update.c
new file mode 100644
index 0000000..683f0af
--- /dev/null
+++ b/SRC/zlook_ahead_update.c
@@ -0,0 +1,250 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/************************************************************************/
+/*! @file 
+ * \brief Look-ahead update of the Schur complement.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+#ifdef ISORT
+while (j < nub && iperm_u[j] <= k0 + num_look_aheads)
+#else
+while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
+#endif
+{
+    doublecomplex zero = {0.0, 0.0};
+
+    /* Search along the row for the pointers {iukp, rukp} pointing to
+     * block U(k,j).
+     * j    -- current block in look-ahead window, initialized to 0 on entry
+     * iukp -- point to the start of index[] metadata
+     * rukp -- point to the start of nzval[] array
+     * jb   -- block number of block U(k,j), update destination column
+     */
+    arrive_at_ublock(
+		     j, &iukp, &rukp, &jb, &ljb, &nsupc,
+         	     iukp0, rukp0, usub, perm_u, xsup, grid
+		    );
+    j++;
+    jj0++;
+    jj = iukp;
+
+    while (usub[jj] == klst) ++jj; /* Skip zero segments */
+
+    ldu = klst - usub[jj++]; /* length of the first nonzero segment; raised below if a longer one follows */
+    ncols = 1;
+    full = 1; /* flag the U block is indeed 'full', containing segments
+                 of same length. No need padding 0.  */
+    for (; jj < iukp + nsupc; ++jj) { /* for each column jj in block U(k,j) */
+        segsize = klst - usub[jj];
+        if (segsize) {
+            ++ncols;
+            if (segsize != ldu) full = 0; /* need padding 0 */
+            if (segsize > ldu)  ldu = segsize;
+        }
+    }
+#if ( DEBUGlevel>=3 )
+    ++num_update;
+#endif
+    if (0) { /* disabled branch: uval is never used in place, the block is always copied */
+        tempu = &uval[rukp];
+    }
+    else { /* Copy block U(k,j) into bigU, padding zeros. */
+#if ( DEBUGlevel>=3 )
+        printf ("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+                iam, full, k, jb, ldu, ncols, nsupc);
+        ++num_copy;
+#endif
+        tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
+        for (jj = iukp; jj < iukp + nsupc; ++jj) {
+            segsize = klst - usub[jj];
+            if (segsize) {
+                lead_zero = ldu - segsize; /* zeros placed ahead of a segment shorter than ldu */
+                for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+                tempu += lead_zero;
+                for (i = 0; i < segsize; ++i) {
+                    tempu[i] = uval[rukp + i];
+                }
+                rukp += segsize;
+                tempu += segsize;
+            }
+        }
+        tempu = bigU;
+        rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+    } /* if (0) ... else ... */
+
+    nbrow = lsub[1]; /* number of row subscripts in L(:,k) */
+    if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows */
+    // double ttx =SuperLU_timer_();
+
+    int current_b = 0; /* Each thread starts searching from first block.
+                          This records the moving search target.           */
+    lptr = lptr0; /* point to the start of index[] in supernode L(:,k) */
+    luptr = luptr0;
+
+#ifdef _OPENMP
+    /* Sherry -- examine all the shared variables ??
+       'firstprivate' ensures that the private variables are initialized
+       to the values before entering the loop  */
+#pragma omp parallel for \
+    firstprivate(lptr,luptr,ib,tempv,current_b)	private(lb) \
+    default(shared) schedule(dynamic)
+#endif
+    for (lb = 0; lb < nlb; lb++) { /* Loop through each block in L(:,k) */
+        int temp_nbrow; /* automatic variable is private */
+
+        /* Search for the L block that my thread will work on.
+           No need to search from 0, can continue at the point where
+           it is left from last iteration.
+           Note: Blocks may not be sorted in L. Different thread picks up
+	   different lb.   */
+        for (; current_b < lb; ++current_b) {
+            temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;   /* move to next block */
+            luptr += temp_nbrow;  /* move to next block */
+        }
+
+#ifdef _OPENMP        
+        int_t thread_id = omp_get_thread_num ();
+#else
+        int_t thread_id = 0;
+#endif
+        doublecomplex * tempv = bigV + ldt*ldt*thread_id;
+        /* NOTE(review): this local tempv shadows the tempv named in the firstprivate clause above. */
+        int *indirect_thread  = indirect + ldt * thread_id;
+        int *indirect2_thread = indirect2 + ldt * thread_id;        
+        ib = lsub[lptr];        /* block number of L(i,k) */
+        temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
+	/* assert (temp_nbrow <= nbrow); */
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+
+        /* calling gemm */
+#if defined (USE_VENDOR_BLAS)
+        zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                   &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
+                   tempu, &ldu, &beta, tempv, &temp_nbrow, 1, 1);
+#else
+        zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                   &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
+                   tempu, &ldu, &beta, tempv, &temp_nbrow );
+#endif
+
+        /* Now scattering the output*/
+        if (ib < jb) {    /* A(i,j) is in U. */
+            zscatter_u (ib, jb,
+                       nsupc, iukp, xsup,
+                       klst, temp_nbrow,
+                       lptr, temp_nbrow, lsub,
+                       usub, tempv, Ufstnz_br_ptr, Unzval_br_ptr, grid);
+        } else {          /* A(i,j) is in L. */
+            zscatter_l (ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+                       temp_nbrow, usub, lsub, tempv,
+                       indirect_thread, indirect2_thread, 
+                       Lrowind_bc_ptr, Lnzval_bc_ptr, grid);
+        }
+
+        ++current_b;         /* move to next block */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+
+    } /* end parallel for lb = 0, nlb ... all blocks in L(:,k) */
+
+    rukp += usub[iukp - 1]; /* Move to next U block, U(k,j+1) */
+    iukp += nsupc;
+
+    /* =========================================== *
+     * == factorize L(:,j) and send if possible == *
+     * =========================================== */
+    kk = jb; /* destination column that is just updated */
+    kcol = PCOL (kk, grid);
+#ifdef ISORT
+    kk0 = iperm_u[j - 1];
+#else
+    kk0 = perm_u[2 * (j - 1)];
+#endif
+    look_id = kk0 % (1 + num_look_aheads);
+
+    if (look_ahead[kk] == k0 && kcol == mycol) {
+        /* current column is the last dependency */
+        look_id = kk0 % (1 + num_look_aheads);
+
+        /* Factor diagonal and subdiagonal blocks and test for exact
+           singularity.  */
+        factored[kk] = 0;
+        /* double ttt1 = SuperLU_timer_(); */
+#if ( VAMPIR>=1 )
+        VT_begin (5);
+#endif
+
+        PZGSTRF2(options, kk0, kk, thresh, Glu_persist, grid, Llu,
+                  U_diag_blk_send_req, tag_ub, stat, info);
+
+#if ( VAMPIR>=1 )
+        VT_end (5);
+#endif
+        /* stat->time7 += SuperLU_timer_() - ttt1; */
+
+        /* Multicasts numeric values of L(:,kk) to process rows. */
+        send_req = send_reqs[look_id];
+        msgcnt = msgcnts[look_id];
+
+        lk = LBj (kk, grid);    /* Local block number. */
+        lsub1 = Lrowind_bc_ptr[lk];
+        lusup1 = Lnzval_bc_ptr[lk];
+        if (lsub1) {
+            msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR;
+            msgcnt[1] = lsub1[1] * SuperSize (kk);
+        } else { /* no local L(:,kk) data: both message counts are zero */
+            msgcnt[0] = 0;
+            msgcnt[1] = 0;
+        }
+
+        scp = &grid->rscp;      /* The scope of process row. */
+        for (pj = 0; pj < Pc; ++pj) {
+            if (ToSendR[lk][pj] != EMPTY) {
+#if ( PROFlevel>=1 )
+                TIC (t1);
+#endif
+#if ( VAMPIR>=1 )
+                VT_begin (1);
+#endif
+                MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
+                           SLU_MPI_TAG (0, kk0) /* (4*kk0)%tag_ub */ ,
+                           scp->comm, &send_req[pj]);
+                MPI_Isend (lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
+                           SLU_MPI_TAG (1, kk0) /* (4*kk0+1)%tag_ub */ ,
+                           scp->comm, &send_req[pj + Pc]);
+#if ( VAMPIR>=1 )
+                VT_end (1);
+#endif
+#if ( PROFlevel>=1 )
+                TOC (t2, t1);
+                stat->utime[COMM] += t2;
+                msg_cnt += 2;
+                msg_vol += msgcnt[0] * iword + msgcnt[1] * dword;
+#endif
+#if ( DEBUGlevel>=2 )
+                printf ("[%d] -2- Send L(:,%4d): #lsub %4d, #lusup %4d to Pj %2d, tags %d:%d \n",
+                        iam, kk, msgcnt[0], msgcnt[1], pj,
+			SLU_MPI_TAG(0,kk0), SLU_MPI_TAG(1,kk0));
+#endif
+            }  /* end if ( ToSendR[lk][pj] != EMPTY ) */
+        } /* end for pj ... */
+    } /* end if( look_ahead[kk] == k0 && kcol == mycol ) */
+} /* end while j < nub and perm_u[j] <k0+NUM_LOOK_AHEAD */
+
diff --git a/SRC/zmemory_dist.c b/SRC/zmemory_dist.c
new file mode 100644
index 0000000..896c06d
--- /dev/null
+++ b/SRC/zmemory_dist.c
@@ -0,0 +1,168 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Memory utilities
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+
+#include "superlu_zdefs.h"
+
+
+/* Variables external to this file */
+extern LU_stack_t stack;
+
+
+void *zuser_malloc_dist(int_t bytes, int_t which_end)
+{
+    /* Hand out `bytes` from the global work stack, at its head or its tail. */
+    char *base = (char*) stack.array;
+    void *chunk;
+
+    if ( StackFull(bytes) ) return (NULL);  /* StackFull() rejected the request */
+    if ( which_end != HEAD ) {
+	stack.top2 -= bytes;                /* tail end: lower top2, then hand out */
+	chunk = base + stack.top2;
+    } else {
+	chunk = base + stack.top1;          /* head end: hand out, then raise top1 */
+	stack.top1 += bytes;
+    }
+    stack.used += bytes;
+    return chunk;
+}
+
+
+void zuser_free_dist(int_t bytes, int_t which_end)
+{
+    /* Give `bytes` back to the chosen end of the global work stack. */
+    if ( which_end != HEAD )
+	stack.top2 += bytes;
+    else
+	stack.top1 -= bytes;
+    stack.used -= bytes;
+}
+
+
+
+/*! \brief
+ *
+ * <pre>
+ * mem_usage consists of the following fields:
+ *    - for_lu (float)
+ *      The amount of space used in bytes for the L\U data structures.
+ *    - total (float)
+ *      The amount of space needed in bytes to perform factorization.
+ *    - expansions (int)
+ *      Number of memory expansions during the LU factorization.
+ * </pre>
+ */
+int_t zQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
+		       SuperLUStat_t *stat, superlu_dist_mem_usage_t *mem_usage)
+{
+    register int_t dword, gb, iword, k, nb, nsupers;
+    int_t *index, *xsup;
+    int iam, mycol, myrow;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    iword = sizeof(int_t);          /* bytes per index entry */
+    dword = sizeof(doublecomplex);  /* bytes per numerical value */
+    nsupers = Glu_persist->supno[n-1] + 1; /* supernode id of the last column, plus one */
+    xsup = Glu_persist->xsup; /* not referenced by name below; presumably read by the SuperSize() macro -- TODO confirm */
+    mem_usage->for_lu = 0.;
+
+    /* For L factor: add index and value storage of every local block column */
+    nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
+    for (k = 0; k < nb; ++k) {
+	gb = k * grid->npcol + mycol; /* Global block number. */
+	if ( gb < nsupers ) {
+	    index = Llu->Lrowind_bc_ptr[k];
+	    if ( index ) { /* a NULL entry contributes nothing */
+		mem_usage->for_lu += (float)
+		    ((BC_HEADER + index[0]*LB_DESCRIPTOR + index[1]) * iword);
+		mem_usage->for_lu += (float)(index[1]*SuperSize( gb )*dword);
+	    }
+	}
+    }
+
+    /* For U factor: index[2] index entries and index[1] values per local block row */
+    nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */
+    for (k = 0; k < nb; ++k) {
+	gb = k * grid->nprow + myrow; /* Global block number. */
+	if ( gb < nsupers ) {
+	    index = Llu->Ufstnz_br_ptr[k];
+	    if ( index ) { /* a NULL entry contributes nothing */
+		mem_usage->for_lu += (float)(index[2] * iword);
+		mem_usage->for_lu += (float)(index[1] * dword);
+	    }
+	}
+    }
+
+    /* Working storage to support factorization */
+    mem_usage->total = mem_usage->for_lu;
+#if 0
+    mem_usage->total +=
+	(float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword +
+		( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword );
+    /**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/
+    mem_usage->total +=
+	(float)( Llu->bufmax[0] * iword +  Llu->bufmax[1] * dword );
+    mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword;
+    k = CEILING( nsupers, grid->nprow );
+    mem_usage->total += (float)(2 * k * iword);
+#else
+    /*mem_usage->total += stat->current_buffer;*/
+    printf(".. zQuery_Space: peak_buffer %.2f (MB)\n", stat->peak_buffer * 1.0e-6); /* printed unconditionally by every caller */
+    mem_usage->total += stat->peak_buffer; /* total = for_lu + recorded peak buffer size */
+#endif
+
+    return 0; /* always 0; the results are delivered through *mem_usage */
+} /* zQuerySpace_dist */
+
+
+/*
+ * Allocate storage for original matrix A; ABORTs when an allocation fails.
+ */
+void
+zallocateA_dist(int_t n, int_t nnz, doublecomplex **a, int_t **asub, int_t **xa)
+{
+    if ( !(*a = (doublecomplex *) doublecomplexMalloc_dist(nnz)) ) ABORT("Malloc fails for a[]");   /* nnz values */
+    if ( !(*asub = (int_t *) intMalloc_dist(nnz)) ) ABORT("Malloc fails for asub[]");               /* nnz row indices */
+    if ( !(*xa = (int_t *) intMalloc_dist(n+1)) ) ABORT("Malloc fails for xa[]");                   /* n+1 column pointers */
+}
+
+
+doublecomplex *doublecomplexMalloc_dist(int_t n)
+{
+    /* Uninitialized room for max(1, n) entries; SUPERLU_MALLOC's result is passed on as is. */
+    return (doublecomplex *)
+        SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(doublecomplex) );
+}
+
+doublecomplex *doublecomplexCalloc_dist(int_t n)
+{
+    doublecomplex zero = {0.0, 0.0};   /* value stored in the first n entries */
+    doublecomplex *buf;
+    register int_t k = 0;
+    buf = (doublecomplex *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(doublecomplex));
+    if ( buf )                         /* a failed allocation is returned untouched */
+        while (k < n) buf[k++] = zero;
+    return (buf);
+}
+
diff --git a/SRC/zmyblas2_dist.c b/SRC/zmyblas2_dist.c
new file mode 100644
index 0000000..9fb79b0
--- /dev/null
+++ b/SRC/zmyblas2_dist.c
@@ -0,0 +1,208 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Level 2 BLAS operations: solves and matvec, written in C
+ *
+ * <pre>
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * </pre>
+ */
+/*
+ * File name:		zmyblas2.c
+ * Purpose:
+ *     Level 2 BLAS operations: solves and matvec, written in C.
+ * Note:
+ *     This is only used when the system lacks an efficient BLAS library.
+ */
+#include "dcomplex.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Solves a dense UNIT lower triangular system. The unit lower 
+ * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
+ * The solution will be returned in the rhs vector.
+ * </pre>
+ */
+void zlsolve ( int ldm, int ncol, doublecomplex *M, doublecomplex *rhs )
+{
+    int k;                               /* index of the rhs entry being updated */
+    doublecomplex x0, x1, x2, x3, temp;  /* unknowns of the current column group; temp holds one product */
+    doublecomplex *M0;                   /* diagonal entry at the start of the current column group */
+    doublecomplex *Mki0, *Mki1, *Mki2, *Mki3; /* cursors walking down the group's columns */
+    register int firstcol = 0;           /* first column not yet eliminated */
+
+    M0 = &M[0];
+    /* Forward substitution, in place on rhs[]: groups of 4 columns, then */
+    /* one group of 2; ldm is the stride from one column of M to the next. */
+    while ( firstcol < ncol - 3 ) { /* Do 4 columns */
+      	Mki0 = M0 + 1;            /* first sub-diagonal entry of column firstcol */
+      	Mki1 = Mki0 + ldm + 1;    /* ... of column firstcol+1 */
+      	Mki2 = Mki1 + ldm + 1;    /* ... of column firstcol+2 */
+      	Mki3 = Mki2 + ldm + 1;    /* ... of column firstcol+3 */
+
+      	x0 = rhs[firstcol];       /* unit diagonal: no division needed */
+      	zz_mult(&temp, &x0, Mki0); Mki0++;
+      	z_sub(&x1, &rhs[firstcol+1], &temp);
+      	zz_mult(&temp, &x0, Mki0); Mki0++;
+	z_sub(&x2, &rhs[firstcol+2], &temp);
+	zz_mult(&temp, &x1, Mki1); Mki1++;
+	z_sub(&x2, &x2, &temp);
+      	zz_mult(&temp, &x0, Mki0); Mki0++;
+	z_sub(&x3, &rhs[firstcol+3], &temp);
+	zz_mult(&temp, &x1, Mki1); Mki1++;
+	z_sub(&x3, &x3, &temp);
+	zz_mult(&temp, &x2, Mki2); Mki2++;
+	z_sub(&x3, &x3, &temp);
+
+ 	rhs[++firstcol] = x1;     /* store the three newly solved unknowns */
+      	rhs[++firstcol] = x2;
+      	rhs[++firstcol] = x3;
+      	++firstcol;               /* firstcol now indexes the row just below the group */
+    
+      	for (k = firstcol; k < ncol; k++) { /* remove x0..x3 from all remaining entries */
+	    zz_mult(&temp, &x0, Mki0); Mki0++;
+	    z_sub(&rhs[k], &rhs[k], &temp);
+	    zz_mult(&temp, &x1, Mki1); Mki1++;
+	    z_sub(&rhs[k], &rhs[k], &temp);
+	    zz_mult(&temp, &x2, Mki2); Mki2++;
+	    z_sub(&rhs[k], &rhs[k], &temp);
+	    zz_mult(&temp, &x3, Mki3); Mki3++;
+	    z_sub(&rhs[k], &rhs[k], &temp);
+	}
+
+        M0 += 4 * ldm + 4;        /* diagonal entry four columns (and rows) further on */
+    }
+
+    if ( firstcol < ncol - 1 ) { /* Do 2 columns */
+        Mki0 = M0 + 1;
+        Mki1 = Mki0 + ldm + 1;
+
+        x0 = rhs[firstcol];
+	zz_mult(&temp, &x0, Mki0); Mki0++;
+	z_sub(&x1, &rhs[firstcol+1], &temp);
+
+      	rhs[++firstcol] = x1;
+      	++firstcol;
+    
+      	for (k = firstcol; k < ncol; k++) {
+	    zz_mult(&temp, &x0, Mki0); Mki0++;
+	    z_sub(&rhs[k], &rhs[k], &temp);
+	    zz_mult(&temp, &x1, Mki1); Mki1++;
+	    z_sub(&rhs[k], &rhs[k], &temp);
+	} 
+    }
+    /* A single leftover column is not touched: its rhs entry is already final. */
+}
+
+/*! \brief
+ *
+ * <pre>
+ * Solves a dense upper triangular system. The upper triangular matrix is
+ * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
+ * in the rhs vector.
+ * </pre>
+ */
+void
+zusolve (
+	int ldm,	/* in */
+	int ncol,	/* in */
+	doublecomplex *M,	/* in */
+	doublecomplex *rhs	/* modified */
+)
+{
+    /*
+     * Column-oriented back substitution: visit the columns from the last
+     * to the first; each pass settles one unknown and removes its
+     * contribution from the rhs[] entries above it.
+     */
+    doublecomplex solved, prod;
+    doublecomplex *colptr;	/* start of the current column of M */
+    int jc, ir;
+
+    for (jc = ncol - 1; jc >= 0; --jc) {
+	colptr = &M[jc * ldm];
+	slud_z_div(&solved, &rhs[jc], &colptr[jc]); /* rhs(jc) / M(jc, jc) */
+	rhs[jc] = solved;
+
+	for (ir = 0; ir < jc; ++ir) {
+	    zz_mult(&prod, &solved, &colptr[ir]); /* x(jc) * M(ir, jc) */
+	    z_sub(&rhs[ir], &rhs[ir], &prod);
+	}
+    }
+    return;
+}
+
+
+/*! \brief
+ *
+ * <pre>
+ * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
+ * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
+ * </pre> 
+ */
+void zmatvec (
+	int ldm,	/* in -- leading dimension of M */
+	int nrow,	/* in -- rows of M used; number of Mxvec entries updated */ 
+	int ncol,	/* in -- columns of M used; number of vec entries read */
+	doublecomplex *M,	/* in -- column-major, column stride ldm */
+	doublecomplex *vec,	/* in */
+	doublecomplex *Mxvec	/* in/out -- accumulated into, not overwritten */
+)
+{
+    doublecomplex vi0, vi1, vi2, vi3;   /* vec entries of the current column group */
+    doublecomplex *M0, temp;            /* M0: top of the next unprocessed column */
+    doublecomplex *Mki0, *Mki1, *Mki2, *Mki3; /* cursors walking down each column */
+    register int firstcol = 0;          /* next column of M (entry of vec) to consume */
+    int k;
+
+    M0 = &M[0];
+
+    while ( firstcol < ncol - 3 ) {	/* Do 4 columns */
+	Mki0 = M0;
+	Mki1 = Mki0 + ldm;
+	Mki2 = Mki1 + ldm;
+	Mki3 = Mki2 + ldm;
+
+	vi0 = vec[firstcol++];
+	vi1 = vec[firstcol++];
+	vi2 = vec[firstcol++];
+	vi3 = vec[firstcol++];	
+	for (k = 0; k < nrow; k++) { /* Mxvec[k] += sum of the four column contributions */
+	    zz_mult(&temp, &vi0, Mki0); Mki0++;
+	    z_add(&Mxvec[k], &Mxvec[k], &temp);
+	    zz_mult(&temp, &vi1, Mki1); Mki1++;
+	    z_add(&Mxvec[k], &Mxvec[k], &temp);
+	    zz_mult(&temp, &vi2, Mki2); Mki2++;
+	    z_add(&Mxvec[k], &Mxvec[k], &temp);
+	    zz_mult(&temp, &vi3, Mki3); Mki3++;
+	    z_add(&Mxvec[k], &Mxvec[k], &temp);
+	}
+
+	M0 += 4 * ldm; /* skip the four columns just consumed */
+    }
+
+    while ( firstcol < ncol ) {		/* Do 1 column */
+ 	Mki0 = M0;
+	vi0 = vec[firstcol++];
+	for (k = 0; k < nrow; k++) {
+	    zz_mult(&temp, &vi0, Mki0); Mki0++;
+	    z_add(&Mxvec[k], &Mxvec[k], &temp);
+	}
+	M0 += ldm;
+    }
+    return;	
+}
+
diff --git a/SRC/zreadMM.c b/SRC/zreadMM.c
new file mode 100644
index 0000000..668a995
--- /dev/null
+++ b/SRC/zreadMM.c
@@ -0,0 +1,240 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief 
+ * Contributed by Francois-Henry Rouet.
+ *
+ */
+#include <ctype.h>
+#include "superlu_zdefs.h"
+
+#undef EXPAND_SYM
+
+/*! \brief
+ *
+ * <pre>
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *	column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ * </pre>
+ */
+
+void
+zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
+	    doublecomplex **nzval, int_t **rowind, int_t **colptr)
+{
+    int_t    j, k, jsize, nnz, nz, new_nonz;
+    doublecomplex *a, *val;
+    int_t    *asub, *xa, *row, *col;
+    int_t    zero_base = 0;
+    char *p, line[512], banner[64], mtx[64], crd[64], arith[64], sym[64];
+    int expand;
+
+    /* 	File format:
+     *    %%MatrixMarket matrix coordinate complex general/symmetric/...
+     *    % ...
+     *    % (optional comments)
+     *    % ...
+     *    #rows    #cols    #non-zero
+     *    Entries in the rest of lines: row    col    real    imag
+     */
+
+     /* 1/ read header */ 
+     if (!fgets(line,512,fp)) { printf("Cannot read header line\n"); exit(-1); }
+     for (p=line; *p!='\0'; *p=tolower((unsigned char)*p),p++);
+
+     if (sscanf(line, "%63s %63s %63s %63s %63s", banner, mtx, crd, arith, sym) != 5) {
+       printf("Invalid header (first line does not contain 5 tokens)\n");
+       exit(-1);
+     }
+ 
+     if(strcmp(banner,"%%matrixmarket")) {
+       printf("Invalid header (first token is not \"%%%%MatrixMarket\")\n");
+       exit(-1);
+     }
+
+     if(strcmp(mtx,"matrix")) {
+       printf("Not a matrix; this driver cannot handle that.\n");
+       exit(-1);
+     }
+
+     if(strcmp(crd,"coordinate")) {
+       printf("Not in coordinate format; this driver cannot handle that.\n");
+       exit(-1);
+     }
+
+     if(strcmp(arith,"complex")) { /* this reader parses (real, imag) pairs */
+       if(!strcmp(arith,"real")) {
+         printf("Real matrix; use dreadMM instead!\n");
+         exit(-1);
+       }
+       else if(!strcmp(arith, "pattern")) {
+         printf("Pattern matrix; values are needed!\n");
+         exit(-1);
+       }
+       else {
+         printf("Unknown arithmetic\n");
+         exit(-1);
+       }
+     }
+
+     if(strcmp(sym,"general")) {
+       printf("Symmetric matrix: will be expanded\n");
+       expand=1; /* NOTE: every non-"general" kind is mirrored with the value copied unchanged */
+     } else
+       expand=0;
+
+     /* 2/ Skip comments */
+     while(banner[0]=='%') {
+       if (!fgets(line,512,fp)) { printf("Unexpected end of file\n"); exit(-1); }
+       sscanf(line,"%63s",banner);
+     }
+
+     /* 3/ Read m, n and nnz */
+#ifdef _LONGINT
+    if (sscanf(line, "%lld%lld%lld",m, n, nonz) != 3)
+#else
+    if (sscanf(line, "%d%d%d",m, n, nonz) != 3)
+#endif
+      { printf("Invalid size line (expected: rows cols nonzeros)\n"); exit(-1); }
+    if(*m!=*n) {
+      printf("Rectangular matrix!. Abort\n");
+      exit(-1);
+   }
+
+    if(expand)
+      new_nonz = 2 * *nonz; /* safe upper bound; 2*nonz-n is too small when diagonal entries are absent */
+    else
+      new_nonz = *nonz;
+
+    *m = *n;
+    printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz);
+    zallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
+    a    = *nzval;
+    asub = *rowind;
+    xa   = *colptr;
+
+    if ( !(val = (doublecomplex *) SUPERLU_MALLOC(new_nonz * sizeof(doublecomplex))) )
+        ABORT("Malloc fails for val[]");
+    if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for row[]");
+    if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for col[]");
+
+    for (j = 0; j < *n; ++j) xa[j] = 0;
+
+    /* 4/ Read the entries (row, col, real, imag); xa[] counts entries per column */
+    for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
+#ifdef _LONGINT
+	if (fscanf(fp, "%lld%lld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i) != 4)
+#else
+	if (fscanf(fp, "%d%d%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i) != 4)
+#endif
+	    { fprintf(stderr, "Bad or missing matrix entry " IFMT "\n", nnz); exit(-1); }
+	if ( nnz == 0 ) /* first nonzero */
+	    if ( row[0] == 0 || col[0] == 0 ) {
+		zero_base = 1;
+		printf("triplet file: row/col indices are zero-based.\n");
+	    } else
+		printf("triplet file: row/col indices are one-based.\n");
+
+	if ( !zero_base ) {
+	    /* Change to 0-based indexing. */
+	    --row[nz];
+	    --col[nz];
+	}
+
+	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
+	    /*|| val[nz] == 0.*/) {
+	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = {%e\t%e} out of bound, removed\n", 
+		    nz, row[nz], col[nz], val[nz].r, val[nz].i);
+	    exit(-1);
+	} else {
+	    ++xa[col[nz]];
+            if(expand) {
+	        if ( row[nz] != col[nz] ) { /* Excluding diagonal */
+	          ++nz;
+	          row[nz] = col[nz-1];
+	          col[nz] = row[nz-1];
+	          val[nz] = val[nz-1];
+	          ++xa[col[nz]];
+	        }
+            }	
+	    ++nz;
+	}
+    }
+
+    *nonz = nz;
+    if(expand) {
+      printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz);
+    }
+    
+
+    /* Initialize the array of column pointers */
+    k = 0;
+    jsize = xa[0];
+    xa[0] = 0;
+    for (j = 1; j < *n; ++j) {
+	k += jsize;
+	jsize = xa[j];
+	xa[j] = k;
+    }
+    
+    /* Copy the triplets into the column oriented storage */
+    for (nz = 0; nz < *nonz; ++nz) {
+	j = col[nz];
+	k = xa[j];
+	asub[k] = row[nz];
+	a[k] = val[nz];
+	++xa[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column */
+    for (j = *n; j > 0; --j)
+	xa[j] = xa[j-1];
+    xa[0] = 0;
+
+    SUPERLU_FREE(val);
+    SUPERLU_FREE(row);
+    SUPERLU_FREE(col);
+
+#ifdef CHK_INPUT
+    int i;
+    for (i = 0; i < *n; i++) {
+	printf("Col %d, xa %lld\n", i, (long long) xa[i]);
+	for (k = xa[i]; k < xa[i+1]; k++)
+	    printf("%lld\t%16.10f\t%16.10f\n", (long long) asub[k], a[k].r, a[k].i);
+    }
+#endif
+
+}
+
+
+static void zreadrhs(int m, doublecomplex *b)
+{
+    FILE *fp;   /* fopen() comes from <stdio.h>; the old "*fopen()" redeclaration clashes with it under C23 */
+    int i;
+
+    if ( !(fp = fopen("b.dat", "r")) ) { /* the right-hand side is always read from ./b.dat */
+        fprintf(stderr, "zreadrhs: file does not exist\n");
+	exit(-1);
+    }
+    for (i = 0; i < m; ++i) /* m entries, each a "real imag" pair */
+      fscanf(fp, "%lf%lf\n", &b[i].r, &b[i].i);
+    fclose(fp);
+}
+
+
diff --git a/SRC/zreadhb.c b/SRC/zreadhb.c
new file mode 100644
index 0000000..7b2102a
--- /dev/null
+++ b/SRC/zreadhb.c
@@ -0,0 +1,292 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Read a DOUBLE COMPLEX PRECISION matrix stored in Harwell-Boeing format
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+#include "dcomplex.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "superlu_zdefs.h"
+
+/*
+ * Prototypes
+ */
+static void ReadVector(FILE *, int_t, int_t *, int_t, int_t);
+static void zReadValues(FILE *, int_t, doublecomplex *, int_t, int_t);
+static int DumpLine(FILE *);
+static int ParseIntFormat(char *, int_t *, int_t *);
+static int ParseFloatFormat(char *, int_t *, int_t *);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ * 
+ * Read a DOUBLE COMPLEX PRECISION matrix stored in Harwell-Boeing format 
+ * as described below.
+ * 
+ * Line 1 (A72,A8) 
+ *  	Col. 1 - 72   Title (TITLE) 
+ *	Col. 73 - 80  Key (KEY) 
+ * 
+ * Line 2 (5I14) 
+ * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
+ * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
+ * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
+ * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
+ *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
+ *                    (including starting guesses and solution vectors 
+ *		       if present) 
+ *           	      (zero indicates no right-hand side data is present) 
+ *
+ * Line 3 (A3, 11X, 4I14) 
+ *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
+ * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
+ * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
+ *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
+ *	              (equal to number of entries for assembled matrices) 
+ * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
+ *	              (zero in the case of assembled matrices) 
+ * Line 4 (2A16, 2A20) 
+ * 	Col. 1 - 16   Format for pointers (PTRFMT) 
+ *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
+ *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
+ * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
+ *
+ * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
+ *    	Col. 1 	      Right-hand side type: 
+ *	         	  F for full storage or M for same format as matrix 
+ *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
+ *    	Col. 3        X if an exact solution vector(s) is supplied. 
+ *	Col. 15 - 28  Number of right-hand sides (NRHS) 
+ *	Col. 29 - 42  Number of row indices (NRHSIX) 
+ *          	      (ignored in case of unassembled matrices) 
+ *
+ * The three character type field on line 3 describes the matrix type. 
+ * The following table lists the permitted values for each of the three 
+ * characters. As an example of the type field, RSA denotes that the matrix 
+ * is real, symmetric, and assembled. 
+ *
+ * First Character: 
+ *	R Real matrix 
+ *	C Complex matrix 
+ *	P Pattern only (no numerical values supplied) 
+ *
+ * Second Character: 
+ *	S Symmetric 
+ *	U Unsymmetric 
+ *	H Hermitian 
+ *	Z Skew symmetric 
+ *	R Rectangular 
+ *
+ * Third Character: 
+ *	A Assembled 
+ *	E Elemental matrices (unassembled) 
+ * </pre>
+ */
+
+void
+zreadhb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz,
+	     doublecomplex **nzval, int_t **rowind, int_t **colptr)
+{
+    /* Only the caller with iam == 0 prints diagnostics; fp is closed before returning. */
+    register int_t i, numer_lines = 0, rhscrd = 0;
+    int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
+    char buf[100], type[4];
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(0, "Enter zreadhb_dist()");
+#endif
+
+    /* Line 1 */
+    fgets(buf, 100, fp);
+
+    /* Line 2 */
+    for (i=0; i<5; i++) {
+	fscanf(fp, "%14c", buf); buf[14] = 0;
+	tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/
+	if (i == 3) numer_lines = tmp;
+	if (i == 4 && tmp) rhscrd = tmp;
+    }
+    DumpLine(fp);
+
+    /* Line 3 */
+    fscanf(fp, "%3c", type);
+    fscanf(fp, "%11c", buf); /* pad */
+    type[3] = 0;
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) printf("Matrix type %s\n", type);
+#endif
+    /* "%14c" stores no terminator, so end each field explicitly before atoi() */
+    fscanf(fp, "%14c", buf); buf[14] = 0; *nrow = atoi(buf); 
+    fscanf(fp, "%14c", buf); buf[14] = 0; *ncol = atoi(buf); 
+    fscanf(fp, "%14c", buf); buf[14] = 0; *nonz = atoi(buf); 
+    fscanf(fp, "%14c", buf); buf[14] = 0; tmp = atoi(buf);   
+    
+    if (tmp != 0)
+	if ( !iam ) printf("This is not an assembled matrix!\n");
+    if (*nrow != *ncol)
+	if ( !iam ) printf("Matrix is not square.\n");
+    DumpLine(fp);
+
+    /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
+    zallocateA_dist(*ncol, *nonz, nzval, rowind, colptr);
+
+    /* Line 4: format statement */
+    fscanf(fp, "%16c", buf); buf[16] = 0;
+    ParseIntFormat(buf, &colnum, &colsize);
+    fscanf(fp, "%16c", buf); buf[16] = 0;
+    ParseIntFormat(buf, &rownum, &rowsize);
+    fscanf(fp, "%20c", buf); buf[20] = 0;
+    ParseFloatFormat(buf, &valnum, &valsize);
+    fscanf(fp, "%20c", buf); /* RHSFMT field: read and ignored */
+    DumpLine(fp);
+
+    /* Line 5: right-hand side */    
+    if ( rhscrd ) DumpLine(fp); /* skip RHSFMT */
+
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) {
+	printf(IFMT " rows, " IFMT " nonzeros\n", *nrow, *nonz);
+	printf("colnum " IFMT ", colsize " IFMT "\n", colnum, colsize);
+	printf("rownum " IFMT ", rowsize " IFMT "\n", rownum, rowsize);
+	printf("valnum " IFMT ", valsize " IFMT "\n", valnum, valsize);
+    }
+#endif
+    
+    ReadVector(fp, *ncol+1, *colptr, colnum, colsize);
+#if ( DEBUGlevel>=1 )
+    if ( !iam )	printf("read colptr[" IFMT "] = " IFMT "\n", *ncol, (*colptr)[*ncol]);
+#endif
+    ReadVector(fp, *nonz, *rowind, rownum, rowsize);
+#if ( DEBUGlevel>=1 )
+    if ( !iam )	printf("read rowind[" IFMT "] = " IFMT "\n", *nonz-1, (*rowind)[*nonz-1]);
+#endif
+    if ( numer_lines ) { /* values are read only when the header announces value lines */
+        zReadValues(fp, *nonz, *nzval, valnum, valsize);
+    }
+
+    fclose(fp);
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(0, "Exit zreadhb_dist()");
+#endif
+}
+
+/* Eat up the rest of the current line; also stops at end of file. Always returns 0. */
+static int DumpLine(FILE *fp)
+{
+    register int c;
+    while ((c = fgetc(fp)) != '\n' && c != EOF) ; /* without the EOF test a truncated file spins forever */
+    return 0;
+}
+
+static int ParseIntFormat(char *buf, int_t *num, int_t *size)
+{
+    /* Split an integer format such as "(16I5)": the number after '('
+       goes to *num, the number after 'I'/'i' goes to *size. */
+    char *p = buf;
+
+    while (*p != '(') ++p;                      /* stop on the '(' itself */
+    *num = atoi(p + 1);
+    for (++p; *p != 'I' && *p != 'i'; ++p) ;    /* search starts right after '(' */
+    *size = atoi(p + 1);
+    return 0;
+}
+
+static int ParseFloatFormat(char *buf, int_t *num, int_t *size)
+{
+    char *tmp, *period; /* tmp: scan cursor; period: where the width digits end */
+    /* Parses a format such as "(1P6F13.6)" into *num and *size; buf is modified. */
+    tmp = buf;
+    while (*tmp++ != '(') ; /* leaves tmp just past the '(' */
+    *num = atoi(tmp); 
+    while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
+	   && *tmp != 'F' && *tmp != 'f') {
+       /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
+           num picked up refers to P, which should be skipped. */
+        if (*tmp=='p' || *tmp=='P') {
+           ++tmp;
+           *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
+        } else {
+           ++tmp;
+        }
+    }
+    ++tmp; /* step over the E/D/F letter */
+    period = tmp;
+    while (*period != '.' && *period != ')') ++period ;
+    *period = '\0'; /* cuts the caller's buffer at the '.' or ')' */
+    *size = atoi(tmp); 
+
+    return 0; /* always 0 */
+}
+
+static void
+ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize)
+{
+    /* Read n integers, perline fields of persize chars per line; store value - 1. */
+    register int_t filled, slot;
+    char line[100], *field, *stop, keep;
+
+    for (filled = 0; filled < n; ) {
+	fgets(line, 100, fp);                   /* one input line per outer pass */
+	for (slot = 0; slot < perline && filled < n; ++slot, ++filled) {
+	    field = line + slot * persize;
+	    stop = field + persize;             /* first char after this field */
+	    keep = *stop; *stop = 0;            /* cut the line here for atoi() */
+	    where[filled] = (int_t) atoi(field) - 1;
+	    *stop = keep;                       /* put the saved char back */
+	}
+    }
+}
+
+/* Read complex numbers as pairs of (real, imaginary) */
+void
+zReadValues(FILE *fp, int_t n, doublecomplex *destination, 
+             int_t perline, int_t persize)
+{
+    register int_t i, j, k, s; /* i: complex entries stored; j: field on the line; s: field offset */
+    register int_t pair;       /* 0: next field is a real part, 1: an imaginary part */
+    register double realpart;  /* real part waiting for its imaginary part */
+    char tmp, buf[100];
+    /* perline counts individual numbers, so one complex entry takes two fields. */
+    i = 0;
+    pair = 0;
+    while (i < n) {
+	fgets(buf, 100, fp);    /* read a line at a time */
+	for (j=0; j<perline && i<n; j++) {
+	    tmp = buf[(j+1)*persize];     /* save the char at that place */
+	    buf[(j+1)*persize] = 0;       /* null terminate */
+	    s = j*persize;
+	    for (k = 0; k < persize; ++k) /* No D_ format in C */
+		if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E';
+	    if ( pair == 0 ) {
+	  	/* The value is real part */
+		realpart = atof(&buf[s]);
+		pair = 1;
+	    } else {
+		/* The value is imaginary part */
+	        destination[i].r = realpart;
+		destination[i++].i = atof(&buf[s]);
+		pair = 0;
+	    }
+	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
+	}
+    } /* pair is not reset per line, so an entry may straddle two lines */
+}
+
diff --git a/SRC/zreadrb.c b/SRC/zreadrb.c
new file mode 100644
index 0000000..646edf1
--- /dev/null
+++ b/SRC/zreadrb.c
@@ -0,0 +1,355 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file zreadrb.c
+ * \brief Read a matrix stored in Rutherford-Boeing format
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ * </pre>
+ *
+ * Purpose
+ * =======
+ *
+ * Read a DOUBLE COMPLEX PRECISION matrix stored in Rutherford-Boeing format 
+ * as described below.
+ *
+ * Line 1 (A72, A8)
+ *      Col. 1 - 72   Title (TITLE)
+ *      Col. 73 - 80  Matrix name / identifier (MTRXID)
+ *
+ * Line 2 (I14, 3(1X, I13))
+ *      Col. 1 - 14   Total number of lines excluding header (TOTCRD)
+ *      Col. 16 - 28  Number of lines for pointers (PTRCRD)
+ *      Col. 30 - 42  Number of lines for row (or variable) indices (INDCRD)
+ *      Col. 44 - 56  Number of lines for numerical values (VALCRD)
+ *
+ * Line 3 (A3, 11X, 4(1X, I13))
+ *      Col. 1 - 3    Matrix type (see below) (MXTYPE)
+ *      Col. 15 - 28  Compressed Column: Number of rows (NROW)
+ *                    Elemental: Largest integer used to index variable (MVAR)
+ *      Col. 30 - 42  Compressed Column: Number of columns (NCOL)
+ *                    Elemental: Number of element matrices (NELT)
+ *      Col. 44 - 56  Compressed Column: Number of entries (NNZERO)
+ *                    Elemental: Number of variable indices (NVARIX)
+ *      Col. 58 - 70  Compressed Column: Unused, explicitly zero
+ *                    Elemental: Number of elemental matrix entries (NELTVL)
+ *
+ * Line 4 (2A16, A20)
+ *      Col. 1 - 16   Fortran format for pointers (PTRFMT)
+ *      Col. 17 - 32  Fortran format for row (or variable) indices (INDFMT)
+ *      Col. 33 - 52  Fortran format for numerical values of coefficient matrix
+ *                    (VALFMT)
+ *                    (blank in the case of matrix patterns)
+ *
+ * The three character type field on line 3 describes the matrix type.
+ * The following table lists the permitted values for each of the three
+ * characters. As an example of the type field, RSA denotes that the matrix
+ * is real, symmetric, and assembled.
+ *
+ * First Character:
+ *      R Real matrix
+ *      C Complex matrix
+ *      I integer matrix
+ *      P Pattern only (no numerical values supplied)
+ *      Q Pattern only (numerical values supplied in associated auxiliary value
+ *        file)
+ *
+ * Second Character:
+ *      S Symmetric
+ *      U Unsymmetric
+ *      H Hermitian
+ *      Z Skew symmetric
+ *      R Rectangular
+ *
+ * Third Character:
+ *      A Compressed column form
+ *      E Elemental form
+ *
+ * </pre>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "superlu_zdefs.h"
+
+/*! \brief Eat up the rest of the current line (stops at end of file too); always returns 0 */
+static int DumpLine(FILE *fp)
+{
+    register int c;
+    while ((c = fgetc(fp)) != '\n' && c != EOF) ; /* without the EOF test a truncated file spins forever */
+    return 0;
+}
+
+static int ParseIntFormat(char *buf, int_t *num, int_t *size)
+{
+    char *cur;  /* scans a format like "(16I5)": 16 -> *num, 5 -> *size */
+
+    for (cur = buf; *cur != '('; ++cur) ;
+    ++cur;                                  /* step over the '(' */
+    *num = atoi(cur);
+    for ( ; *cur != 'I' && *cur != 'i'; ++cur) ;
+    ++cur;                                  /* step over the 'I' */
+    *size = atoi(cur);
+    return 0;
+}
+
+static int ParseFloatFormat(char *buf, int_t *num, int_t *size)
+{
+    char *tmp, *period; /* tmp: scan cursor; period: where the width digits end */
+    /* Parses a format such as "(1P6F13.6)" into *num and *size; buf is modified. */
+    tmp = buf;
+    while (*tmp++ != '(') ; /* leaves tmp just past the '(' */
+    *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
+    while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
+           && *tmp != 'F' && *tmp != 'f') {
+        /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
+           num picked up refers to P, which should be skipped. */
+        if (*tmp=='p' || *tmp=='P') {
+           ++tmp;
+           *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
+        } else {
+           ++tmp;
+        }
+    }
+    ++tmp; /* step over the E/D/F letter */
+    period = tmp;
+    while (*period != '.' && *period != ')') ++period ;
+    *period = '\0'; /* cuts the caller's buffer at the '.' or ')' */
+    *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/
+
+    return 0; /* always 0 */
+}
+
+static int ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize)
+{
+    /* Parse n integers, perline fields of persize chars per line; store value - 1. */
+    register int_t done = 0, col, hi;
+    char saved, text[100];
+
+    while (done < n) {
+        fgets(text, 100, fp);               /* next line of fields */
+        for (col = 0; col < perline && done < n; col++) {
+            hi = (col + 1) * persize;       /* index just past this field */
+            saved = text[hi];
+            text[hi] = 0;                   /* let atoi() see this field only */
+            where[done++] = (int_t) atoi(&text[hi - persize]) - 1;
+            text[hi] = saved;               /* restore the line */
+        }
+    }
+
+    return 0;
+}
+
+/*! \brief Read complex numbers as pairs of (real, imaginary) */
+static int zReadValues(FILE *fp, int n, doublecomplex *destination, int perline, int persize)
+{
+    register int i, j, k, s, pair; /* i: entries stored; j: field on the line; s: field offset; pair: 0 = real part next, 1 = imaginary */
+    register double realpart;      /* real part waiting for its imaginary part */
+    char tmp, buf[100];
+    /* perline counts individual numbers, so one complex entry takes two fields. */
+    i = pair = 0;
+    while (i < n) {
+	fgets(buf, 100, fp);    /* read a line at a time */
+	for (j=0; j<perline && i<n; j++) {
+	    tmp = buf[(j+1)*persize];     /* save the char at that place */
+	    buf[(j+1)*persize] = 0;       /* null terminate */
+	    s = j*persize;
+	    for (k = 0; k < persize; ++k) /* No D_ format in C */
+		if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E';
+	    if ( pair == 0 ) {
+	  	/* The value is real part */
+		realpart = atof(&buf[s]);
+		pair = 1;
+	    } else {
+		/* The value is imaginary part */
+	        destination[i].r = realpart;
+		destination[i++].i = atof(&buf[s]);
+		pair = 0;
+	    }
+	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
+	}
+    } /* pair is not reset per line, so an entry may straddle two lines */
+
+    return 0; /* always 0 */
+}
+
+
+/*! \brief Expand the stored lower triangle of a symmetric matrix to the full matrix.
+ * <pre>
+ * On input, nonz/nzval/rowind/colptr represents lower part of a symmetric
+ * matrix (diagonal entries may be absent). On exit, it represents the full
+ * matrix with lower and upper parts, held in newly allocated arrays.
+ * </pre>
+ */
+static void
+FormFullA(int_t n, int_t *nonz, doublecomplex **nzval, int_t **rowind, int_t **colptr)
+{
+    register int_t i, j, k, col, new_nnz;
+    int_t *t_rowind, *t_colptr, *al_rowind, *al_colptr, *a_rowind, *a_colptr;
+    int_t *marker;
+    doublecomplex *t_val, *al_val, *a_val;
+
+    al_rowind = *rowind;
+    al_colptr = *colptr;
+    al_val = *nzval;
+
+    if ( !(marker = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for marker[]");
+    if ( !(t_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC t_colptr[]");
+    if ( !(t_rowind = (int_t *) SUPERLU_MALLOC( *nonz * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for t_rowind[]");
+    if ( !(t_val = (doublecomplex*) SUPERLU_MALLOC( *nonz * sizeof(doublecomplex)) ) )
+	ABORT("SUPERLU_MALLOC fails for t_val[]");
+
+    /* Get counts of each column of T, and set up column pointers.  Also size the result:
+       off-diagonal entries are stored twice; diagonal ones, which may be absent, once. */
+    new_nnz = *nonz;
+    for (i = 0; i < n; ++i) marker[i] = 0;
+    for (j = 0; j < n; ++j) {
+	for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) {
+	    ++marker[al_rowind[i]];
+	    if ( al_rowind[i] != j ) ++new_nnz;
+	}
+    }
+    t_colptr[0] = 0;
+    for (i = 0; i < n; ++i) {
+	t_colptr[i+1] = t_colptr[i] + marker[i];
+	marker[i] = t_colptr[i];
+    }
+
+    /* Transpose matrix A to T */
+    for (j = 0; j < n; ++j)
+	for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) {
+	    col = al_rowind[i];
+	    t_rowind[marker[col]] = j;
+	    t_val[marker[col]] = al_val[i];
+	    ++marker[col];
+	}
+
+    if ( !(a_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC a_colptr[]");
+    if ( !(a_rowind = (int_t *) SUPERLU_MALLOC( new_nnz * sizeof(int_t)) ) )
+	ABORT("SUPERLU_MALLOC fails for a_rowind[]");
+    if ( !(a_val = (doublecomplex*) SUPERLU_MALLOC( new_nnz * sizeof(doublecomplex)) ) )
+	ABORT("SUPERLU_MALLOC fails for a_val[]");
+
+    a_colptr[0] = 0;
+    k = 0;
+    for (j = 0; j < n; ++j) {
+      for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
+	if ( t_rowind[i] != j ) { /* not diagonal */
+	  a_rowind[k] = t_rowind[i];
+	  a_val[k] = t_val[i];
+	  ++k;
+	}
+      }
+      for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) {
+	a_rowind[k] = al_rowind[i];
+	a_val[k] = al_val[i];
+	++k;
+      }
+      a_colptr[j+1] = k;
+    }
+
+    printf("FormFullA: new_nnz = " IFMT ", k = " IFMT "\n", new_nnz, k);
+    SUPERLU_FREE(al_val);
+    SUPERLU_FREE(al_rowind);
+    SUPERLU_FREE(al_colptr);
+    SUPERLU_FREE(marker);
+    SUPERLU_FREE(t_val);
+    SUPERLU_FREE(t_rowind);
+    SUPERLU_FREE(t_colptr);
+
+    *nzval = a_val;
+    *rowind = a_rowind;
+    *colptr = a_colptr;
+    *nonz = new_nnz;
+}
+
+/*! \brief Read a complex sparse matrix from the fixed-width "rb" file format
+ *  (presumably Rutherford-Boeing) into compressed column arrays (*nzval, *rowind,
+ *  *colptr).  Line 1 is echoed to stdout, a symmetric matrix (type[1] == 'S')
+ *  is expanded to full storage, and fp is closed before returning. */
+void
+zreadrb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz,
+        doublecomplex **nzval, int_t **rowind, int_t **colptr)
+{
+    register int_t i, numer_lines = 0;
+    int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
+    char buf[100], type[4];
+    int sym;
+
+    /* Line 1 */
+    if ( !fgets(buf, 100, fp) ) ABORT("zreadrb_dist: empty or unreadable input file");
+    fputs(buf, stdout);
+
+    /* Line 2 */
+    for (i=0; i<4; i++) {
+        fscanf(fp, "%14c", buf); buf[14] = 0;
+        if (i == 3) numer_lines = atoi(buf);
+    }
+    DumpLine(fp);
+
+    /* Line 3 */
+    fscanf(fp, "%3c", type);
+    fscanf(fp, "%11c", buf); /* pad */
+    type[3] = 0;
+#if (DEBUGlevel >= 1)
+    if ( !iam ) printf("Matrix type %s\n", type);
+#endif
+
+    fscanf(fp, "%14c", buf); buf[14] = 0; *nrow = atoi(buf);
+    fscanf(fp, "%14c", buf); buf[14] = 0; *ncol = atoi(buf);
+    fscanf(fp, "%14c", buf); buf[14] = 0; *nonz = atoi(buf);
+    fscanf(fp, "%14c", buf); buf[14] = 0; tmp = atoi(buf);
+
+    if (tmp != 0)
+        if ( !iam ) printf("This is not an assembled matrix!\n");
+    if (*nrow != *ncol)
+        if ( !iam ) printf("Matrix is not square.\n");
+    DumpLine(fp);
+
+    /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
+    zallocateA_dist(*ncol, *nonz, nzval, rowind, colptr);
+
+    /* Line 4: format statement (terminate each field so no stale bytes follow it) */
+    fscanf(fp, "%16c", buf); buf[16] = 0;
+    ParseIntFormat(buf, &colnum, &colsize);
+    fscanf(fp, "%16c", buf); buf[16] = 0;
+    ParseIntFormat(buf, &rownum, &rowsize);
+    fscanf(fp, "%20c", buf); buf[20] = 0;
+    ParseFloatFormat(buf, &valnum, &valsize);
+    DumpLine(fp);
+
+#if (DEBUGlevel >= 1)
+    if ( !iam ) {
+        printf(IFMT " rows, " IFMT " nonzeros\n", *nrow, *nonz);
+        printf("colnum " IFMT ", colsize " IFMT "\n", colnum, colsize);
+        printf("rownum " IFMT ", rowsize " IFMT "\n", rownum, rowsize);
+        printf("valnum " IFMT ", valsize " IFMT "\n", valnum, valsize);
+    }
+#endif
+
+    ReadVector(fp, *ncol+1, *colptr, colnum, colsize);
+    ReadVector(fp, *nonz, *rowind, rownum, rowsize);
+    if ( numer_lines && zReadValues(fp, *nonz, *nzval, valnum, valsize) != 0 )
+        ABORT("zreadrb_dist: cannot read the numerical values");
+
+    sym = (type[1] == 'S' || type[1] == 's');
+    if ( sym ) FormFullA(*ncol, nonz, nzval, rowind, colptr);
+
+    fclose(fp);
+}
diff --git a/SRC/zreadtriple.c b/SRC/zreadtriple.c
new file mode 100644
index 0000000..21cdf08
--- /dev/null
+++ b/SRC/zreadtriple.c
@@ -0,0 +1,177 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Read a complex sparse matrix stored in triplet (coordinate) format.
+ *
+ */
+#include <stdio.h>
+#include "superlu_zdefs.h"
+
+#undef EXPAND_SYM
+
+/*! \brief Read a complex matrix given as triplets, preceded by a header line,
+ *  into compressed column storage.  Aborts on a short or malformed file.
+ *
+ * <pre>
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *	column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ * </pre>
+ */
+
+void
+zreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
+	    doublecomplex **nzval, int_t **rowind, int_t **colptr)
+{
+    int_t    j, k, jsize, nnz, nz, new_nonz;
+    doublecomplex *a, *val;
+    int_t    *asub, *xa, *row, *col;
+    int_t    zero_base = 0;
+    long long hm, hn, hnz, ri, ci; /* scan targets: one format for every int_t width */
+
+    /* 	File format:
+     *    First line:  #rows    #columns    #non-zero
+     *    Triplet in the rest of lines:
+     *                 row    col    real_part    imaginary_part
+     */
+
+    if ( fscanf(fp, "%lld%lld%lld", &hm, &hn, &hnz) != 3 )
+        ABORT("zreadtriple_dist: cannot read the header line");
+    *m = hm;
+    *n = hn;
+    *nonz = hnz;
+
+#ifdef EXPAND_SYM
+    new_nonz = 2 * *nonz - *n;
+#else
+    new_nonz = *nonz;
+#endif
+    *m = *n;
+    printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz);
+    zallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
+    a    = *nzval;
+    asub = *rowind;
+    xa   = *colptr;
+
+    if ( !(val = (doublecomplex *) SUPERLU_MALLOC(new_nonz * sizeof(doublecomplex))) )
+        ABORT("Malloc fails for val[]");
+    if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for row[]");
+    if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for col[]");
+
+    for (j = 0; j < *n; ++j) xa[j] = 0;
+
+    /* Read into the triplet array from a file */
+    for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
+	if ( fscanf(fp, "%lld%lld%lf%lf\n", &ri, &ci, &val[nz].r, &val[nz].i) != 4 )
+	    ABORT("zreadtriple_dist: missing or malformed triplet");
+	row[nz] = ri;
+	col[nz] = ci;
+
+	if ( nnz == 0 ) /* first nonzero */
+	    if ( row[0] == 0 || col[0] == 0 ) {
+		zero_base = 1;
+		printf("triplet file: row/col indices are zero-based.\n");
+	    } else
+		printf("triplet file: row/col indices are one-based.\n");
+
+	if ( !zero_base ) {
+	    /* Change to 0-based indexing. */
+	    --row[nz];
+	    --col[nz];
+	}
+
+	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
+	    /*|| val[nz] == 0.*/) {
+	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = {%e\t%e} out of bound, removed\n", 
+		    nz, row[nz], col[nz], val[nz].r, val[nz].i);
+	    exit(-1);
+	} else {
+	    ++xa[col[nz]];
+#ifdef EXPAND_SYM
+	    if ( row[nz] != col[nz] ) { /* Excluding diagonal */
+	      ++nz;
+	      row[nz] = col[nz-1];
+	      col[nz] = row[nz-1];
+	      val[nz] = val[nz-1];
+	      ++xa[col[nz]];
+	    }
+#endif	
+	    ++nz;
+	}
+    }
+
+    *nonz = nz;
+#ifdef EXPAND_SYM
+    printf("new_nonz after symmetric expansion:\t%lld\n", (long long) *nonz);
+#endif
+    
+
+    /* Initialize the array of column pointers */
+    k = 0;
+    jsize = xa[0];
+    xa[0] = 0;
+    for (j = 1; j < *n; ++j) {
+	k += jsize;
+	jsize = xa[j];
+	xa[j] = k;
+    }
+    
+    /* Copy the triplets into the column oriented storage */
+    for (nz = 0; nz < *nonz; ++nz) {
+	j = col[nz];
+	k = xa[j];
+	asub[k] = row[nz];
+	a[k] = val[nz];
+	++xa[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column */
+    for (j = *n; j > 0; --j)
+	xa[j] = xa[j-1];
+    xa[0] = 0;
+
+    SUPERLU_FREE(val);
+    SUPERLU_FREE(row);
+    SUPERLU_FREE(col);
+
+#ifdef CHK_INPUT
+    for (j = 0; j < *n; j++) {
+	printf("Col %lld, xa %lld\n", (long long) j, (long long) xa[j]);
+	for (k = xa[j]; k < xa[j+1]; k++)
+	    printf("%lld\t%16.10f\t%16.10f\n", (long long) asub[k], a[k].r, a[k].i);
+    }
+#endif
+
+}
+
+/*! \brief Read m complex entries (real, imaginary) from "b.dat" into b; terminates the program on failure. */
+void zreadrhs(int m, doublecomplex *b)
+{
+    FILE *fp;
+    int i;
+
+    if ( !(fp = fopen("b.dat", "r")) ) {
+        fprintf(stderr, "zreadrhs: file does not exist\n");
+	exit(-1);
+    }
+    for (i = 0; i < m; ++i)
+      if ( fscanf(fp, "%lf%lf\n", &(b[i].r), &(b[i].i)) != 2 ) ABORT("zreadrhs: b.dat has too few entries");
+
+    fclose(fp);
+}
+
+
diff --git a/SRC/zreadtriple_noheader.c b/SRC/zreadtriple_noheader.c
new file mode 100644
index 0000000..f77c327
--- /dev/null
+++ b/SRC/zreadtriple_noheader.c
@@ -0,0 +1,198 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Read a complex sparse matrix from a triplet file that has no header line.
+ *
+ */
+#include <stdio.h>
+#include "superlu_zdefs.h"
+
+#undef EXPAND_SYM
+
+/*! \brief Read a complex matrix from a header-less triplet file into compressed
+ *  column storage.  The file is scanned twice: first to find the dimension and
+ *  the number of entries, then (after rewind) to read the entries themselves.
+ *  Indices count as zero-based when the smallest index seen is 0.
+ *  Aborts if a record cannot be re-read on the second pass.
+ *
+ * <pre>
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *	column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ * </pre>
+ */
+
+void
+zreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
+	    doublecomplex **nzval, int_t **rowind, int_t **colptr)
+{
+    int_t    i, j, k, jsize, nnz, nz, new_nonz, minn = 100;
+    doublecomplex *a, *val, vali;
+    int_t    *asub, *xa, *row, *col;
+    int      zero_base = 0, ret_val = 0;
+    long long ri, ci; /* scan targets: one format for every int_t width */
+
+    /* 	File format: Triplet in a line for each nonzero entry:
+     *                 row    col    real_part	imaginary_part
+     *  All four fields are required: each record is read with one
+     *  four-conversion fscanf call.
+     */
+
+    /* First pass: determine N and NNZ */
+    nz = *n = 0;
+
+    ret_val = fscanf(fp, "%lld%lld%lf%lf\n", &ri, &ci, &vali.r, &vali.i);
+
+    /* Stop at EOF and also at a malformed record, which fscanf would never consume. */
+    while (ret_val == 4) {
+	i = ri; j = ci;
+	*n = SUPERLU_MAX(*n, i);
+	*n = SUPERLU_MAX(*n, j);
+	minn = SUPERLU_MIN(minn, i);
+	minn = SUPERLU_MIN(minn, j);
+	++nz;
+
+        ret_val = fscanf(fp, "%lld%lld%lf%lf\n", &ri, &ci, &vali.r, &vali.i);
+    }
+    
+    /* minn starts at 100 and only reaches 0 if some row or column index is 0. */
+    if ( minn == 0 ) { /* zero-based indexing */
+	zero_base = 1;
+	++(*n);
+	printf("triplet file: row/col indices are zero-based.\n");
+    } else {
+	printf("triplet file: row/col indices are one-based.\n");
+    }
+
+    *m = *n;
+    *nonz = nz;
+    rewind(fp);
+
+#ifdef EXPAND_SYM
+    new_nonz = 2 * *nonz - *n;
+#else
+    new_nonz = *nonz;
+#endif
+
+    /* Second pass: read the actual matrix values */
+    printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz);
+    zallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
+    a    = *nzval;
+    asub = *rowind;
+    xa   = *colptr;
+
+    if ( !(val = (doublecomplex *) SUPERLU_MALLOC(new_nonz * sizeof(doublecomplex))) )
+        ABORT("Malloc fails for val[]");
+    if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for row[]");
+    if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for col[]");
+
+    for (j = 0; j < *n; ++j) xa[j] = 0;
+
+    /* Read into the triplet array from a file */
+    for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
+	if ( fscanf(fp, "%lld%lld%lf%lf\n", &ri, &ci, &val[nz].r, &val[nz].i) != 4 )
+	    ABORT("zreadtriple_noheader: cannot re-read a triplet on the second pass");
+	row[nz] = ri;
+	col[nz] = ci;
+
+	if ( !zero_base ) {
+	    /* Change to 0-based indexing. */
+	    --row[nz];
+	    --col[nz];
+	}
+
+	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
+	    /*|| val[nz] == 0.*/) {
+	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = {%e\t%e} out of bound, removed\n", 
+		    nz, row[nz], col[nz], val[nz].r, val[nz].i);
+	    exit(-1);
+	} else {
+	    ++xa[col[nz]];
+#ifdef EXPAND_SYM
+	    if ( row[nz] != col[nz] ) { /* Excluding diagonal */
+	      ++nz;
+	      row[nz] = col[nz-1];
+	      col[nz] = row[nz-1];
+	      val[nz] = val[nz-1];
+	      ++xa[col[nz]];
+	    }
+#endif	
+	    ++nz;
+	}
+    }
+
+    *nonz = nz;
+#ifdef EXPAND_SYM
+    printf("new_nonz after symmetric expansion:\t%lld\n", (long long) *nonz);
+#endif
+    
+
+    /* Initialize the array of column pointers */
+    k = 0;
+    jsize = xa[0];
+    xa[0] = 0;
+    for (j = 1; j < *n; ++j) {
+	k += jsize;
+	jsize = xa[j];
+	xa[j] = k;
+    }
+    
+    /* Copy the triplets into the column oriented storage */
+    for (nz = 0; nz < *nonz; ++nz) {
+	j = col[nz];
+	k = xa[j];
+	asub[k] = row[nz];
+	a[k] = val[nz];
+	++xa[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column */
+    for (j = *n; j > 0; --j)
+	xa[j] = xa[j-1];
+    xa[0] = 0;
+
+    SUPERLU_FREE(val);
+    SUPERLU_FREE(row);
+    SUPERLU_FREE(col);
+
+#ifdef CHK_INPUT
+    for (i = 0; i < *n; i++) {
+	printf("Col %lld, xa %lld\n", (long long) i, (long long) xa[i]);
+	for (k = xa[i]; k < xa[i+1]; k++)
+	    printf("%lld\t%16.10f\t%16.10f\n", (long long) asub[k], a[k].r, a[k].i);
+    }
+#endif
+
+}
+
+#if 0 /* Compiled out: zreadrhs() is also defined in SRC/zreadtriple.c. */
+void zreadrhs(int m, doublecomplex *b)
+{
+    FILE *fp, *fopen();
+    int i, j;
+
+    if ( !(fp = fopen("b.dat", "r")) ) {
+        fprintf(stderr, "zreadrhs: file does not exist\n");
+	exit(-1);
+    }
+    for (i = 0; i < m; ++i)
+      fscanf(fp, "%lf%lf\n", &(b[i].r), &(b[i].i));
+
+    fclose(fp);
+}
+#endif
+
diff --git a/SRC/zscatter.c b/SRC/zscatter.c
new file mode 100644
index 0000000..069d3b1
--- /dev/null
+++ b/SRC/zscatter.c
@@ -0,0 +1,516 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Scatter the computed blocks into LU destination.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_zdefs.h"
+
+static void
+zscatter_l_1 (int ib,          /* id of the destination block looked up in index[] */
+           int ljb,            /* selects Lrowind_bc_ptr[ljb] / Lnzval_bc_ptr[ljb] */
+           int nsupc,          /* number of destination columns visited */
+           int_t iukp,         /* offset into usub[] of the per-column entries */
+           int_t* xsup,        /* not referenced by name here; presumably used by FstBlockC() */
+           int klst,           /* segsize of column jj is klst - usub[iukp + jj] */
+           int nbrow,          /* tempv advances by nbrow after each column with a nonzero segment */
+           int_t lptr,         /* offset into lsub[] of the source row indices */
+           int temp_nbrow,     /* number of entries taken from tempv per column */
+           int * usub,
+           int * lsub,
+           doublecomplex *tempv,      /* values subtracted from the destination block */
+           int * indirect_thread,     /* scratch: (row index - fnz) -> position in the dest. block */
+           int_t ** Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr,
+	   gridinfo_t * grid)  /* not referenced by name in this body */
+{
+    // TAU_STATIC_TIMER_START("SCATTER_LB");
+    /* Subtracts tempv from block ib of Lnzval_bc_ptr[ljb]; z_sub(c,a,b) is presumably c = a - b (TODO confirm). */
+    int_t rel, i, segsize, jj;
+    doublecomplex *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+    while (ijb != ib)
+    {
+        /* Search for dest block --
+           blocks are not ordered! */
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        /* NOTE(review): no end-of-list check; block ib is assumed to be present in index[]. */
+        ijb = index[lptrj];
+    }
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    lptrj += LB_DESCRIPTOR;
+    for (i = 0; i < index[lptrj - 1]; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    /* nzval walks the destination block one column (ldv entries) at a time. */
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        /* A column with segsize == 0 is skipped and consumes no column of tempv. */
+        if (segsize) {
+            /*#pragma _CRI cache_bypass nzval,tempv */
+            for (i = 0; i < temp_nbrow; ++i) {
+                rel = lsub[lptr + i] - fnz;
+	        z_sub(&nzval[indirect_thread[rel]], &nzval[indirect_thread[rel]],
+                         &tempv[i]);
+                /* NOTE(review): the PI_DEBUG code below applies '+' to doublecomplex values -- confirm it compiles. */
+#ifdef PI_DEBUG
+                double zz = 0.0;
+                // if(!(*(long*)&zz == *(long*)&tempv[i]) )
+                printf ("(%d %d, %0.3e, %0.3e, %3e ) ", ljb,
+                        nzval - Lnzval_bc_ptr[ljb] + indirect_thread[rel],
+                        nzval[indirect_thread[rel]] + tempv[i],
+                        nzval[indirect_thread[rel]],tempv[i]);
+                //printing triplets (location??, old value, new value ) if none of them is zero
+#endif
+            }
+            /* Move on to the next column of the source values. */
+            tempv += nbrow;
+#ifdef PI_DEBUG
+            // printf("\n");
+#endif
+        }
+        nzval += ldv;
+        /* nzval is advanced for every column, whether or not it was updated. */
+    }
+    // TAU_STATIC_TIMER_STOP("SCATTER_LB");
+} /* zscatter_l_1 */
+
+static void
+zscatter_l (
+           int ib,    /* row block number of source block L(i,k) */
+           int ljb,   /* local column block number of dest. block L(i,j) */
+           int nsupc, /* number of columns in destination supernode */
+           int_t iukp, /* point to destination supernode's index[] */
+           int_t* xsup, /* not referenced by name here; presumably used by FstBlockC() */
+           int klst,  /* segsize of column jj is klst - usub[iukp + jj] */
+           int nbrow, /* tempv advances by nbrow after each column with a nonzero segment */
+           int_t lptr, /* Input, point to index[] location of block L(i,k) */
+	   int temp_nbrow, /* number of rows in block L(i,k) */
+           int_t* usub,
+           int_t* lsub,
+           doublecomplex *tempv, /* values subtracted from the destination block */
+           int* indirect_thread,int* indirect2, /* scratch tables filled below */
+           int_t ** Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr,
+           gridinfo_t * grid) /* not referenced by name in this body */
+{
+    /* Subtracts tempv from block ib of Lnzval_bc_ptr[ljb]; z_sub(c,a,b) is presumably c = a - b (TODO confirm). */
+    int_t rel, i, segsize, jj;
+    doublecomplex *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+    /* NOTE(review): the search below has no end-of-list check; block ib is assumed to be present. */
+    while (ijb != ib)  /* Search for destination block L(i,j) */
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+    
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow; 
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow=index[lptrj - 1];
+    /* indirect_thread[row index - fnz] = position of that row in the destination block. */
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] =indirect_thread[rel]; 
+    }
+    /* indirect2[i] is now the destination position of source row i. */
+    nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Dest. block L(i,j) */
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                z_sub(&nzval[indirect2[i]], &nzval[indirect2[i]], &tempv[i]);
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+    
+} /* zscatter_l */
+
+
+static void
+zscatter_u (int ib,            /* block row; lib = LBi(ib, grid) selects the U row arrays */
+           int jb,             /* block id searched for in index[] */
+           int nsupc,          /* number of destination columns visited */
+           int_t iukp,         /* offset into usub[] of the per-column entries */
+           int_t * xsup,       /* not referenced by name here; presumably used by FstBlockC()/SuperSize() */
+           int klst,           /* segsize of column jj is klst - usub[iukp + jj] */
+           int nbrow,          /* tempv advances by nbrow after each column with a nonzero segment */
+           int_t lptr,         /* offset into lsub[] of the source row indices */
+           int temp_nbrow,     /* number of entries taken from tempv per column */
+           int_t* lsub,
+           int_t* usub,
+           doublecomplex* tempv,   /* values subtracted from the destination block */
+           int_t ** Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr,
+           gridinfo_t * grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    // TAU_STATIC_TIMER_START("SCATTER_U");
+    // TAU_STATIC_TIMER_START("SCATTER_UB");
+    /* Subtracts tempv from a block of Unzval_br_ptr[lib]; z_sub(c,a,b) is presumably c = a - b (TODO confirm). */
+    int_t jj, i, fnz, rel;
+    int segsize;
+    doublecomplex *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* Reinitialize the pointers to the beginning of the 
+     * k-th column/row of L/U factors.
+     * usub[] - index array for panel U(k,:)
+     */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+        /* Step over this block's descriptor and its SuperSize(ijb) index entries. */
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point to fstnz index of
+       block U(i,j). */
+    iuip_lib += UB_DESCRIPTOR;
+
+    /* One fstnz entry of index[] is consumed per column, whether or not the column is updated. */
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        if (segsize)            /* Nonzero segment in U(k.j). */
+        {
+            ucol = &Unzval_br_ptr[lib][ruip_lib];
+
+            /* Scatter the temp_nbrow source values of this column into ucol. */
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+
+                rel = lsub[lptr + i] - fnz;
+                /* rel is the offset of source row lsub[lptr + i] within the
+                   stored segment, which starts at row fnz. */
+
+                z_sub(&ucol[rel], &ucol[rel], &tempv[i]);
+
+
+
+#ifdef PI_DEBUG
+                double zz = 0.0;
+                if (!(*(long *) &zz == *(long *) &tempv[i]))
+                    printf ("(%d, %0.3e, %0.3e ) ", rel, ucol[rel] + tempv[i],
+                            ucol[rel]);
+                //printing triplets (location??, old value, new value ) if none of them is zero
+#endif
+            }                   /* for i=0..temp_nbrow */
+            tempv += nbrow;
+#ifdef PI_DEBUG
+            // printf("\n");
+#endif
+        }                       /* if segsize */
+        ruip_lib += ilst - fnz;
+        /* ruip_lib now points past the ilst - fnz values stored for this column. */
+    }                           /*for jj=0:nsupc */
+#ifdef PI_DEBUG
+    // printf("\n");
+#endif
+    // TAU_STATIC_TIMER_STOP("SCATTER_UB");
+} /* zscatter_u */
+
+
+/* Divide CPU-GPU dgemm work here. */
+#ifdef PI_DEBUG
+int Ngem = 2;           /* debug-only file-scope value; the GPU_ACC routines below declare a local Ngem */
+// int_t Ngem = 0;
+int min_gpu_col = 6;    /* debug-only file-scope value; the GPU_ACC routines below declare a local min_gpu_col */
+#else
+
+    // int_t Ngem = 0;
+
+#endif
+
+
+#ifdef GPU_ACC
+
+/* Split num_blks column blocks between the CPU and up to nstreams GPU streams.
+ * full_u_cols[] is read as a running (prefix) column count.  On return
+ * *ncpu_blks blocks are left to the CPU, stream_end_col[s] is the end block of
+ * stream s, and *num_streams_used is the number of streams given work (0 = none).
+ * NOTE(review): nbrow * ldu is used as a divisor and num_blks - 1 as an index,
+ * so callers are assumed to pass nbrow, ldu and num_blks >= 1 -- confirm. */
+void
+gemm_division_cpu_gpu(
+    int* num_streams_used,  /*number of streams that will be used */
+    int* stream_end_col,    /*array holding last column blk for each partition */
+    int * ncpu_blks,        /*Number of CPU dgemm blks */
+    /*input */
+    int nbrow,              /*number of row in A matrix */
+    int ldu,                /*number of k in dgemm */
+    int nstreams, 
+    int* full_u_cols,       /*array containing prefix sum of work load */
+    int num_blks            /*Number of work load */
+)
+{
+    int Ngem = sp_ienv(7);  /*get_mnk_dgemm ();*/
+    int min_gpu_col = get_cublas_nb ();
+
+    /*
+       cpu is to gpu dgemm should be ideally 0:1 ratios to hide the total cost
+       However since there is gpu latency of around 20,000 ns implying about
+       200000 floating point calculation be done in that time so ~200,000/(2*nbrow*ldu)
+       should be done in cpu to hide the latency; we Ngem =200,000/2 
+     */
+    int i, j;
+
+    /* Default: every stream ends at the last block. */
+    for (int i = 0; i < nstreams; ++i)
+    {
+        stream_end_col[i] = num_blks;
+    }
+    *num_streams_used = 0;  /* stays 0 on every path that assigns no GPU work */
+    *ncpu_blks = 0;
+    /*easy returns -1 when number of column are less than threshold */
+    if (full_u_cols[num_blks - 1] < (Ngem / (nbrow * ldu)) || num_blks == 1 )
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+#ifdef PI_DEBUG
+        printf ("full_u_cols[num_blks-1] %d  %d \n",
+                full_u_cols[num_blks - 1], (Ngem / (nbrow * ldu)));
+        printf ("Early return \n");
+#endif
+        return;
+
+    }
+
+    /* Easy return -2 when number of streams =0 */
+    if (nstreams == 0)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+        return;
+        /* code */
+    }
+    /*find first block where count > Ngem */
+
+
+    for (i = 0; i < num_blks - 1; ++i)  /*I can use binary search here */
+    {
+        if (full_u_cols[i + 1] > Ngem / (nbrow * ldu))
+            break;
+    }
+    *ncpu_blks = i + 1;
+
+    int_t cols_remain =
+        full_u_cols[num_blks - 1] - full_u_cols[*ncpu_blks - 1];
+
+#ifdef PI_DEBUG
+    printf ("Remaining cols %d num_blks %d cpu_blks %d \n", cols_remain,
+            num_blks, *ncpu_blks);
+#endif
+    if (cols_remain > 0)
+    {
+        *num_streams_used = 1;  /* now atleast one stream would be used */
+
+#ifdef PI_DEBUG
+        printf ("%d %d  %d %d \n", full_u_cols[num_blks - 1],
+                full_u_cols[*ncpu_blks], *ncpu_blks, nstreams);
+#endif
+        int_t FP_MIN = 200000 / (nbrow * ldu);
+        int_t cols_per_stream = SUPERLU_MAX (min_gpu_col, cols_remain / nstreams);
+        cols_per_stream = SUPERLU_MAX (cols_per_stream, FP_MIN);
+#ifdef PI_DEBUG
+        printf ("cols_per_stream :\t%d\n", cols_per_stream);
+#endif
+
+        int_t cutoff = cols_per_stream + full_u_cols[*ncpu_blks - 1];
+        for (int_t i = 0; i < nstreams; ++i)
+        {
+            stream_end_col[i] = num_blks;
+        }
+        j = *ncpu_blks;
+        for (i = 0; i < nstreams - 1; ++i)
+        {
+            int_t st = (i == 0) ? (*ncpu_blks) : stream_end_col[i - 1];
+
+            for (j = st; j < num_blks - 1; ++j)
+            {
+#ifdef PI_DEBUG
+                printf ("i %d, j %d, %d  %d ", i, j, full_u_cols[j + 1],
+                        cutoff);
+#endif
+                if (full_u_cols[j + 1] > cutoff)
+                {
+#ifdef PI_DEBUG
+                    printf ("cutoff met \n");
+#endif
+                    cutoff = cols_per_stream + full_u_cols[j];
+                    stream_end_col[i] = j + 1;
+                    *num_streams_used += 1;
+                    j++;
+                    break;
+                }
+#ifdef PI_DEBUG
+                printf ("\n");
+#endif
+            }
+
+        }
+
+    }
+}
+
+/* Split num_blks column blocks between the CPU and up to nstreams GPU streams.
+ * Ublock_info[k].full_u_cols is read as a running (prefix) column count.  On
+ * return *ncpu_blks blocks are left to the CPU, stream_end_col[s] is the end block
+ * of stream s, and *num_streams_used is the number of streams given work (0 = none).
+ * NOTE(review): nbrow * ldu is used as a divisor and num_blks - 1 as an index,
+ * so callers are assumed to pass nbrow, ldu and num_blks >= 1 -- confirm. */
+void
+gemm_division_new (int * num_streams_used,   /*number of streams that will be used */
+                   int * stream_end_col, /*array holding last column blk for each partition */
+                   int * ncpu_blks,  /*Number of CPU dgemm blks */
+                        /*input */
+                   int nbrow,    /*number of row in A matrix */
+                   int ldu,  /*number of k in dgemm */
+                   int nstreams,
+                   Ublock_info_t *Ublock_info,    /*array containing prefix sum of work load */
+                   int num_blks  /*Number of work load */
+    )
+{
+    int Ngem = sp_ienv(7); /*get_mnk_dgemm ();*/
+    int min_gpu_col = get_cublas_nb ();
+
+    /*
+       cpu is to gpu dgemm should be ideally 0:1 ratios to hide the total cost
+       However since there is gpu latency of around 20,000 ns implying about
+       200000 floating point calculation be done in that time so ~200,000/(2*nbrow*ldu)
+       should be done in cpu to hide the latency; we Ngem =200,000/2 
+     */
+    int_t i, j;
+
+    for (int i = 0; i < nstreams; ++i)
+    {
+        stream_end_col[i] = num_blks;
+    }
+
+    *num_streams_used = 0;  /* stays 0 on every path that assigns no GPU work */
+    *ncpu_blks = 0;
+    /*easy returns -1 when number of column are less than threshold */
+    if (Ublock_info[num_blks - 1].full_u_cols < (Ngem / (nbrow * ldu)) || num_blks == 1)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+        return;
+    }
+
+    /* Easy return -2 when number of streams =0 */
+    if (nstreams == 0)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+        return;
+    }
+    /*find first block where count > Ngem */
+
+    for (i = 0; i < num_blks - 1; ++i)  /*I can use binary search here */
+    {
+        if (Ublock_info[i + 1].full_u_cols > Ngem / (nbrow * ldu))
+            break;
+    }
+    *ncpu_blks = i + 1;
+
+    int_t cols_remain =
+       Ublock_info [num_blks - 1].full_u_cols - Ublock_info[*ncpu_blks - 1].full_u_cols;
+
+    if (cols_remain > 0)
+    {
+        *num_streams_used = 1;  /* now atleast one stream would be used */
+
+        int_t FP_MIN = 200000 / (nbrow * ldu);
+        int_t cols_per_stream = SUPERLU_MAX (min_gpu_col, cols_remain / nstreams);
+        cols_per_stream = SUPERLU_MAX (cols_per_stream, FP_MIN);
+
+        int_t cutoff = cols_per_stream + Ublock_info[*ncpu_blks - 1].full_u_cols;
+        for (int_t i = 0; i < nstreams; ++i)
+        {
+            stream_end_col[i] = num_blks;
+        }
+        j = *ncpu_blks;
+        for (i = 0; i < nstreams - 1; ++i)
+        {
+            int_t st = (i == 0) ? (*ncpu_blks) : stream_end_col[i - 1];
+
+            for (j = st; j < num_blks - 1; ++j)
+            {
+                if (Ublock_info[j + 1].full_u_cols > cutoff)
+                {
+                    cutoff = cols_per_stream + Ublock_info[j].full_u_cols;
+                    stream_end_col[i] = j + 1;
+                    *num_streams_used += 1;
+                    j++;
+                    break;
+                }
+
+            }
+
+        }
+
+    }
+}
+
+#endif  /* defined GPU_ACC */
diff --git a/SRC/zsp_blas2_dist.c b/SRC/zsp_blas2_dist.c
new file mode 100644
index 0000000..c8b3ba3
--- /dev/null
+++ b/SRC/zsp_blas2_dist.c
@@ -0,0 +1,515 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file
+ * \brief Solves one of the systems of equations A*x = b,   or   A'*x = b
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+/*
+ * File name:		sp_blas2.c
+ * Purpose:		Sparse BLAS 2, using some dense BLAS 2 operations.
+ */
+
+#include "superlu_zdefs.h"
+
+
+/* 
+ * Function prototypes 
+ */
+#ifndef USE_VENDOR_BLAS
+void zusolve(int, int, doublecomplex*, doublecomplex*);
+void zlsolve(int, int, doublecomplex*, doublecomplex*);
+void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*);
+#endif
+
+
+/*! \brief
+ *
+ * <pre>
+ *   Purpose
+ *   =======
+ *
+ *   sp_ztrsv() solves one of the systems of equations   
+ *       A*x = b,   or   A'*x = b,
+ *   where b and x are n element vectors and A is a sparse unit , or   
+ *   non-unit, upper or lower triangular matrix.   
+ *   No test for singularity or near-singularity is included in this   
+ *   routine. Such tests must be performed before calling this routine.   
+ *
+ *   Parameters   
+ *   ==========   
+ *
+ *   uplo   - (input) char*
+ *            On entry, uplo specifies whether the matrix is an upper or   
+ *             lower triangular matrix as follows:   
+ *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
+ *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
+ *
+ *   trans  - (input) char*
+ *             On entry, trans specifies the equations to be solved as   
+ *             follows:   
+ *                trans = 'N' or 'n'   A*x = b.   
+ *                trans = 'T' or 't'   A'*x = b.   
+ *                trans = 'C' or 'c'   A'*x = b.   
+ *
+ *   diag   - (input) char*
+ *             On entry, diag specifies whether or not A is unit   
+ *             triangular as follows:   
+ *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
+ *                diag = 'N' or 'n'   A is not assumed to be unit   
+ *                                    triangular.   
+ *	     
+ *   L       - (input) SuperMatrix*
+ *	       The factor L from the factorization Pr*A*Pc=L*U. Use
+ *             compressed row subscripts storage for supernodes,
+ *             i.e., L has types: Stype = SC, Dtype = Z, Mtype = TRLU.
+ *
+ *   U       - (input) SuperMatrix*
+ *	        The factor U from the factorization Pr*A*Pc=L*U.
+ *	        U has types: Stype = NC, Dtype = Z, Mtype = TRU.
+ *    
+ *   x       - (input/output) doublecomplex*
+ *             Before entry, the incremented array X must contain the n   
+ *             element right-hand side vector b. On exit, X is overwritten 
+ *             with the solution vector x.
+ *
+ *   info    - (output) int*
+ *             If *info = -i, the i-th argument had an illegal value.
+ * </pre>
+ */
+int
+sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, 
+	      SuperMatrix *U, doublecomplex *x, int *info)
+{
+    /*
+     * In-place sparse triangular solve: x holds the right-hand side on
+     * entry and the solution on exit.  uplo selects the supernodal factor
+     * L ("L") or the column-compressed factor U ("U").
+     *
+     * The function always returns 0.  An illegal argument is reported by
+     * setting *info to minus the argument index and calling xerr_dist().
+     * Note that the argument check below only accepts "N" and "T" for
+     * trans.
+     */
+#ifdef _CRAY
+    _fcd ftcs1 = _cptofcd("L", strlen("L")),
+	 ftcs2 = _cptofcd("N", strlen("N")),
+	 ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+    SCformat *Lstore;
+    NCformat *Ustore;
+    doublecomplex   *Lval, *Uval;
+    int incx = 1, incy = 1;
+    doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
+    doublecomplex comp_zero = {0.0, 0.0};
+    /* Scratch for element products.  It must be distinct from comp_zero:
+       zz_mult() writes its first argument, and comp_zero is later used to
+       reset work[], which has to stay exactly zero between supernodes. */
+    doublecomplex temp;
+    int nrow;
+    int fsupc, nsupr, nsupc, luptr, istart, irow;
+    int i, k, iptr, jcol;
+    doublecomplex *work;
+    flops_t solve_ops;
+    /*extern SuperLUStat_t SuperLUStat;*/
+
+    /* Test the input parameters */
+    *info = 0;
+    if ( strncmp(uplo,"L",1) != 0 && strncmp(uplo, "U", 1) != 0 ) *info = -1;
+    else if ( strncmp(trans, "N", 1) != 0 && strncmp(trans, "T", 1) != 0 )
+	*info = -2;
+    else if ( strncmp(diag, "U", 1) != 0 && strncmp(diag, "N", 1) !=0 )
+	*info = -3;
+    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
+    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
+    if ( *info ) {
+	i = -(*info);
+	xerr_dist("sp_ztrsv", &i);
+	return 0;
+    }
+
+    Lstore = L->Store;
+    Lval = Lstore->nzval;
+    Ustore = U->Store;
+    Uval = Ustore->nzval;
+    solve_ops = 0;
+
+    /* Zero-initialised scratch vector that receives the off-diagonal
+       update of each supernode before it is scattered into x. */
+    if ( !(work = doublecomplexCalloc_dist(L->nrow)) )
+	ABORT("Malloc fails for work in sp_ztrsv().");
+    
+    if ( strncmp(trans, "N", 1)==0 ) {	/* Form x := inv(A)*x. */
+	
+	if ( strncmp(uplo, "L", 1)==0 ) {
+	    /* Form x := inv(L)*x */
+	    if ( L->nrow == 0 ) {	/* Quick return */
+		SUPERLU_FREE(work);	/* do not leak the scratch vector */
+		return 0;
+	    }
+	    
+	    for (k = 0; k <= Lstore->nsuper; k++) {
+		fsupc = L_FST_SUPC(k);
+		istart = L_SUB_START(fsupc);
+		nsupr = L_SUB_START(fsupc+1) - istart;
+		nsupc = L_FST_SUPC(k+1) - fsupc;
+		luptr = L_NZ_START(fsupc);
+		nrow = nsupr - nsupc;	/* rows below the diagonal block */
+
+	        solve_ops += 4 * nsupc * (nsupc - 1);
+	        solve_ops += 8 * nrow * nsupc;
+
+		if ( nsupc == 1 ) {
+		    /* Single column: skip the diagonal entry and update
+		       the rows below it directly. */
+		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
+			irow = L_SUB(iptr);
+			++luptr;
+			zz_mult(&temp, &x[fsupc], &Lval[luptr]);
+			z_sub(&x[irow], &x[irow], &temp);
+		    }
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+		       	&x[fsupc], &incx);
+		
+		    CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
+		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
+#else
+		    ztrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
+		       	&x[fsupc], &incx, 1, 1, 1);
+		
+		    zgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
+		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy, 1);
+#endif		
+#else
+		    zlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
+		
+		    zmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
+			&x[fsupc], &work[0] );
+#endif		
+		
+		    iptr = istart + nsupc;
+		    for (i = 0; i < nrow; ++i, ++iptr) {
+			irow = L_SUB(iptr);
+			z_sub(&x[irow], &x[irow], &work[i]); /* Scatter */
+			work[i] = comp_zero;	/* reset for next supernode */
+
+		    }
+	 	}
+	    } /* for k ... */
+	    
+	} else {
+	    /* Form x := inv(U)*x */
+	    
+	    if ( U->nrow == 0 ) {	/* Quick return */
+		SUPERLU_FREE(work);
+		return 0;
+	    }
+	    
+	    for (k = Lstore->nsuper; k >= 0; k--) {
+	    	fsupc = L_FST_SUPC(k);
+	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
+	    	nsupc = L_FST_SUPC(k+1) - fsupc;
+	    	luptr = L_NZ_START(fsupc);
+		
+    	        solve_ops += 4 * nsupc * (nsupc + 1);
+
+		if ( nsupc == 1 ) {
+		    /* The diagonal entry is read from Lval. */
+		    slud_z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
+		    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
+			irow = U_SUB(i);
+			zz_mult(&temp, &x[fsupc], &Uval[i]);
+			z_sub(&x[irow], &x[irow], &temp);
+		    }
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		    CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
+		       &x[fsupc], &incx);
+#else
+		    ztrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
+		       &x[fsupc], &incx, 1, 1, 1);
+#endif
+#else		
+		    zusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
+#endif		
+
+		    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
+		        solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
+		    	for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); 
+				i++) {
+			    irow = U_SUB(i);
+			    zz_mult(&temp, &x[jcol], &Uval[i]);
+			    z_sub(&x[irow], &x[irow], &temp);
+		    	}
+                    }
+		}
+	    } /* for k ... */
+	    
+	}
+    } else { /* Form x := inv(A')*x */
+	
+	if ( strncmp(uplo, "L", 1)==0 ) {
+	    /* Form x := inv(L')*x */
+	    if ( L->nrow == 0 ) {	/* Quick return */
+		SUPERLU_FREE(work);
+		return 0;
+	    }
+	    
+	    for (k = Lstore->nsuper; k >= 0; --k) {
+	    	fsupc = L_FST_SUPC(k);
+	    	istart = L_SUB_START(fsupc);
+	    	nsupr = L_SUB_START(fsupc+1) - istart;
+	    	nsupc = L_FST_SUPC(k+1) - fsupc;
+	    	luptr = L_NZ_START(fsupc);
+
+		solve_ops += 8 * (nsupr - nsupc) * nsupc;
+
+		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
+		    iptr = istart + nsupc;
+		    for (i = L_NZ_START(jcol) + nsupc; 
+				i < L_NZ_START(jcol+1); i++) {
+			irow = L_SUB(iptr);
+			zz_mult(&temp, &x[irow], &Lval[i]);
+		    	z_sub(&x[jcol], &x[jcol], &temp);
+			iptr++;
+		    }
+		}
+		
+		if ( nsupc > 1 ) {
+		    solve_ops += 4 * nsupc * (nsupc - 1);
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("L", strlen("L"));
+                    ftcs2 = _cptofcd("T", strlen("T"));
+                    ftcs3 = _cptofcd("U", strlen("U"));
+		    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx);
+#else
+		    ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx, 1, 1, 1);
+#endif
+#else
+		    ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx);
+#endif
+		}
+	    }
+	} else {
+	    /* Form x := inv(U')*x */
+	    if ( U->nrow == 0 ) {	/* Quick return */
+		SUPERLU_FREE(work);
+		return 0;
+	    }
+	    
+	    for (k = 0; k <= Lstore->nsuper; k++) {
+	    	fsupc = L_FST_SUPC(k);
+	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
+	    	nsupc = L_FST_SUPC(k+1) - fsupc;
+	    	luptr = L_NZ_START(fsupc);
+
+		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
+		    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
+		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
+			irow = U_SUB(i);
+			zz_mult(&temp, &x[irow], &Uval[i]);
+		    	z_sub(&x[jcol], &x[jcol], &temp);
+		    }
+		}
+
+		solve_ops += 4 * nsupc * (nsupc + 1);
+
+		if ( nsupc == 1 ) {
+		    slud_z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("U", strlen("U"));
+                    ftcs2 = _cptofcd("T", strlen("T"));
+                    ftcs3 = _cptofcd("N", strlen("N"));
+		    CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx);
+#else
+		    ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx, 1, 1, 1);
+#endif
+#else
+		    ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx);
+#endif
+		}
+	    } /* for k ... */
+	}
+    }
+
+    /*SuperLUStat.ops[SOLVE] += solve_ops;*/
+    SUPERLU_FREE(work);
+    return 0;
+}
+
+
+
+/*! \brief
+
+<pre>
+  Purpose   
+    =======   
+
+    sp_zgemv()  performs one of the matrix-vector operations   
+       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
+    where alpha and beta are scalars, x and y are vectors and A is a
+    sparse A->nrow by A->ncol matrix.   
+
+    Parameters   
+    ==========   
+
+    TRANS  - (input) char*
+             On entry, TRANS specifies the operation to be performed as   
+             follows:   
+                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
+                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
+                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
+
+    ALPHA  - (input) doublecomplex
+             On entry, ALPHA specifies the scalar alpha.   
+
+    A      - (input) SuperMatrix*
+             Before entry, the leading m by n part of the array A must   
+             contain the matrix of coefficients.   
+
+    X      - (input) doublecomplex*, array of DIMENSION at least   
+             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
+             Before entry, the incremented array X must contain the   
+             vector x.   
+
+    INCX   - (input) int
+             On entry, INCX specifies the increment for the elements of   
+             X. INCX must not be zero.   
+
+    BETA   - (input) doublecomplex
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then Y need not be set on input.   
+
+    Y      - (output) doublecomplex*,  array of DIMENSION at least   
+             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
+             Before entry with BETA non-zero, the incremented array Y   
+             must contain the vector y. On exit, Y is overwritten by the 
+             updated vector y.
+	     
+    INCY   - (input) int
+             On entry, INCY specifies the increment for the elements of   
+             Y. INCY must not be zero.   
+
+    ==== Sparse Level 2 Blas routine.   
+</pre>
+*/
+int
+sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A,
+	      doublecomplex *x, int incx, doublecomplex beta,
+	      doublecomplex *y, int incy)
+{
+    /*
+     * y := alpha*op(A)*x + beta*y for a sparse matrix A stored by columns
+     * (colptr / rowind / nzval).  Always returns 0; an invalid argument is
+     * reported through xerr_dist().  Only incy == 1 is implemented for the
+     * non-transposed product and only incx == 1 for the transposed one.
+     */
+    doublecomplex czero = {0.0, 0.0};
+    doublecomplex cone = {1.0, 0.0};
+    doublecomplex scaled, prod;
+    NCformat *Astore;
+    doublecomplex *Aval;
+    int info;
+    int no_trans;
+    int beta_is_zero;
+    int xlen, ylen;
+    int xstart, ystart;
+    int xpos, ypos;
+    int col, p, row, n;
+
+    no_trans = (strncmp(trans, "N", 1) == 0);
+    Astore = A->Store;
+    Aval = Astore->nzval;
+
+    /* Argument validation: the first failing check decides the code. */
+    info = 0;
+    if ( !no_trans && strncmp(trans, "T", 1) != 0
+	 && strncmp(trans, "C", 1) != 0 ) {
+	info = 1;
+    } else if ( A->nrow < 0 || A->ncol < 0 ) {
+	info = 3;
+    } else if ( incx == 0 ) {
+	info = 5;
+    } else if ( incy == 0 ) {
+	info = 8;
+    }
+    if ( info != 0 ) {
+	xerr_dist("sp_zgemv ", &info);
+	return 0;
+    }
+
+    /* Nothing to do for an empty matrix, or when alpha = 0 and beta = 1. */
+    if ( A->nrow == 0 || A->ncol == 0 ||
+	 ( z_eq(&alpha, &czero) && z_eq(&beta, &cone) ) )
+	return 0;
+
+    /* Vector lengths depend on whether A or A' is applied. */
+    if ( no_trans ) {
+	xlen = A->ncol;
+	ylen = A->nrow;
+    } else {
+	xlen = A->nrow;
+	ylen = A->ncol;
+    }
+
+    /* Starting offsets: a negative stride walks the vector backwards. */
+    xstart = (incx > 0) ? 0 : - (xlen - 1) * incx;
+    ystart = (incy > 0) ? 0 : - (ylen - 1) * incy;
+
+    /* First form  y := beta*y.  A unit stride gives ystart == 0, so one
+       strided loop covers both the contiguous and the strided layout. */
+    if ( !z_eq(&beta, &cone) ) {
+	beta_is_zero = z_eq(&beta, &czero);
+	ypos = ystart;
+	for (n = 0; n < ylen; ++n) {
+	    if ( beta_is_zero ) {
+		y[ypos] = czero;
+	    } else {
+		zz_mult(&y[ypos], &beta, &y[ypos]);
+	    }
+	    ypos += incy;
+	}
+    }
+
+    if ( z_eq(&alpha, &czero) ) return 0;
+
+    if ( no_trans ) {
+	/* Form  y := alpha*A*x + y, one column of A at a time. */
+	if ( incy != 1 ) {
+	    ABORT("Not implemented.");
+	} else {
+	    xpos = xstart;
+	    for (col = 0; col < A->ncol; ++col, xpos += incx) {
+		if ( z_eq(&x[xpos], &czero) ) continue;  /* zero column weight */
+		zz_mult(&scaled, &alpha, &x[xpos]);
+		for (p = Astore->colptr[col]; p < Astore->colptr[col+1]; ++p) {
+		    row = Astore->rowind[p];
+		    zz_mult(&prod, &scaled, &Aval[p]);
+		    z_add(&y[row], &y[row], &prod);
+		}
+	    }
+	}
+    } else {
+	/* Form  y := alpha*A'*x + y: each column of A gives one dot product. */
+	if ( incx != 1 ) {
+	    ABORT("Not implemented.");
+	} else {
+	    ypos = ystart;
+	    for (col = 0; col < A->ncol; ++col, ypos += incy) {
+		scaled = czero;
+		for (p = Astore->colptr[col]; p < Astore->colptr[col+1]; ++p) {
+		    row = Astore->rowind[p];
+		    zz_mult(&prod, &Aval[p], &x[row]);
+		    z_add(&scaled, &scaled, &prod);
+		}
+		zz_mult(&prod, &alpha, &scaled);
+		z_add(&y[ypos], &y[ypos], &prod);
+	    }
+	}
+    }
+    return 0;
+} /* sp_zgemv */
+
diff --git a/SRC/zsp_blas3_dist.c b/SRC/zsp_blas3_dist.c
new file mode 100644
index 0000000..92e8354
--- /dev/null
+++ b/SRC/zsp_blas3_dist.c
@@ -0,0 +1,136 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file 
+ * \brief Sparse BLAS3, using some dense BLAS3 operations
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+/*
+ * File name:		sp_blas3.c
+ * Purpose:		Sparse BLAS3, using some dense BLAS3 operations.
+ */
+
+#include "superlu_zdefs.h"
+
+/*! \brief
+
+<pre>
+  Purpose   
+    =======   
+
+    sp_zgemm_dist performs one of the matrix-matrix operations   
+
+       C := alpha*op( A )*op( B ) + beta*C,   
+
+    where  op( X ) is one of 
+
+       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
+
+    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
+    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
+  
+
+    Parameters   
+    ==========   
+
+    TRANSA - (input) char*
+             On entry, TRANSA specifies the form of op( A ) to be used in 
+             the matrix multiplication as follows:   
+                TRANSA = 'N' or 'n',  op( A ) = A.   
+                TRANSA = 'T' or 't',  op( A ) = A'.   
+                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
+             Unchanged on exit.   
+
+    TRANSB - (input) char*
+             On entry, TRANSB specifies the form of op( B ) to be used in 
+             the matrix multiplication as follows:   
+                TRANSB = 'N' or 'n',  op( B ) = B.   
+                TRANSB = 'T' or 't',  op( B ) = B'.   
+                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
+             Unchanged on exit.   
+
+    M      - (input) int   
+             On entry,  M  specifies  the number of rows of the matrix 
+	     op( A ) and of the matrix C.  M must be at least zero. 
+	     Unchanged on exit.   
+
+    N      - (input) int
+             On entry,  N specifies the number of columns of the matrix 
+	     op( B ) and the number of columns of the matrix C. N must be 
+	     at least zero.
+	     Unchanged on exit.   
+
+    K      - (input) int
+             On entry, K specifies the number of columns of the matrix 
+	     op( A ) and the number of rows of the matrix op( B ). K must 
+	     be at least  zero.   
+             Unchanged on exit.
+	     
+    ALPHA  - (input) doublecomplex
+             On entry, ALPHA specifies the scalar alpha.   
+
+    A      - (input) SuperMatrix*
+             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+             Currently, the type of A can be:
+                 Stype = NC or NCP; Dtype = Z; Mtype = GE. 
+             In the future, more general A can be handled.
+
+    B      - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDB, kb ), where kb is 
+             n when TRANSB = 'N' or 'n',  and is  k otherwise.   
+             Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
+             part of the array B must contain the matrix B, otherwise 
+             the leading n by k part of the array B must contain the 
+             matrix B.   
+             Unchanged on exit.   
+
+    LDB    - (input) int
+             On entry, LDB specifies the first dimension of B as declared 
+             in the calling (sub) program. LDB must be at least max( 1, n ).  
+             Unchanged on exit.   
+
+    BETA   - (input) doublecomplex
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then C need not be set on input.   
+
+    C      - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDC, n ).   
+             Before entry, the leading m by n part of the array C must 
+             contain the matrix C,  except when beta is zero, in which 
+             case C need not be set on entry.   
+             On exit, the array C is overwritten by the m by n matrix 
+	     ( alpha*op( A )*B + beta*C ).   
+
+    LDC    - (input) int
+             On entry, LDC specifies the first dimension of C as declared 
+             in the calling (sub)program. LDC must be at least max(1,m).   
+             Unchanged on exit.   
+
+    ==== Sparse Level 3 Blas routine.  
+</pre> 
+*/
+int
+sp_zgemm_dist(char *transa, int n, doublecomplex alpha, SuperMatrix *A,
+	      doublecomplex *b, int ldb,  doublecomplex beta,
+	      doublecomplex *c, int ldc)
+{
+    /*
+     * Applies sp_zgemv_dist() to each of the n columns of b (leading
+     * dimension ldb), writing into the matching column of c (leading
+     * dimension ldc).  Both columns are walked with unit stride.
+     * Always returns 0.
+     */
+    int    unit_stride = 1;
+    int    col = 0;
+
+    while (col < n) {
+	sp_zgemv_dist(transa, alpha, A, &b[ldb*col], unit_stride,
+		      beta, &c[ldc*col], unit_stride);
+	++col;
+    }
+    return 0;
+}
diff --git a/SRC/zutil_dist.c b/SRC/zutil_dist.c
new file mode 100644
index 0000000..5f00b08
--- /dev/null
+++ b/SRC/zutil_dist.c
@@ -0,0 +1,497 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Several matrix utilities
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+
+void
+zCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz, 
+			    doublecomplex *nzval, int_t *rowind, int_t *colptr,
+			    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    /*
+     * Fills in the header of A and attaches a freshly allocated NCformat
+     * descriptor to it.  The nzval, rowind and colptr arrays are stored by
+     * pointer, not copied.  Aborts if the descriptor cannot be allocated.
+     */
+    NCformat *store;
+
+    /* Matrix header. */
+    A->nrow = m;
+    A->ncol = n;
+    A->Stype = stype;
+    A->Dtype = dtype;
+    A->Mtype = mtype;
+
+    /* Storage descriptor. */
+    A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
+    if ( A->Store == NULL ) ABORT("SUPERLU_MALLOC fails for A->Store");
+
+    store = (NCformat *) A->Store;
+    store->nnz    = nnz;
+    store->nzval  = nzval;
+    store->rowind = rowind;
+    store->colptr = colptr;
+}
+
+void
+zCreate_CompRowLoc_Matrix_dist(SuperMatrix *A, int_t m, int_t n,
+			       int_t nnz_loc, int_t m_loc, int_t fst_row,
+			       doublecomplex *nzval, int_t *colind, int_t *rowptr,
+			       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    /*
+     * Fills in the header of A and attaches a freshly allocated
+     * NRformat_loc descriptor.  The nzval, colind and rowptr arrays are
+     * stored by pointer, not copied.  Aborts if the descriptor cannot be
+     * allocated.
+     */
+    NRformat_loc *store;
+
+    /* Matrix header. */
+    A->nrow = m;
+    A->ncol = n;
+    A->Stype = stype;
+    A->Dtype = dtype;
+    A->Mtype = mtype;
+
+    /* Storage descriptor. */
+    A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat_loc) );
+    if ( A->Store == NULL ) ABORT("SUPERLU_MALLOC fails for A->Store");
+
+    store = (NRformat_loc *) A->Store;
+    store->m_loc   = m_loc;
+    store->fst_row = fst_row;
+    store->nnz_loc = nnz_loc;
+    store->nzval   = nzval;
+    store->colind  = colind;
+    store->rowptr  = rowptr;
+}
+
+/*! \brief Convert a row compressed storage into a column compressed storage.
+ *
+ * Input is an m-by-n matrix with nnz entries in (a, colind, rowptr).
+ * On return *at, *rowind and *colptr point to newly allocated arrays that
+ * hold the same entries grouped by column.  The routine aborts if any
+ * allocation fails.
+ */
+void
+zCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz, 
+                         doublecomplex *a, int_t *colind, int_t *rowptr,
+                         doublecomplex **at, int_t **rowind, int_t **colptr)
+{
+    /* All indices are int_t: positions range over nnz, which need not
+       fit in a plain int when int_t is a 64-bit type. */
+    int_t i, j, col, relpos;
+    int_t *marker;
+
+    /* Allocate storage for another copy of the matrix. */
+    if ( !(*at = (doublecomplex *) doublecomplexMalloc_dist(nnz)) )
+	ABORT("Malloc fails for at[] in zCompRow_to_CompCol_dist().");
+    if ( !(*rowind = intMalloc_dist(nnz)) )
+	ABORT("Malloc fails for rowind[] in zCompRow_to_CompCol_dist().");
+    if ( !(*colptr = intMalloc_dist(n+1)) )
+	ABORT("Malloc fails for colptr[] in zCompRow_to_CompCol_dist().");
+    if ( !(marker = intCalloc_dist(n)) )
+	ABORT("Calloc fails for marker[] in zCompRow_to_CompCol_dist().");
+    
+    /* Get counts of each column of A, and set up column pointers */
+    for (i = 0; i < m; ++i)
+	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
+    (*colptr)[0] = 0;
+    for (j = 0; j < n; ++j) {
+	(*colptr)[j+1] = (*colptr)[j] + marker[j];
+	marker[j] = (*colptr)[j];	/* next free slot in column j */
+    }
+
+    /* Transfer the matrix into the compressed column storage. */
+    for (i = 0; i < m; ++i) {
+	for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+	    col = colind[j];
+	    relpos = marker[col];
+	    (*rowind)[relpos] = i;
+	    (*at)[relpos] = a[j];
+	    ++marker[col];
+	}
+    }
+
+    SUPERLU_FREE(marker);
+}
+
+/*! \brief Copy matrix A into matrix B.
+ *
+ * The header fields, nnz, and the contents of nzval, rowind and colptr
+ * are copied.  B->Store and its three arrays are written through as-is,
+ * so the caller must have set them up before the call.
+ */
+void
+zCopy_CompCol_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{
+    NCformat *Astore, *Bstore;
+    /* int_t, matching the fields they are read from, so that large
+       matrices are not truncated when int_t is wider than int. */
+    int_t    ncol, nnz, i;
+
+    B->Stype = A->Stype;
+    B->Dtype = A->Dtype;
+    B->Mtype = A->Mtype;
+    B->nrow  = A->nrow;
+    B->ncol  = ncol = A->ncol;
+    Astore   = (NCformat *) A->Store;
+    Bstore   = (NCformat *) B->Store;
+    Bstore->nnz = nnz = Astore->nnz;
+    for (i = 0; i < nnz; ++i)
+	((doublecomplex *)Bstore->nzval)[i] = ((doublecomplex *)Astore->nzval)[i];
+    for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i];
+    for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i];
+}
+
+
+void zPrint_CompCol_Matrix_dist(SuperMatrix *A)
+{
+    /*
+     * Dumps a column-compressed matrix to stdout: type tags, dimensions,
+     * the (real, imag) values when nzval is non-NULL, then rowind and
+     * colptr.
+     */
+    NCformat      *store;
+    doublecomplex *vals;
+    register int  k;
+
+    printf("\nCompCol matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+
+    store = (NCformat *) A->Store;
+    printf("nrow %lld, ncol %lld, nnz %lld\n", (long long) A->nrow,
+	    (long long) A->ncol, (long long) store->nnz);
+
+    vals = (doublecomplex *) store->nzval;
+    if ( vals != NULL ) {
+        printf("nzval:\n");
+        k = 0;
+        while (k < store->nnz) {
+            printf("%f\t%f\n", vals[k].r, vals[k].i);
+            ++k;
+        }
+    }
+
+    printf("\nrowind:\n");
+    k = 0;
+    while (k < store->nnz) {
+        printf("%lld  ", (long long) store->rowind[k]);
+        ++k;
+    }
+
+    printf("\ncolptr:\n");
+    k = 0;
+    while (k <= A->ncol) {
+        printf("%lld  ", (long long) store->colptr[k]);
+        ++k;
+    }
+
+    printf("\nend CompCol matrix.\n");
+}
+
+void zPrint_Dense_Matrix_dist(SuperMatrix *A)
+{
+    /*
+     * Dumps a dense matrix to stdout: type tags, dimensions and leading
+     * dimension, followed by the first A->nrow entries of nzval as
+     * (real, imag) pairs.
+     */
+    DNformat      *store;
+    doublecomplex *vals;
+    register int  k;
+
+    printf("\nDense matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+
+    store = (DNformat *) A->Store;
+    vals = (doublecomplex *) store->nzval;
+    printf("nrow %lld, ncol %lld, lda %lld\n", 
+        (long long) A->nrow, (long long) A->ncol, (long long) store->lda);
+
+    printf("\nnzval: ");
+    k = 0;
+    while (k < A->nrow) {
+        printf("%f\t%f\n", vals[k].r, vals[k].i);
+        ++k;
+    }
+    printf("\nend Dense matrix.\n");
+}
+
+int zPrint_CompRowLoc_Matrix_dist(SuperMatrix *A)
+{
+    /*
+     * Dumps a locally stored row-compressed matrix to stdout: type tags,
+     * global dimensions, local sizes, rowptr, colind and, when present,
+     * the values.  Always returns 0.
+     */
+    NRformat_loc  *store;
+    doublecomplex *vals;
+    int_t  local_nnz, local_rows;
+
+    printf("\n==== CompRowLoc matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+
+    store = (NRformat_loc *) A->Store;
+    printf("nrow %ld, ncol %ld\n", 
+            (long int) A->nrow, (long int) A->ncol);
+
+    local_nnz = store->nnz_loc;
+    local_rows = store->m_loc;
+    printf("nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) local_nnz, 
+            (long int) local_rows, (long int) store->fst_row);
+
+    PrintInt10("rowptr", local_rows+1, store->rowptr);
+    PrintInt10("colind", local_nnz, store->colind);
+
+    vals = (doublecomplex *) store->nzval;
+    if ( vals != NULL ) {
+        PrintDoublecomplex("nzval", local_nnz, vals);
+    }
+
+    printf("==== end CompRowLoc matrix\n");
+    return 0;
+}
+
+int file_zPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A)
+{
+    /*
+     * Writes a locally stored row-compressed matrix to fp: type tags,
+     * global dimensions, local sizes, rowptr, colind and, when present,
+     * the values.  Always returns 0.
+     */
+    NRformat_loc  *store;
+    doublecomplex *vals;
+    int_t  local_nnz, local_rows;
+
+    fprintf(fp, "\n==== CompRowLoc matrix: ");
+    fprintf(fp, "Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+
+    store = (NRformat_loc *) A->Store;
+    fprintf(fp, "nrow %ld, ncol %ld\n", (long int) A->nrow, (long int) A->ncol);
+
+    local_nnz = store->nnz_loc;
+    local_rows = store->m_loc;
+    fprintf(fp, "nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) local_nnz,
+            (long int) local_rows, (long int) store->fst_row);
+
+    file_PrintInt10(fp, "rowptr", local_rows+1, store->rowptr);
+    file_PrintInt10(fp, "colind", local_nnz, store->colind);
+
+    vals = (doublecomplex *) store->nzval;
+    if ( vals != NULL ) {
+        file_PrintDoublecomplex(fp, "nzval", local_nnz, vals);
+    }
+
+    fprintf(fp, "==== end CompRowLoc matrix\n");
+    return 0;
+}
+
+void
+zCreate_Dense_Matrix_dist(SuperMatrix *X, int_t m, int_t n, doublecomplex *x,
+			  int_t ldx, Stype_t stype, Dtype_t dtype,
+			  Mtype_t mtype)
+{
+    /*
+     * Fills in the header of X and attaches a freshly allocated DNformat
+     * descriptor that records the leading dimension ldx and the value
+     * array x (stored by pointer, not copied).  Aborts if the descriptor
+     * cannot be allocated.
+     */
+    DNformat *store;
+
+    /* Matrix header. */
+    X->nrow = m;
+    X->ncol = n;
+    X->Stype = stype;
+    X->Dtype = dtype;
+    X->Mtype = mtype;
+
+    /* Storage descriptor. */
+    X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
+    if ( X->Store == NULL ) ABORT("SUPERLU_MALLOC fails for X->Store");
+
+    store = (DNformat *) X->Store;
+    store->nzval = (doublecomplex *) x;
+    store->lda = ldx;
+}
+
+void
+zCopy_Dense_Matrix_dist(int_t M, int_t N, doublecomplex *X, int_t ldx,
+			doublecomplex *Y, int_t ldy)
+{
+/*! \brief
+ *
+ * <pre>
+ *  Purpose
+ *  =======
+ *
+ *  Copies the M-by-N matrix X (leading dimension ldx) into the matrix Y
+ *  (leading dimension ldy), one column at a time.
+ * </pre>
+ */
+    int    row, col;
+
+    for (col = 0; col < N; ++col) {
+        for (row = 0; row < M; ++row) {
+            Y[row + col*ldy] = X[row + col*ldx];
+        }
+    }
+}
+
+void
+zCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz, 
+			      doublecomplex *nzval, int_t *nzval_colptr,
+			      int_t *rowind, int_t *rowind_colptr,
+			      int_t *col_to_sup, int_t *sup_to_col,
+			      Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    /*
+     * Fills in the header of L and attaches a freshly allocated SCformat
+     * descriptor.  All arrays are stored by pointer, not copied; nsuper is
+     * read from col_to_sup[n].  Aborts if the descriptor cannot be
+     * allocated.
+     */
+    SCformat *store;
+
+    /* Matrix header. */
+    L->nrow = m;
+    L->ncol = n;
+    L->Stype = stype;
+    L->Dtype = dtype;
+    L->Mtype = mtype;
+
+    /* Storage descriptor. */
+    L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) );
+    if ( L->Store == NULL ) ABORT("SUPERLU_MALLOC fails for L->Store");
+
+    store = L->Store;
+    store->nnz = nnz;
+    store->nsuper = col_to_sup[n];
+    store->nzval = nzval;
+    store->nzval_colptr = nzval_colptr;
+    store->rowind = rowind;
+    store->rowind_colptr = rowind_colptr;
+    store->col_to_sup = col_to_sup;
+    store->sup_to_col = sup_to_col;
+}
+
+void
+zGenXtrue_dist(int_t n, int_t nrhs, doublecomplex *x, int_t ldx)
+{
+    /*
+     * Fills the n-by-nrhs array x (leading dimension ldx) with a fixed
+     * pattern: real part 2.0 on even rows and 1.0 on odd rows, imaginary
+     * part 0.0 everywhere.
+     */
+    int  row, col;
+
+    for (col = 0; col < nrhs; ++col) {
+	for (row = 0; row < n; ++row) {
+	    doublecomplex *entry = &x[row + col*ldx];
+
+	    entry->r = (row % 2) ? 1.0 : 2.0;
+	    entry->i = 0.0;
+	}
+    }
+}
+
+/*! \brief Form the right-hand side rhs := op(A) * x for the given x,
+ *  by calling sp_zgemm_dist() with alpha = 1 and beta = 0.
+ */
+void
+zFillRHS_dist(char *trans, int_t nrhs, doublecomplex *x, int_t ldx,
+	      SuperMatrix *A, doublecomplex *rhs, int_t ldb)
+{
+    doublecomplex alpha = {1.0, 0.0};	/* scale on op(A)*x */
+    doublecomplex beta = {0.0, 0.0};	/* previous rhs contents are dropped */
+
+    sp_zgemm_dist(trans, nrhs, alpha, A, x, ldx, beta, rhs, ldb);
+}
+
+/*! \brief Sets the first alen entries of the doublecomplex array a to dval.
+ */
+void 
+zfill_dist(doublecomplex *a, int_t alen, doublecomplex dval)
+{
+    register int_t pos = 0;
+
+    while (pos < alen) {
+	a[pos] = dval;
+	++pos;
+    }
+}
+
+
+
+/*! \brief Print, for each right-hand side, the inf-norm of x - xtrue
+ *  divided by the inf-norm of x.
+ */
+void zinf_norm_error_dist(int_t n, int_t nrhs, doublecomplex *x, int_t ldx,
+			  doublecomplex *xtrue, int_t ldxtrue,
+                          gridinfo_t *grid)
+{
+    doublecomplex *xcol, *tcol;
+    doublecomplex diff;
+    double max_err, max_x, mag;
+    int row, rhs;
+
+    for (rhs = 0; rhs < nrhs; rhs++) {
+      xcol = &x[rhs*ldx];
+      tcol = &xtrue[rhs*ldxtrue];
+      max_err = 0.0;
+      max_x = 0.0;
+      for (row = 0; row < n; row++) {
+        z_sub(&diff, &xcol[row], &tcol[row]);
+
+        /* Running maxima, written as "keep the old value only when it is
+           strictly larger" so that the comparison matches SUPERLU_MAX. */
+        mag = slud_z_abs(&diff);
+        if ( !(max_err > mag) ) max_err = mag;
+
+        mag = slud_z_abs(&xcol[row]);
+        if ( !(max_x > mag) ) max_x = mag;
+      }
+      max_err = max_err / max_x;
+      printf("\tRHS %2d: ||X-Xtrue||/||X|| = %e\n", rhs, max_err);
+    }
+}
+
+void PrintDoublecomplex(char *name, int_t len, doublecomplex *x)
+{
+    /* Prints a header with the array name, then one line per entry:
+       index, real part, imaginary part. */
+    register int_t pos = 0;
+
+    printf("%10s:\tReal\tImag\n", name);
+    while (pos < len) {
+	printf("\t" IFMT "\t%.4f\t%.4f\n", pos, x[pos].r, x[pos].i);
+	++pos;
+    }
+}
+
+int file_PrintDoublecomplex(FILE *fp, char *name, int_t len, doublecomplex *x)
+{
+    /* Writes a header with the array name to fp, then one line per entry:
+       index, real part, imaginary part.  Always returns 0. */
+    register int_t pos = 0;
+
+    fprintf(fp, "%10s:\tReal\tImag\n", name);
+    while (pos < len) {
+	fprintf(fp, "\t" IFMT "\t%.4f\t%.4f\n", pos, x[pos].r, x[pos].i);
+	++pos;
+    }
+    return 0;
+}
+
+/*! \brief Print the local L blocks of process iam, one block column at a
+ *  time, followed by the ToSendR / fsendx_plist entries of each local block
+ *  column and finally nfrecvx and the fmod array.
+ */
+void zPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid,
+		  Glu_persist_t *Glu_persist, LocalLU_t *Llu)
+{
+    register int blk, extra, gb, col, lb, nsupc, nsupr, len, nb, ncb;
+    register int_t cursor, mycol, row_off;
+    int_t *xsup = Glu_persist->xsup;	/* referenced by SuperSize() */
+    int_t *index;
+    doublecomplex *nzval;
+
+    printf("\n[%d] L BLOCKS IN COLUMN-MAJOR ORDER -->\n", iam);
+
+    /* Number of block columns held locally: the first "extra" process
+       columns each hold one more than the rest. */
+    mycol = MYCOL( iam, grid );
+    ncb = nsupers / grid->npcol;
+    extra = nsupers % grid->npcol;
+    if ( mycol < extra ) ++ncb;
+
+    for (lb = 0; lb < ncb; ++lb) {
+	index = Llu->Lrowind_bc_ptr[lb];
+	if ( index ) { /* Not an empty column */
+	    nzval = Llu->Lnzval_bc_ptr[lb];
+	    nb = index[0];
+	    nsupr = index[1];
+	    gb = lb * grid->npcol + mycol;
+	    nsupc = SuperSize( gb );
+	    printf("[%d] block column %d (local # %d), nsupc %d, # row blocks %d\n",
+		   iam, gb, lb, nsupc, nb);
+
+	    /* cursor walks the index array, row_off the value rows. */
+	    cursor = BC_HEADER;
+	    row_off = 0;
+	    blk = 0;
+	    while (blk < nb) {
+		len = index[cursor+1];
+		printf("[%d] row-block %d: block # " IFMT "\tlength %d\n", 
+		       iam, blk, index[cursor], len);
+		PrintInt10("lsub", len, &index[cursor+LB_DESCRIPTOR]);
+		for (col = 0; col < nsupc; ++col) {
+		    PrintDoublecomplex("nzval", len, &nzval[row_off + col*nsupr]);
+		}
+		cursor += LB_DESCRIPTOR + len;
+		row_off += len;
+		++blk;
+	    }
+	}
+	printf("(%d)", iam);
+ 	PrintInt32("ToSendR[]", grid->npcol, Llu->ToSendR[lb]);
+	PrintInt10("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]);
+    }
+
+    printf("nfrecvx " IFMT "\n", Llu->nfrecvx);
+    cursor = CEILING( nsupers, grid->nprow );
+    PrintInt10("fmod", cursor, Llu->fmod);
+    
+} /* ZPRINTLBLOCKS */
+
+
+/*! \brief Print the blocks in the factored matrix U.
+ *  Dumps to stdout the block rows of U reachable through Llu on process iam. */
+void zPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid, 
+		  Glu_persist_t *Glu_persist, LocalLU_t *Llu)
+{
+    register int c, extra, jb, k, lb, len, nb, nrb, nsupc;
+    register int_t myrow, r;
+    int_t *xsup = Glu_persist->xsup; /* presumably read by the SuperSize() macro -- confirm */
+    int_t *index;
+    doublecomplex *nzval;
+
+    printf("\n[%d] U BLOCKS IN ROW-MAJOR ORDER -->\n", iam);
+    nrb = nsupers / grid->nprow; /* block rows visited here: the even share ... */
+    extra = nsupers % grid->nprow;
+    myrow = MYROW( iam, grid );
+    if ( myrow < extra ) ++nrb; /* ... plus one when myrow is below the remainder */
+    for (lb = 0; lb < nrb; ++lb) {
+	index = Llu->Ufstnz_br_ptr[lb];
+	if ( index ) { /* Not an empty row */
+	    nzval = Llu->Unzval_br_ptr[lb];
+	    nb = index[0]; /* reported below as the number of column blocks */
+	    printf("[%d] block row " IFMT " (local # %d), # column blocks %d\n",
+		   iam, lb*grid->nprow+myrow, lb, nb);
+	    r  = 0; /* running offset into nzval for this block row */
+	    for (c = 0, k = BR_HEADER; c < nb; ++c) { /* k walks index[] block by block */
+		jb = index[k];
+		len = index[k+1];
+		printf("[%d] col-block %d: block # %d\tlength " IFMT "\n", 
+		       iam, c, jb, index[k+1]);
+		nsupc = SuperSize( jb );
+		PrintInt10("fstnz", nsupc, &index[k+UB_DESCRIPTOR]);
+		PrintDoublecomplex("nzval", len, &nzval[r]);
+		k += UB_DESCRIPTOR + nsupc; /* skip this block's descriptor and its nsupc entries */
+		r += len;
+	    }
+
+	    printf("[%d] ToSendD[] %d\n", iam, Llu->ToSendD[lb]); /* only for non-empty rows */
+	}
+    }
+} /* ZPRINTUBLOCKS */
+
+int
+zprint_gsmv_comm(FILE *fp, int_t m_loc, pzgsmv_comm_t *gsmv_comm,
+                 gridinfo_t *grid)
+{
+  /* Dump every field of gsmv_comm to fp; nprocs sizes the per-process arrays. */
+  int_t nprocs = grid->nprow*grid->npcol;
+  int_t n_ind = gsmv_comm->TotalIndSend, n_val = gsmv_comm->TotalValSend;
+  fprintf(fp, "TotalIndSend " IFMT "\tTotalValSend " IFMT "\n", n_ind, n_val);
+  file_PrintInt10(fp, "extern_start", m_loc, gsmv_comm->extern_start);
+  file_PrintInt10(fp, "ind_tosend", n_ind, gsmv_comm->ind_tosend);
+  file_PrintInt10(fp, "ind_torecv", n_val, gsmv_comm->ind_torecv);
+  file_PrintInt10(fp, "ptr_ind_tosend", nprocs+1, gsmv_comm->ptr_ind_tosend);
+  file_PrintInt10(fp, "ptr_ind_torecv", nprocs+1, gsmv_comm->ptr_ind_torecv);
+  file_PrintInt32(fp, "SendCounts", nprocs, gsmv_comm->SendCounts);
+  file_PrintInt32(fp, "RecvCounts", nprocs, gsmv_comm->RecvCounts);
+  return 0;
+}
+
+
+/* cg5.cua
+            b = A*x           y = L\b
+   0        1 + 4.0000i       1.0000 + 4.0000i
+   1        0 + 5.0000i	      1.3529 + 5.4118i
+   2        1 + 4.0000i	      1.0000 + 4.0000i
+   3        2 + 3.0000i	      2.0000 + 3.0000i
+   4        1 + 4.0000i	      3.5882 + 4.3529i
+   5        1 + 4.0000i	      4.1250 + 3.3202i
+   6          + 5.0000i	      4.4640 + 3.8632i
+   7        2 + 3.0000i	      2.0000 + 3.0000i
+   8        2 + 3.0000i	      2.0000 + 3.0000i
+   9        1 + 4.0000i	      1.0000 + 4.0000i
+  10        1 + 4.0000i	      3.5882 + 4.3529i
+  11          + 5.0000i	           0 + 5.0000i
+  12        1 + 4.0000i	      5.1793 + 4.6604i
+  13        2 + 3.0000i	      2.0000 + 3.0000i
+  14        1 + 4.0000i	      1.0000 + 4.0000i
+  15          + 5.0000i	      1.3529 + 5.4118i
+  16        1 + 4.0000i	      4.0045 + 3.8950i
+  17          + 5.0000i	      3.0338 + 4.6248i
+  18        1 + 4.0000i	      5.4495 + 2.2703i
+  19          + 5.0000i	      4.0980 + 3.7290i
+  20          + 5.0000i	      4.2680 + 3.7739i
+  21          + 5.0000i	      5.3514 + 2.9480i
+  22        1 + 4.0000i	      4.4178 + 2.0476i
+  23        1 + 4.0000i	      3.5615 + 2.8322i
+  24          + 5.0000i	      4.7526 + 2.2605i
+*/
diff --git a/cmake/XSDKDefaults.cmake b/cmake/XSDKDefaults.cmake
new file mode 100644
index 0000000..df2ddef
--- /dev/null
+++ b/cmake/XSDKDefaults.cmake
@@ -0,0 +1,182 @@
+##################################################################################
+#
+#                    Set defaults for XSDK CMake projects
+#
+##################################################################################
+
+#
+# This module implements standard behavior for XSDK CMake projects.  The main
+# thing it does in XSDK mode (i.e. USE_XSDK_DEFAULTS=TRUE) is to print out
+# when the env vars CC, CXX, FC and compiler flags CFLAGS, CXXFLAGS, and
+# FFLAGS/FCFLAGS are used to select the compilers and compiler flags (raw
+# CMake does this silently) and to set BUILD_SHARED_LIBS=TRUE and
+# CMAKE_BUILD_TYPE=DEBUG by default.  It does not implement *all* of the
+# standard XSDK configuration parameters.  The parent CMake project must do
+# that.
+#
+# Note that when USE_XSDK_DEFAULTS=TRUE, then the Fortran flags will be read
+# from either of the env vars FFLAGS or FCFLAGS.  If both are set, but are the
+# same, then FFLAGS is used (which is the same as FCFLAGS).  However, if both
+# are set but are not equal, then a FATAL_ERROR is raised and CMake configure
+# processing is stopped.
+#
+# To be used in a parent project, this module must be included after
+#
+#   PROJECT(${PROJECT_NAME}  NONE)
+#
+# is called but before the compilers are defined and processed using:
+#
+#   ENABLE_LANGUAGE(<LANG>)
+#
+# For example, one would do:
+#
+#   PROJECT(${PROJECT_NAME}  NONE)
+#   ...
+#   SET(USE_XSDK_DEFAULTS_DEFAULT TRUE) # Set to false if desired
+#   INCLUDE("${CMAKE_CURRENT_SOURCE_DIR}/stdk/XSDKDefaults.cmake")
+#   ...
+#   ENABLE_LANGUAGE(C)
+#   ENABLE_LANGUAGE(CXX)
+#   ENABLE_LANGUAGE(Fortran)
+#
+# The variable `USE_XSDK_DEFAULTS_DEFAULT` is used as the default for the
+# cache var `USE_XSDK_DEFAULTS`.  That way, a project can decide if it wants
+# XSDK defaults turned on or off by default and users can independently decide
+# if they want the CMake project to use standard XSDK behavior or raw CMake
+# behavior.
+#
+# By default, the XSDKDefaults.cmake module assumes that the project will need
+# C, C++, and Fortran.  If any language is not needed, then set
+# XSDK_ENABLE_C=OFF, XSDK_ENABLE_CXX=OFF, or XSDK_ENABLE_Fortran=OFF *before*
+# including this module.  Note, these variables are *not* cache vars because a
+# project either does or does not have C, C++ or Fortran source files, the
+# user has nothing to do with this so there is no need for cache vars.  The
+# parent CMake project just needs to tell XSDKDefaults.cmake what languages it
+# needs or does not need.
+#
+# For example, if the parent CMake project only needs C, then it would do:
+#
+#   PROJECT(${PROJECT_NAME}  NONE)
+#   ...
+#   SET(USE_XSDK_DEFAULTS_DEFAULT TRUE)
+#   SET(XSDK_ENABLE_CXX OFF)
+#   SET(XSDK_ENABLE_Fortran OFF)
+#   INCLUDE("${CMAKE_CURRENT_SOURCE_DIR}/stdk/XSDKDefaults.cmake")
+#   ...
+#   ENABLE_LANGUAGE(C)
+#
+# This module code will announce when it sets any variables.
+#
+
+#
+# Helper functions
+#
+
+if(NOT COMMAND PRINT_VAR)
+  function(PRINT_VAR VAR_NAME)  # prints: <name> = '<value>'
+    message("${VAR_NAME} = '${${VAR_NAME}}'")
+  endfunction()
+endif()
+
+if(NOT COMMAND SET_DEFAULT)
+  macro(SET_DEFAULT VAR)  # assign the remaining args only when ${VAR} is empty
+    if("${${VAR}}" STREQUAL "")
+      set(${VAR} ${ARGN})
+    endif()
+  endmacro()
+endif()
+
+#
+# XSDKDefaults.cmake control variables
+#
+
+# USE_XSDK_DEFAULTS: cache option; its initial value is the project's
+# USE_XSDK_DEFAULTS_DEFAULT, which itself falls back to FALSE when empty.
+SET_DEFAULT(USE_XSDK_DEFAULTS_DEFAULT  FALSE)
+set(USE_XSDK_DEFAULTS  ${USE_XSDK_DEFAULTS_DEFAULT}  CACHE  BOOL
+  "Use XSDK defaults and behavior.")
+PRINT_VAR(USE_XSDK_DEFAULTS)
+
+# Every language is treated as needed unless the project disabled it first.
+SET_DEFAULT(XSDK_ENABLE_C  TRUE)
+SET_DEFAULT(XSDK_ENABLE_CXX  TRUE)
+SET_DEFAULT(XSDK_ENABLE_Fortran  TRUE)
+
+# Seed CMAKE_<LANG>_COMPILER and CMAKE_<LANG>_FLAGS (as cache entries) from env vars
+MACRO(XSDK_HANDLE_LANG_DEFAULTS  CMAKE_LANG_NAME  ENV_LANG_NAME
+  ENV_LANG_FLAGS_NAMES
+  )
+  # ENV_LANG_FLAGS_NAMES is a list of env var names, tried in the order given.
+  # Compiler: use env var ${ENV_LANG_NAME} only if it is set and no compiler is chosen yet
+  IF (NOT "$ENV{${ENV_LANG_NAME}}" STREQUAL "" AND
+    "${CMAKE_${CMAKE_LANG_NAME}_COMPILER}" STREQUAL ""
+    )
+    MESSAGE("-- " "XSDK: Setting CMAKE_${CMAKE_LANG_NAME}_COMPILER from env var"
+      " ${ENV_LANG_NAME}='$ENV{${ENV_LANG_NAME}}'!")
+    SET(CMAKE_${CMAKE_LANG_NAME}_COMPILER "$ENV{${ENV_LANG_NAME}}" CACHE FILEPATH
+      "XSDK: Set by default from env var ${ENV_LANG_NAME}")
+  ENDIF()
+
+  # Flags: each ${ENV_LANG_FLAGS_NAME} is used only while CMAKE_<LANG>_FLAGS is still empty
+  FOREACH(ENV_LANG_FLAGS_NAME  ${ENV_LANG_FLAGS_NAMES})
+    IF (NOT "$ENV{${ENV_LANG_FLAGS_NAME}}" STREQUAL "" AND
+      "${CMAKE_${CMAKE_LANG_NAME}_FLAGS}" STREQUAL ""
+      )
+      MESSAGE("-- " "XSDK: Setting CMAKE_${CMAKE_LANG_NAME}_FLAGS from env var"
+        " ${ENV_LANG_FLAGS_NAME}='$ENV{${ENV_LANG_FLAGS_NAME}}'!")
+      SET(CMAKE_${CMAKE_LANG_NAME}_FLAGS "$ENV{${ENV_LANG_FLAGS_NAME}} " CACHE  STRING
+        "XSDK: Set by default from env var ${ENV_LANG_FLAGS_NAME}")
+      # NOTE: CMake adds the space after $ENV{${ENV_LANG_FLAGS_NAME}} so we
+      # duplicate that here (the trailing blank inside the quotes is deliberate)!
+    ENDIF()
+  ENDFOREACH()
+
+ENDMACRO()
+
+
+#
+# Set XSDK Defaults
+#
+
+# Set default compilers and flags (everything below runs only in XSDK mode)
+if(USE_XSDK_DEFAULTS)
+
+  # Per-language env vars: C, C++, and Fortran
+
+  if(XSDK_ENABLE_C)
+    XSDK_HANDLE_LANG_DEFAULTS(C  CC  CFLAGS)
+  endif()
+
+  if(XSDK_ENABLE_CXX)
+    XSDK_HANDLE_LANG_DEFAULTS(CXX  CXX  CXXFLAGS)
+  endif()
+
+  if(XSDK_ENABLE_Fortran)
+    set(ENV_FFLAGS "$ENV{FFLAGS}")
+    set(ENV_FCFLAGS "$ENV{FCFLAGS}")
+    # FFLAGS and FCFLAGS may both be set only when they agree.  The check is
+    # skipped once CMAKE_Fortran_FLAGS already has a value, because the env
+    # vars are not consulted in that case.
+    if(("${CMAKE_Fortran_FLAGS}" STREQUAL "")
+        AND (NOT "${ENV_FFLAGS}" STREQUAL "")
+        AND (NOT "${ENV_FCFLAGS}" STREQUAL "")
+        AND (NOT "${ENV_FFLAGS}" STREQUAL "${ENV_FCFLAGS}"))
+      message(FATAL_ERROR "Error, env vars FFLAGS='${ENV_FFLAGS}' and"
+        " FCFLAGS='${ENV_FCFLAGS}' are both set in the env but are not equal!")
+    endif()
+    XSDK_HANDLE_LANG_DEFAULTS(Fortran  FC  "FFLAGS;FCFLAGS")
+  endif()
+
+  # Remaining XSDK defaults: library type and build type
+
+  if("${BUILD_SHARED_LIBS}" STREQUAL "")
+    message("-- " "XSDK: Setting default BUILD_SHARED_LIBS=TRUE")
+    set(BUILD_SHARED_LIBS  TRUE  CACHE  BOOL  "Set by default in XSDK mode")
+  endif()
+
+  if("${CMAKE_BUILD_TYPE}" STREQUAL "")
+    message("-- " "XSDK: Setting default CMAKE_BUILD_TYPE=DEBUG")
+    set(CMAKE_BUILD_TYPE  DEBUG  CACHE  STRING  "Set by default in XSDK mode")
+  endif()
+
+endif()
diff --git a/make.inc.in b/make.inc.in
new file mode 100644
index 0000000..15383ac
--- /dev/null
+++ b/make.inc.in
@@ -0,0 +1,39 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   March 1, 2016	version 5.0.0
+#
+#  Modified:	    
+#		    
+#
+############################################################################
+#
+#  The name of the libraries to be created/linked to
+#
+SuperLUroot	= ${CMAKE_SOURCE_DIR}/build
+DSUPERLULIB   	= $(SuperLUroot)/SRC/${PROJECT_NAME_LIB_EXPORT}
+
+LIBS		= $(DSUPERLULIB) ${BLAS_LIB_EXPORT} ${PARMETIS_LIB_EXPORT}
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         = @CMAKE_AR@
+ARCHFLAGS    = cr
+RANLIB       = @CMAKE_RANLIB@
+
+CC           = @CMAKE_C_COMPILER@
+CFLAGS 	     = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@
+# CFLAGS       += -D${DirDefs}
+# CFLAGS       += @COMPILE_DEFINITIONS@ 
+NOOPTS       = -O0
+FORTRAN	     = @CMAKE_Fortran_COMPILER@
+#  Loader and its options; no blank may follow "-rpath," or the path is split off.
+LOADER       = $(CC)
+LOADOPTS     = -Wl,-rpath,@CMAKE_INSTALL_RPATH@ @CMAKE_EXE_LINKER_FLAGS@
diff --git a/run_cmake_build.csh b/run_cmake_build.csh
new file mode 100644
index 0000000..42b6482
--- /dev/null
+++ b/run_cmake_build.csh
@@ -0,0 +1,56 @@
+#!/bin/csh
+# Run cmake for SuperLU_DIST on known hosts; "cmake .." implies a build subdirectory.
+if ( ! $?NERSC_HOST ) then
+    echo "NERSC_HOST undefined"
+else
+  if ( "$NERSC_HOST" == "edison" ) then
+    setenv PARMETIS_ROOT ~/Edison/lib/parmetis-4.0.3 
+#    setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/shared-build
+    setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/static-build/Linux-x86_64
+    cmake .. \
+    -DUSE_XSDK_DEFAULTS=FALSE\
+    -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
+    -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
+    -DCMAKE_C_FLAGS="-std=c99 -fPIC" \
+#    -DCMAKE_EXE_LINKER_FLAGS="-shared" \
+    -DCMAKE_Fortran_COMPILER=ftn \
+    -Denable_blaslib=OFF \
+#    -DTPL_BLAS_LIBRARIES=" " \
+    -DBUILD_SHARED_LIBS=OFF \
+    -DCMAKE_INSTALL_PREFIX=..
+  endif
+  # NOTE(review): disabled options above end in '\'; presumably csh continues past them -- verify before editing.
+  if ( "$NERSC_HOST" == "cori" ) then
+    setenv PARMETIS_ROOT ~/Cori/lib/parmetis-4.0.3
+    setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/shared-build
+#    setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/static-build/Linux-x86_64
+    cmake .. \
+    -DUSE_XSDK_DEFAULTS=TRUE\
+    -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
+    -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.so;${PARMETIS_BUILD_DIR}/libmetis/libmetis.so" \
+    -Denable_blaslib=OFF \
+    -DCMAKE_Fortran_COMPILER=ftn \
+    -DCMAKE_C_FLAGS="-std=c99 -fPIC" \
+    -DCMAKE_EXE_LINKER_FLAGS="-shared" \
+    -DCMAKE_INSTALL_PREFIX=..
+  endif
+endif
+# Independently of NERSC_HOST, host "ssg1" is recognised by its short host name.
+set THISHOST=`hostname -s`
+#echo $THISHOST
+if ( "$THISHOST" == "ssg1" ) then
+  setenv PARMETIS_ROOT ~/lib/static/parmetis-4.0.3 
+  setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/build/Linux-x86_64
+    echo $PARMETIS_ROOT
+  cmake .. \
+    -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
+    -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
+    -DCMAKE_C_FLAGS="-std=c99 -g" \
+    -Denable_blaslib=OFF \
+    -DBUILD_SHARED_LIBS=OFF \
+    -DCMAKE_C_COMPILER=mpicc \
+    -DCMAKE_INSTALL_PREFIX=..
+endif
+
+# make VERBOSE=1
+# make test

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/superlu-dist.git