[superlu-dist] 01/01: New upstream version 5.2.2+dfsg1

Drew Parsons dparsons at moszumanska.debian.org
Wed Nov 8 09:06:05 UTC 2017


This is an automated email from the git hooks/post-receive script.

dparsons pushed a commit to annotated tag upstream/5.2.2+dfsg1
in repository superlu-dist.

commit d99f988c13b18c2e9810e13706c9c6776ded6e9c
Author: Drew Parsons <dparsons at debian.org>
Date:   Wed Nov 8 17:03:38 2017 +0800

    New upstream version 5.2.2+dfsg1
---
 .gitignore                                |    5 +-
 CBLAS/Makefile                            |   10 +-
 CMakeLists.txt                            |   93 +-
 DOC/ug.pdf                                |  Bin 687318 -> 0 bytes
 DoxyConfig                                |    4 +-
 EXAMPLE/CMakeLists.txt                    |   22 +-
 EXAMPLE/README                            |    2 +
 EXAMPLE/dcreate_matrix.c                  |    5 +
 EXAMPLE/pddrive.c                         |   16 +-
 EXAMPLE/pddrive1.c                        |   13 +-
 EXAMPLE/pddrive2.c                        |   13 +-
 EXAMPLE/pddrive3.c                        |   13 +-
 EXAMPLE/pddrive4.c                        |    3 +-
 EXAMPLE/pzdrive.c                         |   16 +-
 EXAMPLE/pzdrive1.c                        |   13 +-
 EXAMPLE/pzdrive2.c                        |   13 +-
 EXAMPLE/pzdrive3.c                        |   13 +-
 EXAMPLE/pzdrive4.c                        |    3 +-
 EXAMPLE/zcreate_matrix.c                  |    5 +
 INSTALL/superlu_timer.c                   |   54 -
 MAKE_INC/make.cuda_gpu                    |    4 +-
 MAKE_INC/make.mac-x                       |   43 +
 MAKE_INC/make.mpich                       |    2 +-
 MAKE_INC/{make.mpich => make.ssg1}        |   20 +-
 MAKE_INC/make.xc30                        |    2 +-
 MAKE_INC/make.xt5                         |    3 +-
 Makefile                                  |    3 +-
 README                                    |  251 ---
 README.md                                 |  274 +++
 SRC/CMakeLists.txt                        |   17 +-
 SRC/Makefile                              |   22 +-
 SRC/colamd.c                              | 3424 +++++++++++++++++++++++++++++
 SRC/colamd.h                              |  259 +++
 SRC/dSchCompUdt-2Ddynamic.c               |  762 ++++---
 SRC/dbinary_io.c                          |   40 +
 SRC/dlook_ahead_update.c                  |  115 +-
 SRC/dmemory_dist.c                        |    7 +-
 SRC/dreadMM.c                             |   26 +-
 SRC/dscatter.c                            |   87 +-
 SRC/get_perm_c.c                          |   41 +
 SRC/mc64ad_dist.c                         | 2654 ----------------------
 SRC/memory.c                              |   31 +-
 SRC/pdgssvx.c                             |   24 +-
 SRC/pdgstrf.c                             |  442 ++--
 SRC/pdgstrf2.c                            |  165 +-
 SRC/psymbfact.h                           |    5 +-
 SRC/pzgssvx.c                             |   24 +-
 SRC/pzgstrf.c                             |  442 ++--
 SRC/pzgstrf2.c                            |  165 +-
 SRC/sp_colorder.c                         |    3 +-
 SRC/sp_ienv.c                             |    5 +-
 SRC/static_schedule.c                     |   10 +
 SRC/superlu_ddefs.h                       |    2 +-
 SRC/superlu_defs.h                        |   15 +-
 SRC/superlu_dist_config.h                 |    4 +
 SRC/superlu_dist_config.h.in              |    9 +
 SRC/superlu_dist_version.c                |   30 +
 SRC/superlu_enum_consts.h                 |    3 +
 SRC/superlu_zdefs.h                       |    2 +-
 SRC/util.c                                |   86 +-
 SRC/zSchCompUdt-2Ddynamic.c               |  762 ++++---
 SRC/zbinary_io.c                          |   40 +
 SRC/zlook_ahead_update.c                  |  115 +-
 SRC/zmemory_dist.c                        |    7 +-
 SRC/zreadMM.c                             |   32 +-
 SRC/zscatter.c                            |   87 +-
 TEST/#pztest.c#                           |  517 +++++
 TEST/CMakeLists.txt                       |   79 +
 TEST/Makefile                             |   56 +
 TEST/README                               |   12 +
 {EXAMPLE => TEST}/dcreate_matrix.c        |    5 +
 TEST/pdcompute_resid.c                    |  155 ++
 TEST/pdtest.c                             |  519 +++++
 TEST/pdtest.sh                            |   64 +
 TEST/pzcompute_resid.c                    |  154 ++
 TEST/pztest.c                             |  518 +++++
 TEST/pztest.sh                            |   64 +
 TEST/runtest.cmake                        |   13 +
 {EXAMPLE => TEST}/zcreate_matrix.c        |    5 +
 compile.out                               |   62 +
 MAKE_INC/make.mpich => make.inc           |   24 +-
 make.inc.in                               |    7 +-
 run_cmake_build.csh                       |    2 +-
 run_cmake_build.csh => run_cmake_build.sh |   55 +-
 superlu_dist.pc.in                        |   12 +
 85 files changed, 8861 insertions(+), 4344 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2eb65d5..adcaf5c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,12 @@
 *~
 
-# You have to ignore this genrated file or git will complain that it is an
+# You have to ignore this generated file or git will complain that it is an
 # unknown file!
 /make.inc
 
 # If the instructions are telling people to create this build dir under the
 # source tree, you had better put in an ignore for this.
 /build/
+
+# Ignore Testing/ folder
+Testing/
diff --git a/CBLAS/Makefile b/CBLAS/Makefile
index 5812c03..d0eca9a 100644
--- a/CBLAS/Makefile
+++ b/CBLAS/Makefile
@@ -66,28 +66,28 @@ ALLBLAS = input_error_dist.o
 
 all: single double complex complex16
 
-single: $(SBLAS1) $(SBLAS2) $(SBLAS3)
+single: $(SBLAS1) $(SBLAS2) $(SBLAS3) $(ALLBLAS)
 	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(SBLAS1) $(ALLBLAS) \
 	$(SBLAS2) $(SBLAS3)
 	$(RANLIB) $(BLASLIB)
 
-double: $(DBLAS1) $(DBLAS2) $(DBLAS3)
+double: $(DBLAS1) $(DBLAS2) $(DBLAS3) $(ALLBLAS)
 	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(DBLAS1) $(ALLBLAS) \
 	$(DBLAS2) $(DBLAS3)
 	$(RANLIB) $(BLASLIB)
 
-complex: $(CBLAS1) $(CBLAS2) $(CBLAS3)
+complex: $(CBLAS1) $(CBLAS2) $(CBLAS3) $(ALLBLAS)
 	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(CBLAS1) $(ALLBLAS) \
 	$(CBLAS2) $(CBLAS3)
 	$(RANLIB) $(BLASLIB)
 
-complex16: $(ZBLAS1) $(ZBLAS2) $(ZBLAS3)
+complex16: $(ZBLAS1) $(ZBLAS2) $(ZBLAS3) $(ALLBLAS)
 	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(ZBLAS1) $(ALLBLAS) \
 	$(ZBLAS2) $(ZBLAS3)
 	$(RANLIB) $(BLASLIB)
 
 .c.o:
-	$(CC) $(CFLAGS) $(CDEFS) -I$(HEADER) -c $< $(VERBOSE)
+	$(CC) $(CFLAGS) $(CDEFS) -c $< $(VERBOSE)
 
 clean:	
 	rm -f *.o
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d082edc..437306d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,8 +10,8 @@ cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
 # Project version numbers
 project(SuperLU_DIST NONE)
 set(VERSION_MAJOR "5")
-set(VERSION_MINOR "1")
-set(VERSION_BugFix "3")
+set(VERSION_MINOR "2")
+set(VERSION_BugFix "2")
 set(PROJECT_VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_BugFix})
 
 ######################################################################
@@ -20,6 +20,8 @@ set(PROJECT_VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_BugFix})
 MESSAGE("\nProcess XSDK defaults ...")
 # SET(USE_XSDK_DEFAULTS_DEFAULT TRUE) # Set to false if desired
 INCLUDE("cmake/XSDKDefaults.cmake")
+INCLUDE(CTest)
+
 ######################################################################
 
 ######################################################################
@@ -27,20 +29,8 @@ INCLUDE("cmake/XSDKDefaults.cmake")
 # Usual initialization stuff
 #
 ######################################################################
-# setup options
-option(enable_blaslib   "Build the CBLAS library" ${enable_blaslib_DEFAULT})
-option(enable_parmetislib   "Build the ParMETIS library" ON)
-option(enable_doc       "Build doxygen documentation" OFF)
-option(enable_double    "Enable double precision library" ON)
-option(enable_complex16 "Enable complex16 precision library" ON)
-option(enable_examples  "Build examples" ON)
-option(TPL_PARMETIS_LIBRARIES "List of absolute paths to ParMETIS link libraries [].")
-option(TPL_PARMETIS_INCLUDE_DIRS "List of absolute paths to ParMETIS include directories [].")
-
-if (NOT CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX /usr/local)
-endif()
-
+set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)    ## ????
+set(CMAKE_INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib")
 
 #---- For shared library
 
@@ -82,13 +72,28 @@ else()
   set(enable_blaslib_DEFAULT ON)
 endif()
 
+if (NOT CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX /usr/local)
+endif()
+
+
+# setup options
+option(enable_blaslib   "Build the CBLAS library" ${enable_blaslib_DEFAULT})
+option(enable_parmetislib   "Build the ParMETIS library" ON)
+option(enable_doc       "Build doxygen documentation" OFF)
+option(enable_double    "Enable double precision library" ON)
+option(enable_complex16 "Enable complex16 precision library" ON)
+option(enable_tests  "Build tests" ON)
+option(enable_examples  "Build examples" ON)
+option(TPL_PARMETIS_LIBRARIES "List of absolute paths to ParMETIS link libraries [].")
+option(TPL_PARMETIS_INCLUDE_DIRS "List of absolute paths to ParMETIS include directories [].")
+
 
 # setup required compiler defines and options.
 ## get_directory_property( DirDefs COMPILE_DEFINITIONS )
-set(CMAKE_C_FLAGS "-DDEBUGlevel=0 -DPRNTlevel=0 ${CMAKE_C_FLAGS}")
+# set(CMAKE_C_FLAGS "-DDEBUGlevel=0 -DPRNTlevel=0 ${CMAKE_C_FLAGS}")
 if(XSDK_INDEX_SIZE EQUAL 64)
     message("-- Using 64 bit integer for index size")
-    set(CMAKE_C_FLAGS "-D_LONGINT ${CMAKE_C_FLAGS}")
 endif()	
 set(CMAKE_C_FLAGS_RELEASE "-O3" CACHE STRING "")
 
@@ -98,6 +103,23 @@ set(CMAKE_C_FLAGS_RELEASE "-O3" CACHE STRING "")
 #
 ######################################################################
 #
+#--------------------- MPI ---------------------
+find_package(MPI)
+if(MPI_C_FOUND)
+    set(CMAKE_C_FLAGS "${MPI_C_COMPILE_FLAGS} ${CMAKE_C_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_C_LINK_FLAGS}" )
+endif()
+#--------------------- OpenMP ---------------------
+find_package(OpenMP)
+## include(FindOpenMP)  # Strumpack uses this
+if(OPENMP_FOUND)
+  set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}")
+# On edison, OpenMP_EXE_LINKER_FLAGS is empty
+#  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+ message("-- OpenMP_EXE_LINKER_FLAGS='${OpenMP_EXE_LINKER_FLAGS}'")
+ message("-- CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'")
+endif()
 #--------------------- BLAS ---------------------
 if(NOT enable_blaslib)
 #  set(TPL_BLAS_LIBRARIES "" CACHE FILEPATH
@@ -125,27 +147,12 @@ else()
     add_subdirectory(CBLAS)
     set(BLAS_LIB blas)
     if (BUILD_SHARED_LIBS)  # export to be referenced by downstream makefile
-        set(BLAS_LIB_EXPORT ${CMAKE_SOURCE_DIR}/build/CBLAS/libblas.so)
+        set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/CBLAS/libblas.so)
     else()
-        set(BLAS_LIB_EXPORT ${CMAKE_SOURCE_DIR}/build/CBLAS/libblas.a)
+        set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/CBLAS/libblas.a)
     endif()
 endif()
 
-#--------------------- MPI ---------------------
-find_package(MPI)
-if(MPI_C_FOUND)
-    set(CMAKE_C_FLAGS "${MPI_C_COMPILE_FLAGS} ${CMAKE_C_FLAGS}")
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_C_LINK_FLAGS}" )
-endif()
-#--------------------- OpenMP ---------------------
-find_package(OpenMP)
-if(OPENMP_FOUND)
-  set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}")
-# On edison, OpenMP_EXE_LINKER_FLAGS is empty
-# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
-  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
-# message("-- CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'")
-endif()
 #--------------------- ParMETIS ---------------------
 if (enable_parmetislib)   ## want to use parmetis
   if (NOT TPL_PARMETIS_LIBRARIES)
@@ -197,16 +204,16 @@ include_directories(${MPI_C_INCLUDE_PATH})
 
 add_subdirectory(SRC)
 
-if(enable_tests)
-  enable_testing()
-  add_subdirectory(TESTING)
-endif()
-
 if(enable_doc)
   message(FATAL_ERROR "Documentation build requested but not implemented.")
   #implement doxygen
 endif()
 
+if(enable_tests)
+  enable_testing()
+  add_subdirectory(TEST)
+endif()
+
 if(enable_examples)
   enable_testing()
   add_subdirectory(EXAMPLE)
@@ -215,3 +222,9 @@ endif()
 # file(WRITE "make.defs" "# can be exposed to users" ${CMAKE_C_COMPILER})
 # configure_file(${CMAKE_SOURCE_DIR}/make.inc.in ${CMAKE_BINARY_DIR}/make.inc)
 configure_file(${SuperLU_DIST_SOURCE_DIR}/make.inc.in ${SuperLU_DIST_SOURCE_DIR}/make.inc)
+configure_file(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h.in ${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h)
+
+# Add pkg-config support
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/superlu_dist.pc.in ${CMAKE_CURRENT_BINARY_DIR}/superlu_dist.pc @ONLY)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/superlu_dist.pc
+	DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
diff --git a/DOC/ug.pdf b/DOC/ug.pdf
deleted file mode 100644
index f854405..0000000
Binary files a/DOC/ug.pdf and /dev/null differ
diff --git a/DoxyConfig b/DoxyConfig
index 5bbc5a0..9760183 100644
--- a/DoxyConfig
+++ b/DoxyConfig
@@ -31,7 +31,7 @@ PROJECT_NAME           = SuperLU Distributed
 # This could be handy for archiving the generated documentation or 
 # if some version control system is used.
 
-PROJECT_NUMBER         = 5.0.0
+PROJECT_NUMBER         = 5.2.2
 e
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
 # base path where the generated documentation will be put. 
@@ -513,7 +513,7 @@ WARN_LOGFILE           =
 # directories like "/usr/src/myproject". Separate the files or directories 
 # with spaces.
 
-INPUT                  = SRC/ EXAMPLE/ FORTRAN/
+INPUT                  = SRC/ EXAMPLE/ FORTRAN/ TEST/
 
 # This tag can be used to specify the character encoding of the source files 
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is 
diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt
index 5eb7473..2ecb368 100644
--- a/EXAMPLE/CMakeLists.txt
+++ b/EXAMPLE/CMakeLists.txt
@@ -3,51 +3,52 @@ include_directories(${SuperLU_DIST_SOURCE_DIR}/SRC)
 # Libs linked to all of the examples
 set(all_link_libs superlu_dist ${BLAS_LIB} m)
 
-function(add_superlu_dist_test target input nprow npcol)
-    set(TEST_INPUT "${SuperLU_DIST_SOURCE_DIR}/EXAMPLE/${input}")
-    set(TEST_OUTPUT "${SuperLU_DIST_BINARY_DIR}/EXAMPLE/${target}.out")
+function(add_superlu_dist_example target input nprow npcol)
+    set(EXAMPLE_INPUT "${SuperLU_DIST_SOURCE_DIR}/EXAMPLE/${input}")
+    set(EXAMPLE_OUTPUT "${SuperLU_DIST_BINARY_DIR}/EXAMPLE/${target}.out")
 
 ##  get_target_property(TEST_LOC ${target} LOCATION)
-    set(TEST_LOC ${CMAKE_CURRENT_BINARY_DIR})
+    set(EXAMPLE_LOC ${CMAKE_CURRENT_BINARY_DIR})
 
     MATH( EXPR procs "${nprow}*${npcol}" )
 #    message("MPIEXEC_FLAG is ${MPIEXEC_NUMPROC_FLAG}")
 
 # corresponding to mpiexec -n 4 pddrive -r <nprow> -c <npcol> g20.rua
     add_test(${target} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${procs}
-             ${MPIEXEC_PREFLAGS} ${target} ${MPIEXEC_POSTFLAGS} -r "${nprow}" -c "${npcol}" "${TEST_INPUT}")
+             ${MPIEXEC_PREFLAGS} ${target} ${MPIEXEC_POSTFLAGS} -r "${nprow}" -c "${npcol}" "${EXAMPLE_INPUT}")
+
 #     add_test(NAME ${target} COMMAND "${CMAKE_COMMAND}"
 #              -DTEST=${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${procs}
 #             ${MPIEXEC_PREFLAGS} ${target} ${MPIEXEC_POSTFLAGS} -r "${nprow}" -c "${npcol}" "${TEST_INPUT}"
 #	     -DOUTPUT=${target}.out
 #	    -P "${SuperLU_DIST_SOURCE_DIR}/EXAMPLE/runexample.cmake" )
 
-
 # MPI variables:
 # ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} PROCS
 #  	${MPIEXEC_PREFLAGS} EXECUTABLE ${MPIEXEC_POSTFLAGS} ARGS)
 
-endfunction(add_superlu_dist_test)
+endfunction(add_superlu_dist_example)
 
 
 if(enable_double)
   set(DEXM pddrive.c dcreate_matrix.c)
   add_executable(pddrive ${DEXM})
   target_link_libraries(pddrive ${all_link_libs})
-  add_superlu_dist_test(pddrive big.rua 2 2)
 
   set(DEXM1 pddrive1.c dcreate_matrix.c)
   add_executable(pddrive1 ${DEXM1})
   target_link_libraries(pddrive1 ${all_link_libs})
-  add_superlu_dist_test(pddrive1 big.rua 2 2)
+  add_superlu_dist_example(pddrive1 big.rua 2 2)
 
   set(DEXM2 pddrive2.c dcreate_matrix.c dcreate_matrix_perturbed.c)
   add_executable(pddrive2 ${DEXM2})
   target_link_libraries(pddrive2 ${all_link_libs})
+  add_superlu_dist_example(pddrive2 big.rua 2 2)
 
   set(DEXM3 pddrive3.c dcreate_matrix.c)
   add_executable(pddrive3 ${DEXM3})
   target_link_libraries(pddrive3 ${all_link_libs})
+  add_superlu_dist_example(pddrive3 big.rua 2 2)
 
   set(DEXM4 pddrive4.c dcreate_matrix.c)
   add_executable(pddrive4 ${DEXM4})
@@ -84,14 +85,17 @@ if(enable_complex16)
   set(ZEXM1 pzdrive1.c zcreate_matrix.c)
   add_executable(pzdrive1 ${ZEXM1})
   target_link_libraries(pzdrive1 ${all_link_libs})
+  add_superlu_dist_example(pzdrive1 cg20.cua 2 2)
 
   set(ZEXM2 pzdrive2.c zcreate_matrix.c zcreate_matrix_perturbed.c)
   add_executable(pzdrive2 ${ZEXM2})
   target_link_libraries(pzdrive2 ${all_link_libs})
+  add_superlu_dist_example(pzdrive2 cg20.cua 2 2)
 
   set(ZEXM3 pzdrive3.c zcreate_matrix.c)
   add_executable(pzdrive3 ${ZEXM3})
   target_link_libraries(pzdrive3 ${all_link_libs})
+  add_superlu_dist_example(pzdrive3 cg20.cua 2 2)
 
   set(ZEXM4 pzdrive4.c zcreate_matrix.c)
   add_executable(pzdrive4 ${ZEXM4})
diff --git a/EXAMPLE/README b/EXAMPLE/README
index f773812..7146acd 100644
--- a/EXAMPLE/README
+++ b/EXAMPLE/README
@@ -50,3 +50,5 @@ command.
 
 4. To run the complex examples pzdrive4 and pzdrive4_ABglobal, you may type:
    % mpiexec -n 10 pzdrive4 cg20.cua
+
+
diff --git a/EXAMPLE/dcreate_matrix.c b/EXAMPLE/dcreate_matrix.c
index 77292d7..a622463 100644
--- a/EXAMPLE/dcreate_matrix.c
+++ b/EXAMPLE/dcreate_matrix.c
@@ -89,9 +89,14 @@ int dcreate_matrix(SuperMatrix *A, int nrhs, double **rhs,
 #endif
 
     if ( !iam ) {
+        double t = SuperLU_timer_();
+
         /* Read the matrix stored on disk in Harwell-Boeing format. */
         dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
 
+	printf("Time to read and distribute matrix %.2f\n", 
+	        SuperLU_timer_() - t);  fflush(stdout);
+
 	/* Broadcast matrix A to the other PEs. */
 	MPI_Bcast( &m,     1,   mpi_int_t,  0, grid->comm );
 	MPI_Bcast( &n,     1,   mpi_int_t,  0, grid->comm );
diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c
index 3ebca24..750613c 100644
--- a/EXAMPLE/pddrive.c
+++ b/EXAMPLE/pddrive.c
@@ -63,7 +63,7 @@ int main(int argc, char *argv[])
     int      iam, info, ldb, ldx, nrhs;
     char     **cpp, c;
     FILE *fp, *fopen();
-    extern int cpp_defs();
+    int cpp_defs();
 
     nprow = 1;  /* Default process rows.      */
     npcol = 1;  /* Default process columns.   */
@@ -108,8 +108,13 @@ int main(int argc, char *argv[])
     iam = grid.iam;
     if ( iam >= nprow * npcol )	goto out;
     if ( !iam ) {
-	printf("Input matrix file: %s\n", *cpp);
-        printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	int v_major, v_minor, v_bugfix;
+	superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+	printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+	printf("Input matrix file:\t%s\n", *cpp);
+        printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	fflush(stdout);
     }
 
 #if ( VAMPIR>=1 )
@@ -138,7 +143,7 @@ int main(int argc, char *argv[])
         options.ParSymbFact       = NO;
         options.ColPerm           = METIS_AT_PLUS_A;
         options.RowPerm           = LargeDiag;
-        options.ReplaceTinyPivot  = YES;
+        options.ReplaceTinyPivot  = NO;
         options.IterRefine        = DOUBLE;
         options.Trans             = NOTRANS;
         options.SolveInitialized  = NO;
@@ -151,12 +156,13 @@ int main(int argc, char *argv[])
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO; 
-    options.ReplaceTinyPivot = NO;
+    options.ReplaceTinyPivot = YES;
 #endif
 
     if (!iam) {
 	print_sp_ienv_dist(&options);
 	print_options_dist(&options);
+	fflush(stdout);
     }
 
     m = A.nrow;
diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c
index 9c01607..37e9ea8 100644
--- a/EXAMPLE/pddrive1.c
+++ b/EXAMPLE/pddrive1.c
@@ -60,6 +60,7 @@ int main(int argc, char *argv[])
     int    iam, info, ldb, ldx, nrhs;
     char     **cpp, c;
     FILE *fp, *fopen();
+    int cpp_defs();
 
     nprow = 1;  /* Default process rows.      */
     npcol = 1;  /* Default process columns.   */
@@ -104,8 +105,13 @@ int main(int argc, char *argv[])
     iam = grid.iam;
     if ( iam >= nprow * npcol )	goto out;
     if ( !iam ) {
-	printf("Input matrix file: %s\n", *cpp);
-        printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	int v_major, v_minor, v_bugfix;
+	superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+	printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+	printf("Input matrix file:\t%s\n", *cpp);
+        printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	fflush(stdout);
     }
 
 #if ( VAMPIR>=1 )
@@ -137,7 +143,7 @@ int main(int argc, char *argv[])
         options.Equil = YES;
         options.ColPerm = METIS_AT_PLUS_A;
         options.RowPerm = LargeDiag;
-        options.ReplaceTinyPivot = YES;
+        options.ReplaceTinyPivot = NO;
         options.Trans = NOTRANS;
         options.IterRefine = DOUBLE;
         options.SolveInitialized = NO;
@@ -149,6 +155,7 @@ int main(int argc, char *argv[])
     if (!iam) {
 	print_sp_ienv_dist(&options);
 	print_options_dist(&options);
+	fflush(stdout);
     }
 
     m = A.nrow;
diff --git a/EXAMPLE/pddrive2.c b/EXAMPLE/pddrive2.c
index 0cf3191..e4d9508 100644
--- a/EXAMPLE/pddrive2.c
+++ b/EXAMPLE/pddrive2.c
@@ -64,6 +64,7 @@ int main(int argc, char *argv[])
     int      iam, info, ldb, ldx, nrhs;
     char     **cpp, c;
     FILE *fp, *fopen();
+    int cpp_defs();
 
     /* prototypes */
     extern int dcreate_matrix_perturbed
@@ -113,8 +114,13 @@ int main(int argc, char *argv[])
     iam = grid.iam;
     if ( iam >= nprow * npcol )	goto out;
     if ( !iam ) {
-	printf("Input matrix file: %s\n", *cpp);
-        printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+	int v_major, v_minor, v_bugfix;
+	superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+	printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+	printf("Input matrix file:\t%s\n", *cpp);
+        printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	fflush(stdout);
     }
     
 #if ( DEBUGlevel>=1 )
@@ -142,7 +148,7 @@ int main(int argc, char *argv[])
         options.Equil = YES;
         options.ColPerm = METIS_AT_PLUS_A;
         options.RowPerm = LargeDiag;
-        options.ReplaceTinyPivot = YES;
+        options.ReplaceTinyPivot = NO;
         options.Trans = NOTRANS;
         options.IterRefine = DOUBLE;
         options.SolveInitialized = NO;
@@ -154,6 +160,7 @@ int main(int argc, char *argv[])
     if (!iam) {
 	print_sp_ienv_dist(&options);
 	print_options_dist(&options);
+	fflush(stdout);
     }
 
     /* Initialize ScalePermstruct and LUstruct. */
diff --git a/EXAMPLE/pddrive3.c b/EXAMPLE/pddrive3.c
index e591f39..2a971b1 100644
--- a/EXAMPLE/pddrive3.c
+++ b/EXAMPLE/pddrive3.c
@@ -69,6 +69,7 @@ int main(int argc, char *argv[])
     int      iam, info, ldb, ldx, nrhs;
     char     **cpp, c;
     FILE *fp, *fopen();
+    int cpp_defs();
 
     nprow = 1;  /* Default process rows.      */
     npcol = 1;  /* Default process columns.   */
@@ -113,8 +114,13 @@ int main(int argc, char *argv[])
     iam = grid.iam;
     if ( iam >= nprow * npcol )	goto out;
     if ( !iam ) {
-	printf("Input matrix file: %s\n", *cpp);
-        printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+	int v_major, v_minor, v_bugfix;
+	superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+	printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+	printf("Input matrix file:\t%s\n", *cpp);
+        printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	fflush(stdout);
     }
     
 #if ( DEBUGlevel>=1 )
@@ -161,7 +167,7 @@ int main(int argc, char *argv[])
         options.Equil = YES;
         options.ColPerm = METIS_AT_PLUS_A;
         options.RowPerm = LargeDiag;
-        options.ReplaceTinyPivot = YES;
+        options.ReplaceTinyPivot = NO;
         options.Trans = NOTRANS;
         options.IterRefine = DOUBLE;
         options.SolveInitialized = NO;
@@ -173,6 +179,7 @@ int main(int argc, char *argv[])
     if (!iam) {
 	print_sp_ienv_dist(&options);
 	print_options_dist(&options);
+	fflush(stdout);
     }
 
     /* Initialize ScalePermstruct and LUstruct. */
diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c
index d0192ec..1a03add 100644
--- a/EXAMPLE/pddrive4.c
+++ b/EXAMPLE/pddrive4.c
@@ -66,6 +66,7 @@ int main(int argc, char *argv[])
     int      nrhs = 1;   /* Number of right-hand side. */
     char     **cpp, c;
     FILE *fp, *fopen();
+    int cpp_defs();
 
 
     /* ------------------------------------------------------------
@@ -153,7 +154,7 @@ int main(int argc, char *argv[])
             options.Equil = YES;
             options.ColPerm = METIS_AT_PLUS_A;
             options.RowPerm = LargeDiag;
-            options.ReplaceTinyPivot = YES;
+            options.ReplaceTinyPivot = NO;
             options.Trans = NOTRANS;
             options.IterRefine = DOUBLE;
             options.SolveInitialized = NO;
diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c
index 33e0a9d..b1785b8 100644
--- a/EXAMPLE/pzdrive.c
+++ b/EXAMPLE/pzdrive.c
@@ -62,7 +62,7 @@ int main(int argc, char *argv[])
     int      iam, info, ldb, ldx, nrhs;
     char     **cpp, c;
     FILE *fp, *fopen();
-    extern int cpp_defs();
+    int cpp_defs();
 
     nprow = 1;  /* Default process rows.      */
     npcol = 1;  /* Default process columns.   */
@@ -107,8 +107,13 @@ int main(int argc, char *argv[])
     iam = grid.iam;
     if ( iam >= nprow * npcol )	goto out;
     if ( !iam ) {
-	printf("Input matrix file: %s\n", *cpp);
-        printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	int v_major, v_minor, v_bugfix;
+	superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+	printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+	printf("Input matrix file:\t%s\n", *cpp);
+        printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	fflush(stdout);
     }
 
 #if ( VAMPIR>=1 )
@@ -137,7 +142,7 @@ int main(int argc, char *argv[])
         options.ParSymbFact       = NO;
         options.ColPerm           = METIS_AT_PLUS_A;
         options.RowPerm           = LargeDiag;
-        options.ReplaceTinyPivot  = YES;
+        options.ReplaceTinyPivot  = NO;
         options.IterRefine        = DOUBLE;
         options.Trans             = NOTRANS;
         options.SolveInitialized  = NO;
@@ -150,12 +155,13 @@ int main(int argc, char *argv[])
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO; 
-    options.ReplaceTinyPivot = NO;
+    options.ReplaceTinyPivot = YES;
 #endif
 
     if (!iam) {
 	print_sp_ienv_dist(&options);
 	print_options_dist(&options);
+	fflush(stdout);
     }
 
     m = A.nrow;
diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c
index 402a133..50726ce 100644
--- a/EXAMPLE/pzdrive1.c
+++ b/EXAMPLE/pzdrive1.c
@@ -59,6 +59,7 @@ int main(int argc, char *argv[])
     int    iam, info, ldb, ldx, nrhs;
     char     **cpp, c;
     FILE *fp, *fopen();
+    int cpp_defs();
 
     nprow = 1;  /* Default process rows.      */
     npcol = 1;  /* Default process columns.   */
@@ -103,8 +104,13 @@ int main(int argc, char *argv[])
     iam = grid.iam;
     if ( iam >= nprow * npcol )	goto out;
     if ( !iam ) {
-	printf("Input matrix file: %s\n", *cpp);
-        printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	int v_major, v_minor, v_bugfix;
+	superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+	printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+	printf("Input matrix file:\t%s\n", *cpp);
+        printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	fflush(stdout);
     }
 
 #if ( VAMPIR>=1 )
@@ -136,7 +142,7 @@ int main(int argc, char *argv[])
         options.Equil = YES;
         options.ColPerm = METIS_AT_PLUS_A;
         options.RowPerm = LargeDiag;
-        options.ReplaceTinyPivot = YES;
+        options.ReplaceTinyPivot = NO;
         options.Trans = NOTRANS;
         options.IterRefine = DOUBLE;
         options.SolveInitialized = NO;
@@ -148,6 +154,7 @@ int main(int argc, char *argv[])
     if (!iam) {
 	print_sp_ienv_dist(&options);
 	print_options_dist(&options);
+	fflush(stdout);
     }
 
     m = A.nrow;
diff --git a/EXAMPLE/pzdrive2.c b/EXAMPLE/pzdrive2.c
index b75f6ef..bf8f69c 100644
--- a/EXAMPLE/pzdrive2.c
+++ b/EXAMPLE/pzdrive2.c
@@ -63,6 +63,7 @@ int main(int argc, char *argv[])
     int      iam, info, ldb, ldx, nrhs;
     char     **cpp, c;
     FILE *fp, *fopen();
+    int cpp_defs();
 
     /* prototypes */
     extern int zcreate_matrix_perturbed
@@ -112,8 +113,13 @@ int main(int argc, char *argv[])
     iam = grid.iam;
     if ( iam >= nprow * npcol )	goto out;
     if ( !iam ) {
-	printf("Input matrix file: %s\n", *cpp);
-        printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+	int v_major, v_minor, v_bugfix;
+	superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+	printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+	printf("Input matrix file:\t%s\n", *cpp);
+        printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	fflush(stdout);
     }
     
 #if ( DEBUGlevel>=1 )
@@ -141,7 +147,7 @@ int main(int argc, char *argv[])
         options.Equil = YES;
         options.ColPerm = METIS_AT_PLUS_A;
         options.RowPerm = LargeDiag;
-        options.ReplaceTinyPivot = YES;
+        options.ReplaceTinyPivot = NO;
         options.Trans = NOTRANS;
         options.IterRefine = DOUBLE;
         options.SolveInitialized = NO;
@@ -153,6 +159,7 @@ int main(int argc, char *argv[])
     if (!iam) {
 	print_sp_ienv_dist(&options);
 	print_options_dist(&options);
+	fflush(stdout);
     }
 
     /* Initialize ScalePermstruct and LUstruct. */
diff --git a/EXAMPLE/pzdrive3.c b/EXAMPLE/pzdrive3.c
index f251587..e086a77 100644
--- a/EXAMPLE/pzdrive3.c
+++ b/EXAMPLE/pzdrive3.c
@@ -68,6 +68,7 @@ int main(int argc, char *argv[])
     int      iam, info, ldb, ldx, nrhs;
     char     **cpp, c;
     FILE *fp, *fopen();
+    int cpp_defs();
 
     nprow = 1;  /* Default process rows.      */
     npcol = 1;  /* Default process columns.   */
@@ -112,8 +113,13 @@ int main(int argc, char *argv[])
     iam = grid.iam;
     if ( iam >= nprow * npcol )	goto out;
     if ( !iam ) {
-	printf("Input matrix file: %s\n", *cpp);
-        printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+	int v_major, v_minor, v_bugfix;
+	superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+	printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+	printf("Input matrix file:\t%s\n", *cpp);
+        printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	fflush(stdout);
     }
     
 #if ( DEBUGlevel>=1 )
@@ -160,7 +166,7 @@ int main(int argc, char *argv[])
         options.Equil = YES;
         options.ColPerm = METIS_AT_PLUS_A;
         options.RowPerm = LargeDiag;
-        options.ReplaceTinyPivot = YES;
+        options.ReplaceTinyPivot = NO;
         options.Trans = NOTRANS;
         options.IterRefine = DOUBLE;
         options.SolveInitialized = NO;
@@ -172,6 +178,7 @@ int main(int argc, char *argv[])
     if (!iam) {
 	print_sp_ienv_dist(&options);
 	print_options_dist(&options);
+	fflush(stdout);
     }
 
     /* Initialize ScalePermstruct and LUstruct. */
diff --git a/EXAMPLE/pzdrive4.c b/EXAMPLE/pzdrive4.c
index 8a1caad..8d170d5 100644
--- a/EXAMPLE/pzdrive4.c
+++ b/EXAMPLE/pzdrive4.c
@@ -65,6 +65,7 @@ int main(int argc, char *argv[])
     int      nrhs = 1;   /* Number of right-hand side. */
     char     **cpp, c;
     FILE *fp, *fopen();
+    int cpp_defs();
 
 
     /* ------------------------------------------------------------
@@ -152,7 +153,7 @@ int main(int argc, char *argv[])
             options.Equil = YES;
             options.ColPerm = METIS_AT_PLUS_A;
             options.RowPerm = LargeDiag;
-            options.ReplaceTinyPivot = YES;
+            options.ReplaceTinyPivot = NO;
             options.Trans = NOTRANS;
             options.IterRefine = DOUBLE;
             options.SolveInitialized = NO;
diff --git a/EXAMPLE/zcreate_matrix.c b/EXAMPLE/zcreate_matrix.c
index 87774cf..8660143 100644
--- a/EXAMPLE/zcreate_matrix.c
+++ b/EXAMPLE/zcreate_matrix.c
@@ -88,9 +88,14 @@ int zcreate_matrix(SuperMatrix *A, int nrhs, doublecomplex **rhs,
 #endif
 
     if ( !iam ) {
+        double t = SuperLU_timer_();
+
         /* Read the matrix stored on disk in Harwell-Boeing format. */
         zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
 
+	printf("Time to read and distribute matrix %.2f\n", 
+	        SuperLU_timer_() - t);  fflush(stdout);
+
 	/* Broadcast matrix A to the other PEs. */
 	MPI_Bcast( &m,     1,   mpi_int_t,  0, grid->comm );
 	MPI_Bcast( &n,     1,   mpi_int_t,  0, grid->comm );
diff --git a/INSTALL/superlu_timer.c b/INSTALL/superlu_timer.c
deleted file mode 100644
index 3a2ffcc..0000000
--- a/INSTALL/superlu_timer.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/* 
- * Purpose
- * ======= 
- *	Returns the time in seconds used by the process.
- *
- * Note: the timer function call is machine dependent. Use conditional
- *       compilation to choose the appropriate function.
- *
- */
-
-
-#ifdef SUN 
-/*
- * 	It uses the system call gethrtime(3C), which is accurate to 
- *	nanoseconds. 
-*/
-#include <sys/time.h>
- 
-double SuperLU_timer_() {
-    return ( (double)gethrtime() / 1e9 );
-}
-
-#elif defined ( UNIX_TIMER )
-
-#include <sys/types.h>
-#include <sys/times.h>
-#include <time.h>
-#include <sys/time.h>
-
-#ifndef CLK_TCK
-#define CLK_TCK 60
-#endif
-
-double SuperLU_timer_()
-{
-    struct tms use;
-    double tmp;
-    times(&use);
-    tmp = use.tms_utime;
-    tmp += use.tms_stime;
-    return (double)(tmp) / (double) CLK_TCK;
-}
-
-#else
-
-#include <mpi.h>
-
-double SuperLU_timer_()
-{
-    return MPI_Wtime();
-}
-
-#endif
-
diff --git a/MAKE_INC/make.cuda_gpu b/MAKE_INC/make.cuda_gpu
index 2e8b8a8..4ae0eac 100644
--- a/MAKE_INC/make.cuda_gpu
+++ b/MAKE_INC/make.cuda_gpu
@@ -62,9 +62,9 @@ RANLIB       	= ranlib
 CC           	= mpicc
 # CFLAGS should be set to be the C flags that include optimization
 CFLAGS          = ${CUDA_FLAGS} ${INCS} -std=c99 -O3 -Wall -w2 -openmp -mkl \
-		-DDEBUGlevel=0 -DPRNTlevel=1 -DPROFlevel=0 \
-#		-D_LONGINT 
+		-DDEBUGlevel=0 -DPRNTlevel=1 -DPROFlevel=0 
 #	-Wunused-variable 
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
 #
 # NOOPTS should be set to be the C flags that turn off any optimization
 NOOPTS		= -O0
diff --git a/MAKE_INC/make.mac-x b/MAKE_INC/make.mac-x
new file mode 100644
index 0000000..8beb9a3
--- /dev/null
+++ b/MAKE_INC/make.mac-x
@@ -0,0 +1,43 @@
+############################################################################
+#
+#  Program:         SuperLU_DIST
+#
+#  Module:          make.inc
+#
+#  Purpose:         Top-level Definitions
+#
+#  Creation date:   March 1, 2016	version 5.0.0
+#
+#  Modified:	    
+#		    
+#
+############################################################################
+#
+#  The name of the libraries to be created/linked to
+#
+SuperLUroot	= /Users/xsli/Dropbox/Codes/SuperLU/superlu_dist.git/
+DSUPERLULIB   	= $(SuperLUroot)/lib/libsuperlu_dist.a
+
+BLASLIB		= $(SuperLUroot)/CBLAS/libblas.a
+
+LIBS		= $(DSUPERLULIB) ${BLASLIB} /Users/xsli/lib/parmetis-4.0.3/build/Darwin-x86_64/libparmetis/libparmetis.a /Users/xsli/lib/parmetis-4.0.3/build/Darwin-x86_64/libmetis/libmetis.a
+
+#
+#  The archiver and the flag(s) to use when building archive (library)
+#  If your system has no ranlib, set RANLIB = echo.
+#
+ARCH         = /usr/bin/ar
+ARCHFLAGS    = cr
+RANLIB       = /usr/bin/ranlib
+
+CC           = /Users/xsli/lib/mpich2-install/bin/mpicc
+CFLAGS 	     = -O3 -DNDEBUG -I/Users/xsli/lib/parmetis-4.0.3/metis/include -I/Users/xsli/lib/parmetis-4.0.3/include  -DDEBUGlevel=0 -DPRNTlevel=0 -std=c99 -g
+#CFLAGS       += -openmp
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
+# CFLAGS       +=  
+NOOPTS       = -O0
+FORTRAN	     = /usr/local/bin/gfortran
+
+LOADER       = $(CC)
+LOADOPTS     = -openmp
+# LOADOPTS     = -Wl,-rpath,/Users/xsli/Dropbox/Codes/SuperLU/superlu_dist.git/xsli-build/lib
diff --git a/MAKE_INC/make.mpich b/MAKE_INC/make.mpich
index 559a086..db3b92c 100644
--- a/MAKE_INC/make.mpich
+++ b/MAKE_INC/make.mpich
@@ -39,7 +39,7 @@ RANLIB       = /usr/bin/ranlib
 
 CC           = /home/xiaoye/mpich-install/bin/mpicc
 CFLAGS 	     = -DNDEBUG -DUSE_VENDOR_BLAS -DAdd_ -DDEBUGlevel=0 -DPRNTlevel=0 -std=c99 -fPIC -g ${I_PARMETIS}
-# CFLAGS       += -D_LONGINT
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
 # CFLAGS       +=  
 NOOPTS       = -O0
 FORTRAN	     = /usr/bin/gfortran
diff --git a/MAKE_INC/make.mpich b/MAKE_INC/make.ssg1
similarity index 55%
copy from MAKE_INC/make.mpich
copy to MAKE_INC/make.ssg1
index 559a086..30f86b6 100644
--- a/MAKE_INC/make.mpich
+++ b/MAKE_INC/make.ssg1
@@ -15,19 +15,10 @@
 #
 #  The name of the libraries to be created/linked to
 #
-VERSION		= 5.1.3
-SuperLUroot	= /home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_${VERSION}
+SuperLUroot	= /home/xiaoye/Dropbox/Codes/SuperLU/superlu_dist.git
 DSUPERLULIB   	= $(SuperLUroot)/lib/libsuperlu_dist.a
 
-# BLASDEF 	= -DUSE_VENDOR_BLAS
-
-PARMETIS_DIR	:= ${HOME}/lib/static/parmetis-4.0.3
-I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
-METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
-PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
-
-LIBS		= $(DSUPERLULIB) /usr/lib/libf77blas.so /usr/lib/libatlas.so \
-		${PARMETISLIB} ${METISLIB}
+LIBS		= $(DSUPERLULIB) /usr/lib/libf77blas.so /usr/lib/libatlas.so /home/xiaoye/lib/static/parmetis-4.0.3/build/Linux-x86_64/libparmetis/libparmetis.a /home/xiaoye/lib/static/parmetis-4.0.3/build/Linux-x86_64/libmetis/libmetis.a
 
 #
 #  The archiver and the flag(s) to use when building archive (library)
@@ -38,11 +29,10 @@ ARCHFLAGS    = cr
 RANLIB       = /usr/bin/ranlib
 
 CC           = /home/xiaoye/mpich-install/bin/mpicc
-CFLAGS 	     = -DNDEBUG -DUSE_VENDOR_BLAS -DAdd_ -DDEBUGlevel=0 -DPRNTlevel=0 -std=c99 -fPIC -g ${I_PARMETIS}
-# CFLAGS       += -D_LONGINT
-# CFLAGS       +=  
+CFLAGS 	     = -O3 -DNDEBUG -I/home/xiaoye/lib/static/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/parmetis-4.0.3/include -fopenmp  -DUSE_VENDOR_BLAS -DDEBUGlevel=0 -DPRNTlevel=0 -std=c99 -g
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
 NOOPTS       = -O0
 FORTRAN	     = /usr/bin/gfortran
 
 LOADER       = $(CC)
-LOADOPTS     = -Wl,-rpath=/home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_${VERSION}/lib -g # -Wl,-Bdynamic
+LOADOPTS     = -Wl,-rpath,/home/xiaoye/Dropbox/Codes/SuperLU/superlu_dist.git/lib    -Wl,-rpath  -Wl,/home/xiaoye/mpich-install/lib  -Wl,--enable-new-dtags -fopenmp
diff --git a/MAKE_INC/make.xc30 b/MAKE_INC/make.xc30
index dba42bb..06131b8 100644
--- a/MAKE_INC/make.xc30
+++ b/MAKE_INC/make.xc30
@@ -62,7 +62,7 @@ CC           	= cc
 CFLAGS          = -fast -m64 -std=c99 -Wall -openmp \
 		$(I_PARMETIS) -DDEBUGlevel=0 -DPRNTlevel=0 -DPROFlevel=0 \
 # uncomment the following to use 64-bit integer
-# CFLAGS 		+= -D_LONGINT
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
 
 # NOOPTS should be set to be the C flags that turn off any optimization
 NOOPTS		= -O0 -std=c99
diff --git a/MAKE_INC/make.xt5 b/MAKE_INC/make.xt5
index 926d28e..1365a6e 100644
--- a/MAKE_INC/make.xt5
+++ b/MAKE_INC/make.xt5
@@ -59,7 +59,8 @@ RANLIB       	= ranlib
 CC      = cc
 INCS	= $(I_PARMETIS)
 # CFLAGS should be set to be the C flags that include optimization
-CFLAGS          = ${INCS} -c99 -fastsse -DDEBUGlevel=0 -DPRNTlevel=1 #-D_LONGINT
+CFLAGS          = ${INCS} -c99 -fastsse -DDEBUGlevel=0 -DPRNTlevel=1
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
 #
 # NOOPTS should be set to be the C flags that turn off any optimization
 NOOPTS		= -O0
diff --git a/Makefile b/Makefile
index 7717442..4ef799d 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@
 
 include make.inc
 
-all: install lib example
+all: lib install example
 
 lib: superlulib
 
@@ -43,3 +43,4 @@ cleantesting:
 	( cd INSTALL; $(MAKE) clean )
 	( cd EXAMPLE; $(MAKE) clean )
 	( cd FORTRAN; $(MAKE) clean )
+	( cd TEST; $(MAKE) clean )
diff --git a/README b/README
deleted file mode 100644
index 2cfbba7..0000000
--- a/README
+++ /dev/null
@@ -1,251 +0,0 @@
-		SuperLU_DIST (version 5.1)
-		============================
-
-SuperLU_DIST contains a set of subroutines to solve a sparse linear system 
-A*X=B. It uses Gaussian elimination with static pivoting (GESP). 
-Static pivoting is a technique that combines the numerical stability of
-partial pivoting with the scalability of Cholesky (no pivoting),
-to run accurately and efficiently on large numbers of processors. 
-
-SuperLU_DIST is a parallel extension to the serial SuperLU library.
-It is targeted for the distributed memory parallel machines.
-SuperLU_DIST is implemented in ANSI C, and MPI for communications.
-Currently, the LU factorization and triangular solution routines,
-which are the most time-consuming part of the solution process,
-are parallelized. The other routines, such as static pivoting and 
-column preordering for sparsity are performed sequentially. 
-This "alpha" release contains double-precision real and double-precision
-complex data types.
-
-The distribution contains the following directory structure:
-
-  SuperLU_DIST/README    instructions on installation
-  SuperLU_DIST/CBLAS/    needed BLAS routines in C, not necessarily fast
-  SuperLU_DIST/DOC/  	 the Users' Guide
-  SuperLU_DIST/EXAMPLE/  example programs
-  SuperLU_DIST/INSTALL/  test machine dependent parameters
-  SuperLU_DIST/SRC/      C source code, to be compiled into libsuperlu_dist.a
-  SuperLU_DIST/lib/      contains library archive libsuperlu_dist.a
-  SuperLU_DIST/Makefile  top level Makefile that does installation and testing
-  SuperLU_DIST/make.inc  compiler, compiler flags, library definitions and C
-                         preprocessor definitions, included in all Makefiles.
-                         (You may need to edit it to suit for your system
-                          before compiling the whole package.)
-  SuperLU_DIST/MAKE_INC/ sample machine-specific make.inc files
-
-
-----------------
-| INSTALLATION |
-----------------
-
-There are two ways to install the package. One requires users to 
-edit makefile manually, the other uses CMake build system.
-The procedures are described below.
-
-1. Manual installation with makefile.
-   Before installing the package, please examine the three things dependent 
-   on your system setup:
-
-   1.1 Edit the make.inc include file.
-
-       This make include file is referenced inside each of the Makefiles
-       in the various subdirectories. As a result, there is no need to 
-       edit the Makefiles in the subdirectories. All information that is
-       machine specific has been defined in this include file. 
-
-       Sample machine-specific make.inc are provided in the MAKE_INC/
-       directory for several platforms, such as Cray XT5 and IBM SP.
-       When you have selected the machine to which you wish to install
-       SuperLU_DIST, copy the appropriate sample include file 
-       (if one is present) into make.inc.
-       For example, if you wish to run SuperLU_DIST on a Cray XT5,  you can do
-
-       	   cp MAKE_INC/make.xc30  make.inc
-   
-	For the systems other than listed above, some porting effort is needed
-   	for parallel factorization routines. Please refer to the Users' Guide 
-   	for detailed instructions on porting.
-
-   	The following CPP definitions can be set in CFLAGS.
-      	  o -D_LONGINT
-          use 64-bit integers for indexing sparse matrices. (default 32 bit)
-
-      	  o -DPRNTlevel=[0,1,2,...]
-          printing level to show solver's execution details. (default 0)
-
-      	  o -DDEBUGlevel=[0,1,2,...]
-          diagnostic printing level for debugging purpose. (default 0)
-      
-   
-   1.2. The BLAS library.
-
-   	The parallel routines in SuperLU_DIST uses some sequential BLAS routines
-   	on each process. If there is BLAS library available on your machine,
-   	you may define the following in the file make.inc:
-            BLASDEF = -DUSE_VENDOR_BLAS
-            BLASLIB = <BLAS library you wish to link with>
-
-   	    The CBLAS/ subdirectory contains the part of the C BLAS needed by 
-   	    SuperLU_DIST package. However, these codes are intended for use
-	    only if there is no faster implementation of the BLAS already
-	    available on your machine. In this case, you should go to the
-	    top-level SuperLU_DIST/ directory and do the following:
-
-	    1) In make.inc, undefine (comment out) BLASDEF, and define:
-               BLASLIB = ../lib/libblas$(PLAT).a
-
-    	    2) Type: make blaslib
-       	       to make the BLAS library from the routines in the
-	       CBLAS/ subdirectory.
-
-
-   1.3. External libraries: Metis and ParMetis.
-
-      If you will use Metis or ParMetis ordering, you will
-      need to install them yourself. Since ParMetis package already
-      contains the source code for the Metis library, you can just
-      download and compile ParMetis from:
-      http://glaros.dtc.umn.edu/gkhome/metis/parmetis/download
-
-      After you have installed it, you should define the following in make.inc:
-        METISLIB = -L<metis directory> -lmetis
-        PARMETISLIB = -L<parmetis directory> -lparmetis
-        I_PARMETIS = -I<parmetis directory>/include -I<parmetis directory>/metis/include
-
-   1.4. C preprocessor definition CDEFS.
-
-   	In the header file SRC/Cnames.h, we use macros to determine how
-   	C routines should be named so that they are callable by Fortran.
-   	(Some vendor-supplied BLAS libraries do not have C interfaces. So the 
-    	re-naming is needed in order for the SuperLU BLAS calls (in C) to 
-    	interface with the Fortran-style BLAS.)
-   	The possible options for CDEFS are:
-
-       	o -DAdd_: Fortran expects a C routine to have an underscore
-		  postfixed to the name;
-		  (This is set as the default)
-        o -DNoChange: Fortran expects a C routine name to be identical to
-		      that compiled by C;
-        o -DUpCase: Fortran expects a C routine name to be all uppercase.
-   
-   1.5. Multicore and GPU (optional).
-   
-	To use OpenMP parallelism, need to compile the code with the
-	following CPP definition:
-
-	     -D_OPENMP
-
-        and set the number of threads to be used as follows:
-
- 	     setenv OMP_NUM_THREADS <##>
-
-   	To enable Nvidia GPU access, need to take the following 2 step:
-      	  1) set the following Linux environment variable:
-
-	     setenv ACC GPU
-
-      	  2) Add the CUDA library location in make.inc:
-
-    	  ifeq "${ACC}" "GPU"
-      	       CFLAGS += -DGPU_ACC
-               INCS += -I<CUDA directory>/include
-      	       LIBS += -L<CUDA directory>/lib64 -lcublas -lcudart 
-    	  endif
-
-   A Makefile is provided in each subdirectory. The installation can be done
-   completely automatically by simply typing "make" at the top level.
-
-2. Using CMake build system. 
-   You will need to create a build tree from which to invoke CMake.
-   
-   First, in order to use parallel symbolic factorization function, you
-   need to install ParMETIS parallel ordering package, and define the
-   two environment variables: PARMETIS_ROOT and PARMETIS_BUILD_DIR
-
-     setenv PARMETIS_ROOT <Prefix directory of the ParMETIS installation>
-     setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/build/Linux-x86_64
-
-   Then, the installation procedure is the following.
-
-   From the top level directory, do:
-
-     	mkdir build ; cd build
-   	cmake .. \
-	  -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
-          -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include"
-
-  ( example:
-  setenv PARMETIS_ROOT ~/lib/dynamic/parmetis-4.0.3 
-  setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/build/Linux-x86_64 
-  cmake .. \
-    -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
-    -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
-    -DCMAKE_C_FLAGS="-std=c99 -g" \
-    -Denable_blaslib=OFF \
-    -DBUILD_SHARED_LIBS=OFF \
-    -DCMAKE_C_COMPILER=mpicc \
-    -DCMAKE_INSTALL_PREFIX=..
-  )
-
-   To actually build, type:
-   	make
-
-   To install the libraries, type:
-        make install
-
-   To run the installation test, type:
-        make test
-        (The outputs are in file: build/Testing/Temporary/LastTest.log)
-
-
-   ++++++++
-   Note on the C-Fortran name mangling handled by C preprocessor definition:
-   ++++++++
-   In the default setting, we assume that Fortran expects a C routine
-   to have an underscore postfixed to the name. Depending on the
-   compiler, you may need to define one of the following flags in
-   during the cmake build to overwrite default setting:
-
-   cmake .. -DCMAKE_C_FLAGS="-DNoChange"
-
-   cmake .. -DCMAKE_C_FLAGS="-DUpCase"
-
-
---------------
-| REFERENCES |
---------------
-
-[1] SuperLU_DIST: A Scalable Distributed-Memory Sparse Direct Solver for
-    Unsymmetric Linear Systems.  Xiaoye S. Li and James W. Demmel.
-    ACM Trans. on Math. Solftware, Vol. 29, No. 2, June 2003, pp. 110-140.
-[2] Parallel Symbolic Factorization for Sparse LU with Static Pivoting.
-    L. Grigori, J. Demmel and X.S. Li. SIAM J. Sci. Comp., Vol. 29, Issue 3,
-    1289-1314, 2007.
-[3] A distributed CPU-GPU sparse direct solver. P. Sao, R. Vuduc and X.S. Li,
-    Proc. of EuroPar-2014 Parallel Processing, August 25-29, 2014.
-    Porto, Portugal.
-
-Xiaoye S. Li         Lawrence Berkeley National Lab, xsli at lbl.gov
-Laura Grigori        INRIA, France, Laura.Grigori at inria.fr
-Piyush Sao           Georgia Institute of Technology, piyush.feynman at gmail.com
-Ichitaro Yamazaki    Univ. of Tennessee, ic.yamazaki at gmail.com
-
---------------------
-| RELEASE VERSIONS |
---------------------
-
-  October 15, 2003   Version 2.0
-  October 1,  2007   Version 2.1
-  Feburary 20, 2008  Version 2.2
-  October 15, 2008   Version 2.3
-  June 9, 2010       Version 2.4 
-  November 23, 2010  Version 2.5
-  March 31, 2013     Version 3.3
-  October 1, 2014    Version 4.0
-  July 15, 2014      Version 4.1
-  September 25, 2015 Version 4.2
-  December 31, 2015  Version 4.3
-  April 8, 2016      Version 5.0.0
-  May 15, 2016       Version 5.1.0
-  October 4, 2016    Version 5.1.1
-  December 31, 2016  Version 5.1.3
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..47ca0fa
--- /dev/null
+++ b/README.md
@@ -0,0 +1,274 @@
+# SuperLU_DIST (version 5.2)
+
+[![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) 
+[Nightly tests](http://my.cdash.org/index.php?project=superlu_dist)
+
+SuperLU_DIST contains a set of subroutines to solve a sparse linear system 
+A*X=B. It uses Gaussian elimination with static pivoting (GESP). 
+Static pivoting is a technique that combines the numerical stability of
+partial pivoting with the scalability of Cholesky (no pivoting),
+to run accurately and efficiently on large numbers of processors. 
+
+SuperLU_DIST is a parallel extension to the serial SuperLU library.
+It is targeted for the distributed memory parallel machines.
+SuperLU_DIST is implemented in ANSI C, and MPI for communications.
+Currently, the LU factorization and triangular solution routines,
+which are the most time-consuming part of the solution process,
+are parallelized. The other routines, such as static pivoting and 
+column preordering for sparsity are performed sequentially. 
+This "alpha" release contains double-precision real and double-precision
+complex data types.
+
+### The distribution contains the following directory structure:
+
+```
+SuperLU_DIST/README    instructions on installation
+SuperLU_DIST/CBLAS/    needed BLAS routines in C, not necessarily fast
+	 	       (NOTE: this version is single threaded. If you use the
+		       library with multiple OpenMP threads, performance
+		       relies on a good multithreaded BLAS implementation.)
+SuperLU_DIST/DOC/      the Users' Guide
+SuperLU_DIST/EXAMPLE/  example programs
+SuperLU_DIST/INSTALL/  test machine dependent parameters
+SuperLU_DIST/SRC/      C source code, to be compiled into libsuperlu_dist.a
+SuperLU_DIST/TEST/     testing code
+SuperLU_DIST/lib/      contains library archive libsuperlu_dist.a
+SuperLU_DIST/Makefile  top-level Makefile that does installation and testing
+SuperLU_DIST/make.inc  compiler, compiler flags, library definitions and C
+	               preprocessor definitions, included in all Makefiles.
+	               (You may need to edit it to suit your system
+	               before compiling the whole package.)
+SuperLU_DIST/MAKE_INC/ sample machine-specific make.inc files
+```
+
+## INSTALLATION
+
+There are two ways to install the package. One requires users to 
+edit makefile manually, the other uses CMake build system.
+The procedures are described below.
+
+### Installation option 1: Manual installation with makefile.
+Before installing the package, please examine the three things dependent 
+on your system setup:
+
+1.1 Edit the make.inc include file.
+
+This make include file is referenced inside each of the Makefiles
+in the various subdirectories. As a result, there is no need to 
+edit the Makefiles in the subdirectories. All information that is
+machine specific has been defined in this include file. 
+
+Sample machine-specific make.inc are provided in the MAKE_INC/
+directory for several platforms, such as Cray XT5, Linux, Mac-OS, and CUDA.
+When you have selected the machine to which you wish to install
+SuperLU_DIST, copy the appropriate sample include file 
+(if one is present) into make.inc.
+
+For example, if you wish to run SuperLU_DIST on a Cray XT5,  you can do
+
+`cp MAKE_INC/make.xt5  make.inc`
+
+For the systems other than listed above, some porting effort is needed
+for parallel factorization routines. Please refer to the Users' Guide 
+for detailed instructions on porting.
+
+The following CPP definitions can be set in CFLAGS.
+
+```
+-DXSDK_INDEX_SIZE=64
+use 64-bit integers for indexing sparse matrices. (default 32 bit)
+
+-DPRNTlevel=[0,1,2,...]
+printing level to show solver's execution details. (default 0)
+
+-DDEBUGlevel=[0,1,2,...]
+diagnostic printing level for debugging purpose. (default 0)
+```      
+
+1.2. The BLAS library.
+The parallel routines in SuperLU_DIST use some BLAS routines on each MPI
+process. Moreover, if you enable OpenMP with multiple threads, you need to
+link with a multithreaded BLAS library. Otherwise performance will be poor.
+A good public domain BLAS library is OpenBLAS (http://www.openblas.net),
+which has OpenMP support.
+
+If you have a BLAS library your machine, you may define the following in
+the file make.inc:
+
+```
+BLASDEF = -DUSE_VENDOR_BLAS
+BLASLIB = <BLAS library you wish to link with>
+```
+
+The CBLAS/ subdirectory contains the part of the C BLAS (single threaded) 
+needed by SuperLU_DIST package. However, these codes are intended for use
+only if there is no faster implementation of the BLAS already
+available on your machine. In this case, you should go to the
+top-level SuperLU_DIST/ directory and do the following:
+
+1) In make.inc, undefine (comment out) BLASDEF, and define:
+` BLASLIB = ../lib/libblas$(PLAT).a`
+
+2) Type: `make blaslib`
+to make the BLAS library from the routines in the
+` CBLAS/ subdirectory.`
+
+1.3. External libraries: Metis and ParMetis.
+
+If you will use the Metis or ParMetis ordering, you need to
+install these packages yourself. Since the ParMetis package already
+contains the source code for the Metis library, you can simply
+download and compile ParMetis from:
+[http://glaros.dtc.umn.edu/gkhome/metis/parmetis/download](http://glaros.dtc.umn.edu/gkhome/metis/parmetis/download)
+
+After you have installed it, you should define the following in make.inc:
+```
+METISLIB = -L<metis directory> -lmetis
+PARMETISLIB = -L<parmetis directory> -lparmetis
+I_PARMETIS = -I<parmetis directory>/include -I<parmetis directory>/metis/include
+```
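+
+For instance, if ParMETIS 4.0.3 was built in-place under $(HOME)/parmetis-4.0.3
+(the directory layout and the helper variable PARMETIS_DIR below are only for
+illustration), the definitions could look like:
+
+```
+PARMETIS_DIR = $(HOME)/parmetis-4.0.3
+METISLIB     = -L$(PARMETIS_DIR)/build/Linux-x86_64/libmetis -lmetis
+PARMETISLIB  = -L$(PARMETIS_DIR)/build/Linux-x86_64/libparmetis -lparmetis
+I_PARMETIS   = -I$(PARMETIS_DIR)/include -I$(PARMETIS_DIR)/metis/include
+```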
+1.4. C preprocessor definition CDEFS.
+In the header file SRC/Cnames.h, we use macros to determine how
+C routines should be named so that they are callable by Fortran.
+(Some vendor-supplied BLAS libraries do not have C interfaces, so the
+renaming is needed in order for the SuperLU BLAS calls (in C) to
+interface with the Fortran-style BLAS.)
+The possible options for CDEFS are:
+
+* `-DAdd_`: Fortran expects a C routine to have an underscore
+  postfixed to the name (this is the default);
+* `-DNoChange`: Fortran expects a C routine name to be identical to
+  that compiled by C;
+* `-DUpCase`: Fortran expects a C routine name to be all uppercase.
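+
+In make.inc this is set on a single line, e.g. (the default shown here):
+
+```
+CDEFS = -DAdd_
+```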
+
+1.5. Multicore and GPU (optional).
+
+To use OpenMP parallelism, you need to link with an OpenMP library and
+set the number of threads you wish to use as follows (bash):
+`export OMP_NUM_THREADS=<##>`
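+
+With GCC-style compilers, linking the OpenMP runtime usually just means adding
+the OpenMP flag to the compile and link options in make.inc. The flag and the
+variable names below follow the sample make.inc files and are only an example;
+adjust them to your compiler and your make.inc:
+
+```
+CFLAGS   += -fopenmp
+LOADOPTS += -fopenmp
+```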
+
+To enable NVIDIA GPU access, you need to take the following two steps:
+1) Set the following Linux environment variable:
+`export ACC=GPU`
+
+2) Add the CUDA library location in make.inc:
+```
+ifeq "${ACC}" "GPU"
+CFLAGS += -DGPU_ACC
+INCS += -I<CUDA directory>/include
+LIBS += -L<CUDA directory>/lib64 -lcublas -lcudart 
+endif
+```
+A Makefile is provided in each subdirectory. The installation can be done
+completely automatically by simply typing "make" at the top level.
+
+### Installation option 2: Using CMake build system.
+You will need to create a build tree from which to invoke CMake.
+
+First, in order to use the parallel symbolic factorization function, you
+need to install the ParMETIS parallel ordering package and define the
+two environment variables PARMETIS_ROOT and PARMETIS_BUILD_DIR:
+
+```
+export PARMETIS_ROOT=<Prefix directory of the ParMETIS installation>
+export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64
+```
+Then, the installation procedure is the following.
+
+From the top level directory, do:
+```
+mkdir build ; cd build
+cmake .. \
+-DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
+-DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include"
+
+( Example cmake script: see run_cmake_build.sh
+
+export PARMETIS_ROOT=~/lib/dynamic/parmetis-4.0.3 
+export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64 
+cmake .. \
+-DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
+-DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
+-DCMAKE_C_FLAGS="-std=c99 -g" \
+-Denable_blaslib=OFF \
+-DBUILD_SHARED_LIBS=OFF \
+-DCMAKE_C_COMPILER=mpicc \
+-DCMAKE_INSTALL_PREFIX=.
+
+)
+```
+To actually build, type:
+`make`
+
+To install the libraries, type:
+`make install`
+
+To run the installation test, type:
+`ctest`
+(The outputs are in file: `build/Testing/Temporary/LastTest.log`)
+or,
+`ctest -D Experimental`
+or,
+`ctest -D Nightly`
+
+**NOTE:**
+The parallel execution in ctest is invoked by the "mpiexec" command, which
+comes from the MPICH environment. If your MPI is not MPICH/mpiexec based, the
+test execution may fail. You can always go to the TEST/ directory and perform
+the testing manually.
+
+**Note on the C-Fortran name mangling handled by C preprocessor definition:**  
+In the default setting, we assume that Fortran expects a C routine
+to have an underscore postfixed to the name. Depending on the
+compiler, you may need to define one of the following flags
+during the cmake build to override the default setting:
+
+```
+cmake .. -DCMAKE_C_FLAGS="-DNoChange" 
+cmake .. -DCMAKE_C_FLAGS="-DUpCase"
+```
+
+## READING SPARSE MATRIX FILES
+
+The SRC/ directory contains the following routines to read different file
+formats; they all have a similar calling sequence.
+```
+$ ls -l dread*.c
+dreadMM.c              : Matrix Market, files with suffix .mtx
+dreadhb.c              : Harwell-Boeing, files with suffix .rua
+dreadrb.c              : Rutherford-Boeing, files with suffix .rb
+dreadtriple.c          : triplet, with header
+dreadtriple_noheader.c : triplet, no header, which is also readable in Matlab
+```
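+
+These readers are what the example drivers in EXAMPLE/ use to load an input
+matrix before calling the solver. As a rough illustration (the executable
+location, 2x2 process grid, and input file name are placeholders), a driver
+can be run on a Harwell-Boeing file along these lines:
+
+```
+mpiexec -n 4 ./pddrive -r 2 -c 2 big.rua
+```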
+
+## REFERENCES
+
+**[1]** SuperLU_DIST: A Scalable Distributed-Memory Sparse Direct Solver for Unsymmetric Linear Systems. Xiaoye S. Li and James W. Demmel. ACM Trans. on Math. Software, Vol. 29, No. 2, June 2003, pp. 110-140.  
+**[2]** Parallel Symbolic Factorization for Sparse LU with Static Pivoting. L. Grigori, J. Demmel and X.S. Li. SIAM J. Sci. Comp., Vol. 29, Issue 3, 1289-1314, 2007.  
+**[3]** A distributed CPU-GPU sparse direct solver. P. Sao, R. Vuduc and X.S. Li, Proc. of EuroPar-2014 Parallel Processing, August 25-29, 2014. Porto, Portugal.  
+
+**Xiaoye S. Li**, Lawrence Berkeley National Lab, [xsli at lbl.gov](xsli at lbl.gov)  
+**Laura Grigori**, INRIA, France, [laura.grigori at inria.fr](laura.grigori at inria.fr)  
+**Piyush Sao**, Georgia Institute of Technology, [piyush.feynman at gmail.com](piyush.feynman at gmail.com)  
+**Ichitaro Yamazaki**, Univ. of Tennessee, [ic.yamazaki at gmail.com](ic.yamazaki at gmail.com)  
+
+## RELEASE VERSIONS
+```
+October 15, 2003    Version 2.0  
+October 1,  2007    Version 2.1  
+February 20, 2008   Version 2.2  
+October 15, 2008    Version 2.3  
+June 9, 2010        Version 2.4  
+November 23, 2010   Version 2.5  
+March 31, 2013      Version 3.3  
+October 1, 2014     Version 4.0  
+July 15, 2014       Version 4.1  
+September 25, 2015  Version 4.2  
+December 31, 2015   Version 4.3  
+April 8, 2016       Version 5.0.0  
+May 15, 2016        Version 5.1.0  
+October 4, 2016     Version 5.1.1  
+December 31, 2016   Version 5.1.3  
+September 30, 2017  Version 5.2.0  
+```
diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index b8341c9..36b55d1 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -8,6 +8,8 @@ set(headers
     superlu_enum_consts.h
     supermatrix.h
     util_dist.h
+    colamd.h
+    superlu_dist_config.h
 )
 
 # first: precision-independent files
@@ -32,6 +34,8 @@ set(sources
   xerr_dist.c
   smach_dist.c
   dmach_dist.c
+  colamd.c
+  superlu_dist_version.c
 )
 set_source_files_properties(superlu_timer.c PROPERTIES COMPILE_FLAGS -O0)
 
@@ -123,5 +127,14 @@ set_target_properties(superlu_dist PROPERTIES
                       VERSION ${PROJECT_VERSION} SOVERSION ${VERSION_MAJOR}
 )
 
-install(TARGETS superlu_dist DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
-install(FILES ${headers} DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
+# Define GNU standard installation directories
+include(GNUInstallDirs)
+
+install(TARGETS superlu_dist
+# DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
+     DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
+install(FILES ${headers}
+# DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
diff --git a/SRC/Makefile b/SRC/Makefile
index c78083d..532274e 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -30,10 +30,11 @@ include ../make.inc
 # Precision independent routines
 #
 ALLAUX 	= sp_ienv.o etree.o sp_colorder.o get_perm_c.o \
-	  mmd.o comm.o memory.o util.o superlu_grid.o \
+	  colamd.o mmd.o comm.o memory.o util.o superlu_grid.o \
 	  pxerr_dist.o superlu_timer.o symbfact.o \
 	  psymbfact.o psymbfact_util.o get_perm_c_parmetis.o mc64ad_dist.o \
-	  static_schedule.o xerr_dist.o smach_dist.o dmach_dist.o
+	  static_schedule.o xerr_dist.o smach_dist.o dmach_dist.o \
+	  superlu_dist_version.o
 
 ifeq "${ACC}" "GPU"
 ALLAUX += cublas_utils.o
@@ -70,16 +71,29 @@ ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \
 
 all:  double complex16
 
-double: $(DSLUSRC) $(DPLUSRC) $(ALLAUX)
+config_h:
+ifeq ($(XSDK_INDEX_SIZE),64)
+	printf "#define XSDK_INDEX_SIZE 64\n" > superlu_dist_config.h
+else
+	printf "/* #define XSDK_INDEX_SIZE 64 */\n" > superlu_dist_config.h
+endif
+	printf "#if (XSDK_INDEX_SIZE == 64)\n#define _LONGINT 1\n#endif\n" >> superlu_dist_config.h
+
+double: config_h $(DSLUSRC) $(DPLUSRC) $(ALLAUX)
 	$(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \
 		$(DSLUSRC) $(DPLUSRC) $(ALLAUX)
 	$(RANLIB) $(DSUPERLULIB)
 
-complex16: $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX)
+complex16: config_h $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX)
 	$(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \
 		$(ZSLUSRC) $(ZPLUSRC) $(ALLAUX)
 	$(RANLIB) $(DSUPERLULIB)
 
+pdgstrf.o: dscatter.c dlook_ahead_update.c dSchCompUdt-2Ddynamic.c pdgstrf.c
+	$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c pdgstrf.c $(VERBOSE)
+
+pzgstrf.o: zscatter.c zlook_ahead_update.c zSchCompUdt-2Ddynamic.c pzgstrf.c
+	$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c pzgstrf.c $(VERBOSE)
 
 .c.o:
 	$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c $< $(VERBOSE)
diff --git a/SRC/colamd.c b/SRC/colamd.c
new file mode 100644
index 0000000..5500e68
--- /dev/null
+++ b/SRC/colamd.c
@@ -0,0 +1,3424 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file colamd.c
+ *\brief A sparse matrix column ordering algorithm
+ 
+ <pre>
+    ========================================================================== 
+    === colamd/symamd - a sparse matrix column ordering algorithm ============ 
+    ========================================================================== 
+
+
+    colamd:  an approximate minimum degree column ordering algorithm,
+    	for LU factorization of symmetric or unsymmetric matrices,
+	QR factorization, least squares, interior point methods for
+	linear programming problems, and other related problems.
+
+    symamd:  an approximate minimum degree ordering algorithm for Cholesky
+    	factorization of symmetric matrices.
+
+    Purpose:
+
+	Colamd computes a permutation Q such that the Cholesky factorization of
+	(AQ)'(AQ) has less fill-in and requires fewer floating point operations
+	than A'A.  This also provides a good ordering for sparse partial
+	pivoting methods, P(AQ) = LU, where Q is computed prior to numerical
+	factorization, and P is computed during numerical factorization via
+	conventional partial pivoting with row interchanges.  Colamd is the
+	column ordering method used in SuperLU, part of the ScaLAPACK library.
+	It is also available as built-in function in MATLAB Version 6,
+	available from MathWorks, Inc. (http://www.mathworks.com).  This
+	routine can be used in place of colmmd in MATLAB.
+
+    	Symamd computes a permutation P of a symmetric matrix A such that the
+	Cholesky factorization of PAP' has less fill-in and requires fewer
+	floating point operations than A.  Symamd constructs a matrix M such
+	that M'M has the same nonzero pattern of A, and then orders the columns
+	of M using colamd.  The column ordering of M is then returned as the
+	row and column ordering P of A. 
+
+    Authors:
+
+	The authors of the code itself are Stefan I. Larimore and Timothy A.
+	Davis (davis at cise.ufl.edu), University of Florida.  The algorithm was
+	developed in collaboration with John Gilbert, Xerox PARC, and Esmond
+	Ng, Oak Ridge National Laboratory.
+
+    Date:
+
+	September 8, 2003.  Version 2.3.
+
+    Acknowledgements:
+
+	This work was supported by the National Science Foundation, under
+	grants DMS-9504974 and DMS-9803599.
+
+    Copyright and License:
+
+	Copyright (c) 1998-2003 by the University of Florida.
+	All Rights Reserved.
+
+	THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+	EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+
+	Permission is hereby granted to use, copy, modify, and/or distribute
+	this program, provided that the Copyright, this License, and the
+	Availability of the original version is retained on all copies and made
+	accessible to the end-user of any code or package that includes COLAMD
+	or any modified version of COLAMD. 
+
+    Availability:
+
+	The colamd/symamd library is available at
+
+	    http://www.cise.ufl.edu/research/sparse/colamd/
+
+	This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.c
+	file.  It requires the colamd.h file.  It is required by the colamdmex.c
+	and symamdmex.c files, for the MATLAB interface to colamd and symamd.
+
+    See the ChangeLog file for changes since Version 1.0.
+
+    ========================================================================== 
+    === Description of user-callable routines ================================ 
+    ========================================================================== 
+
+
+    ----------------------------------------------------------------------------
+    colamd_recommended:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    int colamd_recommended (int nnz, int n_row, int n_col) ;
+
+	    or as a C macro
+
+	    #include "colamd.h"
+	    Alen = COLAMD_RECOMMENDED (int nnz, int n_row, int n_col) ;
+
+	Purpose:
+
+	    Returns recommended value of Alen for use by colamd.  Returns -1
+	    if any input argument is negative.  The use of this routine
+	    or macro is optional.  Note that the macro uses its arguments
+	    more than once, so be careful for side effects, if you pass
+	    expressions as arguments to COLAMD_RECOMMENDED.  Not needed for
+	    symamd, which dynamically allocates its own memory.
+
+	Arguments (all input arguments):
+
+	    int nnz ;		Number of nonzeros in the matrix A.  This must
+				be the same value as p [n_col] in the call to
+				colamd - otherwise you will get a wrong value
+				of the recommended memory to use.
+
+	    int n_row ;		Number of rows in the matrix A.
+
+	    int n_col ;		Number of columns in the matrix A.
+
+    ----------------------------------------------------------------------------
+    colamd_set_defaults:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    colamd_set_defaults (double knobs [COLAMD_KNOBS]) ;
+
+	Purpose:
+
+	    Sets the default parameters.  The use of this routine is optional.
+
+	Arguments:
+
+	    double knobs [COLAMD_KNOBS] ;	Output only.
+
+		Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col)
+		entries are removed prior to ordering.  Columns with more than
+		(knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to
+		ordering, and placed last in the output column ordering. 
+
+		Symamd: uses only knobs [COLAMD_DENSE_ROW], which is knobs [0].
+		Rows and columns with more than (knobs [COLAMD_DENSE_ROW] * n)
+		entries are removed prior to ordering, and placed last in the
+		output ordering.
+
+		COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1,
+		respectively, in colamd.h.  Default values of these two knobs
+		are both 0.5.  Currently, only knobs [0] and knobs [1] are
+		used, but future versions may use more knobs.  If so, they will
+		be properly set to their defaults by the future version of
+		colamd_set_defaults, so that the code that calls colamd will
+		not need to change, assuming that you either use
+		colamd_set_defaults, or pass a (double *) NULL pointer as the
+		knobs array to colamd or symamd.
+
+    ----------------------------------------------------------------------------
+    colamd:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    int colamd (int n_row, int n_col, int Alen, int *A, int *p,
+	    	double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS]) ;
+
+	Purpose:
+
+	    Computes a column ordering (Q) of A such that P(AQ)=LU or
+	    (AQ)'AQ=LL' have less fill-in and require fewer floating point
+	    operations than factorizing the unpermuted matrix A or A'A,
+	    respectively.
+	    
+	Returns:
+
+	    TRUE (1) if successful, FALSE (0) otherwise.
+
+	Arguments:
+
+	    int n_row ;		Input argument.
+
+		Number of rows in the matrix A.
+		Restriction:  n_row >= 0.
+		Colamd returns FALSE if n_row is negative.
+
+	    int n_col ;		Input argument.
+
+		Number of columns in the matrix A.
+		Restriction:  n_col >= 0.
+		Colamd returns FALSE if n_col is negative.
+
+	    int Alen ;		Input argument.
+
+		Restriction (see note):
+		Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col
+		Colamd returns FALSE if these conditions are not met.
+
+		Note:  this restriction makes a modest assumption regarding
+		the size of the two typedef's structures in colamd.h.
+		We do, however, guarantee that
+
+			Alen >= colamd_recommended (nnz, n_row, n_col)
+		
+		or equivalently as a C preprocessor macro: 
+
+			Alen >= COLAMD_RECOMMENDED (nnz, n_row, n_col)
+
+		will be sufficient.
+
+	    int A [Alen] ;	Input argument, undefined on output.
+
+		A is an integer array of size Alen.  Alen must be at least as
+		large as the bare minimum value given above, but this is very
+		low, and can result in excessive run time.  For best
+		performance, we recommend that Alen be greater than or equal to
+		colamd_recommended (nnz, n_row, n_col), which adds
+		nnz/5 to the bare minimum value given above.
+
+		On input, the row indices of the entries in column c of the
+		matrix are held in A [(p [c]) ... (p [c+1]-1)].  The row indices
+		in a given column c need not be in ascending order, and
+		duplicate row indices may be present.  However, colamd will
+		work a little faster if both of these conditions are met
+		(Colamd puts the matrix into this format, if it finds that
+		the conditions are not met).
+
+		The matrix is 0-based.  That is, rows are in the range 0 to
+		n_row-1, and columns are in the range 0 to n_col-1.  Colamd
+		returns FALSE if any row index is out of range.
+
+		The contents of A are modified during ordering, and are
+		undefined on output.
+
+	    int p [n_col+1] ;	Both input and output argument.
+
+		p is an integer array of size n_col+1.  On input, it holds the
+		"pointers" for the column form of the matrix A.  Column c of
+		the matrix A is held in A [(p [c]) ... (p [c+1]-1)].  The first
+		entry, p [0], must be zero, and p [c] <= p [c+1] must hold
+		for all c in the range 0 to n_col-1.  The value p [n_col] is
+		thus the total number of entries in the pattern of the matrix A.
+		Colamd returns FALSE if these conditions are not met.
+
+		On output, if colamd returns TRUE, the array p holds the column
+		permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is
+		the first column index in the new ordering, and p [n_col-1] is
+		the last.  That is, p [k] = j means that column j of A is the
+		kth pivot column, in AQ, where k is in the range 0 to n_col-1
+		(p [0] = j means that column j of A is the first column in AQ).
+
+		If colamd returns FALSE, then no permutation is returned, and
+		p is undefined on output.
+
+	    double knobs [COLAMD_KNOBS] ;	Input argument.
+
+		See colamd_set_defaults for a description.
+
+	    int stats [COLAMD_STATS] ;		Output argument.
+
+		Statistics on the ordering, and error status.
+		See colamd.h for related definitions.
+		Colamd returns FALSE if stats is not present.
+
+		stats [0]:  number of dense or empty rows ignored.
+
+		stats [1]:  number of dense or empty columns ignored (and
+				ordered last in the output permutation p)
+				Note that a row can become "empty" if it
+				contains only "dense" and/or "empty" columns,
+				and similarly a column can become "empty" if it
+				only contains "dense" and/or "empty" rows.
+
+		stats [2]:  number of garbage collections performed.
+				This can be excessively high if Alen is close
+				to the minimum required value.
+
+		stats [3]:  status code.  < 0 is an error code.
+			    > 1 is a warning or notice.
+
+			0	OK.  Each column of the input matrix contained
+				row indices in increasing order, with no
+				duplicates.
+
+			1	OK, but columns of input matrix were jumbled
+				(unsorted columns or duplicate entries).  Colamd
+				had to do some extra work to sort the matrix
+				first and remove duplicate entries, but it
+				still was able to return a valid permutation
+				(return value of colamd was TRUE).
+
+					stats [4]: highest numbered column that
+						is unsorted or has duplicate
+						entries.
+					stats [5]: last seen duplicate or
+						unsorted row index.
+					stats [6]: number of duplicate or
+						unsorted row indices.
+
+			-1	A is a null pointer
+
+			-2	p is a null pointer
+
+			-3 	n_row is negative
+
+					stats [4]: n_row
+
+			-4	n_col is negative
+
+					stats [4]: n_col
+
+			-5	number of nonzeros in matrix is negative
+
+					stats [4]: number of nonzeros, p [n_col]
+
+			-6	p [0] is nonzero
+
+					stats [4]: p [0]
+
+			-7	A is too small
+
+					stats [4]: required size
+					stats [5]: actual size (Alen)
+
+			-8	a column has a negative number of entries
+
+					stats [4]: column with < 0 entries
+					stats [5]: number of entries in col
+
+			-9	a row index is out of bounds
+
+					stats [4]: column with bad row index
+					stats [5]: bad row index
+					stats [6]: n_row, # of rows of matrix
+
+			-10	(unused; see symamd.c)
+
+			-999	(unused; see symamd.c)
+
+		Future versions may return more statistics in the stats array.
+
+	Example:
+	
+	    See http://www.cise.ufl.edu/research/sparse/colamd/example.c
+	    for a complete example.
+
+	    To order the columns of a 5-by-4 matrix with 11 nonzero entries in
+	    the following nonzero pattern
+
+	    	x 0 x 0
+		x 0 x x
+		0 x x 0
+		0 0 x x
+		x x 0 0
+
+	    with default knobs and no output statistics, do the following:
+
+		#include "colamd.h"
+		#define ALEN COLAMD_RECOMMENDED (11, 5, 4)
+		int A [ALEN] = {1, 2, 5, 3, 5, 1, 2, 3, 4, 2, 4} ;
+		int p [ ] = {0, 3, 5, 9, 11} ;
+		int stats [COLAMD_STATS] ;
+		colamd (5, 4, ALEN, A, p, (double *) NULL, stats) ;
+
+	    The permutation is returned in the array p, and A is destroyed.
+
+    ----------------------------------------------------------------------------
+    symamd:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    int symamd (int n, int *A, int *p, int *perm,
+	    	double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS],
+		void (*allocate) (size_t, size_t), void (*release) (void *)) ;
+
+	Purpose:
+
+    	    The symamd routine computes an ordering P of a symmetric sparse
+	    matrix A such that the Cholesky factorization PAP' = LL' remains
+	    sparse.  It is based on a column ordering of a matrix M constructed
+	    so that the nonzero pattern of M'M is the same as A.  The matrix A
+	    is assumed to be symmetric; only the strictly lower triangular part
+	    is accessed.  You must pass your selected memory allocator (usually
+	    calloc/free or mxCalloc/mxFree) to symamd, for it to allocate
+	    memory for the temporary matrix M.
+
+	Returns:
+
+	    TRUE (1) if successful, FALSE (0) otherwise.
+
+	Arguments:
+
+	    int n ;		Input argument.
+
+	    	Number of rows and columns in the symmetric matrix A.
+		Restriction:  n >= 0.
+		Symamd returns FALSE if n is negative.
+
+	    int A [nnz] ;	Input argument.
+
+	    	A is an integer array of size nnz, where nnz = p [n].
+		
+		The row indices of the entries in column c of the matrix are
+		held in A [(p [c]) ... (p [c+1]-1)].  The row indices in a
+		given column c need not be in ascending order, and duplicate
+		row indices may be present.  However, symamd will run faster
+		if the columns are in sorted order with no duplicate entries. 
+
+		The matrix is 0-based.  That is, rows are in the range 0 to
+		n-1, and columns are in the range 0 to n-1.  Symamd
+		returns FALSE if any row index is out of range.
+
+		The contents of A are not modified.
+
+	    int p [n+1] ;   	Input argument.
+
+		p is an integer array of size n+1.  On input, it holds the
+		"pointers" for the column form of the matrix A.  Column c of
+		the matrix A is held in A [(p [c]) ... (p [c+1]-1)].  The first
+		entry, p [0], must be zero, and p [c] <= p [c+1] must hold
+		for all c in the range 0 to n-1.  The value p [n] is
+		thus the total number of entries in the pattern of the matrix A.
+		Symamd returns FALSE if these conditions are not met.
+
+		The contents of p are not modified.
+
+	    int perm [n+1] ;   	Output argument.
+
+		On output, if symamd returns TRUE, the array perm holds the
+		permutation P, where perm [0] is the first index in the new
+		ordering, and perm [n-1] is the last.  That is, perm [k] = j
+		means that row and column j of A is the kth column in PAP',
+		where k is in the range 0 to n-1 (perm [0] = j means
+		that row and column j of A are the first row and column in
+		PAP').  The array is used as a workspace during the ordering,
+		which is why it must be of length n+1, not just n.
+
+	    double knobs [COLAMD_KNOBS] ;	Input argument.
+
+		See colamd_set_defaults for a description.
+
+	    int stats [COLAMD_STATS] ;		Output argument.
+
+		Statistics on the ordering, and error status.
+		See colamd.h for related definitions.
+		Symamd returns FALSE if stats is not present.
+
+		stats [0]:  number of dense or empty rows and columns ignored
+				(and ordered last in the output permutation 
+				perm).  Note that a row/column can become
+				"empty" if it contains only "dense" and/or
+				"empty" columns/rows.
+
+		stats [1]:  (same as stats [0])
+
+		stats [2]:  number of garbage collections performed.
+
+		stats [3]:  status code.  < 0 is an error code.
+			    > 1 is a warning or notice.
+
+			0	OK.  Each column of the input matrix contained
+				row indices in increasing order, with no
+				duplicates.
+
+			1	OK, but columns of input matrix were jumbled
+				(unsorted columns or duplicate entries).  Symamd
+				had to do some extra work to sort the matrix
+				first and remove duplicate entries, but it
+				still was able to return a valid permutation
+				(return value of symamd was TRUE).
+
+					stats [4]: highest numbered column that
+						is unsorted or has duplicate
+						entries.
+					stats [5]: last seen duplicate or
+						unsorted row index.
+					stats [6]: number of duplicate or
+						unsorted row indices.
+
+			-1	A is a null pointer
+
+			-2	p is a null pointer
+
+			-3	(unused, see colamd.c)
+
+			-4 	n is negative
+
+					stats [4]: n
+
+			-5	number of nonzeros in matrix is negative
+
+					stats [4]: # of nonzeros (p [n]).
+
+			-6	p [0] is nonzero
+
+					stats [4]: p [0]
+
+			-7	(unused)
+
+			-8	a column has a negative number of entries
+
+					stats [4]: column with < 0 entries
+					stats [5]: number of entries in col
+
+			-9	a row index is out of bounds
+
+					stats [4]: column with bad row index
+					stats [5]: bad row index
+					stats [6]: n_row, # of rows of matrix
+
+			-10	out of memory (unable to allocate temporary
+				workspace for M or count arrays using the
+				"allocate" routine passed into symamd).
+
+			-999	internal error.  colamd failed to order the
+				matrix M, when it should have succeeded.  This
+				indicates a bug.  If this (and *only* this)
+				error code occurs, please contact the authors.
+				Don't contact the authors if you get any other
+				error code.
+
+		Future versions may return more statistics in the stats array.
+
+	    void * (*allocate) (size_t, size_t)
+
+	    	A pointer to a function providing memory allocation.  The
+		allocated memory must be returned initialized to zero.  For a
+		C application, this argument should normally be a pointer to
+		calloc.  For a MATLAB mexFunction, the routine mxCalloc is
+		passed instead.
+
+	    void (*release) (void *)
+
+	    	A pointer to a function that frees memory allocated by the
+		memory allocation routine above.  For a C application, this
+		argument should normally be a pointer to free.  For a MATLAB
+		mexFunction, the routine mxFree is passed instead.
+
+
+    ----------------------------------------------------------------------------
+    colamd_report:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    colamd_report (int stats [COLAMD_STATS]) ;
+
+	Purpose:
+
+	    Prints the error status and statistics recorded in the stats
+	    array on the standard error output (for a standard C routine)
+	    or on the MATLAB output (for a mexFunction).
+
+	Arguments:
+
+	    int stats [COLAMD_STATS] ;	Input only.  Statistics from colamd.
+
+
+    ----------------------------------------------------------------------------
+    symamd_report:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    symamd_report (int stats [COLAMD_STATS]) ;
+
+	Purpose:
+
+	    Prints the error status and statistics recorded in the stats
+	    array on the standard error output (for a standard C routine)
+	    or on the MATLAB output (for a mexFunction).
+
+	Arguments:
+
+	    int stats [COLAMD_STATS] ;	Input only.  Statistics from symamd.
+
+ </pre>
+*/
+
+/* ========================================================================== */
+/* === Scaffolding code definitions  ======================================== */
+/* ========================================================================== */
+
+/* Ensure that debugging is turned off: */
+#ifndef NDEBUG
+#define NDEBUG
+#endif /* NDEBUG */
+
+/*
+   Our "scaffolding code" philosophy:  In our opinion, well-written library
+   code should keep its "debugging" code, and just normally have it turned off
+   by the compiler so as not to interfere with performance.  This serves
+   several purposes:
+
+   (1) assertions act as comments to the reader, telling you what the code
+	expects at that point.  All assertions will always be true (unless
+	there really is a bug, of course).
+
+   (2) leaving in the scaffolding code assists anyone who would like to modify
+	the code, or understand the algorithm (by reading the debugging output,
+	one can get a glimpse into what the code is doing).
+
+   (3) (gasp!) for actually finding bugs.  This code has been heavily tested
+	and "should" be fully functional and bug-free ... but you never know...
+
+    To enable debugging, comment out the "#define NDEBUG" above.  For a MATLAB
+    mexFunction, you will also need to modify mexopts.sh to remove the -DNDEBUG
+    definition.  The code will become outrageously slow when debugging is
+    enabled.  To control the level of debugging output, set an environment
+    variable D to 0 (little), 1 (some), 2, 3, or 4 (lots).  When debugging,
+    you should see the following message on the standard output:
+
+    	colamd: debug version, D = 1 (THIS WILL BE SLOW!)
+
+    or a similar message for symamd.  If you don't, then debugging has not
+    been enabled.
+
+*/
+
+/* ========================================================================== */
+/* === Include files ======================================================== */
+/* ========================================================================== */
+
+#include "colamd.h"
+#include <limits.h>
+
+#ifdef MATLAB_MEX_FILE
+#include "mex.h"
+#include "matrix.h"
+#else
+#include <stdio.h>
+#include <assert.h>
+#endif /* MATLAB_MEX_FILE */
+
+/* ========================================================================== */
+/* === Definitions ========================================================== */
+/* ========================================================================== */
+
+/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */
+#define PUBLIC
+#define PRIVATE static
+
+#define MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+
+#define ONES_COMPLEMENT(r) (-(r)-1)
+
+/* -------------------------------------------------------------------------- */
+/* Change for version 2.1:  define TRUE and FALSE only if not yet defined */  
+/* -------------------------------------------------------------------------- */
+
+#ifndef TRUE
+#define TRUE (1)
+#endif
+
+#ifndef FALSE
+#define FALSE (0)
+#endif
+
+/* -------------------------------------------------------------------------- */
+
+#define EMPTY	(-1)
+
+/* Row and column status */
+#define ALIVE	(0)
+#define DEAD	(-1)
+
+/* Column status */
+#define DEAD_PRINCIPAL		(-1)
+#define DEAD_NON_PRINCIPAL	(-2)
+
+/* Macros for row and column status update and checking. */
+#define ROW_IS_DEAD(r)			ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
+#define ROW_IS_MARKED_DEAD(row_mark)	(row_mark < ALIVE)
+#define ROW_IS_ALIVE(r)			(Row [r].shared2.mark >= ALIVE)
+#define COL_IS_DEAD(c)			(Col [c].start < ALIVE)
+#define COL_IS_ALIVE(c)			(Col [c].start >= ALIVE)
+#define COL_IS_DEAD_PRINCIPAL(c)	(Col [c].start == DEAD_PRINCIPAL)
+#define KILL_ROW(r)			{ Row [r].shared2.mark = DEAD ; }
+#define KILL_PRINCIPAL_COL(c)		{ Col [c].start = DEAD_PRINCIPAL ; }
+#define KILL_NON_PRINCIPAL_COL(c)	{ Col [c].start = DEAD_NON_PRINCIPAL ; }
+
+/* ========================================================================== */
+/* === Colamd reporting mechanism =========================================== */
+/* ========================================================================== */
+
+#ifdef MATLAB_MEX_FILE
+
+/* use mexPrintf in a MATLAB mexFunction, for debugging and statistics output */
+#define PRINTF mexPrintf
+
+/* In MATLAB, matrices are 1-based to the user, but 0-based internally */
+#define INDEX(i) ((i)+1)
+
+#else
+
+/* Use printf in standard C environment, for debugging and statistics output. */
+/* Output is generated only if debugging is enabled at compile time, or if */
+/* the caller explicitly calls colamd_report or symamd_report. */
+#define PRINTF printf
+
+/* In C, matrices are 0-based and indices are reported as such in *_report */
+#define INDEX(i) (i)
+
+#endif /* MATLAB_MEX_FILE */
+
+/* ========================================================================== */
+/* === Prototypes of PRIVATE routines ======================================= */
+/* ========================================================================== */
+
+PRIVATE int init_rows_cols
+(
+    int n_row,
+    int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    int A [],
+    int p [],
+    int stats [COLAMD_STATS]
+) ;
+
+PRIVATE void init_scoring
+(
+    int n_row,
+    int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    int A [],
+    int head [],
+    double knobs [COLAMD_KNOBS],
+    int *p_n_row2,
+    int *p_n_col2,
+    int *p_max_deg
+) ;
+
+PRIVATE int find_ordering
+(
+    int n_row,
+    int n_col,
+    int Alen,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    int A [],
+    int head [],
+    int n_col2,
+    int max_deg,
+    int pfree
+) ;
+
+PRIVATE void order_children
+(
+    int n_col,
+    Colamd_Col Col [],
+    int p []
+) ;
+
+PRIVATE void detect_super_cols
+(
+
+#ifndef NDEBUG
+    int n_col,
+    Colamd_Row Row [],
+#endif /* NDEBUG */
+
+    Colamd_Col Col [],
+    int A [],
+    int head [],
+    int row_start,
+    int row_length
+) ;
+
+PRIVATE int garbage_collection
+(
+    int n_row,
+    int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    int A [],
+    int *pfree
+) ;
+
+PRIVATE int clear_mark
+(
+    int n_row,
+    Colamd_Row Row []
+) ;
+
+PRIVATE void print_report
+(
+    char *method,
+    int stats [COLAMD_STATS]
+) ;
+
+/* ========================================================================== */
+/* === Debugging prototypes and definitions ================================= */
+/* ========================================================================== */
+
+#ifndef NDEBUG
+
+/* colamd_debug is the *ONLY* global variable, and is only */
+/* present when debugging */
+
+PRIVATE int colamd_debug ;	/* debug print level */
+
+#define DEBUG0(params) { (void) PRINTF params ; }
+#define DEBUG1(params) { if (colamd_debug >= 1) (void) PRINTF params ; }
+#define DEBUG2(params) { if (colamd_debug >= 2) (void) PRINTF params ; }
+#define DEBUG3(params) { if (colamd_debug >= 3) (void) PRINTF params ; }
+#define DEBUG4(params) { if (colamd_debug >= 4) (void) PRINTF params ; }
+
+#ifdef MATLAB_MEX_FILE
+#define ASSERT(expression) (mxAssert ((expression), ""))
+#else
+#define ASSERT(expression) (assert (expression))
+#endif /* MATLAB_MEX_FILE */
+
+PRIVATE void colamd_get_debug	/* gets the debug print level from getenv */
+(
+    char *method
+) ;
+
+PRIVATE void debug_deg_lists
+(
+    int n_row,
+    int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    int head [],
+    int min_score,
+    int should,
+    int max_deg
+) ;
+
+PRIVATE void debug_mark
+(
+    int n_row,
+    Colamd_Row Row [],
+    int tag_mark,
+    int max_mark
+) ;
+
+PRIVATE void debug_matrix
+(
+    int n_row,
+    int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    int A []
+) ;
+
+PRIVATE void debug_structures
+(
+    int n_row,
+    int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    int A [],
+    int n_col2
+) ;
+
+#else /* NDEBUG */
+
+/* === No debugging ========================================================= */
+
+#define DEBUG0(params) ;
+#define DEBUG1(params) ;
+#define DEBUG2(params) ;
+#define DEBUG3(params) ;
+#define DEBUG4(params) ;
+
+#define ASSERT(expression) ((void) 0)
+
+#endif /* NDEBUG */
+
+/* ========================================================================== */
+
+
+
+/* ========================================================================== */
+/* === USER-CALLABLE ROUTINES: ============================================== */
+/* ========================================================================== */
+
+
+/* ========================================================================== */
+/* === colamd_recommended =================================================== */
+/* ========================================================================== */
+
+/*
+    The colamd_recommended routine returns the suggested size for Alen.  This
+    value has been determined to provide good balance between the number of
+    garbage collections and the memory requirements for colamd.  If any
+    argument is negative, a -1 is returned as an error condition.  This
+    function is also available as a macro defined in colamd.h, so that you
+    can use it for a statically-allocated array size.
+*/
+
+PUBLIC int colamd_recommended	/* returns recommended value of Alen. */
+(
+    /* === Parameters ======================================================= */
+
+    int nnz,			/* number of nonzeros in A */
+    int n_row,			/* number of rows in A */
+    int n_col			/* number of columns in A */
+)
+{
+    return (COLAMD_RECOMMENDED (nnz, n_row, n_col)) ; 
+}
+
+
+/* ========================================================================== */
+/* === colamd_set_defaults ================================================== */
+/* ========================================================================== */
+
+/*
+    The colamd_set_defaults routine sets the default values of the user-
+    controllable parameters for colamd:
+
+	knobs [0]	rows with knobs[0]*n_col entries or more are removed
+			prior to ordering in colamd.  Rows and columns with
+			knobs[0]*n_col entries or more are removed prior to
+			ordering in symamd and placed last in the output
+			ordering.
+
+	knobs [1]	columns with knobs[1]*n_row entries or more are removed
+			prior to ordering in colamd, and placed last in the
+			column permutation.  Symamd ignores this knob.
+
+	knobs [2..19]	unused, but future versions might use this
+*/
+
+PUBLIC void colamd_set_defaults
+(
+    /* === Parameters ======================================================= */
+
+    double knobs [COLAMD_KNOBS]		/* knob array */
+)
+{
+    /* === Local variables ================================================== */
+
+    int i ;
+
+    if (!knobs)
+    {
+	return ;			/* no knobs to initialize */
+    }
+    for (i = 0 ; i < COLAMD_KNOBS ; i++)
+    {
+	knobs [i] = 0 ;
+    }
+    knobs [COLAMD_DENSE_ROW] = 0.5 ;	/* ignore rows over 50% dense */
+    knobs [COLAMD_DENSE_COL] = 0.5 ;	/* ignore columns over 50% dense */
+}
+
+
+/* ========================================================================== */
+/* === symamd =============================================================== */
+/* ========================================================================== */
+
+PUBLIC int symamd			/* return TRUE if OK, FALSE otherwise */
+(
+    /* === Parameters ======================================================= */
+
+    int n,				/* number of rows and columns of A */
+    int A [],				/* row indices of A */
+    int p [],				/* column pointers of A */
+    int perm [],			/* output permutation, size n+1 */
+    double knobs [COLAMD_KNOBS],	/* parameters (uses defaults if NULL) */
+    int stats [COLAMD_STATS],		/* output statistics and error codes */
+    void * (*allocate) (size_t, size_t),
+    					/* pointer to calloc (ANSI C) or */
+					/* mxCalloc (for MATLAB mexFunction) */
+    void (*release) (void *)
+    					/* pointer to free (ANSI C) or */
+    					/* mxFree (for MATLAB mexFunction) */
+)
+{
+    /* === Local variables ================================================== */
+
+    int *count ;		/* length of each column of M, and col pointer*/
+    int *mark ;			/* mark array for finding duplicate entries */
+    int *M ;			/* row indices of matrix M */
+    int Mlen ;			/* length of M */
+    int n_row ;			/* number of rows in M */
+    int nnz ;			/* number of entries in A */
+    int i ;			/* row index of A */
+    int j ;			/* column index of A */
+    int k ;			/* row index of M */ 
+    int mnz ;			/* number of nonzeros in M */
+    int pp ;			/* index into a column of A */
+    int last_row ;		/* last row seen in the current column */
+    int length ;		/* number of nonzeros in a column */
+
+    double cknobs [COLAMD_KNOBS] ;		/* knobs for colamd */
+    double default_knobs [COLAMD_KNOBS] ;	/* default knobs for colamd */
+    int cstats [COLAMD_STATS] ;			/* colamd stats */
+
+#ifndef NDEBUG
+    colamd_get_debug ("symamd") ;
+#endif /* NDEBUG */
+
+    /* === Check the input arguments ======================================== */
+
+    if (!stats)
+    {
+	DEBUG0 (("symamd: stats not present\n")) ;
+	return (FALSE) ;
+    }
+    for (i = 0 ; i < COLAMD_STATS ; i++)
+    {
+	stats [i] = 0 ;
+    }
+    stats [COLAMD_STATUS] = COLAMD_OK ;
+    stats [COLAMD_INFO1] = -1 ;
+    stats [COLAMD_INFO2] = -1 ;
+
+    if (!A)
+    {
+    	stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
+	DEBUG0 (("symamd: A not present\n")) ;
+	return (FALSE) ;
+    }
+
+    if (!p)		/* p is not present */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
+	DEBUG0 (("symamd: p not present\n")) ;
+    	return (FALSE) ;
+    }
+
+    if (n < 0)		/* n must be >= 0 */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
+	stats [COLAMD_INFO1] = n ;
+	DEBUG0 (("symamd: n negative %d\n", n)) ;
+    	return (FALSE) ;
+    }
+
+    nnz = p [n] ;
+    if (nnz < 0)	/* nnz must be >= 0 */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
+	stats [COLAMD_INFO1] = nnz ;
+	DEBUG0 (("symamd: number of entries negative %d\n", nnz)) ;
+	return (FALSE) ;
+    }
+
+    if (p [0] != 0)
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ;
+	stats [COLAMD_INFO1] = p [0] ;
+	DEBUG0 (("symamd: p[0] not zero %d\n", p [0])) ;
+	return (FALSE) ;
+    }
+
+    /* === If no knobs, set default knobs =================================== */
+
+    if (!knobs)
+    {
+	colamd_set_defaults (default_knobs) ;
+	knobs = default_knobs ;
+    }
+
+    /* === Allocate count and mark ========================================== */
+
+    count = (int *) ((*allocate) (n+1, sizeof (int))) ;
+    if (!count)
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ;
+	DEBUG0 (("symamd: allocate count (size %d) failed\n", n+1)) ;
+	return (FALSE) ;
+    }
+
+    mark = (int *) ((*allocate) (n+1, sizeof (int))) ;
+    if (!mark)
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ;
+	(*release) ((void *) count) ;
+	DEBUG0 (("symamd: allocate mark (size %d) failed\n", n+1)) ;
+	return (FALSE) ;
+    }
+
+    /* === Compute column counts of M, check if A is valid ================== */
+
+    stats [COLAMD_INFO3] = 0 ;  /* number of duplicate or unsorted row indices*/
+
+    for (i = 0 ; i < n ; i++)
+    {
+    	mark [i] = -1 ;
+    }
+
+    for (j = 0 ; j < n ; j++)
+    {
+	last_row = -1 ;
+
+	length = p [j+1] - p [j] ;
+	if (length < 0)
+	{
+	    /* column pointers must be non-decreasing */
+	    stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
+	    stats [COLAMD_INFO1] = j ;
+	    stats [COLAMD_INFO2] = length ;
+	    (*release) ((void *) count) ;
+	    (*release) ((void *) mark) ;
+	    DEBUG0 (("symamd: col %d negative length %d\n", j, length)) ;
+	    return (FALSE) ;
+	}
+
+	for (pp = p [j] ; pp < p [j+1] ; pp++)
+	{
+	    i = A [pp] ;
+	    if (i < 0 || i >= n)
+	    {
+		/* row index i, in column j, is out of bounds */
+		stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
+		stats [COLAMD_INFO1] = j ;
+		stats [COLAMD_INFO2] = i ;
+		stats [COLAMD_INFO3] = n ;
+		(*release) ((void *) count) ;
+		(*release) ((void *) mark) ;
+		DEBUG0 (("symamd: row %d col %d out of bounds\n", i, j)) ;
+		return (FALSE) ;
+	    }
+
+	    if (i <= last_row || mark [i] == j)
+	    {
+		/* row index is unsorted or repeated (or both), thus col */
+		/* is jumbled.  This is a notice, not an error condition. */
+		stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
+		stats [COLAMD_INFO1] = j ;
+		stats [COLAMD_INFO2] = i ;
+		(stats [COLAMD_INFO3]) ++ ;
+		DEBUG1 (("symamd: row %d col %d unsorted/duplicate\n", i, j)) ;
+	    }
+
+	    if (i > j && mark [i] != j)
+	    {
+		/* row k of M will contain column indices i and j */
+		count [i]++ ;
+		count [j]++ ;
+	    }
+
+	    /* mark the row as having been seen in this column */
+	    mark [i] = j ;
+
+	    last_row = i ;
+	}
+    }
+
+    if (stats [COLAMD_STATUS] == COLAMD_OK)
+    {
+	/* if there are no duplicate entries, then mark is no longer needed */
+	(*release) ((void *) mark) ;
+    }
+
+    /* === Compute column pointers of M ===================================== */
+
+    /* use output permutation, perm, for column pointers of M */
+    perm [0] = 0 ;
+    for (j = 1 ; j <= n ; j++)
+    {
+	perm [j] = perm [j-1] + count [j-1] ;
+    }
+    for (j = 0 ; j < n ; j++)
+    {
+	count [j] = perm [j] ;
+    }
+
+    /* === Construct M ====================================================== */
+
+    mnz = perm [n] ;
+    n_row = mnz / 2 ;
+    Mlen = colamd_recommended (mnz, n_row, n) ;
+    M = (int *) ((*allocate) (Mlen, sizeof (int))) ;
+    DEBUG0 (("symamd: M is %d-by-%d with %d entries, Mlen = %d\n",
+    	n_row, n, mnz, Mlen)) ;
+
+    if (!M)
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ;
+	(*release) ((void *) count) ;
+	(*release) ((void *) mark) ;
+	DEBUG0 (("symamd: allocate M (size %d) failed\n", Mlen)) ;
+	return (FALSE) ;
+    }
+
+    k = 0 ;
+
+    if (stats [COLAMD_STATUS] == COLAMD_OK)
+    {
+	/* Matrix is OK */
+	for (j = 0 ; j < n ; j++)
+	{
+	    ASSERT (p [j+1] - p [j] >= 0) ;
+	    for (pp = p [j] ; pp < p [j+1] ; pp++)
+	    {
+		i = A [pp] ;
+		ASSERT (i >= 0 && i < n) ;
+		if (i > j)
+		{
+		    /* row k of M contains column indices i and j */
+		    M [count [i]++] = k ;
+		    M [count [j]++] = k ;
+		    k++ ;
+		}
+	    }
+	}
+    }
+    else
+    {
+	/* Matrix is jumbled.  Do not add duplicates to M.  Unsorted cols OK. */
+	DEBUG0 (("symamd: Duplicates in A.\n")) ;
+	for (i = 0 ; i < n ; i++)
+	{
+	    mark [i] = -1 ;
+	}
+	for (j = 0 ; j < n ; j++)
+	{
+	    ASSERT (p [j+1] - p [j] >= 0) ;
+	    for (pp = p [j] ; pp < p [j+1] ; pp++)
+	    {
+		i = A [pp] ;
+		ASSERT (i >= 0 && i < n) ;
+		if (i > j && mark [i] != j)
+		{
+		    /* row k of M contains column indices i and j */
+		    M [count [i]++] = k ;
+		    M [count [j]++] = k ;
+		    k++ ;
+		    mark [i] = j ;
+		}
+	    }
+	}
+	(*release) ((void *) mark) ;
+    }
+
+    /* count and mark no longer needed */
+    (*release) ((void *) count) ;
+    ASSERT (k == n_row) ;
+
+    /* === Adjust the knobs for M =========================================== */
+
+    for (i = 0 ; i < COLAMD_KNOBS ; i++)
+    {
+	cknobs [i] = knobs [i] ;
+    }
+
+    /* there are no dense rows in M */
+    cknobs [COLAMD_DENSE_ROW] = 1.0 ;
+
+    if (n_row != 0 && n < n_row)
+    {
+	/* On input, the knob is a fraction of 1..n, the number of rows of A. */
+	/* Convert it to a fraction of 1..n_row, of the number of rows of M. */
+    	cknobs [COLAMD_DENSE_COL] = (knobs [COLAMD_DENSE_ROW] * n) / n_row ;
+    }
+    else
+    {
+	/* no dense columns in M */
+    	cknobs [COLAMD_DENSE_COL] = 1.0 ;
+    }
+
+    DEBUG0 (("symamd: dense col knob for M: %g\n", cknobs [COLAMD_DENSE_COL])) ;
+
+    /* === Order the columns of M =========================================== */
+
+    if (!colamd (n_row, n, Mlen, M, perm, cknobs, cstats))
+    {
+	/* This "cannot" happen, unless there is a bug in the code. */
+	stats [COLAMD_STATUS] = COLAMD_ERROR_internal_error ;
+	(*release) ((void *) M) ;
+	DEBUG0 (("symamd: internal error!\n")) ;
+	return (FALSE) ;
+    }
+
+    /* Note that the output permutation is now in perm */
+
+    /* === get the statistics for symamd from colamd ======================== */
+
+    /* note that a dense column in colamd means a dense row and col in symamd */
+    stats [COLAMD_DENSE_ROW]    = cstats [COLAMD_DENSE_COL] ;
+    stats [COLAMD_DENSE_COL]    = cstats [COLAMD_DENSE_COL] ;
+    stats [COLAMD_DEFRAG_COUNT] = cstats [COLAMD_DEFRAG_COUNT] ;
+
+    /* === Free M =========================================================== */
+
+    (*release) ((void *) M) ;
+    DEBUG0 (("symamd: done.\n")) ;
+    return (TRUE) ;
+
+}
+
+/* ========================================================================== */
+/* === colamd =============================================================== */
+/* ========================================================================== */
+
+/*
+    The colamd routine computes a column ordering Q of a sparse matrix
+    A such that the LU factorization P(AQ) = LU remains sparse, where P is
+    selected via partial pivoting.   The routine can also be viewed as
+    providing a permutation Q such that the Cholesky factorization
+    (AQ)'(AQ) = LL' remains sparse.
+*/
+
+PUBLIC int colamd		/* returns TRUE if successful, FALSE otherwise*/
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows in A */
+    int n_col,			/* number of columns in A */
+    int Alen,			/* length of A */
+    int A [],			/* row indices of A */
+    int p [],			/* pointers to columns in A */
+    double knobs [COLAMD_KNOBS],/* parameters (uses defaults if NULL) */
+    int stats [COLAMD_STATS]	/* output statistics and error codes */
+)
+{
+    /* === Local variables ================================================== */
+
+    int i ;			/* loop index */
+    int nnz ;			/* nonzeros in A */
+    int Row_size ;		/* size of Row [], in integers */
+    int Col_size ;		/* size of Col [], in integers */
+    int need ;			/* minimum required length of A */
+    Colamd_Row *Row ;		/* pointer into A of Row [0..n_row] array */
+    Colamd_Col *Col ;		/* pointer into A of Col [0..n_col] array */
+    int n_col2 ;		/* number of non-dense, non-empty columns */
+    int n_row2 ;		/* number of non-dense, non-empty rows */
+    int ngarbage ;		/* number of garbage collections performed */
+    int max_deg ;		/* maximum row degree */
+    double default_knobs [COLAMD_KNOBS] ;	/* default knobs array */
+
+#ifndef NDEBUG
+    colamd_get_debug ("colamd") ;
+#endif /* NDEBUG */
+
+    /* === Check the input arguments ======================================== */
+
+    if (!stats)
+    {
+	DEBUG0 (("colamd: stats not present\n")) ;
+	return (FALSE) ;
+    }
+    for (i = 0 ; i < COLAMD_STATS ; i++)
+    {
+	stats [i] = 0 ;
+    }
+    stats [COLAMD_STATUS] = COLAMD_OK ;
+    stats [COLAMD_INFO1] = -1 ;
+    stats [COLAMD_INFO2] = -1 ;
+
+    if (!A)		/* A is not present */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
+	DEBUG0 (("colamd: A not present\n")) ;
+	return (FALSE) ;
+    }
+
+    if (!p)		/* p is not present */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
+	DEBUG0 (("colamd: p not present\n")) ;
+    	return (FALSE) ;
+    }
+
+    if (n_row < 0)	/* n_row must be >= 0 */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ;
+	stats [COLAMD_INFO1] = n_row ;
+	DEBUG0 (("colamd: nrow negative %d\n", n_row)) ;
+    	return (FALSE) ;
+    }
+
+    if (n_col < 0)	/* n_col must be >= 0 */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
+	stats [COLAMD_INFO1] = n_col ;
+	DEBUG0 (("colamd: ncol negative %d\n", n_col)) ;
+    	return (FALSE) ;
+    }
+
+    nnz = p [n_col] ;
+    if (nnz < 0)	/* nnz must be >= 0 */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
+	stats [COLAMD_INFO1] = nnz ;
+	DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ;
+	return (FALSE) ;
+    }
+
+    if (p [0] != 0)
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero	;
+	stats [COLAMD_INFO1] = p [0] ;
+	DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ;
+	return (FALSE) ;
+    }
+
+    /* === If no knobs, set default knobs =================================== */
+
+    if (!knobs)
+    {
+	colamd_set_defaults (default_knobs) ;
+	knobs = default_knobs ;
+    }
+
+    /* === Allocate the Row and Col arrays from array A ===================== */
+
+    Col_size = COLAMD_C (n_col) ;
+    Row_size = COLAMD_R (n_row) ;
+    need = 2*nnz + n_col + Col_size + Row_size ;
+
+    if (need > Alen)
+    {
+	/* not enough space in array A to perform the ordering */
+	stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ;
+	stats [COLAMD_INFO1] = need ;
+	stats [COLAMD_INFO2] = Alen ;
+	DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen));
+	return (FALSE) ;
+    }
+
+    Alen -= Col_size + Row_size ;
+    Col = (Colamd_Col *) &A [Alen] ;
+    Row = (Colamd_Row *) &A [Alen + Col_size] ;
+
+    /* === Construct the row and column data structures ===================== */
+
+    if (!init_rows_cols (n_row, n_col, Row, Col, A, p, stats))
+    {
+	/* input matrix is invalid */
+	DEBUG0 (("colamd: Matrix invalid\n")) ;
+	return (FALSE) ;
+    }
+
+    /* === Initialize scores, kill dense rows/columns ======================= */
+
+    init_scoring (n_row, n_col, Row, Col, A, p, knobs,
+	&n_row2, &n_col2, &max_deg) ;
+
+    /* === Order the supercolumns =========================================== */
+
+    ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p,
+	n_col2, max_deg, 2*nnz) ;
+
+    /* === Order the non-principal columns ================================== */
+
+    order_children (n_col, Col, p) ;
+
+    /* === Return statistics in stats ======================================= */
+
+    stats [COLAMD_DENSE_ROW] = n_row - n_row2 ;
+    stats [COLAMD_DENSE_COL] = n_col - n_col2 ;
+    stats [COLAMD_DEFRAG_COUNT] = ngarbage ;
+    DEBUG0 (("colamd: done.\n")) ; 
+    return (TRUE) ;
+}
+
+
+/* ========================================================================== */
+/* === colamd_report ======================================================== */
+/* ========================================================================== */
+
+PUBLIC void colamd_report
+(
+    int stats [COLAMD_STATS]
+)
+{
+    print_report ("colamd", stats) ;
+}
+
+
+/* ========================================================================== */
+/* === symamd_report ======================================================== */
+/* ========================================================================== */
+
+PUBLIC void symamd_report
+(
+    int stats [COLAMD_STATS]
+)
+{
+    print_report ("symamd", stats) ;
+}
+
+
+
+/* ========================================================================== */
+/* === NON-USER-CALLABLE ROUTINES: ========================================== */
+/* ========================================================================== */
+
+/* There are no user-callable routines beyond this point in the file */
+
+
+/* ========================================================================== */
+/* === init_rows_cols ======================================================= */
+/* ========================================================================== */
+
+/*
+    Takes the column form of the matrix in A and creates the row form of the
+    matrix.  Also, row and column attributes are stored in the Col and Row
+    structs.  If the columns are un-sorted or contain duplicate row indices,
+    this routine will also sort and remove duplicate row indices from the
+    column form of the matrix.  Returns FALSE if the matrix is invalid,
+    TRUE otherwise.  Not user-callable.
+*/
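+
+/*
+    For example, a 3-by-3 matrix with nonzeros in entries (0,0), (2,0),
+    (1,1), (0,2), and (2,2) arrives here in column form as
+
+	p = { 0, 2, 3, 5 } and A = { 0, 2,  1,  0, 2 } ,
+
+    so that column col occupies A [p [col] ... p [col+1]-1] and, below,
+    Col [col].start = p [col] and Col [col].length = p [col+1] - p [col].
+*/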
+
+PRIVATE int init_rows_cols	/* returns TRUE if OK, or FALSE otherwise */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows of A */
+    int n_col,			/* number of columns of A */
+    Colamd_Row Row [],		/* of size n_row+1 */
+    Colamd_Col Col [],		/* of size n_col+1 */
+    int A [],			/* row indices of A, of size Alen */
+    int p [],			/* pointers to columns in A, of size n_col+1 */
+    int stats [COLAMD_STATS]	/* colamd statistics */ 
+)
+{
+    /* === Local variables ================================================== */
+
+    int col ;			/* a column index */
+    int row ;			/* a row index */
+    int *cp ;			/* a column pointer */
+    int *cp_end ;		/* a pointer to the end of a column */
+    int *rp ;			/* a row pointer */
+    int *rp_end ;		/* a pointer to the end of a row */
+    int last_row ;		/* previous row */
+
+    /* === Initialize columns, and check column pointers ==================== */
+
+    for (col = 0 ; col < n_col ; col++)
+    {
+	Col [col].start = p [col] ;
+	Col [col].length = p [col+1] - p [col] ;
+
+	if (Col [col].length < 0)
+	{
+	    /* column pointers must be non-decreasing */
+	    stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
+	    stats [COLAMD_INFO1] = col ;
+	    stats [COLAMD_INFO2] = Col [col].length ;
+	    DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ;
+	    return (FALSE) ;
+	}
+
+	Col [col].shared1.thickness = 1 ;
+	Col [col].shared2.score = 0 ;
+	Col [col].shared3.prev = EMPTY ;
+	Col [col].shared4.degree_next = EMPTY ;
+    }
+
+    /* p [0..n_col] no longer needed, used as "head" in subsequent routines */
+
+    /* === Scan columns, compute row degrees, and check row indices ========= */
+
+    stats [COLAMD_INFO3] = 0 ;	/* number of duplicate or unsorted row indices*/
+
+    for (row = 0 ; row < n_row ; row++)
+    {
+	Row [row].length = 0 ;
+	Row [row].shared2.mark = -1 ;
+    }
+
+    for (col = 0 ; col < n_col ; col++)
+    {
+	last_row = -1 ;
+
+	cp = &A [p [col]] ;
+	cp_end = &A [p [col+1]] ;
+
+	while (cp < cp_end)
+	{
+	    row = *cp++ ;
+
+	    /* make sure row indices within range */
+	    if (row < 0 || row >= n_row)
+	    {
+		stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
+		stats [COLAMD_INFO1] = col ;
+		stats [COLAMD_INFO2] = row ;
+		stats [COLAMD_INFO3] = n_row ;
+		DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ;
+		return (FALSE) ;
+	    }
+
+	    if (row <= last_row || Row [row].shared2.mark == col)
+	    {
+	    /* row indices are unsorted or repeated (or both), thus col */
+		/* is jumbled.  This is a notice, not an error condition. */
+		stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
+		stats [COLAMD_INFO1] = col ;
+		stats [COLAMD_INFO2] = row ;
+		(stats [COLAMD_INFO3]) ++ ;
+		DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col));
+	    }
+
+	    if (Row [row].shared2.mark != col)
+	    {
+		Row [row].length++ ;
+	    }
+	    else
+	    {
+		/* this is a repeated entry in the column, */
+		/* it will be removed */
+		Col [col].length-- ;
+	    }
+
+	    /* mark the row as having been seen in this column */
+	    Row [row].shared2.mark = col ;
+
+	    last_row = row ;
+	}
+    }
+
+    /* === Compute row pointers ============================================= */
+
+    /* row form of the matrix starts directly after the column */
+    /* form of matrix in A */
+    Row [0].start = p [n_col] ;
+    Row [0].shared1.p = Row [0].start ;
+    Row [0].shared2.mark = -1 ;
+    for (row = 1 ; row < n_row ; row++)
+    {
+	Row [row].start = Row [row-1].start + Row [row-1].length ;
+	Row [row].shared1.p = Row [row].start ;
+	Row [row].shared2.mark = -1 ;
+    }
+
+    /* === Create row form ================================================== */
+
+    if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
+    {
+	/* if cols jumbled, watch for repeated row indices */
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    cp = &A [p [col]] ;
+	    cp_end = &A [p [col+1]] ;
+	    while (cp < cp_end)
+	    {
+		row = *cp++ ;
+		if (Row [row].shared2.mark != col)
+		{
+		    A [(Row [row].shared1.p)++] = col ;
+		    Row [row].shared2.mark = col ;
+		}
+	    }
+	}
+    }
+    else
+    {
+	/* if cols not jumbled, we don't need the mark (this is faster) */
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    cp = &A [p [col]] ;
+	    cp_end = &A [p [col+1]] ;
+	    while (cp < cp_end)
+	    {
+		A [(Row [*cp++].shared1.p)++] = col ;
+	    }
+	}
+    }
+
+    /* === Clear the row marks and set row degrees ========================== */
+
+    for (row = 0 ; row < n_row ; row++)
+    {
+	Row [row].shared2.mark = 0 ;
+	Row [row].shared1.degree = Row [row].length ;
+    }
+
+    /* === See if we need to re-create columns ============================== */
+
+    if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
+    {
+    	DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ;
+
+#ifndef NDEBUG
+	/* make sure column lengths are correct */
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    p [col] = Col [col].length ;
+	}
+	for (row = 0 ; row < n_row ; row++)
+	{
+	    rp = &A [Row [row].start] ;
+	    rp_end = rp + Row [row].length ;
+	    while (rp < rp_end)
+	    {
+		p [*rp++]-- ;
+	    }
+	}
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    ASSERT (p [col] == 0) ;
+	}
+	/* now p is all zero (different than when debugging is turned off) */
+#endif /* NDEBUG */
+
+	/* === Compute col pointers ========================================= */
+
+	/* col form of the matrix starts at A [0]. */
+	/* Note, we may have a gap between the col form and the row */
+	/* form if there were duplicate entries, if so, it will be */
+	/* removed upon the first garbage collection */
+	Col [0].start = 0 ;
+	p [0] = Col [0].start ;
+	for (col = 1 ; col < n_col ; col++)
+	{
+	    /* note that the lengths here are for pruned columns, i.e. */
+	    /* no duplicate row indices will exist for these columns */
+	    Col [col].start = Col [col-1].start + Col [col-1].length ;
+	    p [col] = Col [col].start ;
+	}
+
+	/* === Re-create col form =========================================== */
+
+	for (row = 0 ; row < n_row ; row++)
+	{
+	    rp = &A [Row [row].start] ;
+	    rp_end = rp + Row [row].length ;
+	    while (rp < rp_end)
+	    {
+		A [(p [*rp++])++] = row ;
+	    }
+	}
+    }
+
+    /* === Done.  Matrix is not (or no longer) jumbled ====================== */
+
+    return (TRUE) ;
+}
+
+
+/* ========================================================================== */
+/* === init_scoring ========================================================= */
+/* ========================================================================== */
+
+/*
+    Kills dense or empty columns and rows, calculates an initial score for
+    each column, and places all columns in the degree lists.  Not user-callable.
+*/
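+
+/*
+    The knobs are interpreted as fractions: a row with more than
+    knobs [COLAMD_DENSE_ROW] * n_col entries, or a column with more than
+    knobs [COLAMD_DENSE_COL] * n_row entries, is treated as "dense" and
+    ordered last.  For example, a knob value of 0.5 would treat any row or
+    column that is more than half full as dense.
+*/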
+
+PRIVATE void init_scoring
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows of A */
+    int n_col,			/* number of columns of A */
+    Colamd_Row Row [],		/* of size n_row+1 */
+    Colamd_Col Col [],		/* of size n_col+1 */
+    int A [],			/* column form and row form of A */
+    int head [],		/* of size n_col+1 */
+    double knobs [COLAMD_KNOBS],/* parameters */
+    int *p_n_row2,		/* number of non-dense, non-empty rows */
+    int *p_n_col2,		/* number of non-dense, non-empty columns */
+    int *p_max_deg		/* maximum row degree */
+)
+{
+    /* === Local variables ================================================== */
+
+    int c ;			/* a column index */
+    int r, row ;		/* a row index */
+    int *cp ;			/* a column pointer */
+    int deg ;			/* degree of a row or column */
+    int *cp_end ;		/* a pointer to the end of a column */
+    int *new_cp ;		/* new column pointer */
+    int col_length ;		/* length of pruned column */
+    int score ;			/* current column score */
+    int n_col2 ;		/* number of non-dense, non-empty columns */
+    int n_row2 ;		/* number of non-dense, non-empty rows */
+    int dense_row_count ;	/* remove rows with more entries than this */
+    int dense_col_count ;	/* remove cols with more entries than this */
+    int min_score ;		/* smallest column score */
+    int max_deg ;		/* maximum row degree */
+    int next_col ;		/* Used to add to degree list.*/
+
+#ifndef NDEBUG
+    int debug_count ;		/* debug only. */
+#endif /* NDEBUG */
+
+    /* === Extract knobs ==================================================== */
+
+    dense_row_count = MAX (0, MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
+    dense_col_count = MAX (0, MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
+    DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ;
+    max_deg = 0 ;
+    n_col2 = n_col ;
+    n_row2 = n_row ;
+
+    /* === Kill empty columns =============================================== */
+
+    /* Put the empty columns at the end in their natural order, so that LU */
+    /* factorization can proceed as far as possible. */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	deg = Col [c].length ;
+	if (deg == 0)
+	{
+	    /* this is an empty column, kill and order it last */
+	    Col [c].shared2.order = --n_col2 ;
+	    KILL_PRINCIPAL_COL (c) ;
+	}
+    }
+    DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ;
+
+    /* === Kill dense columns =============================================== */
+
+    /* Put the dense columns at the end, in their natural order */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	/* skip any dead columns */
+	if (COL_IS_DEAD (c))
+	{
+	    continue ;
+	}
+	deg = Col [c].length ;
+	if (deg > dense_col_count)
+	{
+	    /* this is a dense column, kill and order it last */
+	    Col [c].shared2.order = --n_col2 ;
+	    /* decrement the row degrees */
+	    cp = &A [Col [c].start] ;
+	    cp_end = cp + Col [c].length ;
+	    while (cp < cp_end)
+	    {
+		Row [*cp++].shared1.degree-- ;
+	    }
+	    KILL_PRINCIPAL_COL (c) ;
+	}
+    }
+    DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ;
+
+    /* === Kill dense and empty rows ======================================== */
+
+    for (r = 0 ; r < n_row ; r++)
+    {
+	deg = Row [r].shared1.degree ;
+	ASSERT (deg >= 0 && deg <= n_col) ;
+	if (deg > dense_row_count || deg == 0)
+	{
+	    /* kill a dense or empty row */
+	    KILL_ROW (r) ;
+	    --n_row2 ;
+	}
+	else
+	{
+	    /* keep track of max degree of remaining rows */
+	    max_deg = MAX (max_deg, deg) ;
+	}
+    }
+    DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ;
+
+    /* === Compute initial column scores ==================================== */
+
+    /* At this point the row degrees are accurate.  They reflect the number */
+    /* of "live" (non-dense) columns in each row.  No empty rows exist. */
+    /* Some "live" columns may contain only dead rows, however.  These are */
+    /* pruned in the code below. */
+
+    /* now find the initial matlab score for each column */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	/* skip dead column */
+	if (COL_IS_DEAD (c))
+	{
+	    continue ;
+	}
+	score = 0 ;
+	cp = &A [Col [c].start] ;
+	new_cp = cp ;
+	cp_end = cp + Col [c].length ;
+	while (cp < cp_end)
+	{
+	    /* get a row */
+	    row = *cp++ ;
+	    /* skip if dead */
+	    if (ROW_IS_DEAD (row))
+	    {
+		continue ;
+	    }
+	    /* compact the column */
+	    *new_cp++ = row ;
+	    /* add row's external degree */
+	    score += Row [row].shared1.degree - 1 ;
+	    /* guard against integer overflow */
+	    score = MIN (score, n_col) ;
+	}
+	/* determine pruned column length */
+	col_length = (int) (new_cp - &A [Col [c].start]) ;
+	if (col_length == 0)
+	{
+	    /* a newly-made null column (all rows in this col are "dense" */
+	    /* and have already been killed) */
+	    DEBUG2 (("Newly null killed: %d\n", c)) ;
+	    Col [c].shared2.order = --n_col2 ;
+	    KILL_PRINCIPAL_COL (c) ;
+	}
+	else
+	{
+	    /* set column length and set score */
+	    ASSERT (score >= 0) ;
+	    ASSERT (score <= n_col) ;
+	    Col [c].length = col_length ;
+	    Col [c].shared2.score = score ;
+	}
+    }
+    DEBUG1 (("colamd: Dense, null, and newly-null columns killed: %d\n",
+    	n_col-n_col2)) ;
+
+    /* At this point, all empty rows and columns are dead.  All live columns */
+    /* are "clean" (containing no dead rows) and simplicial (no supercolumns */
+    /* yet).  Rows may contain dead columns, but all live rows contain at */
+    /* least one live column. */
+
+#ifndef NDEBUG
+    debug_structures (n_row, n_col, Row, Col, A, n_col2) ;
+#endif /* NDEBUG */
+
+    /* === Initialize degree lists ========================================== */
+
+#ifndef NDEBUG
+    debug_count = 0 ;
+#endif /* NDEBUG */
+
+    /* clear the hash buckets */
+    for (c = 0 ; c <= n_col ; c++)
+    {
+	head [c] = EMPTY ;
+    }
+    min_score = n_col ;
+    /* place in reverse order, so low column indices are at the front */
+    /* of the lists.  This is to encourage natural tie-breaking */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	/* only add principal columns to degree lists */
+	if (COL_IS_ALIVE (c))
+	{
+	    DEBUG4 (("place %d score %d minscore %d ncol %d\n",
+		c, Col [c].shared2.score, min_score, n_col)) ;
+
+	    /* === Add columns score to DList =============================== */
+
+	    score = Col [c].shared2.score ;
+
+	    ASSERT (min_score >= 0) ;
+	    ASSERT (min_score <= n_col) ;
+	    ASSERT (score >= 0) ;
+	    ASSERT (score <= n_col) ;
+	    ASSERT (head [score] >= EMPTY) ;
+
+	    /* now add this column to dList at proper score location */
+	    next_col = head [score] ;
+	    Col [c].shared3.prev = EMPTY ;
+	    Col [c].shared4.degree_next = next_col ;
+
+	    /* if there already was a column with the same score, set its */
+	    /* previous pointer to this new column */
+	    if (next_col != EMPTY)
+	    {
+		Col [next_col].shared3.prev = c ;
+	    }
+	    head [score] = c ;
+
+	    /* see if this score is less than current min */
+	    min_score = MIN (min_score, score) ;
+
+#ifndef NDEBUG
+	    debug_count++ ;
+#endif /* NDEBUG */
+
+	}
+    }
+
+#ifndef NDEBUG
+    DEBUG1 (("colamd: Live cols %d out of %d, non-princ: %d\n",
+	debug_count, n_col, n_col-debug_count)) ;
+    ASSERT (debug_count == n_col2) ;
+    debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ;
+#endif /* NDEBUG */
+
+    /* === Return number of remaining columns, and max row degree =========== */
+
+    *p_n_col2 = n_col2 ;
+    *p_n_row2 = n_row2 ;
+    *p_max_deg = max_deg ;
+}
+
+
+/* ========================================================================== */
+/* === find_ordering ======================================================== */
+/* ========================================================================== */
+
+/*
+    Order the principal columns of the supercolumn form of the matrix
+    (no supercolumns on input).  Uses an approximate column minimum degree
+    ordering method.  Not user-callable.
+*/
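+
+/*
+    Each pass of the main loop below follows the banners in the code: pick
+    the column with the smallest score from the degree lists, form the
+    pivot row as the union of the patterns of all rows in that column,
+    compute approximate set-difference scores for the columns appearing in
+    the pivot row, detect and absorb identical supercolumns, and place the
+    surviving columns back in the degree lists with their updated scores.
+*/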
+
+PRIVATE int find_ordering	/* return the number of garbage collections */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows of A */
+    int n_col,			/* number of columns of A */
+    int Alen,			/* size of A, 2*nnz + n_col or larger */
+    Colamd_Row Row [],		/* of size n_row+1 */
+    Colamd_Col Col [],		/* of size n_col+1 */
+    int A [],			/* column form and row form of A */
+    int head [],		/* of size n_col+1 */
+    int n_col2,			/* Remaining columns to order */
+    int max_deg,		/* Maximum row degree */
+    int pfree			/* index of first free slot (2*nnz on entry) */
+)
+{
+    /* === Local variables ================================================== */
+
+    int k ;			/* current pivot ordering step */
+    int pivot_col ;		/* current pivot column */
+    int *cp ;			/* a column pointer */
+    int *rp ;			/* a row pointer */
+    int pivot_row ;		/* current pivot row */
+    int *new_cp ;		/* modified column pointer */
+    int *new_rp ;		/* modified row pointer */
+    int pivot_row_start ;	/* pointer to start of pivot row */
+    int pivot_row_degree ;	/* number of columns in pivot row */
+    int pivot_row_length ;	/* number of supercolumns in pivot row */
+    int pivot_col_score ;	/* score of pivot column */
+    int needed_memory ;		/* free space needed for pivot row */
+    int *cp_end ;		/* pointer to the end of a column */
+    int *rp_end ;		/* pointer to the end of a row */
+    int row ;			/* a row index */
+    int col ;			/* a column index */
+    int max_score ;		/* maximum possible score */
+    int cur_score ;		/* score of current column */
+    unsigned int hash ;		/* hash value for supernode detection */
+    int head_column ;		/* head of hash bucket */
+    int first_col ;		/* first column in hash bucket */
+    int tag_mark ;		/* marker value for mark array */
+    int row_mark ;		/* Row [row].shared2.mark */
+    int set_difference ;	/* set difference size of row with pivot row */
+    int min_score ;		/* smallest column score */
+    int col_thickness ;		/* "thickness" (no. of columns in a supercol) */
+    int max_mark ;		/* maximum value of tag_mark */
+    int pivot_col_thickness ;	/* number of columns represented by pivot col */
+    int prev_col ;		/* Used by Dlist operations. */
+    int next_col ;		/* Used by Dlist operations. */
+    int ngarbage ;		/* number of garbage collections performed */
+
+#ifndef NDEBUG
+    int debug_d ;		/* debug loop counter */
+    int debug_step = 0 ;	/* debug loop counter */
+#endif /* NDEBUG */
+
+    /* === Initialization and clear mark ==================================== */
+
+    max_mark = INT_MAX - n_col ;	/* INT_MAX defined in <limits.h> */
+    tag_mark = clear_mark (n_row, Row) ;
+    min_score = 0 ;
+    ngarbage = 0 ;
+    DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ;
+
+    /* === Order the columns ================================================ */
+
+    for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */)
+    {
+
+#ifndef NDEBUG
+	if (debug_step % 100 == 0)
+	{
+	    DEBUG2 (("\n...       Step k: %d out of n_col2: %d\n", k, n_col2)) ;
+	}
+	else
+	{
+	    DEBUG3 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ;
+	}
+	debug_step++ ;
+	debug_deg_lists (n_row, n_col, Row, Col, head,
+		min_score, n_col2-k, max_deg) ;
+	debug_matrix (n_row, n_col, Row, Col, A) ;
+#endif /* NDEBUG */
+
+	/* === Select pivot column, and order it ============================ */
+
+	/* make sure degree list isn't empty */
+	ASSERT (min_score >= 0) ;
+	ASSERT (min_score <= n_col) ;
+	ASSERT (head [min_score] >= EMPTY) ;
+
+#ifndef NDEBUG
+	for (debug_d = 0 ; debug_d < min_score ; debug_d++)
+	{
+	    ASSERT (head [debug_d] == EMPTY) ;
+	}
+#endif /* NDEBUG */
+
+	/* get pivot column from head of minimum degree list */
+	while (head [min_score] == EMPTY && min_score < n_col)
+	{
+	    min_score++ ;
+	}
+	pivot_col = head [min_score] ;
+	ASSERT (pivot_col >= 0 && pivot_col <= n_col) ;
+	next_col = Col [pivot_col].shared4.degree_next ;
+	head [min_score] = next_col ;
+	if (next_col != EMPTY)
+	{
+	    Col [next_col].shared3.prev = EMPTY ;
+	}
+
+	ASSERT (COL_IS_ALIVE (pivot_col)) ;
+	DEBUG3 (("Pivot col: %d\n", pivot_col)) ;
+
+	/* remember score for defrag check */
+	pivot_col_score = Col [pivot_col].shared2.score ;
+
+	/* the pivot column is the kth column in the pivot order */
+	Col [pivot_col].shared2.order = k ;
+
+	/* increment order count by column thickness */
+	pivot_col_thickness = Col [pivot_col].shared1.thickness ;
+	k += pivot_col_thickness ;
+	ASSERT (pivot_col_thickness > 0) ;
+
+	/* === Garbage_collection, if necessary ============================= */
+
+	needed_memory = MIN (pivot_col_score, n_col - k) ;
+	if (pfree + needed_memory >= Alen)
+	{
+	    pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
+	    ngarbage++ ;
+	    /* after garbage collection we will have enough */
+	    ASSERT (pfree + needed_memory < Alen) ;
+	    /* garbage collection has wiped out the Row[].shared2.mark array */
+	    tag_mark = clear_mark (n_row, Row) ;
+
+#ifndef NDEBUG
+	    debug_matrix (n_row, n_col, Row, Col, A) ;
+#endif /* NDEBUG */
+	}
+
+	/* === Compute pivot row pattern ==================================== */
+
+	/* get starting location for this new merged row */
+	pivot_row_start = pfree ;
+
+	/* initialize new row counts to zero */
+	pivot_row_degree = 0 ;
+
+	/* tag pivot column as having been visited so it isn't included */
+	/* in merged pivot row */
+	Col [pivot_col].shared1.thickness = -pivot_col_thickness ;
+
+	/* pivot row is the union of all rows in the pivot column pattern */
+	cp = &A [Col [pivot_col].start] ;
+	cp_end = cp + Col [pivot_col].length ;
+	while (cp < cp_end)
+	{
+	    /* get a row */
+	    row = *cp++ ;
+	    DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
+	    /* skip if row is dead */
+	    if (ROW_IS_DEAD (row))
+	    {
+		continue ;
+	    }
+	    rp = &A [Row [row].start] ;
+	    rp_end = rp + Row [row].length ;
+	    while (rp < rp_end)
+	    {
+		/* get a column */
+		col = *rp++ ;
+		/* add the column, if alive and untagged */
+		col_thickness = Col [col].shared1.thickness ;
+		if (col_thickness > 0 && COL_IS_ALIVE (col))
+		{
+		    /* tag column in pivot row */
+		    Col [col].shared1.thickness = -col_thickness ;
+		    ASSERT (pfree < Alen) ;
+		    /* place column in pivot row */
+		    A [pfree++] = col ;
+		    pivot_row_degree += col_thickness ;
+		}
+	    }
+	}
+
+	/* clear tag on pivot column */
+	Col [pivot_col].shared1.thickness = pivot_col_thickness ;
+	max_deg = MAX (max_deg, pivot_row_degree) ;
+
+#ifndef NDEBUG
+	DEBUG3 (("check2\n")) ;
+	debug_mark (n_row, Row, tag_mark, max_mark) ;
+#endif /* NDEBUG */
+
+	/* === Kill all rows used to construct pivot row ==================== */
+
+	/* also kill pivot row, temporarily */
+	cp = &A [Col [pivot_col].start] ;
+	cp_end = cp + Col [pivot_col].length ;
+	while (cp < cp_end)
+	{
+	    /* may be killing an already dead row */
+	    row = *cp++ ;
+	    DEBUG3 (("Kill row in pivot col: %d\n", row)) ;
+	    KILL_ROW (row) ;
+	}
+
+	/* === Select a row index to use as the new pivot row =============== */
+
+	pivot_row_length = pfree - pivot_row_start ;
+	if (pivot_row_length > 0)
+	{
+	    /* pick the "pivot" row arbitrarily (first row in col) */
+	    pivot_row = A [Col [pivot_col].start] ;
+	    DEBUG3 (("Pivotal row is %d\n", pivot_row)) ;
+	}
+	else
+	{
+	    /* there is no pivot row, since it is of zero length */
+	    pivot_row = EMPTY ;
+	    ASSERT (pivot_row_length == 0) ;
+	}
+	ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
+
+	/* === Approximate degree computation =============================== */
+
+	/* Here begins the computation of the approximate degree.  The column */
+	/* score is the sum of the pivot row "length", plus the size of the */
+	/* set differences of each row in the column minus the pattern of the */
+	/* pivot row itself.  The column ("thickness") itself is also */
+	/* excluded from the column score (we thus use an approximate */
+	/* external degree). */
+
+	/* The time taken by the following code (compute set differences, and */
+	/* add them up) is proportional to the size of the data structure */
+	/* being scanned - that is, the sum of the sizes of each column in */
+	/* the pivot row.  Thus, the amortized time to compute a column score */
+	/* is proportional to the size of that column (where size, in this */
+	/* context, is the column "length", or the number of row indices */
+	/* in that column).  The number of row indices in a column is */
+	/* monotonically non-decreasing, from the length of the original */
+	/* column on input to colamd. */
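+
+	/* Concretely, the score assembled in the two phases below is roughly */
+	/* |pivot row| plus the sum, over each live row r remaining in the */
+	/* column, of |r \ pivot row|, minus the column's own thickness, and */
+	/* it is capped at n_col - k - thickness so it can never exceed the */
+	/* true external degree bound. */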
+
+	/* === Compute set differences ====================================== */
+
+	DEBUG3 (("** Computing set differences phase. **\n")) ;
+
+	/* pivot row is currently dead - it will be revived later. */
+
+	DEBUG3 (("Pivot row: ")) ;
+	/* for each column in pivot row */
+	rp = &A [pivot_row_start] ;
+	rp_end = rp + pivot_row_length ;
+	while (rp < rp_end)
+	{
+	    col = *rp++ ;
+	    ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
+	    DEBUG3 (("Col: %d\n", col)) ;
+
+	    /* clear tags used to construct pivot row pattern */
+	    col_thickness = -Col [col].shared1.thickness ;
+	    ASSERT (col_thickness > 0) ;
+	    Col [col].shared1.thickness = col_thickness ;
+
+	    /* === Remove column from degree list =========================== */
+
+	    cur_score = Col [col].shared2.score ;
+	    prev_col = Col [col].shared3.prev ;
+	    next_col = Col [col].shared4.degree_next ;
+	    ASSERT (cur_score >= 0) ;
+	    ASSERT (cur_score <= n_col) ;
+	    ASSERT (cur_score >= EMPTY) ;
+	    if (prev_col == EMPTY)
+	    {
+		head [cur_score] = next_col ;
+	    }
+	    else
+	    {
+		Col [prev_col].shared4.degree_next = next_col ;
+	    }
+	    if (next_col != EMPTY)
+	    {
+		Col [next_col].shared3.prev = prev_col ;
+	    }
+
+	    /* === Scan the column ========================================== */
+
+	    cp = &A [Col [col].start] ;
+	    cp_end = cp + Col [col].length ;
+	    while (cp < cp_end)
+	    {
+		/* get a row */
+		row = *cp++ ;
+		row_mark = Row [row].shared2.mark ;
+		/* skip if dead */
+		if (ROW_IS_MARKED_DEAD (row_mark))
+		{
+		    continue ;
+		}
+		ASSERT (row != pivot_row) ;
+		set_difference = row_mark - tag_mark ;
+		/* check if the row has been seen yet */
+		if (set_difference < 0)
+		{
+		    ASSERT (Row [row].shared1.degree <= max_deg) ;
+		    set_difference = Row [row].shared1.degree ;
+		}
+		/* subtract column thickness from this row's set difference */
+		set_difference -= col_thickness ;
+		ASSERT (set_difference >= 0) ;
+		/* absorb this row if the set difference becomes zero */
+		if (set_difference == 0)
+		{
+		    DEBUG3 (("aggressive absorption. Row: %d\n", row)) ;
+		    KILL_ROW (row) ;
+		}
+		else
+		{
+		    /* save the new mark */
+		    Row [row].shared2.mark = set_difference + tag_mark ;
+		}
+	    }
+	}
+
+#ifndef NDEBUG
+	debug_deg_lists (n_row, n_col, Row, Col, head,
+		min_score, n_col2-k-pivot_row_degree, max_deg) ;
+#endif /* NDEBUG */
+
+	/* === Add up set differences for each column ======================= */
+
+	DEBUG3 (("** Adding set differences phase. **\n")) ;
+
+	/* for each column in pivot row */
+	rp = &A [pivot_row_start] ;
+	rp_end = rp + pivot_row_length ;
+	while (rp < rp_end)
+	{
+	    /* get a column */
+	    col = *rp++ ;
+	    ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
+	    hash = 0 ;
+	    cur_score = 0 ;
+	    cp = &A [Col [col].start] ;
+	    /* compact the column */
+	    new_cp = cp ;
+	    cp_end = cp + Col [col].length ;
+
+	    DEBUG4 (("Adding set diffs for Col: %d.\n", col)) ;
+
+	    while (cp < cp_end)
+	    {
+		/* get a row */
+		row = *cp++ ;
+		ASSERT(row >= 0 && row < n_row) ;
+		row_mark = Row [row].shared2.mark ;
+		/* skip if dead */
+		if (ROW_IS_MARKED_DEAD (row_mark))
+		{
+		    continue ;
+		}
+		ASSERT (row_mark > tag_mark) ;
+		/* compact the column */
+		*new_cp++ = row ;
+		/* compute hash function */
+		hash += row ;
+		/* add set difference */
+		cur_score += row_mark - tag_mark ;
+		/* integer overflow... */
+		cur_score = MIN (cur_score, n_col) ;
+	    }
+
+	    /* recompute the column's length */
+	    Col [col].length = (int) (new_cp - &A [Col [col].start]) ;
+
+	    /* === Further mass elimination ================================= */
+
+	    if (Col [col].length == 0)
+	    {
+		DEBUG4 (("further mass elimination. Col: %d\n", col)) ;
+		/* nothing left but the pivot row in this column */
+		KILL_PRINCIPAL_COL (col) ;
+		pivot_row_degree -= Col [col].shared1.thickness ;
+		ASSERT (pivot_row_degree >= 0) ;
+		/* order it */
+		Col [col].shared2.order = k ;
+		/* increment order count by column thickness */
+		k += Col [col].shared1.thickness ;
+	    }
+	    else
+	    {
+		/* === Prepare for supercolumn detection ==================== */
+
+		DEBUG4 (("Preparing supercol detection for Col: %d.\n", col)) ;
+
+		/* save score so far */
+		Col [col].shared2.score = cur_score ;
+
+		/* add column to hash table, for supercolumn detection */
+		hash %= n_col + 1 ;
+
+		DEBUG4 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ;
+		ASSERT (hash <= n_col) ;
+
+		head_column = head [hash] ;
+		if (head_column > EMPTY)
+		{
+		    /* degree list "hash" is non-empty, use prev (shared3) of */
+		    /* first column in degree list as head of hash bucket */
+		    first_col = Col [head_column].shared3.headhash ;
+		    Col [head_column].shared3.headhash = col ;
+		}
+		else
+		{
+		    /* degree list "hash" is empty, use head as hash bucket */
+		    first_col = - (head_column + 2) ;
+		    head [hash] = - (col + 2) ;
+		}
+		Col [col].shared4.hash_next = first_col ;
+
+		/* save hash function in Col [col].shared3.hash */
+		Col [col].shared3.hash = (int) hash ;
+		ASSERT (COL_IS_ALIVE (col)) ;
+	    }
+	}
+
+	/* The approximate external column degree is now computed.  */
+
+	/* === Supercolumn detection ======================================== */
+
+	DEBUG3 (("** Supercolumn detection phase. **\n")) ;
+
+	detect_super_cols (
+
+#ifndef NDEBUG
+		n_col, Row,
+#endif /* NDEBUG */
+
+		Col, A, head, pivot_row_start, pivot_row_length) ;
+
+	/* === Kill the pivotal column ====================================== */
+
+	KILL_PRINCIPAL_COL (pivot_col) ;
+
+	/* === Clear mark =================================================== */
+
+	tag_mark += (max_deg + 1) ;
+	if (tag_mark >= max_mark)
+	{
+	    DEBUG2 (("clearing tag_mark\n")) ;
+	    tag_mark = clear_mark (n_row, Row) ;
+	}
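+
+	/* Advancing tag_mark by max_deg + 1 invalidates every mark written */
+	/* during this step (those marks are at most tag_mark + max_deg), so */
+	/* all rows read as "not yet seen" at the next pivot step; the reset */
+	/* above guards against integer overflow of tag_mark. */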
+
+#ifndef NDEBUG
+	DEBUG3 (("check3\n")) ;
+	debug_mark (n_row, Row, tag_mark, max_mark) ;
+#endif /* NDEBUG */
+
+	/* === Finalize the new pivot row, and column scores ================ */
+
+	DEBUG3 (("** Finalize scores phase. **\n")) ;
+
+	/* for each column in pivot row */
+	rp = &A [pivot_row_start] ;
+	/* compact the pivot row */
+	new_rp = rp ;
+	rp_end = rp + pivot_row_length ;
+	while (rp < rp_end)
+	{
+	    col = *rp++ ;
+	    /* skip dead columns */
+	    if (COL_IS_DEAD (col))
+	    {
+		continue ;
+	    }
+	    *new_rp++ = col ;
+	    /* add new pivot row to column */
+	    A [Col [col].start + (Col [col].length++)] = pivot_row ;
+
+	    /* retrieve score so far and add on pivot row's degree. */
+	    /* (we wait until here for this in case the pivot */
+	    /* row's degree was reduced due to mass elimination). */
+	    cur_score = Col [col].shared2.score + pivot_row_degree ;
+
+	    /* calculate the max possible score as the number of */
+	    /* external columns minus the 'k' value minus the */
+	    /* columns thickness */
+	    max_score = n_col - k - Col [col].shared1.thickness ;
+
+	    /* make the score the external degree of the union-of-rows */
+	    cur_score -= Col [col].shared1.thickness ;
+
+	    /* make sure score is less or equal than the max score */
+	    cur_score = MIN (cur_score, max_score) ;
+	    ASSERT (cur_score >= 0) ;
+
+	    /* store updated score */
+	    Col [col].shared2.score = cur_score ;
+
+	    /* === Place column back in degree list ========================= */
+
+	    ASSERT (min_score >= 0) ;
+	    ASSERT (min_score <= n_col) ;
+	    ASSERT (cur_score >= 0) ;
+	    ASSERT (cur_score <= n_col) ;
+	    ASSERT (head [cur_score] >= EMPTY) ;
+	    next_col = head [cur_score] ;
+	    Col [col].shared4.degree_next = next_col ;
+	    Col [col].shared3.prev = EMPTY ;
+	    if (next_col != EMPTY)
+	    {
+		Col [next_col].shared3.prev = col ;
+	    }
+	    head [cur_score] = col ;
+
+	    /* see if this score is less than current min */
+	    min_score = MIN (min_score, cur_score) ;
+
+	}
+
+#ifndef NDEBUG
+	debug_deg_lists (n_row, n_col, Row, Col, head,
+		min_score, n_col2-k, max_deg) ;
+#endif /* NDEBUG */
+
+	/* === Resurrect the new pivot row ================================== */
+
+	if (pivot_row_degree > 0)
+	{
+	    /* update pivot row length to reflect any cols that were killed */
+	    /* during super-col detection and mass elimination */
+	    Row [pivot_row].start  = pivot_row_start ;
+	    Row [pivot_row].length = (int) (new_rp - &A[pivot_row_start]) ;
+	    Row [pivot_row].shared1.degree = pivot_row_degree ;
+	    Row [pivot_row].shared2.mark = 0 ;
+	    /* pivot row is no longer dead */
+	}
+    }
+
+    /* === All principal columns have now been ordered ====================== */
+
+    return (ngarbage) ;
+}
+
+
+/* ========================================================================== */
+/* === order_children ======================================================= */
+/* ========================================================================== */
+
+/*
+    The find_ordering routine has ordered all of the principal columns (the
+    representatives of the supercolumns).  The non-principal columns have not
+    yet been ordered.  This routine orders those columns by walking up the
+    parent tree (a column is a child of the column which absorbed it).  The
+    final permutation vector is then placed in p [0 ... n_col-1], with p [0]
+    being the first column, and p [n_col-1] being the last.  Although not
+    immediately obvious, this routine takes time linear in the number of
+    columns, that is, O (n_col).  Not user-callable.
+*/
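+
+/*
+    For example, if principal column P absorbed two other columns (so its
+    thickness is 3) and find_ordering assigned it order 4, then positions
+    4..6 belong to that group: the loop below gives the two absorbed
+    columns orders 4 and 5 (in the order they are visited) and moves P
+    itself to order 6.
+*/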
+
+PRIVATE void order_children
+(
+    /* === Parameters ======================================================= */
+
+    int n_col,			/* number of columns of A */
+    Colamd_Col Col [],		/* of size n_col+1 */
+    int p []			/* p [0 ... n_col-1] is the column permutation*/
+)
+{
+    /* === Local variables ================================================== */
+
+    int i ;			/* loop counter for all columns */
+    int c ;			/* column index */
+    int parent ;		/* index of column's parent */
+    int order ;			/* column's order */
+
+    /* === Order each non-principal column ================================== */
+
+    for (i = 0 ; i < n_col ; i++)
+    {
+	/* find an un-ordered non-principal column */
+	ASSERT (COL_IS_DEAD (i)) ;
+	if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY)
+	{
+	    parent = i ;
+	    /* once found, find its principal parent */
+	    do
+	    {
+		parent = Col [parent].shared1.parent ;
+	    } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
+
+	    /* now, order all un-ordered non-principal columns along path */
+	    /* to this parent.  collapse tree at the same time */
+	    c = i ;
+	    /* get order of parent */
+	    order = Col [parent].shared2.order ;
+
+	    do
+	    {
+		ASSERT (Col [c].shared2.order == EMPTY) ;
+
+		/* order this column */
+		Col [c].shared2.order = order++ ;
+		/* collapse tree */
+		Col [c].shared1.parent = parent ;
+
+		/* get immediate parent of this column */
+		c = Col [c].shared1.parent ;
+
+		/* continue until we hit an ordered column.  There are */
+		/* guaranteed not to be any more unordered columns */
+		/* above an ordered column */
+	    } while (Col [c].shared2.order == EMPTY) ;
+
+	    /* re-order the super_col parent to largest order for this group */
+	    Col [parent].shared2.order = order ;
+	}
+    }
+
+    /* === Generate the permutation ========================================= */
+
+    for (c = 0 ; c < n_col ; c++)
+    {
+	p [Col [c].shared2.order] = c ;
+    }
+}
+
+
+/* ========================================================================== */
+/* === detect_super_cols ==================================================== */
+/* ========================================================================== */
+
+/*
+    Detects supercolumns by finding matches between columns in the hash buckets.
+    Check amongst columns in the set A [row_start ... row_start + row_length-1].
+    The columns under consideration are currently *not* in the degree lists,
+    and have already been placed in the hash buckets.
+
+    The hash bucket for columns whose hash function is equal to h is stored
+    as follows:
+
+	if head [h] is >= 0, then head [h] contains a degree list, so:
+
+		head [h] is the first column in degree bucket h.
+		Col [head [h]].headhash gives the first column in hash bucket h.
+
+	otherwise, the degree list is empty, and:
+
+		-(head [h] + 2) is the first column in hash bucket h.
+
+    For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous
+    column" pointer.  Col [c].shared3.hash is used instead as the hash number
+    for that column.  The value of Col [c].shared4.hash_next is the next column
+    in the same hash bucket.
+
+    Assuming no, or "few" hash collisions, the time taken by this routine is
+    linear in the sum of the sizes (lengths) of each column whose score has
+    just been computed in the approximate degree computation.
+    Not user-callable.
+*/
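+
+/*
+    For example, if head [h] is EMPTY (-1) when column 7 is the first
+    column placed in hash bucket h, then head [h] is set to -(7+2) = -9,
+    and - (head [h] + 2) = 7 recovers the column index; an untouched
+    EMPTY entry maps back to EMPTY under the same decoding.
+*/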
+
+PRIVATE void detect_super_cols
+(
+    /* === Parameters ======================================================= */
+
+#ifndef NDEBUG
+    /* these two parameters are only needed when debugging is enabled: */
+    int n_col,			/* number of columns of A */
+    Colamd_Row Row [],		/* of size n_row+1 */
+#endif /* NDEBUG */
+
+    Colamd_Col Col [],		/* of size n_col+1 */
+    int A [],			/* row indices of A */
+    int head [],		/* head of degree lists and hash buckets */
+    int row_start,		/* pointer to set of columns to check */
+    int row_length		/* number of columns to check */
+)
+{
+    /* === Local variables ================================================== */
+
+    int hash ;			/* hash value for a column */
+    int *rp ;			/* pointer to a row */
+    int c ;			/* a column index */
+    int super_c ;		/* column index of the column to absorb into */
+    int *cp1 ;			/* column pointer for column super_c */
+    int *cp2 ;			/* column pointer for column c */
+    int length ;		/* length of column super_c */
+    int prev_c ;		/* column preceding c in hash bucket */
+    int i ;			/* loop counter */
+    int *rp_end ;		/* pointer to the end of the row */
+    int col ;			/* a column index in the row to check */
+    int head_column ;		/* first column in hash bucket or degree list */
+    int first_col ;		/* first column in hash bucket */
+
+    /* === Consider each column in the row ================================== */
+
+    rp = &A [row_start] ;
+    rp_end = rp + row_length ;
+    while (rp < rp_end)
+    {
+	col = *rp++ ;
+	if (COL_IS_DEAD (col))
+	{
+	    continue ;
+	}
+
+	/* get hash number for this column */
+	hash = Col [col].shared3.hash ;
+	ASSERT (hash <= n_col) ;
+
+	/* === Get the first column in this hash bucket ===================== */
+
+	head_column = head [hash] ;
+	if (head_column > EMPTY)
+	{
+	    first_col = Col [head_column].shared3.headhash ;
+	}
+	else
+	{
+	    first_col = - (head_column + 2) ;
+	}
+
+	/* === Consider each column in the hash bucket ====================== */
+
+	for (super_c = first_col ; super_c != EMPTY ;
+	    super_c = Col [super_c].shared4.hash_next)
+	{
+	    ASSERT (COL_IS_ALIVE (super_c)) ;
+	    ASSERT (Col [super_c].shared3.hash == hash) ;
+	    length = Col [super_c].length ;
+
+	    /* prev_c is the column preceding column c in the hash bucket */
+	    prev_c = super_c ;
+
+	    /* === Compare super_c with all columns after it ================ */
+
+	    for (c = Col [super_c].shared4.hash_next ;
+		 c != EMPTY ; c = Col [c].shared4.hash_next)
+	    {
+		ASSERT (c != super_c) ;
+		ASSERT (COL_IS_ALIVE (c)) ;
+		ASSERT (Col [c].shared3.hash == hash) ;
+
+		/* not identical if lengths or scores are different */
+		if (Col [c].length != length ||
+		    Col [c].shared2.score != Col [super_c].shared2.score)
+		{
+		    prev_c = c ;
+		    continue ;
+		}
+
+		/* compare the two columns */
+		cp1 = &A [Col [super_c].start] ;
+		cp2 = &A [Col [c].start] ;
+
+		for (i = 0 ; i < length ; i++)
+		{
+		    /* the columns are "clean" (no dead rows) */
+		    ASSERT (ROW_IS_ALIVE (*cp1))  ;
+		    ASSERT (ROW_IS_ALIVE (*cp2))  ;
+		    /* row indices will be in the same order for both supercols, */
+		    /* no gather/scatter necessary */
+		    if (*cp1++ != *cp2++)
+		    {
+			break ;
+		    }
+		}
+
+		/* the two columns are different if the for-loop "broke" */
+		if (i != length)
+		{
+		    prev_c = c ;
+		    continue ;
+		}
+
+		/* === Got it!  two columns are identical =================== */
+
+		ASSERT (Col [c].shared2.score == Col [super_c].shared2.score) ;
+
+		Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
+		Col [c].shared1.parent = super_c ;
+		KILL_NON_PRINCIPAL_COL (c) ;
+		/* order c later, in order_children() */
+		Col [c].shared2.order = EMPTY ;
+		/* remove c from hash bucket */
+		Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
+	    }
+	}
+
+	/* === Empty this hash bucket ======================================= */
+
+	if (head_column > EMPTY)
+	{
+	    /* corresponding degree list "hash" is not empty */
+	    Col [head_column].shared3.headhash = EMPTY ;
+	}
+	else
+	{
+	    /* corresponding degree list "hash" is empty */
+	    head [hash] = EMPTY ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === garbage_collection =================================================== */
+/* ========================================================================== */
+
+/*
+    Defragments and compacts columns and rows in the workspace A.  Used when
+    all available memory has been used while performing row merging.  Returns
+    the index of the first free position in A, after garbage collection.  The
+    time taken by this routine is linear is the size of the array A, which is
+    time taken by this routine is linear in the size of the array A, which is
+    Not user-callable.
+*/
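+
+/*
+    The start of each live row is flagged in place with the one's
+    complement of its row index (a negative value), and the overwritten
+    column index is saved in Row [r].shared2.first_column; the scan over A
+    below then uses the negative entries to locate where each row begins.
+*/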
+
+PRIVATE int garbage_collection  /* returns the new value of pfree */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows */
+    int n_col,			/* number of columns */
+    Colamd_Row Row [],		/* row info */
+    Colamd_Col Col [],		/* column info */
+    int A [],			/* A [0 ... Alen-1] holds the matrix */
+    int *pfree			/* &A [0] ... pfree is in use */
+)
+{
+    /* === Local variables ================================================== */
+
+    int *psrc ;			/* source pointer */
+    int *pdest ;		/* destination pointer */
+    int j ;			/* counter */
+    int r ;			/* a row index */
+    int c ;			/* a column index */
+    int length ;		/* length of a row or column */
+
+#ifndef NDEBUG
+    int debug_rows ;
+    DEBUG2 (("Defrag..\n")) ;
+    for (psrc = &A[0] ; psrc < pfree ; psrc++) ASSERT (*psrc >= 0) ;
+    debug_rows = 0 ;
+#endif /* NDEBUG */
+
+    /* === Defragment the columns =========================================== */
+
+    pdest = &A[0] ;
+    for (c = 0 ; c < n_col ; c++)
+    {
+	if (COL_IS_ALIVE (c))
+	{
+	    psrc = &A [Col [c].start] ;
+
+	    /* move and compact the column */
+	    ASSERT (pdest <= psrc) ;
+	    Col [c].start = (int) (pdest - &A [0]) ;
+	    length = Col [c].length ;
+	    for (j = 0 ; j < length ; j++)
+	    {
+		r = *psrc++ ;
+		if (ROW_IS_ALIVE (r))
+		{
+		    *pdest++ = r ;
+		}
+	    }
+	    Col [c].length = (int) (pdest - &A [Col [c].start]) ;
+	}
+    }
+
+    /* === Prepare to defragment the rows =================================== */
+
+    for (r = 0 ; r < n_row ; r++)
+    {
+	if (ROW_IS_ALIVE (r))
+	{
+	    if (Row [r].length == 0)
+	    {
+		/* this row is of zero length.  cannot compact it, so kill it */
+		DEBUG3 (("Defrag row kill\n")) ;
+		KILL_ROW (r) ;
+	    }
+	    else
+	    {
+		/* save first column index in Row [r].shared2.first_column */
+		psrc = &A [Row [r].start] ;
+		Row [r].shared2.first_column = *psrc ;
+		ASSERT (ROW_IS_ALIVE (r)) ;
+		/* flag the start of the row with the one's complement of row */
+		*psrc = ONES_COMPLEMENT (r) ;
+
+#ifndef NDEBUG
+		debug_rows++ ;
+#endif /* NDEBUG */
+
+	    }
+	}
+    }
+
+    /* === Defragment the rows ============================================== */
+
+    psrc = pdest ;
+    while (psrc < pfree)
+    {
+	/* find a negative number ... the start of a row */
+	if (*psrc++ < 0)
+	{
+	    psrc-- ;
+	    /* get the row index */
+	    r = ONES_COMPLEMENT (*psrc) ;
+	    ASSERT (r >= 0 && r < n_row) ;
+	    /* restore first column index */
+	    *psrc = Row [r].shared2.first_column ;
+	    ASSERT (ROW_IS_ALIVE (r)) ;
+
+	    /* move and compact the row */
+	    ASSERT (pdest <= psrc) ;
+	    Row [r].start = (int) (pdest - &A [0]) ;
+	    length = Row [r].length ;
+	    for (j = 0 ; j < length ; j++)
+	    {
+		c = *psrc++ ;
+		if (COL_IS_ALIVE (c))
+		{
+		    *pdest++ = c ;
+		}
+	    }
+	    Row [r].length = (int) (pdest - &A [Row [r].start]) ;
+
+#ifndef NDEBUG
+	    debug_rows-- ;
+#endif /* NDEBUG */
+
+	}
+    }
+    /* ensure we found all the rows */
+    ASSERT (debug_rows == 0) ;
+
+    /* === Return the new value of pfree ==================================== */
+
+    return ((int) (pdest - &A [0])) ;
+}
+
+
+/* ========================================================================== */
+/* === clear_mark =========================================================== */
+/* ========================================================================== */
+
+/*
+    Clears the Row [].shared2.mark array, and returns the new tag_mark.
+    Return value is the new tag_mark.  Not user-callable.
+*/
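+
+/*
+    After this call every live row has mark 0 and the caller restarts
+    tag_mark at 1, so every row again compares as "not yet seen" in the
+    set-difference computation of find_ordering.
+*/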
+
+PRIVATE int clear_mark	/* return the new value for tag_mark */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,		/* number of rows in A */
+    Colamd_Row Row []	/* Row [0 ... n_row-1].shared2.mark is set to zero */
+)
+{
+    /* === Local variables ================================================== */
+
+    int r ;
+
+    for (r = 0 ; r < n_row ; r++)
+    {
+	if (ROW_IS_ALIVE (r))
+	{
+	    Row [r].shared2.mark = 0 ;
+	}
+    }
+    return (1) ;
+}
+
+
+/* ========================================================================== */
+/* === print_report ========================================================= */
+/* ========================================================================== */
+
+PRIVATE void print_report
+(
+    char *method,
+    int stats [COLAMD_STATS]
+)
+{
+
+    int i1, i2, i3 ;
+
+    if (!stats)
+    {
+    	PRINTF ("%s: No statistics available.\n", method) ;
+	return ;
+    }
+
+    i1 = stats [COLAMD_INFO1] ;
+    i2 = stats [COLAMD_INFO2] ;
+    i3 = stats [COLAMD_INFO3] ;
+
+    if (stats [COLAMD_STATUS] >= 0)
+    {
+    	PRINTF ("%s: OK.  ", method) ;
+    }
+    else
+    {
+    	PRINTF ("%s: ERROR.  ", method) ;
+    }
+
+    switch (stats [COLAMD_STATUS])
+    {
+
+	case COLAMD_OK_BUT_JUMBLED:
+
+	    PRINTF ("Matrix has unsorted or duplicate row indices.\n") ;
+
+	    PRINTF ("%s: number of duplicate or out-of-order row indices: %d\n",
+	    method, i3) ;
+
+	    PRINTF ("%s: last seen duplicate or out-of-order row index:   %d\n",
+	    method, INDEX (i2)) ;
+
+	    PRINTF ("%s: last seen in column:                             %d",
+	    method, INDEX (i1)) ;
+
+	    /* no break - fall through to next case instead */
+
+	case COLAMD_OK:
+
+	    PRINTF ("\n") ;
+
+ 	    PRINTF ("%s: number of dense or empty rows ignored:           %d\n",
+	    method, stats [COLAMD_DENSE_ROW]) ;
+
+	    PRINTF ("%s: number of dense or empty columns ignored:        %d\n",
+	    method, stats [COLAMD_DENSE_COL]) ;
+
+	    PRINTF ("%s: number of garbage collections performed:         %d\n",
+	    method, stats [COLAMD_DEFRAG_COUNT]) ;
+	    break ;
+
+	case COLAMD_ERROR_A_not_present:
+
+	    PRINTF ("Array A (row indices of matrix) not present.\n") ;
+	    break ;
+
+	case COLAMD_ERROR_p_not_present:
+
+	    PRINTF ("Array p (column pointers for matrix) not present.\n") ;
+	    break ;
+
+	case COLAMD_ERROR_nrow_negative:
+
+	    PRINTF ("Invalid number of rows (%d).\n", i1) ;
+	    break ;
+
+	case COLAMD_ERROR_ncol_negative:
+
+	    PRINTF ("Invalid number of columns (%d).\n", i1) ;
+	    break ;
+
+	case COLAMD_ERROR_nnz_negative:
+
+	    PRINTF ("Invalid number of nonzero entries (%d).\n", i1) ;
+	    break ;
+
+	case COLAMD_ERROR_p0_nonzero:
+
+	    PRINTF ("Invalid column pointer, p [0] = %d, must be zero.\n", i1) ;
+	    break ;
+
+	case COLAMD_ERROR_A_too_small:
+
+	    PRINTF ("Array A too small.\n") ;
+	    PRINTF ("        Need Alen >= %d, but given only Alen = %d.\n",
+	    i1, i2) ;
+	    break ;
+
+	case COLAMD_ERROR_col_length_negative:
+
+	    PRINTF
+	    ("Column %d has a negative number of nonzero entries (%d).\n",
+	    INDEX (i1), i2) ;
+	    break ;
+
+	case COLAMD_ERROR_row_index_out_of_bounds:
+
+	    PRINTF
+	    ("Row index (row %d) out of bounds (%d to %d) in column %d.\n",
+	    INDEX (i2), INDEX (0), INDEX (i3-1), INDEX (i1)) ;
+	    break ;
+
+	case COLAMD_ERROR_out_of_memory:
+
+	    PRINTF ("Out of memory.\n") ;
+	    break ;
+
+	case COLAMD_ERROR_internal_error:
+
+	    /* if this happens, there is a bug in the code */
+	    PRINTF
+	    ("Internal error! Please contact authors (davis@cise.ufl.edu).\n") ;
+	    break ;
+    }
+}
+
+
+
+
+/* ========================================================================== */
+/* === colamd debugging routines ============================================ */
+/* ========================================================================== */
+
+/* When debugging is disabled, the remainder of this file is ignored. */
+
+#ifndef NDEBUG
+
+
+/* ========================================================================== */
+/* === debug_structures ===================================================== */
+/* ========================================================================== */
+
+/*
+    At this point, all empty rows and columns are dead.  All live columns
+    are "clean" (containing no dead rows) and simplicial (no supercolumns
+    yet).  Rows may contain dead columns, but all live rows contain at
+    least one live column.
+*/
+
+PRIVATE void debug_structures
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,
+    int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    int A [],
+    int n_col2
+)
+{
+    /* === Local variables ================================================== */
+
+    int i ;
+    int c ;
+    int *cp ;
+    int *cp_end ;
+    int len ;
+    int score ;
+    int r ;
+    int *rp ;
+    int *rp_end ;
+    int deg ;
+
+    /* === Check A, Row, and Col ============================================ */
+
+    for (c = 0 ; c < n_col ; c++)
+    {
+	if (COL_IS_ALIVE (c))
+	{
+	    len = Col [c].length ;
+	    score = Col [c].shared2.score ;
+	    DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ;
+	    ASSERT (len > 0) ;
+	    ASSERT (score >= 0) ;
+	    ASSERT (Col [c].shared1.thickness == 1) ;
+	    cp = &A [Col [c].start] ;
+	    cp_end = cp + len ;
+	    while (cp < cp_end)
+	    {
+		r = *cp++ ;
+		ASSERT (ROW_IS_ALIVE (r)) ;
+	    }
+	}
+	else
+	{
+	    i = Col [c].shared2.order ;
+	    ASSERT (i >= n_col2 && i < n_col) ;
+	}
+    }
+
+    for (r = 0 ; r < n_row ; r++)
+    {
+	if (ROW_IS_ALIVE (r))
+	{
+	    i = 0 ;
+	    len = Row [r].length ;
+	    deg = Row [r].shared1.degree ;
+	    ASSERT (len > 0) ;
+	    ASSERT (deg > 0) ;
+	    rp = &A [Row [r].start] ;
+	    rp_end = rp + len ;
+	    while (rp < rp_end)
+	    {
+		c = *rp++ ;
+		if (COL_IS_ALIVE (c))
+		{
+		    i++ ;
+		}
+	    }
+	    ASSERT (i > 0) ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === debug_deg_lists ====================================================== */
+/* ========================================================================== */
+
+/*
+    Prints the contents of the degree lists.  Counts the number of columns
+    in the degree list and compares it to the total it should have.  Also
+    checks the row degrees.
+*/
+
+PRIVATE void debug_deg_lists
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,
+    int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    int head [],
+    int min_score,
+    int should,
+    int max_deg
+)
+{
+    /* === Local variables ================================================== */
+
+    int deg ;
+    int col ;
+    int have ;
+    int row ;
+
+    /* === Check the degree lists =========================================== */
+
+    if (n_col > 10000 && colamd_debug <= 0)
+    {
+	return ;
+    }
+    have = 0 ;
+    DEBUG4 (("Degree lists: %d\n", min_score)) ;
+    for (deg = 0 ; deg <= n_col ; deg++)
+    {
+	col = head [deg] ;
+	if (col == EMPTY)
+	{
+	    continue ;
+	}
+	DEBUG4 (("%d:", deg)) ;
+	while (col != EMPTY)
+	{
+	    DEBUG4 ((" %d", col)) ;
+	    have += Col [col].shared1.thickness ;
+	    ASSERT (COL_IS_ALIVE (col)) ;
+	    col = Col [col].shared4.degree_next ;
+	}
+	DEBUG4 (("\n")) ;
+    }
+    DEBUG4 (("should %d have %d\n", should, have)) ;
+    ASSERT (should == have) ;
+
+    /* === Check the row degrees ============================================ */
+
+    if (n_row > 10000 && colamd_debug <= 0)
+    {
+	return ;
+    }
+    for (row = 0 ; row < n_row ; row++)
+    {
+	if (ROW_IS_ALIVE (row))
+	{
+	    ASSERT (Row [row].shared1.degree <= max_deg) ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === debug_mark =========================================================== */
+/* ========================================================================== */
+
+/*
+    Ensures that the tag_mark is less than the maximum and also ensures that
+    each entry in the mark array is less than the tag mark.
+*/
+
+PRIVATE void debug_mark
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,
+    Colamd_Row Row [],
+    int tag_mark,
+    int max_mark
+)
+{
+    /* === Local variables ================================================== */
+
+    int r ;
+
+    /* === Check the Row marks ============================================== */
+
+    ASSERT (tag_mark > 0 && tag_mark <= max_mark) ;
+    if (n_row > 10000 && colamd_debug <= 0)
+    {
+	return ;
+    }
+    for (r = 0 ; r < n_row ; r++)
+    {
+	ASSERT (Row [r].shared2.mark < tag_mark) ;
+    }
+}
+
+
+/* ========================================================================== */
+/* === debug_matrix ========================================================= */
+/* ========================================================================== */
+
+/*
+    Prints out the contents of the columns and the rows.
+*/
+
+PRIVATE void debug_matrix
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,
+    int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    int A []
+)
+{
+    /* === Local variables ================================================== */
+
+    int r ;
+    int c ;
+    int *rp ;
+    int *rp_end ;
+    int *cp ;
+    int *cp_end ;
+
+    /* === Dump the rows and columns of the matrix ========================== */
+
+    if (colamd_debug < 3)
+    {
+	return ;
+    }
+    DEBUG3 (("DUMP MATRIX:\n")) ;
+    for (r = 0 ; r < n_row ; r++)
+    {
+	DEBUG3 (("Row %d alive? %d\n", r, ROW_IS_ALIVE (r))) ;
+	if (ROW_IS_DEAD (r))
+	{
+	    continue ;
+	}
+	DEBUG3 (("start %d length %d degree %d\n",
+		Row [r].start, Row [r].length, Row [r].shared1.degree)) ;
+	rp = &A [Row [r].start] ;
+	rp_end = rp + Row [r].length ;
+	while (rp < rp_end)
+	{
+	    c = *rp++ ;
+	    DEBUG4 (("	%d col %d\n", COL_IS_ALIVE (c), c)) ;
+	}
+    }
+
+    for (c = 0 ; c < n_col ; c++)
+    {
+	DEBUG3 (("Col %d alive? %d\n", c, COL_IS_ALIVE (c))) ;
+	if (COL_IS_DEAD (c))
+	{
+	    continue ;
+	}
+	DEBUG3 (("start %d length %d shared1 %d shared2 %d\n",
+		Col [c].start, Col [c].length,
+		Col [c].shared1.thickness, Col [c].shared2.score)) ;
+	cp = &A [Col [c].start] ;
+	cp_end = cp + Col [c].length ;
+	while (cp < cp_end)
+	{
+	    r = *cp++ ;
+	    DEBUG4 (("	%d row %d\n", ROW_IS_ALIVE (r), r)) ;
+	}
+    }
+}
+
+PRIVATE void colamd_get_debug
+(
+    char *method
+)
+{
+    colamd_debug = 0 ;		/* no debug printing */
+
+    /* get "D" environment variable, which gives the debug printing level */
+    if (getenv ("D"))
+    {
+    	colamd_debug = atoi (getenv ("D")) ;
+    }
+
+    DEBUG0 (("%s: debug version, D = %d (THIS WILL BE SLOW!)\n",
+    	method, colamd_debug)) ;
+}
+
+#endif /* NDEBUG */
+
diff --git a/SRC/colamd.h b/SRC/colamd.h
new file mode 100644
index 0000000..03fc3bd
--- /dev/null
+++ b/SRC/colamd.h
@@ -0,0 +1,259 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file colamd.h
+    \brief Colamd prototypes and definitions
+
+	<pre> 
+    ==========================================================================
+    === colamd/symamd prototypes and definitions =============================
+    ==========================================================================
+
+    You must include this file (colamd.h) in any routine that uses colamd,
+    symamd, or the related macros and definitions.
+
+    Authors:
+
+	The authors of the code itself are Stefan I. Larimore and Timothy A.
+	Davis (davis at cise.ufl.edu), University of Florida.  The algorithm was
+	developed in collaboration with John Gilbert, Xerox PARC, and Esmond
+	Ng, Oak Ridge National Laboratory.
+
+    Date:
+
+	September 8, 2003.  Version 2.3.
+
+    Acknowledgements:
+
+	This work was supported by the National Science Foundation, under
+	grants DMS-9504974 and DMS-9803599.
+
+    Notice:
+
+	Copyright (c) 1998-2003 by the University of Florida.
+	All Rights Reserved.
+
+	THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+	EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+
+	Permission is hereby granted to use, copy, modify, and/or distribute
+	this program, provided that the Copyright, this License, and the
+	Availability of the original version is retained on all copies and made
+	accessible to the end-user of any code or package that includes COLAMD
+	or any modified version of COLAMD. 
+
+    Availability:
+
+	The colamd/symamd library is available at
+
+	    http://www.cise.ufl.edu/research/sparse/colamd/
+
+	This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h
+	file.  It is required by the colamd.c, colamdmex.c, and symamdmex.c
+	files, and by any C code that calls the routines whose prototypes are
+	listed below, or that uses the colamd/symamd definitions listed below.
+ </pre>
+*/
+
+#ifndef COLAMD_H
+#define COLAMD_H
+
+/* ========================================================================== */
+/* === Include files ======================================================== */
+/* ========================================================================== */
+
+#include <stdlib.h>
+
+/* ========================================================================== */
+/* === Knob and statistics definitions ====================================== */
+/* ========================================================================== */
+
+/* size of the knobs [ ] array.  Only knobs [0..1] are currently used. */
+#define COLAMD_KNOBS 20
+
+/* number of output statistics.  Only stats [0..6] are currently used. */
+#define COLAMD_STATS 20
+
+/* knobs [0] and stats [0]: dense row knob and output statistic. */
+#define COLAMD_DENSE_ROW 0
+
+/* knobs [1] and stats [1]: dense column knob and output statistic. */
+#define COLAMD_DENSE_COL 1
+
+/* stats [2]: memory defragmentation count output statistic */
+#define COLAMD_DEFRAG_COUNT 2
+
+/* stats [3]: colamd status:  zero OK, > 0 warning or notice, < 0 error */
+#define COLAMD_STATUS 3
+
+/* stats [4..6]: error info, or info on jumbled columns */ 
+#define COLAMD_INFO1 4
+#define COLAMD_INFO2 5
+#define COLAMD_INFO3 6
+
+/* error codes returned in stats [3]: */
+#define COLAMD_OK				(0)
+#define COLAMD_OK_BUT_JUMBLED			(1)
+#define COLAMD_ERROR_A_not_present		(-1)
+#define COLAMD_ERROR_p_not_present		(-2)
+#define COLAMD_ERROR_nrow_negative		(-3)
+#define COLAMD_ERROR_ncol_negative		(-4)
+#define COLAMD_ERROR_nnz_negative		(-5)
+#define COLAMD_ERROR_p0_nonzero			(-6)
+#define COLAMD_ERROR_A_too_small		(-7)
+#define COLAMD_ERROR_col_length_negative	(-8)
+#define COLAMD_ERROR_row_index_out_of_bounds	(-9)
+#define COLAMD_ERROR_out_of_memory		(-10)
+#define COLAMD_ERROR_internal_error		(-999)
+
+/* ========================================================================== */
+/* === Row and Column structures ============================================ */
+/* ========================================================================== */
+
+/* User code that makes use of the colamd/symamd routines need not directly */
+/* reference these structures.  They are used only for the COLAMD_RECOMMENDED */
+/* macro. */
+
+typedef struct Colamd_Col_struct
+{
+    int start ;		/* index for A of first row in this column, or DEAD */
+			/* if column is dead */
+    int length ;	/* number of rows in this column */
+    union
+    {
+	int thickness ;	/* number of original columns represented by this */
+			/* col, if the column is alive */
+	int parent ;	/* parent in parent tree super-column structure, if */
+			/* the column is dead */
+    } shared1 ;
+    union
+    {
+	int score ;	/* the score used to maintain heap, if col is alive */
+	int order ;	/* pivot ordering of this column, if col is dead */
+    } shared2 ;
+    union
+    {
+	int headhash ;	/* head of a hash bucket, if col is at the head of */
+			/* a degree list */
+	int hash ;	/* hash value, if col is not in a degree list */
+	int prev ;	/* previous column in degree list, if col is in a */
+			/* degree list (but not at the head of a degree list) */
+    } shared3 ;
+    union
+    {
+	int degree_next ;	/* next column, if col is in a degree list */
+	int hash_next ;		/* next column, if col is in a hash list */
+    } shared4 ;
+
+} Colamd_Col ;
+
+typedef struct Colamd_Row_struct
+{
+    int start ;		/* index for A of first col in this row */
+    int length ;	/* number of principal columns in this row */
+    union
+    {
+	int degree ;	/* number of principal & non-principal columns in row */
+	int p ;		/* used as a row pointer in init_rows_cols () */
+    } shared1 ;
+    union
+    {
+	int mark ;	/* for computing set differences and marking dead rows*/
+	int first_column ;/* first column in row (used in garbage collection) */
+    } shared2 ;
+
+} Colamd_Row ;
+
+/* ========================================================================== */
+/* === Colamd recommended memory size ======================================= */
+/* ========================================================================== */
+
+/*
+    The recommended length Alen of the array A passed to colamd is given by
+    the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro.  It returns -1 if any
+    argument is negative.  2*nnz space is required for the row and column
+    indices of the matrix. COLAMD_C (n_col) + COLAMD_R (n_row) space is
+    required for the Col and Row arrays, respectively, which are internal to
+    colamd.  An additional n_col space is the minimal amount of "elbow room",
+    and nnz/5 more space is recommended for run time efficiency.
+
+    This macro is not needed when using symamd.
+
+    Explicit typecast to int added Sept. 23, 2002, COLAMD version 2.2, to avoid
+    gcc -pedantic warning messages.
+*/
+
+#define COLAMD_C(n_col) ((int) (((n_col) + 1) * sizeof (Colamd_Col) / sizeof (int)))
+#define COLAMD_R(n_row) ((int) (((n_row) + 1) * sizeof (Colamd_Row) / sizeof (int)))
+
+#define COLAMD_RECOMMENDED(nnz, n_row, n_col)                                 \
+(                                                                             \
+((nnz) < 0 || (n_row) < 0 || (n_col) < 0)                                     \
+?                                                                             \
+    (-1)                                                                      \
+:                                                                             \
+    (2 * (nnz) + COLAMD_C (n_col) + COLAMD_R (n_row) + (n_col) + ((nnz) / 5)) \
+)
+
+/* ========================================================================== */
+/* === Prototypes of user-callable routines ================================= */
+/* ========================================================================== */
+
+int colamd_recommended		/* returns recommended value of Alen, */
+				/* or (-1) if input arguments are erroneous */
+(
+    int nnz,			/* nonzeros in A */
+    int n_row,			/* number of rows in A */
+    int n_col			/* number of columns in A */
+) ;
+
+void colamd_set_defaults	/* sets default parameters */
+(				/* knobs argument is modified on output */
+    double knobs [COLAMD_KNOBS]	/* parameter settings for colamd */
+) ;
+
+int colamd			/* returns (1) if successful, (0) otherwise*/
+(				/* A and p arguments are modified on output */
+    int n_row,			/* number of rows in A */
+    int n_col,			/* number of columns in A */
+    int Alen,			/* size of the array A */
+    int A [],			/* row indices of A, of size Alen */
+    int p [],			/* column pointers of A, of size n_col+1 */
+    double knobs [COLAMD_KNOBS],/* parameter settings for colamd */
+    int stats [COLAMD_STATS]	/* colamd output statistics and error codes */
+) ;
+
+int symamd				/* return (1) if OK, (0) otherwise */
+(
+    int n,				/* number of rows and columns of A */
+    int A [],				/* row indices of A */
+    int p [],				/* column pointers of A */
+    int perm [],			/* output permutation, size n_col+1 */
+    double knobs [COLAMD_KNOBS],	/* parameters (uses defaults if NULL) */
+    int stats [COLAMD_STATS],		/* output statistics and error codes */
+    void * (*allocate) (size_t, size_t),
+    					/* pointer to calloc (ANSI C) or */
+					/* mxCalloc (for MATLAB mexFunction) */
+    void (*release) (void *)
+    					/* pointer to free (ANSI C) or */
+    					/* mxFree (for MATLAB mexFunction) */
+) ;
+
+void colamd_report
+(
+    int stats [COLAMD_STATS]
+) ;
+
+void symamd_report
+(
+    int stats [COLAMD_STATS]
+) ;
+
+#endif /* COLAMD_H */
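The prototypes above are enough to drive the ordering directly. A minimal sketch (not part of this patch) that orders the columns of a small matrix held in compressed-column form, sizing the work array with COLAMD_RECOMMENDED; the matrix data and names are illustrative only:

    #include <stdio.h>
    #include <stdlib.h>
    #include "colamd.h"

    int main (void)
    {
        int n_row = 4, n_col = 4, nnz = 6;
        int p [5] = { 0, 2, 4, 5, 6 };        /* column pointers, p[0] == 0 */
        int rows [6] = { 0, 2, 1, 3, 2, 0 };  /* row indices, column by column */
        int stats [COLAMD_STATS];
        double knobs [COLAMD_KNOBS];
        int Alen = COLAMD_RECOMMENDED (nnz, n_row, n_col);
        int *A = (int *) malloc (Alen * sizeof (int));
        int k, ok;

        for (k = 0; k < nnz; k++) A [k] = rows [k];
        colamd_set_defaults (knobs);          /* default dense row/column knobs */
        ok = colamd (n_row, n_col, Alen, A, p, knobs, stats);
        if (!ok || stats [COLAMD_STATUS] < 0) { colamd_report (stats); return 1; }
        for (k = 0; k < n_col; k++)           /* p now holds the column ordering */
            printf ("position %d: column %d\n", k, p [k]);
        free (A);
        return 0;
    }

colamd overwrites both A (as workspace) and p (with the ordering), which is why the row indices are copied into the oversized array A rather than passed directly.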
diff --git a/SRC/dSchCompUdt-2Ddynamic.c b/SRC/dSchCompUdt-2Ddynamic.c
index 360861f..38b8f11 100644
--- a/SRC/dSchCompUdt-2Ddynamic.c
+++ b/SRC/dSchCompUdt-2Ddynamic.c
@@ -16,29 +16,46 @@ at the top-level directory.
  *        Uses 2D partitioning for the scatter phase.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 4.1) --
+ * -- Distributed SuperLU routine (version 5.2) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 1, 2014
  *
+ * Modified: September 14, 2017
+ *   - First gather U-panel, then depending on "ldu" (excluding leading zeros), 
+ *     gather only trailing columns of the L-panel corresponding to the nonzero
+ *     of U-rows.
+ *   - Padding zeros for nice dimensions of GEMM.
+ *
  */
 
 #define SCHEDULE_STRATEGY guided 
-double tt_start;
-double tt_end;
+
+/* 
+ * Buffers:
+ *     [ lookAhead_L_buff | Remain_L_buff ] : stores the gathered L-panel
+ *                                            (A matrix in C := A*B )
+ *     bigU : stores the U-panel (B matrix in C := A*B)
+ *     bigV : stores the block GEMM result (C matrix in C := A*B)
+ */
 
 if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
     int cum_nrow = 0; /* cumulative number of nonzero rows in L(:,k) */
     int temp_nbrow;   /* nonzero rows in current block L(i,k) */
     lptr  = lptr0;
     luptr = luptr0;
-    /**
+    int Lnbrow, Rnbrow; /* number of nonzero rows in look-ahead window,
+			   and remaining part.  */
+
+    /*******************************************************************
      * Separating L blocks into the top part within look-ahead window
      * and the remaining ones.
-     */
+     *******************************************************************/
+
      int lookAheadBlk=0, RemainBlk=0;
 
      tt_start = SuperLU_timer_();
 
+     /* Sherry -- can this loop be threaded?? */
      /* Loop through all blocks in L(:,k) to set up pointers to the start 
       * of each block in the data arrays.
       *   - lookAheadFullRow[i] := number of nonzero rows from block 0 to i
@@ -47,36 +64,36 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
       *   - (ditto Remain_Info[i])
       */
      for (int i = 0; i < nlb; ++i) {
-	 ib = lsub[lptr];            /* block number of L(i,k). */
+	 ib = lsub[lptr];            /* Block number of L(i,k). */
 	 temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
         
 	 int look_up_flag = 1; /* assume ib is outside look-up window */
-	 for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers ); ++j)
-	     {
-		 if(ib == perm_c_supno[j]) {
-		     look_up_flag=0; /* flag ib is within look-up window */
-                     break; /* Sherry -- can exit the loop?? */
+	 for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers );
+	      ++j) {
+		 if ( ib == perm_c_supno[j] ) {
+		     look_up_flag = 0; /* flag ib within look-up window */
+                     break;            /* Sherry -- can exit the loop?? */
                  }
-	     }
+	 }
 	 
-	 if( look_up_flag == 0 ) { /* ib is within look up window */
+	 if ( look_up_flag == 0 ) { /* ib is within look-up window */
 	     if (lookAheadBlk==0) {
 		 lookAheadFullRow[lookAheadBlk] = temp_nbrow;
 	     } else {
-		 lookAheadFullRow[lookAheadBlk] = temp_nbrow+lookAheadFullRow[lookAheadBlk-1];   
+		 lookAheadFullRow[lookAheadBlk] = 
+		     temp_nbrow + lookAheadFullRow[lookAheadBlk-1];   
 	     }
 	     lookAheadStRow[lookAheadBlk] = cum_nrow;
 	     lookAhead_lptr[lookAheadBlk] = lptr;
 	     lookAhead_ib[lookAheadBlk] = ib; 
 	     lookAheadBlk++;
-	 } else { /* ib is not in look up window */
-
-	     if (RemainBlk==0) {
+	 } else { /* ib is not in look-up window */
+	     if ( RemainBlk==0 ) {
 		 Remain_info[RemainBlk].FullRow = temp_nbrow;
 	     } else {
-		 Remain_info[RemainBlk].FullRow = temp_nbrow+Remain_info[RemainBlk-1].FullRow;   
+		 Remain_info[RemainBlk].FullRow = 
+		     temp_nbrow + Remain_info[RemainBlk-1].FullRow;   
 	     }
-
              RemainStRow[RemainBlk] = cum_nrow;
              // Remain_lptr[RemainBlk] = lptr;
 	     Remain_info[RemainBlk].lptr = lptr;
@@ -85,139 +102,105 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
 	     RemainBlk++;
 	 }
 	 
-         cum_nrow +=temp_nbrow;
+         cum_nrow += temp_nbrow;
 	 
 	 lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
 	 lptr += temp_nbrow;     /* Move to next block */
 	 luptr += temp_nbrow;
-     }  /* for i ... all blocks in L(:,k) */
+     }  /* for i ... set up pointers for all blocks in L(:,k) */
 
      lptr = lptr0;
      luptr = luptr0;
 
-     /* leading dimension of L buffer */
-#if 0
-     int LDlookAhead_LBuff = lookAheadFullRow[lookAheadBlk-1]; /* may go negative.*/
-#else /* Piyush fix */
-     int LDlookAhead_LBuff = lookAheadBlk==0? 0 :lookAheadFullRow[lookAheadBlk-1];
-#endif
-
-     /* Loop through the look-ahead blocks to copy Lval into the buffer */
-#ifdef __OPENMP
-     /* #pragma omp parallel for -- why not?? Sherry */
-#endif
-     for (int i = 0; i < lookAheadBlk; ++i) {
-	 int StRowDest  = 0;
-	 int temp_nbrow;
-	 if (i==0) {
-	     temp_nbrow = lookAheadFullRow[0];
-	 } else {
-	     StRowDest   = lookAheadFullRow[i-1];
-	     temp_nbrow  = lookAheadFullRow[i]-lookAheadFullRow[i-1];
-	 }
-	 
-	 int StRowSource=lookAheadStRow[i];
-	 
-	 /* Now copying the matrix*/
-	 // #pragma omp parallel for (gives slow down)
-	 for (int j = 0; j < knsupc; ++j) {
-	     memcpy(&lookAhead_L_buff[StRowDest+j*LDlookAhead_LBuff],
-		    &lusup[luptr+j*nsupr+StRowSource],
-		    temp_nbrow * sizeof(double) );
-	 }
-     }
-
-     int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
-
-    /* Loop through the remaining blocks to copy Lval into the buffer */
-#ifdef _OPENMP
-#pragma omp parallel for 
-#endif
-     for (int i = 0; i < RemainBlk; ++i) {
-	 int StRowDest  = 0;
-	 int temp_nbrow;
-         if (i==0)  {
-	     temp_nbrow = Remain_info[0].FullRow;
-	 } else  {
-	     StRowDest   = Remain_info[i-1].FullRow;
-	     temp_nbrow  = Remain_info[i].FullRow-Remain_info[i-1].FullRow;
-	 }
-
-	 int StRowSource=RemainStRow[i];
-
-	 /* Now copying the matrix*/
-	 // #pragma omp parallel for (gives slow down)
-	 for (int j = 0; j < knsupc; ++j) {
-	     // printf("StRowDest %d LDRemain_LBuff %d StRowSource %d \n", StRowDest ,LDRemain_LBuff ,StRowSource );
-	     memcpy(&Remain_L_buff[StRowDest+j*LDRemain_LBuff],
-		    &lusup[luptr+j*nsupr+StRowSource],
-                    temp_nbrow * sizeof(double) );
-	 }
-     } /* parallel for i ... */
-
-#if ( PRNTlevel>=1 )
-     tt_end = SuperLU_timer_();
-     GatherLTimer += tt_end - tt_start;
-#endif
-#if 0
-     LookAheadRowSepMOP  +=  2*knsupc*(lookAheadFullRow[lookAheadBlk-1]+Remain_info[RemainBlk-1].FullRow );
-#else
-     int_t lnbrow, rnbrow; /* number of nonzero rows in look-ahead window
-                              or remaining part.  */
-     lnbrow = lookAheadBlk==0 ? 0  : lookAheadFullRow[lookAheadBlk-1];
-     rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
-     nbrow = lnbrow + rnbrow; /* total number of rows in L */
+     /* leading dimension of L look-ahead buffer, same as Lnbrow */
+     //int LDlookAhead_LBuff = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
+     Lnbrow = lookAheadBlk==0 ? 0 : lookAheadFullRow[lookAheadBlk-1];
+     /* leading dimension of L remaining buffer, same as Rnbrow */
+     //int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+     Rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+     /* assert( cum_nrow == (LDlookAhead_LBuff + LDRemain_LBuff) );*/
+     /* Piyush fix */
+     //int LDlookAhead_LBuff = lookAheadBlk==0? 0 : lookAheadFullRow[lookAheadBlk-1];
+
+     nbrow = Lnbrow + Rnbrow; /* total number of rows in L */
      LookAheadRowSepMOP += 2*knsupc*(nbrow);
-#endif     
-     
-     /**********************
-      * Gather U blocks *
-      **********************/
 
+     /***********************************************
+      * Gather U blocks (AFTER LOOK-AHEAD WINDOW)   *
+      ***********************************************/
      tt_start = SuperLU_timer_();
-#if 0     
-     nbrow = lookAheadFullRow[lookAheadBlk-1]+Remain_info[RemainBlk-1].FullRow;
-#endif
 
      if ( nbrow > 0 ) { /* L(:,k) is not empty */
 	 /*
 	  * Counting U blocks
 	  */
-	 ncols = 0; /* total number of nonzero columns in U(k,:) */
-	 ldu   = 0;
-	 full  = 1; /* flag the U block is indeed 'full', containing segments
-	               of same length. No need padding 0 */
-	 int temp_ncols=0;
+	 ldu = 0; /* Calculate ldu for U(k,:) after look-ahead window. */
+	 ncols = 0; /* Total number of nonzero columns in U(k,:) */
+	 int temp_ncols = 0;
 
-         /* Loop through all blocks in U(k,:) to set up pointers to the start
+#if 0
+	 /* jj0 contains the look-ahead window that was updated in 
+	    dlook_ahead_update.c. Now the search can continue from that point,
+	    not to start from block 0. */
+	 iukp = iukp0; /* point to the first block in index[] */
+	 rukp = rukp0; /* point to the start of nzval[] */
+#else
+	 /* Save pointers at location right after look-ahead window
+	    for later restart. */
+	 iukp0 = iukp;
+	 rukp0 = rukp;
+#endif
+
+	 /* if ( iam==0 ) printf("--- k0 %d, k %d, jj0 %d, nub %d\n", k0, k, jj0, nub);*/
+	     
+         /* 
+	  * Loop through all blocks in U(k,:) to set up pointers to the start
           * of each block in the data arrays, store them in Ublock_info[j]
           * for block U(k,j).
   	  */
-	 for (j = jj0; j < nub; ++j) { /* jj0 was set to 0 */
+	 for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
 	     temp_ncols = 0;
+#if 0
+	     /* Sherry - can remove following call, since perm_u == Identity  */
 	     arrive_at_ublock(
 			      j, &iukp, &rukp, &jb, &ljb, &nsupc,
 			      iukp0, rukp0, usub, perm_u, xsup, grid
 			      );
+#else
+	     jb = usub[iukp];
+	     /* ljb = LBj (jb, grid);   Local block number of U(k,j). */
+	     nsupc = SuperSize(jb);
+	     iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
 	     Ublock_info[j].iukp = iukp;
 	     Ublock_info[j].rukp = rukp;
 	     Ublock_info[j].jb = jb;
-	     
+
+	     /* if ( iam==0 )
+		 printf("j %d: Ublock_info[j].iukp %d, Ublock_info[j].rukp %d,"
+			"Ublock_info[j].jb %d, nsupc %d\n", 
+			j, Ublock_info[j].iukp, Ublock_info[j].rukp,
+			Ublock_info[j].jb, nsupc); */
+
 	     /* Prepare to call GEMM. */
 	     jj = iukp;
-	     
 	     for (; jj < iukp+nsupc; ++jj) {
 		 segsize = klst - usub[jj];
 		 if ( segsize ) {
                     ++temp_ncols;
-                    if ( segsize != ldu ) full = 0; /* need padding 0 */
                     if ( segsize > ldu ) ldu = segsize;
 		 }
 	     }
 
 	     Ublock_info[j].full_u_cols = temp_ncols;
 	     ncols += temp_ncols;
-	 }
+#if 1	     
+	     /* Skip the nonzeros of block U(k,j);
+		move to block U(k,j+1) in nzval[] array.  */
+	     rukp += usub[iukp - 1];
+	     iukp += nsupc;
+#endif
+         } /* end for j ... compute ldu & ncols */
 
 	 /* Now doing prefix sum on full_u_cols.
 	  * After this, full_u_cols is the number of nonzero columns
@@ -227,101 +210,239 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
 	     Ublock_info[j].full_u_cols += Ublock_info[j-1].full_u_cols;
 	 }
             
+	 /* Padding zeros to make {m,n,k} multiple of vector length. */
+	 jj = 8; //n;
+	 if (gemm_padding > 0 && Rnbrow > jj && ncols > jj && ldu > jj) {
+	     gemm_m_pad = Rnbrow + (Rnbrow % GEMM_PADLEN);
+	     gemm_n_pad = ncols + (ncols % GEMM_PADLEN);
+	     //gemm_n_pad = ncols;
+	     //gemm_k_pad = ldu + (ldu % GEMM_PADLEN);
+	     gemm_k_pad = ldu;
+	     
+	     for (i = Rnbrow; i < gemm_m_pad; ++i)  // padding A matrix
+		 for (j = 0; j < gemm_k_pad; ++j)
+		     Remain_L_buff[i + j*gemm_m_pad] = zero;
+	     for (i = 0; i < Rnbrow; ++i)         
+		 for (j = ldu; j < gemm_k_pad; ++j)
+		     Remain_L_buff[i + j*gemm_m_pad] = zero;
+	     for (i = ldu; i < gemm_k_pad; ++i)     // padding B matrix
+		 for (j = 0; j < gemm_n_pad; ++j)
+		     bigU[i + j*gemm_k_pad] = zero;
+	     for (i = 0; i < ldu; ++i)
+		 for (j = ncols; j < gemm_n_pad; ++j)
+		     bigU[i + j*gemm_k_pad] = zero;
+	 } else {
+	     gemm_m_pad = Rnbrow;
+	     gemm_n_pad = ncols;
+	     gemm_k_pad = ldu;
+	 }
+     
 	 tempu = bigU; /* buffer the entire row block U(k,:) */
 
          /* Gather U(k,:) into buffer bigU[] to prepare for GEMM */
-#ifdef _OPENMP        
-#pragma omp parallel for private(j,iukp,rukp,tempu, jb, nsupc,ljb,segsize,\
-	lead_zero, jj, i) \
-        default (shared) schedule(SCHEDULE_STRATEGY)
+#ifdef _OPENMP
+#pragma omp parallel for firstprivate(iukp, rukp) \
+    private(j,tempu, jb, nsupc,ljb,segsize, lead_zero, jj, i) \
+    default (shared) schedule(SCHEDULE_STRATEGY)
 #endif
-        for (j = jj0; j < nub; ++j) { /* jj0 was set to 0 */
+        for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
 
-            if(j==jj0) tempu = bigU;
-            else tempu = bigU + ldu*Ublock_info[j-1].full_u_cols;
+            if (j==jj0) tempu = bigU;
+            //else tempu = bigU + ldu * Ublock_info[j-1].full_u_cols;
+            else tempu = bigU + gemm_k_pad * Ublock_info[j-1].full_u_cols;
 
-            /* == processing each of the remaining columns == */
+            /* == processing each of the remaining columns in parallel == */
+#if 0
+	    /* Sherry - can remove following call, since perm_u == Identity  */
             arrive_at_ublock(j, &iukp, &rukp, &jb, &ljb, &nsupc,
 			     iukp0, rukp0, usub,perm_u, xsup, grid);
-
-            /* Copy from U(k,:) to tempu[], padding zeros.  */            
+#else
+	    iukp = Ublock_info[j].iukp;
+	    rukp = Ublock_info[j].rukp;
+	    jb = Ublock_info[j].jb;
+	    nsupc = SuperSize (jb );
+#endif
+            /* Copy from U(k,j) to tempu[], padding zeros.  */            
             for (jj = iukp; jj < iukp+nsupc; ++jj) {
                 segsize = klst - usub[jj];
                 if ( segsize ) {
                     lead_zero = ldu - segsize;
                     for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
-                    tempu += lead_zero;
-                    for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i];
+		    //tempu += lead_zero;
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+		    for (i=0; i<segsize; ++i) tempu[i+lead_zero] = uval[rukp+i];
+
                     rukp += segsize;
-                    tempu += segsize;
+#if 0
+		    tempu += segsize;
+#else
+                    tempu += gemm_k_pad;
+#endif
                 }
-            }
+	    }
+#if 0
+	    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+#endif
+        }   /* parallel for j = jj0 .. nub */
+
+#if 0
+	if (ldu==0) printf("[%d] .. k0 %d, before updating: ldu %d, Lnbrow %d, Rnbrow %d, ncols %d\n",iam,k0,ldu,Lnbrow,Rnbrow, ncols);
+	fflush(stdout);
+#endif
+    }  /* end if (nbrow>0), end gather U blocks */
+
+    GatherUTimer += SuperLU_timer_() - tt_start;
+    GatherMOP += 2*ldu*ncols;
+    int jj_cpu = nub;       /* limit between CPU and GPU */
+    int thread_id;
+    /*tempv = bigV;*/
 
-            rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
 
-        }   /* parallel for j:jjj_st..jjj */
+    /**********************
+     * Gather L blocks    *
+     **********************/
+     tt_start = SuperLU_timer_();
 
-        tempu = bigU;  /* setting to the start of padded U(k,:) */
+     /* Loop through the look-ahead blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(j,jj,tempu,tempv) default (shared)
+#endif
+     for (int i = 0; i < lookAheadBlk; ++i) {
+	 int StRowDest, temp_nbrow;
+	 if ( i==0 ) {
+	     StRowDest = 0;
+	     temp_nbrow = lookAheadFullRow[0];
+	 } else {
+	     StRowDest   = lookAheadFullRow[i-1];
+	     temp_nbrow  = lookAheadFullRow[i]-lookAheadFullRow[i-1];
+	 }
+	 
+	 int StRowSource = lookAheadStRow[i];
+	 
+	 /* Now copying one block into L lookahead buffer */
+	 /* #pragma omp parallel for (gives slow down) */
+	 // for (int j = 0; j < knsupc; ++j) { 
+	 for (j = knsupc-ldu; j < knsupc; ++j) { /* skip leading columns
+						    corresponding to zero U rows */
+#if 1
+	     /* Better let compiler generate memcpy or vectorized code. */
+	     //tempu = &lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff];
+	     //tempu = &lookAhead_L_buff[StRowDest + j * Lnbrow];
+	     tempu = &lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow];
+	     tempv = &lusup[luptr+j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+	     for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+	     //memcpy(&lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff],
+	     memcpy(&lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow],
+		    &lusup[luptr+j*nsupr + StRowSource],
+		    temp_nbrow * sizeof(double) );
+#endif
+	 } /* end for j ... */
+     } /* parallel for i ... gather Lval blocks from lookahead window */
 
-    }  /* end if (nbrow>0) */
+     /* Loop through the remaining blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(i,j,jj,tempu,tempv) default (shared)	\
+    schedule(SCHEDULE_STRATEGY)
+#endif
+     for (i = 0; i < RemainBlk; ++i) {
+         int StRowDest, temp_nbrow;
+         if ( i==0 )  {
+	     StRowDest  = 0;
+	     temp_nbrow = Remain_info[0].FullRow;
+	 } else  {
+	     StRowDest   = Remain_info[i-1].FullRow;
+	     temp_nbrow  = Remain_info[i].FullRow - Remain_info[i-1].FullRow;
+	 }
 
-#if ( PRNTlevel>=1 )
-    GatherUTimer += SuperLU_timer_() - tt_start;
+	 int StRowSource = RemainStRow[i];
+
+	 /* Now copying a block into L remaining buffer */
+	 // #pragma omp parallel for (gives slow down)
+	 // for (int j = 0; j < knsupc; ++j) {
+	 for (j = knsupc-ldu; j < knsupc; ++j) {
+	     // printf("StRowDest %d Rnbrow %d StRowSource %d \n", StRowDest,Rnbrow ,StRowSource);
+#if 1
+	     /* Better let compiler generate memcpy or vectorized code. */
+	     //tempu = &Remain_L_buff[StRowDest + j*LDRemain_LBuff];
+	     //tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * Rnbrow];
+	     tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad];
+	     tempv = &lusup[luptr + j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
 #endif
-    GatherMOP += 2*ldu*ncols;
+	     for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+	     //memcpy(&Remain_L_buff[StRowDest + j*LDRemain_LBuff],
+	     memcpy(&Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad],
+		    &lusup[luptr+j*nsupr + StRowSource],
+                    temp_nbrow * sizeof(double) );
+#endif
+	 } /* end for j ... */
+     } /* parallel for i ... copy Lval into the remaining buffer */
 
-    int Lnbrow   = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
-    int Rnbrow   = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
-    int jj_cpu=nub;       /*limit between CPU and GPU */
-    int thread_id;
-    tempv = bigV;
+     tt_end = SuperLU_timer_();
+     GatherLTimer += tt_end - tt_start;
 
-    /**************************************
-     * Perform GEMM followed by Scatter *
-     **************************************/
 
-    if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
-        /* Perform a large GEMM call */
-        ncols = Ublock_info[nub-1].full_u_cols;
-        schur_flop_counter += 2 * (double)Lnbrow * (double)ldu * (double)ncols;
-        stat->ops[FACT]    += 2 * (double)Lnbrow * (double)ldu * (double)ncols;
+     /*************************************************************************
+      * Perform GEMM (look-ahead L part, and remain L part) followed by Scatter
+      *************************************************************************/
+     tempu = bigU;  /* setting to the start of padded U(k,:) */
+    
+     if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
+	 /***************************************************************
+	  * Updating blocks in look-ahead window of the LU(look-ahead-rows,:)
+	  ***************************************************************/
+
+	 /* Count flops for total GEMM calls */
+	 ncols = Ublock_info[nub-1].full_u_cols;
+ 	 flops_t flps = 2.0 * (flops_t)Lnbrow * ldu * ncols;
+	 LookAheadScatterMOP += 3 * Lnbrow * ncols; /* scatter-add */
+	 schur_flop_counter += flps;
+	 stat->ops[FACT]    += flps;
+	 LookAheadGEMMFlOp  += flps;
 
-        /***************************************************************
-         * Updating look-ahead blocks in both L and U look-ahead windows.
-         ***************************************************************/
 #ifdef _OPENMP
-#pragma omp parallel default (shared) private(thread_id,tt_start,tt_end)
-     {
- 	thread_id = omp_get_thread_num();
+#pragma omp parallel default (shared) private(thread_id)
+	 {
+	   thread_id = omp_get_thread_num();
  
- 	/* Ideally, should organize the loop as:
-                for (j = 0; j < nub; ++j) {
-                    for (lb = 0; lb < lookAheadBlk; ++lb) {
- 	               L(lb,k) X U(k,j) -> tempv[]
-                    }
-                }
- 	   But now, we use collapsed loop to achieve more parallelism.
- 	   Total number of block updates is:
- 	      (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
- 	*/
+	   /* Ideally, should organize the loop as:
+	      for (j = 0; j < nub; ++j) {
+	          for (lb = 0; lb < lookAheadBlk; ++lb) {
+	               L(lb,k) X U(k,j) -> tempv[]
+		  }
+	      }
+	      But now, we use collapsed loop to achieve more parallelism.
+	      Total number of block updates is:
+	      (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
+	   */
+
+	   int i = sizeof(int);
+	   int* indirect_thread    = indirect + (ldt + CACHELINE/i) * thread_id;
+	   int* indirect2_thread   = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
 #pragma omp for \
-    private (j,i,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    private (nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
     schedule(dynamic)
 #else /* not use _OPENMP */
- 	thread_id = 0;
+	   thread_id = 0;
+	   int* indirect_thread    = indirect;
+	   int* indirect2_thread   = indirect2;
 #endif
- 	/* Each thread is assigned one loop index ij, responsible for 
- 	   block update L(lb,k) * U(k,j) -> tempv[]. */
-        for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
-	    if ( thread_id == 0 ) tt_start = SuperLU_timer_();
-
-            int j   = ij/lookAheadBlk + jj0; /* jj0 was set to 0 */
+	   /* Each thread is assigned one loop index ij, responsible for 
+	      block update L(lb,k) * U(k,j) -> tempv[]. */
+	   for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
+	       /* jj0 starts after look-ahead window. */
+            int j   = ij/lookAheadBlk + jj0;
             int lb  = ij%lookAheadBlk;
 
-            int* indirect_thread    = indirect + ldt*thread_id;
-            int* indirect2_thread   = indirect2 + ldt*thread_id;
-            double* tempv1 = bigV + thread_id*ldt*ldt; 
-
             /* Getting U block U(k,j) information */
             /* unsigned long long ut_start, ut_end; */
             int_t rukp =  Ublock_info[j].rukp;
@@ -330,8 +451,8 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
             int nsupc = SuperSize(jb);
             int ljb = LBj (jb, grid);  /* destination column block */
             int st_col;
-            int ncols;
-            if ( j>jj0 ) { /* jj0 was set to 0 */
+            int ncols;  /* Local variable counts only columns in the block */
+            if ( j > jj0 ) { /* jj0 starts after look-ahead window. */
                 ncols  = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
                 st_col = Ublock_info[j-1].full_u_cols;
             } else {
@@ -346,7 +467,16 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
             lptr += LB_DESCRIPTOR;
             int cum_nrow = (lb==0 ? 0 : lookAheadFullRow[lb-1]);
 
+	    /* Block-by-block GEMM in look-ahead window */
+#if 0
+	    i = sizeof(double);
+	    double* tempv1 = bigV + thread_id * (ldt*ldt + CACHELINE/i);
+#else
+	    double* tempv1 = bigV + thread_id * (ldt*ldt);
+#endif
+
 #if ( PRNTlevel>= 1)
+	    if (thread_id == 0) tt_start = SuperLU_timer_();
 	    gemm_max_m = SUPERLU_MAX(gemm_max_m, temp_nbrow);
 	    gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
 	    gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
@@ -354,14 +484,17 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
 
 #if defined (USE_VENDOR_BLAS)            
             dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
-                  &lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
-                  &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+		   //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+		   &lookAhead_L_buff[cum_nrow], &Lnbrow,
+		   &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
 #else
             dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
-                  &lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
-                  &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+		   //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+		   &lookAhead_L_buff[cum_nrow], &Lnbrow,
+		   &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
 #endif
-#if ( PRNTlevel>=1 )
+
+#if (PRNTlevel>=1 )
 	    if (thread_id == 0) {
 		tt_end = SuperLU_timer_();
 		LookAheadGEMMTimer += tt_end - tt_start;
@@ -379,6 +512,11 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
 				 grid
 			        );
             } else {
+#if 0
+		//#ifdef USE_VTUNE
+	    __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+	    __itt_resume(); // start VTune, again use 2 underscores
+#endif
                 dscatter_l (
 				 ib, ljb, 
 				 nsupc, iukp, xsup,
@@ -389,137 +527,187 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
 				 Lrowind_bc_ptr, Lnzval_bc_ptr,
 				 grid
 				);
+#if 0
+		//#ifdef USE_VTUNE
+		__itt_pause(); // stop VTune
+		__SSC_MARK(0x222); // stop SDE tracing
+#endif
             }
 
 #if ( PRNTlevel>=1 )
-	    if (thread_id == 0)
+	    if (thread_id == 0) 
 		LookAheadScatterTimer += SuperLU_timer_() - tt_start;
 #endif
-        } /* end omp for ij = ... */
+	   } /* end omp for ij = ... */
+
 #ifdef _OPENMP
-    } /* end omp parallel */
+	 } /* end omp parallel */
 #endif
-        LookAheadGEMMFlOp  += 2*(double)Lnbrow * (double)ldu * (double)ncols;
-        stat->ops[FACT]    += 2*(double)Lnbrow * (double)ldu * (double)ncols;
-        LookAheadScatterMOP += 3*Lnbrow*ncols;
-    } /* end if Lnbrow < ... */
-    
+     } /* end if Lnbrow>0 ... look-ahead GEMM and scatter */
+
     /***************************************************************
      * Updating remaining rows and columns on CPU.
      ***************************************************************/
-    Rnbrow  = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
-    ncols   = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+    ncols = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+
+    if ( Rnbrow>0 && ldu>0 ) { /* There are still blocks remaining ... */
+	double flps = 2.0 * (double)Rnbrow * ldu * ncols;
+	schur_flop_counter  += flps;
+	stat->ops[FACT]     += flps;
 
-    schur_flop_counter  += 2 * (double)Rnbrow * (double)ldu * (double)ncols;
-    stat->ops[FACT]     += 2 * (double)Rnbrow * (double)ldu * (double)ncols;
+#if ( PRNTlevel>=1 )
+	RemainGEMM_flops += flps;
+	gemm_max_m = SUPERLU_MAX(gemm_max_m, Rnbrow);
+	gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
+	gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
+	tt_start = SuperLU_timer_();
+	/* printf("[%d] .. k0 %d, before large GEMM: %d-%d-%d, RemainBlk %d\n",
+	   iam, k0,Rnbrow,ldu,ncols,RemainBlk);  fflush(stdout);
+	assert( Rnbrow*ncols < bigv_size ); */
+#endif
+	/* calling aggregated large GEMM, result stored in bigV[]. */
+#if defined (USE_VENDOR_BLAS)
+	//dgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+	dgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+	       //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+	       &Remain_L_buff[0], &gemm_m_pad,
+	       &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad, 1, 1);
+#else
+	//dgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+	dgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+	       //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+	       &Remain_L_buff[0], &gemm_m_pad,
+	       &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad);
+#endif
 
+#if ( PRNTlevel>=1 )
+	tt_end = SuperLU_timer_();
+	RemainGEMMTimer += tt_end - tt_start;
+#if ( PROFlevel>=1 )
+	//fprintf(fgemm, "%8d%8d%8d %16.8e\n", Rnbrow, ncols, ldu,
+	// (tt_end - tt_start)*1e6); // time in microsecond
+	//fflush(fgemm);
+	gemm_stats[gemm_count].m = Rnbrow;
+	gemm_stats[gemm_count].n = ncols;
+	gemm_stats[gemm_count].k = ldu;
+	gemm_stats[gemm_count++].microseconds = (tt_end - tt_start) * 1e6;
+#endif
+	tt_start = SuperLU_timer_();
+#endif
+
+#ifdef USE_VTUNE
+	__SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+	__itt_resume(); // start VTune, again use 2 underscores
+#endif
+
+	/* Scatter into destination block-by-block. */
 #ifdef _OPENMP
-#pragma omp parallel default(shared) private(thread_id,tt_start,tt_end)
-    {
-	thread_id = omp_get_thread_num();
+#pragma omp parallel default(shared) private(thread_id)
+	{
+	    thread_id = omp_get_thread_num();
  
-	/* Ideally, should organize the loop as:
+	    /* Ideally, should organize the loop as:
                for (j = 0; j < jj_cpu; ++j) {
-                   for (lb = 0; lb < RemainBlk; ++lb) {
+	           for (lb = 0; lb < RemainBlk; ++lb) {
 	               L(lb,k) X U(k,j) -> tempv[]
                    }
                }
-	   But now, we use collapsed loop to achieve more parallelism.
-	   Total number of block updates is:
-	      (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
-	*/
+	       But now, we use collapsed loop to achieve more parallelism.
+	       Total number of block updates is:
+	       (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
+	    */
+
+	    int i = sizeof(int);
+	    int* indirect_thread = indirect + (ldt + CACHELINE/i) * thread_id;
+	    int* indirect2_thread = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
 #pragma omp for \
-    private (j,i,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    private (j,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
     schedule(dynamic)
 #else /* not use _OPENMP */
-    thread_id = 0;
-#endif
-	/* Each thread is assigned one loop index ij, responsible for 
-	   block update L(lb,k) * U(k,j) -> tempv[]. */
-    for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) { /* jj_cpu := nub */
-	int j   = ij / RemainBlk + jj0; 
-	int lb  = ij % RemainBlk;
-
-	int* indirect_thread = indirect + ldt*thread_id;
-	int* indirect2_thread = indirect2 + ldt*thread_id;
-	double* tempv1 = bigV + thread_id*ldt*ldt; 
-
-	/* Getting U block U(k,j) information */
-	/* unsigned long long ut_start, ut_end; */
-	int_t rukp =  Ublock_info[j].rukp;
-	int_t iukp =  Ublock_info[j].iukp;
-	int jb   =  Ublock_info[j].jb;
-	int nsupc = SuperSize(jb);
-	int ljb = LBj (jb, grid);
-	int st_col;
-	int ncols;
-	if ( j>jj0 ) {
-	    ncols  = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
-	    st_col = Ublock_info[j-1].full_u_cols;
-	} else {
-	    ncols  = Ublock_info[j].full_u_cols;
-	    st_col = 0;   
-	}
-
-	/* Getting L block L(i,k) information */
-	int_t lptr = Remain_info[lb].lptr;
-	int ib   = Remain_info[lb].ib;
-	int temp_nbrow = lsub[lptr+1];
-	lptr += LB_DESCRIPTOR;
-	int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
-
+	    thread_id = 0;
+	    int* indirect_thread = indirect;
+	    int* indirect2_thread = indirect2;
+#endif
+	    /* Each thread is assigned one loop index ij, responsible for 
+	       block update L(lb,k) * U(k,j) -> tempv[]. */
+	    for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) {
+		/* jj_cpu := nub, jj0 starts after look-ahead window. */
+		int j   = ij / RemainBlk + jj0; /* j-th block in U panel */
+		int lb  = ij % RemainBlk;       /* lb-th block in L panel */
+
+		/* Getting U block U(k,j) information */
+		/* unsigned long long ut_start, ut_end; */
+		int_t rukp =  Ublock_info[j].rukp;
+		int_t iukp =  Ublock_info[j].iukp;
+		int jb   =  Ublock_info[j].jb;
+		int nsupc = SuperSize(jb);
+		int ljb = LBj (jb, grid);
+		int st_col;
+		int ncols;
+		if ( j>jj0 ) {
+		    ncols = Ublock_info[j].full_u_cols - Ublock_info[j-1].full_u_cols;
+		    st_col = Ublock_info[j-1].full_u_cols;
+		} else {
+		    ncols = Ublock_info[j].full_u_cols;
+		    st_col = 0;   
+		}
+
+		/* Getting L block L(i,k) information */
+		int_t lptr = Remain_info[lb].lptr;
+		int ib   = Remain_info[lb].ib;
+		int temp_nbrow = lsub[lptr+1];
+		lptr += LB_DESCRIPTOR;
+		int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
+		
+		/* tempv1 points to block(i,j) in bigV : LDA == Rnbrow */
+		//double* tempv1 = bigV + (st_col * Rnbrow + cum_nrow); Sherry 
+		double* tempv1 = bigV + (st_col * gemm_m_pad + cum_nrow); /* Sherry */
+
+		// printf("[%d] .. before scatter: ib %d, jb %d, temp_nbrow %d, Rnbrow %d\n", iam, ib, jb, temp_nbrow, Rnbrow); fflush(stdout);
+
+		/* Now scattering the block */
+
+		if ( ib < jb ) {
+		    dscatter_u (
+				ib, jb,
+				nsupc, iukp, xsup,
+				//klst, Rnbrow, /*** klst, temp_nbrow, Sherry */
+				klst, gemm_m_pad, /*** klst, temp_nbrow, Sherry */
+				lptr, temp_nbrow, /* row dimension of the block */
+				lsub, usub, tempv1,
+				Ufstnz_br_ptr, Unzval_br_ptr,
+				grid
+				);
+		} else {
+		    dscatter_l(
+			       ib, ljb,
+			       nsupc, iukp, xsup,
+			       //klst, temp_nbrow, Sherry
+			       klst, gemm_m_pad, /*** temp_nbrow, Sherry */
+			       lptr, temp_nbrow, /* row dimension of the block */
+			       usub, lsub, tempv1,
+			       indirect_thread, indirect2_thread,
+			       Lrowind_bc_ptr,Lnzval_bc_ptr,
+			       grid
+			       );
+		}
+		
+	    } /* end omp for (int ij =...) */
+	    
+#ifdef _OPENMP
+	} /* end omp parallel region */
+#endif
+	
 #if ( PRNTlevel>=1 )
-	if ( thread_id==0 ) tt_start = SuperLU_timer_();
+	RemainScatterTimer += SuperLU_timer_() - tt_start;
 #endif
 
-	/* calling GEMM */
-#if defined (USE_VENDOR_BLAS)
-	dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
-	      &Remain_L_buff[(knsupc-ldu)*Rnbrow+cum_nrow], &Rnbrow,
-	      &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
-#else
-	dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
-	      &Remain_L_buff[(knsupc-ldu)*Rnbrow+cum_nrow], &Rnbrow,
-	      &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#ifdef USE_VTUNE
+	__itt_pause(); // stop VTune
+	__SSC_MARK(0x222); // stop SDE tracing
 #endif
 
-#if ( PRNTlevel>=1 )
-	if (thread_id==0) {
-	    tt_end = SuperLU_timer_();
-	    RemainGEMMTimer += tt_end - tt_start;
-	    tt_start = tt_end;
-	}
-#endif
-
-	/* Now scattering the block */
-	if ( ib<jb ) {
-	    dscatter_u(
-			    ib, jb,
-			    nsupc, iukp, xsup,
-			    klst, temp_nbrow,
-			    lptr, temp_nbrow,lsub,
-			    usub, tempv1,
-			    Ufstnz_br_ptr, Unzval_br_ptr,
-			    grid
-		           );
-	} else {
-	    dscatter_l(
-			    ib, ljb,
-			    nsupc, iukp, xsup,
-			    klst, temp_nbrow,
-			    lptr, temp_nbrow,
-			    usub, lsub, tempv1,
-			    indirect_thread, indirect2_thread,
-			    Lrowind_bc_ptr,Lnzval_bc_ptr,
-			    grid
-			   );
-	}
+    } /* end if Rnbrow>0 ... update remaining block */
 
-#if ( PRNTlevel>=1 )
-	if (thread_id==0) RemainScatterTimer += SuperLU_timer_() - tt_start;
-#endif
-    } /* end omp for (int ij =...) */
-#ifdef _OPENMP
-    } /* end omp parallel region */
-#endif
 }  /* end if L(:,k) and U(k,:) are not empty */
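The padding logic introduced above zero-fills the tails of Remain_L_buff (the A operand) and bigU (the B operand) so that one aggregated dgemm_ call can run on the padded leading dimensions gemm_m_pad / gemm_k_pad / gemm_n_pad without changing the numerical result. A self-contained sketch of the idea, with a plain triple loop standing in for the BLAS call and an illustrative round-up rule (the patch computes its own pad sizes from GEMM_PADLEN, and keeps k unpadded):

    #include <stdio.h>

    #define PADLEN 8                      /* stand-in for GEMM_PADLEN */

    /* Column-major C := A*B with explicit leading dimensions, standing in
       for the dgemm_("N","N",...) call made above. */
    static void naive_dgemm (int m, int n, int k,
                             const double *A, int lda,
                             const double *B, int ldb,
                             double *C, int ldc)
    {
        for (int j = 0; j < n; j++)
            for (int i = 0; i < m; i++) {
                double s = 0.0;
                for (int l = 0; l < k; l++) s += A[i + l*lda] * B[l + j*ldb];
                C[i + j*ldc] = s;
            }
    }

    int main (void)
    {
        int m = 5, n = 3, k = 2;                          /* true block sizes */
        int m_pad = ((m + PADLEN - 1) / PADLEN) * PADLEN; /* illustrative round-up */
        int k_pad = k;                                    /* k left unpadded here */

        double A[8*2] = {0}, B[2*3] = {0}, C[8*3] = {0};  /* zeros = padded tails */
        for (int l = 0; l < k; l++)
            for (int i = 0; i < m; i++) A[i + l*m_pad] = 1.0;
        for (int j = 0; j < n; j++)
            for (int l = 0; l < k; l++) B[l + j*k_pad] = 1.0;

        /* Rows m..m_pad-1 of A are zero, so the extra rows of C stay zero and
           the first m rows equal the unpadded product. */
        naive_dgemm (m_pad, n, k_pad, A, m_pad, B, k_pad, C, m_pad);
        printf ("C[0][0] = %g (expect %d)\n", C[0], k);
        return 0;
    }

The scatter phase then reads block (i,j) of the result at offset st_col*gemm_m_pad + cum_nrow in bigV, which is exactly the addressing used in the dscatter_u / dscatter_l calls above.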
diff --git a/SRC/dbinary_io.c b/SRC/dbinary_io.c
new file mode 100644
index 0000000..22714a7
--- /dev/null
+++ b/SRC/dbinary_io.c
@@ -0,0 +1,40 @@
+#include "superlu_ddefs.h"
+
+int
+dread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz, 
+	     double **nzval, int_t **rowind, int_t **colptr)
+{
+    size_t isize = sizeof(int_t), dsize = sizeof(double);
+    int nnz_read;
+    fread(n, isize, 1, fp);
+    fread(nnz, isize, 1, fp);
+    printf("fread n %d\tnnz %d\n", *n, *nnz);
+    *m = *n;
+    *colptr = intMalloc_dist(*n+1);
+    *rowind = intMalloc_dist(*nnz);
+    *nzval  = doubleMalloc_dist(*nnz);
+    fread(*colptr, isize, (size_t) (*n + 1), fp);
+    fread(*rowind, isize, (size_t) *nnz, fp);
+    nnz_read = fread(*nzval, dsize, (size_t) (*nnz), fp);
+    printf("# of doubles fread: %d\n", nnz_read);
+    fclose(fp);
+}
+
+int
+dwrite_binary(int_t n, int_t nnz,
+	      double *values, int_t *rowind, int_t *colptr)
+{       
+      FILE  *fp1;
+      int nnz_written;
+      size_t isize = sizeof(int_t), dsize = sizeof(double);
+      fp1 = fopen("/scratch/scratchdirs/xiaoye/temp.bin", "wb");
+      fwrite(&n, isize, 1, fp1);
+      fwrite(&nnz, isize, 1, fp1);
+      fwrite(colptr, isize, n+1, fp1);
+      fwrite(rowind, isize, nnz, fp1);
+      nnz_written = fwrite(values, dsize, nnz, fp1);
+      printf("n %d, # of double: %d\n", n, nnz);
+      printf("dump binary file ... # of double fwrite: %d\n", nnz_written);
+      assert(nnz_written==nnz);
+      fclose(fp1);
+}
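The two routines above stream a compressed-column matrix as raw binary in the order n, nnz, colptr[n+1], rowind[nnz], nzval[nnz]; dwrite_binary writes to a path that is hard-coded in this version. A minimal sketch of reading such a file back; the file name, function name load_csc, and the extern declaration are assumptions (the patch adds no header for dbinary_io.c), and dread_binary closes the stream itself:

    #include <stdio.h>
    #include "superlu_ddefs.h"

    extern int dread_binary (FILE *, int_t *, int_t *, int_t *,
                             double **, int_t **, int_t **);

    int load_csc (const char *fname)
    {
        FILE *fp = fopen (fname, "rb");
        int_t m, n, nnz, *rowind, *colptr;
        double *nzval;

        if ( !fp ) return -1;
        dread_binary (fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
        /* ... build a SuperMatrix from (nzval, rowind, colptr), then release
           the three arrays with SUPERLU_FREE when done ... */
        return 0;
    }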
diff --git a/SRC/dlook_ahead_update.c b/SRC/dlook_ahead_update.c
index 7521506..a9f53b1 100644
--- a/SRC/dlook_ahead_update.c
+++ b/SRC/dlook_ahead_update.c
@@ -15,11 +15,17 @@ at the top-level directory.
  * \brief Look-ahead update of the Schur complement.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 1, 2014
  *
+ * Modified: September 18, 2017
+ *   
  */
+
+iukp = iukp0; /* point to the first block in index[] */
+rukp = rukp0; /* point to the start of nzval[] */
+
 #ifdef ISORT
 while (j < nub && iperm_u[j] <= k0 + num_look_aheads)
 #else
@@ -28,6 +34,8 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
 {
     double zero = 0.0;
 
+#if 0 // Sherry: no need to search
+    /* Caveat: There is a permutation perm_u involved for j  */
     /* Search along the row for the pointers {iukp, rukp} pointing to
      * block U(k,j).
      * j    -- current block in look-ahead window, initialized to 0 on entry
@@ -39,6 +47,13 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
 		     j, &iukp, &rukp, &jb, &ljb, &nsupc,
          	     iukp0, rukp0, usub, perm_u, xsup, grid
 		    );
+#else
+    jb = usub[iukp];
+    ljb = LBj (jb, grid);     /* Local block number of U(k,j). */
+    nsupc = SuperSize(jb);
+    iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
+
     j++;
     jj0++;
     jj = iukp;
@@ -47,48 +62,47 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
 
     ldu = klst - usub[jj++];
     ncols = 1;
-    full = 1; /* flag the U block is indeed 'full', containing segments
-                 of same length. No need padding 0.  */
+
+    /* This loop computes ldu. */
     for (; jj < iukp + nsupc; ++jj) { /* for each column jj in block U(k,j) */
         segsize = klst - usub[jj];
         if (segsize) {
             ++ncols;
-            if (segsize != ldu) full = 0; /* need padding 0 */
             if (segsize > ldu)  ldu = segsize;
         }
     }
 #if ( DEBUGlevel>=3 )
     ++num_update;
 #endif
-    if (0) {
-        tempu = &uval[rukp];
-    }
-    else { /* Copy block U(k,j) into tempU2d, padding zeros. */
+
 #if ( DEBUGlevel>=3 )
-        printf ("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
-                iam, full, k, jb, ldu, ncols, nsupc);
-        ++num_copy;
+    printf ("(%d) k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+	    iam, k, jb, ldu, ncols, nsupc);
+    ++num_copy;
 #endif
-        tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
-        for (jj = iukp; jj < iukp + nsupc; ++jj) {
-            segsize = klst - usub[jj];
-            if (segsize) {
-                lead_zero = ldu - segsize;
-                for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
-                tempu += lead_zero;
-                for (i = 0; i < segsize; ++i) {
-                    tempu[i] = uval[rukp + i];
-                }
-                rukp += segsize;
-                tempu += segsize;
+
+    /* Now copy one block U(k,j) to bigU for GEMM, padding zeros up to ldu. */
+    tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
+    for (jj = iukp; jj < iukp + nsupc; ++jj) {
+        segsize = klst - usub[jj];
+        if (segsize) {
+            lead_zero = ldu - segsize;
+            for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+            tempu += lead_zero;
+            for (i = 0; i < segsize; ++i) {
+                tempu[i] = uval[rukp + i];
             }
+            rukp += segsize;
+            tempu += segsize;
         }
-        tempu = bigU;
-        rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
-    } /* if full ... */
+    }
+    tempu = bigU; /* set back to the beginning of the buffer */
+#if 0
+    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+#endif
 
     nbrow = lsub[1]; /* number of row subscripts in L(:,k) */
-    if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows */
+    if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows. */
     // double ttx =SuperLU_timer_();
 
     int current_b = 0; /* Each thread starts searching from first block.
@@ -99,9 +113,9 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
 #ifdef _OPENMP
     /* Sherry -- examine all the shared variables ??
        'firstprivate' ensures that the private variables are initialized
-       to the values before entering the loop  */
+       to the values before entering the loop.  */
 #pragma omp parallel for \
-    firstprivate(lptr,luptr,ib,tempv,current_b) private(lb) \
+    firstprivate(lptr,luptr,ib,current_b) private(lb) \
     default(shared) schedule(dynamic)
 #endif
     for (lb = 0; lb < nlb; lb++) { /* Loop through each block in L(:,k) */
@@ -134,7 +148,10 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
 
         lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
 
+	/*if (thread_id == 0) tt_start = SuperLU_timer_();*/
+
         /* calling gemm */
+	stat->ops[FACT] += 2.0 * (flops_t)temp_nbrow * ldu * ncols;
 #if defined (USE_VENDOR_BLAS)
         dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
                    &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
@@ -145,7 +162,14 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
                    tempu, &ldu, &beta, tempv, &temp_nbrow );
 #endif
 
-        /* Now scattering the output*/
+#if 0
+	if (thread_id == 0) {
+	    tt_end = SuperLU_timer_();
+	    LookAheadGEMMTimer += tt_end - tt_start;
+	    tt_start = tt_end;
+	} 
+#endif
+        /* Now scattering the output. */
         if (ib < jb) {    /* A(i,j) is in U. */
             dscatter_u (ib, jb,
                        nsupc, iukp, xsup,
@@ -159,14 +183,22 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
                        Lrowind_bc_ptr, Lnzval_bc_ptr, grid);
         }
 
-        ++current_b;         /* move to next block */
+        ++current_b;         /* Move to next block. */
         lptr += temp_nbrow;
         luptr += temp_nbrow;
 
+#if 0
+	if (thread_id == 0) {
+	    tt_end = SuperLU_timer_();
+	    LookAheadScatterTimer += tt_end - tt_start;
+	}
+#endif
     } /* end parallel for lb = 0, nlb ... all blocks in L(:,k) */
 
-    rukp += usub[iukp - 1]; /* Move to next U block, U(k,j+1) */
-    iukp += nsupc;
+#if 0
+    rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
+#endif
+    iukp += nsupc; /* Move to block U(k,j+1) */
 
     /* =========================================== *
      * == factorize L(:,j) and send if possible == *
@@ -187,17 +219,14 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
         /* Factor diagonal and subdiagonal blocks and test for exact
            singularity.  */
         factored[kk] = 0;
-        /* double ttt1 = SuperLU_timer_(); */
-#if ( VAMPIR>=1 )
-        VT_begin (5);
-#endif
+
+        double tt1 = SuperLU_timer_();
 
         PDGSTRF2(options, kk0, kk, thresh, Glu_persist, grid, Llu,
                   U_diag_blk_send_req, tag_ub, stat, info);
 
-#if ( VAMPIR>=1 )
-        VT_end (5);
-#endif
+        pdgstrf2_timer += SuperLU_timer_() - tt1; 
+
         /* stat->time7 += SuperLU_timer_() - ttt1; */
 
         /* Multicasts numeric values of L(:,kk) to process rows. */
@@ -221,18 +250,12 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
 #if ( PROFlevel>=1 )
                 TIC (t1);
 #endif
-#if ( VAMPIR>=1 )
-                VT_begin (1);
-#endif
                 MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
                            SLU_MPI_TAG (0, kk0) /* (4*kk0)%tag_ub */ ,
                            scp->comm, &send_req[pj]);
                 MPI_Isend (lusup1, msgcnt[1], MPI_DOUBLE, pj,
                            SLU_MPI_TAG (1, kk0) /* (4*kk0+1)%tag_ub */ ,
                            scp->comm, &send_req[pj + Pc]);
-#if ( VAMPIR>=1 )
-                VT_end (1);
-#endif
 #if ( PROFlevel>=1 )
                 TOC (t2, t1);
                 stat->utime[COMM] += t2;
diff --git a/SRC/dmemory_dist.c b/SRC/dmemory_dist.c
index 8f9e7a2..47f541f 100644
--- a/SRC/dmemory_dist.c
+++ b/SRC/dmemory_dist.c
@@ -129,10 +129,13 @@ int_t dQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
     mem_usage->total += (float)(2 * k * iword);
 #else
     /*mem_usage->total += stat->current_buffer;*/
-    printf(".. dQuery_Space: peak_buffer %.2f (MB)\n", stat->peak_buffer * 1.0e-6);
     mem_usage->total += stat->peak_buffer;
-#endif
 
+#if ( PRNTlevel>=1 )
+    if (iam==0) printf(".. dQuerySpace: peak_buffer %.2f (MB)\n",
+                       stat->peak_buffer * 1.0e-6);
+#endif
+#endif
     return 0;
 } /* dQuerySpace_dist */
 
diff --git a/SRC/dreadMM.c b/SRC/dreadMM.c
index 9ddc538..f7e0a2e 100644
--- a/SRC/dreadMM.c
+++ b/SRC/dreadMM.c
@@ -17,6 +17,7 @@ at the top-level directory.
  *
  */
 #include <ctype.h>
+#include <stdio.h>
 #include "superlu_ddefs.h"
 
 #undef EXPAND_SYM
@@ -43,6 +44,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     int_t    zero_base = 0;
     char *p, line[512], banner[64], mtx[64], crd[64], arith[64], sym[64];
     int expand;
+    char *cs;
 
     /* 	File format:
      *    %%MatrixMarket matrix coordinate real general/symmetric/...
@@ -54,7 +56,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
      */
 
      /* 1/ read header */ 
-     fgets(line,512,fp);
+     cs = fgets(line,512,fp);
      for (p=line; *p!='\0'; *p=tolower(*p),p++);
 
      if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) {
@@ -100,7 +102,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      /* 2/ Skip comments */
      while(banner[0]=='%') {
-       fgets(line,512,fp);
+       cs = fgets(line,512,fp);
        sscanf(line,"%s",banner);
      }
 
@@ -123,16 +125,17 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
     *m = *n;
     printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz);
+    fflush(stdout);
     dallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
     a    = *nzval;
     asub = *rowind;
     xa   = *colptr;
 
-    if ( !(val = (double *) SUPERLU_MALLOC(new_nonz * sizeof(double))) )
+    if ( !(val = doubleMalloc_dist(new_nonz)) )
         ABORT("Malloc fails for val[]");
-    if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+    if ( !(row = (int_t *) intMalloc_dist(new_nonz)) )
         ABORT("Malloc fails for row[]");
-    if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+    if ( !(col = (int_t *) intMalloc_dist(new_nonz)) )
         ABORT("Malloc fails for col[]");
 
     for (j = 0; j < *n; ++j) xa[j] = 0;
@@ -140,17 +143,19 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     /* 4/ Read triplets of values */
     for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
 #ifdef _LONGINT
-	fscanf(fp, "%lld%lld%lf\n", &row[nz], &col[nz], &val[nz]);
+	j = fscanf(fp, "%lld%lld%lf\n", &row[nz], &col[nz], &val[nz]);
 #else
-	fscanf(fp, "%d%d%lf\n", &row[nz], &col[nz], &val[nz]);
+	j = fscanf(fp, "%d%d%lf\n", &row[nz], &col[nz], &val[nz]);
 #endif
 
-	if ( nnz == 0 ) /* first nonzero */
+	if ( nnz == 0 ) /* first nonzero */ {
 	    if ( row[0] == 0 || col[0] == 0 ) {
 		zero_base = 1;
 		printf("triplet file: row/col indices are zero-based.\n");
 	    } else
 		printf("triplet file: row/col indices are one-based.\n");
+	    fflush(stdout);
+	}
 
 	if ( !zero_base ) {
 	    /* Change to 0-based indexing. */
@@ -181,6 +186,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     *nonz = nz;
     if(expand) {
       printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz);
+      fflush(stdout);
     }
     
 
@@ -234,10 +240,8 @@ static void dreadrhs(int m, double *b)
 	exit(-1);
     }
     for (i = 0; i < m; ++i)
-      fscanf(fp, "%lf\n", &b[i]);
+      i = fscanf(fp, "%lf\n", &b[i]);
       /*fscanf(fp, "%d%lf\n", &j, &b[i]);*/
     /*        readpair_(j, &b[i]);*/
     fclose(fp);
 }
-
-
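
The dreadMM_dist hunks above check the return values of fgets/fscanf, allocate the triplet buffers with doubleMalloc_dist/intMalloc_dist, and flush stdout after informational prints; the parsing logic itself is unchanged. A hedged, self-contained sketch of the 0-based/1-based index handling the routine performs (function and variable names here are illustrative, not the library's):

    #include <stdio.h>

    /* Read nnz (row, col, value) triplets and normalize indices to 0-based.
     * Returns 0 on success, -1 on a short or malformed read. */
    static int read_triplets(FILE *fp, long nnz, int *row, int *col, double *val)
    {
        int zero_base = 0;
        for (long k = 0; k < nnz; ++k) {
            if (fscanf(fp, "%d%d%lf", &row[k], &col[k], &val[k]) != 3)
                return -1;
            if (k == 0)                         /* decide the base from the first triplet */
                zero_base = (row[0] == 0 || col[0] == 0);
            if (!zero_base) { --row[k]; --col[k]; }  /* convert 1-based input to 0-based */
        }
        return 0;
    }
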
diff --git a/SRC/dscatter.c b/SRC/dscatter.c
index af18ea8..00adbdf 100644
--- a/SRC/dscatter.c
+++ b/SRC/dscatter.c
@@ -14,10 +14,13 @@ at the top-level directory.
  * \brief Scatter the computed blocks into LU destination.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 1, 2014
  *
+ * Modified: 
+ *   September 18, 2017, enable SIMD vectorized scatter operation.
+ *   
  */
 #include <math.h>
 #include "superlu_ddefs.h"
@@ -112,9 +115,9 @@ dscatter_l (
            int_t iukp, /* point to destination supernode's index[] */
            int_t* xsup,
            int klst,
-           int nbrow,
+           int nbrow,  /* LDA of the block in tempv[] */
            int_t lptr, /* Input, point to index[] location of block L(i,k) */
-	   int temp_nbrow, /* number of rows in block L(i,k) */
+	   int temp_nbrow, /* number of rows of source block L(i,k) */
            int_t* usub,
            int_t* lsub,
            double *tempv,
@@ -126,7 +129,7 @@ dscatter_l (
     int_t rel, i, segsize, jj;
     double *nzval;
     int_t *index = Lrowind_bc_ptr[ljb];
-    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t ldv = index[1];       /* LDA of the destination lusup. */
     int_t lptrj = BC_HEADER;
     int_t luptrj = 0;
     int_t ijb = index[lptrj];
@@ -139,36 +142,43 @@ dscatter_l (
     }
     
     /*
-     * Build indirect table. This is needed because the
-     * indices are not sorted for the L blocks.
+     * Build indirect table. This is needed because the indices are not sorted
+     * in the L blocks.
      */
     int_t fnz = FstBlockC (ib);
     int_t dest_nbrow; 
     lptrj += LB_DESCRIPTOR;
     dest_nbrow=index[lptrj - 1];
     
-    for (i = 0; i < dest_nbrow; ++i)
-    {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+    for (i = 0; i < dest_nbrow; ++i) {
         rel = index[lptrj + i] - fnz;
         indirect_thread[rel] = i;
 
     }
 
-    /* can be precalculated */
-    for (i = 0; i < temp_nbrow; ++i)
-    {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+    /* can be precalculated? */
+    for (i = 0; i < temp_nbrow; ++i) { /* Source index is a subset of dest. */
         rel = lsub[lptr + i] - fnz;
         indirect2[i] =indirect_thread[rel]; 
     }
 
-    nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Dest. block L(i,j) */
-    for (jj = 0; jj < nsupc; ++jj)
-    {
+    nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Destination block L(i,j) */
+#ifdef __INTEL_COMPILER
+#pragma ivdep
+#endif
+    for (jj = 0; jj < nsupc; ++jj) {
         segsize = klst - usub[iukp + jj];
-        if (segsize)
-        {
-            for (i = 0; i < temp_nbrow; ++i)
-            {
+        if (segsize) {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+            for (i = 0; i < temp_nbrow; ++i) {
                 nzval[indirect2[i]] -= tempv[i];
             }
             tempv += nbrow;
@@ -186,9 +196,9 @@ dscatter_u (int ib,
            int_t iukp,
            int_t * xsup,
            int klst,
-           int nbrow,
-           int_t lptr,
-           int temp_nbrow,
+ 	   int nbrow,      /* LDA of the block in tempv[] */
+           int_t lptr,     /* point to index location of block L(i,k) */
+	   int temp_nbrow, /* number of rows of source block L(i,k) */
            int_t* lsub,
            int_t* usub,
            double* tempv,
@@ -208,8 +218,8 @@ dscatter_u (int ib,
     int_t lib = LBi (ib, grid);
     int_t *index = Ufstnz_br_ptr[lib];
 
-    /* Reinitilize the pointers to the begining of the 
-     * k-th column/row of L/U factors.
+    /* Reinitialize the pointers to the beginning of the k-th column/row of
+     * L/U factors.
      * usub[] - index array for panel U(k,:)
      */
     int_t iuip_lib, ruip_lib;
@@ -217,38 +227,32 @@ dscatter_u (int ib,
     ruip_lib = 0;
 
     int_t ijb = index[iuip_lib];
-    while (ijb < jb)            /* Search for dest block. */
-    {
+    while (ijb < jb) {   /* Search for destination block. */
         ruip_lib += index[iuip_lib + 1];
         // printf("supersize[%ld] \t:%ld \n",ijb,SuperSize( ijb ) );
         iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
         ijb = index[iuip_lib];
     }
-    /* Skip descriptor.  Now point to fstnz index of
-       block U(i,j). */
+    /* Skip descriptor. Now point to fstnz index of block U(i,j). */
     iuip_lib += UB_DESCRIPTOR;
 
     // tempv = bigV + (cum_nrow + cum_ncol*nbrow);
-    for (jj = 0; jj < nsupc; ++jj)
-    {
+    for (jj = 0; jj < nsupc; ++jj) {
         segsize = klst - usub[iukp + jj];
         fnz = index[iuip_lib++];
-        if (segsize)            /* Nonzero segment in U(k.j). */
-        {
+        if (segsize) {          /* Nonzero segment in U(k,j). */
             ucol = &Unzval_br_ptr[lib][ruip_lib];
 
             // printf("========Entering loop=========\n");
-            for (i = 0; i < temp_nbrow; ++i)
-            {
-
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+            for (i = 0; i < temp_nbrow; ++i) {
                 rel = lsub[lptr + i] - fnz;
                 // printf("%d %d %d %d %d \n",lptr,i,fnz,temp_nbrow,nbrow );
                 // printf("hello   ucol[%d] %d %d : \n",rel,lsub[lptr + i],fnz);
-
                 ucol[rel] -= tempv[i];
 
-                // printf("hello\n");
-
 #ifdef PI_DEBUG
                 double zz = 0.0;
                 if (!(*(long *) &zz == *(long *) &tempv[i]))
@@ -256,15 +260,16 @@ dscatter_u (int ib,
                             ucol[rel]);
                 //printing triplets (location??, old value, new value ) if none of them is zero
 #endif
-            }                   /* for i=0..temp_nbropw */
-            tempv += nbrow;
+            } /* for i = 0:temp_nbrow */
+            tempv += nbrow; /* Advance by LDA to the next column */
 #ifdef PI_DEBUG
             // printf("\n");
 #endif
-        }                       /*ig segsize */
+        }  /* if segsize */
+
         ruip_lib += ilst - fnz;
 
-    }                           /*for jj=0:nsupc */
+    }  /* for jj = 0:nsupc */
 #ifdef PI_DEBUG
     // printf("\n");
 #endif
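
The dscatter_l/dscatter_u changes above annotate the inner loops with #pragma omp simd (and #pragma ivdep for the Intel compiler) so the indirect scatter can be vectorized: an indirect row map is built once per destination block, then every update column is subtracted at the remapped rows. A minimal sketch of that pattern with made-up names (not the library's L/U data structures):

    /* Scatter-subtract a dense nrow-by-ncol update block into a destination block.
     * indirect[i] maps row i of the update to a row of the destination. */
    static void scatter_block(int nrow, int ncol, int ldu, int ldd,
                              const int *indirect, const double *update, double *dest)
    {
        for (int j = 0; j < ncol; ++j) {
    #if (_OPENMP >= 201307)
    #pragma omp simd
    #endif
            for (int i = 0; i < nrow; ++i)
                dest[j * ldd + indirect[i]] -= update[j * ldu + i];
        }
    }
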
diff --git a/SRC/get_perm_c.c b/SRC/get_perm_c.c
index 14b208d..4353ca4 100644
--- a/SRC/get_perm_c.c
+++ b/SRC/get_perm_c.c
@@ -23,6 +23,7 @@ at the top-level directory.
  */
 
 #include "superlu_ddefs.h"
+#include "colamd.h"
 
 
 void
@@ -102,6 +103,39 @@ get_metis(
     SUPERLU_FREE(perm);
 }
 
+void
+get_colamd_dist(
+	   const int m,  /* number of rows in matrix A. */
+	   const int n,  /* number of columns in matrix A. */
+	   const int nnz,/* number of nonzeros in matrix A. */
+	   int_t *colptr,  /* column pointer of size n+1 for matrix A. */
+	   int_t *rowind,  /* row indices of size nnz for matrix A. */
+	   int_t *perm_c   /* out - the column permutation vector. */
+	   )
+{
+    int Alen, *A, i, info, *p;
+    double knobs[COLAMD_KNOBS];
+    int stats[COLAMD_STATS];
+
+    Alen = colamd_recommended(nnz, m, n);
+
+    colamd_set_defaults(knobs);
+
+    if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) )
+        ABORT("Malloc fails for A[]");
+    if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) )
+        ABORT("Malloc fails for p[]");
+    for (i = 0; i <= n; ++i) p[i] = colptr[i];
+    for (i = 0; i < nnz; ++i) A[i] = rowind[i];
+    info = colamd(m, n, Alen, A, p, knobs, stats);
+    if ( info == FALSE ) ABORT("COLAMD failed");
+
+    for (i = 0; i < n; ++i) perm_c[p[i]] = i;
+
+    SUPERLU_FREE(A);
+    SUPERLU_FREE(p);
+}
+
 /*! \brief
  *
  * <pre>
@@ -472,6 +506,13 @@ get_perm_c_dist(int_t pnum, int_t ispec, SuperMatrix *A, int_t *perm_c)
 #endif
 	      break;
 
+        case (COLAMD): /* Approximate minimum degree column ordering. */
+	      get_colamd_dist(m, n, Astore->nnz, Astore->colptr, Astore->rowind,
+			      perm_c);
+#if ( PRNTlevel>=1 )
+	      printf(".. Use approximate minimum degree column ordering.\n");
+#endif
+	      return;
         case METIS_AT_PLUS_A: /* METIS ordering on A'+A */
 	      if ( m != n ) ABORT("Matrix is not square");
 	      at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind,
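
The new get_colamd_dist() above follows the usual COLAMD calling sequence: size the workspace with colamd_recommended(), initialize the knobs with colamd_set_defaults(), run colamd() on copies of the column pointers and row indices (COLAMD overwrites both), then invert the returned order, since colamd() leaves the new column ordering in p[] while SuperLU stores the inverse map, perm_c[old] = new. A tiny runnable illustration of that inversion step, using a made-up 4-column ordering:

    #include <stdio.h>

    int main(void)
    {
        /* colamd() returns p[k] = ORIGINAL index of the column placed k-th.
         * SuperLU's perm_c is the inverse map: perm_c[old] = new. */
        int p[4] = {2, 0, 3, 1};   /* hypothetical COLAMD output */
        int perm_c[4];
        for (int k = 0; k < 4; ++k) perm_c[p[k]] = k;
        for (int k = 0; k < 4; ++k) printf("perm_c[%d] = %d\n", k, perm_c[k]);
        return 0;                  /* prints 1, 3, 0, 2 */
    }
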
diff --git a/SRC/mc64ad_dist.c b/SRC/mc64ad_dist.c
deleted file mode 100644
index bf722fd..0000000
--- a/SRC/mc64ad_dist.c
+++ /dev/null
@@ -1,2654 +0,0 @@
-/* mc64ad.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-	on Microsoft Windows system, link with libf2c.lib;
-	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-	or, if you install libf2c.a in a standard place, with -lf2c -lm
-	-- in that order, at the end of the command line, as in
-		cc *.o -lf2c -lm
-	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-		http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "superlu_ddefs.h"
-
-#define abs(x) ((x) >= 0 ? (x) : -(x))
-#define min(a,b) ((a) < (b)) ? (a) : (b)
-
-/* Table of constant values */
-
-static int_t c__1 = 1;
-static int_t c__2 = 2;
-
-/*! @file 
- * \brief Permute large entries to the main diagonal
- */
-/* CCCC COPYRIGHT (c) 1999  Council for the Central Laboratory of the */
-/* CCCC Research Councils.    All rights reserved. */
-/* CCCC PACKAGE MC64A/AD */
-/* CCCC AUTHORS Iain Duff (i.duff at rl.ac.uk) and Jacko Koster (jak at ii.uib.no) */
-/* CCCC LAST UPDATE 20/09/99 */
-/* CCCC */
-/* *** Conditions on external use *** */
-
-/* The user shall acknowledge the contribution of this */
-/* package in any publication of material dependent upon the use of */
-/* the package. The user shall use reasonable endeavours to notify */
-/* the authors of the package of this publication. */
-
-/* The user can modify this code but, at no time */
-/* shall the right or title to all or any part of this package pass */
-/* to the user. The user shall make available free of charge */
-/* to the authors for any purpose all information relating to any */
-/* alteration or addition made to this package for the purposes of */
-/* extending the capabilities or enhancing the performance of this */
-/* package. */
-
-/* The user shall not pass this code directly to a third party without the */
-/* express prior consent of the authors.  Users wanting to licence their */
-/* own copy of these routines should send email to hsl at aeat.co.uk */
-
-/* None of the comments from the Copyright notice up to and including this */
-/* one shall be removed or altered in any way. */
-/* ********************************************************************** */
-/* </pre>
- */
-
-/* Subroutine */ int_t mc64id_dist(int_t *icntl)
-{
-    int_t i__;
-
-
-/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
-/*     Research Councils                                             *** */
-/* *** Although every effort has been made to ensure robustness and  *** */
-/* *** reliability of the subroutines in this MC64 suite, we         *** */
-/* *** disclaim any liability arising through the use or misuse of   *** */
-/* *** any of the subroutines.                                       *** */
-/* *** Any problems?   Contact ... */
-/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
-
-/*  Purpose */
-/*  ======= */
-
-/*  The components of the array ICNTL control the action of MC64A/AD. */
-/*  Default values for these are set in this subroutine. */
-
-/*  Parameters */
-/*  ========== */
-
-
-/*  Local variables */
-
-/*    ICNTL(1) has default value 6. */
-/*     It is the output stream for error messages. If it */
-/*     is negative, these messages will be suppressed. */
-
-/*    ICNTL(2) has default value 6. */
-/*     It is the output stream for warning messages. */
-/*     If it is negative, these messages are suppressed. */
-
-/*    ICNTL(3) has default value -1. */
-/*     It is the output stream for monitoring printing. */
-/*     If it is negative, these messages are suppressed. */
-
-/*    ICNTL(4) has default value 0. */
-/*     If left at the defaut value, the incoming data is checked for */
-/*     out-of-range indices and duplicates.  Setting ICNTL(4) to any */
-/*     other will avoid the checks but is likely to cause problems */
-/*     later if out-of-range indices or duplicates are present. */
-/*     The user should only set ICNTL(4) non-zero, if the data is */
-/*     known to avoid these problems. */
-
-/*    ICNTL(5) to ICNTL(10) are not used by MC64A/AD but are set to */
-/*     zero in this routine. */
-/* Initialization of the ICNTL array. */
-    /* Parameter adjustments */
-    --icntl;
-
-    /* Function Body */
-    icntl[1] = 6;
-    icntl[2] = 6;
-    icntl[3] = -1;
-    for (i__ = 4; i__ <= 10; ++i__) {
-	icntl[i__] = 0;
-/* L10: */
-    }
-    return 0;
-} /* mc64id_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64ad_dist(int_t *job, int_t *n, int_t *ne, int_t *
-	ip, int_t *irn, double *a, int_t *num, int_t *cperm, 
-	int_t *liw, int_t *iw, int_t *ldw, double *dw, int_t *
-	icntl, int_t *info)
-{
-    /* System generated locals */
-    int_t i__1, i__2;
-    double d__1, d__2;
-
-    /* Builtin functions */
-    double log(double);
-
-    /* Local variables */
-    int_t i__, j, k;
-    double fact, rinf;
-
-    extern /* Subroutine */ int_t mc21ad_dist(int_t *, int_t *, int_t *, 
-	    int_t *, int_t *, int_t *, int_t *, int_t *),
-	    mc64bd_dist(int_t *, int_t *, int_t *, int_t *, double *, int_t 
-	    *, int_t *, int_t *, int_t *, int_t *, int_t *, double *),
-	    mc64rd_dist(int_t *, int_t *, int_t *, int_t *, double *),
-	    mc64sd_dist(int_t *, int_t *, int_t *, int_t *
-	    , double *, int_t *, int_t *, int_t *, int_t *, 
-	    int_t *, int_t *, int_t *, int_t *, int_t *),
-	    mc64wd_dist(int_t *, int_t *, int_t *, int_t *, double *, int_t 
-	    *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t 
-	    *, double *, double *);
-
-/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
-/*     Research Councils                                             *** */
-/* *** Although every effort has been made to ensure robustness and  *** */
-/* *** reliability of the subroutines in this MC64 suite, we         *** */
-/* *** disclaim any liability arising through the use or misuse of   *** */
-/* *** any of the subroutines.                                       *** */
-/* *** Any problems?   Contact ... */
-/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
-
-/*  Purpose */
-/*  ======= */
-
-/*! \brief
- * <pre>
- * This subroutine attempts to find a column permutation for an NxN 
- * sparse matrix A = {a_ij} that makes the permuted matrix have N 
- * entries on its diagonal. 
- * If the matrix is structurally nonsingular, the subroutine optionally 
- * returns a column permutation that maximizes the smallest element 
- * on the diagonal, maximizes the sum of the diagonal entries, or 
- * maximizes the product of the diagonal entries of the permuted matrix. 
- * For the latter option, the subroutine also finds scaling factors 
- * that may be used to scale the matrix so that the nonzero diagonal 
- * entries of the permuted matrix are one in absolute value and all the 
- * off-diagonal entries are less than or equal to one in absolute value. 
- * The natural logarithms of the scaling factors u(i), i=1..N, for the 
- * rows and v(j), j=1..N, for the columns are returned so that the 
- * scaled matrix B = {b_ij} has entries b_ij = a_ij * EXP(u_i + v_j). 
- * </pre>
- */
- 
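
(The brief above, from the file being removed, describes MC64's JOB = 5 option; restated as a formula, the returned row factors u_i and column factors v_j scale the matrix so that

    b_{ij} = a_{ij} \, e^{u_i + v_j}, \qquad |b_{i,\sigma(i)}| = 1, \quad |b_{ij}| \le 1,

where \sigma is the column matching returned in CPERM.)
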
-/*  Parameters */
-/*  ========== */
-
-
-/* JOB is an INT_T variable which must be set by the user to */
-/* control the action. It is not altered by the subroutine. */
-/* Possible values for JOB are: */
-/*   1 Compute a column permutation of the matrix so that the */
-/*     permuted matrix has as many entries on its diagonal as possible. */
-/*     The values on the diagonal are of arbitrary size. HSL subroutine */
-/*     MC21A/AD is used for this. See [1]. */
-/*   2 Compute a column permutation of the matrix so that the smallest */
-/*     value on the diagonal of the permuted matrix is maximized. */
-/*     See [3]. */
-/*   3 Compute a column permutation of the matrix so that the smallest */
-/*     value on the diagonal of the permuted matrix is maximized. */
-/*     The algorithm differs from the one used for JOB = 2 and may */
-/*     have quite a different performance. See [2]. */
-/*   4 Compute a column permutation of the matrix so that the sum */
-/*     of the diagonal entries of the permuted matrix is maximized. */
-/*     See [3]. */
-/*   5 Compute a column permutation of the matrix so that the product */
-/*     of the diagonal entries of the permuted matrix is maximized */
-/*     and vectors to scale the matrix so that the nonzero diagonal */
-/*     entries of the permuted matrix are one in absolute value and */
-/*     all the off-diagonal entries are less than or equal to one in */
-/*     absolute value. See [3]. */
-/*  Restriction: 1 <= JOB <= 5. */
-
-/* N is an INT_T variable which must be set by the user to the */
-/*   order of the matrix A. It is not altered by the subroutine. */
-/*   Restriction: N >= 1. */
-
-/* NE is an INT_T variable which must be set by the user to the */
-/*   number of entries in the matrix. It is not altered by the */
-/*   subroutine. */
-/*   Restriction: NE >= 1. */
-
-/* IP is an INT_T array of length N+1. */
-/*   IP(J), J=1..N, must be set by the user to the position in array IRN */
-/*   of the first row index of an entry in column J. IP(N+1) must be set */
-/*   to NE+1. It is not altered by the subroutine. */
-
-/* IRN is an INT_T array of length NE. */
-/*   IRN(K), K=1..NE, must be set by the user to hold the row indices of */
-/*   the entries of the matrix. Those belonging to column J must be */
-/*   stored contiguously in the positions IP(J)..IP(J+1)-1. The ordering */
-/*   of the row indices within each column is unimportant. Repeated */
-/*   entries are not allowed. The array IRN is not altered by the */
-/*   subroutine. */
-
-/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */
-/*   The user must set A(K), K=1..NE, to the numerical value of the */
-/*   entry that corresponds to IRN(K). */
-/*   It is not used by the subroutine when JOB = 1. */
-/*   It is not altered by the subroutine. */
-
-/* NUM is an INT_T variable that need not be set by the user. */
-/*   On successful exit, NUM will be the number of entries on the */
-/*   diagonal of the permuted matrix. */
-/*   If NUM < N, the matrix is structurally singular. */
-
-/* CPERM is an INT_T array of length N that need not be set by the */
-/*   user. On successful exit, CPERM contains the column permutation. */
-/*   Column CPERM(J) of the original matrix is column J in the permuted */
-/*   matrix, J=1..N. */
-
-/* LIW is an INT_T variable that must be set by the user to */
-/*   the dimension of array IW. It is not altered by the subroutine. */
-/*   Restriction: */
-/*     JOB = 1 :  LIW >= 5N */
-/*     JOB = 2 :  LIW >= 4N */
-/*     JOB = 3 :  LIW >= 10N + NE */
-/*     JOB = 4 :  LIW >= 5N */
-/*     JOB = 5 :  LIW >= 5N */
-
-/* IW is an INT_T array of length LIW that is used for workspace. */
-
-/* LDW is an INT_T variable that must be set by the user to the */
-/*   dimension of array DW. It is not altered by the subroutine. */
-/*   Restriction: */
-/*     JOB = 1 :  LDW is not used */
-/*     JOB = 2 :  LDW >= N */
-/*     JOB = 3 :  LDW >= NE */
-/*     JOB = 4 :  LDW >= 2N + NE */
-/*     JOB = 5 :  LDW >= 3N + NE */
-
-/* DW is a REAL (DOUBLE PRECISION in the D-version) array of length LDW */
-/*   that is used for workspace. If JOB = 5, on return, */
-/*   DW(i) contains u_i, i=1..N, and DW(N+j) contains v_j, j=1..N. */
-
-/* ICNTL is an INT_T array of length 10. Its components control the */
-/*   output of MC64A/AD and must be set by the user before calling */
-/*   MC64A/AD. They are not altered by the subroutine. */
-
-/*   ICNTL(1) must be set to specify the output stream for */
-/*   error messages. If ICNTL(1) < 0, messages are suppressed. */
-/*   The default value set by MC46I/ID is 6. */
-
-/*   ICNTL(2) must be set by the user to specify the output stream for */
-/*   warning messages. If ICNTL(2) < 0, messages are suppressed. */
-/*   The default value set by MC46I/ID is 6. */
-
-/*   ICNTL(3) must be set by the user to specify the output stream for */
-/*   diagnostic messages. If ICNTL(3) < 0, messages are suppressed. */
-/*   The default value set by MC46I/ID is -1. */
-
-/*   ICNTL(4) must be set by the user to a value other than 0 to avoid */
-/*   checking of the input data. */
-/*   The default value set by MC46I/ID is 0. */
-
-/* INFO is an INT_T array of length 10 which need not be set by the */
-/*   user. INFO(1) is set non-negative to indicate success. A negative */
-/*   value is returned if an error occurred, a positive value if a */
-/*   warning occurred. INFO(2) holds further information on the error. */
-/*   On exit from the subroutine, INFO(1) will take one of the */
-/*   following values: */
-/*    0 : successful entry (for structurally nonsingular matrix). */
-/*   +1 : successful entry (for structurally singular matrix). */
-/*   +2 : the returned scaling factors are large and may cause */
-/*        overflow when used to scale the matrix. */
-/*        (For JOB = 5 entry only.) */
-/*   -1 : JOB < 1 or JOB > 5.  Value of JOB held in INFO(2). */
-/*   -2 : N < 1.  Value of N held in INFO(2). */
-/*   -3 : NE < 1. Value of NE held in INFO(2). */
-/*   -4 : the defined length LIW violates the restriction on LIW. */
-/*        Value of LIW required given by INFO(2). */
-/*   -5 : the defined length LDW violates the restriction on LDW. */
-/*        Value of LDW required given by INFO(2). */
-/*   -6 : entries are found whose row indices are out of range. INFO(2) */
-/*        contains the index of a column in which such an entry is found. */
-/*   -7 : repeated entries are found. INFO(2) contains the index of a */
-/*        column in which such entries are found. */
-/*  INFO(3) to INFO(10) are not currently used and are set to zero by */
-/*        the routine. */
-
-/* References: */
-/*  [1]  I. S. Duff, (1981), */
-/*       "Algorithm 575. Permutations for a zero-free diagonal", */
-/*       ACM Trans. Math. Software 7(3), 387-390. */
-/*  [2]  I. S. Duff and J. Koster, (1998), */
-/*       "The design and use of algorithms for permuting large */
-/*       entries to the diagonal of sparse matrices", */
-/*       SIAM J. Matrix Anal. Appl., vol. 20, no. 4, pp. 889-901. */
-/*  [3]  I. S. Duff and J. Koster, (1999), */
-/*       "On algorithms for permuting large entries to the diagonal */
-/*       of sparse matrices", */
-/*       Technical Report RAL-TR-1999-030, RAL, Oxfordshire, England. */
-/* Local variables and parameters */
-/* External routines and functions */
-/*     EXTERNAL FD05AD */
-/*     DOUBLE PRECISION FD05AD */
-/* Intrinsic functions */
-/* Set RINF to largest positive real number (infinity) */
-/* XSL    RINF = FD05AD(5) */
-    /* Parameter adjustments */
-    --cperm;
-    --ip;
-    --a;
-    --irn;
-    --iw;
-    --dw;
-    --icntl;
-    --info;
-
-    /* Function Body */
-    rinf = dmach_dist("Overflow");
-/* Check value of JOB */
-    if (*job < 1 || *job > 5) {
-	info[1] = -1;
-	info[2] = *job;
-	if (icntl[1] >= 0) {
-	    printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT 
-		   " because JOB = " IFMT "\n",  info[1], *job);
-	}
-	goto L99;
-    }
-/* Check value of N */
-    if (*n < 1) {
-	info[1] = -2;
-	info[2] = *n;
-	if (icntl[1] >= 0) {
-	    printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT 
-		   " because N = " IFMT "\n", info[1], *job);
-	}
-	goto L99;
-    }
-/* Check value of NE */
-    if (*ne < 1) {
-	info[1] = -3;
-	info[2] = *ne;
-	if (icntl[1] >= 0) {
-	    printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT
-		   " because NE = " IFMT "\n", info[1], *job);
-	}
-	goto L99;
-    }
-/* Check LIW */
-    if (*job == 1) {
-	k = *n * 5;
-    }
-    if (*job == 2) {
-	k = *n << 2;
-    }
-    if (*job == 3) {
-	k = *n * 10 + *ne;
-    }
-    if (*job == 4) {
-	k = *n * 5;
-    }
-    if (*job == 5) {
-	k = *n * 5;
-    }
-    if (*liw < k) {
-	info[1] = -4;
-	info[2] = k;
-	if (icntl[1] >= 0) {
-	    printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT 
-		   " LIW too small, must be at least " IFMT "\n", info[1], k);
-	}
-	goto L99;
-    }
-/* Check LDW */
-/* If JOB = 1, do not check */
-    if (*job > 1) {
-	if (*job == 2) {
-	    k = *n;
-	}
-	if (*job == 3) {
-	    k = *ne;
-	}
-	if (*job == 4) {
-	    k = (*n << 1) + *ne;
-	}
-	if (*job == 5) {
-	    k = *n * 3 + *ne;
-	}
-	if (*ldw < k) {
-	    info[1] = -5;
-	    info[2] = k;
-	    if (icntl[1] >= 0) {
-		printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT 
-		       " LDW too small, must be at least " IFMT "\n", info[1], k);
-	    }
-	    goto L99;
-	}
-    }
-    if (icntl[4] == 0) {
-/* Check row indices. Use IW(1:N) as workspace */
-	i__1 = *n;
-	for (i__ = 1; i__ <= i__1; ++i__) {
-	    iw[i__] = 0;
-/* L3: */
-	}
-	i__1 = *n;
-	for (j = 1; j <= i__1; ++j) {
-	    i__2 = ip[j + 1] - 1;
-	    for (k = ip[j]; k <= i__2; ++k) {
-		i__ = irn[k];
-/* Check for row indices that are out of range */
-		if (i__ < 1 || i__ > *n) {
-		    info[1] = -6;
-		    info[2] = j;
-		    if (icntl[1] >= 0) {
-			printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT 
-			       " Column " IFMT 
-			       " contains an entry with invalid row index " IFMT "\n",
-			       info[1], j, i__);
-		    }
-		    goto L99;
-		}
-/* Check for repeated row indices within a column */
-		if (iw[i__] == j) {
-		    info[1] = -7;
-		    info[2] = j;
-		    if (icntl[1] >= 0) {
-			printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT 
-			       "        Column " IFMT
-			       " contains two or more entries with row index " IFMT "\n",
-			       info[1], j, i__);
-		    }
-		    goto L99;
-		} else {
-		    iw[i__] = j;
-		}
-/* L4: */
-	    }
-/* L6: */
-	    }
-    }
-/* Print diagnostics on input */
-    if (icntl[3] >= 0) {
-	printf("  ****** Input parameters for MC64A/AD: JOB = " IFMT ","
-	       " N = " IFMT ", NE = " IFMT "\n", *job, *n, *ne);
-	printf(" IP(1:N+1)   = ");
-	for (j=1; j<=(*n+1); ++j) {
-	    printf(IFMT, ip[j]);
-	    if (j%8 == 0) printf("\n");
-	}
-	printf("\n IRN(1:NE) = ");
-	for (j=1; j<=(*ne); ++j) {
-	    printf(IFMT, irn[j]);
-	    if (j%8 == 0) printf("\n");
-	}
-	printf("\n");
-
-	if (*job > 1) {
-	    printf(" A(1:NE)     = ");
-	    for (j=1; j<=(*ne); ++j) {
-		printf("%f14.4", a[j]);
-		if (j%4 == 0) printf("\n");
-	    }
-	    printf("\n");
-	}
-    }
-/* Set components of INFO to zero */
-    for (i__ = 1; i__ <= 10; ++i__) {
-	info[i__] = 0;
-/* L8: */
-    }
-/* Compute maximum matching with MC21A/AD */
-    if (*job == 1) {
-/* Put length of column J in IW(J) */
-	i__1 = *n;
-	for (j = 1; j <= i__1; ++j) {
-	    iw[j] = ip[j + 1] - ip[j];
-/* L10: */
-	}
-/* IW(N+1:5N) is workspace */
-#if 0
-	mc21ad_(n, &irn[1], ne, &ip[1], &iw[1], &cperm[1], num, &iw[*n+1]);
-#else
-	printf(" ****** Warning from MC64A/AD. Need to link mc21ad.\n");
-#endif
-	goto L90;
-    }
-/* Compute bottleneck matching */
-    if (*job == 2) {
-/* IW(1:5N), DW(1:N) are workspaces */
-	mc64bd_dist(n, ne, &ip[1], &irn[1], &a[1], &cperm[1], num,
-		    &iw[1], &iw[*n + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1],
-		    &dw[1]);
-	goto L90;
-    }
-/* Compute bottleneck matching */
-    if (*job == 3) {
-/* Copy IRN(K) into IW(K), ABS(A(K)) into DW(K), K=1..NE */
-	i__1 = *ne;
-	for (k = 1; k <= i__1; ++k) {
-	    iw[k] = irn[k];
-	    dw[k] = (d__1 = a[k], abs(d__1));
-/* L20: */
-	}
-/* Sort entries in each column by decreasing value. */
-	mc64rd_dist(n, ne, &ip[1], &iw[1], &dw[1]);
-/* IW(NE+1:NE+10N) is workspace */
-	mc64sd_dist(n, ne, &ip[1], &iw[1], &dw[1], &cperm[1], num,
-		    &iw[*ne + 1], &iw[*ne + *n + 1], &iw[*ne + (*n << 1) + 1],
-		    &iw[*ne + *n * 3 + 1], &iw[*ne + (*n << 2) + 1],
-		    &iw[*ne + *n * 5 + 1], &iw[*ne + *n * 6 + 1]);
-	goto L90;
-    }
-    if (*job == 4) {
-	i__1 = *n;
-	for (j = 1; j <= i__1; ++j) {
-	    fact = 0.;
-	    i__2 = ip[j + 1] - 1;
-	    for (k = ip[j]; k <= i__2; ++k) {
-		if ((d__1 = a[k], abs(d__1)) > fact) {
-		    fact = (d__2 = a[k], abs(d__2));
-		}
-/* L30: */
-	    }
-	    i__2 = ip[j + 1] - 1;
-	    for (k = ip[j]; k <= i__2; ++k) {
-		dw[(*n << 1) + k] = fact - (d__1 = a[k], abs(d__1));
-/* L40: */
-	    }
-/* L50: */
-	}
-/* B = DW(2N+1:2N+NE); IW(1:5N) and DW(1:2N) are workspaces */
-	mc64wd_dist(n, ne, &ip[1], &irn[1], &dw[(*n << 1) + 1], &cperm[1],
-		    num, &iw[1], &iw[*n + 1], &iw[(*n << 1) + 1], 
-		    &iw[*n * 3 + 1], &iw[(*n << 2) + 1], &dw[1], &dw[*n + 1]);
-	goto L90;
-    }
-    if (*job == 5) {
-	i__1 = *n;
-	for (j = 1; j <= i__1; ++j) {
-	    fact = 0.;
-	    i__2 = ip[j + 1] - 1;
-	    for (k = ip[j]; k <= i__2; ++k) {
-		dw[*n * 3 + k] = (d__1 = a[k], abs(d__1));
-		if (dw[*n * 3 + k] > fact) {
-		    fact = dw[*n * 3 + k];
-		}
-/* L60: */
-	    }
-	    dw[(*n << 1) + j] = fact;
-	    if (fact != 0.) {
-		fact = log(fact);
-	    } else {
-		fact = rinf / *n;
-	    }
-	    i__2 = ip[j + 1] - 1;
-	    for (k = ip[j]; k <= i__2; ++k) {
-		if (dw[*n * 3 + k] != 0.) {
-		    dw[*n * 3 + k] = fact - log(dw[*n * 3 + k]);
-		} else {
-		    dw[*n * 3 + k] = rinf / *n;
-		}
-/* L70: */
-	    }
-/* L75: */
-	}
-/* B = DW(3N+1:3N+NE); IW(1:5N) and DW(1:2N) are workspaces */
-	mc64wd_dist(n, ne, &ip[1], &irn[1], &dw[*n * 3 + 1], &cperm[1],
-		    num, &iw[1], &iw[*n + 1], &iw[(*n << 1) + 1],
-		    &iw[*n * 3 + 1], &iw[(*n << 2) + 1], &dw[1], &dw[*n + 1]);
-	if (*num == *n) {
-	    i__1 = *n;
-	    for (j = 1; j <= i__1; ++j) {
-		if (dw[(*n << 1) + j] != 0.) {
-		    dw[*n + j] -= log(dw[(*n << 1) + j]);
-		} else {
-		    dw[*n + j] = 0.;
-		}
-/* L80: */
-	    }
-	}
-/* Check size of scaling factors */
-	fact = log(rinf) * .5f;
-	i__1 = *n;
-	for (j = 1; j <= i__1; ++j) {
-	    if (dw[j] < fact && dw[*n + j] < fact) {
-		goto L86;
-	    }
-	    info[1] = 2;
-	    goto L90;
-L86:
-	    ;
-	}
-/*       GO TO 90 */
-    }
-L90:
-    if (info[1] == 0 && *num < *n) {
-/* Matrix is structurally singular, return with warning */
-	info[1] = 1;
-	if (icntl[2] >= 0) {
-	    printf(" ****** Warning from MC64A/AD. INFO(1) = " IFMT
-		   " The matrix is structurally singular.\n",  info[1]);
-	}
-    }
-    if (info[1] == 2) {
-/* Scaling factors are large, return with warning */
-	if (icntl[2] >= 0) {
-	    printf(" ****** Warning from MC64A/AD. INFO(1) = " IFMT "\n"
-		   "        Some scaling factors may be too large.\n", info[1]);
-	}
-    }
-/* Print diagnostics on output */
-    if (icntl[3] >= 0) {
-	printf(" ****** Output parameters for MC64A/AD: INFO(1:2)  = " IFMT IFMT "\n",
-	       info[1], info[2]);
-	printf(" NUM        = " IFMT, *num);
-	printf(" CPERM(1:N) = ");
-	for (j=1; j<=*n; ++j) {
-	    printf(IFMT, cperm[j]);
-	    if (j%8 == 0) printf("\n");
-	}
-	if (*job == 5) {
-	    printf("\n DW(1:N)    = ");
-	    for (j=1; j<=*n; ++j) {
-		printf("%11.3f", dw[j]);
-		if (j%5 == 0) printf("\n");
-	    }
-	    printf("\n DW(N+1:2N) = ");
-	    for (j=1; j<=*n; ++j) {
-		printf("%11.3f", dw[*n+j]);
-		if (j%5 == 0) printf("\n");
-	    }
-	    printf("\n");
-	}
-    }
-/* Return from subroutine. */
-L99:
-    return 0;
-} /* mc64ad_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64bd_dist(int_t *n, int_t *ne, int_t *ip, int_t *
-	irn, double *a, int_t *iperm, int_t *num, int_t *jperm, 
-	int_t *pr, int_t *q, int_t *l, double *d__)
-{
-    /* System generated locals */
-    int_t i__1, i__2, i__3;
-    double d__1, d__2, d__3;
-
-    /* Local variables */
-    int_t i__, j, k;
-    double a0;
-    int_t i0, q0;
-    double ai, di;
-    int_t ii, jj, kk;
-    double bv;
-    int_t up;
-    double dq0;
-    int_t kk1, kk2;
-    double csp;
-    int_t isp, jsp, low;
-    double dnew;
-    int_t jord, qlen, idum, jdum;
-    double rinf;
-    extern /* Subroutine */ int_t mc64dd_dist(int_t *, int_t *, int_t *, 
-	    double *, int_t *, int_t *), mc64ed_dist(int_t *, int_t *,
-	     int_t *, double *, int_t *, int_t *), mc64fd_dist(int_t *
-	    , int_t *, int_t *, int_t *, double *, int_t *, int_t *);
-
-
-/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
-/*     Research Councils                                             *** */
-/* *** Although every effort has been made to ensure robustness and  *** */
-/* *** reliability of the subroutines in this MC64 suite, we         *** */
-/* *** disclaim any liability arising through the use or misuse of   *** */
-/* *** any of the subroutines.                                       *** */
-/* *** Any problems?   Contact ... */
-/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
-
-/* N, NE, IP, IRN are described in MC64A/AD. */
-/* A is a REAL (DOUBLE PRECISION in the D-version) array of length */
-/*   NE. A(K), K=1..NE, must be set to the value of the entry */
-/*   that corresponds to IRN(K). It is not altered. */
-/* IPERM is an INT_T array of length N. On exit, it contains the */
-/*    matching: IPERM(I) = 0 or row I is matched to column IPERM(I). */
-/* NUM is INT_T variable. On exit, it contains the cardinality of the */
-/*    matching stored in IPERM. */
-/* IW is an INT_T work array of length 4N. */
-/* DW is a REAL (DOUBLE PRECISION in D-version) work array of length N. */
-/* Local variables */
-/* Local parameters */
-/* Intrinsic functions */
-/* External subroutines and/or functions */
-/*      EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD, DMACH */
-/*      DOUBLE PRECISION FD05AD, DMACH */
-/* Set RINF to largest positive real number */
-/* XSL  RINF = FD05AD(5) */
-    /* Parameter adjustments */
-    --d__;
-    --l;
-    --q;
-    --pr;
-    --jperm;
-    --iperm;
-    --ip;
-    --a;
-    --irn;
-
-    /* Function Body */
-    rinf = dmach_dist("Overflow");
-/* Initialization */
-    *num = 0;
-    bv = rinf;
-    i__1 = *n;
-    for (k = 1; k <= i__1; ++k) {
-	iperm[k] = 0;
-	jperm[k] = 0;
-	pr[k] = ip[k];
-	d__[k] = 0.;
-/* L10: */
-    }
-/* Scan columns of matrix; */
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-	a0 = -1.;
-	i__2 = ip[j + 1] - 1;
-	for (k = ip[j]; k <= i__2; ++k) {
-	    i__ = irn[k];
-	    ai = (d__1 = a[k], abs(d__1));
-	    if (ai > d__[i__]) {
-		d__[i__] = ai;
-	    }
-	    if (jperm[j] != 0) {
-		goto L30;
-	    }
-	    if (ai >= bv) {
-		a0 = bv;
-		if (iperm[i__] != 0) {
-		    goto L30;
-		}
-		jperm[j] = i__;
-		iperm[i__] = j;
-		++(*num);
-	    } else {
-		if (ai <= a0) {
-		    goto L30;
-		}
-		a0 = ai;
-		i0 = i__;
-	    }
-L30:
-	    ;
-	}
-	if (a0 != -1. && a0 < bv) {
-	    bv = a0;
-	    if (iperm[i0] != 0) {
-		goto L20;
-	    }
-	    iperm[i0] = j;
-	    jperm[j] = i0;
-	    ++(*num);
-	}
-L20:
-	;
-    }
-/* Update BV with smallest of all the largest maximum absolute values */
-/* of the rows. */
-    i__1 = *n;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-/* Computing MIN */
-	d__1 = bv, d__2 = d__[i__];
-	bv = min(d__1,d__2);
-/* L25: */
-    }
-    if (*num == *n) {
-	goto L1000;
-    }
-/* Rescan unassigned columns; improve initial assignment */
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-	if (jperm[j] != 0) {
-	    goto L95;
-	}
-	i__2 = ip[j + 1] - 1;
-	for (k = ip[j]; k <= i__2; ++k) {
-	    i__ = irn[k];
-	    ai = (d__1 = a[k], abs(d__1));
-	    if (ai < bv) {
-		goto L50;
-	    }
-	    if (iperm[i__] == 0) {
-		goto L90;
-	    }
-	    jj = iperm[i__];
-	    kk1 = pr[jj];
-	    kk2 = ip[jj + 1] - 1;
-	    if (kk1 > kk2) {
-		goto L50;
-	    }
-	    i__3 = kk2;
-	    for (kk = kk1; kk <= i__3; ++kk) {
-		ii = irn[kk];
-		if (iperm[ii] != 0) {
-		    goto L70;
-		}
-		if ((d__1 = a[kk], abs(d__1)) >= bv) {
-		    goto L80;
-		}
-L70:
-		;
-	    }
-	    pr[jj] = kk2 + 1;
-L50:
-	    ;
-	}
-	goto L95;
-L80:
-	jperm[jj] = ii;
-	iperm[ii] = jj;
-	pr[jj] = kk + 1;
-L90:
-	++(*num);
-	jperm[j] = i__;
-	iperm[i__] = j;
-	pr[j] = k + 1;
-L95:
-	;
-    }
-    if (*num == *n) {
-	goto L1000;
-    }
-/* Prepare for main loop */
-    i__1 = *n;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	d__[i__] = -1.;
-	l[i__] = 0;
-/* L99: */
-    }
-/* Main loop ... each pass round this loop is similar to Dijkstra's */
-/* algorithm for solving the single source shortest path problem */
-    i__1 = *n;
-    for (jord = 1; jord <= i__1; ++jord) {
-	if (jperm[jord] != 0) {
-	    goto L100;
-	}
-	qlen = 0;
-	low = *n + 1;
-	up = *n + 1;
-/* CSP is cost of shortest path to any unassigned row */
-/* ISP is matrix position of unassigned row element in shortest path */
-/* JSP is column index of unassigned row element in shortest path */
-	csp = -1.;
-/* Build shortest path tree starting from unassigned column JORD */
-	j = jord;
-	pr[j] = -1;
-/* Scan column J */
-	i__2 = ip[j + 1] - 1;
-	for (k = ip[j]; k <= i__2; ++k) {
-	    i__ = irn[k];
-	    dnew = (d__1 = a[k], abs(d__1));
-	    if (csp >= dnew) {
-		goto L115;
-	    }
-	    if (iperm[i__] == 0) {
-/* Row I is unassigned; update shortest path info */
-		csp = dnew;
-		isp = i__;
-		jsp = j;
-		if (csp >= bv) {
-		    goto L160;
-		}
-	    } else {
-		d__[i__] = dnew;
-		if (dnew >= bv) {
-/* Add row I to Q2 */
-		    --low;
-		    q[low] = i__;
-		} else {
-/* Add row I to Q, and push it */
-		    ++qlen;
-		    l[i__] = qlen;
-		    mc64dd_dist(&i__, n, &q[1], &d__[1], &l[1], &c__1);
-		}
-		jj = iperm[i__];
-		pr[jj] = j;
-	    }
-L115:
-	    ;
-	}
-	i__2 = *num;
-	for (jdum = 1; jdum <= i__2; ++jdum) {
-/* If Q2 is empty, extract new rows from Q */
-	    if (low == up) {
-		if (qlen == 0) {
-		    goto L160;
-		}
-		i__ = q[1];
-		if (csp >= d__[i__]) {
-		    goto L160;
-		}
-		bv = d__[i__];
-		i__3 = *n;
-		for (idum = 1; idum <= i__3; ++idum) {
-		    mc64ed_dist(&qlen, n, &q[1], &d__[1], &l[1], &c__1);
-		    l[i__] = 0;
-		    --low;
-		    q[low] = i__;
-		    if (qlen == 0) {
-			goto L153;
-		    }
-		    i__ = q[1];
-		    if (d__[i__] != bv) {
-			goto L153;
-		    }
-/* L152: */
-		}
-/* End of dummy loop; this point is never reached */
-	    }
-/* Move row Q0 */
-L153:
-	    --up;
-	    q0 = q[up];
-	    dq0 = d__[q0];
-	    l[q0] = up;
-/* Scan column that matches with row Q0 */
-	    j = iperm[q0];
-	    i__3 = ip[j + 1] - 1;
-	    for (k = ip[j]; k <= i__3; ++k) {
-		i__ = irn[k];
-/* Update D(I) */
-		if (l[i__] >= up) {
-		    goto L155;
-		}
-/* Computing MIN */
-		d__2 = dq0, d__3 = (d__1 = a[k], abs(d__1));
-		dnew = min(d__2,d__3);
-		if (csp >= dnew) {
-		    goto L155;
-		}
-		if (iperm[i__] == 0) {
-/* Row I is unassigned; update shortest path info */
-		    csp = dnew;
-		    isp = i__;
-		    jsp = j;
-		    if (csp >= bv) {
-			goto L160;
-		    }
-		} else {
-		    di = d__[i__];
-		    if (di >= bv || di >= dnew) {
-			goto L155;
-		    }
-		    d__[i__] = dnew;
-		    if (dnew >= bv) {
-/* Delete row I from Q (if necessary); add row I to Q2 */
-			if (di != -1.) {
-			    mc64fd_dist(&l[i__], &qlen, n, &q[1], &d__[1], &l[1], 
-				    &c__1);
-			}
-			l[i__] = 0;
-			--low;
-			q[low] = i__;
-		    } else {
-/* Add row I to Q (if necessary); push row I up Q */
-			if (di == -1.) {
-			    ++qlen;
-			    l[i__] = qlen;
-			}
-			mc64dd_dist(&i__, n, &q[1], &d__[1], &l[1], &c__1);
-		    }
-/* Update tree */
-		    jj = iperm[i__];
-		    pr[jj] = j;
-		}
-L155:
-		;
-	    }
-/* L150: */
-	}
-/* If CSP = MINONE, no augmenting path is found */
-L160:
-	if (csp == -1.) {
-	    goto L190;
-	}
-/* Update bottleneck value */
-	bv = min(bv,csp);
-/* Find augmenting path by tracing backward in PR; update IPERM,JPERM */
-	++(*num);
-	i__ = isp;
-	j = jsp;
-	i__2 = *num + 1;
-	for (jdum = 1; jdum <= i__2; ++jdum) {
-	    i0 = jperm[j];
-	    jperm[j] = i__;
-	    iperm[i__] = j;
-	    j = pr[j];
-	    if (j == -1) {
-		goto L190;
-	    }
-	    i__ = i0;
-/* L170: */
-	}
-/* End of dummy loop; this point is never reached */
-L190:
-	i__2 = *n;
-	for (kk = up; kk <= i__2; ++kk) {
-	    i__ = q[kk];
-	    d__[i__] = -1.;
-	    l[i__] = 0;
-/* L191: */
-	}
-	i__2 = up - 1;
-	for (kk = low; kk <= i__2; ++kk) {
-	    i__ = q[kk];
-	    d__[i__] = -1.;
-/* L192: */
-	}
-	i__2 = qlen;
-	for (kk = 1; kk <= i__2; ++kk) {
-	    i__ = q[kk];
-	    d__[i__] = -1.;
-	    l[i__] = 0;
-/* L193: */
-	}
-L100:
-	;
-    }
-/* End of main loop */
-/* BV is bottleneck value of final matching */
-    if (*num == *n) {
-	goto L1000;
-    }
-/* Matrix is structurally singular, complete IPERM. */
-/* JPERM, PR are work arrays */
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-	jperm[j] = 0;
-/* L300: */
-    }
-    k = 0;
-    i__1 = *n;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	if (iperm[i__] == 0) {
-	    ++k;
-	    pr[k] = i__;
-	} else {
-	    j = iperm[i__];
-	    jperm[j] = i__;
-	}
-/* L310: */
-    }
-    k = 0;
-    i__1 = *n;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	if (jperm[i__] != 0) {
-	    goto L320;
-	}
-	++k;
-	jdum = pr[k];
-	iperm[jdum] = i__;
-L320:
-	;
-    }
-L1000:
-    return 0;
-} /* mc64bd_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64dd_dist(int_t *i__, int_t *n, int_t *q, double 
-	*d__, int_t *l, int_t *iway)
-{
-    /* System generated locals */
-    int_t i__1;
-
-    /* Local variables */
-    double di;
-    int_t qk, pos, idum, posk;
-
-
-/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
-/*     Research Councils                                             *** */
-/* *** Although every effort has been made to ensure robustness and  *** */
-/* *** reliability of the subroutines in this MC64 suite, we         *** */
-/* *** disclaim any liability arising through the use or misuse of   *** */
-/* *** any of the subroutines.                                       *** */
-/* *** Any problems?   Contact ... */
-/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
-
-/* Variables N,Q,D,L are described in MC64B/BD */
-/* IF IWAY is equal to 1, then */
-/* node I is pushed from its current position upwards */
-/* IF IWAY is not equal to 1, then */
-/* node I is pushed from its current position downwards */
-/* Local variables and parameters */
-    /* Parameter adjustments */
-    --l;
-    --d__;
-    --q;
-
-    /* Function Body */
-    di = d__[*i__];
-    pos = l[*i__];
-/* POS is index of current position of I in the tree */
-    if (*iway == 1) {
-	i__1 = *n;
-	for (idum = 1; idum <= i__1; ++idum) {
-	    if (pos <= 1) {
-		goto L20;
-	    }
-	    posk = pos / 2;
-	    qk = q[posk];
-	    if (di <= d__[qk]) {
-		goto L20;
-	    }
-	    q[pos] = qk;
-	    l[qk] = pos;
-	    pos = posk;
-/* L10: */
-	}
-/* End of dummy loop; this point is never reached */
-    } else {
-	i__1 = *n;
-	for (idum = 1; idum <= i__1; ++idum) {
-	    if (pos <= 1) {
-		goto L20;
-	    }
-	    posk = pos / 2;
-	    qk = q[posk];
-	    if (di >= d__[qk]) {
-		goto L20;
-	    }
-	    q[pos] = qk;
-	    l[qk] = pos;
-	    pos = posk;
-/* L15: */
-	}
-/* End of dummy loop; this point is never reached */
-    }
-/* End of dummy if; this point is never reached */
-L20:
-    q[pos] = *i__;
-    l[*i__] = pos;
-    return 0;
-} /* mc64dd_dist */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64ed_dist(int_t *qlen, int_t *n, int_t *q, 
-	double *d__, int_t *l, int_t *iway)
-{
-    /* System generated locals */
-    int_t i__1;
-
-    /* Local variables */
-    int_t i__;
-    double di, dk, dr;
-    int_t pos, idum, posk;
-
-
-/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
-/*     Research Councils                                             *** */
-/* *** Although every effort has been made to ensure robustness and  *** */
-/* *** reliability of the subroutines in this MC64 suite, we         *** */
-/* *** disclaim any liability arising through the use or misuse of   *** */
-/* *** any of the subroutines.                                       *** */
-/* *** Any problems?   Contact ... */
-/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
-
-/* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */
-/*     MC64W/WD (IWAY = 2) */
-/* The root node is deleted from the binary heap. */
-/* Local variables and parameters */
-/* Move last element to begin of Q */
-    /* Parameter adjustments */
-    --l;
-    --d__;
-    --q;
-
-    /* Function Body */
-    i__ = q[*qlen];
-    di = d__[i__];
-    --(*qlen);
-    pos = 1;
-    if (*iway == 1) {
-	i__1 = *n;
-	for (idum = 1; idum <= i__1; ++idum) {
-	    posk = pos << 1;
-	    if (posk > *qlen) {
-		goto L20;
-	    }
-	    dk = d__[q[posk]];
-	    if (posk < *qlen) {
-		dr = d__[q[posk + 1]];
-		if (dk < dr) {
-		    ++posk;
-		    dk = dr;
-		}
-	    }
-	    if (di >= dk) {
-		goto L20;
-	    }
-/* Exchange old last element with larger priority child */
-	    q[pos] = q[posk];
-	    l[q[pos]] = pos;
-	    pos = posk;
-/* L10: */
-	}
-/* End of dummy loop; this point is never reached */
-    } else {
-	i__1 = *n;
-	for (idum = 1; idum <= i__1; ++idum) {
-	    posk = pos << 1;
-	    if (posk > *qlen) {
-		goto L20;
-	    }
-	    dk = d__[q[posk]];
-	    if (posk < *qlen) {
-		dr = d__[q[posk + 1]];
-		if (dk > dr) {
-		    ++posk;
-		    dk = dr;
-		}
-	    }
-	    if (di <= dk) {
-		goto L20;
-	    }
-/* Exchange old last element with smaller child */
-	    q[pos] = q[posk];
-	    l[q[pos]] = pos;
-	    pos = posk;
-/* L15: */
-	}
-/* End of dummy loop; this point is never reached */
-    }
-/* End of dummy if; this point is never reached */
-L20:
-    q[pos] = i__;
-    l[i__] = pos;
-    return 0;
-} /* mc64ed_dist */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64fd_dist(int_t *pos0, int_t *qlen, int_t *n, 
-	int_t *q, double *d__, int_t *l, int_t *iway)
-{
-    /* System generated locals */
-    int_t i__1;
-
-    /* Local variables */
-    int_t i__;
-    double di, dk, dr;
-    int_t qk, pos, idum, posk;
-
-
-/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
-/*     Research Councils                                             *** */
-/* *** Although every effort has been made to ensure robustness and  *** */
-/* *** reliability of the subroutines in this MC64 suite, we         *** */
-/* *** disclaim any liability arising through the use or misuse of   *** */
-/* *** any of the subroutines.                                       *** */
-/* *** Any problems?   Contact ... */
-/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
-
-/* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */
-/*     MC64WD (IWAY = 2). */
-/* Move last element in the heap */
-/* Quick return, if possible */
-    /* Parameter adjustments */
-    --l;
-    --d__;
-    --q;
-
-    /* Function Body */
-    if (*qlen == *pos0) {
-	--(*qlen);
-	return 0;
-    }
-/* Move last element from queue Q to position POS0 */
-/* POS is current position of node I in the tree */
-    i__ = q[*qlen];
-    di = d__[i__];
-    --(*qlen);
-    pos = *pos0;
-    if (*iway == 1) {
-	i__1 = *n;
-	for (idum = 1; idum <= i__1; ++idum) {
-	    if (pos <= 1) {
-		goto L20;
-	    }
-	    posk = pos / 2;
-	    qk = q[posk];
-	    if (di <= d__[qk]) {
-		goto L20;
-	    }
-	    q[pos] = qk;
-	    l[qk] = pos;
-	    pos = posk;
-/* L10: */
-	}
-/* End of dummy loop; this point is never reached */
-L20:
-	q[pos] = i__;
-	l[i__] = pos;
-	i__1 = *n;
-	for (idum = 1; idum <= i__1; ++idum) {
-	    posk = pos << 1;
-	    if (posk > *qlen) {
-		goto L40;
-	    }
-	    dk = d__[q[posk]];
-	    if (posk < *qlen) {
-		dr = d__[q[posk + 1]];
-		if (dk < dr) {
-		    ++posk;
-		    dk = dr;
-		}
-	    }
-	    if (di >= dk) {
-		goto L40;
-	    }
-	    qk = q[posk];
-	    q[pos] = qk;
-	    l[qk] = pos;
-	    pos = posk;
-/* L30: */
-	}
-/* End of dummy loop; this point is never reached */
-    } else {
-	i__1 = *n;
-	for (idum = 1; idum <= i__1; ++idum) {
-	    if (pos <= 1) {
-		goto L34;
-	    }
-	    posk = pos / 2;
-	    qk = q[posk];
-	    if (di >= d__[qk]) {
-		goto L34;
-	    }
-	    q[pos] = qk;
-	    l[qk] = pos;
-	    pos = posk;
-/* L32: */
-	}
-/* End of dummy loop; this point is never reached */
-L34:
-	q[pos] = i__;
-	l[i__] = pos;
-	i__1 = *n;
-	for (idum = 1; idum <= i__1; ++idum) {
-	    posk = pos << 1;
-	    if (posk > *qlen) {
-		goto L40;
-	    }
-	    dk = d__[q[posk]];
-	    if (posk < *qlen) {
-		dr = d__[q[posk + 1]];
-		if (dk > dr) {
-		    ++posk;
-		    dk = dr;
-		}
-	    }
-	    if (di <= dk) {
-		goto L40;
-	    }
-	    qk = q[posk];
-	    q[pos] = qk;
-	    l[qk] = pos;
-	    pos = posk;
-/* L36: */
-	}
-/* End of dummy loop; this point is never reached */
-    }
-/* End of dummy if; this point is never reached */
-L40:
-    q[pos] = i__;
-    l[i__] = pos;
-    return 0;
-} /* mc64fd_dist */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64rd_dist(int_t *n, int_t *ne, int_t *ip,
-				   int_t *irn, double *a)
-{
-    /* System generated locals */
-    int_t i__1, i__2, i__3;
-
-    /* Local variables */
-    int_t j, k, r__, s;
-    double ha;
-    int_t hi, td, mid, len, ipj;
-    double key;
-    int_t last, todo[50], first;
-
-
-/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
-/*     Research Councils                                             *** */
-/* *** Although every effort has been made to ensure robustness and  *** */
-/* *** reliability of the subroutines in this MC64 suite, we         *** */
-/* *** disclaim any liability arising through the use or misuse of   *** */
-/* *** any of the subroutines.                                       *** */
-/* *** Any problems?   Contact ... */
-/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
-
-/* This subroutine sorts the entries in each column of the */
-/* sparse matrix (defined by N,NE,IP,IRN,A) by decreasing */
-/* numerical value. */
-/* Local constants */
-/* Local variables */
-/* Local arrays */
-    /* Parameter adjustments */
-    --ip;
-    --a;
-    --irn;
-
-    /* Function Body */
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-	len = ip[j + 1] - ip[j];
-	if (len <= 1) {
-	    goto L100;
-	}
-	ipj = ip[j];
-/* Sort array roughly with partial quicksort */
-	if (len < 15) {
-	    goto L400;
-	}
-	todo[0] = ipj;
-	todo[1] = ipj + len;
-	td = 2;
-L500:
-	first = todo[td - 2];
-	last = todo[td - 1];
-/* KEY is the smallest of two values present in interval [FIRST,LAST) */
-	key = a[(first + last) / 2];
-	i__2 = last - 1;
-	for (k = first; k <= i__2; ++k) {
-	    ha = a[k];
-	    if (ha == key) {
-		goto L475;
-	    }
-	    if (ha > key) {
-		goto L470;
-	    }
-	    key = ha;
-	    goto L470;
-L475:
-	    ;
-	}
-/* Only one value found in interval, so it is already sorted */
-	td += -2;
-	goto L425;
-/* Reorder interval [FIRST,LAST) such that entries before MID are gt KEY */
-L470:
-	mid = first;
-	i__2 = last - 1;
-	for (k = first; k <= i__2; ++k) {
-	    if (a[k] <= key) {
-		goto L450;
-	    }
-	    ha = a[mid];
-	    a[mid] = a[k];
-	    a[k] = ha;
-	    hi = irn[mid];
-	    irn[mid] = irn[k];
-	    irn[k] = hi;
-	    ++mid;
-L450:
-	    ;
-	}
-/* Both subintervals [FIRST,MID), [MID,LAST) are nonempty */
-/* Stack the longest of the two subintervals first */
-	if (mid - first >= last - mid) {
-	    todo[td + 1] = last;
-	    todo[td] = mid;
-	    todo[td - 1] = mid;
-/*          TODO(TD-1) = FIRST */
-	} else {
-	    todo[td + 1] = mid;
-	    todo[td] = first;
-	    todo[td - 1] = last;
-	    todo[td - 2] = mid;
-	}
-	td += 2;
-L425:
-	if (td == 0) {
-	    goto L400;
-	}
-/* There is still work to be done */
-	if (todo[td - 1] - todo[td - 2] >= 15) {
-	    goto L500;
-	}
-/* Next interval is already short enough for straightforward insertion */
-	td += -2;
-	goto L425;
-/* Complete sorting with straightforward insertion */
-L400:
-	i__2 = ipj + len - 1;
-	for (r__ = ipj + 1; r__ <= i__2; ++r__) {
-	    if (a[r__ - 1] < a[r__]) {
-		ha = a[r__];
-		hi = irn[r__];
-		a[r__] = a[r__ - 1];
-		irn[r__] = irn[r__ - 1];
-		i__3 = ipj + 1;
-		for (s = r__ - 1; s >= i__3; --s) {
-		    if (a[s - 1] < ha) {
-			a[s] = a[s - 1];
-			irn[s] = irn[s - 1];
-		    } else {
-			a[s] = ha;
-			irn[s] = hi;
-			goto L200;
-		    }
-/* L300: */
-		}
-		a[ipj] = ha;
-		irn[ipj] = hi;
-	    }
-L200:
-	    ;
-	}
-L100:
-	;
-    }
-    return 0;
-} /* mc64rd_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64sd_dist(int_t *n, int_t *ne, int_t *ip, int_t *
-	irn, double *a, int_t *iperm, int_t *numx, int_t *w, 
-	int_t *len, int_t *lenl, int_t *lenh, int_t *fc, int_t *iw, 
-	int_t *iw4)
-{
-    /* System generated locals */
-    int_t i__1, i__2, i__3, i__4;
-
-    /* Local variables */
-    int_t i__, j, k, l, ii, mod, cnt, num;
-    double bval, bmin, bmax, rinf;
-    int_t nval, wlen, idum1, idum2, idum3;
-    extern /* Subroutine */ int_t mc64qd_dist(int_t *, int_t *, int_t *, 
-	    int_t *, int_t *, double *, int_t *, double *), 
-	    mc64ud_dist(int_t *, int_t *, int_t *, int_t *, int_t *, 
-	    int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, 
-	    int_t *, int_t *, int_t *, int_t *);
-
-/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
-/*     Research Councils                                             *** */
-/* *** Although every effort has been made to ensure robustness and  *** */
-/* *** reliability of the subroutines in this MC64 suite, we         *** */
-/* *** disclaim any liability arising through the use or misuse of   *** */
-/* *** any of the subroutines.                                       *** */
-/* *** Any problems?   Contact ... */
-/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
-
-/* N, NE, IP, IRN, are described in MC64A/AD. */
-/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */
-/*   A(K), K=1..NE, must be set to the value of the entry that */
-/*   corresponds to IRN(k). The entries in each column must be */
-/*   non-negative and ordered by decreasing value. */
-/* IPERM is an INT_T array of length N. On exit, it contains the */
-/*   bottleneck matching: IPERM(I) = 0 or row I is matched to column */
-/*   IPERM(I). */
-/* NUMX is an INT_T variable. On exit, it contains the cardinality */
-/*   of the matching stored in IPERM. */
-/* IW is an INT_T work array of length 10N. */
-/* FC is an int_t array of length N that contains the list of */
-/*   unmatched columns. */
-/* LEN(J), LENL(J), LENH(J) are int_t arrays of length N that point */
-/*   to entries in matrix column J. */
-/*   In the matrix defined by the column parts IP(J)+LENL(J) we know */
-/*   a matching does not exist; in the matrix defined by the column */
-/*   parts IP(J)+LENH(J) we know one exists. */
-/*   LEN(J) lies between LENL(J) and LENH(J) and determines the matrix */
-/*   that is tested for a maximum matching. */
-/* W is an int_t array of length N and contains the indices of the */
-/*   columns for which LENL ne LENH. */
-/* WLEN is number of indices stored in array W. */
-/* IW is int_t work array of length N. */
-/* IW4 is int_t work array of length 4N used by MC64U/UD. */
-/*      EXTERNAL FD05AD,MC64QD,MC64UD */
-/*      DOUBLE PRECISION FD05AD */
-/* BMIN and BMAX are such that a maximum matching exists for the input */
-/*   matrix in which all entries smaller than BMIN are dropped. */
-/*   For BMAX, a maximum matching does not exist. */
-/* BVAL is a value between BMIN and BMAX. */
-/* CNT is the number of calls made to MC64U/UD so far. */
-/* NUM is the cardinality of last matching found. */
-/* Set RINF to largest positive real number */
-/* XSL      RINF = FD05AD(5) */
-    /* Parameter adjustments */
-    --iw4;
-    --iw;
-    --fc;
-    --lenh;
-    --lenl;
-    --len;
-    --w;
-    --iperm;
-    --ip;
-    --a;
-    --irn;
-
-    /* Function Body */
-    rinf = dmach_dist("Overflow");
-/* Compute a first maximum matching from scratch on whole matrix. */
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-	fc[j] = j;
-	iw[j] = 0;
-	len[j] = ip[j + 1] - ip[j];
-/* L20: */
-    }
-/* The first call to MC64U/UD */
-    cnt = 1;
-    mod = 1;
-    *numx = 0;
-    mc64ud_dist(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1], &iw[1],
-		numx, n, &iw4[1], &iw4[*n + 1], &iw4[(*n << 1) + 1],
-		&iw4[*n * 3 + 1]);
-/* IW contains a maximum matching of length NUMX. */
-    num = *numx;
-    if (num != *n) {
-/* Matrix is structurally singular */
-	bmax = rinf;
-    } else {
-/* Matrix is structurally nonsingular, NUM=NUMX=N; */
-/* Set BMAX just above the smallest of all the maximum absolute */
-/* values of the columns */
-	bmax = rinf;
-	i__1 = *n;
-	for (j = 1; j <= i__1; ++j) {
-	    bval = 0.f;
-	    i__2 = ip[j + 1] - 1;
-	    for (k = ip[j]; k <= i__2; ++k) {
-		if (a[k] > bval) {
-		    bval = a[k];
-		}
-/* L25: */
-	    }
-	    if (bval < bmax) {
-		bmax = bval;
-	    }
-/* L30: */
-	}
-	bmax *= 1.001f;
-    }
-/* Initialize BVAL,BMIN */
-    bval = 0.f;
-    bmin = 0.f;
-/* Initialize LENL,LEN,LENH,W,WLEN according to BMAX. */
-/* Set LEN(J), LENH(J) just after last entry in column J. */
-/* Set LENL(J) just after last entry in column J with value ge BMAX. */
-    wlen = 0;
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-	l = ip[j + 1] - ip[j];
-	lenh[j] = l;
-	len[j] = l;
-	i__2 = ip[j + 1] - 1;
-	for (k = ip[j]; k <= i__2; ++k) {
-	    if (a[k] < bmax) {
-		goto L46;
-	    }
-/* L45: */
-	}
-/* Column J is empty or all entries are ge BMAX */
-	k = ip[j + 1];
-L46:
-	lenl[j] = k - ip[j];
-/* Add J to W if LENL(J) ne LENH(J) */
-	if (lenl[j] == l) {
-	    goto L48;
-	}
-	++wlen;
-	w[wlen] = j;
-L48:
-	;
-    }
-/* Main loop */
-    i__1 = *ne;
-    for (idum1 = 1; idum1 <= i__1; ++idum1) {
-	if (num == *numx) {
-/* We have a maximum matching in IW; store IW in IPERM */
-	    i__2 = *n;
-	    for (i__ = 1; i__ <= i__2; ++i__) {
-		iperm[i__] = iw[i__];
-/* L50: */
-	    }
-/* Keep going round this loop until matching IW is no longer maximum. */
-	    i__2 = *ne;
-	    for (idum2 = 1; idum2 <= i__2; ++idum2) {
-		bmin = bval;
-		if (bmax == bmin) {
-		    goto L99;
-		}
-/* Find splitting value BVAL */
-		mc64qd_dist(&ip[1], &lenl[1], &len[1], &w[1], &wlen,
-			    &a[1], &nval, &bval);
-		if (nval <= 1) {
-		    goto L99;
-		}
-/* Set LEN such that all matrix entries with value lt BVAL are */
-/* discarded. Store old LEN in LENH. Do this for all columns W(K). */
-/* Each step, either K is incremented or WLEN is decremented. */
-		k = 1;
-		i__3 = *n;
-		for (idum3 = 1; idum3 <= i__3; ++idum3) {
-		    if (k > wlen) {
-			goto L71;
-		    }
-		    j = w[k];
-		    i__4 = ip[j] + lenl[j];
-		    for (ii = ip[j] + len[j] - 1; ii >= i__4; --ii) {
-			if (a[ii] >= bval) {
-			    goto L60;
-			}
-			i__ = irn[ii];
-			if (iw[i__] != j) {
-			    goto L55;
-			}
-/* Remove entry from matching */
-			iw[i__] = 0;
-			--num;
-			fc[*n - num] = j;
-L55:
-			;
-		    }
-L60:
-		    lenh[j] = len[j];
-/* IP(J)+LEN(J)-1 is last entry in column ge BVAL */
-		    len[j] = ii - ip[j] + 1;
-/* If LENH(J) = LENL(J), remove J from W */
-		    if (lenl[j] == lenh[j]) {
-			w[k] = w[wlen];
-			--wlen;
-		    } else {
-			++k;
-		    }
-/* L70: */
-		}
-L71:
-		if (num < *numx) {
-		    goto L81;
-		}
-/* L80: */
-	    }
-/* End of dummy loop; this point is never reached */
-/* Set mode for next call to MC64U/UD */
-L81:
-	    mod = 1;
-	} else {
-/* We do not have a maximum matching in IW. */
-	    bmax = bval;
-/* BMIN is the bottleneck value of a maximum matching; */
-/* for BMAX the matching is not maximum, so BMAX>BMIN */
-/*          IF (BMAX .EQ. BMIN) GO TO 99 */
-/* Find splitting value BVAL */
-	    mc64qd_dist(&ip[1], &len[1], &lenh[1], &w[1], &wlen, &a[1],
-			&nval, &bval);
-	    if (nval == 0 || bval == bmin) {
-		goto L99;
-	    }
-/* Set LEN such that all matrix entries with value ge BVAL are */
-/* inside matrix. Store old LEN in LENL. Do this for all columns W(K). */
-/* Each step, either K is incremented or WLEN is decremented. */
-	    k = 1;
-	    i__2 = *n;
-	    for (idum3 = 1; idum3 <= i__2; ++idum3) {
-		if (k > wlen) {
-		    goto L88;
-		}
-		j = w[k];
-		i__3 = ip[j] + lenh[j] - 1;
-		for (ii = ip[j] + len[j]; ii <= i__3; ++ii) {
-		    if (a[ii] < bval) {
-			goto L86;
-		    }
-/* L85: */
-		}
-L86:
-		lenl[j] = len[j];
-		len[j] = ii - ip[j];
-		if (lenl[j] == lenh[j]) {
-		    w[k] = w[wlen];
-		    --wlen;
-		} else {
-		    ++k;
-		}
-/* L87: */
-	    }
-/* End of dummy loop; this point is never reached */
-/* Set mode for next call to MC64U/UD */
-L88:
-	    mod = 0;
-	}
-	++cnt;
-	mc64ud_dist(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1],
-		    &iw[1], &num, numx, &iw4[1], &iw4[*n + 1],
-		    &iw4[(*n << 1) + 1], &iw4[*n * 3 + 1]);
-/* IW contains maximum matching of length NUM */
-/* L90: */
-    }
-/* End of dummy loop; this point is never reached */
-/* BMIN is bottleneck value of final matching */
-L99:
-    if (*numx == *n) {
-	goto L1000;
-    }
-/* The matrix is structurally singular, complete IPERM */
-/* W, IW are work arrays */
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-	w[j] = 0;
-/* L300: */
-    }
-    k = 0;
-    i__1 = *n;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	if (iperm[i__] == 0) {
-	    ++k;
-	    iw[k] = i__;
-	} else {
-	    j = iperm[i__];
-	    w[j] = i__;
-	}
-/* L310: */
-    }
-    k = 0;
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-	if (w[j] != 0) {
-	    goto L320;
-	}
-	++k;
-	idum1 = iw[k];
-	iperm[idum1] = j;
-L320:
-	;
-    }
-L1000:
-    return 0;
-} /* mc64sd_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64qd_dist(int_t *ip, int_t *lenl, int_t *lenh, 
-	int_t *w, int_t *wlen, double *a, int_t *nval, double *val)
-{
-    /* System generated locals */
-    int_t i__1, i__2, i__3;
-
-    /* Local variables */
-    int_t j, k, s;
-    double ha;
-    int_t ii, pos;
-    double split[10];
-
-
-/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
-/*     Research Councils                                             *** */
-/* *** Although every effort has been made to ensure robustness and  *** */
-/* *** reliability of the subroutines in this MC64 suite, we         *** */
-/* *** disclaim any liability arising through the use or misuse of   *** */
-/* *** any of the subroutines.                                       *** */
-/* *** Any problems?   Contact ... */
-/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
-
-/* This routine searches for at most XX different numerical values */
-/* in the columns W(1:WLEN). XX>=2. */
-/* Each column J is scanned between IP(J)+LENL(J) and IP(J)+LENH(J)-1 */
-/* until XX values are found or all columns have been considered. */
-/* On output, NVAL is the number of different values that is found */
-/* and SPLIT(1:NVAL) contains the values in decreasing order. */
-/* If NVAL > 0, the routine returns VAL = SPLIT((NVAL+1)/2). */
-
-/* Scan columns in W(1:WLEN). For each encountered value, if value not */
-/* already present in SPLIT(1:NVAL), insert value such that SPLIT */
-/* remains sorted by decreasing value. */
-/* The sorting is done by straightforward insertion; therefore the use */
-/* of this routine should be avoided for large XX (XX < 20). */
-    /* Parameter adjustments */
-    --a;
-    --w;
-    --lenh;
-    --lenl;
-    --ip;
-
-    /* Function Body */
-    *nval = 0;
-    i__1 = *wlen;
-    for (k = 1; k <= i__1; ++k) {
-	j = w[k];
-	i__2 = ip[j] + lenh[j] - 1;
-	for (ii = ip[j] + lenl[j]; ii <= i__2; ++ii) {
-	    ha = a[ii];
-	    if (*nval == 0) {
-		split[0] = ha;
-		*nval = 1;
-	    } else {
-/* Check presence of HA in SPLIT */
-		for (s = *nval; s >= 1; --s) {
-		    if (split[s - 1] == ha) {
-			goto L15;
-		    }
-		    if (split[s - 1] > ha) {
-			pos = s + 1;
-			goto L21;
-		    }
-/* L20: */
-		}
-		pos = 1;
-/* The insertion */
-L21:
-		i__3 = pos;
-		for (s = *nval; s >= i__3; --s) {
-		    split[s] = split[s - 1];
-/* L22: */
-		}
-		split[pos - 1] = ha;
-		++(*nval);
-	    }
-/* Exit loop if XX values are found */
-	    if (*nval == 10) {
-		goto L11;
-	    }
-L15:
-	    ;
-	}
-/* L10: */
-    }
-/* Determine VAL */
-L11:
-    if (*nval > 0) {
-	*val = split[(*nval + 1) / 2 - 1];
-    }
-    return 0;
-} /* mc64qd_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64ud_dist(int_t *id, int_t *mod, int_t *n, int_t *
-	irn, int_t *lirn, int_t *ip, int_t *lenc, int_t *fc, int_t *
-	iperm, int_t *num, int_t *numx, int_t *pr, int_t *arp, 
-	int_t *cv, int_t *out)
-{
-    /* System generated locals */
-    int_t i__1, i__2, i__3, i__4;
-
-    /* Local variables */
-    int_t i__, j, k, j1, ii, kk, id0, id1, in1, in2, nfc, num0, num1, num2, 
-	    jord, last;
-
-
-/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
-/*     Research Councils                                             *** */
-/* *** Although every effort has been made to ensure robustness and  *** */
-/* *** reliability of the subroutines in this MC64 suite, we         *** */
-/* *** disclaim any liability arising through the use or misuse of   *** */
-/* *** any of the subroutines.                                       *** */
-/* *** Any problems?   Contact ... */
-/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
-
-/* PR(J) is the previous column to J in the depth first search. */
-/*   Array PR is used as workspace in the sorting algorithm. */
-/* Elements (I,IPERM(I)) I=1,..,N are entries at the end of the */
-/*   algorithm unless N assignments have not been made in which case */
-/*   N-NUM pairs (I,IPERM(I)) will not be entries in the matrix. */
-/* CV(I) is the most recent loop number (ID+JORD) at which row I */
-/*   was visited. */
-/* ARP(J) is the number of entries in column J which have been scanned */
-/*   when looking for a cheap assignment. */
-/* OUT(J) is one less than the number of entries in column J which have */
-/*   not been scanned during one pass through the main loop. */
-/* NUMX is maximum possible size of matching. */
-    /* Parameter adjustments */
-    --out;
-    --cv;
-    --arp;
-    --pr;
-    --iperm;
-    --fc;
-    --lenc;
-    --ip;
-    --irn;
-
-    /* Function Body */
-    if (*id == 1) {
-/* The first call to MC64U/UD. */
-/* Initialize CV and ARP; parameters MOD, NUMX are not accessed */
-	i__1 = *n;
-	for (i__ = 1; i__ <= i__1; ++i__) {
-	    cv[i__] = 0;
-	    arp[i__] = 0;
-/* L5: */
-	}
-	num1 = *n;
-	num2 = *n;
-    } else {
-/* Not the first call to MC64U/UD. */
-/* Re-initialize ARP if entries were deleted since last call to MC64U/UD */
-	if (*mod == 1) {
-	    i__1 = *n;
-	    for (i__ = 1; i__ <= i__1; ++i__) {
-		arp[i__] = 0;
-/* L8: */
-	    }
-	}
-	num1 = *numx;
-	num2 = *n - *numx;
-    }
-    num0 = *num;
-/* NUM0 is size of input matching */
-/* NUM1 is maximum possible size of matching */
-/* NUM2 is maximum allowed number of unassigned rows/columns */
-/* NUM is size of current matching */
-/* Quick return if possible */
-/*      IF (NUM.EQ.N) GO TO 199 */
-/* NFC is number of rows/columns that could not be assigned */
-    nfc = 0;
-/* Integers ID0+1 to ID0+N are unique numbers for call ID to MC64U/UD, */
-/* so 1st call uses 1..N, 2nd call uses N+1..2N, etc */
-    id0 = (*id - 1) * *n;
-/* Main loop. Each pass round this loop either results in a new */
-/* assignment or gives a column with no assignment */
-    i__1 = *n;
-    for (jord = num0 + 1; jord <= i__1; ++jord) {
-/* Each pass uses unique number ID1 */
-	id1 = id0 + jord;
-/* J is unmatched column */
-	j = fc[jord - num0];
-	pr[j] = -1;
-	i__2 = jord;
-	for (k = 1; k <= i__2; ++k) {
-/* Look for a cheap assignment */
-	    if (arp[j] >= lenc[j]) {
-		goto L30;
-	    }
-	    in1 = ip[j] + arp[j];
-	    in2 = ip[j] + lenc[j] - 1;
-	    i__3 = in2;
-	    for (ii = in1; ii <= i__3; ++ii) {
-		i__ = irn[ii];
-		if (iperm[i__] == 0) {
-		    goto L80;
-		}
-/* L20: */
-	    }
-/* No cheap assignment in row */
-	    arp[j] = lenc[j];
-/* Begin looking for assignment chain starting with row J */
-L30:
-	    out[j] = lenc[j] - 1;
-/* Inner loop.  Extends chain by one or backtracks */
-	    i__3 = jord;
-	    for (kk = 1; kk <= i__3; ++kk) {
-		in1 = out[j];
-		if (in1 < 0) {
-		    goto L50;
-		}
-		in2 = ip[j] + lenc[j] - 1;
-		in1 = in2 - in1;
-/* Forward scan */
-		i__4 = in2;
-		for (ii = in1; ii <= i__4; ++ii) {
-		    i__ = irn[ii];
-		    if (cv[i__] == id1) {
-			goto L40;
-		    }
-/* Column J has not yet been accessed during this pass */
-		    j1 = j;
-		    j = iperm[i__];
-		    cv[i__] = id1;
-		    pr[j] = j1;
-		    out[j1] = in2 - ii - 1;
-		    goto L70;
-L40:
-		    ;
-		}
-/* Backtracking step. */
-L50:
-		j1 = pr[j];
-		if (j1 == -1) {
-/* No augmenting path exists for column J. */
-		    ++nfc;
-		    fc[nfc] = j;
-		    if (nfc > num2) {
-/* A matching of maximum size NUM1 is not possible */
-			last = jord;
-			goto L101;
-		    }
-		    goto L100;
-		}
-		j = j1;
-/* L60: */
-	    }
-/* End of dummy loop; this point is never reached */
-L70:
-	    ;
-	}
-/* End of dummy loop; this point is never reached */
-/* New assignment is made. */
-L80:
-	iperm[i__] = j;
-	arp[j] = ii - ip[j] + 1;
-	++(*num);
-	i__2 = jord;
-	for (k = 1; k <= i__2; ++k) {
-	    j = pr[j];
-	    if (j == -1) {
-		goto L95;
-	    }
-	    ii = ip[j] + lenc[j] - out[j] - 2;
-	    i__ = irn[ii];
-	    iperm[i__] = j;
-/* L90: */
-	}
-/* End of dummy loop; this point is never reached */
-L95:
-	if (*num == num1) {
-/* A matching of maximum size NUM1 is found */
-	    last = jord;
-	    goto L101;
-	}
-
-L100:
-	;
-    }
-/* All unassigned columns have been considered */
-    last = *n;
-/* Now, a transversal is computed or is not possible. */
-/* Complete FC before returning. */
-L101:
-    i__1 = *n;
-    for (jord = last + 1; jord <= i__1; ++jord) {
-	++nfc;
-	fc[nfc] = fc[jord - num0];
-/* L110: */
-    }
-/*  199 RETURN */
-    return 0;
-} /* mc64ud_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64wd_dist(int_t *n, int_t *ne, int_t *ip, int_t *
-	irn, double *a, int_t *iperm, int_t *num, int_t *jperm, 
-	int_t *out, int_t *pr, int_t *q, int_t *l, double *u, 
-	double *d__)
-{
-    /* System generated locals */
-    int_t i__1, i__2, i__3;
-
-    /* Local variables */
-    int_t i__, j, k, i0, k0, k1, k2, q0;
-    double di;
-    int_t ii, jj, kk;
-    double vj;
-    int_t up;
-    double dq0;
-    int_t kk1, kk2;
-    double csp;
-    int_t isp, jsp, low;
-    double dmin__, dnew;
-    int_t jord, qlen, jdum;
-    double rinf;
-    extern /* Subroutine */ int_t mc64dd_dist(int_t *, int_t *, int_t *, 
-	    double *, int_t *, int_t *), mc64ed_dist(int_t *, int_t *,
-	     int_t *, double *, int_t *, int_t *), mc64fd_dist(int_t *
-	    , int_t *, int_t *, int_t *, double *, int_t *, 
-	    int_t *);
-
-
-/* *** Copyright (c) 1999  Council for the Central Laboratory of the */
-/*     Research Councils                                             *** */
-/* *** Although every effort has been made to ensure robustness and  *** */
-/* *** reliability of the subroutines in this MC64 suite, we         *** */
-/* *** disclaim any liability arising through the use or misuse of   *** */
-/* *** any of the subroutines.                                       *** */
-/* *** Any problems?   Contact ... */
-/*     Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no)   *** */
-
-/* N, NE, IP, IRN are described in MC64A/AD. */
-/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */
-/*   A(K), K=1..NE, must be set to the value of the entry that */
-/*   corresponds to IRN(K). It is not altered. */
-/*   All values A(K) must be non-negative. */
-/* IPERM is an INT_T array of length N. On exit, it contains the */
-/*   weighted matching: IPERM(I) = 0 or row I is matched to column */
-/*   IPERM(I). */
-/* NUM is an INT_T variable. On exit, it contains the cardinality of */
-/*   the matching stored in IPERM. */
-/* IW is an INT_T work array of length 5N. */
-/* DW is a REAL (DOUBLE PRECISION in the D-version) array of length 2N. */
-/*   On exit, U = D(1:N) contains the dual row variable and */
-/*   V = D(N+1:2N) contains the dual column variable. If the matrix */
-/*   is structurally nonsingular (NUM = N), the following holds: */
-/*      U(I)+V(J) <= A(I,J)  if IPERM(I) |= J */
-/*      U(I)+V(J)  = A(I,J)  if IPERM(I)  = J */
-/*      U(I) = 0  if IPERM(I) = 0 */
-/*      V(J) = 0  if there is no I for which IPERM(I) = J */
-/* Local variables */
-/* Local parameters */
-/* External subroutines and/or functions */
-/*      EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD */
-/*      DOUBLE PRECISION FD05AD */
-/* Set RINF to largest positive real number */
-/* XSL      RINF = FD05AD(5) */
-    /* Parameter adjustments */
-    --d__;
-    --u;
-    --l;
-    --q;
-    --pr;
-    --out;
-    --jperm;
-    --iperm;
-    --ip;
-    --a;
-    --irn;
-
-    /* Function Body */
-    rinf = dmach_dist("Overflow");
-/* Initialization */
-    *num = 0;
-    i__1 = *n;
-    for (k = 1; k <= i__1; ++k) {
-	u[k] = rinf;
-	d__[k] = 0.;
-	iperm[k] = 0;
-	jperm[k] = 0;
-	pr[k] = ip[k];
-	l[k] = 0;
-/* L10: */
-    }
-/* Initialize U(I) */
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-	i__2 = ip[j + 1] - 1;
-	for (k = ip[j]; k <= i__2; ++k) {
-	    i__ = irn[k];
-	    if (a[k] > u[i__]) {
-		goto L20;
-	    }
-	    u[i__] = a[k];
-	    iperm[i__] = j;
-	    l[i__] = k;
-L20:
-	    ;
-	}
-/* L30: */
-    }
-    i__1 = *n;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	j = iperm[i__];
-	if (j == 0) {
-	    goto L40;
-	}
-/* Row I is not empty */
-	iperm[i__] = 0;
-	if (jperm[j] != 0) {
-	    goto L40;
-	}
-/* Assignment of column J to row I */
-	++(*num);
-	iperm[i__] = j;
-	jperm[j] = l[i__];
-L40:
-	;
-    }
-    if (*num == *n) {
-	goto L1000;
-    }
-/* Scan unassigned columns; improve assignment */
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-/* JPERM(J) ne 0 iff column J is already assigned */
-	if (jperm[j] != 0) {
-	    goto L95;
-	}
-	k1 = ip[j];
-	k2 = ip[j + 1] - 1;
-/* Continue only if column J is not empty */
-	if (k1 > k2) {
-	    goto L95;
-	}
-	vj = rinf;
-	i__2 = k2;
-	for (k = k1; k <= i__2; ++k) {
-	    i__ = irn[k];
-	    di = a[k] - u[i__];
-	    if (di > vj) {
-		goto L50;
-	    }
-	    if (di < vj || di == rinf) {
-		goto L55;
-	    }
-	    if (iperm[i__] != 0 || iperm[i0] == 0) {
-		goto L50;
-	    }
-L55:
-	    vj = di;
-	    i0 = i__;
-	    k0 = k;
-L50:
-	    ;
-	}
-	d__[j] = vj;
-	k = k0;
-	i__ = i0;
-	if (iperm[i__] == 0) {
-	    goto L90;
-	}
-	i__2 = k2;
-	for (k = k0; k <= i__2; ++k) {
-	    i__ = irn[k];
-	    if (a[k] - u[i__] > vj) {
-		goto L60;
-	    }
-	    jj = iperm[i__];
-/* Scan remaining part of assigned column JJ */
-	    kk1 = pr[jj];
-	    kk2 = ip[jj + 1] - 1;
-	    if (kk1 > kk2) {
-		goto L60;
-	    }
-	    i__3 = kk2;
-	    for (kk = kk1; kk <= i__3; ++kk) {
-		ii = irn[kk];
-		if (iperm[ii] > 0) {
-		    goto L70;
-		}
-		if (a[kk] - u[ii] <= d__[jj]) {
-		    goto L80;
-		}
-L70:
-		;
-	    }
-	    pr[jj] = kk2 + 1;
-L60:
-	    ;
-	}
-	goto L95;
-L80:
-	jperm[jj] = kk;
-	iperm[ii] = jj;
-	pr[jj] = kk + 1;
-L90:
-	++(*num);
-	jperm[j] = k;
-	iperm[i__] = j;
-	pr[j] = k + 1;
-L95:
-	;
-    }
-    if (*num == *n) {
-	goto L1000;
-    }
-/* Prepare for main loop */
-    i__1 = *n;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	d__[i__] = rinf;
-	l[i__] = 0;
-/* L99: */
-    }
-/* Main loop ... each pass round this loop is similar to Dijkstra's */
-/* algorithm for solving the single source shortest path problem */
-    i__1 = *n;
-    for (jord = 1; jord <= i__1; ++jord) {
-	if (jperm[jord] != 0) {
-	    goto L100;
-	}
-/* JORD is next unmatched column */
-/* DMIN is the length of shortest path in the tree */
-	dmin__ = rinf;
-	qlen = 0;
-	low = *n + 1;
-	up = *n + 1;
-/* CSP is the cost of the shortest augmenting path to unassigned row */
-/* IRN(ISP). The corresponding column index is JSP. */
-	csp = rinf;
-/* Build shortest path tree starting from unassigned column (root) JORD */
-	j = jord;
-	pr[j] = -1;
-/* Scan column J */
-	i__2 = ip[j + 1] - 1;
-	for (k = ip[j]; k <= i__2; ++k) {
-	    i__ = irn[k];
-	    dnew = a[k] - u[i__];
-	    if (dnew >= csp) {
-		goto L115;
-	    }
-	    if (iperm[i__] == 0) {
-		csp = dnew;
-		isp = k;
-		jsp = j;
-	    } else {
-		if (dnew < dmin__) {
-		    dmin__ = dnew;
-		}
-		d__[i__] = dnew;
-		++qlen;
-		q[qlen] = k;
-	    }
-L115:
-	    ;
-	}
-/* Initialize heap Q and Q2 with rows held in Q(1:QLEN) */
-	q0 = qlen;
-	qlen = 0;
-	i__2 = q0;
-	for (kk = 1; kk <= i__2; ++kk) {
-	    k = q[kk];
-	    i__ = irn[k];
-	    if (csp <= d__[i__]) {
-		d__[i__] = rinf;
-		goto L120;
-	    }
-	    if (d__[i__] <= dmin__) {
-		--low;
-		q[low] = i__;
-		l[i__] = low;
-	    } else {
-		++qlen;
-		l[i__] = qlen;
-		mc64dd_dist(&i__, n, &q[1], &d__[1], &l[1], &c__2);
-	    }
-/* Update tree */
-	    jj = iperm[i__];
-	    out[jj] = k;
-	    pr[jj] = j;
-L120:
-	    ;
-	}
-	i__2 = *num;
-	for (jdum = 1; jdum <= i__2; ++jdum) {
-/* If Q2 is empty, extract rows from Q */
-	    if (low == up) {
-		if (qlen == 0) {
-		    goto L160;
-		}
-		i__ = q[1];
-		if (d__[i__] >= csp) {
-		    goto L160;
-		}
-		dmin__ = d__[i__];
-L152:
-		mc64ed_dist(&qlen, n, &q[1], &d__[1], &l[1], &c__2);
-		--low;
-		q[low] = i__;
-		l[i__] = low;
-		if (qlen == 0) {
-		    goto L153;
-		}
-		i__ = q[1];
-		if (d__[i__] > dmin__) {
-		    goto L153;
-		}
-		goto L152;
-	    }
-/* Q0 is row whose distance D(Q0) to the root is smallest */
-L153:
-	    q0 = q[up - 1];
-	    dq0 = d__[q0];
-/* Exit loop if path to Q0 is longer than the shortest augmenting path */
-	    if (dq0 >= csp) {
-		goto L160;
-	    }
-	    --up;
-/* Scan column that matches with row Q0 */
-	    j = iperm[q0];
-	    vj = dq0 - a[jperm[j]] + u[q0];
-	    i__3 = ip[j + 1] - 1;
-	    for (k = ip[j]; k <= i__3; ++k) {
-		i__ = irn[k];
-		if (l[i__] >= up) {
-		    goto L155;
-		}
-/* DNEW is new cost */
-		dnew = vj + a[k] - u[i__];
-/* Do not update D(I) if DNEW ge cost of shortest path */
-		if (dnew >= csp) {
-		    goto L155;
-		}
-		if (iperm[i__] == 0) {
-/* Row I is unmatched; update shortest path info */
-		    csp = dnew;
-		    isp = k;
-		    jsp = j;
-		} else {
-/* Row I is matched; do not update D(I) if DNEW is larger */
-		    di = d__[i__];
-		    if (di <= dnew) {
-			goto L155;
-		    }
-		    if (l[i__] >= low) {
-			goto L155;
-		    }
-		    d__[i__] = dnew;
-		    if (dnew <= dmin__) {
-			if (l[i__] != 0) {
-			    mc64fd_dist(&l[i__], &qlen, n, &q[1], &d__[1], &l[1], 
-				    &c__2);
-			}
-			--low;
-			q[low] = i__;
-			l[i__] = low;
-		    } else {
-			if (l[i__] == 0) {
-			    ++qlen;
-			    l[i__] = qlen;
-			}
-			mc64dd_dist(&i__, n, &q[1], &d__[1], &l[1], &c__2);
-		    }
-/* Update tree */
-		    jj = iperm[i__];
-		    out[jj] = k;
-		    pr[jj] = j;
-		}
-L155:
-		;
-	    }
-/* L150: */
-	}
-/* If CSP = RINF, no augmenting path is found */
-L160:
-	if (csp == rinf) {
-	    goto L190;
-	}
-/* Find augmenting path by tracing backward in PR; update IPERM,JPERM */
-	++(*num);
-	i__ = irn[isp];
-	iperm[i__] = jsp;
-	jperm[jsp] = isp;
-	j = jsp;
-	i__2 = *num;
-	for (jdum = 1; jdum <= i__2; ++jdum) {
-	    jj = pr[j];
-	    if (jj == -1) {
-		goto L180;
-	    }
-	    k = out[j];
-	    i__ = irn[k];
-	    iperm[i__] = jj;
-	    jperm[jj] = k;
-	    j = jj;
-/* L170: */
-	}
-/* End of dummy loop; this point is never reached */
-/* Update U for rows in Q(UP:N) */
-L180:
-	i__2 = *n;
-	for (kk = up; kk <= i__2; ++kk) {
-	    i__ = q[kk];
-	    u[i__] = u[i__] + d__[i__] - csp;
-/* L185: */
-	}
-L190:
-	i__2 = *n;
-	for (kk = low; kk <= i__2; ++kk) {
-	    i__ = q[kk];
-	    d__[i__] = rinf;
-	    l[i__] = 0;
-/* L191: */
-	}
-	i__2 = qlen;
-	for (kk = 1; kk <= i__2; ++kk) {
-	    i__ = q[kk];
-	    d__[i__] = rinf;
-	    l[i__] = 0;
-/* L193: */
-	}
-L100:
-	;
-    }
-/* End of main loop */
-/* Set dual column variable in D(1:N) */
-L1000:
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-	k = jperm[j];
-	if (k != 0) {
-	    d__[j] = a[k] - u[irn[k]];
-	} else {
-	    d__[j] = 0.;
-	}
-	if (iperm[j] == 0) {
-	    u[j] = 0.;
-	}
-/* L200: */
-    }
-    if (*num == *n) {
-	goto L1100;
-    }
-/* The matrix is structurally singular, complete IPERM. */
-/* JPERM, OUT are work arrays */
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-	jperm[j] = 0;
-/* L300: */
-    }
-    k = 0;
-    i__1 = *n;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	if (iperm[i__] == 0) {
-	    ++k;
-	    out[k] = i__;
-	} else {
-	    j = iperm[i__];
-	    jperm[j] = i__;
-	}
-/* L310: */
-    }
-    k = 0;
-    i__1 = *n;
-    for (j = 1; j <= i__1; ++j) {
-	if (jperm[j] != 0) {
-	    goto L320;
-	}
-	++k;
-	jdum = out[k];
-	iperm[jdum] = j;
-L320:
-	;
-    }
-L1100:
-    return 0;
-} /* mc64wd_ */
-
-
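
The mc64*_dist routines removed above are the C (f2c-style) translation of the
HSL MC64 matching code by Duff and Koster.  The MC64W/WD header comments
describe the certificate returned with a successful weighted matching: dual
variables U and V with U(I)+V(J) <= A(I,J) for every entry and equality on the
matched entries.  A minimal sketch of checking that property on a small dense
matrix (hypothetical helper; the matrix, matching and duals below are made up
for illustration only):

    #include <math.h>
    #include <stdio.h>

    /* Verify the dual certificate described in the MC64W/WD comments above.
     * a[i*lda+j] >= 0 is a dense cost matrix, iperm[i] is the column matched
     * to row i (-1 if unmatched), u[]/v[] are the dual row/column variables. */
    static int check_mc64_duals(int n, const double *a, int lda,
                                const int *iperm, const double *u, const double *v)
    {
        for (int i = 0; i < n; ++i)
            for (int j = 0; j < n; ++j) {
                double slack = a[i*lda + j] - u[i] - v[j];
                if (slack < -1e-12) return 0;             /* U(I)+V(J) <= A(I,J) violated */
                if (iperm[i] == j && fabs(slack) > 1e-12)
                    return 0;                             /* equality required on the matching */
            }
        return 1;
    }

    int main(void)
    {
        /* Made-up 2 x 2 example: the duals certify the matching (0,0),(1,1). */
        double a[]  = { 1.0, 4.0,
                        3.0, 2.0 };
        int iperm[] = { 0, 1 };
        double u[]  = { 1.0, 2.0 };
        double v[]  = { 0.0, 0.0 };
        printf("dual certificate %s\n",
               check_mc64_duals(2, a, 2, iperm, u, v) ? "ok" : "violated");
        return 0;
    }
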
diff --git a/SRC/memory.c b/SRC/memory.c
index fd54862..4846242 100644
--- a/SRC/memory.c
+++ b/SRC/memory.c
@@ -12,9 +12,12 @@ at the top-level directory.
  * \brief Memory utilities
  *
  * <pre>
- * -- Distributed SuperLU routine (version 1.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * September 1, 1999
+ * 
+ * Modified:
+ *   September 30, 2017, add aligned malloc for Intel
  * </pre>
  */
 
@@ -112,17 +115,31 @@ void superlu_free_dist(void *addr)
 
 #else  /* The production mode. */
 
-void *superlu_malloc_dist(size_t size)
-{
+#if defined (__INTEL_COMPILER)
+#include <immintrin.h>
+void * superlu_malloc_dist(size_t size) {
+    void* ptr;
+    int alignment = 1<<12; // align at 4K page
+    if (size > 1<<19 ) { alignment=1<<21; }
+    return (_mm_malloc(size, alignment));
+}
+void  superlu_free_dist(void * ptr)  { _mm_free(ptr); }
+
+// #elif (_POSIX_C_SOURCE>=200112L)
+//
+// void * MALLOC(size_t size) {void* ptr;int alignment=1<<12;if(size>1<<19){alignment=1<<21;}posix_memalign( (void**)&(ptr), alignment, size );return(ptr);}
+//void   FREE(void * ptr)    {free(ptr);}
+
+#else // normal malloc/free 
+
+void *superlu_malloc_dist(size_t size) {
     void *buf;
     buf = (void *) malloc(size);
     return (buf);
 }
+void superlu_free_dist(void *addr) { free (addr); }
 
-void superlu_free_dist(void *addr)
-{
-    free (addr);
-}
+#endif
 
 #endif  /* End debug malloc/free. */
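
The new __INTEL_COMPILER branch above chooses the allocation alignment from the
request size: 4 KiB page alignment for small blocks and 2 MiB alignment
(huge-page friendly) for anything larger than 512 KiB, via _mm_malloc/_mm_free.
The commented-out posix_memalign variant expresses the same idea portably; a
minimal sketch of it (hypothetical helper names, assuming a POSIX system):

    #define _POSIX_C_SOURCE 200112L
    #include <stdlib.h>

    /* Sketch only, not the SuperLU_DIST API: size-dependent alignment as in
     * the commented-out posix_memalign branch above. */
    void *aligned_malloc_sketch(size_t size)
    {
        void  *ptr = NULL;
        size_t alignment = (size_t) 1 << 12;      /* 4 KiB page alignment */
        if (size > ((size_t) 1 << 19))            /* > 512 KiB: 2 MiB alignment */
            alignment = (size_t) 1 << 21;
        if (posix_memalign(&ptr, alignment, size) != 0)
            return NULL;                          /* allocation failed */
        return ptr;
    }

    void aligned_free_sketch(void *ptr) { free(ptr); }
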
 
diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c
index dc1bff5..080cd60 100644
--- a/SRC/pdgssvx.c
+++ b/SRC/pdgssvx.c
@@ -650,8 +650,10 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
     }
 
     /* ------------------------------------------------------------
-       Diagonal scaling to equilibrate the matrix. (simple scheme)
-       ------------------------------------------------------------*/
+     * Diagonal scaling to equilibrate the matrix. (simple scheme)
+     *   for row i = 1:n,  A(i,:) <- A(i,:) / max(abs(A(i,:)));
+     *   for column j = 1:n,  A(:,j) <- A(:,j) / max(abs(A(:,j)));
+     * ------------------------------------------------------------*/
     if ( Equil ) {
 #if ( DEBUGlevel>=1 )
 	CHECK_MALLOC(iam, "Enter equil");
@@ -727,7 +729,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 #if ( PRNTlevel>=1 )
 	    if ( !iam ) {
 		printf(".. equilibrated? *equed = %c\n", *equed);
-		/*fflush(stdout);*/
+		fflush(stdout);
 	    }
 #endif
 	} /* end if Fact ... */
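
The expanded comment above spells out the simple equilibration scheme: every
row is divided by its largest absolute entry, then every column by its largest
absolute entry, driving all row and column maxima to 1.  A minimal dense sketch
of the same scheme (hypothetical helper; pdgssvx itself operates on the
distributed compressed-row structure):

    #include <math.h>

    /* Simple equilibration on a dense n x n matrix stored row-major in a[].
     * R[i] and C[j] receive the row/column scale factors (reciprocals of the
     * row/column maxima). */
    static void equilibrate_dense(int n, double *a, double *R, double *C)
    {
        for (int i = 0; i < n; ++i) {             /* row scaling */
            double rmax = 0.0;
            for (int j = 0; j < n; ++j)
                if (fabs(a[i*n + j]) > rmax) rmax = fabs(a[i*n + j]);
            R[i] = (rmax > 0.0) ? 1.0 / rmax : 1.0;
            for (int j = 0; j < n; ++j) a[i*n + j] *= R[i];
        }
        for (int j = 0; j < n; ++j) {             /* column scaling */
            double cmax = 0.0;
            for (int i = 0; i < n; ++i)
                if (fabs(a[i*n + j]) > cmax) cmax = fabs(a[i*n + j]);
            C[j] = (cmax > 0.0) ? 1.0 / cmax : 1.0;
            for (int i = 0; i < n; ++i) a[i*n + j] *= C[j];
        }
    }
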
@@ -896,8 +898,10 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	        t = SuperLU_timer_() - t;
 	        stat->utime[ROWPERM] = t;
 #if ( PRNTlevel>=1 )
-                if ( !iam ) printf(".. LDPERM job " IFMT "\t time: %.2f\n",
-	                            job, t);
+                if ( !iam ) {
+		    printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t);
+		    fflush(stdout);
+		}
 #endif
             } /* end if Fact ... */
 
@@ -916,7 +920,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	else *(unsigned char *)norm = 'I';
 	anorm = pdlangs(norm, A, grid);
 #if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. anorm %e\n", anorm);
+	if ( !iam ) { printf(".. anorm %e\n", anorm); 	fflush(stdout); }
 #endif
     }
 
@@ -1020,9 +1024,11 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	        /* Perform a symbolic factorization on Pc*Pr*A*Pc^T and set up
                    the nonzero data structures for L & U. */
 #if ( PRNTlevel>=1 ) 
-                if ( !iam )
-		  printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
+                if ( !iam ) {
+		    printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
 		          sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
+		    fflush(stdout);
+	        }
 #endif
   	        t = SuperLU_timer_();
 	        if ( !(Glu_freeable = (Glu_freeable_t *)
@@ -1048,6 +1054,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 			   	symb_mem_usage.for_lu*1e-6, 
 			   	symb_mem_usage.total*1e-6,
 			   	symb_mem_usage.expansions);
+			fflush(stdout);
 		    }
 #endif
 	    	} else { /* symbfact out of memory */
@@ -1216,6 +1223,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 		       avg / grid->nprow / grid->npcol * 1e-6,
 		       max * 1e-6);
 		printf("**************************************************\n");
+		fflush(stdout);
             }
 	} /* end printing stats */
     
diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c
index 00aaeba..98bdd7e 100644
--- a/SRC/pdgstrf.c
+++ b/SRC/pdgstrf.c
@@ -14,7 +14,7 @@ at the top-level directory.
  * \brief Performs LU factorization in parallel
  *
  * <pre>
- * -- Distributed SuperLU routine (version 4.3) --
+ * -- Distributed SuperLU routine (version 5.2) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 1, 2014
  *
@@ -25,7 +25,8 @@ at the top-level directory.
  *     July    12, 2011  static scheduling and arbitrary look-ahead
  *     March   13, 2013  change NTAGS to MPI_TAG_UB value
  *     September 24, 2015 replace xLAMCH by xMACH, using C99 standard.
- *     December 31, 2015 rename xMACH to xMACH_DIST
+ *     December 31, 2015 rename xMACH to xMACH_DIST.
+ *     September 30, 2017 optimization for Intel Knights Landing (KNL) node.
  *
  * Sketch of the algorithm 
  *
@@ -139,6 +140,14 @@ at the top-level directory.
 */
 #define PHI_FRAMEWORK
 
+#if 0
+#define CACHELINE 64  /* bytes, Xeon Phi KNL */
+#else
+#define CACHELINE 0  /* do not worry about false sharing among threads */
+#endif
+//#define GEMM_PADLEN 1
+#define GEMM_PADLEN 8
+
 #define PDGSTRF2 pdgstrf2_trsm
 #define PDGSTRS2 pdgstrs2_omp
 
@@ -275,7 +284,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     int_t *iuip, *ruip; /* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
     double *ucol;
     int *indirect, *indirect2;
-    double *tempv, *tempv2d;
+    int_t *tempi;
+    double *tempu, *tempv, *tempr;
+    /*    double *tempv2d, *tempU2d;  Sherry */
     int iinfo;
     int *ToRecv, *ToSendD, **ToSendR;
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
@@ -283,8 +294,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     superlu_scope_t *scp;
     float s_eps;
     double thresh;
-    double *tempU2d, *tempu;
-    int full, ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks;
+    /*int full;*/
+    int ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks;
     int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows,
         *Lblock, *Lrows, *perm_u, *sf_block, *sf_block_l, *nnodes_l,
         *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno;
@@ -298,10 +309,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 		    *     2 : transferred in Usub_buf[]
 		    *     3 : transferred in Uval_buf[]
 		    */
-    int **msgcnts, **msgcntsU; /* counts for each panel in the
-                                  look-ahead window */
-    int *factored;  /* factored[j]==0 : L col panel j is factorized */
-    int *factoredU; /* factoredU[i]==1 : U row panel i is factorized */
+    int **msgcnts, **msgcntsU; /* counts in the look-ahead window */
+    int *factored;  /* factored[j] == 0 : L col panel j is factorized. */
+    int *factoredU; /* factoredU[i] == 1 : U row panel i is factorized. */
     int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows;
     etree_node *head, *tail, *ptr;
     int *num_child;
@@ -314,16 +324,19 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     void *attr_val;
     int flag;
 
+    /* The following variables are used to pad GEMM dimensions so that
+       each is a multiple of vector length (8 doubles for KNL)  */
+    int gemm_m_pad = GEMM_PADLEN, gemm_k_pad = GEMM_PADLEN,
+        gemm_n_pad = GEMM_PADLEN;
+    int gemm_padding = 0;
+
     int iword = sizeof (int_t);
     int dword = sizeof (double);
 
-    /* For measuring load imbalence in omp threads*/
+    /* For measuring load imbalance in OpenMP threads */
     double omp_load_imblc = 0.0;
     double *omp_loop_time;
 
-    double CPUOffloadTimer      = 0;
-    double CPUOffloadFlop       = 0;
-    double CPUOffloadMop        = 0;
     double schur_flop_timer     = 0.0;
     double pdgstrf2_timer       = 0.0;
     double pdgstrs2_timer       = 0.0;
@@ -331,8 +344,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     double InitTimer            = 0.0; /* including compute schedule, malloc */
     double tt_start, tt_end;
 
-#if !defined( GPU_ACC )
-    /* Counter for couting memory operations */
+/* #if !defined( GPU_ACC ) */
+    /* Counters for memory operations and timings */
     double scatter_mem_op_counter  = 0.0;
     double scatter_mem_op_timer    = 0.0;
     double scatterL_mem_op_counter = 0.0;
@@ -340,6 +353,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     double scatterU_mem_op_counter = 0.0;
     double scatterU_mem_op_timer   = 0.0;
 
+    /* Counters for flops/gather/scatter and timings */
     double GatherLTimer            = 0.0;
     double LookAheadRowSepMOP      = 0.0;
     double GatherUTimer             = 0.0;
@@ -349,10 +363,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     double LookAheadScatterTimer   = 0.0;
     double LookAheadScatterMOP     = 0.0;
     double RemainGEMMTimer         = 0.0;
+    double RemainGEMM_flops        = 0.0;
     double RemainScatterTimer      = 0.0;
     double NetSchurUpTimer         = 0.0;
     double schur_flop_counter      = 0.0;
-#endif
+/* #endif */
 
 #if ( PRNTlevel>= 1)
     /* count GEMM max dimensions */
@@ -368,6 +383,15 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #if ( PROFlevel>=1 )
     double t1, t2;
     float msg_vol = 0, msg_cnt = 0;
+    double comm_wait_time = 0.0;
+    /* Record GEMM dimensions and times */
+    FILE *fopen(), *fgemm;
+    int gemm_count = 0;
+    typedef struct {
+	int m, n, k;
+	double microseconds;
+    } gemm_profile;
+    gemm_profile *gemm_stats;
 #endif
 
     /* Test the input parameters. */
@@ -383,6 +407,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
     /* Quick return if possible. */
     if (m == 0 || n == 0) return 0;
+
+    double tt1 = SuperLU_timer_ ();
  
     /* 
      * Initialization.  
@@ -405,8 +431,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     int tag_ub = *(int *) attr_val;
 
 #if ( PRNTlevel>=1 )
-    if (!iam)
-        printf ("MPI tag upper bound = %d\n", tag_ub);
+    if (!iam) {
+        printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout);
+    }
 #endif
 
 #if ( DEBUGlevel>=1 )
@@ -414,6 +441,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         printf (" ***** warning s_eps = %e *****\n", s_eps);
     CHECK_MALLOC (iam, "Enter pdgstrf()");
 #endif
+#if (PROFlevel >= 1 )
+    gemm_stats = (gemm_profile *) SUPERLU_MALLOC(nsupers * sizeof(gemm_profile));
+    if (iam == 0) fgemm = fopen("dgemm_mnk.dat", "w");
+    int *prof_sendR = intCalloc_dist(nsupers);
+#endif
 
     stat->ops[FACT]      = 0.0;
     stat->current_buffer = 0.0;
@@ -435,29 +467,37 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         if (i != 0) {
             if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * ((size_t) i))) )
                 ABORT ("Malloc fails for Lsub_buf.");
+	    tempi = Llu->Lsub_buf_2[0];
             for (jj = 0; jj < num_look_aheads; jj++)
-                Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i;
+		Llu->Lsub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */
+	    //Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i;
         }
         i = Llu->bufmax[1];
         if (i != 0) {
             if (!(Llu->Lval_buf_2[0] = doubleMalloc_dist ((num_look_aheads + 1) * ((size_t) i))))
                 ABORT ("Malloc fails for Lval_buf[].");
+	    tempr = Llu->Lval_buf_2[0];
             for (jj = 0; jj < num_look_aheads; jj++)
-                Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i;
+		Llu->Lval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */
+	    //Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i;
         }
         i = Llu->bufmax[2];
         if (i != 0) {
             if (!(Llu->Usub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * i)))
                 ABORT ("Malloc fails for Usub_buf_2[].");
+	    tempi = Llu->Usub_buf_2[0];
             for (jj = 0; jj < num_look_aheads; jj++)
-                Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i;
+                Llu->Usub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */
+                //Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i;
         }
         i = Llu->bufmax[3];
         if (i != 0) {
             if (!(Llu->Uval_buf_2[0] = doubleMalloc_dist ((num_look_aheads + 1) * i)))
                 ABORT ("Malloc fails for Uval_buf_2[].");
+	    tempr = Llu->Uval_buf_2[0];
             for (jj = 0; jj < num_look_aheads; jj++)
-                Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i;
+                Llu->Uval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */
+	    //Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i;
         }
     }
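
The reworked loops above first save the base pointer of the single
(num_look_aheads + 1) * i allocation and then compute each look-ahead window as
base + i*(jj+1), so every iteration is independent and the loop can be
vectorized, instead of chaining each pointer off the previous one.  A generic
sketch of that carving pattern (hypothetical names):

    #include <stdlib.h>

    /* Carve one contiguous allocation into nbuf equal slices of bufsize
     * entries each, as done for Lsub_buf_2[]/Lval_buf_2[] above.  The caller
     * supplies the slice[] pointer array of length nbuf. */
    static int carve_buffers(double **slice, int nbuf, size_t bufsize)
    {
        double *base = malloc((size_t) nbuf * bufsize * sizeof(double));
        if (!base) return -1;
        slice[0] = base;
        for (int j = 1; j < nbuf; ++j)
            slice[j] = base + (size_t) j * bufsize;   /* independent iterations */
        return 0;
    }
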
 
@@ -519,15 +559,16 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     if (!(factoredU = SUPERLU_MALLOC (nsupers * sizeof (int_t))))
         ABORT ("Malloc fails for factoredU[].");
     for (i = 0; i < nsupers; i++) factored[i] = factoredU[i] = -1;
+
     log_memory(2 * nsupers * iword, stat);
 
     int num_threads = 1;
 #ifdef _OPENMP
 #pragma omp parallel default(shared)
+    #pragma omp master
     {
-        if (omp_get_thread_num () == 0) {
-            num_threads = omp_get_num_threads ();
-        }
+         //if (omp_get_thread_num () == 0)
+        num_threads = omp_get_num_threads ();
     }
 #endif
 
@@ -538,9 +579,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #endif
 
 #if ( PRNTlevel>=1 )
-    if(!iam) printf(".. Starting with %d OpenMP threads \n", num_threads );
+    if(!iam) {
+       printf(".. Starting with %d OpenMP threads \n", num_threads );
+       fflush(stdout);
+    }
 #endif
-    double tt1 = SuperLU_timer_ ();
 
     nblocks = 0;
     ncb = nsupers / Pc; /* number of column blocks, horizontal */
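
The hunk above replaces the per-thread test omp_get_thread_num() == 0 with an
omp master construct inside the parallel region, so the team size is recorded
once by the master thread.  A minimal sketch of the idiom, assuming OpenMP is
available:

    #include <stdio.h>
    #ifdef _OPENMP
    #include <omp.h>
    #endif

    int main(void)
    {
        int num_threads = 1;
    #ifdef _OPENMP
    #pragma omp parallel default(shared)
        #pragma omp master
        {
            /* only the master thread writes the team size */
            num_threads = omp_get_num_threads();
        }
    #endif
        printf("running with %d thread(s)\n", num_threads);
        return 0;
    }
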
@@ -556,10 +599,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     full_u_cols = SUPERLU_MALLOC(ncb * sizeof(int));
     blk_ldu = SUPERLU_MALLOC(ncb * sizeof(int));
 #endif
-    log_memory(2 * ncb * iword, stat);
-
 
-    /* insert a check condition here */
+    log_memory(2 * ncb * iword, stat);
 
 #if 0  /* Sherry: not used? */
     /* This bunch is used for static scheduling */
@@ -595,11 +636,12 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
     look_ahead_l = SUPERLU_MALLOC (nsupers * sizeof (int));
     look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int));
-    for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1;
+    for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1; /* vectorized */
     log_memory(3 * nsupers * iword, stat);
 
-    /* go through U-factor */
-    for (lb = 0; lb < nrb; ++lb) {
+    /* Sherry: omp parallel? 
+       not worth doing, due to concurrent write to look_ahead_l[jb] */
+    for (lb = 0; lb < nrb; ++lb) { /* go through U-factor */
         ib = lb * Pr + myrow;
         index = Llu->Ufstnz_br_ptr[lb];
         if (index) { /* Not an empty row */
@@ -613,7 +655,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
             }
         }
     }
-    if (myrow < nsupers % grid->nprow) {
+    if (myrow < nsupers % grid->nprow) { /* leftover block rows */
         ib = nrb * Pr + myrow;
         index = Llu->Ufstnz_br_ptr[nrb];
         if (index) {             /* Not an empty row */
@@ -629,8 +671,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     }
 
     if (options->SymPattern == NO) {
-        /* go through L-factor */
-        for (lb = 0; lb < ncb; lb++) {
+	/* Sherry: omp parallel?
+	   not worth doing, due to concurrent write to look_ahead_l[jb] */
+        for (lb = 0; lb < ncb; lb++) { /* go through L-factor */
             ib = lb * Pc + mycol;
             index = Llu->Lrowind_bc_ptr[lb];
             if (index) {
@@ -644,7 +687,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                 }
             }
         }
-        if (mycol < nsupers % grid->npcol) {
+        if (mycol < nsupers % grid->npcol) { /* leftover block columns */
             ib = ncb * Pc + mycol;
             index = Llu->Lrowind_bc_ptr[ncb];
             if (index) {
@@ -678,8 +721,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     /* Instead of half storage, we'll do full storage */
     if (!(Llu->ujrow = doubleCalloc_dist (k * k)))
         ABORT ("Malloc fails for ujrow[].");
-    log_memory(k * k * iword, stat);
 #endif
+    log_memory(k * k * iword, stat);
 
 #if ( PRNTlevel>=1 )
     if (!iam) {
@@ -690,6 +733,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
              (long int) Llu->bufmax[0], (long int) Llu->bufmax[1],
              (long int) Llu->bufmax[2], (long int) Llu->bufmax[3],
              (long int) Llu->bufmax[4]);
+        fflush(stdout);
     }
 #endif
    
@@ -704,26 +748,30 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     ldt = sp_ienv_dist (3);     /* Size of maximum supernode */
     k = CEILING (nsupers, Pr);  /* Number of local block rows */
 
-    /* Following circuit is for finding maximum block size */
+    /* Following code is for finding maximum row dimension of all L panels */
     int local_max_row_size = 0;
     int max_row_size;
 
-    for (int i = 0; i < nsupers; ++i) {
-        int tpc = PCOL (i, grid);
-        if (mycol == tpc) {
-            lk = LBj (i, grid);
-            lsub = Lrowind_bc_ptr[lk];
-            if (lsub != NULL) {
-                local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
-            }
-        }
+#if 0
+#if defined _OPENMP  // Sherry: parallel reduction -- seems slower?
+#pragma omp parallel for reduction(max :local_max_row_size) private(lk,lsub) 
+#endif
+#endif
+    for (int i = mycol; i < nsupers; i += Pc) { /* grab my local columns */
+        //int tpc = PCOL (i, grid);
+	lk = LBj (i, grid);
+	lsub = Lrowind_bc_ptr[lk];
+	if (lsub != NULL) {
+	    if (lsub[1] > local_max_row_size) local_max_row_size = lsub[1];
+	}
 
     }
 
-    /* Max row size is global reduction of within A row */
-    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, (grid->rscp.comm));
+    /* Max row size is global reduction within a row */
+    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX,
+                   (grid->rscp.comm));
 
-    /* Buffer size is max of look ahead window */
+    /* Buffer size is max of look-ahead window */
     /* int_t buffer_size =
          SUPERLU_MAX (max_row_size * num_threads * ldt,
                       get_max_buffer_size ());           */
@@ -758,15 +806,24 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 					  Glu_persist, grid, perm_u );
 #endif
 
+    /* +16 to avoid cache line false sharing */
+    int_t bigv_size = SUPERLU_MAX(max_row_size * (bigu_size / ldt),
+				  (ldt*ldt + CACHELINE / dword) * num_threads);
+
     /* bigU and bigV are either on CPU or on GPU, not both. */
     double* bigU; /* for storing entire U(k,:) panel, prepare for GEMM.
-                     bigU has the same size either on CPU or on CPU. */
-    double* bigV; /* for GEMM output matrix, i.e. update matrix. 
-                     On CPU, bigV is small for block-by-block update.
-	             On GPU, bigV is large to hold the aggregate GEMM output.*/
+                      bigU has the same size either on CPU or on GPU. */
+    double* bigV; /* for storing GEMM output matrix, i.e. update matrix. 
+	              bigV is large to hold the aggregate GEMM output.*/
 
 #if ( PRNTlevel>=1 )
-    if(!iam) printf("[%d] .. BIG U bigu_size " IFMT " (same either on CPU or GPU)\n", iam, bigu_size);
+    if(!iam) {
+	printf("max_nrows in L panel %d\n", max_row_size);
+	printf("\t.. GEMM buffer size: max_nrows X max_ncols = %d x %d\n",
+	       max_row_size, (bigu_size / ldt));
+	printf(".. BIG U size %d\t BIG V size %d\n", bigu_size, bigv_size);
+	fflush(stdout);
+    }
 #endif
 
 #ifdef GPU_ACC
@@ -774,7 +831,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     if ( checkCuda(cudaHostAlloc((void**)&bigU,  bigu_size * sizeof(double), cudaHostAllocDefault)) )
         ABORT("Malloc fails for dgemm buffer U ");
 
-    int bigv_size = buffer_size;
+    bigv_size = buffer_size;
 #if ( PRNTlevel>=1 )
     if (!iam) printf("[%d] .. BIG V bigv_size %d, using buffer_size %d (on GPU)\n", iam, bigv_size, buffer_size);
 #endif
@@ -830,18 +887,24 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3) 
 			  + bigu_size + buffer_size ) * dword;
 
-#else  /* not to use GPU */
+#else  /* not CUDA */
     
+    // for GEMM padding 0
+    j = bigu_size / ldt;
+    bigu_size += (gemm_k_pad * (j + ldt + gemm_n_pad));
+    bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad));
+
+#ifdef __INTEL_COMPILER
+    bigU = _mm_malloc(bigu_size * sizeof(double), 1<<12); // align at 4K page
+    bigV = _mm_malloc(bigv_size * sizeof(double), 1<<12);
+#else
     if ( !(bigU = doubleMalloc_dist(bigu_size)) )
-        ABORT ("Malloc fails for dgemm u buff U"); 
+        ABORT ("Malloc fails for dgemm U buffer"); 
           //Maximum size of bigU= sqrt(buffsize) ?
-
-    int bigv_size = 8 * ldt * ldt * num_threads;
-#if ( PRNTlevel>=1 )
-    if (!iam) printf("[%d] .. BIG V size (on CPU) %d\n", iam, bigv_size);
-#endif
+    // int bigv_size = 8 * ldt * ldt * num_threads;
     if ( !(bigV = doubleMalloc_dist(bigv_size)) )
-        ABORT ("Malloc failed for dgemm buffer V");
+        ABORT ("Malloc failed for dgemm V buffer");
+#endif
 
 #endif /* end ifdef GPU_ACC */
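
In the non-GPU branch above, bigu_size and bigv_size are grown by
gemm_k_pad/gemm_m_pad/gemm_n_pad (GEMM_PADLEN = 8 doubles) worth of slack so
the GEMM operand dimensions can later be padded to a multiple of the vector
length on KNL.  A minimal sketch of padding a dimension up to the next multiple
of the vector length (hypothetical helpers; the code above simply adds slack
rather than rounding):

    #include <stddef.h>

    #define GEMM_PADLEN 8   /* pad to multiples of 8 doubles (one 512-bit vector) */

    /* Round a matrix dimension up to the next multiple of GEMM_PADLEN so the
     * padded leading dimension keeps each column vector-aligned. */
    static inline size_t pad_dim(size_t n)
    {
        return (n + GEMM_PADLEN - 1) / GEMM_PADLEN * GEMM_PADLEN;
    }

    /* An m x k block stored with leading dimension pad_dim(m) then needs
     * pad_dim(m) * k doubles rather than m * k. */
    static size_t padded_block_size(size_t m, size_t k)
    {
        return pad_dim(m) * k;
    }
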
 
@@ -853,21 +916,27 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     if(!iam) {
 	printf ("  Max row size is %d \n", max_row_size);
         printf ("  Threads per process %d \n", num_threads);
-	/* printf ("  Using buffer_size of %d \n", buffer_size); */
+	fflush(stdout);
     }
+
 #endif
 
+#if 0 /* Sherry */
     if (!(tempv2d = doubleCalloc_dist (2 * ((size_t) ldt) * ldt)))
         ABORT ("Calloc fails for tempv2d[].");
     tempU2d = tempv2d + ldt * ldt;
-    if (!(indirect = SUPERLU_MALLOC (ldt * num_threads * sizeof(int))))
+#endif
+    /* Sherry: (ldt + 16), avoid cache line false sharing.
+       KNL cacheline size = 64 bytes = 16 int */
+    iinfo = ldt + CACHELINE / sizeof(int);
+    if (!(indirect = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int))))
         ABORT ("Malloc fails for indirect[].");
-    if (!(indirect2 = SUPERLU_MALLOC (ldt * num_threads * sizeof(int))))
+    if (!(indirect2 = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int))))
         ABORT ("Malloc fails for indirect[].");
     if (!(iuip = intMalloc_dist (k)))  ABORT ("Malloc fails for iuip[].");
     if (!(ruip = intMalloc_dist (k)))  ABORT ("Malloc fails for ruip[].");
 
-    log_memory(2 * ldt *ldt * dword + 2 * ldt * num_threads * iword
+    log_memory(2 * ldt*ldt * dword + 2 * iinfo * num_threads * iword
 	       + 2 * k * iword, stat);
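
The indirect[] and indirect2[] work arrays above are now strided per thread by
iinfo = ldt + CACHELINE/sizeof(int) instead of ldt, so slices owned by
different threads start on different cache lines and cannot falsely share
(note that CACHELINE is currently defined as 0 earlier in this file, so the
padding is compiled out unless that #if 0 is flipped).  A minimal sketch of
the padding idea, assuming a 64-byte cache line:

    #include <stdlib.h>

    #define CACHELINE_BYTES 64   /* e.g. Xeon Phi KNL */

    /* Give each thread its own slice of a shared int buffer, padded so slices
     * belonging to different threads never share a cache line.  Thread t then
     * works in buf[t*stride .. t*stride + per_thread - 1]. */
    static int *alloc_per_thread_ints(int per_thread, int num_threads,
                                      int *stride_out)
    {
        int stride = per_thread + CACHELINE_BYTES / (int) sizeof(int);
        *stride_out = stride;
        return malloc((size_t) stride * num_threads * sizeof(int));
    }
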
 
     int_t *lookAheadFullRow,*lookAheadStRow,*lookAhead_lptr,*lookAhead_ib,
@@ -897,13 +966,12 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #else
     Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb*sizeof(Remain_info_t));
 #endif
-    log_memory(4 * mrb * iword + mrb * sizeof(Remain_info_t), stat);
 
-    double *lookAhead_L_buff, *Remain_L_buff;
+    double *lookAhead_L_buff, *Remain_L_buff; /* Stores entire L-panel */
     Ublock_info_t *Ublock_info;
-    ldt = sp_ienv_dist (3);       /* max supernode size */
+    ldt = sp_ienv_dist (3); /* max supernode size */
+    /* The following is quite loose */
     lookAhead_L_buff = doubleMalloc_dist(ldt*ldt* (num_look_aheads+1) );
-    log_memory(ldt * ldt * (num_look_aheads+1) * dword, stat);
 
 #if 0
     Remain_L_buff = (double *) _mm_malloc( sizeof(double)*(Llu->bufmax[1]),64);
@@ -912,13 +980,18 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64);
     int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64);
 #else
-    Remain_L_buff = doubleMalloc_dist(Llu->bufmax[1]);
+    j = gemm_m_pad * (ldt + max_row_size + gemm_k_pad);
+    Remain_L_buff = doubleMalloc_dist(Llu->bufmax[1] + j); /* This is loose */
     Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb*sizeof(Ublock_info_t));
     int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
     int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
     int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
 #endif
-    log_memory(Llu->bufmax[1] * dword, stat);
+
+    long long alloc_mem = 4 * mrb * iword + mrb * sizeof(Remain_info_t)
+                        + ldt * ldt * (num_look_aheads+1) * dword
+ 			+ Llu->bufmax[1] * dword ;
+    log_memory(alloc_mem, stat);
 
     InitTimer = SuperLU_timer_() - tt1;
 
@@ -928,7 +1001,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
        ** Handle first block column separately to start the pipeline. **
        ################################################################## */
     look_id = 0;
-    msgcnt = msgcnts[0]; /* First count in the window */
+    msgcnt = msgcnts[0]; /* Lsub[0] to be transferred */
     send_req = send_reqs[0];
     recv_req = recv_reqs[0];
 
@@ -952,7 +1025,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         lsub = Lrowind_bc_ptr[lk];
         lusup = Lnzval_bc_ptr[lk];
         if (lsub) {
+	    /* number of entries in Lsub_buf[] to be transferred */
             msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+	    /* number of entries in Lval_buf[] to be transferred */
             msgcnt[1] = lsub[1] * SuperSize (k);
         } else {
             msgcnt[0] = msgcnt[1] = 0;
@@ -964,9 +1039,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                 TIC (t1);
 #endif
 
-                MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj, SLU_MPI_TAG (0, 0) /* 0 */ ,
+                MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj,
+                           SLU_MPI_TAG (0, 0) /* 0 */,
                            scp->comm, &send_req[pj]);
-                MPI_Isend (lusup, msgcnt[1], MPI_DOUBLE, pj, SLU_MPI_TAG (1, 0) /* 1 */ ,
+                MPI_Isend (lusup, msgcnt[1], MPI_DOUBLE, pj,
+                           SLU_MPI_TAG (1, 0) /* 1 */,
                            scp->comm, &send_req[pj + Pc]);
 #if ( DEBUGlevel>=2 )
                 printf ("[%d] first block column Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
@@ -976,6 +1053,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #if ( PROFlevel>=1 )
                 TOC (t2, t1);
                 stat->utime[COMM] += t2;
+                stat->utime[COMM_RIGHT] += t2;
+		++prof_sendR[lk];
                 msg_cnt += 2;
                 msg_vol += msgcnt[0] * iword + msgcnt[1] * dword;
 #endif
@@ -984,12 +1063,20 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     } else {  /* Post immediate receives. */
         if (ToRecv[k] >= 1) {   /* Recv block column L(:,0). */
             scp = &grid->rscp;  /* The scope of process row. */
+#if ( PROFlevel>=1 )
+	    TIC (t1);
+#endif
             MPI_Irecv (Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, kcol,
                        SLU_MPI_TAG (0, 0) /* 0 */ ,
                        scp->comm, &recv_req[0]);
             MPI_Irecv (Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, kcol,
                        SLU_MPI_TAG (1, 0) /* 1 */ ,
                        scp->comm, &recv_req[1]);
+#if ( PROFlevel>=1 )
+	    TOC (t2, t1);
+	    stat->utime[COMM] += t2;
+	    stat->utime[COMM_RIGHT] += t2;
+#endif
         }
     } /* end if mycol == 0 */
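
Several hunks in this file now bracket the look-ahead MPI_Isend/MPI_Irecv calls
with TIC/TOC timers and charge the elapsed time to the COMM plus COMM_RIGHT or
COMM_DOWN counters.  A minimal sketch of that pattern, using MPI_Wtime() in
place of the TIC/TOC macros (hypothetical helper; assumes MPI has already been
initialized and comm_time stands in for the stat structure):

    #include <mpi.h>

    /* Post a non-blocking receive and accumulate the posting cost, mirroring
     * the TIC/TOC + stat->utime[COMM_RIGHT] pattern above. */
    static void post_timed_irecv(int *buf, int count, int src, int tag,
                                 MPI_Comm comm, MPI_Request *req,
                                 double *comm_time)
    {
        double t1 = MPI_Wtime();
        MPI_Irecv(buf, count, MPI_INT, src, tag, comm, req);
        *comm_time += MPI_Wtime() - t1;   /* only the posting cost is counted */
    }
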
 
@@ -1001,12 +1088,20 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
             scp = &grid->cscp;  /* The scope of process column. */
             Usub_buf = Llu->Usub_buf_2[0];
             Uval_buf = Llu->Uval_buf_2[0];
+#if ( PROFlevel>=1 )
+	    TIC (t1);
+#endif
             MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
                        SLU_MPI_TAG (2, 0) /* 2%tag_ub */ ,
                        scp->comm, &recv_reqs_u[0][0]);
             MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow,
                        SLU_MPI_TAG (3, 0) /* 3%tag_ub */ ,
                        scp->comm, &recv_reqs_u[0][1]);
+#if ( PROFlevel>=1 )
+	    TOC (t2, t1);
+	    stat->utime[COMM] += t2;
+	    stat->utime[COMM_DOWN] += t2;
+#endif
         }
     }
 
@@ -1034,7 +1129,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
             kk = perm_c_supno[kk0]; /* use the ordering from static schedule */
             look_id = kk0 % (1 + num_look_aheads); /* which column in window */
 
-            if (look_ahead[kk] < k0) { /* does not depend on current column */
+            if (look_ahead[kk] < k0) { /* does not depend on current column k */
                 kcol = PCOL (kk, grid);
                 if (mycol == kcol) { /* I own this panel */
 
@@ -1053,7 +1148,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                     msgcnt = msgcnts[look_id];  /* point to the proper count array */
                     send_req = send_reqs[look_id];
 
-                    lk = LBj (kk, grid);    /* Local block number in L */
+                    lk = LBj (kk, grid);    /* Local block number in L. */
                     lsub1 = Lrowind_bc_ptr[lk];
                     if (lsub1) {
                         msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR; /* size of metadata */
@@ -1066,12 +1161,21 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                     for (pj = 0; pj < Pc; ++pj) {
                         if (ToSendR[lk][pj] != EMPTY) {
                             lusup1 = Lnzval_bc_ptr[lk];
+#if ( PROFlevel>=1 )
+			    TIC (t1);
+#endif
                             MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
                                        SLU_MPI_TAG (0, kk0),  /* (4*kk0)%tag_ub */
                                        scp->comm, &send_req[pj]);
                             MPI_Isend (lusup1, msgcnt[1], MPI_DOUBLE, pj,
                                        SLU_MPI_TAG (1, kk0),  /* (4*kk0+1)%tag_ub */
                                        scp->comm, &send_req[pj + Pc]);
+#if ( PROFlevel>=1 )
+			    TOC (t2, t1);
+			    stat->utime[COMM] += t2;
+			    stat->utime[COMM_RIGHT] += t2;
+			    ++prof_sendR[lk];
+#endif
 #if ( DEBUGlevel>=2 )
 			    printf ("[%d] -1- Send L(:,%4d): #lsub1 %4d, #lusup1 %4d right to Pj %2d\n",
 				    iam, kk, msgcnt[0], msgcnt[1], pj);
@@ -1084,7 +1188,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                     if (ToRecv[kk] >= 1) {
                         scp = &grid->rscp;  /* The scope of process row. */
                         recv_req = recv_reqs[look_id];
-
+#if ( PROFlevel>=1 )
+			TIC (t1);
+#endif
                         MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0],
                                    mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
                                    scp->comm, &recv_req[0]);
@@ -1092,29 +1198,41 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                                    MPI_DOUBLE, kcol,
                                    SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
                                    scp->comm, &recv_req[1]);
+#if ( PROFlevel>=1 )
+			TOC (t2, t1);
+			stat->utime[COMM] += t2;
+			stat->utime[COMM_RIGHT] += t2;
+#endif
                     }
                     /* stat->time10 += SuperLU_timer_() - ttt1; */
                 }  /* end if mycol == Pc(kk) */
-            }  /* end if look-ahead in L supernodes */
+            }  /* end if look-ahead in L panels */
 
-            /* post irecv for U-row look-ahead */
+            /* Pre-post irecv for U-row look-ahead */
             krow = PROW (kk, grid);
             if (myrow != krow) {
                 if (ToRecv[kk] == 2) { /* post iRecv block row U(kk,:). */
                     scp = &grid->cscp;  /* The scope of process column. */
                     Usub_buf = Llu->Usub_buf_2[look_id];
                     Uval_buf = Llu->Uval_buf_2[look_id];
-
+#if ( PROFlevel>=1 )
+		    TIC (t1);
+#endif
                     MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
                                SLU_MPI_TAG (2, kk0) /* (4*kk0+2)%tag_ub */ ,
                                scp->comm, &recv_reqs_u[look_id][0]);
                     MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow,
                                SLU_MPI_TAG (3, kk0) /* (4*kk0+3)%tag_ub */ ,
                                scp->comm, &recv_reqs_u[look_id][1]);
+#if ( PROFlevel>=1 )
+		    TOC (t2, t1);
+		    stat->utime[COMM] += t2;
+		    stat->utime[COMM_DOWN] += t2;
+#endif
                 }
             }
 
-        }  /* end for each column in look-ahead window for L supernodes */
+        }  /* end for each column in look-ahead window for L panels */
 
         /* stat->time4 += SuperLU_timer_()-tt1; */
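    The look_id arithmetic used throughout this window, look_id = kk0 % (1 + num_look_aheads),
    simply cycles panel kk0 through a small ring of message buffers and request arrays. A toy
    illustration (values chosen arbitrarily):

    /* With num_look_aheads = 3 there are 4 slots; panels map to slots
     * 0,1,2,3,0,1,2,3,... so a slot is reused only after its previous
     * occupant has fallen out of the look-ahead window. */
    int num_look_aheads = 3;
    for (int kk0 = 0; kk0 < 8; ++kk0) {
        int look_id = kk0 % (1 + num_look_aheads);
        /* msgcnts[look_id], send_reqs[look_id], recv_reqs[look_id],
           Lsub_buf_2[look_id], Lval_buf_2[look_id] all belong to this slot */
        (void) look_id;
    }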
 
@@ -1126,6 +1244,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         for (kk0 = kk1; kk0 < kk2; kk0++) {
             kk = perm_c_supno[kk0]; /* order determined from static schedule */  
             if (factoredU[kk0] != 1 && look_ahead[kk] < k0) {
+		/* does not depend on current column k */
                 kcol = PCOL (kk, grid);
                 krow = PROW (kk, grid);
                 lk = LBj (kk, grid);  /* Local block number across row. NOT USED?? -- Sherry */
@@ -1146,6 +1265,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                 } else { /* Check to receive L(:,kk) from the left */
                     flag0 = flag1 = 0;
                     if ( ToRecv[kk] >= 1 ) {
+#if ( PROFlevel>=1 )
+			TIC (t1);
+#endif
                         if ( recv_req[0] != MPI_REQUEST_NULL ) {
                             MPI_Test (&recv_req[0], &flag0, &status);
                             if ( flag0 ) {
@@ -1161,7 +1283,14 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                                 recv_req[1] = MPI_REQUEST_NULL;
                             }
                         } else flag1 = 1;
-                    } else msgcnt[0] = 0;
+#if ( PROFlevel>=1 )
+			TOC (t2, t1);
+			stat->utime[COMM] += t2;
+			stat->utime[COMM_RIGHT] += t2;
+#endif
+                    } else {
+                        msgcnt[0] = 0;
+                    }
                 }
 
                 if (flag0 && flag1) { /* L(:,kk) is ready */
@@ -1171,10 +1300,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                         factoredU[kk0] = 1;
                         /* Parallel triangular solve across process row *krow* --
                            U(k,j) = L(k,k) \ A(k,j).  */
-                        /* double ttt2 = SuperLU_timer_(); */
                         double ttt2 = SuperLU_timer_();
 #ifdef _OPENMP
-#pragma omp parallel
+/* #pragma omp parallel */ /* Sherry -- parallel done inside pdgstrs2 */
 #endif
 			{
                             PDGSTRS2 (kk0, kk, Glu_persist, grid, Llu,
@@ -1226,7 +1354,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                         /* stat->time2 += SuperLU_timer_()-tt1; */
 
                     } /* end if myrow == krow */
-                } /* end if flag0 ... */
+                } /* end if flag0 & flag1 ... */
             } /* end if factoredU[] ... */
         } /* end for kk0 ... */
 
@@ -1248,13 +1376,21 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         if (mycol == kcol) {
             lk = LBj (k, grid); /* Local block number in L */
 
+#if ( PROFlevel>=1 )
+	    TIC(t1);
+#endif
             for (pj = 0; pj < Pc; ++pj) {
-                /* Wait for Isend to complete before using lsub/lusup buffer */
+                /* Wait for Isend to complete before using lsub/lusup buffer. */
                 if (ToSendR[lk][pj] != EMPTY) {
                     MPI_Wait (&send_req[pj], &status);
                     MPI_Wait (&send_req[pj + Pc], &status);
                 }
             }
+#if ( PROFlevel>=1 )
+	    TOC(t2, t1);
+	    stat->utime[COMM] += t2;
+	    stat->utime[COMM_RIGHT] += t2;
+#endif
             lsub = Lrowind_bc_ptr[lk];
             lusup = Lnzval_bc_ptr[lk];
         } else {
@@ -1265,8 +1401,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                 /* ============================================= *
                  * Waiting for L(:,kk) for outer-product update  *
                  * if iam in U(kk,:), then the diagonal block    *
-		 * did not reach in time for panel factorization *
-		 * of U(k,:)           	                         *
+                 * did not reach in time for panel factorization *
+                 * of U(k,:).          	                         *
                  * ============================================= */
 #if ( PROFlevel>=1 )
                 TIC (t1);
@@ -1298,6 +1434,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #if ( PROFlevel>=1 )
                 TOC (t2, t1);
                 stat->utime[COMM] += t2;
+                stat->utime[COMM_RIGHT] += t2;
 #endif
 #if ( DEBUGlevel>=2 )
                 printf("[%d] Recv L(:,%4d): #lsub %4d, #lusup %4d from Pc %2d\n",
@@ -1315,7 +1452,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
             lsub = Lsub_buf_2[look_id];
             lusup = Lval_buf_2[look_id];
-        }                       /* if mycol = Pc(k) */
+        }  /* else if mycol = Pc(k) */
         /* stat->time1 += SuperLU_timer_()-tt1; */
 
         scp = &grid->cscp;      /* The scope of process column. */
@@ -1331,7 +1468,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                    U(k,j) = L(k,k) \ A(k,j).  */
                  double ttt2 = SuperLU_timer_(); 
 #ifdef _OPENMP
-#pragma omp parallel
+/* #pragma omp parallel */ /* Sherry -- parallel done inside pdgstrs2 */
 #endif
                 {
                     PDGSTRS2 (k0, k, Glu_persist, grid, Llu, stat);
@@ -1350,7 +1487,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
                 if (ToSendD[lk] == YES) {
                     for (pi = 0; pi < Pr; ++pi) {
-                        if (pi != myrow) {
+                        if (pi != myrow) { /* Matching recv was pre-posted before */
 #if ( PROFlevel>=1 )
                             TIC (t1);
 #endif
@@ -1363,6 +1500,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #if ( PROFlevel>=1 )
                             TOC (t2, t1);
                             stat->utime[COMM] += t2;
+                            stat->utime[COMM_DOWN] += t2;
                             msg_cnt += 2;
                             msg_vol += msgcnt[2] * iword + msgcnt[3] * dword;
 #endif
@@ -1373,20 +1511,28 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                     } /* for pi ... */
                 } /* if ToSendD ... */
 
-            } else { /* Panel U(k,:) already factorized */
+            } else { /* Panel U(k,:) already factorized from previous look-ahead */
 
                /* ================================================ *
-                 * Wait for downward sending of U(k,:) to complete *
-		 * for outer-product update                        *
-                 * =============================================== */
+                * Wait for downward sending of U(k,:) to complete  *
+		* for outer-product update.                        *
+                * ================================================ */
 
                 if (ToSendD[lk] == YES) {
+#if ( PROFlevel>=1 )
+		    TIC (t1);
+#endif
                     for (pi = 0; pi < Pr; ++pi) {
                         if (pi != myrow) {
                             MPI_Wait (&send_reqs_u[look_id][pi], &status);
                             MPI_Wait (&send_reqs_u[look_id][pi + Pr], &status);
                         }
                     }
+#if ( PROFlevel>=1 )
+		    TOC (t2, t1);
+		    stat->utime[COMM] += t2;
+		    stat->utime[COMM_DOWN] += t2;
+#endif
                 }
                 msgcnt[2] = msgcntsU[look_id][2];
                 msgcnt[3] = msgcntsU[look_id][3];
@@ -1395,9 +1541,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
         } else {    /* myrow != krow */
 
-            /* ========================================= *
-             * wait for U(k,:) for outer-product updates *
-             * ========================================= */
+            /* ========================================== *
+             * Wait for U(k,:) for outer-product updates. *
+             * ========================================== */
 
             if (ToRecv[k] == 2) { /* Recv block row U(k,:). */
 #if ( PROFlevel>=1 )
@@ -1411,6 +1557,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #if ( PROFlevel>=1 )
                 TOC (t2, t1);
                 stat->utime[COMM] += t2;
+                stat->utime[COMM_DOWN] += t2;
 #endif
                 usub = Usub_buf;
                 uval = Uval_buf;
@@ -1484,8 +1631,12 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
             j = jj0 = 0;
 
 /************************************************************************/
+#if 0
+	for (jj = 0; jj < nub; ++jj) assert(perm_u[jj] == jj); /* Sherry */
+#endif
             double ttx =SuperLU_timer_();
 
+//#include "dlook_ahead_update_v4.c"
 #include "dlook_ahead_update.c"
 
             lookaheadupdatetimer += SuperLU_timer_() - ttx;
@@ -1512,6 +1663,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
                         look_id = kk0 % (1 + num_look_aheads);
                         recv_req = recv_reqs[look_id];
+#if ( PROFlevel>=1 )
+			TIC (t1);
+#endif
                         MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0],
                                    mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
                                    scp->comm, &recv_req[0]);
@@ -1519,6 +1673,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                                    MPI_DOUBLE, kcol,
                                    SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
                                    scp->comm, &recv_req[1]);
+#if ( PROFlevel>=1 )
+			TOC (t2, t1);
+			stat->utime[COMM] += t2;
+			stat->utime[COMM_RIGHT] += t2;
+#endif
                     }
                 } else {
                     lk = LBj (kk, grid);    /* Local block number. */
@@ -1551,15 +1710,24 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                         scp = &grid->rscp;  /* The scope of process row. */
                         for (pj = 0; pj < Pc; ++pj) {
                             if (ToSendR[lk][pj] != EMPTY) {
+#if ( PROFlevel>=1 )
+			       TIC (t1);
+#endif
                                 MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
                                            SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
                                            scp->comm, &send_req[pj]);
                                 MPI_Isend (lusup1, msgcnt[1], MPI_DOUBLE, pj,
                                            SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
                                            scp->comm, &send_req[pj + Pc]);
+#if ( PROFlevel>=1 )
+				TOC (t2, t1);
+				stat->utime[COMM] += t2;
+				stat->utime[COMM_RIGHT] += t2;
+				++prof_sendR[lk];
+#endif
                             }
-                        }
-                    }           /* for pj ... */
+                        } /* end for pj ... */
+                    } /* end if factored[kk] ... */
                 }
             }
         }
@@ -1575,6 +1743,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #else 
 
 /*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/
+//#include "dSchCompUdt-2Ddynamic_v6.c"
+
 #include "dSchCompUdt-2Ddynamic.c"
 
 #endif 
@@ -1584,7 +1754,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         
         NetSchurUpTimer += SuperLU_timer_() - tsch;
 
-    }  /* for k0 = 0, ... */
+    }  /* MAIN LOOP for k0 = 0, ... */
 
     /* ##################################################################
        ** END MAIN LOOP: for k0 = ...
@@ -1592,12 +1762,20 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     
     pxgstrfTimer = SuperLU_timer_() - pxgstrfTimer;
 
-    /* updating total flops */
 #if ( PRNTlevel>=1 )
+    /* Print detailed statistics */
+    /* Updating total flops */
+    double allflops;
+    MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM,
+	       0, grid->comm);
     if ( iam==0 ) {
 	printf("\nInitialization time\t%8.2lf seconds\n"
 	       "\t Serial: compute static schedule, allocate storage\n", InitTimer);
-        printf("\n---- Time breakdown in factorization ----\n");
+        printf("\n==== Time breakdown in factorization (rank 0) ====\n");
+	printf("Panel factorization \t %8.2lf seconds\n",
+	       pdgstrf2_timer + pdgstrs2_timer);
+	printf(".. L-panel pxgstrf2 \t %8.2lf seconds\n", pdgstrf2_timer);
+	printf(".. U-panel pxgstrs2 \t %8.2lf seconds\n", pdgstrs2_timer);
 	printf("Time in Look-ahead update \t %8.2lf seconds\n", lookaheadupdatetimer);
         printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer);
         printf(".. Time to Gather L buffer\t %8.2lf  (Separate L panel by Lookahead/Remain)\n", GatherLTimer);
@@ -1606,21 +1784,20 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         printf(".. Time in GEMM %8.2lf \n",
 	       LookAheadGEMMTimer + RemainGEMMTimer);
         printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer);
-        printf("\t* Remain\t %8.2lf \n", RemainGEMMTimer);
-
+        printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n", 
+	       RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9);
         printf(".. Time to Scatter %8.2lf \n", 
 	       LookAheadScatterTimer + RemainScatterTimer);
         printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer);
         printf("\t* Remain\t %8.2lf \n", RemainScatterTimer);
 
-        printf("Total Time in Factorization            \t: %8.2lf seconds, \n", pxgstrfTimer);
-        printf("Total time in Schur update with offload\t  %8.2lf seconds,\n",CPUOffloadTimer );
+        printf("Total factorization time            \t: %8.2lf seconds, \n", pxgstrfTimer);
         printf("--------\n");
 	printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n);
     }
 #endif
     
-#if ( DEBUGlevel>=2 )
+#if ( DEBUGlevel>=3 )
     for (i = 0; i < Pr * Pc; ++i) {
         if (iam == i) {
             dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
@@ -1632,8 +1809,6 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     }
 #endif
 
-    // printf("Debug : MPI buffers 1\n");
-
     /********************************************************
      * Free memory                                          *
      ********************************************************/
@@ -1673,7 +1848,6 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     SUPERLU_FREE (factored);
     log_memory(-(6 * nsupers * iword), stat);
 
-
     for (i = 0; i <= num_look_aheads; i++) {
         SUPERLU_FREE (msgcnts[i]);
         SUPERLU_FREE (msgcntsU[i]);
@@ -1693,8 +1867,6 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     SUPERLU_FREE (recv_reqs);
     SUPERLU_FREE (send_reqs);
 
-    // printf("Debug : MPI buffers 3\n");
-
 #ifdef GPU_ACC
     checkCuda (cudaFreeHost (bigV));
     checkCuda (cudaFreeHost (bigU));
@@ -1705,15 +1877,19 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     SUPERLU_FREE( streams );
     SUPERLU_FREE( stream_end_col );
 #else
+  #ifdef __INTEL_COMPILER
+    _mm_free (bigU);
+    _mm_free (bigV);
+  #else
     SUPERLU_FREE (bigV);
     SUPERLU_FREE (bigU);
-#endif
-
+  #endif
+    /* Decrement freed memory from memory stat. */
     log_memory(-(bigv_size + bigu_size) * dword, stat);
-    // printf("Debug : MPI buffers 5\n");
+#endif
 
     SUPERLU_FREE (Llu->ujrow);
-    SUPERLU_FREE (tempv2d);
+    // SUPERLU_FREE (tempv2d);/* Sherry */
     SUPERLU_FREE (indirect);
     SUPERLU_FREE (indirect2); /* Sherry added */
     SUPERLU_FREE (iuip);
@@ -1727,7 +1903,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     SUPERLU_FREE(omp_loop_time);
     SUPERLU_FREE(full_u_cols);
     SUPERLU_FREE(blk_ldu);
+#if ( PRNTlevel>=1 )
     log_memory(-2 * ncb * dword, stat);
+#endif
 
     SUPERLU_FREE(lookAheadFullRow);
     SUPERLU_FREE(lookAheadStRow);
@@ -1761,8 +1939,6 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     if ( iinfo == n + 1 ) *info = 0;
     else *info = iinfo;
 
-    // printf("test out\n");
-
 #if ( PROFlevel>=1 )
     TOC (t2, t1);
     stat->utime[COMM] += t2;
@@ -1777,13 +1953,29 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                     1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
         MPI_Reduce (&msg_vol, &msg_vol_max,
                     1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
-        if (!iam) {
+        if ( iam==0 ) {
             printf ("\tPDGSTRF comm stat:"
                     "\tAvg\tMax\t\tAvg\tMax\n"
                     "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
                     msg_cnt_sum / Pr / Pc, msg_cnt_max,
                     msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6);
+	    printf("\t\tcomm time on task 0: %8.2lf\n"
+		   "\t\t\tcomm down DIAG block %8.2lf\n"
+		   "\t\t\tcomm right L panel %8.2lf\n"
+		   "\t\t\tcomm down U panel %8.2lf\n",
+		   stat->utime[COMM], stat->utime[COMM_DIAG],
+		   stat->utime[COMM_RIGHT], stat->utime[COMM_DOWN]);
+	    //#include <float.h>
+	    //int Digs = DECIMAL_DIG;
+	    printf("gemm_count %d\n", gemm_count);
+	    for (i = 0; i < gemm_count; ++i)
+		fprintf(fgemm, "%8d%8d%8d\t %20.16e\t%8d\n", gemm_stats[i].m, gemm_stats[i].n,
+			gemm_stats[i].k, gemm_stats[i].microseconds, prof_sendR[i]);
+	    
+	    fclose(fgemm);
         }
+	SUPERLU_FREE(gemm_stats);
+	SUPERLU_FREE(prof_sendR);
     }
 #endif
 
@@ -1796,7 +1988,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         printf (".. # total msg\t%d\n", iinfo);
 #endif
 
-#if ( DEBUGlevel>=2 )
+#if ( DEBUGlevel>=3 )
     for (i = 0; i < Pr * Pc; ++i) {
         if (iam == i) {
             dPrintLblocks (iam, nsupers, grid, Glu_persist, Llu);
diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c
index 06f0f37..bdff2eb 100644
--- a/SRC/pdgstrf2.c
+++ b/SRC/pdgstrf2.c
@@ -14,10 +14,13 @@ at the top-level directory.
  * \brief Performs panel LU factorization.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * August 15, 2014
  *
+ * Modified:
+ *   September 30, 2017
+ *
  * <pre>
  * Purpose
  * =======
@@ -97,6 +100,7 @@ pdgstrf2_trsm
     int_t Pr;
     MPI_Status status;
     MPI_Comm comm = (grid->cscp).comm;
+    double t1, t2;
 
     /* Initialization. */
     iam = grid->iam;
@@ -128,16 +132,25 @@ pdgstrf2_trsm
     if ( U_diag_blk_send_req && 
 	 U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) {
         /* There are pending sends - wait for all Isend to complete */
-        for (pr = 0; pr < Pr; ++pr)
+#if ( PROFlevel>=1 )
+	TIC (t1);
+#endif
+        for (pr = 0; pr < Pr; ++pr) {
             if (pr != myrow) {
                 MPI_Wait (U_diag_blk_send_req + pr, &status);
             }
-
+	}
+#if ( PROFlevel>=1 )
+	TOC (t2, t1);
+	stat->utime[COMM] += t2;
+	stat->utime[COMM_DIAG] += t2;
+#endif
 	/* flag no more outstanding send request. */
 	U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL;
     }
 
     if (iam == pkk) {            /* diagonal process */
+	/* ++++ First step compute diagonal block ++++++++++ */
         for (j = 0; j < jlst - jfst; ++j) {  /* for each column in panel */
             /* Diagonal pivot */
             i = luptr;
@@ -196,13 +209,16 @@ pdgstrf2_trsm
 
         }                       /* for column j ...  first loop */
 
-	/* ++++++++++second step ====== */
+	/* ++++ Second step compute off-diagonal block with communication  ++*/
 
         ublk_ptr = ujrow = Llu->ujrow;
 
-        if (U_diag_blk_send_req && iam == pkk)  { /* Send the U block */
+        if (U_diag_blk_send_req && iam == pkk)  { /* Send the U block downward */
             /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
-            for (pr = 0; pr < Pr; ++pr)
+#if ( PROFlevel>=1 )
+	    TIC (t1);
+#endif
+            for (pr = 0; pr < Pr; ++pr) {
                 if (pr != krow) {
                     /* tag = ((k0<<2)+2) % tag_ub;        */
                     /* tag = (4*(nsupers+k0)+2) % tag_ub; */
@@ -211,6 +227,12 @@ pdgstrf2_trsm
                                comm, U_diag_blk_send_req + pr);
 
                 }
+            }
+#if ( PROFlevel>=1 )
+	    TOC (t2, t1);
+	    stat->utime[COMM] += t2;
+	    stat->utime[COMM_DIAG] += t2;
+#endif
 
 	    /* flag outstanding Isend */
             U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* Sherry */
@@ -218,8 +240,6 @@ pdgstrf2_trsm
 
         /* pragma below would be changed by an MKL call */
 
-        char uplo = 'u', side = 'r', transa = 'n', diag = 'n';
-
         l = nsupr - nsupc;
         // n = nsupc;
         double alpha = 1.0;
@@ -229,32 +249,36 @@ pdgstrf2_trsm
 #endif
 
 #if defined (USE_VENDOR_BLAS)
-        dtrsm_ (&side, &uplo, &transa, &diag,
-                &l, &nsupc,
+        dtrsm_ ("R", "U", "N", "N", &l, &nsupc,
                 &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr,
 		1, 1, 1, 1);
 #else
-        dtrsm_ (&side, &uplo, &transa, &diag,
-                &l, &nsupc,
+        dtrsm_ ("R", "U", "N", "N", &l, &nsupc,
                 &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr);
 #endif
-
+	stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * l;
     } else {  /* non-diagonal process */
-        /* ================================================ *
-         * Receive the diagonal block of U                  *
-         * for panel factorization of L(:,k)                *
-         * note: we block for panel factorization of L(:,k) *
-         * but panel factorization of U(:,k) don't          *
-         * ================================================ */
+        /* ================================================================== *
+         * Receive the diagonal block of U for panel factorization of L(:,k). *
+         * Note: we block for panel factorization of L(:,k), but panel        *
+         * factorization of U(:,k) does not block.                            *
+         * ================================================================== */
 
         /* tag = ((k0<<2)+2) % tag_ub;        */
         /* tag = (4*(nsupers+k0)+2) % tag_ub; */
         // printf("hello message receiving%d %d\n",(nsupc*(nsupc+1))>>1,SLU_MPI_TAG(4,k0));
+#if ( PROFlevel>=1 )
+	TIC (t1);
+#endif
         MPI_Recv (ublk_ptr, (nsupc * nsupc), MPI_DOUBLE, krow,
                   SLU_MPI_TAG (4, k0) /* tag */ ,
                   comm, &status);
+#if ( PROFlevel>=1 )
+	TOC (t2, t1);
+	stat->utime[COMM] += t2;
+	stat->utime[COMM_DIAG] += t2;
+#endif
         if (nsupr > 0) {
-            char uplo = 'u', side = 'r', transa = 'n', diag = 'n';
             double alpha = 1.0;
 
 #ifdef PI_DEBUG
@@ -263,17 +287,16 @@ pdgstrf2_trsm
                 printf (" Rank :%d \t Empty block column occured :\n", iam);
 #endif
 #if defined (USE_VENDOR_BLAS)
-            dtrsm_ (&side, &uplo, &transa, &diag,
-                    &nsupr, &nsupc,
+            dtrsm_ ("R", "U", "N", "N", &nsupr, &nsupc,
                     &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1);
 #else
-            dtrsm_ (&side, &uplo, &transa, &diag,
-                    &nsupr, &nsupc,
+            dtrsm_ ("R", "U", "N", "N", &nsupr, &nsupc,
                     &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr);
 #endif
+	    stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * nsupr;
         }
 
-    }                           /* end if pkk ... */
+    } /* end if pkk ... */
 
     /* printf("exiting pdgstrf2 %d \n", grid->iam);  */
 
@@ -300,12 +323,10 @@ void pdgstrs2_omp
     int_t *usub;
     double *lusup, *uval;
 
-#ifdef _OPENMP
-    int thread_id = omp_get_thread_num ();
-    int num_thread = omp_get_num_threads ();
-#else
-    int thread_id = 0;
-    int num_thread = 1;
+#if 0
+    //#ifdef USE_VTUNE
+    __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+    __itt_resume(); // start VTune, again use 2 underscores
 #endif
 
     /* Quick return. */
@@ -315,15 +336,12 @@ void pdgstrs2_omp
     /* Initialization. */
     iam = grid->iam;
     pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
-    int k_row_cycle = k / grid->nprow;  /* for which cycle k exist (to assign rowwise thread blocking) */
-    int gb_col_cycle;  /* cycle through block columns  */
+    //int k_row_cycle = k / grid->nprow;  /* for which cycle k exist (to assign rowwise thread blocking) */
+    //int gb_col_cycle;  /* cycle through block columns  */
     klst = FstBlockC (k + 1);
     knsupc = SuperSize (k);
     usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
     uval = Llu->Unzval_br_ptr[lk];
-    nb = usub[0];
-    iukp = BR_HEADER;
-    rukp = 0;
     if (iam == pkk) {
         lk = LBj (k, grid);
         nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
@@ -333,28 +351,45 @@ void pdgstrs2_omp
         lusup = Llu->Lval_buf_2[k0 % (1 + stat->num_look_aheads)];
     }
 
-    /* Loop through all the row blocks. */
-    for (b = 0; b < nb; ++b)  {
-        /* assuming column cyclic distribution of data among threads */
-        gb = usub[iukp];
-        gb_col_cycle = gb / grid->npcol;
-        nsupc = SuperSize (gb);
-        iukp += UB_DESCRIPTOR;
+    /////////////////////new-test//////////////////////////
+    /* !! Taken from Carl/SuperLU_DIST_5.1.0/EXAMPLE/pdgstrf2_v3.c !! */
+
+    /* Master thread: set up pointers to each block in the row */
+    nb = usub[0];
+    iukp = BR_HEADER;
+    rukp = 0;
+    
+    int* blocks_index_pointers = SUPERLU_MALLOC (3 * nb * sizeof(int));
+    int* blocks_value_pointers = blocks_index_pointers + nb;
+    int* nsupc_temp = blocks_value_pointers + nb;
+    for (b = 0; b < nb; b++) { /* set up pointers to each block */
+	blocks_index_pointers[b] = iukp + UB_DESCRIPTOR;
+	blocks_value_pointers[b] = rukp;
+	gb = usub[iukp];
+	rukp += usub[iukp+1];
+	nsupc = SuperSize( gb );
+	nsupc_temp[b] = nsupc;
+	iukp += (UB_DESCRIPTOR + nsupc);  /* move to the next block */
+    }
+
+    // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c
+    // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for
+#pragma omp parallel for schedule(static) default(shared) \
+    private(b,j,iukp,rukp,segsize)
+    /* Loop through all the blocks in the row. */
+    for (b = 0; b < nb; ++b) {
+	iukp = blocks_index_pointers[b];
+	rukp = blocks_value_pointers[b];
 
         /* Loop through all the segments in the block. */
-        for (j = 0; j < nsupc; ++j) {
-#ifdef PI_DEBUG
-            printf("segsize %d klst %d usub[%d] : %d",segsize,klst ,iukp,usub[iukp]);
-#endif 
+        for (j = 0; j < nsupc_temp[b]; j++) {
             segsize = klst - usub[iukp++];
-            if (segsize) {    /* Nonzero segment. */
-                luptr = (knsupc - segsize) * (nsupr + 1);
+	    if (segsize) {
+#pragma omp task default(shared) firstprivate(segsize,rukp) if (segsize > 30)
+		{ /* Nonzero segment. */
+		    int_t luptr = (knsupc - segsize) * (nsupr + 1);
+		    //printf("[2] segsize %d, nsupr %d\n", segsize, nsupr);
 
-		/* if gb belongs to present thread then do the factorize */
-                if ((gb_col_cycle + k_row_cycle + 1) % num_thread == thread_id) {
-#ifdef PI_DEBUG
-                    printf ("dtrsv param 4 %d param 6 %d\n", segsize, nsupr);
-#endif
 #if defined (USE_VENDOR_BLAS)
                     dtrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
                             &uval[rukp], &incx, 1, 1, 1);
@@ -362,14 +397,22 @@ void pdgstrs2_omp
                     dtrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
                             &uval[rukp], &incx);
 #endif
-                }
+		} /* end task */
+		rukp += segsize;
+		stat->ops[FACT] += segsize * (segsize + 1);
+	    } /* end if segsize > 0 */
+	} /* end for j in parallel ... */
+/* #pragma omp taskwait */
+    }  /* end for b ... */
 
-                if (thread_id == 0)
-                    stat->ops[FACT] += segsize * (segsize + 1); // master thread updated the stats
-                rukp += segsize;
-            }
-        }
-    }                           /* for b ... */
+    /* Deallocate memory */
+    SUPERLU_FREE(blocks_index_pointers);
+
+#if 0
+    //#ifdef USE_VTUNE
+    __itt_pause(); // stop VTune
+    __SSC_MARK(0x222); // stop SDE tracing
+#endif
 
 } /* PDGSTRS2_omp */
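    The rewrite above first records, in a serial pass, where every block of row U(k,:) starts,
    and only then runs the block loop under "#pragma omp parallel for", spawning a task per
    sufficiently large segment. A reduced sketch of the same two-phase idea on a flat array
    (array names and the per-entry update are placeholders, not the library's data structures):

    #include <omp.h>
    #include <stdlib.h>

    /* Phase 1 (serial): compute each block's starting offset.
     * Phase 2 (parallel): every iteration owns private offsets, so the
     * block loop parallelizes cleanly with a static schedule. */
    void process_row(int nb, const int *blk_nnz, double *uval)
    {
        int *off = (int *) malloc(nb * sizeof(int));
        for (int b = 0, r = 0; b < nb; ++b) { off[b] = r; r += blk_nnz[b]; }

    #pragma omp parallel for schedule(static)
        for (int b = 0; b < nb; ++b) {
            for (int i = 0; i < blk_nnz[b]; ++i)
                uval[off[b] + i] *= 2.0;   /* placeholder for the dtrsv_ solve */
        }
        free(off);
    }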
 
diff --git a/SRC/psymbfact.h b/SRC/psymbfact.h
index b65f382..549e51e 100644
--- a/SRC/psymbfact.h
+++ b/SRC/psymbfact.h
@@ -279,8 +279,10 @@ typedef struct {
 /* Code for the type of the memory to expand */
 #define USUB_PR 0
 #define LSUB_PR 1
+/* Sherry: the following are already defined in superlu_enum_const.h 
 #define USUB 0
 #define LSUB 1
+*/
 
 /* 
  * Code for the type of computation - right looking (RL_SYMB); left
@@ -297,6 +299,3 @@ typedef struct {
 
 
 #endif /* __SUPERLU_DIST_PSYMBFACT */
-
-
-
diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c
index 288e6eb..cf92c9f 100644
--- a/SRC/pzgssvx.c
+++ b/SRC/pzgssvx.c
@@ -649,8 +649,10 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
     }
 
     /* ------------------------------------------------------------
-       Diagonal scaling to equilibrate the matrix. (simple scheme)
-       ------------------------------------------------------------*/
+     * Diagonal scaling to equilibrate the matrix. (simple scheme)
+     *   for row i = 1:n,  A(i,:) <- A(i,:) / max(abs(A(i,:)));
+     *   for column j = 1:n,  A(:,j) <- A(:,j) / max(abs(A(:,j)));
+     * ------------------------------------------------------------*/
     if ( Equil ) {
 #if ( DEBUGlevel>=1 )
 	CHECK_MALLOC(iam, "Enter equil");
@@ -727,7 +729,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 #if ( PRNTlevel>=1 )
 	    if ( !iam ) {
 		printf(".. equilibrated? *equed = %c\n", *equed);
-		/*fflush(stdout);*/
+		fflush(stdout);
 	    }
 #endif
 	} /* end if Fact ... */
@@ -897,8 +899,10 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	        t = SuperLU_timer_() - t;
 	        stat->utime[ROWPERM] = t;
 #if ( PRNTlevel>=1 )
-                if ( !iam ) printf(".. LDPERM job " IFMT "\t time: %.2f\n",
-	                            job, t);
+                if ( !iam ) {
+		    printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t);
+		    fflush(stdout);
+		}
 #endif
             } /* end if Fact ... */
 
@@ -917,7 +921,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	else *(unsigned char *)norm = 'I';
 	anorm = pzlangs(norm, A, grid);
 #if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. anorm %e\n", anorm);
+	if ( !iam ) { printf(".. anorm %e\n", anorm); 	fflush(stdout); }
 #endif
     }
 
@@ -1021,9 +1025,11 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	        /* Perform a symbolic factorization on Pc*Pr*A*Pc^T and set up
                    the nonzero data structures for L & U. */
 #if ( PRNTlevel>=1 ) 
-                if ( !iam )
-		  printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
+                if ( !iam ) {
+		    printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
 		          sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
+		    fflush(stdout);
+	        }
 #endif
   	        t = SuperLU_timer_();
 	        if ( !(Glu_freeable = (Glu_freeable_t *)
@@ -1049,6 +1055,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 			   	symb_mem_usage.for_lu*1e-6, 
 			   	symb_mem_usage.total*1e-6,
 			   	symb_mem_usage.expansions);
+			fflush(stdout);
 		    }
 #endif
 	    	} else { /* symbfact out of memory */
@@ -1217,6 +1224,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 		       avg / grid->nprow / grid->npcol * 1e-6,
 		       max * 1e-6);
 		printf("**************************************************\n");
+		fflush(stdout);
             }
 	} /* end printing stats */
     
diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c
index 61c3aa4..2c1eda0 100644
--- a/SRC/pzgstrf.c
+++ b/SRC/pzgstrf.c
@@ -13,7 +13,7 @@ at the top-level directory.
  * \brief Performs LU factorization in parallel
  *
  * <pre>
- * -- Distributed SuperLU routine (version 4.3) --
+ * -- Distributed SuperLU routine (version 5.2) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 1, 2014
  *
@@ -24,7 +24,8 @@ at the top-level directory.
  *     July    12, 2011  static scheduling and arbitrary look-ahead
  *     March   13, 2013  change NTAGS to MPI_TAG_UB value
  *     September 24, 2015 replace xLAMCH by xMACH, using C99 standard.
- *     December 31, 2015 rename xMACH to xMACH_DIST
+ *     December 31, 2015 rename xMACH to xMACH_DIST.
+ *     September 30, 2017 optimization for Intel Knights Landing (KNL) node.
  *
  * Sketch of the algorithm 
  *
@@ -138,6 +139,14 @@ at the top-level directory.
 */
 #define PHI_FRAMEWORK
 
+#if 0
+#define CACHELINE 64  /* bytes, Xeon Phi KNL */
+#else
+#define CACHELINE 0  /* do not worry about false sharing among threads */
+#endif
+//#define GEMM_PADLEN 1
+#define GEMM_PADLEN 8
+
 #define PZGSTRF2 pzgstrf2_trsm
 #define PZGSTRS2 pzgstrs2_omp
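    CACHELINE and GEMM_PADLEN feed the gemm_m_pad/gemm_k_pad/gemm_n_pad variables declared
    further down; the GEMM buffers are over-allocated so each dimension can be rounded up to a
    multiple of the vector width (8 doubles on KNL). A sketch of that round-up, under the
    assumption that this is how the padding is consumed in the Schur-update include file
    (the helper name is hypothetical):

    #define GEMM_PADLEN 8   /* 8 doubles = one 512-bit vector on KNL */

    /* Round a GEMM dimension up to the next multiple of the pad length. */
    static int pad_dim(int d)
    {
        return (d + GEMM_PADLEN - 1) / GEMM_PADLEN * GEMM_PADLEN;
    }
    /* pad_dim(13) == 16, pad_dim(16) == 16, pad_dim(1) == 8 */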
 
@@ -275,7 +284,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     int_t *iuip, *ruip; /* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
     doublecomplex *ucol;
     int *indirect, *indirect2;
-    doublecomplex *tempv, *tempv2d;
+    int_t *tempi;
+    doublecomplex *tempu, *tempv, *tempr;
+    /*    doublecomplex *tempv2d, *tempU2d;  Sherry */
     int iinfo;
     int *ToRecv, *ToSendD, **ToSendR;
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
@@ -283,8 +294,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     superlu_scope_t *scp;
     float s_eps;
     double thresh;
-    doublecomplex *tempU2d, *tempu;
-    int full, ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks;
+    /*int full;*/
+    int ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks;
     int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows,
         *Lblock, *Lrows, *perm_u, *sf_block, *sf_block_l, *nnodes_l,
         *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno;
@@ -298,10 +309,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 		    *     2 : transferred in Usub_buf[]
 		    *     3 : transferred in Uval_buf[]
 		    */
-    int **msgcnts, **msgcntsU; /* counts for each panel in the
-                                  look-ahead window */
-    int *factored;  /* factored[j]==0 : L col panel j is factorized */
-    int *factoredU; /* factoredU[i]==1 : U row panel i is factorized */
+    int **msgcnts, **msgcntsU; /* counts in the look-ahead window */
+    int *factored;  /* factored[j] == 0 : L col panel j is factorized. */
+    int *factoredU; /* factoredU[i] == 1 : U row panel i is factorized. */
     int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows;
     etree_node *head, *tail, *ptr;
     int *num_child;
@@ -314,16 +324,19 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     void *attr_val;
     int flag;
 
+    /* The following variables are used to pad GEMM dimensions so that
+       each is a multiple of vector length (8 doubles for KNL)  */
+    int gemm_m_pad = GEMM_PADLEN, gemm_k_pad = GEMM_PADLEN,
+        gemm_n_pad = GEMM_PADLEN;
+    int gemm_padding = 0;
+
     int iword = sizeof (int_t);
     int dword = sizeof (doublecomplex);
 
-    /* For measuring load imbalence in omp threads*/
+    /* For measuring load imbalance in omp threads */
     double omp_load_imblc = 0.0;
     double *omp_loop_time;
 
-    double CPUOffloadTimer      = 0;
-    double CPUOffloadFlop       = 0;
-    double CPUOffloadMop        = 0;
     double schur_flop_timer     = 0.0;
     double pdgstrf2_timer       = 0.0;
     double pdgstrs2_timer       = 0.0;
@@ -331,8 +344,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     double InitTimer            = 0.0; /* including compute schedule, malloc */
     double tt_start, tt_end;
 
-#if !defined( GPU_ACC )
-    /* Counter for couting memory operations */
+/* #if !defined( GPU_ACC ) */
+    /* Counters for memory operations and timings */
     double scatter_mem_op_counter  = 0.0;
     double scatter_mem_op_timer    = 0.0;
     double scatterL_mem_op_counter = 0.0;
@@ -340,6 +353,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     double scatterU_mem_op_counter = 0.0;
     double scatterU_mem_op_timer   = 0.0;
 
+    /* Counters for flops/gather/scatter and timings */
     double GatherLTimer            = 0.0;
     double LookAheadRowSepMOP      = 0.0;
     double GatherUTimer             = 0.0;
@@ -349,10 +363,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     double LookAheadScatterTimer   = 0.0;
     double LookAheadScatterMOP     = 0.0;
     double RemainGEMMTimer         = 0.0;
+    double RemainGEMM_flops        = 0.0;
     double RemainScatterTimer      = 0.0;
     double NetSchurUpTimer         = 0.0;
     double schur_flop_counter      = 0.0;
-#endif
+/* #endif */
 
 #if ( PRNTlevel>= 1)
     /* count GEMM max dimensions */
@@ -368,6 +383,15 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #if ( PROFlevel>=1 )
     double t1, t2;
     float msg_vol = 0, msg_cnt = 0;
+    double comm_wait_time = 0.0;
+    /* Record GEMM dimensions and times */
+    FILE *fopen(), *fgemm;
+    int gemm_count = 0;
+    typedef struct {
+	int m, n, k;
+	double microseconds;
+    } gemm_profile;
+    gemm_profile *gemm_stats;
 #endif
 
     /* Test the input parameters. */
@@ -383,6 +407,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
     /* Quick return if possible. */
     if (m == 0 || n == 0) return 0;
+
+    double tt1 = SuperLU_timer_ ();
  
     /* 
      * Initialization.  
@@ -405,8 +431,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     int tag_ub = *(int *) attr_val;
 
 #if ( PRNTlevel>=1 )
-    if (!iam)
-        printf ("MPI tag upper bound = %d\n", tag_ub);
+    if (!iam) {
+        printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout);
+    }
 #endif
 
 #if ( DEBUGlevel>=1 )
@@ -414,6 +441,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         printf (" ***** warning s_eps = %e *****\n", s_eps);
     CHECK_MALLOC (iam, "Enter pdgstrf()");
 #endif
+#if (PROFlevel >= 1 )
+    gemm_stats = (gemm_profile *) SUPERLU_MALLOC(nsupers * sizeof(gemm_profile));
+    if (iam == 0) fgemm = fopen("dgemm_mnk.dat", "w");
+    int *prof_sendR = intCalloc_dist(nsupers);
+#endif
 
     stat->ops[FACT]      = 0.0;
     stat->current_buffer = 0.0;
@@ -435,29 +467,37 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         if (i != 0) {
             if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * ((size_t) i))) )
                 ABORT ("Malloc fails for Lsub_buf.");
+	    tempi = Llu->Lsub_buf_2[0];
             for (jj = 0; jj < num_look_aheads; jj++)
-                Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i;
+		Llu->Lsub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */
+	    //Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i;
         }
         i = Llu->bufmax[1];
         if (i != 0) {
             if (!(Llu->Lval_buf_2[0] = doublecomplexMalloc_dist ((num_look_aheads + 1) * ((size_t) i))))
                 ABORT ("Malloc fails for Lval_buf[].");
+	    tempr = Llu->Lval_buf_2[0];
             for (jj = 0; jj < num_look_aheads; jj++)
-                Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i;
+		Llu->Lval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */
+	    //Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i;
         }
         i = Llu->bufmax[2];
         if (i != 0) {
             if (!(Llu->Usub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * i)))
                 ABORT ("Malloc fails for Usub_buf_2[].");
+	    tempi = Llu->Usub_buf_2[0];
             for (jj = 0; jj < num_look_aheads; jj++)
-                Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i;
+                Llu->Usub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */
+                //Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i;
         }
         i = Llu->bufmax[3];
         if (i != 0) {
             if (!(Llu->Uval_buf_2[0] = doublecomplexMalloc_dist ((num_look_aheads + 1) * i)))
                 ABORT ("Malloc fails for Uval_buf_2[].");
+	    tempr = Llu->Uval_buf_2[0];
             for (jj = 0; jj < num_look_aheads; jj++)
-                Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i;
+                Llu->Uval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */
+	    //Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i;
         }
     }
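    The pointer set-up rewritten above slices one contiguous allocation into
    (num_look_aheads + 1) windows of length i, window jj starting at offset i*jj; computing
    the offset directly rather than chaining off the previous pointer is what the
    /* vectorize */ comments refer to. In isolation (window 0 included for clarity):

    /* One allocation, (num_look_aheads + 1) equal windows of i entries each. */
    int_t *base = intMalloc_dist ((num_look_aheads + 1) * (size_t) i);
    for (int jj = 0; jj <= num_look_aheads; ++jj)
        Llu->Lsub_buf_2[jj] = base + (size_t) i * jj;   /* window jj */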
 
@@ -519,15 +559,16 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     if (!(factoredU = SUPERLU_MALLOC (nsupers * sizeof (int_t))))
         ABORT ("Malloc fails for factoredU[].");
     for (i = 0; i < nsupers; i++) factored[i] = factoredU[i] = -1;
+
     log_memory(2 * nsupers * iword, stat);
 
     int num_threads = 1;
 #ifdef _OPENMP
 #pragma omp parallel default(shared)
+    #pragma omp master
     {
-        if (omp_get_thread_num () == 0) {
-            num_threads = omp_get_num_threads ();
-        }
+         //if (omp_get_thread_num () == 0)
+        num_threads = omp_get_num_threads ();
     }
 #endif
 
@@ -538,9 +579,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #endif
 
 #if ( PRNTlevel>=1 )
-    if(!iam) printf(".. Starting with %d OpenMP threads \n", num_threads );
+    if(!iam) {
+       printf(".. Starting with %d OpenMP threads \n", num_threads );
+       fflush(stdout);
+    }
 #endif
-    double tt1 = SuperLU_timer_ ();
 
     nblocks = 0;
     ncb = nsupers / Pc; /* number of column blocks, horizontal */
@@ -556,10 +599,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     full_u_cols = SUPERLU_MALLOC(ncb * sizeof(int));
     blk_ldu = SUPERLU_MALLOC(ncb * sizeof(int));
 #endif
-    log_memory(2 * ncb * iword, stat);
-
 
-    /* insert a check condition here */
+    log_memory(2 * ncb * iword, stat);
 
 #if 0  /* Sherry: not used? */
     /* This bunch is used for static scheduling */
@@ -595,11 +636,12 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
     look_ahead_l = SUPERLU_MALLOC (nsupers * sizeof (int));
     look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int));
-    for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1;
+    for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1; /* vectorized */
     log_memory(3 * nsupers * iword, stat);
 
-    /* go through U-factor */
-    for (lb = 0; lb < nrb; ++lb) {
+    /* Sherry: omp parallel? 
+       not worth doing, due to concurrent write to look_ahead_l[jb] */
+    for (lb = 0; lb < nrb; ++lb) { /* go through U-factor */
         ib = lb * Pr + myrow;
         index = Llu->Ufstnz_br_ptr[lb];
         if (index) { /* Not an empty row */
@@ -613,7 +655,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
             }
         }
     }
-    if (myrow < nsupers % grid->nprow) {
+    if (myrow < nsupers % grid->nprow) { /* leftover block rows */
         ib = nrb * Pr + myrow;
         index = Llu->Ufstnz_br_ptr[nrb];
         if (index) {             /* Not an empty row */
@@ -629,8 +671,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     }
 
     if (options->SymPattern == NO) {
-        /* go through L-factor */
-        for (lb = 0; lb < ncb; lb++) {
+	/* Sherry: omp parallel?
+	   not worth doing, due to concurrent write to look_ahead_l[jb] */
+        for (lb = 0; lb < ncb; lb++) { /* go through L-factor */
             ib = lb * Pc + mycol;
             index = Llu->Lrowind_bc_ptr[lb];
             if (index) {
@@ -644,7 +687,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                 }
             }
         }
-        if (mycol < nsupers % grid->npcol) {
+        if (mycol < nsupers % grid->npcol) { /* leftover block columns */
             ib = ncb * Pc + mycol;
             index = Llu->Lrowind_bc_ptr[ncb];
             if (index) {
@@ -678,8 +721,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     /* Instead of half storage, we'll do full storage */
     if (!(Llu->ujrow = doublecomplexCalloc_dist (k * k)))
         ABORT ("Malloc fails for ujrow[].");
-    log_memory(k * k * iword, stat);
 #endif
+    log_memory(k * k * iword, stat);
 
 #if ( PRNTlevel>=1 )
     if (!iam) {
@@ -690,6 +733,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
              (long int) Llu->bufmax[0], (long int) Llu->bufmax[1],
              (long int) Llu->bufmax[2], (long int) Llu->bufmax[3],
              (long int) Llu->bufmax[4]);
+        fflush(stdout);
     }
 #endif
    
@@ -704,26 +748,30 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     ldt = sp_ienv_dist (3);     /* Size of maximum supernode */
     k = CEILING (nsupers, Pr);  /* Number of local block rows */
 
-    /* Following circuit is for finding maximum block size */
+    /* Following code is for finding maximum row dimension of all L panels */
     int local_max_row_size = 0;
     int max_row_size;
 
-    for (int i = 0; i < nsupers; ++i) {
-        int tpc = PCOL (i, grid);
-        if (mycol == tpc) {
-            lk = LBj (i, grid);
-            lsub = Lrowind_bc_ptr[lk];
-            if (lsub != NULL) {
-                local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
-            }
-        }
+#if 0
+#if defined _OPENMP  // Sherry: parallel reduction -- seems slower?
+#pragma omp parallel for reduction(max :local_max_row_size) private(lk,lsub) 
+#endif
+#endif
+    for (int i = mycol; i < nsupers; i += Pc) { /* grab my local columns */
+        //int tpc = PCOL (i, grid);
+	lk = LBj (i, grid);
+	lsub = Lrowind_bc_ptr[lk];
+	if (lsub != NULL) {
+	    if (lsub[1] > local_max_row_size) local_max_row_size = lsub[1];
+	}
 
     }
 
-    /* Max row size is global reduction of within A row */
-    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, (grid->rscp.comm));
+    /* Max row size is global reduction within a row */
+    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX,
+                   (grid->rscp.comm));
 
-    /* Buffer size is max of look ahead window */
+    /* Buffer size is max of look-ahead window */
     /* int_t buffer_size =
          SUPERLU_MAX (max_row_size * num_threads * ldt,
                       get_max_buffer_size ());           */
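    The loop above is the usual local-reduce/global-reduce pairing: each process scans only
    the block columns it owns (start at mycol, stride Pc), keeps a local maximum, and a single
    MPI_Allreduce with MPI_MAX over the row communicator makes the result global. Stripped of
    the L-structure details (row_size[] is a hypothetical stand-in for lsub[1]):

    int local_max = 0, max_row_size_g;
    for (int i = mycol; i < nsupers; i += Pc)      /* my block columns only */
        if (row_size[i] > local_max) local_max = row_size[i];
    MPI_Allreduce (&local_max, &max_row_size_g, 1, MPI_INT, MPI_MAX,
                   grid->rscp.comm);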
@@ -758,15 +806,24 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 					  Glu_persist, grid, perm_u );
 #endif
 
+    /* +16 to avoid cache line false sharing */
+    int_t bigv_size = SUPERLU_MAX(max_row_size * (bigu_size / ldt),
+				  (ldt*ldt + CACHELINE / dword) * num_threads);
+
     /* bigU and bigV are either on CPU or on GPU, not both. */
     doublecomplex* bigU; /* for storing entire U(k,:) panel, prepare for GEMM.
-                     bigU has the same size either on CPU or on CPU. */
-    doublecomplex* bigV; /* for GEMM output matrix, i.e. update matrix. 
-                     On CPU, bigV is small for block-by-block update.
-	             On GPU, bigV is large to hold the aggregate GEMM output.*/
+                      bigU has the same size either on CPU or on GPU. */
+    doublecomplex* bigV; /* for storing GEMM output matrix, i.e. update matrix. 
+	              bigV is large to hold the aggregate GEMM output.*/
 
 #if ( PRNTlevel>=1 )
-    if(!iam) printf("[%d] .. BIG U bigu_size " IFMT " (same either on CPU or GPU)\n", iam, bigu_size);
+    if(!iam) {
+	printf("max_nrows in L panel %d\n", max_row_size);
+	printf("\t.. GEMM buffer size: max_nrows X max_ncols = %d x %d\n",
+	       max_row_size, (bigu_size / ldt));
+	printf(".. BIG U size %d\t BIG V size %d\n", bigu_size, bigv_size);
+	fflush(stdout);
+    }
 #endif
 
 #ifdef GPU_ACC
@@ -774,7 +831,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     if ( checkCuda(cudaHostAlloc((void**)&bigU,  bigu_size * sizeof(doublecomplex), cudaHostAllocDefault)) )
         ABORT("Malloc fails for zgemm buffer U ");
 
-    int bigv_size = buffer_size;
+    bigv_size = buffer_size;
 #if ( PRNTlevel>=1 )
     if (!iam) printf("[%d] .. BIG V bigv_size %d, using buffer_size %d (on GPU)\n", iam, bigv_size, buffer_size);
 #endif
@@ -830,18 +887,24 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3) 
 			  + bigu_size + buffer_size ) * dword;
 
-#else  /* not to use GPU */
+#else  /* not CUDA */
     
+    // for GEMM padding 0
+    j = bigu_size / ldt;
+    bigu_size += (gemm_k_pad * (j + ldt + gemm_n_pad));
+    bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad));
+
+#ifdef __INTEL_COMPILER
+    bigU = _mm_malloc(bigu_size * sizeof(doublecomplex), 1<<12); // align at 4K page
+    bigV = _mm_malloc(bigv_size * sizeof(doublecomplex), 1<<12);
+#else
     if ( !(bigU = doublecomplexMalloc_dist(bigu_size)) )
-        ABORT ("Malloc fails for zgemm u buff U"); 
+        ABORT ("Malloc fails for zgemm U buffer"); 
           //Maximum size of bigU= sqrt(buffsize) ?
-
-    int bigv_size = 8 * ldt * ldt * num_threads;
-#if ( PRNTlevel>=1 )
-    if (!iam) printf("[%d] .. BIG V size (on CPU) %d\n", iam, bigv_size);
-#endif
+    // int bigv_size = 8 * ldt * ldt * num_threads;
     if ( !(bigV = doublecomplexMalloc_dist(bigv_size)) )
-        ABORT ("Malloc failed for zgemm buffer V");
+        ABORT ("Malloc failed for zgemm V buffer");
+#endif
 
 #endif /* end ifdef GPU_ACC */
 
@@ -853,21 +916,27 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     if(!iam) {
 	printf ("  Max row size is %d \n", max_row_size);
         printf ("  Threads per process %d \n", num_threads);
-	/* printf ("  Using buffer_size of %d \n", buffer_size); */
+	fflush(stdout);
     }
+
 #endif
 
+#if 0 /* Sherry */
     if (!(tempv2d = doublecomplexCalloc_dist (2 * ((size_t) ldt) * ldt)))
         ABORT ("Calloc fails for tempv2d[].");
     tempU2d = tempv2d + ldt * ldt;
-    if (!(indirect = SUPERLU_MALLOC (ldt * num_threads * sizeof(int))))
+#endif
+    /* Sherry: (ldt + 16), avoid cache line false sharing.
+       KNL cacheline size = 64 bytes = 16 int */
+    iinfo = ldt + CACHELINE / sizeof(int);
+    if (!(indirect = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int))))
         ABORT ("Malloc fails for indirect[].");
-    if (!(indirect2 = SUPERLU_MALLOC (ldt * num_threads * sizeof(int))))
+    if (!(indirect2 = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int))))
         ABORT ("Malloc fails for indirect[].");
     if (!(iuip = intMalloc_dist (k)))  ABORT ("Malloc fails for iuip[].");
     if (!(ruip = intMalloc_dist (k)))  ABORT ("Malloc fails for ruip[].");
 
-    log_memory(2 * ldt *ldt * dword + 2 * ldt * num_threads * iword
+    log_memory(2 * ldt*ldt * dword + 2 * iinfo * num_threads * iword
 	       + 2 * k * iword, stat);
 
     int_t *lookAheadFullRow,*lookAheadStRow,*lookAhead_lptr,*lookAhead_ib,
@@ -897,13 +966,12 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #else
     Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb*sizeof(Remain_info_t));
 #endif
-    log_memory(4 * mrb * iword + mrb * sizeof(Remain_info_t), stat);
 
-    doublecomplex *lookAhead_L_buff, *Remain_L_buff;
+    doublecomplex *lookAhead_L_buff, *Remain_L_buff; /* Stores entire L-panel */
     Ublock_info_t *Ublock_info;
-    ldt = sp_ienv_dist (3);       /* max supernode size */
+    ldt = sp_ienv_dist (3); /* max supernode size */
+    /* The following is quite loose */
     lookAhead_L_buff = doublecomplexMalloc_dist(ldt*ldt* (num_look_aheads+1) );
-    log_memory(ldt * ldt * (num_look_aheads+1) * dword, stat);
 
 #if 0
     Remain_L_buff = (doublecomplex *) _mm_malloc( sizeof(doublecomplex)*(Llu->bufmax[1]),64);
@@ -912,13 +980,18 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64);
     int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64);
 #else
-    Remain_L_buff = doublecomplexMalloc_dist(Llu->bufmax[1]);
+    j = gemm_m_pad * (ldt + max_row_size + gemm_k_pad);
+    Remain_L_buff = doublecomplexMalloc_dist(Llu->bufmax[1] + j); /* This is loose */
     Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb*sizeof(Ublock_info_t));
     int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
     int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
     int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
 #endif
-    log_memory(Llu->bufmax[1] * dword, stat);
+
+    long long alloc_mem = 4 * mrb * iword + mrb * sizeof(Remain_info_t)
+                        + ldt * ldt * (num_look_aheads+1) * dword
+ 			+ Llu->bufmax[1] * dword ;
+    log_memory(alloc_mem, stat);
 
     InitTimer = SuperLU_timer_() - tt1;
 
@@ -928,7 +1001,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
        ** Handle first block column separately to start the pipeline. **
        ################################################################## */
     look_id = 0;
-    msgcnt = msgcnts[0]; /* First count in the window */
+    msgcnt = msgcnts[0]; /* Lsub[0] to be transferred */
     send_req = send_reqs[0];
     recv_req = recv_reqs[0];
 
@@ -952,7 +1025,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         lsub = Lrowind_bc_ptr[lk];
         lusup = Lnzval_bc_ptr[lk];
         if (lsub) {
+	    /* number of entries in Lsub_buf[] to be transferred */
             msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+	    /* number of entries in Lval_buf[] to be transferred */
             msgcnt[1] = lsub[1] * SuperSize (k);
         } else {
             msgcnt[0] = msgcnt[1] = 0;
@@ -964,9 +1039,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                 TIC (t1);
 #endif
 
-                MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj, SLU_MPI_TAG (0, 0) /* 0 */ ,
+                MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj,
+                           SLU_MPI_TAG (0, 0) /* 0 */,
                            scp->comm, &send_req[pj]);
-                MPI_Isend (lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj, SLU_MPI_TAG (1, 0) /* 1 */ ,
+                MPI_Isend (lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
+                           SLU_MPI_TAG (1, 0) /* 1 */,
                            scp->comm, &send_req[pj + Pc]);
 #if ( DEBUGlevel>=2 )
                 printf ("[%d] first block cloumn Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
@@ -976,6 +1053,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #if ( PROFlevel>=1 )
                 TOC (t2, t1);
                 stat->utime[COMM] += t2;
+                stat->utime[COMM_RIGHT] += t2;
+		++prof_sendR[lk];
                 msg_cnt += 2;
                 msg_vol += msgcnt[0] * iword + msgcnt[1] * dword;
 #endif
@@ -984,12 +1063,20 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     } else {  /* Post immediate receives. */
         if (ToRecv[k] >= 1) {   /* Recv block column L(:,0). */
             scp = &grid->rscp;  /* The scope of process row. */
+#if ( PROFlevel>=1 )
+	    TIC (t1);
+#endif
             MPI_Irecv (Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, kcol,
                        SLU_MPI_TAG (0, 0) /* 0 */ ,
                        scp->comm, &recv_req[0]);
             MPI_Irecv (Lval_buf_2[0], Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, kcol,
                        SLU_MPI_TAG (1, 0) /* 1 */ ,
                        scp->comm, &recv_req[1]);
+#if ( PROFlevel>=1 )
+	    TOC (t2, t1);
+	    stat->utime[COMM] += t2;
+	    stat->utime[COMM_RIGHT] += t2;
+#endif
         }
     } /* end if mycol == 0 */
 
@@ -1001,12 +1088,20 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
             scp = &grid->cscp;  /* The scope of process column. */
             Usub_buf = Llu->Usub_buf_2[0];
             Uval_buf = Llu->Uval_buf_2[0];
+#if ( PROFlevel>=1 )
+	    TIC (t1);
+#endif
             MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
                        SLU_MPI_TAG (2, 0) /* 2%tag_ub */ ,
                        scp->comm, &recv_reqs_u[0][0]);
             MPI_Irecv (Uval_buf, Llu->bufmax[3], SuperLU_MPI_DOUBLE_COMPLEX, krow,
                        SLU_MPI_TAG (3, 0) /* 3%tag_ub */ ,
                        scp->comm, &recv_reqs_u[0][1]);
+#if ( PROFlevel>=1 )
+	    TOC (t2, t1);
+	    stat->utime[COMM] += t2;
+	    stat->utime[COMM_DOWN] += t2;
+#endif
         }
     }
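
As in the two blocks above, receives for panels that will be needed later are posted up front with MPI_Irecv and completed only when the data is consumed, so the transfers overlap the numerical work of the pipeline. A stripped-down sketch of that pre-post/complete pattern (buffer and tag names here are placeholders, not the library's):

    #include <mpi.h>

    /* Post the receive early, overlap it with local work, and block only
     * when the incoming panel is actually required. */
    void prepost_panel_recv(MPI_Comm comm, int src, int tag,
                            double *panel_buf, int max_len)
    {
        MPI_Request req;
        MPI_Status  status;

        MPI_Irecv(panel_buf, max_len, MPI_DOUBLE, src, tag, comm, &req);

        /* ... factor the current panel / run Schur-complement updates ... */

        MPI_Wait(&req, &status);  /* complete just before the panel is used */
    }
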
 
@@ -1034,7 +1129,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
             kk = perm_c_supno[kk0]; /* use the ordering from static schedule */
             look_id = kk0 % (1 + num_look_aheads); /* which column in window */
 
-            if (look_ahead[kk] < k0) { /* does not depend on current column */
+            if (look_ahead[kk] < k0) { /* does not depend on current column k */
                 kcol = PCOL (kk, grid);
                 if (mycol == kcol) { /* I own this panel */
 
@@ -1053,7 +1148,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                     msgcnt = msgcnts[look_id];  /* point to the proper count array */
                     send_req = send_reqs[look_id];
 
-                    lk = LBj (kk, grid);    /* Local block number in L */
+                    lk = LBj (kk, grid);    /* Local block number in L. */
                     lsub1 = Lrowind_bc_ptr[lk];
                     if (lsub1) {
                         msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR; /* size of metadata */
@@ -1066,12 +1161,21 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                     for (pj = 0; pj < Pc; ++pj) {
                         if (ToSendR[lk][pj] != EMPTY) {
                             lusup1 = Lnzval_bc_ptr[lk];
+#if ( PROFlevel>=1 )
+			    TIC (t1);
+#endif
                             MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
                                        SLU_MPI_TAG (0, kk0),  /* (4*kk0)%tag_ub */
                                        scp->comm, &send_req[pj]);
                             MPI_Isend (lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
                                        SLU_MPI_TAG (1, kk0),  /* (4*kk0+1)%tag_ub */
                                        scp->comm, &send_req[pj + Pc]);
+#if ( PROFlevel>=1 )
+			    TOC (t2, t1);
+			    stat->utime[COMM] += t2;
+			    stat->utime[COMM_RIGHT] += t2;
+			    ++prof_sendR[lk];
+#endif
 #if ( DEBUGlevel>=2 )
 			    printf ("[%d] -1- Send L(:,%4d): #lsub1 %4d, #lusup1 %4d right to Pj %2d\n",
 				    iam, kk, msgcnt[0], msgcnt[1], pj);
@@ -1084,7 +1188,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                     if (ToRecv[kk] >= 1) {
                         scp = &grid->rscp;  /* The scope of process row. */
                         recv_req = recv_reqs[look_id];
-
+#if ( PROFlevel>=1 )
+			TIC (t1);
+#endif
                         MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0],
                                    mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
                                    scp->comm, &recv_req[0]);
@@ -1092,29 +1198,41 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                                    SuperLU_MPI_DOUBLE_COMPLEX, kcol,
                                    SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
                                    scp->comm, &recv_req[1]);
+#if ( PROFlevel>=1 )
+			TOC (t2, t1);
+			stat->utime[COMM] += t2;
+			stat->utime[COMM_RIGHT] += t2;
+#endif
                     }
                     /* stat->time10 += SuperLU_timer_() - ttt1; */
                 }  /* end if mycol == Pc(kk) */
-            }  /* end if look-ahead in L supernodes */
+            }  /* end if look-ahead in L panels */
 
-            /* post irecv for U-row look-ahead */
+            /* Pre-post irecv for U-row look-ahead */
             krow = PROW (kk, grid);
             if (myrow != krow) {
                 if (ToRecv[kk] == 2) { /* post iRecv block row U(kk,:). */
                     scp = &grid->cscp;  /* The scope of process column. */
                     Usub_buf = Llu->Usub_buf_2[look_id];
                     Uval_buf = Llu->Uval_buf_2[look_id];
-
+#if ( PROFlevel>=1 )
+		    TIC (t1);
+#endif
                     MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
                                SLU_MPI_TAG (2, kk0) /* (4*kk0+2)%tag_ub */ ,
                                scp->comm, &recv_reqs_u[look_id][0]);
                     MPI_Irecv (Uval_buf, Llu->bufmax[3], SuperLU_MPI_DOUBLE_COMPLEX, krow,
                                SLU_MPI_TAG (3, kk0) /* (4*kk0+3)%tag_ub */ ,
                                scp->comm, &recv_reqs_u[look_id][1]);
+#if ( PROFlevel>=1 )
+		    TOC (t2, t1);
+		    stat->utime[COMM] += t2;
+		    stat->utime[COMM_DOWN] += t2;
+#endif
                 }
             }
 
-        }  /* end for each column in look-ahead window for L supernodes */
+        }  /* end for each column in look-ahead window for L panels */
 
         /* stat->time4 += SuperLU_timer_()-tt1; */
 
@@ -1126,6 +1244,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         for (kk0 = kk1; kk0 < kk2; kk0++) {
             kk = perm_c_supno[kk0]; /* order determined from static schedule */  
             if (factoredU[kk0] != 1 && look_ahead[kk] < k0) {
+		/* does not depend on current column k */
                 kcol = PCOL (kk, grid);
                 krow = PROW (kk, grid);
                 lk = LBj (kk, grid);  /* Local block number across row. NOT USED?? -- Sherry */
@@ -1146,6 +1265,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                 } else { /* Check to receive L(:,kk) from the left */
                     flag0 = flag1 = 0;
                     if ( ToRecv[kk] >= 1 ) {
+#if ( PROFlevel>=1 )
+			TIC (t1);
+#endif
                         if ( recv_req[0] != MPI_REQUEST_NULL ) {
                             MPI_Test (&recv_req[0], &flag0, &status);
                             if ( flag0 ) {
@@ -1161,7 +1283,14 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                                 recv_req[1] = MPI_REQUEST_NULL;
                             }
                         } else flag1 = 1;
-                    } else msgcnt[0] = 0;
+#if ( PROFlevel>=1 )
+			TOC (t2, t1);
+			stat->utime[COMM] += t2;
+			stat->utime[COMM_RIGHT] += t2;
+#endif
+                    } else {
+                        msgcnt[0] = 0;
+ 	            }
                 }
 
                 if (flag0 && flag1) { /* L(:,kk) is ready */
@@ -1171,10 +1300,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                         factoredU[kk0] = 1;
                         /* Parallel triangular solve across process row *krow* --
                            U(k,j) = L(k,k) \ A(k,j).  */
-                        /* double ttt2 = SuperLU_timer_(); */
                         double ttt2 = SuperLU_timer_();
 #ifdef _OPENMP
-#pragma omp parallel
+/* #pragma omp parallel */ /* Sherry -- parallel done inside pzgstrs2 */
 #endif
 			{
                             PZGSTRS2 (kk0, kk, Glu_persist, grid, Llu,
@@ -1226,7 +1354,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                         /* stat->time2 += SuperLU_timer_()-tt1; */
 
                     } /* end if myrow == krow */
-                } /* end if flag0 ... */
+                } /* end if flag0 & flag1 ... */
             } /* end if factoredU[] ... */
         } /* end for kk0 ... */
 
@@ -1248,13 +1376,21 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         if (mycol == kcol) {
             lk = LBj (k, grid); /* Local block number in L */
 
+#if ( PROFlevel>=1 )
+	    TIC(t1);
+#endif
             for (pj = 0; pj < Pc; ++pj) {
-                /* Wait for Isend to complete before using lsub/lusup buffer */
+                /* Wait for Isend to complete before using lsub/lusup buffer. */
                 if (ToSendR[lk][pj] != EMPTY) {
                     MPI_Wait (&send_req[pj], &status);
                     MPI_Wait (&send_req[pj + Pc], &status);
                 }
             }
+#if ( PROFlevel>=1 )
+	    TOC(t2, t1);
+	    stat->utime[COMM] += t2;
+	    stat->utime[COMM_RIGHT] += t2;
+#endif
             lsub = Lrowind_bc_ptr[lk];
             lusup = Lnzval_bc_ptr[lk];
         } else {
@@ -1265,8 +1401,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                 /* ============================================= *
                 * Waiting for L(:,kk) for outer-product update  *
                  * if iam in U(kk,:), then the diagonal block    *
-		 * did not reach in time for panel factorization *
-		 * of U(k,:)           	                         *
+                 * did not reach in time for panel factorization *
+                 * of U(k,:).          	                         *
                  * ============================================= */
 #if ( PROFlevel>=1 )
                 TIC (t1);
@@ -1298,6 +1434,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #if ( PROFlevel>=1 )
                 TOC (t2, t1);
                 stat->utime[COMM] += t2;
+                stat->utime[COMM_RIGHT] += t2;
 #endif
 #if ( DEBUGlevel>=2 )
                 printf("[%d] Recv L(:,%4d): #lsub %4d, #lusup %4d from Pc %2d\n",
@@ -1315,7 +1452,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
             lsub = Lsub_buf_2[look_id];
             lusup = Lval_buf_2[look_id];
-        }                       /* if mycol = Pc(k) */
+        }  /* else if mycol = Pc(k) */
         /* stat->time1 += SuperLU_timer_()-tt1; */
 
         scp = &grid->cscp;      /* The scope of process column. */
@@ -1331,7 +1468,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                    U(k,j) = L(k,k) \ A(k,j).  */
                  double ttt2 = SuperLU_timer_(); 
 #ifdef _OPENMP
-#pragma omp parallel
+/* #pragma omp parallel */ /* Sherry -- parallel done inside pzgstrs2 */
 #endif
                 {
                     PZGSTRS2 (k0, k, Glu_persist, grid, Llu, stat);
@@ -1350,7 +1487,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
                 if (ToSendD[lk] == YES) {
                     for (pi = 0; pi < Pr; ++pi) {
-                        if (pi != myrow) {
+                        if (pi != myrow) { /* Matching recv was pre-posted before */
 #if ( PROFlevel>=1 )
                             TIC (t1);
 #endif
@@ -1363,6 +1500,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #if ( PROFlevel>=1 )
                             TOC (t2, t1);
                             stat->utime[COMM] += t2;
+                            stat->utime[COMM_DOWN] += t2;
                             msg_cnt += 2;
                             msg_vol += msgcnt[2] * iword + msgcnt[3] * dword;
 #endif
@@ -1373,20 +1511,28 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                     } /* for pi ... */
                 } /* if ToSendD ... */
 
-            } else { /* Panel U(k,:) already factorized */
+            } else { /* Panel U(k,:) already factorized from previous look-ahead */
 
                /* ================================================ *
-                 * Wait for downward sending of U(k,:) to complete *
-		 * for outer-product update                        *
-                 * =============================================== */
+                * Wait for downward sending of U(k,:) to complete  *
+		* for outer-product update.                        *
+                * ================================================ */
 
                 if (ToSendD[lk] == YES) {
+#if ( PROFlevel>=1 )
+		    TIC (t1);
+#endif
                     for (pi = 0; pi < Pr; ++pi) {
                         if (pi != myrow) {
                             MPI_Wait (&send_reqs_u[look_id][pi], &status);
                             MPI_Wait (&send_reqs_u[look_id][pi + Pr], &status);
                         }
                     }
+#if ( PROFlevel>=1 )
+		    TOC (t2, t1);
+		    stat->utime[COMM] += t2;
+		    stat->utime[COMM_DOWN] += t2;
+#endif
                 }
                 msgcnt[2] = msgcntsU[look_id][2];
                 msgcnt[3] = msgcntsU[look_id][3];
@@ -1395,9 +1541,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
         } else {    /* myrow != krow */
 
-            /* ========================================= *
-             * wait for U(k,:) for outer-product updates *
-             * ========================================= */
+            /* ========================================== *
+             * Wait for U(k,:) for outer-product updates. *
+             * ========================================== */
 
             if (ToRecv[k] == 2) { /* Recv block row U(k,:). */
 #if ( PROFlevel>=1 )
@@ -1411,6 +1557,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #if ( PROFlevel>=1 )
                 TOC (t2, t1);
                 stat->utime[COMM] += t2;
+                stat->utime[COMM_DOWN] += t2;
 #endif
                 usub = Usub_buf;
                 uval = Uval_buf;
@@ -1484,8 +1631,12 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
             j = jj0 = 0;
 
 /************************************************************************/
+#if 0
+	for (jj = 0; jj < nub; ++jj) assert(perm_u[jj] == jj); /* Sherry */
+#endif
             double ttx =SuperLU_timer_();
 
+//#include "zlook_ahead_update_v4.c"
 #include "zlook_ahead_update.c"
 
             lookaheadupdatetimer += SuperLU_timer_() - ttx;
@@ -1512,6 +1663,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
                         look_id = kk0 % (1 + num_look_aheads);
                         recv_req = recv_reqs[look_id];
+#if ( PROFlevel>=1 )
+			TIC (t1);
+#endif
                         MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0],
                                    mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
                                    scp->comm, &recv_req[0]);
@@ -1519,6 +1673,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                                    SuperLU_MPI_DOUBLE_COMPLEX, kcol,
                                    SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
                                    scp->comm, &recv_req[1]);
+#if ( PROFlevel>=1 )
+			TOC (t2, t1);
+			stat->utime[COMM] += t2;
+			stat->utime[COMM_RIGHT] += t2;
+#endif
                     }
                 } else {
                     lk = LBj (kk, grid);    /* Local block number. */
@@ -1551,15 +1710,24 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                         scp = &grid->rscp;  /* The scope of process row. */
                         for (pj = 0; pj < Pc; ++pj) {
                             if (ToSendR[lk][pj] != EMPTY) {
+#if ( PROFlevel>=1 )
+			       TIC (t1);
+#endif
                                 MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
                                            SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
                                            scp->comm, &send_req[pj]);
                                 MPI_Isend (lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
                                            SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
                                            scp->comm, &send_req[pj + Pc]);
+#if ( PROFlevel>=1 )
+				TOC (t2, t1);
+				stat->utime[COMM] += t2;
+				stat->utime[COMM_RIGHT] += t2;
+				++prof_sendR[lk];
+#endif
                             }
-                        }
-                    }           /* for pj ... */
+                        } /* end for pj ... */
+                    } /* if    factored[kk] ... */
                 }
             }
         }
@@ -1575,6 +1743,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #else 
 
 /*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/
+//#include "zSchCompUdt-2Ddynamic_v6.c"
+
 #include "zSchCompUdt-2Ddynamic.c"
 
 #endif 
@@ -1584,7 +1754,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         
         NetSchurUpTimer += SuperLU_timer_() - tsch;
 
-    }  /* for k0 = 0, ... */
+    }  /* MAIN LOOP for k0 = 0, ... */
 
     /* ##################################################################
        ** END MAIN LOOP: for k0 = ...
@@ -1592,12 +1762,20 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     
     pxgstrfTimer = SuperLU_timer_() - pxgstrfTimer;
 
-    /* updating total flops */
 #if ( PRNTlevel>=1 )
+    /* Print detailed statistics */
+    /* Updating total flops */
+    double allflops;
+    MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM,
+	       0, grid->comm);
     if ( iam==0 ) {
 	printf("\nInitialization time\t%8.2lf seconds\n"
 	       "\t Serial: compute static schedule, allocate storage\n", InitTimer);
-        printf("\n---- Time breakdown in factorization ----\n");
+        printf("\n==== Time breakdown in factorization (rank 0) ====\n");
+	printf("Panel factorization \t %8.2lf seconds\n",
+	       pdgstrf2_timer + pdgstrs2_timer);
+	printf(".. L-panel pxgstrf2 \t %8.2lf seconds\n", pdgstrf2_timer);
+	printf(".. U-panel pxgstrs2 \t %8.2lf seconds\n", pdgstrs2_timer);
 	printf("Time in Look-ahead update \t %8.2lf seconds\n", lookaheadupdatetimer);
         printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer);
         printf(".. Time to Gather L buffer\t %8.2lf  (Separate L panel by Lookahead/Remain)\n", GatherLTimer);
@@ -1606,21 +1784,20 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         printf(".. Time in GEMM %8.2lf \n",
 	       LookAheadGEMMTimer + RemainGEMMTimer);
         printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer);
-        printf("\t* Remain\t %8.2lf \n", RemainGEMMTimer);
-
+        printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n", 
+	       RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9);
         printf(".. Time to Scatter %8.2lf \n", 
 	       LookAheadScatterTimer + RemainScatterTimer);
         printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer);
         printf("\t* Remain\t %8.2lf \n", RemainScatterTimer);
 
-        printf("Total Time in Factorization            \t: %8.2lf seconds, \n", pxgstrfTimer);
-        printf("Total time in Schur update with offload\t  %8.2lf seconds,\n",CPUOffloadTimer );
+        printf("Total factorization time            \t: %8.2lf seconds, \n", pxgstrfTimer);
         printf("--------\n");
 	printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n);
     }
 #endif
     
-#if ( DEBUGlevel>=2 )
+#if ( DEBUGlevel>=3 )
     for (i = 0; i < Pr * Pc; ++i) {
         if (iam == i) {
             zPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
@@ -1632,8 +1809,6 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     }
 #endif
 
-    // printf("Debug : MPI buffers 1\n");
-
     /********************************************************
      * Free memory                                          *
      ********************************************************/
@@ -1673,7 +1848,6 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     SUPERLU_FREE (factored);
     log_memory(-(6 * nsupers * iword), stat);
 
-
     for (i = 0; i <= num_look_aheads; i++) {
         SUPERLU_FREE (msgcnts[i]);
         SUPERLU_FREE (msgcntsU[i]);
@@ -1693,8 +1867,6 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     SUPERLU_FREE (recv_reqs);
     SUPERLU_FREE (send_reqs);
 
-    // printf("Debug : MPI buffers 3\n");
-
 #ifdef GPU_ACC
     checkCuda (cudaFreeHost (bigV));
     checkCuda (cudaFreeHost (bigU));
@@ -1705,15 +1877,19 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     SUPERLU_FREE( streams );
     SUPERLU_FREE( stream_end_col );
 #else
+  #ifdef __INTEL_COMPILER
+    _mm_free (bigU);
+    _mm_free (bigV);
+  #else
     SUPERLU_FREE (bigV);
     SUPERLU_FREE (bigU);
-#endif
-
+  #endif
+    /* Decrement freed memory from memory stat. */
     log_memory(-(bigv_size + bigu_size) * dword, stat);
-    // printf("Debug : MPI buffers 5\n");
+#endif
 
     SUPERLU_FREE (Llu->ujrow);
-    SUPERLU_FREE (tempv2d);
+    // SUPERLU_FREE (tempv2d);/* Sherry */
     SUPERLU_FREE (indirect);
     SUPERLU_FREE (indirect2); /* Sherry added */
     SUPERLU_FREE (iuip);
@@ -1727,7 +1903,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     SUPERLU_FREE(omp_loop_time);
     SUPERLU_FREE(full_u_cols);
     SUPERLU_FREE(blk_ldu);
+#if ( PRNTlevel>=1 )
     log_memory(-2 * ncb * dword, stat);
+#endif
 
     SUPERLU_FREE(lookAheadFullRow);
     SUPERLU_FREE(lookAheadStRow);
@@ -1761,8 +1939,6 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     if ( iinfo == n + 1 ) *info = 0;
     else *info = iinfo;
 
-    // printf("test out\n");
-
 #if ( PROFlevel>=1 )
     TOC (t2, t1);
     stat->utime[COMM] += t2;
@@ -1777,13 +1953,29 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
                     1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
         MPI_Reduce (&msg_vol, &msg_vol_max,
                     1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
-        if (!iam) {
+        if ( iam==0 ) {
             printf ("\tPZGSTRF comm stat:"
                     "\tAvg\tMax\t\tAvg\tMax\n"
                     "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
                     msg_cnt_sum / Pr / Pc, msg_cnt_max,
                     msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6);
+	    printf("\t\tcomm time on task 0: %8.2lf\n"
+		   "\t\t\tcomm down DIAG block %8.2lf\n"
+		   "\t\t\tcomm right L panel %8.2lf\n"
+		   "\t\t\tcomm down U panel %8.2lf\n",
+		   stat->utime[COMM], stat->utime[COMM_DIAG],
+		   stat->utime[COMM_RIGHT], stat->utime[COMM_DOWN]);
+	    //#include <float.h>
+	    //int Digs = DECIMAL_DIG;
+	    printf("gemm_count %d\n", gemm_count);
+	    for (i = 0; i < gemm_count; ++i)
+		fprintf(fgemm, "%8d%8d%8d\t %20.16e\t%8d\n", gemm_stats[i].m, gemm_stats[i].n,
+			gemm_stats[i].k, gemm_stats[i].microseconds, prof_sendR[i]);
+	    
+	    fclose(fgemm);
         }
+	SUPERLU_FREE(gemm_stats);
+	SUPERLU_FREE(prof_sendR);
     }
 #endif
 
@@ -1796,7 +1988,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
         printf (".. # total msg\t%d\n", iinfo);
 #endif
 
-#if ( DEBUGlevel>=2 )
+#if ( DEBUGlevel>=3 )
     for (i = 0; i < Pr * Pc; ++i) {
         if (iam == i) {
             zPrintLblocks (iam, nsupers, grid, Glu_persist, Llu);
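
The PROFlevel>=1 block above dumps one line per GEMM call (m, n, k, elapsed time, and the matching L-panel send count) to the fgemm file on rank 0. A sketch of that bookkeeping in isolation; the record type and file handling here are assumptions inferred from the fprintf format, not the library's actual declarations:

    #include <stdio.h>

    /* Field names inferred from the fprintf() above; the real type in the
     * library may be declared elsewhere and differ. */
    typedef struct { int m, n, k; double microseconds; } gemm_record;

    /* Dump all accumulated GEMM records plus the per-panel send counter,
     * one call per line, as rank 0 does at the end of factorization. */
    static void dump_gemm_records(const char *fname, const gemm_record *rec,
                                  const int *sendR, int count)
    {
        FILE *f = fopen(fname, "w");
        if (!f) return;
        for (int i = 0; i < count; ++i)
            fprintf(f, "%8d%8d%8d\t %20.16e\t%8d\n",
                    rec[i].m, rec[i].n, rec[i].k,
                    rec[i].microseconds, sendR[i]);
        fclose(f);
    }
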
diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c
index 3f63915..b4e3aca 100644
--- a/SRC/pzgstrf2.c
+++ b/SRC/pzgstrf2.c
@@ -13,10 +13,13 @@ at the top-level directory.
  * \brief Performs panel LU factorization.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * August 15, 2014
  *
+ * Modified:
+ *   September 30, 2017
+ *
  * <pre>
  * Purpose
  * =======
@@ -96,6 +99,7 @@ pzgstrf2_trsm
     int_t Pr;
     MPI_Status status;
     MPI_Comm comm = (grid->cscp).comm;
+    double t1, t2;
 
     /* Initialization. */
     iam = grid->iam;
@@ -127,16 +131,25 @@ pzgstrf2_trsm
     if ( U_diag_blk_send_req && 
 	 U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) {
         /* There are pending sends - wait for all Isend to complete */
-        for (pr = 0; pr < Pr; ++pr)
+#if ( PROFlevel>=1 )
+	TIC (t1);
+#endif
+        for (pr = 0; pr < Pr; ++pr) {
             if (pr != myrow) {
                 MPI_Wait (U_diag_blk_send_req + pr, &status);
             }
-
+	}
+#if ( PROFlevel>=1 )
+	TOC (t2, t1);
+	stat->utime[COMM] += t2;
+	stat->utime[COMM_DIAG] += t2;
+#endif
 	/* flag no more outstanding send request. */
 	U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL;
     }
 
     if (iam == pkk) {            /* diagonal process */
+	/* ++++ First step compute diagonal block ++++++++++ */
         for (j = 0; j < jlst - jfst; ++j) {  /* for each column in panel */
             /* Diagonal pivot */
             i = luptr;
@@ -197,13 +210,16 @@ pzgstrf2_trsm
 
         }                       /* for column j ...  first loop */
 
-	/* ++++++++++second step ====== */
+	/* ++++ Second step compute off-diagonal block with communication  ++*/
 
         ublk_ptr = ujrow = Llu->ujrow;
 
-        if (U_diag_blk_send_req && iam == pkk)  { /* Send the U block */
+        if (U_diag_blk_send_req && iam == pkk)  { /* Send the U block downward */
             /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
-            for (pr = 0; pr < Pr; ++pr)
+#if ( PROFlevel>=1 )
+	    TIC (t1);
+#endif
+            for (pr = 0; pr < Pr; ++pr) {
                 if (pr != krow) {
                     /* tag = ((k0<<2)+2) % tag_ub;        */
                     /* tag = (4*(nsupers+k0)+2) % tag_ub; */
@@ -212,6 +228,12 @@ pzgstrf2_trsm
                                comm, U_diag_blk_send_req + pr);
 
                 }
+            }
+#if ( PROFlevel>=1 )
+	    TOC (t2, t1);
+	    stat->utime[COMM] += t2;
+	    stat->utime[COMM_DIAG] += t2;
+#endif
 
 	    /* flag outstanding Isend */
             U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* Sherry */
@@ -219,8 +241,6 @@ pzgstrf2_trsm
 
         /* pragma below would be changed by an MKL call */
 
-        char uplo = 'u', side = 'r', transa = 'n', diag = 'n';
-
         l = nsupr - nsupc;
         // n = nsupc;
 	doublecomplex alpha = {1.0, 0.0};
@@ -230,32 +250,36 @@ pzgstrf2_trsm
 #endif
 
 #if defined (USE_VENDOR_BLAS)
-        ztrsm_ (&side, &uplo, &transa, &diag,
-                &l, &nsupc,
+        ztrsm_ ("R", "U", "N", "N", &l, &nsupc,
                 &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr,
 		1, 1, 1, 1);
 #else
-        ztrsm_ (&side, &uplo, &transa, &diag,
-                &l, &nsupc,
+        ztrsm_ ("R", "U", "N", "N", &l, &nsupc,
                 &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr);
 #endif
-
+	stat->ops[FACT] += 4.0 * ((flops_t) nsupc * (nsupc+1) * l);
     } else {  /* non-diagonal process */
-        /* ================================================ *
-         * Receive the diagonal block of U                  *
-         * for panel factorization of L(:,k)                *
-         * note: we block for panel factorization of L(:,k) *
-         * but panel factorization of U(:,k) don't          *
-         * ================================================ */
+        /* ================================================================== *
+         * Receive the diagonal block of U for panel factorization of L(:,k). * 
+         * Note: we block for panel factorization of L(:,k), but panel        *
+	 * factorization of U(:,k) does not block                             *
+         * ================================================================== */
 
         /* tag = ((k0<<2)+2) % tag_ub;        */
         /* tag = (4*(nsupers+k0)+2) % tag_ub; */
         // printf("hello message receiving%d %d\n",(nsupc*(nsupc+1))>>1,SLU_MPI_TAG(4,k0));
+#if ( PROFlevel>=1 )
+	TIC (t1);
+#endif
         MPI_Recv (ublk_ptr, (nsupc * nsupc), SuperLU_MPI_DOUBLE_COMPLEX, krow,
                   SLU_MPI_TAG (4, k0) /* tag */ ,
                   comm, &status);
+#if ( PROFlevel>=1 )
+	TOC (t2, t1);
+	stat->utime[COMM] += t2;
+	stat->utime[COMM_DIAG] += t2;
+#endif
         if (nsupr > 0) {
-            char uplo = 'u', side = 'r', transa = 'n', diag = 'n';
             doublecomplex alpha = {1.0, 0.0};
 
 #ifdef PI_DEBUG
@@ -264,17 +288,16 @@ pzgstrf2_trsm
                 printf (" Rank :%d \t Empty block column occured :\n", iam);
 #endif
 #if defined (USE_VENDOR_BLAS)
-            ztrsm_ (&side, &uplo, &transa, &diag,
-                    &nsupr, &nsupc,
+            ztrsm_ ("R", "U", "N", "N", &nsupr, &nsupc,
                     &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1);
 #else
-            ztrsm_ (&side, &uplo, &transa, &diag,
-                    &nsupr, &nsupc,
+            ztrsm_ ("R", "U", "N", "N", &nsupr, &nsupc,
                     &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr);
 #endif
+	    stat->ops[FACT] += 4.0 * ((flops_t) nsupc * (nsupc+1) * nsupr);
         }
 
-    }                           /* end if pkk ... */
+    } /* end if pkk ... */
 
     /* printf("exiting pzgstrf2 %d \n", grid->iam);  */
 
@@ -301,12 +324,10 @@ void pzgstrs2_omp
     int_t *usub;
     doublecomplex *lusup, *uval;
 
-#ifdef _OPENMP
-    int thread_id = omp_get_thread_num ();
-    int num_thread = omp_get_num_threads ();
-#else
-    int thread_id = 0;
-    int num_thread = 1;
+#if 0
+    //#ifdef USE_VTUNE
+    __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+    __itt_resume(); // start VTune, again use 2 underscores
 #endif
 
     /* Quick return. */
@@ -316,15 +337,12 @@ void pzgstrs2_omp
     /* Initialization. */
     iam = grid->iam;
     pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
-    int k_row_cycle = k / grid->nprow;  /* for which cycle k exist (to assign rowwise thread blocking) */
-    int gb_col_cycle;  /* cycle through block columns  */
+    //int k_row_cycle = k / grid->nprow;  /* for which cycle k exist (to assign rowwise thread blocking) */
+    //int gb_col_cycle;  /* cycle through block columns  */
     klst = FstBlockC (k + 1);
     knsupc = SuperSize (k);
     usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
     uval = Llu->Unzval_br_ptr[lk];
-    nb = usub[0];
-    iukp = BR_HEADER;
-    rukp = 0;
     if (iam == pkk) {
         lk = LBj (k, grid);
         nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
@@ -334,28 +352,45 @@ void pzgstrs2_omp
         lusup = Llu->Lval_buf_2[k0 % (1 + stat->num_look_aheads)];
     }
 
-    /* Loop through all the row blocks. */
-    for (b = 0; b < nb; ++b)  {
-        /* assuming column cyclic distribution of data among threads */
-        gb = usub[iukp];
-        gb_col_cycle = gb / grid->npcol;
-        nsupc = SuperSize (gb);
-        iukp += UB_DESCRIPTOR;
+    /////////////////////new-test//////////////////////////
+    /* !! Taken from Carl/SuperLU_DIST_5.1.0/EXAMPLE/pdgstrf2_v3.c !! */
+
+    /* Master thread: set up pointers to each block in the row */
+    nb = usub[0];
+    iukp = BR_HEADER;
+    rukp = 0;
+    
+    int* blocks_index_pointers = SUPERLU_MALLOC (3 * nb * sizeof(int));
+    int* blocks_value_pointers = blocks_index_pointers + nb;
+    int* nsupc_temp = blocks_value_pointers + nb;
+    for (b = 0; b < nb; b++) { /* set up pointers to each block */
+	blocks_index_pointers[b] = iukp + UB_DESCRIPTOR;
+	blocks_value_pointers[b] = rukp;
+	gb = usub[iukp];
+	rukp += usub[iukp+1];
+	nsupc = SuperSize( gb );
+	nsupc_temp[b] = nsupc;
+	iukp += (UB_DESCRIPTOR + nsupc);  /* move to the next block */
+    }
+
+    // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c
+    // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for
+#pragma omp parallel for schedule(static) default(shared) \
+    private(b,j,iukp,rukp,segsize)
+    /* Loop through all the blocks in the row. */
+    for (b = 0; b < nb; ++b) {
+	iukp = blocks_index_pointers[b];
+	rukp = blocks_value_pointers[b];
 
         /* Loop through all the segments in the block. */
-        for (j = 0; j < nsupc; ++j) {
-#ifdef PI_DEBUG
-            printf("segsize %d klst %d usub[%d] : %d",segsize,klst ,iukp,usub[iukp]);
-#endif 
+        for (j = 0; j < nsupc_temp[b]; j++) {
             segsize = klst - usub[iukp++];
-            if (segsize) {    /* Nonzero segment. */
-                luptr = (knsupc - segsize) * (nsupr + 1);
+	    if (segsize) {
+#pragma omp task default(shared) firstprivate(segsize,rukp) if (segsize > 30)
+		{ /* Nonzero segment. */
+		    int_t luptr = (knsupc - segsize) * (nsupr + 1);
+		    //printf("[2] segsize %d, nsupr %d\n", segsize, nsupr);
 
-		/* if gb belongs to present thread then do the factorize */
-                if ((gb_col_cycle + k_row_cycle + 1) % num_thread == thread_id) {
-#ifdef PI_DEBUG
-                    printf ("dtrsv param 4 %d param 6 %d\n", segsize, nsupr);
-#endif
 #if defined (USE_VENDOR_BLAS)
                     ztrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
                             &uval[rukp], &incx, 1, 1, 1);
@@ -363,14 +398,22 @@ void pzgstrs2_omp
                     ztrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
                             &uval[rukp], &incx);
 #endif
-                }
+		} /* end task */
+		rukp += segsize;
+		stat->ops[FACT] += segsize * (segsize + 1);
+	    } /* end if segsize > 0 */
+	} /* end for j in parallel ... */
+/* #pragma omp taskwait */
+    }  /* end for b ... */
 
-                if (thread_id == 0)
-                    stat->ops[FACT] += segsize * (segsize + 1); // master thread updated the stats
-                rukp += segsize;
-            }
-        }
-    }                           /* for b ... */
+    /* Deallocate memory */
+    SUPERLU_FREE(blocks_index_pointers);
+
+#if 0
+    //#ifdef USE_VTUNE
+    __itt_pause(); // stop VTune
+    __SSC_MARK(0x222); // stop SDE tracing
+#endif
 
 } /* PZGSTRS2_omp */
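
The rewrite of pzgstrs2_omp above splits the work into two phases: a single thread first walks usub[] to record each block's index and value offsets, then all blocks are solved under a static parallel-for, with segments longer than 30 entries additionally spawned as OpenMP tasks. A compact sketch of that shape; do_trsv() stands in for the ztrsv_ call and the descriptor arrays are illustrative:

    /* Phase 2 of the pattern above: blocks in parallel, long segments as
     * tasks.  first_seg[b]/nseg[b] describe block b; seglen[s]/segoff[s]
     * describe segment s.  Without -fopenmp the pragmas are simply ignored. */
    void solve_blocks(int nb, const int *first_seg, const int *nseg,
                      const int *seglen, const int *segoff, double *val,
                      void (*do_trsv)(double *, int))
    {
    #ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(shared)
    #endif
        for (int b = 0; b < nb; ++b) {
            for (int j = 0; j < nseg[b]; ++j) {
                int s = first_seg[b] + j;
                int segsize = seglen[s];
                if (segsize > 0) {
    #ifdef _OPENMP
    #pragma omp task default(shared) firstprivate(s, segsize) if (segsize > 30)
    #endif
                    do_trsv(&val[segoff[s]], segsize);  /* triangular solve */
                }
            }
        }
    }
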
 
diff --git a/SRC/sp_colorder.c b/SRC/sp_colorder.c
index 27cbf93..94db174 100644
--- a/SRC/sp_colorder.c
+++ b/SRC/sp_colorder.c
@@ -125,10 +125,9 @@ sp_colorder(superlu_dist_options_t *options,  SuperMatrix *A, int_t *perm_c,
     }
 	
     if ( options->Fact == DOFACT 
-	 || options->Fact == SamePattern )
+	 || options->Fact == SamePattern ) {
 	/* In this case, perm_r[] may be changed, etree(Pr*A + (Pr*A)')
 	   may be changed, so need to recompute etree.   */
-    { 
 	/* Factor A "from scratch" -- we also compute the etree, and
 	 * make perm_c consistent with the postorder of the etree.
 	 */
diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c
index 24386cb..08d1e8f 100644
--- a/SRC/sp_ienv.c
+++ b/SRC/sp_ienv.c
@@ -103,7 +103,10 @@ sp_ienv_dist(int_t ispec)
             return 128;
 
 #endif
-        case 6: return (5);
+        case 6: 
+            ttemp = getenv("FILL");
+            if ( ttemp ) return(atoi(ttemp));
+            else return (5);
         case 7:
 	    ttemp = getenv ("N_GEMM");
 	    if (ttemp) return atoi (ttemp);
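
Both case 6 (FILL) and case 7 (N_GEMM) above now read an environment variable and fall back to a compiled-in default, so these tuning knobs can be changed per run without rebuilding. The same pattern in isolation, with a generic helper name:

    #include <stdlib.h>

    /* Return atoi(getenv(name)) if the variable is set, else the default,
     * mirroring the FILL / N_GEMM handling in sp_ienv_dist() above. */
    static int env_tuning_param(const char *name, int dflt)
    {
        const char *s = getenv(name);
        return s ? atoi(s) : dflt;
    }

    /* e.g.:  int fill_ratio = env_tuning_param("FILL", 5);  */

With this in place, exporting FILL=8 in the job environment raises the fill-ratio estimate for that run only.
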
diff --git a/SRC/static_schedule.c b/SRC/static_schedule.c
index b653047..bc1933b 100644
--- a/SRC/static_schedule.c
+++ b/SRC/static_schedule.c
@@ -45,6 +45,14 @@ static_schedule(superlu_dist_options_t * options, int m, int n,
 		LUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat,
 		int_t *perm_c_supno, int_t *iperm_c_supno, int *info)
 {
+/* 
+ * Arguments
+ *
+ * perm_c_supno (output) 
+ *      perm_c_supno[k] = j means at the k-th step of elimination, the j-th
+ *      panel is chosen.
+ * 
+ */
     int_t *xsup;
     int_t  i, ib, jb, lb, nlb, il, iu;
     int_t Pc, Pr;
@@ -961,6 +969,8 @@ static_schedule(superlu_dist_options_t * options, int m, int n,
 
 #if ( DEBUGlevel >= 1 )
     print_memorylog(stat, "after static schedule");
+    check_perm_dist("perm_c_supno", nsupers, perm_c_supno);
+    check_perm_dist("iperm_c_supno", nsupers, iperm_c_supno);
 #endif
 
     return 0;
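
Under DEBUGlevel>=1 the schedule is now sanity-checked with check_perm_dist(); its body is not part of this hunk, but a check of this kind amounts to verifying that the array is a permutation of 0..nsupers-1. A hypothetical stand-in, using plain int for brevity:

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical sketch (not the library's check_perm_dist): verify that
     * perm[] hits every value in 0..n-1 exactly once. */
    static int check_perm_sketch(const char *what, int n, const int *perm)
    {
        char *seen = calloc(n, 1);
        if (!seen) return -1;
        for (int i = 0; i < n; ++i) {
            if (perm[i] < 0 || perm[i] >= n || seen[perm[i]]) {
                fprintf(stderr, "%s: bad entry perm[%d] = %d\n",
                        what, i, perm[i]);
                free(seen);
                return -1;
            }
            seen[perm[i]] = 1;
        }
        free(seen);
        return 0;
    }
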
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 007fbe3..27b3487 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -161,7 +161,7 @@ typedef struct {
                              indices of A are translated into the relative
                              positions in the gathered x-vector.
                              This is re-used in repeated calls to pdgsmv() */
-    /*int_t *xrow_to_proc; Xiaoye: can be removed */
+    int_t *xrow_to_proc; /* used by PDSLin */
 } SOLVEstruct_t;
 
 
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index 27c1bdf..b52b537 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -12,7 +12,7 @@ at the top-level directory.
  * \brief Definitions which are precision-neutral
  *
  * <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * November 1, 2007
  *
@@ -43,6 +43,12 @@ at the top-level directory.
 #include <limits.h>
 #include <string.h>
 
+/* Following is for vtune */
+#if 0
+#include <ittnotify.h>
+#define USE_VTUNE
+#endif
+
 /*************************************************************************
  * Constants
  **************************************************************************/
@@ -57,9 +63,11 @@ at the top-level directory.
  * Versions 4.x and earlier do not include a #define'd version numbers.
  */
 #define SUPERLU_DIST_MAJOR_VERSION     5
-#define SUPERLU_DIST_MINOR_VERSION     1
-#define SUPERLU_DIST_PATCH_VERSION     3
+#define SUPERLU_DIST_MINOR_VERSION     2
+#define SUPERLU_DIST_PATCH_VERSION     2
+#define SUPERLU_DIST_RELEASE_DATE      "October 24, 2017"
 
+#include "superlu_dist_config.h"
 /* Define my integer size int_t */
 #ifdef _CRAY
   typedef short int_t;
@@ -703,6 +711,7 @@ extern void  PStatFree(SuperLUStat_t *);
 extern void  PStatPrint(superlu_dist_options_t *, SuperLUStat_t *, gridinfo_t *);
 extern void  log_memory(long long, SuperLUStat_t *);
 extern void  print_memorylog(SuperLUStat_t *, char *);
+extern int   superlu_dist_GetVersionNumber(int *, int *, int *);
 
 /* Prototypes for parallel symbolic factorization */
 extern float symbfact_dist
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
new file mode 100644
index 0000000..7cda561
--- /dev/null
+++ b/SRC/superlu_dist_config.h
@@ -0,0 +1,4 @@
+/* #define XSDK_INDEX_SIZE 64 */
+#if (XSDK_INDEX_SIZE == 64)
+#define _LONGINT 1
+#endif
diff --git a/SRC/superlu_dist_config.h.in b/SRC/superlu_dist_config.h.in
new file mode 100644
index 0000000..3fa100f
--- /dev/null
+++ b/SRC/superlu_dist_config.h.in
@@ -0,0 +1,9 @@
+/* superlu_dist_config.h.in */
+
+/* enable 64bit index mode */
+#cmakedefine XSDK_INDEX_SIZE @XSDK_INDEX_SIZE@
+
+#if (XSDK_INDEX_SIZE == 64)
+#define _LONGINT 1
+#endif
+
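
With this template, a CMake configure that sets XSDK_INDEX_SIZE to 64 (the CMakeLists.txt changes in this release add the corresponding option) generates a header that turns on _LONGINT, while the default build leaves both macros undefined, as the committed superlu_dist_config.h above shows. The generated file for a 64-bit index build would look like the following (illustrative; the exact configure invocation depends on the local CMake setup):

    /* superlu_dist_config.h -- as generated for a 64-bit index build */

    /* enable 64bit index mode */
    #define XSDK_INDEX_SIZE 64

    #if (XSDK_INDEX_SIZE == 64)
    #define _LONGINT 1
    #endif
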
diff --git a/SRC/superlu_dist_version.c b/SRC/superlu_dist_version.c
new file mode 100644
index 0000000..c6c8759
--- /dev/null
+++ b/SRC/superlu_dist_version.c
@@ -0,0 +1,30 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/** @file superlu_dist_version.c
+ * \brief Gets the SuperLU_DIST version information from the library.
+ *
+ * -- Distributed SuperLU routine (version 5.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley, 
+ * October 13, 2017
+ *
+ */
+
+#include "superlu_defs.h"
+
+int superlu_dist_GetVersionNumber(int *major, int *minor, int *bugfix)
+{
+  if (major) *major = SUPERLU_DIST_MAJOR_VERSION;
+  if (minor) *minor = SUPERLU_DIST_MINOR_VERSION;
+  if (bugfix) *bugfix = SUPERLU_DIST_PATCH_VERSION;
+  return 0;
+}
+
+
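
A minimal caller for the new query routine; each argument is checked against NULL inside, so callers may pass only the fields they care about. Linked against this release it reports 5.2.2:

    #include <stdio.h>
    #include "superlu_defs.h"

    int main(void)
    {
        int major, minor, patch;
        superlu_dist_GetVersionNumber(&major, &minor, &patch);
        printf("SuperLU_DIST %d.%d.%d\n", major, minor, patch);
        return 0;
    }
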
diff --git a/SRC/superlu_enum_consts.h b/SRC/superlu_enum_consts.h
index 07fb1a4..cd592a5 100644
--- a/SRC/superlu_enum_consts.h
+++ b/SRC/superlu_enum_consts.h
@@ -67,6 +67,9 @@ typedef enum {
     DIST,    /* distribute matrix. */
     FACT,    /* perform LU factorization */
     COMM,    /* communication for factorization */
+    COMM_DIAG, /* Bcast diagonal block to process column */
+    COMM_RIGHT, /* communicate L panel */
+    COMM_DOWN, /* communicate U panel */
     SOL_COMM,/* communication for solve */
     RCOND,   /* estimate reciprocal condition number */
     SOLVE,   /* forward and back solves */
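
The three new counters split the existing COMM bucket by direction: COMM_DIAG for the diagonal-block broadcast down a process column, COMM_RIGHT for L-panel traffic along a process row, and COMM_DOWN for U-panel traffic down a process column. Every timed MPI call in the hunks above charges both the total and exactly one directional bucket; a self-contained sketch of that accounting, with utime[] standing in for stat->utime[] and local enum names that only mirror the library's:

    #include <mpi.h>

    enum { T_COMM, T_COMM_DIAG, T_COMM_RIGHT, T_COMM_DOWN, T_NPHASES };

    /* Time one non-blocking send and charge it to the aggregate bucket plus
     * the directional bucket chosen by the caller (DIAG/RIGHT/DOWN). */
    static void timed_isend(double utime[T_NPHASES], int bucket,
                            const int *buf, int n, int dest, int tag,
                            MPI_Comm comm, MPI_Request *req)
    {
        double t1 = MPI_Wtime();
        MPI_Isend((void *) buf, n, MPI_INT, dest, tag, comm, req);
        double t2 = MPI_Wtime() - t1;
        utime[T_COMM]  += t2;   /* total communication time */
        utime[bucket]  += t2;   /* e.g. T_COMM_RIGHT for an L-panel send */
    }
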
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index dc918b2..10999c4 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -161,7 +161,7 @@ typedef struct {
                              indices of A are translated into the relative
                              positions in the gathered x-vector.
                              This is re-used in repeated calls to pzgsmv() */
-    /*int_t *xrow_to_proc; Xiaoye: can be removed */
+    int_t *xrow_to_proc; /* used by PDSLin */
 } SOLVEstruct_t;
 
 
diff --git a/SRC/util.c b/SRC/util.c
index 2ae3ccd..75911a4 100644
--- a/SRC/util.c
+++ b/SRC/util.c
@@ -323,7 +323,7 @@ void set_default_options_dist(superlu_dist_options_t *options)
     options->ParSymbFact       = NO;
     options->ColPerm           = METIS_AT_PLUS_A;
     options->RowPerm           = LargeDiag;
-    options->ReplaceTinyPivot  = YES;
+    options->ReplaceTinyPivot  = NO;
     options->IterRefine        = SLU_DOUBLE;
     options->Trans             = NOTRANS;
     options->SolveInitialized  = NO;
@@ -364,9 +364,10 @@ void print_sp_ienv_dist(superlu_dist_options_t *options)
 
     printf("**************************************************\n");
     printf(".. blocking parameters from sp_ienv():\n");
-    printf("**    relaxation           : " IFMT "\n", sp_ienv_dist(2));
-    printf("**    max supernode        : " IFMT "\n", sp_ienv_dist(3));
-    printf("**    estimated fill ratio : " IFMT "\n", sp_ienv_dist(6));
+    printf("**    relaxation                 : " IFMT "\n", sp_ienv_dist(2));
+    printf("**    max supernode              : " IFMT "\n", sp_ienv_dist(3));
+    printf("**    estimated fill ratio       : " IFMT "\n", sp_ienv_dist(6));
+    printf("**    min GEMM dimension for GPU : " IFMT "\n", sp_ienv_dist(7));
     printf("**************************************************\n");
 }
 
@@ -882,23 +883,23 @@ void isort(int_t N, int_t *ARRAY1, int_t *ARRAY2)
   int_t TEMP;
   IGAP = N / 2;
   while (IGAP > 0) {
-    for (I = IGAP; I < N; I++) {
-    J = I - IGAP;
-    while (J >= 0) {
-      if (ARRAY1[J] > ARRAY1[J + IGAP]) {
-        TEMP = ARRAY1[J];
-        ARRAY1[J] = ARRAY1[J + IGAP];
-        ARRAY1[J + IGAP] = TEMP;
-        TEMP = ARRAY2[J];
-        ARRAY2[J] = ARRAY2[J + IGAP];
-        ARRAY2[J + IGAP] = TEMP;
-        J = J - IGAP;
-      } else {
-        break;
+      for (I = IGAP; I < N; I++) {
+	  J = I - IGAP;
+	  while (J >= 0) {
+	      if (ARRAY1[J] > ARRAY1[J + IGAP]) {
+		  TEMP = ARRAY1[J];
+		  ARRAY1[J] = ARRAY1[J + IGAP];
+		  ARRAY1[J + IGAP] = TEMP;
+		  TEMP = ARRAY2[J];
+		  ARRAY2[J] = ARRAY2[J + IGAP];
+		  ARRAY2[J + IGAP] = TEMP;
+		  J = J - IGAP;
+	      } else {
+		  break;
+	      }
+	  }
       }
-    }
-  }
-    IGAP = IGAP / 2;
+      IGAP = IGAP / 2;
   }
 }
 
@@ -908,40 +909,36 @@ void isort1(int_t N, int_t *ARRAY)
 /*
  * Purpose
  * =======
- * Use quick sort algorithm to sort ARRAY1 and ARRAY2 in the increasing
- * order of ARRAY1.
+ * Use quick sort algorithm to sort ARRAY in increasing order.
  *
  * Arguments
  * =========
  * N       (input) INTEGER
  *          On entry, specifies the size of the arrays.
  *
- * ARRAY1  (input/output) DOUBLE PRECISION ARRAY of LENGTH N
+ * ARRAY   (input/output) DOUBLE PRECISION ARRAY of LENGTH N
  *          On entry, contains the array to be sorted.
  *          On exit, contains the sorted array.
  *
- * ARRAY2  (input/output) DOUBLE PRECISION ARRAY of LENGTH N
- *          On entry, contains the array to be sorted.
- *          On exit, contains the sorted array.
  */
   int_t IGAP, I, J;
   int_t TEMP;
   IGAP = N / 2;
   while (IGAP > 0) {
-  for (I = IGAP; I < N; I++) {
-    J = I - IGAP;
-    while (J >= 0) {
-      if (ARRAY[J] > ARRAY[J + IGAP]) {
-        TEMP = ARRAY[J];
-        ARRAY[J] = ARRAY[J + IGAP];
-        ARRAY[J + IGAP] = TEMP;
-        J = J - IGAP;
-      } else {
-        break;
+      for (I = IGAP; I < N; I++) {
+	  J = I - IGAP;
+	  while (J >= 0) {
+	      if (ARRAY[J] > ARRAY[J + IGAP]) {
+		  TEMP = ARRAY[J];
+		  ARRAY[J] = ARRAY[J + IGAP];
+		  ARRAY[J + IGAP] = TEMP;
+		  J = J - IGAP;
+	      } else {
+		  break;
+	      }
+	  }
       }
-    }
-  }
-    IGAP = IGAP / 2;
+      IGAP = IGAP / 2;
   }
 }
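
Despite the "quick sort" wording in their headers, isort() and isort1() above are gap-based insertion sorts (Shellsort with halving gaps); the re-indentation here does not change the algorithm. The same procedure rendered standalone for a key/value pair of arrays, using plain int in place of int_t:

    /* Shellsort with halving gaps: sort key[] ascending and apply the same
     * swaps to val[], mirroring the structure of isort() above. */
    static void shell_sort_pair(int n, int *key, int *val)
    {
        for (int gap = n / 2; gap > 0; gap /= 2) {
            for (int i = gap; i < n; ++i) {
                for (int j = i - gap;
                     j >= 0 && key[j] > key[j + gap]; j -= gap) {
                    int t;
                    t = key[j]; key[j] = key[j + gap]; key[j + gap] = t;
                    t = val[j]; val[j] = val[j + gap]; val[j + gap] = t;
                }
            }
        }
    }
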
 
@@ -1060,7 +1057,7 @@ arrive_at_ublock (int_t j,      /* j-th block in a U panel */
                   int_t * nsupc,/* supernode size of destination block */
                   int_t iukp0,  /* input : search starting point */
                   int_t rukp0, 
-		  int_t * usub, /* usub scripts */
+		  int_t * usub, /* U subscripts */
                   int_t * perm_u, /* permutation vector from static schedule */
                   int_t * xsup, /* for SuperSize and LBj */
                   gridinfo_t * grid)
@@ -1069,19 +1066,26 @@ arrive_at_ublock (int_t j,      /* j-th block in a U panel */
     *iukp = iukp0; /* point to the first block in index[] */
     *rukp = rukp0; /* point to the start of nzval[] */
 
+    /* Sherry -- why does this always start from 0?  It could continue
+       from the column where the last search left off.  */
+    /* Caveat: there is a permutation perm_u involved for j.  That is why
+       the search needs to restart from 0.  */
 #ifdef ISORT
     for (jj = 0; jj < perm_u[j]; jj++) /* perm_u[j] == j */
 #else
     for (jj = 0; jj < perm_u[2 * j + 1]; jj++) /* == j */
 #endif
     {
+        /* Reinitialize the pointers to the beginning of the
+	 * k-th column/row of L/U factors.
+	 * usub[] - index array for panel U(k,:)
+	 */
         // printf("iukp %d \n",*iukp );
         *jb = usub[*iukp];      /* Global block number of block U(k,j). */
         // printf("jb %d \n",*jb );
         *nsupc = SuperSize (*jb);
         // printf("nsupc %d \n",*nsupc );
         *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
-
         *rukp += usub[*iukp - 1]; /* Jump # of nonzeros in block U(k,jj);
 				     Move to block U(k,jj+1) in nzval[] */ 
         *iukp += *nsupc;
diff --git a/SRC/zSchCompUdt-2Ddynamic.c b/SRC/zSchCompUdt-2Ddynamic.c
index 46fa613..0468f5f 100644
--- a/SRC/zSchCompUdt-2Ddynamic.c
+++ b/SRC/zSchCompUdt-2Ddynamic.c
@@ -15,29 +15,46 @@ at the top-level directory.
  *        Uses 2D partitioning for the scatter phase.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 4.1) --
+ * -- Distributed SuperLU routine (version 5.2) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 1, 2014
  *
+ * Modified: September 14, 2017
+ *   - First gather U-panel, then depending on "ldu" (excluding leading zeros), 
+ *     gather only trailing columns of the L-panel corresponding to the nonzero
+ *     of U-rows.
+ *   - Padding zeros for nice dimensions of GEMM.
+ *
  */
 
 #define SCHEDULE_STRATEGY guided 
-double tt_start;
-double tt_end;
+
+/* 
+ * Buffers:
+ *     [ lookAhead_L_buff | Remain_L_buff ] : stores the gathered L-panel
+ *                                            (A matrix in C := A*B )
+ *     bigU : stores the U-panel (B matrix in C := A*B)
+ *     bigV : stores the block GEMM result (C matrix in C := A*B)
+ */
 
 if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
     int cum_nrow = 0; /* cumulative number of nonzero rows in L(:,k) */
     int temp_nbrow;   /* nonzero rows in current block L(i,k) */
     lptr  = lptr0;
     luptr = luptr0;
-    /**
+    int Lnbrow, Rnbrow; /* number of nonzero rows in look-ahead window,
+			   and remaining part.  */
+
+    /*******************************************************************
      * Separating L blocks into the top part within look-ahead window
      * and the remaining ones.
-     */
+     *******************************************************************/
+
      int lookAheadBlk=0, RemainBlk=0;
 
      tt_start = SuperLU_timer_();
 
+     /* Sherry -- can this loop be threaded?? */
      /* Loop through all blocks in L(:,k) to set up pointers to the start 
       * of each block in the data arrays.
       *   - lookAheadFullRow[i] := number of nonzero rows from block 0 to i
@@ -46,36 +63,36 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
       *   - (ditto Remain_Info[i])
       */
      for (int i = 0; i < nlb; ++i) {
-	 ib = lsub[lptr];            /* block number of L(i,k). */
+	 ib = lsub[lptr];            /* Block number of L(i,k). */
 	 temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
         
 	 int look_up_flag = 1; /* assume ib is outside look-up window */
-	 for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers ); ++j)
-	     {
-		 if(ib == perm_c_supno[j]) {
-		     look_up_flag=0; /* flag ib is within look-up window */
-                     break; /* Sherry -- can exit the loop?? */
+	 for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers );
+	      ++j) {
+		 if ( ib == perm_c_supno[j] ) {
+		     look_up_flag = 0; /* flag ib within look-up window */
+                     break;            /* Sherry -- can exit the loop?? */
                  }
-	     }
+	 }
 	 
-	 if( look_up_flag == 0 ) { /* ib is within look up window */
+	 if ( look_up_flag == 0 ) { /* ib is within look-up window */
 	     if (lookAheadBlk==0) {
 		 lookAheadFullRow[lookAheadBlk] = temp_nbrow;
 	     } else {
-		 lookAheadFullRow[lookAheadBlk] = temp_nbrow+lookAheadFullRow[lookAheadBlk-1];   
+		 lookAheadFullRow[lookAheadBlk] = 
+		     temp_nbrow + lookAheadFullRow[lookAheadBlk-1];   
 	     }
 	     lookAheadStRow[lookAheadBlk] = cum_nrow;
 	     lookAhead_lptr[lookAheadBlk] = lptr;
 	     lookAhead_ib[lookAheadBlk] = ib; 
 	     lookAheadBlk++;
-	 } else { /* ib is not in look up window */
-
-	     if (RemainBlk==0) {
+	 } else { /* ib is not in look-up window */
+	     if ( RemainBlk==0 ) {
 		 Remain_info[RemainBlk].FullRow = temp_nbrow;
 	     } else {
-		 Remain_info[RemainBlk].FullRow = temp_nbrow+Remain_info[RemainBlk-1].FullRow;   
+		 Remain_info[RemainBlk].FullRow = 
+		     temp_nbrow + Remain_info[RemainBlk-1].FullRow;   
 	     }
-
              RemainStRow[RemainBlk] = cum_nrow;
              // Remain_lptr[RemainBlk] = lptr;
 	     Remain_info[RemainBlk].lptr = lptr;
@@ -84,139 +101,105 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
 	     RemainBlk++;
 	 }
 	 
-         cum_nrow +=temp_nbrow;
+         cum_nrow += temp_nbrow;
 	 
 	 lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
 	 lptr += temp_nbrow;     /* Move to next block */
 	 luptr += temp_nbrow;
-     }  /* for i ... all blocks in L(:,k) */
+     }  /* for i ... set up pointers for all blocks in L(:,k) */
 
      lptr = lptr0;
      luptr = luptr0;
 
-     /* leading dimension of L buffer */
-#if 0
-     int LDlookAhead_LBuff = lookAheadFullRow[lookAheadBlk-1]; /* may go negative.*/
-#else /* Piyush fix */
-     int LDlookAhead_LBuff = lookAheadBlk==0? 0 :lookAheadFullRow[lookAheadBlk-1];
-#endif
-
-     /* Loop through the look-ahead blocks to copy Lval into the buffer */
-#ifdef __OPENMP
-     /* #pragma omp parallel for -- why not?? Sherry */
-#endif
-     for (int i = 0; i < lookAheadBlk; ++i) {
-	 int StRowDest  = 0;
-	 int temp_nbrow;
-	 if (i==0) {
-	     temp_nbrow = lookAheadFullRow[0];
-	 } else {
-	     StRowDest   = lookAheadFullRow[i-1];
-	     temp_nbrow  = lookAheadFullRow[i]-lookAheadFullRow[i-1];
-	 }
-	 
-	 int StRowSource=lookAheadStRow[i];
-	 
-	 /* Now copying the matrix*/
-	 // #pragma omp parallel for (gives slow down)
-	 for (int j = 0; j < knsupc; ++j) {
-	     memcpy(&lookAhead_L_buff[StRowDest+j*LDlookAhead_LBuff],
-		    &lusup[luptr+j*nsupr+StRowSource],
-		    temp_nbrow * sizeof(doublecomplex) );
-	 }
-     }
-
-     int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
-
-    /* Loop through the remaining blocks to copy Lval into the buffer */
-#ifdef _OPENMP
-#pragma omp parallel for 
-#endif
-     for (int i = 0; i < RemainBlk; ++i) {
-	 int StRowDest  = 0;
-	 int temp_nbrow;
-         if (i==0)  {
-	     temp_nbrow = Remain_info[0].FullRow;
-	 } else  {
-	     StRowDest   = Remain_info[i-1].FullRow;
-	     temp_nbrow  = Remain_info[i].FullRow-Remain_info[i-1].FullRow;
-	 }
-
-	 int StRowSource=RemainStRow[i];
-
-	 /* Now copying the matrix*/
-	 // #pragma omp parallel for (gives slow down)
-	 for (int j = 0; j < knsupc; ++j) {
-	     // printf("StRowDest %d LDRemain_LBuff %d StRowSource %d \n", StRowDest ,LDRemain_LBuff ,StRowSource );
-	     memcpy(&Remain_L_buff[StRowDest+j*LDRemain_LBuff],
-		    &lusup[luptr+j*nsupr+StRowSource],
-                    temp_nbrow * sizeof(doublecomplex) );
-	 }
-     } /* parallel for i ... */
-
-#if ( PRNTlevel>=1 )
-     tt_end = SuperLU_timer_();
-     GatherLTimer += tt_end - tt_start;
-#endif
-#if 0
-     LookAheadRowSepMOP  +=  2*knsupc*(lookAheadFullRow[lookAheadBlk-1]+Remain_info[RemainBlk-1].FullRow );
-#else
-     int_t lnbrow, rnbrow; /* number of nonzero rows in look-ahead window
-                              or remaining part.  */
-     lnbrow = lookAheadBlk==0 ? 0  : lookAheadFullRow[lookAheadBlk-1];
-     rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
-     nbrow = lnbrow + rnbrow; /* total number of rows in L */
+     /* leading dimension of L look-ahead buffer, same as Lnbrow */
+     //int LDlookAhead_LBuff = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
+     Lnbrow = lookAheadBlk==0 ? 0 : lookAheadFullRow[lookAheadBlk-1];
+     /* leading dimension of L remaining buffer, same as Rnbrow */
+     //int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+     Rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+     /* assert( cum_nrow == (LDlookAhead_LBuff + LDRemain_LBuff) );*/
+     /* Piyush fix */
+     //int LDlookAhead_LBuff = lookAheadBlk==0? 0 : lookAheadFullRow[lookAheadBlk-1];
+
+     nbrow = Lnbrow + Rnbrow; /* total number of rows in L */
      LookAheadRowSepMOP += 2*knsupc*(nbrow);
-#endif     
-     
-     /**********************
-      * Gather U blocks *
-      **********************/
 
+     /***********************************************
+      * Gather U blocks (AFTER LOOK-AHEAD WINDOW)   *
+      ***********************************************/
      tt_start = SuperLU_timer_();
-#if 0     
-     nbrow = lookAheadFullRow[lookAheadBlk-1]+Remain_info[RemainBlk-1].FullRow;
-#endif
 
      if ( nbrow > 0 ) { /* L(:,k) is not empty */
 	 /*
 	  * Counting U blocks
 	  */
-	 ncols = 0; /* total number of nonzero columns in U(k,:) */
-	 ldu   = 0;
-	 full  = 1; /* flag the U block is indeed 'full', containing segments
-	               of same length. No need padding 0 */
-	 int temp_ncols=0;
+	 ldu = 0; /* Calculate ldu for U(k,:) after look-ahead window. */
+	 ncols = 0; /* Total number of nonzero columns in U(k,:) */
+	 int temp_ncols = 0;
 
-         /* Loop through all blocks in U(k,:) to set up pointers to the start
+#if 0
+	 /* jj0 contains the look-ahead window that was updated in 
+	    dlook_ahead_update.c. Now the search can continue from that point,
+	    not to start from block 0. */
+	 iukp = iukp0; /* point to the first block in index[] */
+	 rukp = rukp0; /* point to the start of nzval[] */
+#else
+	 /* Save pointers at location right after look-ahead window
+	    for later restart. */
+	 iukp0 = iukp;
+	 rukp0 = rukp;
+#endif
+
+	 /* if ( iam==0 ) printf("--- k0 %d, k %d, jj0 %d, nub %d\n", k0, k, jj0, nub);*/
+	     
+         /* 
+	  * Loop through all blocks in U(k,:) to set up pointers to the start
           * of each block in the data arrays, store them in Ublock_info[j]
           * for block U(k,j).
   	  */
-	 for (j = jj0; j < nub; ++j) { /* jj0 was set to 0 */
+	 for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
 	     temp_ncols = 0;
+#if 0
+	     /* Sherry - can remove following call, since perm_u == Identity  */
 	     arrive_at_ublock(
 			      j, &iukp, &rukp, &jb, &ljb, &nsupc,
 			      iukp0, rukp0, usub, perm_u, xsup, grid
 			      );
+#else
+	     jb = usub[iukp];
+	     /* ljb = LBj (jb, grid);   Local block number of U(k,j). */
+	     nsupc = SuperSize(jb);
+	     iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
 	     Ublock_info[j].iukp = iukp;
 	     Ublock_info[j].rukp = rukp;
 	     Ublock_info[j].jb = jb;
-	     
+
+	     /* if ( iam==0 )
+		 printf("j %d: Ublock_info[j].iukp %d, Ublock_info[j].rukp %d,"
+			"Ublock_info[j].jb %d, nsupc %d\n", 
+			j, Ublock_info[j].iukp, Ublock_info[j].rukp,
+			Ublock_info[j].jb, nsupc); */
+
 	     /* Prepare to call GEMM. */
 	     jj = iukp;
-	     
 	     for (; jj < iukp+nsupc; ++jj) {
 		 segsize = klst - usub[jj];
 		 if ( segsize ) {
                     ++temp_ncols;
-                    if ( segsize != ldu ) full = 0; /* need padding 0 */
                     if ( segsize > ldu ) ldu = segsize;
 		 }
 	     }
 
 	     Ublock_info[j].full_u_cols = temp_ncols;
 	     ncols += temp_ncols;
-	 }
+#if 1	     
+	     /* Skip the nonzeros of block U(k,j) in nzval[];
+		move to block U(k,j+1) in the nzval[] array.  */
+	     rukp += usub[iukp - 1];
+	     iukp += nsupc;
+#endif
+         } /* end for j ... compute ldu & ncols */
 
 	 /* Now doing prefix sum on full_u_cols.
 	  * After this, full_u_cols is the number of nonzero columns
@@ -226,101 +209,239 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
 	     Ublock_info[j].full_u_cols += Ublock_info[j-1].full_u_cols;
 	 }
             
+	 /* Padding zeros to make {m,n,k} multiple of vector length. */
+	 jj = 8; //n;
+	 if (gemm_padding > 0 && Rnbrow > jj && ncols > jj && ldu > jj) {
+	     gemm_m_pad = Rnbrow + (Rnbrow % GEMM_PADLEN);
+	     gemm_n_pad = ncols + (ncols % GEMM_PADLEN);
+	     //gemm_n_pad = ncols;
+	     //gemm_k_pad = ldu + (ldu % GEMM_PADLEN);
+	     gemm_k_pad = ldu;
+	     
+	     for (i = Rnbrow; i < gemm_m_pad; ++i)  // padding A matrix
+		 for (j = 0; j < gemm_k_pad; ++j)
+		     Remain_L_buff[i + j*gemm_m_pad] = zero;
+	     for (i = 0; i < Rnbrow; ++i)         
+		 for (j = ldu; j < gemm_k_pad; ++j)
+		     Remain_L_buff[i + j*gemm_m_pad] = zero;
+	     for (i = ldu; i < gemm_k_pad; ++i)     // padding B matrix
+		 for (j = 0; j < gemm_n_pad; ++j)
+		     bigU[i + j*gemm_k_pad] = zero;
+	     for (i = 0; i < ldu; ++i)
+		 for (j = ncols; j < gemm_n_pad; ++j)
+		     bigU[i + j*gemm_k_pad] = zero;
+	 } else {
+	     gemm_m_pad = Rnbrow;
+	     gemm_n_pad = ncols;
+	     gemm_k_pad = ldu;
+	 }
+     
 	 tempu = bigU; /* buffer the entire row block U(k,:) */
 
          /* Gather U(k,:) into buffer bigU[] to prepare for GEMM */
-#ifdef _OPENMP        
-#pragma omp parallel for private(j,iukp,rukp,tempu, jb, nsupc,ljb,segsize,\
-	lead_zero, jj, i) \
-        default (shared) schedule(SCHEDULE_STRATEGY)
+#ifdef _OPENMP
+#pragma omp parallel for firstprivate(iukp, rukp) \
+    private(j,tempu, jb, nsupc,ljb,segsize, lead_zero, jj, i) \
+    default (shared) schedule(SCHEDULE_STRATEGY)
 #endif
-        for (j = jj0; j < nub; ++j) { /* jj0 was set to 0 */
+        for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
 
-            if(j==jj0) tempu = bigU;
-            else tempu = bigU + ldu*Ublock_info[j-1].full_u_cols;
+            if (j==jj0) tempu = bigU;
+            //else tempu = bigU + ldu * Ublock_info[j-1].full_u_cols;
+            else tempu = bigU + gemm_k_pad * Ublock_info[j-1].full_u_cols;
 
-            /* == processing each of the remaining columns == */
+            /* == processing each of the remaining columns in parallel == */
+#if 0
+	    /* Sherry - can remove following call, since perm_u == Identity  */
             arrive_at_ublock(j, &iukp, &rukp, &jb, &ljb, &nsupc,
 			     iukp0, rukp0, usub,perm_u, xsup, grid);
-
-            /* Copy from U(k,:) to tempu[], padding zeros.  */            
+#else
+	    iukp = Ublock_info[j].iukp;
+	    rukp = Ublock_info[j].rukp;
+	    jb = Ublock_info[j].jb;
+	    nsupc = SuperSize (jb );
+#endif
+            /* Copy from U(k,j) to tempu[], padding zeros.  */            
             for (jj = iukp; jj < iukp+nsupc; ++jj) {
                 segsize = klst - usub[jj];
                 if ( segsize ) {
                     lead_zero = ldu - segsize;
                     for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
-                    tempu += lead_zero;
-                    for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i];
+		    //tempu += lead_zero;
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+		    for (i=0; i<segsize; ++i) tempu[i+lead_zero] = uval[rukp+i];
+
                     rukp += segsize;
-                    tempu += segsize;
+#if 0
+		    tempu += segsize;
+#else
+                    tempu += gemm_k_pad;
+#endif
                 }
-            }
+	    }
+#if 0
+	    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+#endif
+        }   /* parallel for j = jj0 .. nub */
+
+#if 0
+	if (ldu==0) printf("[%d] .. k0 %d, before updating: ldu %d, Lnbrow %d, Rnbrow %d, ncols %d\n",iam,k0,ldu,Lnbrow,Rnbrow, ncols);
+	fflush(stdout);
+#endif
+    }  /* end if (nbrow>0), end gather U blocks */
+
+    GatherUTimer += SuperLU_timer_() - tt_start;
+    GatherMOP += 2*ldu*ncols;
+    int jj_cpu = nub;       /* limit between CPU and GPU */
+    int thread_id;
+    /*tempv = bigV;*/
 
-            rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
 
-        }   /* parallel for j:jjj_st..jjj */
+    /**********************
+     * Gather L blocks    *
+     **********************/
+     tt_start = SuperLU_timer_();
 
-        tempu = bigU;  /* setting to the start of padded U(k,:) */
+     /* Loop through the look-ahead blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(j,jj,tempu,tempv) default (shared)
+#endif
+     for (int i = 0; i < lookAheadBlk; ++i) {
+	 int StRowDest, temp_nbrow;
+	 if ( i==0 ) {
+	     StRowDest = 0;
+	     temp_nbrow = lookAheadFullRow[0];
+	 } else {
+	     StRowDest   = lookAheadFullRow[i-1];
+	     temp_nbrow  = lookAheadFullRow[i]-lookAheadFullRow[i-1];
+	 }
+	 
+	 int StRowSource = lookAheadStRow[i];
+	 
+	 /* Now copying one block into L lookahead buffer */
+	 /* #pragma omp parallel for (gives slow down) */
+	 // for (int j = 0; j < knsupc; ++j) { 
+	 for (j = knsupc-ldu; j < knsupc; ++j) { /* skip leading columns
+						    corresponding to zero U rows */
+#if 1
+	     /* Better let compiler generate memcpy or vectorized code. */
+	     //tempu = &lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff];
+	     //tempu = &lookAhead_L_buff[StRowDest + j * Lnbrow];
+	     tempu = &lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow];
+	     tempv = &lusup[luptr+j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+	     for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+	     //memcpy(&lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff],
+	     memcpy(&lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow],
+		    &lusup[luptr+j*nsupr + StRowSource],
+		    temp_nbrow * sizeof(doublecomplex) );
+#endif
+	 } /* end for j ... */
+     } /* parallel for i ... gather Lval blocks from lookahead window */
+
+     /* Loop through the remaining blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(i,j,jj,tempu,tempv) default (shared) \
+    schedule(SCHEDULE_STRATEGY)
+#endif
+     for (int i = 0; i < RemainBlk; ++i) {
+         int StRowDest, temp_nbrow;
+         if ( i==0 )  {
+	     StRowDest  = 0;
+	     temp_nbrow = Remain_info[0].FullRow;
+	 } else  {
+	     StRowDest   = Remain_info[i-1].FullRow;
+	     temp_nbrow  = Remain_info[i].FullRow - Remain_info[i-1].FullRow;
+	 }
 
-    }  /* end if (nbrow>0) */
+	 int StRowSource = RemainStRow[i];
 
-#if ( PRNTlevel>=1 )
-    GatherUTimer += SuperLU_timer_() - tt_start;
+	 /* Now copying a block into L remaining buffer */
+	 // #pragma omp parallel for (gives slow down)
+	 // for (int j = 0; j < knsupc; ++j) {
+	 for (int j = knsupc-ldu; j < knsupc; ++j) {
+	     // printf("StRowDest %d Rnbrow %d StRowSource %d \n", StRowDest,Rnbrow ,StRowSource);
+#if 1
+	     /* Better let compiler generate memcpy or vectorized code. */
+	     //tempu = &Remain_L_buff[StRowDest + j*LDRemain_LBuff];
+	     //tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * Rnbrow];
+	     tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad];
+	     tempv = &lusup[luptr + j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
 #endif
-    GatherMOP += 2*ldu*ncols;
+	     for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+	     //memcpy(&Remain_L_buff[StRowDest + j*LDRemain_LBuff],
+	     memcpy(&Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad],
+		    &lusup[luptr+j*nsupr + StRowSource],
+                    temp_nbrow * sizeof(doublecomplex) );
+#endif
+	 } /* end for j ... */
+     } /* parallel for i ... copy Lval into the remaining buffer */
 
-    int Lnbrow   = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
-    int Rnbrow   = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
-    int jj_cpu=nub;       /*limit between CPU and GPU */
-    int thread_id;
-    tempv = bigV;
+     tt_end = SuperLU_timer_();
+     GatherLTimer += tt_end - tt_start;
 
-    /**************************************
-     * Perform GEMM followed by Scatter *
-     **************************************/
 
-    if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
-        /* Perform a large GEMM call */
-        ncols = Ublock_info[nub-1].full_u_cols;
-        schur_flop_counter += 2 * (double)Lnbrow * (double)ldu * (double)ncols;
-        stat->ops[FACT]    += 2 * (double)Lnbrow * (double)ldu * (double)ncols;
+     /*************************************************************************
+      * Perform GEMM (look-ahead L part, and remain L part) followed by Scatter
+      *************************************************************************/
+     tempu = bigU;  /* setting to the start of padded U(k,:) */
+    
+     if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
+	 /***************************************************************
+	  * Updating blocks in look-ahead window of the LU(look-ahead-rows,:)
+	  ***************************************************************/
+
+	 /* Count flops for total GEMM calls */
+	 ncols = Ublock_info[nub-1].full_u_cols;
+	 flops_t flps = 8.0 * (flops_t)Lnbrow * ldu * ncols;
+	 LookAheadScatterMOP += 3 * Lnbrow * ncols; /* scatter-add */
+	 schur_flop_counter += flps;
+	 stat->ops[FACT]    += flps;
+	 LookAheadGEMMFlOp  += flps;
 
-        /***************************************************************
-         * Updating look-ahead blocks in both L and U look-ahead windows.
-         ***************************************************************/
 #ifdef _OPENMP
-#pragma omp parallel default (shared) private(thread_id,tt_start,tt_end)
-     {
- 	thread_id = omp_get_thread_num();
+#pragma omp parallel default (shared) private(thread_id)
+	 {
+	   thread_id = omp_get_thread_num();
  
- 	/* Ideally, should organize the loop as:
-                for (j = 0; j < nub; ++j) {
-                    for (lb = 0; lb < lookAheadBlk; ++lb) {
- 	               L(lb,k) X U(k,j) -> tempv[]
-                    }
-                }
- 	   But now, we use collapsed loop to achieve more parallelism.
- 	   Total number of block updates is:
- 	      (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
- 	*/
+	   /* Ideally, should organize the loop as:
+	      for (j = 0; j < nub; ++j) {
+	          for (lb = 0; lb < lookAheadBlk; ++lb) {
+	               L(lb,k) X U(k,j) -> tempv[]
+		  }
+	      }
+	      But now, we use collapsed loop to achieve more parallelism.
+	      Total number of block updates is:
+	      (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
+	   */
+
+	   int i = sizeof(int);
+	   int* indirect_thread    = indirect + (ldt + CACHELINE/i) * thread_id;
+	   int* indirect2_thread   = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
 #pragma omp for \
-    private (j,i,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    private (nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
     schedule(dynamic)
 #else /* not use _OPENMP */
- 	thread_id = 0;
+	   thread_id = 0;
+	   int* indirect_thread    = indirect;
+	   int* indirect2_thread   = indirect2;
 #endif
- 	/* Each thread is assigned one loop index ij, responsible for 
- 	   block update L(lb,k) * U(k,j) -> tempv[]. */
-        for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
-	    if ( thread_id == 0 ) tt_start = SuperLU_timer_();
-
-            int j   = ij/lookAheadBlk + jj0; /* jj0 was set to 0 */
+	   /* Each thread is assigned one loop index ij, responsible for 
+	      block update L(lb,k) * U(k,j) -> tempv[]. */
+	   for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
+	       /* jj0 starts after look-ahead window. */
+            int j   = ij/lookAheadBlk + jj0;
             int lb  = ij%lookAheadBlk;
 
-            int* indirect_thread    = indirect + ldt*thread_id;
-            int* indirect2_thread   = indirect2 + ldt*thread_id;
-            doublecomplex* tempv1 = bigV + thread_id*ldt*ldt; 
-
             /* Getting U block U(k,j) information */
             /* unsigned long long ut_start, ut_end; */
             int_t rukp =  Ublock_info[j].rukp;
@@ -329,8 +450,8 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
             int nsupc = SuperSize(jb);
             int ljb = LBj (jb, grid);  /* destination column block */
             int st_col;
-            int ncols;
-            if ( j>jj0 ) { /* jj0 was set to 0 */
+            int ncols;  /* Local variable counts only columns in the block */
+            if ( j > jj0 ) { /* jj0 starts after look-ahead window. */
                 ncols  = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
                 st_col = Ublock_info[j-1].full_u_cols;
             } else {
@@ -345,7 +466,16 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
             lptr += LB_DESCRIPTOR;
             int cum_nrow = (lb==0 ? 0 : lookAheadFullRow[lb-1]);
 
+	    /* Block-by-block GEMM in look-ahead window */
+#if 0
+	    i = sizeof(doublecomplex);
+	    doublecomplex* tempv1 = bigV + thread_id * (ldt*ldt + CACHELINE/i);
+#else
+	    doublecomplex* tempv1 = bigV + thread_id * (ldt*ldt);
+#endif
+
 #if ( PRNTlevel>= 1)
+	    if (thread_id == 0) tt_start = SuperLU_timer_();
 	    gemm_max_m = SUPERLU_MAX(gemm_max_m, temp_nbrow);
 	    gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
 	    gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
@@ -353,14 +483,17 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
 
 #if defined (USE_VENDOR_BLAS)            
             zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
-                  &lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
-                  &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+		   //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+		   &lookAhead_L_buff[cum_nrow], &Lnbrow,
+		   &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
 #else
             zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
-                  &lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
-                  &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+		   //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+		   &lookAhead_L_buff[cum_nrow], &Lnbrow,
+		   &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
 #endif
-#if ( PRNTlevel>=1 )
+
+#if (PRNTlevel>=1 )
 	    if (thread_id == 0) {
 		tt_end = SuperLU_timer_();
 		LookAheadGEMMTimer += tt_end - tt_start;
@@ -378,6 +511,11 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
 				 grid
 			        );
             } else {
+#if 0
+		//#ifdef USE_VTUNE
+	    __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+	    __itt_resume(); // start VTune, again use 2 underscores
+#endif
                 zscatter_l (
 				 ib, ljb, 
 				 nsupc, iukp, xsup,
@@ -388,137 +526,187 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
 				 Lrowind_bc_ptr, Lnzval_bc_ptr,
 				 grid
 				);
+#if 0
+		//#ifdef USE_VTUNE
+		__itt_pause(); // stop VTune
+		__SSC_MARK(0x222); // stop SDE tracing
+#endif
             }
 
 #if ( PRNTlevel>=1 )
-	    if (thread_id == 0)
+	    if (thread_id == 0) 
 		LookAheadScatterTimer += SuperLU_timer_() - tt_start;
 #endif
-        } /* end omp for ij = ... */
+	   } /* end omp for ij = ... */
+
 #ifdef _OPENMP
-    } /* end omp parallel */
+	 } /* end omp parallel */
 #endif
-        LookAheadGEMMFlOp  += 2*(double)Lnbrow * (double)ldu * (double)ncols;
-        stat->ops[FACT]    += 2*(double)Lnbrow * (double)ldu * (double)ncols;
-        LookAheadScatterMOP += 3*Lnbrow*ncols;
-    } /* end if Lnbrow < ... */
-    
+     } /* end if Lnbrow>0 ... look-ahead GEMM and scatter */
+
     /***************************************************************
      * Updating remaining rows and columns on CPU.
      ***************************************************************/
-    Rnbrow  = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
-    ncols   = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+    ncols = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+
+    if ( Rnbrow>0 && ldu>0 ) { /* There are still blocks remaining ... */
+	double flps = 8.0 * (double)Rnbrow * ldu * ncols;
+	schur_flop_counter  += flps;
+	stat->ops[FACT]     += flps;
 
-    schur_flop_counter  += 2 * (double)Rnbrow * (double)ldu * (double)ncols;
-    stat->ops[FACT]     += 2 * (double)Rnbrow * (double)ldu * (double)ncols;
+#if ( PRNTlevel>=1 )
+	RemainGEMM_flops += flps;
+	gemm_max_m = SUPERLU_MAX(gemm_max_m, Rnbrow);
+	gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
+	gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
+	tt_start = SuperLU_timer_();
+	/* printf("[%d] .. k0 %d, before large GEMM: %d-%d-%d, RemainBlk %d\n",
+	   iam, k0,Rnbrow,ldu,ncols,RemainBlk);  fflush(stdout);
+	assert( Rnbrow*ncols < bigv_size ); */
+#endif
+	/* calling aggregated large GEMM, result stored in bigV[]. */
+#if defined (USE_VENDOR_BLAS)
+	//zgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+	zgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+	       //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+	       &Remain_L_buff[0], &gemm_m_pad,
+	       &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad, 1, 1);
+#else
+	//zgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+	zgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+	       //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+	       &Remain_L_buff[0], &gemm_m_pad,
+	       &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad);
+#endif
 
+#if ( PRNTlevel>=1 )
+	tt_end = SuperLU_timer_();
+	RemainGEMMTimer += tt_end - tt_start;
+#if ( PROFlevel>=1 )
+	//fprintf(fgemm, "%8d%8d%8d %16.8e\n", Rnbrow, ncols, ldu,
+	// (tt_end - tt_start)*1e6); // time in microsecond
+	//fflush(fgemm);
+	gemm_stats[gemm_count].m = Rnbrow;
+	gemm_stats[gemm_count].n = ncols;
+	gemm_stats[gemm_count].k = ldu;
+	gemm_stats[gemm_count++].microseconds = (tt_end - tt_start) * 1e6;
+#endif
+	tt_start = SuperLU_timer_();
+#endif
+
+#ifdef USE_VTUNE
+	__SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+	__itt_resume(); // start VTune, again use 2 underscores
+#endif
+
+	/* Scatter into destination block-by-block. */
 #ifdef _OPENMP
-#pragma omp parallel default(shared) private(thread_id,tt_start,tt_end)
-    {
-	thread_id = omp_get_thread_num();
+#pragma omp parallel default(shared) private(thread_id)
+	{
+	    thread_id = omp_get_thread_num();
  
-	/* Ideally, should organize the loop as:
+	    /* Ideally, should organize the loop as:
                for (j = 0; j < jj_cpu; ++j) {
-                   for (lb = 0; lb < RemainBlk; ++lb) {
+	           for (lb = 0; lb < RemainBlk; ++lb) {
 	               L(lb,k) X U(k,j) -> tempv[]
                    }
                }
-	   But now, we use collapsed loop to achieve more parallelism.
-	   Total number of block updates is:
-	      (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
-	*/
+	       But now, we use collapsed loop to achieve more parallelism.
+	       Total number of block updates is:
+	       (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
+	    */
+
+	    int i = sizeof(int);
+	    int* indirect_thread = indirect + (ldt + CACHELINE/i) * thread_id;
+	    int* indirect2_thread = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
 #pragma omp for \
-    private (j,i,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    private (j,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
     schedule(dynamic)
 #else /* not use _OPENMP */
-    thread_id = 0;
-#endif
-	/* Each thread is assigned one loop index ij, responsible for 
-	   block update L(lb,k) * U(k,j) -> tempv[]. */
-    for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) { /* jj_cpu := nub */
-	int j   = ij / RemainBlk + jj0; 
-	int lb  = ij % RemainBlk;
-
-	int* indirect_thread = indirect + ldt*thread_id;
-	int* indirect2_thread = indirect2 + ldt*thread_id;
-	doublecomplex* tempv1 = bigV + thread_id*ldt*ldt; 
-
-	/* Getting U block U(k,j) information */
-	/* unsigned long long ut_start, ut_end; */
-	int_t rukp =  Ublock_info[j].rukp;
-	int_t iukp =  Ublock_info[j].iukp;
-	int jb   =  Ublock_info[j].jb;
-	int nsupc = SuperSize(jb);
-	int ljb = LBj (jb, grid);
-	int st_col;
-	int ncols;
-	if ( j>jj0 ) {
-	    ncols  = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
-	    st_col = Ublock_info[j-1].full_u_cols;
-	} else {
-	    ncols  = Ublock_info[j].full_u_cols;
-	    st_col = 0;   
-	}
-
-	/* Getting L block L(i,k) information */
-	int_t lptr = Remain_info[lb].lptr;
-	int ib   = Remain_info[lb].ib;
-	int temp_nbrow = lsub[lptr+1];
-	lptr += LB_DESCRIPTOR;
-	int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
-
+	    thread_id = 0;
+	    int* indirect_thread = indirect;
+	    int* indirect2_thread = indirect2;
+#endif
+	    /* Each thread is assigned one loop index ij, responsible for 
+	       block update L(lb,k) * U(k,j) -> tempv[]. */
+	    for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) {
+		/* jj_cpu := nub, jj0 starts after look-ahead window. */
+		int j   = ij / RemainBlk + jj0; /* j-th block in U panel */
+		int lb  = ij % RemainBlk;       /* lb-th block in L panel */
+
+		/* Getting U block U(k,j) information */
+		/* unsigned long long ut_start, ut_end; */
+		int_t rukp =  Ublock_info[j].rukp;
+		int_t iukp =  Ublock_info[j].iukp;
+		int jb   =  Ublock_info[j].jb;
+		int nsupc = SuperSize(jb);
+		int ljb = LBj (jb, grid);
+		int st_col;
+		int ncols;
+		if ( j>jj0 ) {
+		    ncols = Ublock_info[j].full_u_cols - Ublock_info[j-1].full_u_cols;
+		    st_col = Ublock_info[j-1].full_u_cols;
+		} else {
+		    ncols = Ublock_info[j].full_u_cols;
+		    st_col = 0;   
+		}
+
+		/* Getting L block L(i,k) information */
+		int_t lptr = Remain_info[lb].lptr;
+		int ib   = Remain_info[lb].ib;
+		int temp_nbrow = lsub[lptr+1];
+		lptr += LB_DESCRIPTOR;
+		int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
+		
+		/* tempv1 points to block(i,j) in bigV : LDA == gemm_m_pad */
+		//double* tempv1 = bigV + (st_col * Rnbrow + cum_nrow); Sherry 
+		doublecomplex* tempv1 = bigV + (st_col * gemm_m_pad + cum_nrow); /* Sherry */
+
+		// printf("[%d] .. before scatter: ib %d, jb %d, temp_nbrow %d, Rnbrow %d\n", iam, ib, jb, temp_nbrow, Rnbrow); fflush(stdout);
+
+		/* Now scattering the block */
+
+		if ( ib < jb ) {
+		    zscatter_u (
+				ib, jb,
+				nsupc, iukp, xsup,
+				//klst, Rnbrow, /*** klst, temp_nbrow, Sherry */
+				klst, gemm_m_pad, /*** klst, temp_nbrow, Sherry */
+				lptr, temp_nbrow, /* row dimension of the block */
+				lsub, usub, tempv1,
+				Ufstnz_br_ptr, Unzval_br_ptr,
+				grid
+				);
+		} else {
+		    zscatter_l(
+			       ib, ljb,
+			       nsupc, iukp, xsup,
+			       //klst, temp_nbrow, Sherry
+			       klst, gemm_m_pad, /*** temp_nbrow, Sherry */
+			       lptr, temp_nbrow, /* row dimension of the block */
+			       usub, lsub, tempv1,
+			       indirect_thread, indirect2_thread,
+			       Lrowind_bc_ptr,Lnzval_bc_ptr,
+			       grid
+			       );
+		}
+		
+	    } /* end omp for (int ij =...) */
+	    
+#ifdef _OPENMP
+	} /* end omp parallel region */
+#endif
+	
 #if ( PRNTlevel>=1 )
-	if ( thread_id==0 ) tt_start = SuperLU_timer_();
+	RemainScatterTimer += SuperLU_timer_() - tt_start;
 #endif
 
-	/* calling GEMM */
-#if defined (USE_VENDOR_BLAS)
-	zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
-	      &Remain_L_buff[(knsupc-ldu)*Rnbrow+cum_nrow], &Rnbrow,
-	      &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
-#else
-	zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
-	      &Remain_L_buff[(knsupc-ldu)*Rnbrow+cum_nrow], &Rnbrow,
-	      &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#ifdef USE_VTUNE
+	__itt_pause(); // stop VTune
+	__SSC_MARK(0x222); // stop SDE tracing
 #endif
 
-#if ( PRNTlevel>=1 )
-	if (thread_id==0) {
-	    tt_end = SuperLU_timer_();
-	    RemainGEMMTimer += tt_end - tt_start;
-	    tt_start = tt_end;
-	}
-#endif
-
-	/* Now scattering the block */
-	if ( ib<jb ) {
-	    zscatter_u(
-			    ib, jb,
-			    nsupc, iukp, xsup,
-			    klst, temp_nbrow,
-			    lptr, temp_nbrow,lsub,
-			    usub, tempv1,
-			    Ufstnz_br_ptr, Unzval_br_ptr,
-			    grid
-		           );
-	} else {
-	    zscatter_l(
-			    ib, ljb,
-			    nsupc, iukp, xsup,
-			    klst, temp_nbrow,
-			    lptr, temp_nbrow,
-			    usub, lsub, tempv1,
-			    indirect_thread, indirect2_thread,
-			    Lrowind_bc_ptr,Lnzval_bc_ptr,
-			    grid
-			   );
-	}
+    } /* end if Rnbrow>0 ... update remaining block */
 
-#if ( PRNTlevel>=1 )
-	if (thread_id==0) RemainScatterTimer += SuperLU_timer_() - tt_start;
-#endif
-    } /* end omp for (int ij =...) */
-#ifdef _OPENMP
-    } /* end omp parallel region */
-#endif
 }  /* end if L(:,k) and U(k,:) are not empty */
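
The hunks above restructure the Schur-complement update around one aggregated GEMM: the look-ahead and remaining rows of L(:,k) are packed into lookAhead_L_buff and Remain_L_buff (copying only the trailing ldu columns, since the leading knsupc-ldu columns multiply zero rows of U), U(k,:) is packed into bigU, and the product in bigV is then scattered block by block. A minimal sketch of the zero-padding idea, assuming plain double data instead of doublecomplex; pad_dim and pack_padded are hypothetical helpers, and pad_dim reuses the same expression the patch uses for gemm_m_pad:

    /* Illustration only: pad a column-major block before a large GEMM. */
    #include <string.h>

    #define GEMM_PADLEN 8

    int pad_dim(int d) { return d + (d % GEMM_PADLEN); }  /* as in gemm_m_pad above */

    /* Copy an m-by-k block A (leading dimension lda) into Apad, which has
     * padded leading dimension mp = pad_dim(m) and kp = pad_dim(k) columns;
     * the extra rows/columns are left as explicit zeros. */
    void pack_padded(const double *A, int m, int k, int lda,
                     double *Apad, int mp, int kp)
    {
        memset(Apad, 0, (size_t)mp * kp * sizeof(double));  /* zero the padding */
        for (int j = 0; j < k; ++j)
            memcpy(&Apad[(size_t)j * mp], &A[(size_t)j * lda],
                   (size_t)m * sizeof(double));
    }

The padded sizes (gemm_m_pad, gemm_n_pad, gemm_k_pad) are then what the single zgemm_ call above receives, and zscatter_l/zscatter_u subtract the result from the destination L and U blocks.
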
diff --git a/SRC/zbinary_io.c b/SRC/zbinary_io.c
new file mode 100644
index 0000000..dc86959
--- /dev/null
+++ b/SRC/zbinary_io.c
@@ -0,0 +1,40 @@
+#include "superlu_zdefs.h"
+
+int
+zread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz, 
+	     doublecomplex **nzval, int_t **rowind, int_t **colptr)
+{
+    size_t isize = sizeof(int_t), dsize = sizeof(double);
+    int nnz_read;
+    fread(n, isize, 1, fp);
+    fread(nnz, isize, 1, fp);
+    printf("fread n %d\tnnz %d\n", *n, *nnz);
+    *m = *n;
+    *colptr = intMalloc_dist(*n+1);
+    *rowind = intMalloc_dist(*nnz);
+    *nzval  = doublecomplexMalloc_dist(*nnz);
+    fread(*colptr, isize, (size_t) (*n + 1), fp);
+    fread(*rowind, isize, (size_t) *nnz, fp);
+    nnz_read = fread(*nzval, dsize, (size_t) (2 * (*nnz)), fp);
+    printf("# of doubles fread: %d\n", nnz_read);
+    fclose(fp);
+}
+
+int
+zwrite_binary(int_t n, int_t nnz,
+	      doublecomplex *values, int_t *rowind, int_t *colptr)
+{       
+    FILE  *fp1;
+    int nnz_written;
+    size_t isize = sizeof(int_t), dsize = sizeof(double);
+    fp1 = fopen("/scratch/scratchdirs/xiaoye/temp.bin", "wb");
+    fwrite(&n, isize, 1, fp1);
+    fwrite(&nnz, isize, 1, fp1);
+    fwrite(colptr, isize, n+1, fp1);
+    fwrite(rowind, isize, nnz, fp1);
+    nnz_written = fwrite(values, dsize, 2*nnz, fp1);
+    printf("n %d, # of doublecomplex: %d\n", n, nnz);
+    printf("dump binary file ... # of doubles fwrite: %d\n", nnz_written);
+    assert(nnz_written == 2*nnz);
+    fclose(fp1);
+}
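
The new zbinary_io.c stores a compressed-column matrix as a flat stream: n and nnz (one int_t each), then colptr (n+1 int_t), rowind (nnz int_t), and the values as 2*nnz doubles (real/imaginary pairs). A standalone sketch that writes the same layout for a made-up 2x2 matrix, assuming a 64-bit int_t build; the typedef, matrix data, and output path are illustrative only:

    #include <stdio.h>

    typedef long long it;                 /* assumption: int_t is 64-bit here */

    int main(void)
    {
        it n = 2, nnz = 2;
        it colptr[3] = {0, 1, 2}, rowind[2] = {0, 1};
        double nzval[4] = {1.0, 0.0, 2.0, -1.0};     /* (re, im) pairs */

        FILE *fp = fopen("tiny.bin", "wb");          /* hypothetical path */
        if (!fp) return 1;
        fwrite(&n, sizeof(it), 1, fp);               /* header: n, nnz */
        fwrite(&nnz, sizeof(it), 1, fp);
        fwrite(colptr, sizeof(it), (size_t)(n + 1), fp);
        fwrite(rowind, sizeof(it), (size_t)nnz, fp);
        fwrite(nzval, sizeof(double), (size_t)(2 * nnz), fp);
        fclose(fp);
        return 0;
    }
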
diff --git a/SRC/zlook_ahead_update.c b/SRC/zlook_ahead_update.c
index 683f0af..0fe20bf 100644
--- a/SRC/zlook_ahead_update.c
+++ b/SRC/zlook_ahead_update.c
@@ -14,11 +14,17 @@ at the top-level directory.
  * \brief Look-ahead update of the Schur complement.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 1, 2014
  *
+ * Modified: September 18, 2017
+ *   
  */
+
+iukp = iukp0; /* point to the first block in index[] */
+rukp = rukp0; /* point to the start of nzval[] */
+
 #ifdef ISORT
 while (j < nub && iperm_u[j] <= k0 + num_look_aheads)
 #else
@@ -27,6 +33,8 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
 {
     doublecomplex zero = {0.0, 0.0};
 
+#if 0 // Sherry: no need to search
+    /* Caveat: There is a permutation perm_u involved for j  */
     /* Search along the row for the pointers {iukp, rukp} pointing to
      * block U(k,j).
      * j    -- current block in look-ahead window, initialized to 0 on entry
@@ -38,6 +46,13 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
 		     j, &iukp, &rukp, &jb, &ljb, &nsupc,
          	     iukp0, rukp0, usub, perm_u, xsup, grid
 		    );
+#else
+    jb = usub[iukp];
+    ljb = LBj (jb, grid);     /* Local block number of U(k,j). */
+    nsupc = SuperSize(jb);
+    iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
+
     j++;
     jj0++;
     jj = iukp;
@@ -46,48 +61,47 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
 
     ldu = klst - usub[jj++];
     ncols = 1;
-    full = 1; /* flag the U block is indeed 'full', containing segments
-                 of same length. No need padding 0.  */
+
+    /* This loop computes ldu. */
     for (; jj < iukp + nsupc; ++jj) { /* for each column jj in block U(k,j) */
         segsize = klst - usub[jj];
         if (segsize) {
             ++ncols;
-            if (segsize != ldu) full = 0; /* need padding 0 */
             if (segsize > ldu)  ldu = segsize;
         }
     }
 #if ( DEBUGlevel>=3 )
     ++num_update;
 #endif
-    if (0) {
-        tempu = &uval[rukp];
-    }
-    else { /* Copy block U(k,j) into tempU2d, padding zeros. */
+
 #if ( DEBUGlevel>=3 )
-        printf ("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
-                iam, full, k, jb, ldu, ncols, nsupc);
-        ++num_copy;
+    printf ("(%d) k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+	    iam, k, jb, ldu, ncols, nsupc);
+    ++num_copy;
 #endif
-        tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
-        for (jj = iukp; jj < iukp + nsupc; ++jj) {
-            segsize = klst - usub[jj];
-            if (segsize) {
-                lead_zero = ldu - segsize;
-                for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
-                tempu += lead_zero;
-                for (i = 0; i < segsize; ++i) {
-                    tempu[i] = uval[rukp + i];
-                }
-                rukp += segsize;
-                tempu += segsize;
+
+    /* Now copy one block U(k,j) to bigU for GEMM, padding zeros up to ldu. */
+    tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
+    for (jj = iukp; jj < iukp + nsupc; ++jj) {
+        segsize = klst - usub[jj];
+        if (segsize) {
+            lead_zero = ldu - segsize;
+            for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+            tempu += lead_zero;
+            for (i = 0; i < segsize; ++i) {
+                tempu[i] = uval[rukp + i];
             }
+            rukp += segsize;
+            tempu += segsize;
         }
-        tempu = bigU;
-        rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
-    } /* if full ... */
+    }
+    tempu = bigU; /* set back to the beginning of the buffer */
+#if 0
+    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+#endif
 
     nbrow = lsub[1]; /* number of row subscripts in L(:,k) */
-    if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows */
+    if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows. */
     // double ttx =SuperLU_timer_();
 
     int current_b = 0; /* Each thread starts searching from first block.
@@ -98,9 +112,9 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
 #ifdef _OPENMP
     /* Sherry -- examine all the shared variables ??
        'firstprivate' ensures that the private variables are initialized
-       to the values before entering the loop  */
+       to the values before entering the loop.  */
 #pragma omp parallel for \
-    firstprivate(lptr,luptr,ib,tempv,current_b)	private(lb) \
+    firstprivate(lptr,luptr,ib,current_b) private(lb) \
     default(shared) schedule(dynamic)
 #endif
     for (lb = 0; lb < nlb; lb++) { /* Loop through each block in L(:,k) */
@@ -133,7 +147,10 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
 
         lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
 
+	/*if (thread_id == 0) tt_start = SuperLU_timer_();*/
+
         /* calling gemm */
+	stat->ops[FACT] += 8.0 * (flops_t)temp_nbrow * ldu * ncols;
 #if defined (USE_VENDOR_BLAS)
         zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
                    &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
@@ -144,7 +161,14 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
                    tempu, &ldu, &beta, tempv, &temp_nbrow );
 #endif
 
-        /* Now scattering the output*/
+#if 0
+	if (thread_id == 0) {
+	    tt_end = SuperLU_timer_();
+	    LookAheadGEMMTimer += tt_end - tt_start;
+	    tt_start = tt_end;
+	} 
+#endif
+        /* Now scattering the output. */
         if (ib < jb) {    /* A(i,j) is in U. */
             zscatter_u (ib, jb,
                        nsupc, iukp, xsup,
@@ -158,14 +182,22 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
                        Lrowind_bc_ptr, Lnzval_bc_ptr, grid);
         }
 
-        ++current_b;         /* move to next block */
+        ++current_b;         /* Move to next block. */
         lptr += temp_nbrow;
         luptr += temp_nbrow;
 
+#if 0
+	if (thread_id == 0) {
+	    tt_end = SuperLU_timer_();
+	    LookAheadScatterTimer += tt_end - tt_start;
+	}
+#endif
     } /* end parallel for lb = 0, nlb ... all blocks in L(:,k) */
 
-    rukp += usub[iukp - 1]; /* Move to next U block, U(k,j+1) */
-    iukp += nsupc;
+#if 0
+    rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
+#endif
+    iukp += nsupc; /* Move to block U(k,j+1) */
 
     /* =========================================== *
      * == factorize L(:,j) and send if possible == *
@@ -186,17 +218,14 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
         /* Factor diagonal and subdiagonal blocks and test for exact
            singularity.  */
         factored[kk] = 0;
-        /* double ttt1 = SuperLU_timer_(); */
-#if ( VAMPIR>=1 )
-        VT_begin (5);
-#endif
+
+        double tt1 = SuperLU_timer_();
 
         PZGSTRF2(options, kk0, kk, thresh, Glu_persist, grid, Llu,
                   U_diag_blk_send_req, tag_ub, stat, info);
 
-#if ( VAMPIR>=1 )
-        VT_end (5);
-#endif
+        pdgstrf2_timer += SuperLU_timer_() - tt1; 
+
         /* stat->time7 += SuperLU_timer_() - ttt1; */
 
         /* Multicasts numeric values of L(:,kk) to process rows. */
@@ -220,18 +249,12 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
 #if ( PROFlevel>=1 )
                 TIC (t1);
 #endif
-#if ( VAMPIR>=1 )
-                VT_begin (1);
-#endif
                 MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
                            SLU_MPI_TAG (0, kk0) /* (4*kk0)%tag_ub */ ,
                            scp->comm, &send_req[pj]);
                 MPI_Isend (lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
                            SLU_MPI_TAG (1, kk0) /* (4*kk0+1)%tag_ub */ ,
                            scp->comm, &send_req[pj + Pc]);
-#if ( VAMPIR>=1 )
-                VT_end (1);
-#endif
 #if ( PROFlevel>=1 )
                 TOC (t2, t1);
                 stat->utime[COMM] += t2;
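
Both this file and the Schur-complement update above replace the arrive_at_ublock() search with a direct walk of U(k,:), which is valid because perm_u is the identity here. A simplified sketch of that traversal with plain int indices; the layout (block number, nonzero count, then nsupc fstnz entries per block, with UB_DESCRIPTOR == 2) is inferred from how the patch advances iukp and rukp, and supersize[] stands in for SuperSize():

    #define UB_DESCRIPTOR 2   /* block number + nonzero count */

    /* Walk every block of one U(k,:) row; returns the nonzeros skipped. */
    int walk_u_row(const int *usub, int nub, const int *supersize)
    {
        int iukp = 0, rukp = 0;          /* cursor in index[], cursor in nzval[] */
        for (int j = 0; j < nub; ++j) {
            int jb = usub[iukp];         /* global block (supernode) number      */
            int nsupc = supersize[jb];   /* number of columns in this block      */
            iukp += UB_DESCRIPTOR;       /* now at fstnz of the first column     */
            /* ... use usub[iukp .. iukp+nsupc-1] and values starting at rukp ... */
            rukp += usub[iukp - 1];      /* skip this block's nonzero values     */
            iukp += nsupc;               /* descriptor of the next block         */
        }
        return rukp;
    }
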
diff --git a/SRC/zmemory_dist.c b/SRC/zmemory_dist.c
index 896c06d..bbaa3aa 100644
--- a/SRC/zmemory_dist.c
+++ b/SRC/zmemory_dist.c
@@ -128,10 +128,13 @@ int_t zQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
     mem_usage->total += (float)(2 * k * iword);
 #else
     /*mem_usage->total += stat->current_buffer;*/
-    printf(".. zQuery_Space: peak_buffer %.2f (MB)\n", stat->peak_buffer * 1.0e-6);
     mem_usage->total += stat->peak_buffer;
-#endif
 
+#if ( PRNTlevel>=1 )
+    if (iam==0) printf(".. zQuerySpace: peak_buffer %.2f (MB)\n",
+                       stat->peak_buffer * 1.0e-6);
+#endif
+#endif
     return 0;
 } /* zQuerySpace_dist */
 
diff --git a/SRC/zreadMM.c b/SRC/zreadMM.c
index 668a995..f9573a8 100644
--- a/SRC/zreadMM.c
+++ b/SRC/zreadMM.c
@@ -16,6 +16,7 @@ at the top-level directory.
  *
  */
 #include <ctype.h>
+#include <stdio.h>
 #include "superlu_zdefs.h"
 
 #undef EXPAND_SYM
@@ -42,6 +43,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     int_t    zero_base = 0;
     char *p, line[512], banner[64], mtx[64], crd[64], arith[64], sym[64];
     int expand;
+    char *cs;
 
     /* 	File format:
      *    %%MatrixMarket matrix coordinate real general/symmetric/...
@@ -53,7 +55,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
      */
 
      /* 1/ read header */ 
-     fgets(line,512,fp);
+     cs = fgets(line,512,fp);
      for (p=line; *p!='\0'; *p=tolower(*p),p++);
 
      if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) {
@@ -76,9 +78,9 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
        exit(-1);
      }
 
-     if(strcmp(arith,"real")) {
-       if(!strcmp(arith,"complex")) {
-         printf("Complex matrix; use zreadMM instead!\n");
+     if(strcmp(arith,"complex")) {
+       if(!strcmp(arith,"real")) {
+         printf("Real matrix; use dreadMM instead!\n");
          exit(-1);
        }
        else if(!strcmp(arith, "pattern")) {
@@ -99,7 +101,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      /* 2/ Skip comments */
      while(banner[0]=='%') {
-       fgets(line,512,fp);
+       cs = fgets(line,512,fp);
        sscanf(line,"%s",banner);
      }
 
@@ -122,16 +124,17 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
     *m = *n;
     printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz);
+    fflush(stdout);
     zallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
     a    = *nzval;
     asub = *rowind;
     xa   = *colptr;
 
-    if ( !(val = (doublecomplex *) SUPERLU_MALLOC(new_nonz * sizeof(double))) )
+    if ( !(val = doublecomplexMalloc_dist(new_nonz)) )
         ABORT("Malloc fails for val[]");
-    if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+    if ( !(row = (int_t *) intMalloc_dist(new_nonz)) )
         ABORT("Malloc fails for row[]");
-    if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+    if ( !(col = (int_t *) intMalloc_dist(new_nonz)) )
         ABORT("Malloc fails for col[]");
 
     for (j = 0; j < *n; ++j) xa[j] = 0;
@@ -139,17 +142,19 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     /* 4/ Read triplets of values */
     for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
 #ifdef _LONGINT
-	fscanf(fp, "%lld%lld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
+	j = fscanf(fp, "%lld%lld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
 #else
-	fscanf(fp, "%d%d%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
+	j = fscanf(fp, "%d%d%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
 #endif
 
-	if ( nnz == 0 ) /* first nonzero */
+	if ( nnz == 0 ) /* first nonzero */ {
 	    if ( row[0] == 0 || col[0] == 0 ) {
 		zero_base = 1;
 		printf("triplet file: row/col indices are zero-based.\n");
 	    } else
 		printf("triplet file: row/col indices are one-based.\n");
+	    fflush(stdout);
+	}
 
 	if ( !zero_base ) {
 	    /* Change to 0-based indexing. */
@@ -180,6 +185,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     *nonz = nz;
     if(expand) {
       printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz);
+      fflush(stdout);
     }
     
 
@@ -233,8 +239,6 @@ static void zreadrhs(int m, doublecomplex *b)
 	exit(-1);
     }
     for (i = 0; i < m; ++i)
-      fscanf(fp, "%lf%lf\n", &b[i].r, &b[i].i);
+      if ( fscanf(fp, "%lf%lf\n", &b[i].r, &b[i].i) != 2 ) break;
     fclose(fp);
 }
-
-
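
The zreadMM.c edits above assign the fgets()/fscanf() results to cs and j, apparently to silence unused-result warnings, but the values are still not examined. A minimal sketch of actually checking one complex triplet read ("row col re im"); the function and variable names are illustrative only:

    #include <stdio.h>

    /* Returns 1 on a complete triplet, 0 otherwise (fscanf reports the
     * number of items successfully converted). */
    int read_triplet(FILE *fp, long *row, long *col, double *re, double *im)
    {
        return fscanf(fp, "%ld %ld %lf %lf", row, col, re, im) == 4;
    }
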
diff --git a/SRC/zscatter.c b/SRC/zscatter.c
index 069d3b1..f14870e 100644
--- a/SRC/zscatter.c
+++ b/SRC/zscatter.c
@@ -13,10 +13,13 @@ at the top-level directory.
  * \brief Scatter the computed blocks into LU destination.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 1, 2014
  *
+ * Modified: 
+ *   September 18, 2017, enable SIMD vectorized scatter operation.
+ *   
  */
 #include <math.h>
 #include "superlu_zdefs.h"
@@ -112,9 +115,9 @@ zscatter_l (
            int_t iukp, /* point to destination supernode's index[] */
            int_t* xsup,
            int klst,
-           int nbrow,
+           int nbrow,  /* LDA of the block in tempv[] */
            int_t lptr, /* Input, point to index[] location of block L(i,k) */
-	   int temp_nbrow, /* number of rows in block L(i,k) */
+	   int temp_nbrow, /* number of rows of source block L(i,k) */
            int_t* usub,
            int_t* lsub,
            doublecomplex *tempv,
@@ -126,7 +129,7 @@ zscatter_l (
     int_t rel, i, segsize, jj;
     doublecomplex *nzval;
     int_t *index = Lrowind_bc_ptr[ljb];
-    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t ldv = index[1];       /* LDA of the destination lusup. */
     int_t lptrj = BC_HEADER;
     int_t luptrj = 0;
     int_t ijb = index[lptrj];
@@ -139,36 +142,43 @@ zscatter_l (
     }
     
     /*
-     * Build indirect table. This is needed because the
-     * indices are not sorted for the L blocks.
+     * Build indirect table. This is needed because the indices are not sorted
+     * in the L blocks.
      */
     int_t fnz = FstBlockC (ib);
     int_t dest_nbrow; 
     lptrj += LB_DESCRIPTOR;
     dest_nbrow=index[lptrj - 1];
     
-    for (i = 0; i < dest_nbrow; ++i)
-    {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+    for (i = 0; i < dest_nbrow; ++i) {
         rel = index[lptrj + i] - fnz;
         indirect_thread[rel] = i;
 
     }
 
-    /* can be precalculated */
-    for (i = 0; i < temp_nbrow; ++i)
-    {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+    /* can be precalculated? */
+    for (i = 0; i < temp_nbrow; ++i) { /* Source index is a subset of dest. */
         rel = lsub[lptr + i] - fnz;
         indirect2[i] =indirect_thread[rel]; 
     }
 
-    nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Dest. block L(i,j) */
-    for (jj = 0; jj < nsupc; ++jj)
-    {
+    nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Destination block L(i,j) */
+#ifdef __INTEL_COMPILER
+#pragma ivdep
+#endif
+    for (jj = 0; jj < nsupc; ++jj) {
         segsize = klst - usub[iukp + jj];
-        if (segsize)
-        {
-            for (i = 0; i < temp_nbrow; ++i)
-            {
+        if (segsize) {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+            for (i = 0; i < temp_nbrow; ++i) {
                 z_sub(&nzval[indirect2[i]], &nzval[indirect2[i]], &tempv[i]);
             }
             tempv += nbrow;
@@ -186,9 +196,9 @@ zscatter_u (int ib,
            int_t iukp,
            int_t * xsup,
            int klst,
-           int nbrow,
-           int_t lptr,
-           int temp_nbrow,
+ 	   int nbrow,      /* LDA of the block in tempv[] */
+           int_t lptr,     /* point to index location of block L(i,k) */
+	   int temp_nbrow, /* number of rows of source block L(i,k) */
            int_t* lsub,
            int_t* usub,
            doublecomplex* tempv,
@@ -208,8 +218,8 @@ zscatter_u (int ib,
     int_t lib = LBi (ib, grid);
     int_t *index = Ufstnz_br_ptr[lib];
 
-    /* Reinitilize the pointers to the begining of the 
-     * k-th column/row of L/U factors.
+    /* Reinitialize the pointers to the beginning of the k-th column/row of
+     * L/U factors.
      * usub[] - index array for panel U(k,:)
      */
     int_t iuip_lib, ruip_lib;
@@ -217,38 +227,32 @@ zscatter_u (int ib,
     ruip_lib = 0;
 
     int_t ijb = index[iuip_lib];
-    while (ijb < jb)            /* Search for dest block. */
-    {
+    while (ijb < jb) {   /* Search for destination block. */
         ruip_lib += index[iuip_lib + 1];
         // printf("supersize[%ld] \t:%ld \n",ijb,SuperSize( ijb ) );
         iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
         ijb = index[iuip_lib];
     }
-    /* Skip descriptor.  Now point to fstnz index of
-       block U(i,j). */
+    /* Skip descriptor. Now point to fstnz index of block U(i,j). */
     iuip_lib += UB_DESCRIPTOR;
 
     // tempv = bigV + (cum_nrow + cum_ncol*nbrow);
-    for (jj = 0; jj < nsupc; ++jj)
-    {
+    for (jj = 0; jj < nsupc; ++jj) {
         segsize = klst - usub[iukp + jj];
         fnz = index[iuip_lib++];
-        if (segsize)            /* Nonzero segment in U(k.j). */
-        {
+        if (segsize) {          /* Nonzero segment in U(k,j). */
             ucol = &Unzval_br_ptr[lib][ruip_lib];
 
             // printf("========Entering loop=========\n");
-            for (i = 0; i < temp_nbrow; ++i)
-            {
-
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+            for (i = 0; i < temp_nbrow; ++i) {
                 rel = lsub[lptr + i] - fnz;
                 // printf("%d %d %d %d %d \n",lptr,i,fnz,temp_nbrow,nbrow );
                 // printf("hello   ucol[%d] %d %d : \n",rel,lsub[lptr + i],fnz);
-
                 z_sub(&ucol[rel], &ucol[rel], &tempv[i]);
 
-                // printf("hello\n");
-
 #ifdef PI_DEBUG
                 double zz = 0.0;
                 if (!(*(long *) &zz == *(long *) &tempv[i]))
@@ -256,15 +260,16 @@ zscatter_u (int ib,
                             ucol[rel]);
                 //printing triplets (location??, old value, new value ) if none of them is zero
 #endif
-            }                   /* for i=0..temp_nbropw */
-            tempv += nbrow;
+            } /* for i = 0:temp_nbrow */
+            tempv += nbrow; /* Jump LDA to next column */
 #ifdef PI_DEBUG
             // printf("\n");
 #endif
-        }                       /*ig segsize */
+        }  /* if segsize */
+
         ruip_lib += ilst - fnz;
 
-    }                           /*for jj=0:nsupc */
+    }  /* for jj = 0:nsupc */
 #ifdef PI_DEBUG
     // printf("\n");
 #endif
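
The indirect tables in zscatter_l() exist because row indices inside a destination L block are not sorted: indirect_thread[] maps a global row offset to its slot in the destination block, indirect2[] caches that translation once per source row, and the inner column loop then becomes a plain indexed subtract that the new omp simd pragmas can vectorize. A simplified sketch with plain double values that ignores the empty-segment (segsize == 0) case; all parameter names are illustrative:

    /* Subtract an m-by-n source block tempv (leading dimension nbrow) from
     * the destination block nzval (leading dimension ldv), with unsorted rows. */
    void scatter_block(const double *tempv, int nbrow, int temp_nbrow, int ncols,
                       const int *dest_rows, int dest_nbrow,  /* rows of dest block */
                       const int *src_rows,                   /* rows of source     */
                       int fnz, double *nzval, int ldv,
                       int *indirect, int *indirect2)
    {
        for (int i = 0; i < dest_nbrow; ++i)      /* global offset -> dest slot   */
            indirect[dest_rows[i] - fnz] = i;
        for (int i = 0; i < temp_nbrow; ++i)      /* translate source rows once   */
            indirect2[i] = indirect[src_rows[i] - fnz];
        for (int j = 0; j < ncols; ++j)           /* column-by-column subtraction */
            for (int i = 0; i < temp_nbrow; ++i)
                nzval[indirect2[i] + j * ldv] -= tempv[i + j * nbrow];
    }
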
diff --git a/TEST/#pztest.c# b/TEST/#pztest.c#
new file mode 100644
index 0000000..17fda5c
--- /dev/null
+++ b/TEST/#pztest.c#
@@ -0,0 +1,517 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Driver program for testing PZGSSVX.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 30, 2017
+ * </pre>
+ */
+/*
+ * File name:		pztest.c
+ * Purpose:             MAIN test program
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include "superlu_zdefs.h"
+
+#define NTESTS 1 /*5*/      /* Number of test types */
+#define NTYPES 11     /* Number of matrix types */
+#define NTRAN  2    
+#define THRESH 20.0
+#define FMT1   "%10s:n=%d, test(%d)=%12.5g\n"
+#define	FMT2   "%10s:fact=%4d, DiagScale=%d, n=%d, imat=%d, test(%d)=%12.5g, berr=%12.5g\n"
+#define FMT3   "%10s:info=%d, izero=%d, n=%d, nrhs=%d, imat=%d, nfail=%d\n"
+
+
+static void
+parse_command_line(int argc, char *argv[], int *nprow, int *npcol,
+		   char *matrix_type, int *n, int *relax, int *maxsuper,
+		   int *fill_ratio, int *min_gemm_gpu_offload,
+		   int *nrhs, FILE **fp);
+
+extern int
+pzcompute_resid(int m, int n, int nrhs, SuperMatrix *A,
+		doublecomplex *x, int ldx, doublecomplex *b, int ldb,
+		gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct, double *resid);
+
+/*! \brief Copy matrix A into matrix B, in distributed compressed row format. */
+void
+zCopy_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{
+    NRformat_loc *Astore;
+    NRformat_loc *Bstore;
+    int_t i, nnz_loc, m_loc;
+
+    B->Stype = A->Stype;
+    B->Dtype = A->Dtype;
+    B->Mtype = A->Mtype;
+    B->nrow = A->nrow;
+    B->ncol = A->ncol;
+    Astore = (NRformat_loc *) A->Store;
+    Bstore = (NRformat_loc *) B->Store;
+    Bstore->nnz_loc = Astore->nnz_loc;
+    nnz_loc = Astore->nnz_loc;
+    Bstore->m_loc = Astore->m_loc;
+    m_loc = Astore->m_loc;
+    Bstore->fst_row = Astore->fst_row;
+    memcpy(Bstore->nzval, Astore->nzval, nnz_loc * sizeof(doublecomplex));
+    memcpy(Bstore->colind, Astore->colind, nnz_loc * sizeof(int_t));
+    memcpy(Bstore->rowptr, Astore->rowptr, (m_loc+1) * sizeof(int_t));
+}
+
+/*! \brief Print a summary of the testing results. */
+void
+PrintSumm(char *type, int nfail, int nrun, int nerrs)
+{
+    if ( nfail > 0 )
+	printf("%3s driver: %d out of %d tests failed to pass the threshold\n",
+	       type, nfail, nrun);
+    else
+	printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun);
+
+    if ( nerrs > 0 )
+	printf("%6d error messages recorded\n", nerrs);
+}
+
+int main(int argc, char *argv[])
+{
+/*
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PZTEST is the main test program for the DOUBLE COMPLEX linear 
+ * equation driver routines PZGSSVX.
+ * 
+ * The program is invoked by a shell script file -- dtest.csh.
+ * The output from the tests is written into a file -- dtest.out.
+ */
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A, Asave;
+    NRformat_loc *Astore;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    doublecomplex   *nzval_save;
+    int_t    *colind_save, *rowptr_save;
+    double   *berr, *R, *C;
+    doublecomplex   *b, *bsave, *xtrue, *solx;
+    int    i, j, m, n, izero = 0;
+    int    nprow, npcol;
+    int    iam, info, ldb, ldx, nrhs;
+    char     **cpp, c;
+    FILE *fp, *fopen();
+    char matrix_type[8], equed[1];
+    int  relax, maxsuper=sp_ienv_dist(3), fill_ratio=sp_ienv_dist(6),
+         min_gemm_gpu_offload=0;
+    int    equil, ifact, nfact, iequil, iequed, prefact, notfactored;
+    int    nt, nrun=0, nfail=0, nerrs=0, imat, fimat=0, nimat=1;
+    fact_t fact;
+    double rowcnd, colcnd, amax;
+    double result[NTESTS];
+
+    /* Fixed set of parameters */
+    int     iseed[]  = {1988, 1989, 1990, 1991};
+    char    equeds[]  = {'N', 'R', 'C', 'B'};
+    DiagScale_t equils[] = {NOEQUIL, ROW, COL, BOTH};
+    fact_t  facts[] = {FACTORED, DOFACT, SamePattern, SamePattern_SameRowPerm};
+    trans_t transs[]  = {NOTRANS, TRANS, CONJ};
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand side. */
+    for (i = 0; i < NTESTS; ++i) result[i] = 0.0;
+
+    /* Parse command line argv[]. */
+    parse_command_line(argc, argv, &nprow, &npcol, matrix_type, &n,
+		       &relax, &maxsuper,
+		       &fill_ratio, &min_gemm_gpu_offload, &nrhs, &fp);
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID. 
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )	goto out;
+    if ( 0 ) {
+        printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	fflush(stdout);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* Set the default input options. */
+    set_default_options_dist(&options);
+    options.PrintStat = NO;
+	
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+	fflush(stdout);
+    }
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+	
+    /* Loop through all the input options. */
+    for (imat = fimat; imat < nimat; ++imat) { /* All matrix types */
+	//if (!iam) printf("imat loop ... %d\n", imat);
+	/* ------------------------------------------------------------
+	   GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+	   ------------------------------------------------------------*/
+	zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+
+	m = A.nrow;
+	n = A.ncol;
+
+	if ( !(bsave = doublecomplexMalloc_dist(ldb * nrhs)) )
+	    ABORT("Malloc fails for bsave[]");
+	for (j = 0; j < nrhs; ++j)
+	    for (i = 0; i < ldb; ++i) bsave[i+j*ldb] = b[i+j*ldb];
+
+	/* Save a copy of matrix A in Asave. */
+	Astore = (NRformat_loc *) A.Store;
+	int_t nnz_loc = Astore->nnz_loc;
+	int_t m_loc = Astore->m_loc;
+	nzval_save = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc);
+	colind_save = (int_t *) intMalloc_dist(nnz_loc);
+	rowptr_save = (int_t *) intMalloc_dist(m_loc + 1);
+	zCreate_CompRowLoc_Matrix_dist(&Asave, m, n, nnz_loc, m_loc, Astore->fst_row,
+				       nzval_save, colind_save, rowptr_save,
+				       SLU_NR_loc, SLU_Z, SLU_GE);
+	zCopy_CompRowLoc_Matrix_dist(&A, &Asave);
+
+	for (iequed = 0; iequed < 4; ++iequed) {
+	    int what_equil = equils[iequed];
+	    if (iequed == 0) nfact = 4;
+	    else { /* Only test factored, pre-equilibrated matrix */
+		nfact = 1;
+		options.RowPerm = NOROWPERM; /* Turn off MC64 */
+	    }
+	    //if (!iam) printf("iequed loop ... %d\n", iequed);
+
+	    for (ifact = 0; ifact < nfact; ++ifact) {
+		fact = facts[ifact];
+		options.Fact = fact;
+		//if (!iam) printf("ifact loop ... %d\n", ifact);
+
+		for (equil = 0; equil < 2; ++equil) {
+
+		    //if (!iam) printf("equil loop ... %d\n", equil);
+
+		    options.Equil = equil;
+		    /* Need a first factor */
+		    prefact   = ( options.Fact == FACTORED ||
+				  options.Fact == SamePattern ||
+				  options.Fact == SamePattern_SameRowPerm );
+
+		    /* Restore the matrix A. */
+		    zCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+
+		    /* Initialize ScalePermstruct and LUstruct. */
+		    ScalePermstructInit(m, n, &ScalePermstruct);
+		    LUstructInit(n, &LUstruct);
+
+		    //if ( options.Fact == FACTORED || 
+		    // options.Fact == SamePattern_SameRowPerm ) {
+
+		    if ( prefact ) {
+
+			R = (double *) SUPERLU_MALLOC(m*sizeof(double));
+			C = (double *) SUPERLU_MALLOC(n*sizeof(double));
+			
+			/* Later call to PZGSSVX only needs to solve. */
+                        if ( equil || iequed ) {
+			    /* Compute row and column scale factors to
+			       equilibrate matrix A.    */
+			    pzgsequ(&A, R, C, &rowcnd, &colcnd, &amax, &info,
+				    &grid);
+
+			    /* Force equilibration. */
+			    if ( info==0 && n > 0 ) {
+				if ( what_equil == ROW ) {
+				    rowcnd = 0.;
+				    colcnd = 1.;
+				    ScalePermstruct.DiagScale = ROW;
+				    ScalePermstruct.R = R;
+				} else if ( what_equil == COL ) {
+				    rowcnd = 1.;
+				    colcnd = 0.;
+				    ScalePermstruct.DiagScale = COL;
+				    ScalePermstruct.C = C;
+				} else if ( what_equil == BOTH ) {
+				    rowcnd = 0.;
+				    colcnd = 0.;
+				    ScalePermstruct.DiagScale = BOTH;
+				    ScalePermstruct.R = R;
+				    ScalePermstruct.C = C;
+				}
+			    }
+			
+			    /* Equilibrate the matrix. */
+			    pzlaqgs(&A, R, C, rowcnd, colcnd, amax, equed);
+			    // printf("after pdlaqgs: *equed %c\n", *equed);
+
+			    /* Do not equilibrate again when calling PZGSSVX,
+			     * so there is no malloc/free of {R,C} inside PZGSSVX. */
+			    options.Equil = NO;
+			} /* end if (equil || iequed) */
+		    } /* end if prefact */
+
+		    if ( prefact ) { /* Need a first factor */
+			
+		        /* Save Fact option. */
+		        fact = options.Fact;
+			options.Fact = DOFACT;
+
+			/* Initialize the statistics variables. */
+			PStatInit(&stat);
+	
+			int nrhs1 = 0; /* Only performs factorization */
+			pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs1,
+				&grid, &LUstruct, &SOLVEstruct,
+				berr, &stat, &info);
+
+			if ( info ) {
+			    printf("** First factor: nrun %d: fact %d, info %d, "
+				   "equil %d, what_equil %d, DiagScale %d \n",
+				   nrun, fact, info, equil, what_equil,
+				   ScalePermstruct.DiagScale);
+			}
+
+			PStatFree(&stat);
+
+		        /* Restore Fact option. */
+			options.Fact = fact;
+			if ( fact == SamePattern ) {
+			    // {L,U} not re-used in subsequent call to PZGSSVX.
+			    Destroy_LU(n, &grid, &LUstruct);
+			}
+
+		    } /* end if .. first time factor */
+
+		    /*----------------
+		     * Test pzgssvx
+		     *----------------*/
+
+		    if ( options.Fact != FACTORED ) {
+			/* Restore the matrix A. */
+			zCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+		    } 
+
+		    /* Set the right-hand side. */
+		    zCopy_Dense_Matrix_dist(m_loc, nrhs, bsave, ldb, b, ldb);
+
+		    PStatInit(&stat);
+
+		    /*if ( !iam ) printf("\ttest pdgssvx: nrun %d, iequed %d, equil %d, fact %d\n", 
+		      nrun, iequed, equil, options.Fact);*/
+		    /* Testing PZGSSVX: solve and compute the error bounds. */
+		    pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs,
+			    &grid, &LUstruct, &SOLVEstruct,
+			    berr, &stat, &info);
+
+		    PStatFree(&stat);
+#if 0
+		    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+				     nrhs, b, ldb, xtrue, ldx, &grid);
+#endif
+		    /*		    if ( info && info != izero ) {*/
+		    if ( info ) {
+			printf(FMT3, "pzgssvx",info,izero,n,nrhs,imat,nfail);
+		    } else {
+			/* Restore the matrix A. */
+			zCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+
+			/* Compute residual of the computed solution.*/
+			solx = b;
+			pzcompute_resid(m, n, nrhs, &A, solx, ldx, bsave, ldb,
+					&grid, &SOLVEstruct, &result[0]);
+			
+#if 0  /* how to get RCOND? */
+			/* Check solution accuracy from generated exact solution. */
+			dgst04(n, nrhs, solx, ldx, xact, ldx, rcond,
+					  &result[2]);
+			pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+					 nrhs, b, ldb, xtrue, ldx, &grid);
+#endif
+
+			/* Print information about the tests that did
+			   not pass the threshold.    */
+			int k1 = 0;
+			for (i = k1; i < NTESTS; ++i) {
+			    if ( result[i] >= THRESH ) {
+				printf(FMT2, "pzgssvx", options.Fact, 
+				       ScalePermstruct.DiagScale,
+				       n, imat, i, result[i], berr[0]);
+				++nfail;
+			    }
+			}
+			nrun += NTESTS;
+		    } /* end else .. info == 0 */
+		   
+		    /* -----------------------------------------------------
+		       Deallocate storage associated with {L,U}.
+		       ----------------------------------------------------- */
+		    if ( prefact ) {
+			SUPERLU_FREE(R);
+			SUPERLU_FREE(C);
+			ScalePermstruct.DiagScale = NOEQUIL; /* Avoid free R/C again. */
+		    }
+		    ScalePermstructFree(&ScalePermstruct);
+		    Destroy_LU(n, &grid, &LUstruct);
+		    LUstructFree(&LUstruct);
+		    if ( options.SolveInitialized ) {
+			zSolveFinalize(&options, &SOLVEstruct);
+		    }
+
+		} /* end for equil ... */
+		    
+	    } /* end for ifact ... */
+		
+	} /* end for iequed ... */
+	
+	/* ------------------------------------------------------------
+	   DEALLOCATE STORAGE.
+	   ------------------------------------------------------------*/
+	Destroy_CompRowLoc_Matrix_dist(&A);
+	Destroy_CompRowLoc_Matrix_dist(&Asave);
+	//	ScalePermstructFree(&ScalePermstruct);
+	SUPERLU_FREE(b);
+	SUPERLU_FREE(bsave);
+	SUPERLU_FREE(xtrue);
+
+    } /* end for imat ... */
+
+    /* Print a summary of the testing results. */
+    if ( iam==0 ) PrintSumm("ZGS", nfail, nrun, nerrs);
+
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+/*  
+ * Parse command line options to get various input parameters.
+ */
+static void
+parse_command_line(int argc, char *argv[], int *nprow, int *npcol,
+		   char *matrix_type, int *n, int *relax, int *maxsuper,
+		   int *fill_ratio, int *min_gemm_gpu_offload,
+		   int *nrhs, FILE **fp)
+{
+    int c;
+    extern char *optarg;
+    char  str[20];
+
+    while ( (c = getopt(argc, argv, "hr:c:t:n:x:m:b:g:s:f:")) != EOF ) {
+	switch (c) {
+	  case 'h':
+	    printf("Options:\n");
+	    printf("\t-r <int> - process rows\n");
+	    printf("\t-c <int> - process columns\n");
+	    printf("\t-n <int> - matrix dimension\n");
+	    printf("\t-x <int> - granularity of relaxed supernodes\n");
+	    printf("\t-m <int> - maximum size of supernode\n");
+	    printf("\t-b <int> - estimated fill ratio to allocate storage\n");
+	    printf("\t-g <int> - minimum size of GEMM to offload to GPU\n");
+	    printf("\t-s <int> - number of right-hand sides\n");
+	    printf("\t-f <char[]> - file name storing a sparse matrix\n");
+	    exit(1);
+	    break;
+	  case 'r': *nprow = atoi(optarg);
+	            break;
+	  case 'c': *npcol = atoi(optarg);
+	            break;
+	  case 'n': *n = atoi(optarg);
+	            break;
+	  case 'x': c = atoi(optarg); 
+	            sprintf(str, "%d", c);
+	            setenv("NREL", str, 1);
+	            //printf("Reset relax env. variable to %d\n", c);
+	            break;
+	  case 'm': c = atoi(optarg); 
+	            sprintf(str, "%d", c);
+		    setenv("NSUP", str, 1);
+		    //printf("Reset maxsuper env. variable to %d\n", c);
+	            break;
+	  case 'b': c = atoi(optarg); 
+	            sprintf(str, "%d", c);
+		    setenv("FILL", str, 1);
+		    //printf("Reset fill_ratio env. variable to %d\n", c);
+	            break;
+	  case 'g': c = atoi(optarg); 
+	            sprintf(str, "%d", c);
+		    setenv("N_GEMM", str, 1);
+		    //printf("Reset min_gemm_gpu_offload env. variable to %d\n", c);
+	            break;
+	  case 's': *nrhs = atoi(optarg); 
+	            break;
+          case 'f':
+                    if ( !(*fp = fopen(optarg, "r")) ) {
+                        ABORT("File does not exist");
+                    }
+                    //printf(".. test sparse matrix in file: %s\n", optarg);
+                    break;
+  	}
+    }
+}
+
+int cpp_defs()
+{
+    printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+    printf("....\n");
+    return 0;
+}
diff --git a/TEST/CMakeLists.txt b/TEST/CMakeLists.txt
new file mode 100644
index 0000000..460824a
--- /dev/null
+++ b/TEST/CMakeLists.txt
@@ -0,0 +1,79 @@
+include_directories(${SuperLU_DIST_SOURCE_DIR}/SRC)
+
+# Libs linked to all of the tests
+set(all_link_libs superlu_dist ${BLAS_LIB} m)
+
+set(MATRICES ../EXAMPLE/g20.rua)  # sample sparse matrix from a file
+set(NPROWS 1 2)		  # process rows
+set(NPCOLS 1 3) 	  # process columns 
+set(NVAL 9 19)	  	  # generated matrix dimensions
+set(NRHS 1 3)		  # number of RHS
+# set(FILLRATIO 2 10)	  # estimated fill ratio
+set(FILLRATIO 2)	  # estimated fill ratio
+# following are blocking parameters, see sp_ienv.c
+set(RELAX 8)	   	  # relaxed supernode size: 4 8
+set(SUPERSIZE 20)   	  # maximum supernode size: 10 20
+set(MINGEMM 10000)	  # minimum GEMM size for GPU offload
+
+function(cat IN_FILE OUT_FILE)
+  file(READ ${IN_FILE} CONTENTS)
+  file(APPEND ${OUT_FILE} "${CONTENTS}")
+endfunction()
+
+# Function to perform test
+# call API:  add_superlu_dist_tests(pddrive big.rua)
+function(add_superlu_dist_tests target input)
+   set(TEST_INPUT "${SuperLU_DIST_SOURCE_DIR}/EXAMPLE/${input}")
+   set(TEST_OUTPUT "${SuperLU_DIST_BINARY_DIR}/TEST/${target}.out")
+
+  # Prepare a temporary file to "cat" to:
+  # file(WRITE ${TEST_OUTPUT} "")
+
+##  get_target_property(TEST_LOC ${target} LOCATION)
+   set(TEST_LOC ${CMAKE_CURRENT_BINARY_DIR})
+
+   foreach (r ${NPROWS})
+      foreach (c ${NPCOLS})
+        MATH( EXPR np "${r}*${c}" )
+        foreach (s ${NRHS})
+	  foreach (b ${FILLRATIO})
+	    foreach (x ${RELAX})
+	      foreach (m ${SUPERSIZE})
+                set(testName "${target}_${r}x${c}_${s}_${b}_${x}_${m}")
+	  	set(SINGLE_OUTPUT ${SuperLU_DIST_BINARY_DIR}/TEST/${testName}.out)
+          add_test( ${testName}_SP 
+	    	    ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${np}
+            	    ${MPIEXEC_PREFLAGS} ${target} ${MPIEXEC_POSTFLAGS} 
+		    -r ${r} -c ${c} -s ${s} -b ${b} -x ${x} -m ${m} -f ${TEST_INPUT}
+		  ) 
+#          add_test( ${testName}_SP "${CMAKE_COMMAND}"
+#	    -DTEST=${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${np}
+#            ${MPIEXEC_PREFLAGS} ${target} ${MPIEXEC_POSTFLAGS} -r ${r} -c ${c} -s ${s} -b ${b} -x ${x} -m ${m} -f ${TEST_INPUT}
+#	    -DOUTPUT=${SINGLE_OUTPUT}
+#	    -DALL_OUTPUT=${TEST_OUTPUT}
+#	    -DHEADING=Sparse\ matrix\ ${TEST_INPUT}\ --\ r=${r},\ c=${c},\ s=${s},\ x=${x},\ m=${m}
+#	    -P "${SuperLU_DIST_SOURCE_DIR}/TEST/runtest.cmake"
+#		  )
+	      endforeach()
+	    endforeach()
+	  endforeach()
+	endforeach()
+      endforeach()
+   endforeach()
+
+# MPI variables:
+# ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} PROCS
+#  	${MPIEXEC_PREFLAGS} EXECUTABLE ${MPIEXEC_POSTFLAGS} ARGS)
+
+endfunction(add_superlu_dist_tests)
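+# Illustration only (hypothetical values taken from the lists above): r=2, c=3,
+# s=1, b=2, x=8, m=20 registers a test named pdtest_2x3_1_2_8_20_SP which,
+# assuming MPIEXEC_NUMPROC_FLAG expands to "-n", amounts to running
+#   mpiexec -n 6 pdtest -r 2 -c 3 -s 1 -b 2 -x 8 -m 20 -f ${SuperLU_DIST_SOURCE_DIR}/EXAMPLE/g20.rua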
+
+if(enable_double)
+  set(DTEST pdtest.c dcreate_matrix.c pdcompute_resid.c)
+  add_executable(pdtest ${DTEST})
+  target_link_libraries(pdtest ${all_link_libs})
+  add_superlu_dist_tests(pdtest g20.rua)
+endif()
+
+#if(enable_complex16)
+#  set(ZTEST pztest.c zcreate_matrix.c pzcompute_resid.c)
+#endif()
diff --git a/TEST/Makefile b/TEST/Makefile
new file mode 100644
index 0000000..c219d70
--- /dev/null
+++ b/TEST/Makefile
@@ -0,0 +1,56 @@
+#######################################################################
+#  This makefile creates the test programs for the linear equation
+#  routines in SuperLU_DIST.  The test files are grouped as follows:
+#
+#       DLINTST -- Double precision real test routines
+#       ZLINTST -- Double precision complex test routines
+#
+#  Test programs can be generated for either or both of the two
+#  precisions.  Enter make followed by one or more of the data types
+#  desired.  For example:
+#       make complex16
+#  Alternatively, the command
+#       make
+#  without any arguments creates both test programs.
+#  The executable files are called
+#       pdtest
+#       pztest
+#
+#  To remove the object files after the executable files have been
+#  created, enter
+#       make clean
+#  On some systems, you can force the source files to be recompiled by
+#  entering (for example)
+#       make double FRC=FRC
+#
+#  Creation date:	March 16, 2017
+#  Modified: 	
+#######################################################################
+
+include ../make.inc
+HEADER  = ../SRC
+
+DLINTST = pdtest.o dcreate_matrix.o pdcompute_resid.o
+
+ZLINTST = pztest.o zcreate_matrix.o pzcompute_resid.o
+
+all: double complex16
+
+testmat:
+	(cd MATGEN; $(MAKE))
+
+./pdtest: $(DLINTST) $(DSUPERLULIB) $(TMGLIB)
+	$(LOADER) $(LOADOPTS) $(DLINTST) $(TMGLIB) $(LIBS) -lm -o $@
+
+./pztest: $(ZLINTST) $(DSUPERLULIB) $(TMGLIB)
+	$(LOADER) $(LOADOPTS) $(ZLINTST) $(TMGLIB) $(LIBS) -lm -o $@
+
+double: ./pdtest
+complex16: ./pztest
+
+.c.o:
+	$(CC) $(CFLAGS) $(CDEFS) -I$(HEADER) -c $< $(VERBOSE)
+
+clean:	
+	rm -f *.o *test *.out
+
diff --git a/TEST/README b/TEST/README
new file mode 100644
index 0000000..d40eaab
--- /dev/null
+++ b/TEST/README
@@ -0,0 +1,12 @@
+		SuperLU_DIST TEST
+		=================
+
+This directory contains testing programs for various functions
+provided in SuperLU_DIST. 
+
+1. To run the tests (pdtest for real, pztest for complex), you may type:
+  $ mpiexec -n <np> pdtest -r <process rows> -c <process columns> -f ../EXAMPLE/g20.rua 
+  $ mpiexec -n <np> pztest -r <process rows> -c <process columns> -f ../EXAMPLE/cg20.cua
+
+2. bash scripts to run tests:
+   - pdtest.sh / pztest.sh : invoke many runs varying several input parameters.
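+
+3. A hypothetical end-to-end session (the process grid, blocking parameters
+   and matrix file below are example values only, not required settings):
+  $ make double
+  $ mpiexec -n 6 pdtest -r 2 -c 3 -x 8 -m 20 -b 2 -s 1 -f ../EXAMPLE/g20.rua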
diff --git a/EXAMPLE/dcreate_matrix.c b/TEST/dcreate_matrix.c
similarity index 97%
copy from EXAMPLE/dcreate_matrix.c
copy to TEST/dcreate_matrix.c
index 77292d7..a622463 100644
--- a/EXAMPLE/dcreate_matrix.c
+++ b/TEST/dcreate_matrix.c
@@ -89,9 +89,14 @@ int dcreate_matrix(SuperMatrix *A, int nrhs, double **rhs,
 #endif
 
     if ( !iam ) {
+        double t = SuperLU_timer_();
+
         /* Read the matrix stored on disk in Harwell-Boeing format. */
         dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
 
+	printf("Time to read the matrix: %.2f\n", 
+	        SuperLU_timer_() - t);  fflush(stdout);
+
 	/* Broadcast matrix A to the other PEs. */
 	MPI_Bcast( &m,     1,   mpi_int_t,  0, grid->comm );
 	MPI_Bcast( &n,     1,   mpi_int_t,  0, grid->comm );
diff --git a/TEST/pdcompute_resid.c b/TEST/pdcompute_resid.c
new file mode 100644
index 0000000..120a087
--- /dev/null
+++ b/TEST/pdcompute_resid.c
@@ -0,0 +1,155 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Test for small residual.
+ *
+ * -- Distributed SuperLU routine (version 5.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 30, 2017
+ *
+ */
+#include "superlu_ddefs.h"
+
+int pdcompute_resid(int m, int n, int nrhs, SuperMatrix *A,
+		    double *x, int ldx, double *b, int ldb,
+		    gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct, double *resid)
+{
+/*  
+    Purpose   
+    =======   
+
+    PDCOMPUTE_RESID computes the residual for a solution of a system of linear   
+    equations  A*x = b  or  A'*x = b:   
+       RESID = norm(B - A*X) / ( norm(A) * norm(X) * EPS ),   
+    where EPS is the machine epsilon.   
+
+    Arguments   
+    =========   
+
+    M       (input) INTEGER   
+            The number of rows of the matrix A.  M >= 0.   
+
+    N       (input) INTEGER   
+            The number of columns of the matrix A.  N >= 0.   
+
+    NRHS    (input) INTEGER   
+            The number of columns of B, the matrix of right hand sides.   
+            NRHS >= 0.
+	    
+    A       (input/output) SuperMatrix*
+            The original M x N sparse matrix A.   
+	    On exit, the column indices are modified due to SPMV setup.
+
+    X       (input) DOUBLE PRECISION array, dimension (LDX,NRHS)   
+            The computed solution vectors for the system of linear   
+            equations.   
+
+    LDX     (input) INTEGER   
+            The leading dimension of the array X.  If TRANS = NOTRANS,   
+            LDX >= max(1,N); if TRANS = TRANS or CONJ, LDX >= max(1,M).   
+
+    B       (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)   
+            On entry, the right hand side vectors for the system of   
+            linear equations.   
+            On exit, B is overwritten with the difference B - A*X.   
+
+    LDB     (input) INTEGER   
+            The leading dimension of the array B.  IF TRANS = NOTRANS,
+            LDB >= max(1,M); if TRANS = TRANS or CONJ, LDB >= max(1,N).
+
+    SOLVEstruct (input) SOLVEstruct_t*
+
+    GRID    (input) gridinfo_t*
+	    
+    RESID   (output) double PRECISION   
+            The maximum over the number of right-hand sides of
+            norm(B - A*X) / ( norm(A) * norm(X) * EPS ).   
+
+    =====================================================================
+*/
+
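+    /* Worked example (illustrative numbers only, not taken from any run):
+       with norm(A) = 1.0e+02, norm(X) = 1.0e+00, norm(B - A*X) = 1.0e-14 and
+       eps ~ 2.2e-16, the scaled residual is
+           1.0e-14 / (1.0e+02 * 1.0e+00 * 2.2e-16) ~ 0.45,
+       which is well below the pass threshold THRESH = 20.0 used in pdtest.c. */
+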
+    /* Table of constant values */
+    int    inc  = 1;
+    
+    /* Local variables */
+    int i, j;
+    double anorm, rnorm, rnorm_g;
+    double xnorm, xnorm_g;
+    double eps;
+    char transc[1];
+    double *ax, *R;
+    pdgsmv_comm_t gsmv_comm; 
+    int m_loc = ((NRformat_loc*) A->Store)->m_loc;
+
+    /* Function prototypes */
+    extern double dasum_(int *, double *, int *);
+    
+    /* Function Body */
+    if ( m <= 0 || n <= 0 || nrhs == 0) {
+	*resid = 0.;
+	return 0;
+    }
+
+    /* Exit with RESID = 1/EPS if ANORM = 0. */
+    eps = dmach_dist("Epsilon");
+    anorm = pdlangs("1", A, grid);
+    if (anorm <= 0.) {
+	*resid = 1. / eps;
+	return 0;
+    }
+
+    if ( !(ax = doubleMalloc_dist(m_loc)) ) ABORT("Malloc fails for work[]");
+    R = ax;
+
+    /* A is modified with colind[] permuted to [internal, external]. */
+    pdgsmv_init(A, SOLVEstruct->row_to_proc, grid, &gsmv_comm);
+
+    /* Compute the maximum over the number of right-hand sides of   
+       norm(B - A*X) / ( norm(A) * norm(X) * EPS ) . */
+    *resid = 0.;
+    for (j = 0; j < nrhs; ++j) {
+	double *B_col = &b[j*ldb];
+	double *X_col = &x[j*ldx];
+
+	/* Compute residual R = B - op(A) * X,   
+	   where op(A) = A, A**T, or A**H, depending on TRANS. */
+	/* Matrix-vector multiply. */
+	pdgsmv(0, A, grid, &gsmv_comm, X_col, ax);
+	    
+	/* Compute residual, stored in R[]. */
+	for (i = 0; i < m_loc; ++i) R[i] = B_col[i] - ax[i];
+
+	rnorm = dasum_(&m_loc, R, &inc);
+	xnorm = dasum_(&m_loc, X_col, &inc);
+
+	/* Sum the local residual and solution norms over the process grid. */
+	MPI_Allreduce( &rnorm, &rnorm_g, 1, MPI_DOUBLE, MPI_SUM, grid->comm );
+	MPI_Allreduce( &xnorm, &xnorm_g, 1, MPI_DOUBLE, MPI_SUM, grid->comm );
+		
+	if (xnorm_g <= 0.) {
+	    *resid = 1. / eps;
+	} else {
+	    /* Computing MAX */
+	    double d1, d2;
+	    d1 = *resid;
+	    d2 = rnorm_g / anorm / xnorm_g / eps;
+	    *resid = SUPERLU_MAX(d1, d2);
+	}
+    } /* end for j ... */
+
+    pdgsmv_finalize(&gsmv_comm);
+    SUPERLU_FREE(ax);
+
+    return 0;
+
+} /* pdcompute_resid */
diff --git a/TEST/pdtest.c b/TEST/pdtest.c
new file mode 100644
index 0000000..7666e5a
--- /dev/null
+++ b/TEST/pdtest.c
@@ -0,0 +1,519 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Driver program for testing PDGSSVX.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 30, 2017
+ * </pre>
+ */
+/*
+ * File name:		pdtest.c
+ * Purpose:             MAIN test program
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include "superlu_ddefs.h"
+
+#define NTESTS 1 /*5*/      /* Number of test types */
+#define NTYPES 11     /* Number of matrix types */
+#define NTRAN  2    
+#define THRESH 20.0
+#define FMT1   "%10s:n=%d, test(%d)=%12.5g\n"
+#define	FMT2   "%10s:fact=%4d, DiagScale=%d, n=%d, imat=%d, test(%d)=%12.5g, berr=%12.5g\n"
+#define FMT3   "%10s:info=%d, izero=%d, n=%d, nrhs=%d, imat=%d, nfail=%d\n"
+
+
+static void
+parse_command_line(int argc, char *argv[], int *nprow, int *npcol,
+		   char *matrix_type, int *n, int *relax, int *maxsuper,
+		   int *fill_ratio, int *min_gemm_gpu_offload,
+		   int *nrhs, FILE **fp);
+
+extern int
+pdcompute_resid(int m, int n, int nrhs, SuperMatrix *A,
+		double *x, int ldx, double *b, int ldb,
+		gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct, double *resid);
+
+/*! \brief Copy matrix A into matrix B, in distributed compressed row format. */
+void
+dCopy_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{
+    NRformat_loc *Astore;
+    NRformat_loc *Bstore;
+    int_t i, nnz_loc, m_loc;
+
+    B->Stype = A->Stype;
+    B->Dtype = A->Dtype;
+    B->Mtype = A->Mtype;
+    B->nrow = A->nrow;
+    B->ncol = A->ncol;
+    Astore = (NRformat_loc *) A->Store;
+    Bstore = (NRformat_loc *) B->Store;
+    Bstore->nnz_loc = Astore->nnz_loc;
+    nnz_loc = Astore->nnz_loc;
+    Bstore->m_loc = Astore->m_loc;
+    m_loc = Astore->m_loc;
+    Bstore->fst_row = Astore->fst_row;
+    memcpy(Bstore->nzval, Astore->nzval, nnz_loc * sizeof(double));
+    memcpy(Bstore->colind, Astore->colind, nnz_loc * sizeof(int_t));
+    memcpy(Bstore->rowptr, Astore->rowptr, (m_loc+1) * sizeof(int_t));
+}
+
+/*! \brief Print a summary of the testing results. */
+void
+PrintSumm(char *type, int nfail, int nrun, int nerrs)
+{
+    if ( nfail > 0 )
+	printf("%3s driver: %d out of %d tests failed to pass the threshold\n",
+	       type, nfail, nrun);
+    else
+	printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun);
+
+    if ( nerrs > 0 )
+	printf("%6d error messages recorded\n", nerrs);
+}
+
+int main(int argc, char *argv[])
+{
+/*
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PDTEST is the main test program for the DOUBLE precision linear
+ * equation driver routine PDGSSVX.
+ * 
+ * The program is invoked by the shell script pdtest.sh.
+ * The output from the tests is written to the file pdtest.out.
+ */
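+/*
+ * Summary of the loop nest exercised below:
+ *   for each test matrix read from the -f file      (nimat = 1)
+ *     for each equilibration state                  (iequed: N, R, C, B)
+ *       for up to four Fact settings (all four when iequed == 0,
+ *                                     only FACTORED otherwise)
+ *         for Equil = NO, YES
+ *           optionally pre-equilibrate and pre-factor, then call pdgssvx
+ *           and compare the scaled residual against THRESH.
+ */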
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A, Asave;
+    NRformat_loc *Astore;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    double   *nzval_save;
+    int_t    *colind_save, *rowptr_save;
+    double   *berr, *R, *C;
+    double   *b, *bsave, *xtrue, *solx;
+    int    i, j, m, n, izero = 0;
+    int    nprow, npcol;
+    int    iam, info, ldb, ldx, nrhs;
+    int_t  iinfo;
+    char     **cpp, c;
+    FILE *fp, *fopen();
+    char matrix_type[8], equed[1];
+    int  relax, maxsuper=sp_ienv_dist(3), fill_ratio=sp_ienv_dist(6),
+         min_gemm_gpu_offload=0;
+    int    equil, ifact, nfact, iequil, iequed, prefact, notfactored;
+    int    nt, nrun=0, nfail=0, nerrs=0, imat, fimat=0, nimat=1;
+    fact_t fact;
+    double rowcnd, colcnd, amax;
+    double result[NTESTS];
+
+    /* Fixed set of parameters */
+    int     iseed[]  = {1988, 1989, 1990, 1991};
+    char    equeds[]  = {'N', 'R', 'C', 'B'};
+    DiagScale_t equils[] = {NOEQUIL, ROW, COL, BOTH};
+    fact_t  facts[] = {FACTORED, DOFACT, SamePattern, SamePattern_SameRowPerm};
+    trans_t transs[]  = {NOTRANS, TRANS, CONJ};
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand sides. */
+    for (i = 0; i < NTESTS; ++i) result[i] = 0.0;
+
+    /* Parse command line argv[]. */
+    parse_command_line(argc, argv, &nprow, &npcol, matrix_type, &n,
+		       &relax, &maxsuper,
+		       &fill_ratio, &min_gemm_gpu_offload, &nrhs, &fp);
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID. 
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )	goto out;
+    if ( 0 ) {
+        printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	fflush(stdout);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* Set the default input options. */
+    set_default_options_dist(&options);
+    options.PrintStat = NO;
+	
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+	fflush(stdout);
+    }
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+	
+    /* Loop through all the input options. */
+    for (imat = fimat; imat < nimat; ++imat) { /* All matrix types */
+	//if (!iam) printf("imat loop ... %d\n", imat);
+	/* ------------------------------------------------------------
+	   GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+	   ------------------------------------------------------------*/
+	dcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+
+	m = A.nrow;
+	n = A.ncol;
+
+	if ( !(bsave = doubleMalloc_dist(ldb * nrhs)) )
+	    ABORT("Malloc fails for bsave[]");
+	for (j = 0; j < nrhs; ++j)
+	    for (i = 0; i < ldb; ++i) bsave[i+j*ldb] = b[i+j*ldb];
+
+	/* Save a copy of matrix A in Asave. */
+	Astore = (NRformat_loc *) A.Store;
+	int_t nnz_loc = Astore->nnz_loc;
+	int_t m_loc = Astore->m_loc;
+	nzval_save = (double *) doubleMalloc_dist(nnz_loc);
+	colind_save = (int_t *) intMalloc_dist(nnz_loc);
+	rowptr_save = (int_t *) intMalloc_dist(m_loc + 1);
+	dCreate_CompRowLoc_Matrix_dist(&Asave, m, n, nnz_loc, m_loc, Astore->fst_row,
+				       nzval_save, colind_save, rowptr_save,
+				       SLU_NR_loc, SLU_D, SLU_GE);
+	dCopy_CompRowLoc_Matrix_dist(&A, &Asave);
+
+	for (iequed = 0; iequed < 4; ++iequed) {
+	    int what_equil = equils[iequed];
+	    if (iequed == 0) nfact = 4;
+	    else { /* Only test factored, pre-equilibrated matrix */
+		nfact = 1;
+		options.RowPerm = NOROWPERM; /* Turn off MC64 */
+	    }
+	    //if (!iam) printf("iequed loop ... %d\n", iequed);
+
+	    for (ifact = 0; ifact < nfact; ++ifact) {
+		fact = facts[ifact];
+		options.Fact = fact;
+		//if (!iam) printf("ifact loop ... %d\n", ifact);
+
+		for (equil = 0; equil < 2; ++equil) {
+
+		    //if (!iam) printf("equil loop ... %d\n", equil);
+
+		    options.Equil = equil;
+		    /* Need a first factor */
+		    prefact   = ( options.Fact == FACTORED ||
+				  options.Fact == SamePattern ||
+				  options.Fact == SamePattern_SameRowPerm );
+
+		    /* Restore the matrix A. */
+		    dCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+
+		    /* Initialize ScalePermstruct and LUstruct. */
+		    ScalePermstructInit(m, n, &ScalePermstruct);
+		    LUstructInit(n, &LUstruct);
+
+		    //if ( options.Fact == FACTORED || 
+		    // options.Fact == SamePattern_SameRowPerm ) {
+
+		    if ( prefact ) {
+
+			R = (double *) SUPERLU_MALLOC(m*sizeof(double));
+			C = (double *) SUPERLU_MALLOC(n*sizeof(double));
+			
+			/* Later call to PDGSSVX only needs to solve. */
+                        if ( equil || iequed ) {
+			    /* Compute row and column scale factors to
+			       equilibrate matrix A.    */
+			    pdgsequ(&A, R, C, &rowcnd, &colcnd, &amax, &iinfo,
+				    &grid);
+
+			    /* Force equilibration. */
+			    if ( iinfo==0 && n > 0 ) {
+				if ( what_equil == ROW ) {
+				    rowcnd = 0.;
+				    colcnd = 1.;
+				    ScalePermstruct.DiagScale = ROW;
+				    ScalePermstruct.R = R;
+				} else if ( what_equil == COL ) {
+				    rowcnd = 1.;
+				    colcnd = 0.;
+				    ScalePermstruct.DiagScale = COL;
+				    ScalePermstruct.C = C;
+				} else if ( what_equil == BOTH ) {
+				    rowcnd = 0.;
+				    colcnd = 0.;
+				    ScalePermstruct.DiagScale = BOTH;
+				    ScalePermstruct.R = R;
+				    ScalePermstruct.C = C;
+				}
+			    }
+			
+			    /* Equilibrate the matrix. */
+			    pdlaqgs(&A, R, C, rowcnd, colcnd, amax, equed);
+			    // printf("after pdlaqgs: *equed %c\n", *equed);
+
+			    /* Do not equilibrate again when calling PDGSSVX,
+			     * so there is no malloc/free of {R,C} inside PDGSSVX. */
+			    options.Equil = NO;
+			} /* end if (equil || iequed) */
+		    } /* end if prefact */
+
+		    if ( prefact ) { /* Need a first factor */
+			
+		        /* Save Fact option. */
+		        fact = options.Fact;
+			options.Fact = DOFACT;
+
+			/* Initialize the statistics variables. */
+			PStatInit(&stat);
+	
+			int nrhs1 = 0; /* Only performs factorization */
+			pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs1,
+				&grid, &LUstruct, &SOLVEstruct,
+				berr, &stat, &info);
+
+			if ( info ) {
+			    printf("** First factor: nrun %d: fact %d, info %d, "
+				   "equil %d, what_equil %d, DiagScale %d \n",
+				   nrun, fact, info, equil, what_equil,
+				   ScalePermstruct.DiagScale);
+			}
+
+			PStatFree(&stat);
+
+		        /* Restore Fact option. */
+			options.Fact = fact;
+			if ( fact == SamePattern ) {
+			    // {L,U} not re-used in subsequent call to PDGSSVX.
+			    Destroy_LU(n, &grid, &LUstruct);
+			}
+
+		    } /* end if .. first time factor */
+
+		    /*----------------
+		     * Test pdgssvx
+		     *----------------*/
+
+		    if ( options.Fact != FACTORED ) {
+			/* Restore the matrix A. */
+			dCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+		    } 
+
+		    /* Set the right-hand side. */
+		    dCopy_Dense_Matrix_dist(m_loc, nrhs, bsave, ldb, b, ldb);
+
+		    PStatInit(&stat);
+
+		    /*if ( !iam ) printf("\ttest pdgssvx: nrun %d, iequed %d, equil %d, fact %d\n", 
+		      nrun, iequed, equil, options.Fact);*/
+		    /* Testing PDGSSVX: solve and compute the error bounds. */
+		    pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs,
+			    &grid, &LUstruct, &SOLVEstruct,
+			    berr, &stat, &info);
+
+		    PStatFree(&stat);
+#if 0
+		    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+				     nrhs, b, ldb, xtrue, ldx, &grid);
+#endif
+		    /*		    if ( info && info != izero ) {*/
+		    if ( info ) {
+			printf(FMT3, "pdgssvx",info,izero,n,nrhs,imat,nfail);
+		    } else {
+			/* Restore the matrix A. */
+			dCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+
+			/* Compute residual of the computed solution.*/
+			solx = b;
+			pdcompute_resid(m, n, nrhs, &A, solx, ldx, bsave, ldb,
+					&grid, &SOLVEstruct, &result[0]);
+			
+#if 0  /* how to get RCOND? */
+			/* Check solution accuracy from generated exact solution. */
+			dgst04(n, nrhs, solx, ldx, xact, ldx, rcond,
+					  &result[2]);
+			pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+					 nrhs, b, ldb, xtrue, ldx, &grid);
+#endif
+
+			/* Print information about the tests that did
+			   not pass the threshold.    */
+			int k1 = 0;
+			for (i = k1; i < NTESTS; ++i) {
+			    if ( result[i] >= THRESH ) {
+				printf(FMT2, "pdgssvx", options.Fact, 
+				       ScalePermstruct.DiagScale,
+				       n, imat, i, result[i], berr[0]);
+				++nfail;
+			    }
+			}
+			nrun += NTESTS;
+		    } /* end else .. info == 0 */
+		   
+		    /* -----------------------------------------------------
+		       Deallocate storage associated with {L,U}.
+		       ----------------------------------------------------- */
+		    if ( prefact ) {
+			SUPERLU_FREE(R);
+			SUPERLU_FREE(C);
+			ScalePermstruct.DiagScale = NOEQUIL; /* Avoid free R/C again. */
+		    }
+		    ScalePermstructFree(&ScalePermstruct);
+		    Destroy_LU(n, &grid, &LUstruct);
+		    LUstructFree(&LUstruct);
+		    if ( options.SolveInitialized ) {
+			dSolveFinalize(&options, &SOLVEstruct);
+		    }
+
+		} /* end for equil ... */
+		    
+	    } /* end for ifact ... */
+		
+	} /* end for iequed ... */
+	
+	/* ------------------------------------------------------------
+	   DEALLOCATE STORAGE.
+	   ------------------------------------------------------------*/
+	Destroy_CompRowLoc_Matrix_dist(&A);
+	Destroy_CompRowLoc_Matrix_dist(&Asave);
+	//	ScalePermstructFree(&ScalePermstruct);
+	SUPERLU_FREE(b);
+	SUPERLU_FREE(bsave);
+	SUPERLU_FREE(xtrue);
+
+    } /* end for imat ... */
+
+    /* Print a summary of the testing results. */
+    if ( iam==0 ) PrintSumm("DGS", nfail, nrun, nerrs);
+
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+/*  
+ * Parse command line options to get various input parameters.
+ */
+static void
+parse_command_line(int argc, char *argv[], int *nprow, int *npcol,
+		   char *matrix_type, int *n, int *relax, int *maxsuper,
+		   int *fill_ratio, int *min_gemm_gpu_offload,
+		   int *nrhs, FILE **fp)
+{
+    int c;
+    extern char *optarg;
+    char  str[20];
+
+    while ( (c = getopt(argc, argv, "hr:c:t:n:x:m:b:g:s:f:")) != EOF ) {
+	switch (c) {
+	  case 'h':
+	    printf("Options:\n");
+	    printf("\t-r <int> - process rows\n");
+	    printf("\t-c <int> - process columns\n");
+	    printf("\t-n <int> - matrix dimension\n");
+	    printf("\t-x <int> - granularity of relaxed supernodes\n");
+	    printf("\t-m <int> - maximum size of supernode\n");
+	    printf("\t-b <int> - estimated fill ratio to allocate storage\n");
+	    printf("\t-g <int> - minimum size of GEMM to offload to GPU\n");
+	    printf("\t-s <int> - number of right-hand sides\n");
+	    printf("\t-f <char[]> - file name storing a sparse matrix\n");
+	    exit(1);
+	    break;
+	  case 'r': *nprow = atoi(optarg);
+	            break;
+	  case 'c': *npcol = atoi(optarg);
+	            break;
+	  case 'n': *n = atoi(optarg);
+	            break;
+	  case 'x': c = atoi(optarg); 
+	            sprintf(str, "%d", c);
+	            setenv("NREL", str, 1);
+	            //printf("Reset relax env. variable to %d\n", c);
+	            break;
+	  case 'm': c = atoi(optarg); 
+	            sprintf(str, "%d", c);
+		    setenv("NSUP", str, 1);
+		    //printf("Reset maxsuper env. variable to %d\n", c);
+	            break;
+	  case 'b': c = atoi(optarg); 
+	            sprintf(str, "%d", c);
+		    setenv("FILL", str, 1);
+		    //printf("Reset fill_ratio env. variable to %d\n", c);
+	            break;
+	  case 'g': c = atoi(optarg); 
+	            sprintf(str, "%d", c);
+		    setenv("N_GEMM", str, 1);
+		    //printf("Reset min_gemm_gpu_offload env. variable to %d\n", c);
+	            break;
+	  case 's': *nrhs = atoi(optarg); 
+	            break;
+          case 'f':
+                    if ( !(*fp = fopen(optarg, "r")) ) {
+                        ABORT("File does not exist");
+                    }
+                    //printf(".. test sparse matrix in file: %s\n", optarg);
+                    break;
+  	}
+    }
+}
+
+int cpp_defs()
+{
+    printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+    printf("....\n");
+    return 0;
+}
diff --git a/TEST/pdtest.sh b/TEST/pdtest.sh
new file mode 100755
index 0000000..8ca51a3
--- /dev/null
+++ b/TEST/pdtest.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# bash hint: == is for string comparisons, -eq is for numeric ones.
+
+ofile=pdtest.out			# output file
+if [ -e $ofile ]; then
+    rm -f $ofile
+fi
+echo "Double-precision testing output" > $ofile
+
+MATRICES=(../EXAMPLE/g20.rua)
+NPROWS="1 2"
+NPCOLS="1 3"
+NVAL="9 19"
+NRHS="1 3"
+FILLRATIO="2 6"
+# following are blocking parameters, see sp_ienv.c
+RELAX="4 8"
+SUPERSIZE="10 20"
+MINGEMM="10000"
+
+##
+# Loop through all matrices ...
+#
+for mat in "${MATRICES[@]}"; do
+
+  #--------------------------------------------
+  # Test matrix types generated in LAPACK-style
+  #--------------------------------------------
+  if  [ "$mat" == "LAPACK" ]; then
+      echo '== LAPACK test matrices' >> $ofile
+      for n in $NVAL ; do
+        for s in $NRHS ; do
+	    echo '' >> $ofile
+            echo 'n='$n 'nrhs='$s >> $ofile
+	      mpiexec -n 2 pdtest -r 1 -c 2 -x 4 -m 10 -b 5 -s 1 >> $ofile
+        done
+      done
+  #--------------------------------------------
+  # Test a specified sparse matrix
+  #--------------------------------------------
+  else
+    echo '' >> $ofile
+    echo '== sparse matrix:' $mat >> $ofile
+    for s in $NRHS; do
+      for r in $NPROWS; do
+	for c in $NPCOLS; do
+	  np=$(($r*$c))
+	  for b in $FILLRATIO; do
+	    for x in $RELAX; do
+	      for m in $SUPERSIZE; do
+		echo '' >> $ofile
+   	        echo "**-- nrhs = $s, process grid = $r X $c, fill $b, relax $x, max-super $m"
+   	        echo "**-- nrhs = $s, process grid = $r X $c, fill $b, relax $x, max-super $m" >> $ofile
+		mpiexec -n $np pdtest -r $r -c $c -x $x -m $m -b $b -s 1 -f $mat >> $ofile
+	      done
+	    done
+	  done
+	done
+      done
+    done
+  fi
+done
+
diff --git a/TEST/pzcompute_resid.c b/TEST/pzcompute_resid.c
new file mode 100644
index 0000000..0c29fac
--- /dev/null
+++ b/TEST/pzcompute_resid.c
@@ -0,0 +1,154 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Test for small residual.
+ *
+ * -- Distributed SuperLU routine (version 5.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 30, 2017
+ *
+ */
+#include "superlu_zdefs.h"
+
+int pzcompute_resid(int m, int n, int nrhs, SuperMatrix *A,
+		    doublecomplex *x, int ldx, doublecomplex *b, int ldb,
+		    gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct, double *resid)
+{
+/*  
+    Purpose   
+    =======   
+
+    PZCOMPUTE_RESID computes the residual for a solution of a system of linear   
+    equations  A*x = b  or  A'*x = b:   
+       RESID = norm(B - A*X) / ( norm(A) * norm(X) * EPS ),   
+    where EPS is the machine epsilon.   
+
+    Arguments   
+    =========   
+
+    M       (input) INTEGER   
+            The number of rows of the matrix A.  M >= 0.   
+
+    N       (input) INTEGER   
+            The number of columns of the matrix A.  N >= 0.   
+
+    NRHS    (input) INTEGER   
+            The number of columns of B, the matrix of right hand sides.   
+            NRHS >= 0.
+	    
+    A       (input/output) SuperMatrix*
+            The original M x N sparse matrix A.   
+	    On exit, the column indices are modified due to SPMV setup.
+
+    X       (input) DOUBLE COMPLEX PRECISION array, dimension (LDX,NRHS)   
+            The computed solution vectors for the system of linear   
+            equations.   
+
+    LDX     (input) INTEGER   
+            The leading dimension of the array X.  If TRANS = NOTRANS,   
+            LDX >= max(1,N); if TRANS = TRANS or CONJ, LDX >= max(1,M).   
+
+    B       (input/output) DOUBLE COMPLEX PRECISION array, dimension (LDB,NRHS)   
+            On entry, the right hand side vectors for the system of   
+            linear equations.   
+            On exit, B is overwritten with the difference B - A*X.   
+
+    LDB     (input) INTEGER   
+            The leading dimension of the array B.  IF TRANS = NOTRANS,
+            LDB >= max(1,M); if TRANS = TRANS or CONJ, LDB >= max(1,N).
+
+    SOLVEstruct (input) SOLVEstruct_t*
+
+    GRID    (input) gridinfo_t*
+	    
+    RESID   (output) double PRECISION   
+            The maximum over the number of right-hand sides of
+            norm(B - A*X) / ( norm(A) * norm(X) * EPS ).   
+
+    =====================================================================
+*/
+
+    /* Table of constant values */
+    int    inc  = 1;
+    
+    /* Local variables */
+    int i, j;
+    double anorm, rnorm, rnorm_g;
+    double xnorm, xnorm_g;
+    double eps;
+    char transc[1];
+    doublecomplex *ax, *R;
+    pzgsmv_comm_t gsmv_comm; 
+    int m_loc = ((NRformat_loc*) A->Store)->m_loc;
+
+    /* Function prototypes */
+    extern double dzasum_(int *, doublecomplex *, int *);
+    
+    /* Function Body */
+    if ( m <= 0 || n <= 0 || nrhs == 0) {
+	*resid = 0.;
+	return 0;
+    }
+
+    /* Exit with RESID = 1/EPS if ANORM = 0. */
+    eps = dmach_dist("Epsilon");
+    anorm = pzlangs("1", A, grid);
+    if (anorm <= 0.) {
+	*resid = 1. / eps;
+	return 0;
+    }
+
+    if ( !(ax = doublecomplexMalloc_dist(m_loc)) ) ABORT("Malloc fails for work[]");
+    R = ax;
+
+    /* A is modified with colind[] permuted to [internal, external]. */
+    pzgsmv_init(A, SOLVEstruct->row_to_proc, grid, &gsmv_comm);
+
+    /* Compute the maximum over the number of right-hand sides of   
+       norm(B - A*X) / ( norm(A) * norm(X) * EPS ) . */
+    *resid = 0.;
+    for (j = 0; j < nrhs; ++j) {
+	doublecomplex *B_col = &b[j*ldb];
+	doublecomplex *X_col = &x[j*ldx];
+
+	/* Compute residual R = B - op(A) * X,   
+	   where op(A) = A, A**T, or A**H, depending on TRANS. */
+	/* Matrix-vector multiply. */
+	pzgsmv(0, A, grid, &gsmv_comm, X_col, ax);
+	    
+	/* Compute residual, stored in R[]. */
+	for (i = 0; i < m_loc; ++i) z_sub(&R[i], &B_col[i], &ax[i]);
+
+	rnorm = dzasum_(&m_loc, R, &inc);
+	xnorm = dzasum_(&m_loc, X_col, &inc);
+
+	/* Sum the local residual and solution norms over the process grid. */
+	MPI_Allreduce( &rnorm, &rnorm_g, 1, MPI_DOUBLE, MPI_SUM, grid->comm );
+	MPI_Allreduce( &xnorm, &xnorm_g, 1, MPI_DOUBLE, MPI_SUM, grid->comm );
+		
+	if (xnorm_g <= 0.) {
+	    *resid = 1. / eps;
+	} else {
+	    /* Computing MAX */
+	    double d1, d2;
+	    d1 = *resid;
+	    d2 = rnorm_g / anorm / xnorm_g / eps;
+	    *resid = SUPERLU_MAX(d1, d2);
+	}
+    } /* end for j ... */
+
+    pzgsmv_finalize(&gsmv_comm);
+    SUPERLU_FREE(ax);
+
+    return 0;
+
+} /* pzcompute_resid */
diff --git a/TEST/pztest.c b/TEST/pztest.c
new file mode 100644
index 0000000..dadb503
--- /dev/null
+++ b/TEST/pztest.c
@@ -0,0 +1,518 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file 
+ * \brief Driver program for testing PZGSSVX.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 30, 2017
+ * </pre>
+ */
+/*
+ * File name:		pztest.c
+ * Purpose:             MAIN test program
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include "superlu_zdefs.h"
+
+#define NTESTS 1 /*5*/      /* Number of test types */
+#define NTYPES 11     /* Number of matrix types */
+#define NTRAN  2    
+#define THRESH 20.0
+#define FMT1   "%10s:n=%d, test(%d)=%12.5g\n"
+#define	FMT2   "%10s:fact=%4d, DiagScale=%d, n=%d, imat=%d, test(%d)=%12.5g, berr=%12.5g\n"
+#define FMT3   "%10s:info=%d, izero=%d, n=%d, nrhs=%d, imat=%d, nfail=%d\n"
+
+
+static void
+parse_command_line(int argc, char *argv[], int *nprow, int *npcol,
+		   char *matrix_type, int *n, int *relax, int *maxsuper,
+		   int *fill_ratio, int *min_gemm_gpu_offload,
+		   int *nrhs, FILE **fp);
+
+extern int
+pzcompute_resid(int m, int n, int nrhs, SuperMatrix *A,
+		doublecomplex *x, int ldx, doublecomplex *b, int ldb,
+		gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct, double *resid);
+
+/*! \brief Copy matrix A into matrix B, in distributed compressed row format. */
+void
+zCopy_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{
+    NRformat_loc *Astore;
+    NRformat_loc *Bstore;
+    int_t i, nnz_loc, m_loc;
+
+    B->Stype = A->Stype;
+    B->Dtype = A->Dtype;
+    B->Mtype = A->Mtype;
+    B->nrow = A->nrow;
+    B->ncol = A->ncol;
+    Astore = (NRformat_loc *) A->Store;
+    Bstore = (NRformat_loc *) B->Store;
+    Bstore->nnz_loc = Astore->nnz_loc;
+    nnz_loc = Astore->nnz_loc;
+    Bstore->m_loc = Astore->m_loc;
+    m_loc = Astore->m_loc;
+    Bstore->fst_row = Astore->fst_row;
+    memcpy(Bstore->nzval, Astore->nzval, nnz_loc * sizeof(doublecomplex));
+    memcpy(Bstore->colind, Astore->colind, nnz_loc * sizeof(int_t));
+    memcpy(Bstore->rowptr, Astore->rowptr, (m_loc+1) * sizeof(int_t));
+}
+
+/*! \brief Print a summary of the testing results. */
+void
+PrintSumm(char *type, int nfail, int nrun, int nerrs)
+{
+    if ( nfail > 0 )
+	printf("%3s driver: %d out of %d tests failed to pass the threshold\n",
+	       type, nfail, nrun);
+    else
+	printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun);
+
+    if ( nerrs > 0 )
+	printf("%6d error messages recorded\n", nerrs);
+}
+
+int main(int argc, char *argv[])
+{
+/*
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PZTEST is the main test program for the DOUBLE COMPLEX linear
+ * equation driver routine PZGSSVX.
+ * 
+ * The program is invoked by the shell script pztest.sh.
+ * The output from the tests is written to the file pztest.out.
+ */
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A, Asave;
+    NRformat_loc *Astore;
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    doublecomplex   *nzval_save;
+    int_t    *colind_save, *rowptr_save;
+    double   *berr, *R, *C;
+    doublecomplex   *b, *bsave, *xtrue, *solx;
+    int    i, j, m, n, izero = 0;
+    int    nprow, npcol;
+    int    iam, info, ldb, ldx, nrhs;
+    int_t  iinfo;
+    char     **cpp, c;
+    FILE *fp, *fopen();
+    char matrix_type[8], equed[1];
+    int  relax, maxsuper=sp_ienv_dist(3), fill_ratio=sp_ienv_dist(6),
+         min_gemm_gpu_offload=0;
+    int    equil, ifact, nfact, iequil, iequed, prefact, notfactored;
+    int    nt, nrun=0, nfail=0, nerrs=0, imat, fimat=0, nimat=1;
+    fact_t fact;
+    double rowcnd, colcnd, amax;
+    double result[NTESTS];
+
+    /* Fixed set of parameters */
+    int     iseed[]  = {1988, 1989, 1990, 1991};
+    char    equeds[]  = {'N', 'R', 'C', 'B'};
+    DiagScale_t equils[] = {NOEQUIL, ROW, COL, BOTH};
+    fact_t  facts[] = {FACTORED, DOFACT, SamePattern, SamePattern_SameRowPerm};
+    trans_t transs[]  = {NOTRANS, TRANS, CONJ};
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    nrhs = 1;   /* Number of right-hand sides. */
+    for (i = 0; i < NTESTS; ++i) result[i] = 0.0;
+
+    /* Parse command line argv[]. */
+    parse_command_line(argc, argv, &nprow, &npcol, matrix_type, &n,
+		       &relax, &maxsuper,
+		       &fill_ratio, &min_gemm_gpu_offload, &nrhs, &fp);
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT. 
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID. 
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam >= nprow * npcol )	goto out;
+    if ( 0 ) {
+        printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+	fflush(stdout);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* Set the default input options. */
+    set_default_options_dist(&options);
+    options.PrintStat = NO;
+	
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+	fflush(stdout);
+    }
+
+    if ( !(berr = doubleMalloc_dist(nrhs)) )
+	ABORT("Malloc fails for berr[].");
+	
+    /* Loop through all the input options. */
+    for (imat = fimat; imat < nimat; ++imat) { /* All matrix types */
+	//if (!iam) printf("imat loop ... %d\n", imat);
+	/* ------------------------------------------------------------
+	   GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+	   ------------------------------------------------------------*/
+	zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+
+	m = A.nrow;
+	n = A.ncol;
+
+	if ( !(bsave = doublecomplexMalloc_dist(ldb * nrhs)) )
+	    ABORT("Malloc fails for bsave[]");
+	for (j = 0; j < nrhs; ++j)
+	    for (i = 0; i < ldb; ++i) bsave[i+j*ldb] = b[i+j*ldb];
+
+	/* Save a copy of matrix A in Asave. */
+	Astore = (NRformat_loc *) A.Store;
+	int_t nnz_loc = Astore->nnz_loc;
+	int_t m_loc = Astore->m_loc;
+	nzval_save = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc);
+	colind_save = (int_t *) intMalloc_dist(nnz_loc);
+	rowptr_save = (int_t *) intMalloc_dist(m_loc + 1);
+	zCreate_CompRowLoc_Matrix_dist(&Asave, m, n, nnz_loc, m_loc, Astore->fst_row,
+				       nzval_save, colind_save, rowptr_save,
+				       SLU_NR_loc, SLU_Z, SLU_GE);
+	zCopy_CompRowLoc_Matrix_dist(&A, &Asave);
+
+	for (iequed = 0; iequed < 4; ++iequed) {
+	    int what_equil = equils[iequed];
+	    if (iequed == 0) nfact = 4;
+	    else { /* Only test factored, pre-equilibrated matrix */
+		nfact = 1;
+		options.RowPerm = NOROWPERM; /* Turn off MC64 */
+	    }
+	    //if (!iam) printf("iequed loop ... %d\n", iequed);
+
+	    for (ifact = 0; ifact < nfact; ++ifact) {
+		fact = facts[ifact];
+		options.Fact = fact;
+		//if (!iam) printf("ifact loop ... %d\n", ifact);
+
+		for (equil = 0; equil < 2; ++equil) {
+
+		    //if (!iam) printf("equil loop ... %d\n", equil);
+
+		    options.Equil = equil;
+		    /* Need a first factor */
+		    prefact   = ( options.Fact == FACTORED ||
+				  options.Fact == SamePattern ||
+				  options.Fact == SamePattern_SameRowPerm );
+
+		    /* Restore the matrix A. */
+		    zCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+
+		    /* Initialize ScalePermstruct and LUstruct. */
+		    ScalePermstructInit(m, n, &ScalePermstruct);
+		    LUstructInit(n, &LUstruct);
+
+		    //if ( options.Fact == FACTORED || 
+		    // options.Fact == SamePattern_SameRowPerm ) {
+
+		    if ( prefact ) {
+
+			R = (double *) SUPERLU_MALLOC(m*sizeof(double));
+			C = (double *) SUPERLU_MALLOC(n*sizeof(double));
+			
+			/* Later call to PZGSSVX only needs to solve. */
+                        if ( equil || iequed ) {
+			    /* Compute row and column scale factors to
+			       equilibrate matrix A.    */
+			    pzgsequ(&A, R, C, &rowcnd, &colcnd, &amax, &iinfo,
+				    &grid);
+
+			    /* Force equilibration. */
+			    if ( iinfo==0 && n > 0 ) {
+				if ( what_equil == ROW ) {
+				    rowcnd = 0.;
+				    colcnd = 1.;
+				    ScalePermstruct.DiagScale = ROW;
+				    ScalePermstruct.R = R;
+				} else if ( what_equil == COL ) {
+				    rowcnd = 1.;
+				    colcnd = 0.;
+				    ScalePermstruct.DiagScale = COL;
+				    ScalePermstruct.C = C;
+				} else if ( what_equil == BOTH ) {
+				    rowcnd = 0.;
+				    colcnd = 0.;
+				    ScalePermstruct.DiagScale = BOTH;
+				    ScalePermstruct.R = R;
+				    ScalePermstruct.C = C;
+				}
+			    }
+			
+			    /* Equilibrate the matrix. */
+			    pzlaqgs(&A, R, C, rowcnd, colcnd, amax, equed);
+			    // printf("after pdlaqgs: *equed %c\n", *equed);
+
+			    /* Do not equilibrate again when calling PZGSSVX,
+			     * so there is no malloc/free of {R,C} inside PZGSSVX. */
+			    options.Equil = NO;
+			} /* end if (equil || iequed) */
+		    } /* end if prefact */
+
+		    if ( prefact ) { /* Need a first factor */
+			
+		        /* Save Fact option. */
+		        fact = options.Fact;
+			options.Fact = DOFACT;
+
+			/* Initialize the statistics variables. */
+			PStatInit(&stat);
+	
+			int nrhs1 = 0; /* Only performs factorization */
+			pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs1,
+				&grid, &LUstruct, &SOLVEstruct,
+				berr, &stat, &info);
+
+			if ( info ) {
+			    printf("** First factor: nrun %d: fact %d, info %d, "
+				   "equil %d, what_equil %d, DiagScale %d \n",
+				   nrun, fact, info, equil, what_equil,
+				   ScalePermstruct.DiagScale);
+			}
+
+			PStatFree(&stat);
+
+		        /* Restore Fact option. */
+			options.Fact = fact;
+			if ( fact == SamePattern ) {
+			    // {L,U} not re-used in subsequent call to PZGSSVX.
+			    Destroy_LU(n, &grid, &LUstruct);
+			}
+
+		    } /* end if .. first time factor */
+
+		    /*----------------
+		     * Test pzgssvx
+		     *----------------*/
+
+		    if ( options.Fact != FACTORED ) {
+			/* Restore the matrix A. */
+			zCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+		    } 
+
+		    /* Set the right-hand side. */
+		    zCopy_Dense_Matrix_dist(m_loc, nrhs, bsave, ldb, b, ldb);
+
+		    PStatInit(&stat);
+
+		    /*if ( !iam ) printf("\ttest pdgssvx: nrun %d, iequed %d, equil %d, fact %d\n", 
+		      nrun, iequed, equil, options.Fact);*/
+		    /* Testing PZGSSVX: solve and compute the error bounds. */
+		    pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs,
+			    &grid, &LUstruct, &SOLVEstruct,
+			    berr, &stat, &info);
+
+		    PStatFree(&stat);
+#if 0
+		    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+				     nrhs, b, ldb, xtrue, ldx, &grid);
+#endif
+		    /*		    if ( info && info != izero ) {*/
+		    if ( info ) {
+			printf(FMT3, "pzgssvx",info,izero,n,nrhs,imat,nfail);
+		    } else {
+			/* Restore the matrix A. */
+			zCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+
+			/* Compute residual of the computed solution.*/
+			solx = b;
+			pzcompute_resid(m, n, nrhs, &A, solx, ldx, bsave, ldb,
+					&grid, &SOLVEstruct, &result[0]);
+			
+#if 0  /* how to get RCOND? */
+			/* Check solution accuracy from generated exact solution. */
+			dgst04(n, nrhs, solx, ldx, xact, ldx, rcond,
+					  &result[2]);
+			pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+					 nrhs, b, ldb, xtrue, ldx, &grid);
+#endif
+
+			/* Print information about the tests that did
+			   not pass the threshold.    */
+			int k1 = 0;
+			for (i = k1; i < NTESTS; ++i) {
+			    if ( result[i] >= THRESH ) {
+				printf(FMT2, "pzgssvx", options.Fact, 
+				       ScalePermstruct.DiagScale,
+				       n, imat, i, result[i], berr[0]);
+				++nfail;
+			    }
+			}
+			nrun += NTESTS;
+		    } /* end else .. info == 0 */
+		   
+		    /* -----------------------------------------------------
+		       Deallocate storage associated with {L,U}.
+		       ----------------------------------------------------- */
+		    if ( prefact ) {
+			SUPERLU_FREE(R);
+			SUPERLU_FREE(C);
+			ScalePermstruct.DiagScale = NOEQUIL; /* Avoid free R/C again. */
+		    }
+		    ScalePermstructFree(&ScalePermstruct);
+		    Destroy_LU(n, &grid, &LUstruct);
+		    LUstructFree(&LUstruct);
+		    if ( options.SolveInitialized ) {
+			zSolveFinalize(&options, &SOLVEstruct);
+		    }
+
+		} /* end for equil ... */
+		    
+	    } /* end for ifact ... */
+		
+	} /* end for iequed ... */
+	
+	/* ------------------------------------------------------------
+	   DEALLOCATE STORAGE.
+	   ------------------------------------------------------------*/
+	Destroy_CompRowLoc_Matrix_dist(&A);
+	Destroy_CompRowLoc_Matrix_dist(&Asave);
+	//	ScalePermstructFree(&ScalePermstruct);
+	SUPERLU_FREE(b);
+	SUPERLU_FREE(bsave);
+	SUPERLU_FREE(xtrue);
+
+    } /* end for imat ... */
+
+    /* Print a summary of the testing results. */
+    if ( iam==0 ) PrintSumm("ZGS", nfail, nrun, nerrs);
+
+    SUPERLU_FREE(berr);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+/*  
+ * Parse command line options to get various input parameters.
+ */
+static void
+parse_command_line(int argc, char *argv[], int *nprow, int *npcol,
+		   char *matrix_type, int *n, int *relax, int *maxsuper,
+		   int *fill_ratio, int *min_gemm_gpu_offload,
+		   int *nrhs, FILE **fp)
+{
+    int c;
+    extern char *optarg;
+    char  str[20];
+
+    while ( (c = getopt(argc, argv, "hr:c:t:n:x:m:b:g:s:f:")) != EOF ) {
+	switch (c) {
+	  case 'h':
+	    printf("Options:\n");
+	    printf("\t-r <int> - process rows\n");
+	    printf("\t-c <int> - process columns\n");
+	    printf("\t-n <int> - matrix dimension\n");
+	    printf("\t-x <int> - granularity of relaxed supernodes\n");
+	    printf("\t-m <int> - maximum size of supernode\n");
+	    printf("\t-b <int> - estimated fill ratio to allocate storage\n");
+	    printf("\t-g <int> - minimum size of GEMM to offload to GPU\n");
+	    printf("\t-s <int> - number of right-hand sides\n");
+	    printf("\t-f <char[]> - file name storing a sparse matrix\n");
+	    exit(1);
+	    break;
+	  case 'r': *nprow = atoi(optarg);
+	            break;
+	  case 'c': *npcol = atoi(optarg);
+	            break;
+	  case 'n': *n = atoi(optarg);
+	            break;
+	  case 'x': c = atoi(optarg); 
+	            sprintf(str, "%d", c);
+	            setenv("NREL", str, 1);
+	            //printf("Reset relax env. variable to %d\n", c);
+	            break;
+	  case 'm': c = atoi(optarg); 
+	            sprintf(str, "%d", c);
+		    setenv("NSUP", str, 1);
+		    //printf("Reset maxsuper env. variable to %d\n", c);
+	            break;
+	  case 'b': c = atoi(optarg); 
+	            sprintf(str, "%d", c);
+		    setenv("FILL", str, 1);
+		    //printf("Reset fill_ratio env. variable to %d\n", c);
+	            break;
+	  case 'g': c = atoi(optarg); 
+	            sprintf(str, "%d", c);
+		    setenv("N_GEMM", str, 1);
+		    //printf("Reset min_gemm_gpu_offload env. variable to %d\n", c);
+	            break;
+	  case 's': *nrhs = atoi(optarg); 
+	            break;
+          case 'f':
+                    if ( !(*fp = fopen(optarg, "r")) ) {
+                        ABORT("File does not exist");
+                    }
+                    //printf(".. test sparse matrix in file: %s\n", optarg);
+                    break;
+  	}
+    }
+}
+
+int cpp_defs()
+{
+    printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+    printf("....\n");
+    return 0;
+}
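
For reference, the options parsed by parse_command_line() above are the same ones
exercised by the pztest.sh driver added below. A hand-run invocation of the test
binary might look like the following sketch (process grid, blocking parameters and
matrix file are illustrative values only; cg20.cua is the sample matrix shipped
under EXAMPLE/):

    mpiexec -n 4 ./pztest -r 2 -c 2 -x 8 -m 20 -b 5 -s 1 -f ../EXAMPLE/cg20.cua
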
diff --git a/TEST/pztest.sh b/TEST/pztest.sh
new file mode 100755
index 0000000..d7956aa
--- /dev/null
+++ b/TEST/pztest.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# bash hint: == is for string comparisons, -eq is for numeric ones.
+
+ofile=pztest.out			# output file
+if [ -e $ofile ]; then
+    rm -f $ofile
+fi
+echo "Double-complex testing output" > $ofile
+
+MATRICES=(../EXAMPLE/cg20.cua)
+NPROWS="1 2"
+NPCOLS="1 3"
+NVAL="9 19"
+NRHS="1 3"
+FILLRATIO="2 6"
+# following are blocking parameters, see sp_ienv.c
+RELAX="4 8"
+SUPERSIZE="10 20"
+MINGEMM="10000"
+
+##
+# Loop through all matrices ...
+#
+for mat in "${MATRICES[@]}"; do
+
+  #--------------------------------------------
+  # Test matrix types generated in LAPACK-style
+  #--------------------------------------------
+  if  [ "$mat" == "LAPACK" ]; then
+      echo '== LAPACK test matrices' >> $ofile
+      for n in $NVAL ; do
+        for s in $NRHS ; do
+	    echo '' >> $ofile
+            echo 'n='$n 'nrhs='$s >> $ofile
+	      mpiexec -n 2 pztest -r 1 -c 2 -n $n -x 4 -m 10 -b 5 -s $s >> $ofile
+        done
+      done
+  #--------------------------------------------
+  # Test a specified sparse matrix
+  #--------------------------------------------
+  else
+    echo '' >> $ofile
+    echo '== sparse matrix:' $mat >> $ofile
+    for s in $NRHS; do
+      for r in $NPROWS; do
+	for c in $NPCOLS; do
+	  np=$(($r*$c))
+	  for b in $FILLRATIO; do
+	    for x in $RELAX; do
+	      for m in $SUPERSIZE; do
+		echo '' >> $ofile
+   	        echo "**-- nrhs = $s, process grid = $r X $c, fill $b, relax $x, max-super $m"
+   	        echo "**-- nrhs = $s, process grid = $r X $c, fill $b, relax $x, max-super $m" >> $ofile
+		mpiexec -n $np pztest -r $r -c $c -x $x -m $m -b $b -s $s -f $mat >> $ofile
+	      done
+	    done
+	  done
+	done
+      done
+    done
+  fi
+done
+
diff --git a/TEST/runtest.cmake b/TEST/runtest.cmake
new file mode 100644
index 0000000..93ae6a1
--- /dev/null
+++ b/TEST/runtest.cmake
@@ -0,0 +1,13 @@
+# execute the test command that was added earlier.
+execute_process( COMMAND ${TEST}
+                 OUTPUT_FILE ${OUTPUT}
+		 RESULT_VARIABLE RET )
+file(APPEND ${ALL_OUTPUT} ${HEADING})
+file(APPEND ${ALL_OUTPUT} "\n")
+file(READ ${OUTPUT} SINGLE_OUTPUT)
+file(APPEND ${ALL_OUTPUT} "${SINGLE_OUTPUT}")
+#file(REMOVE ${OUTPUT})   # remove the individual output file.
+
+if (NOT "${RET}" STREQUAL "0")
+   message (FATAL_ERROR "TEST FAILED!")
+endif()
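
This wrapper is intended to run in CMake script mode, with TEST, OUTPUT, ALL_OUTPUT
and HEADING supplied as -D definitions (the add_test() wiring that passes them is
presumably in the new TEST CMakeLists.txt, which is not shown in this hunk). Under
that assumption, a standalone invocation would look roughly like the sketch below;
note that a multi-word command must be given to execute_process() as a
semicolon-separated list:

    cmake -DTEST="mpiexec;-n;2;./pztest;-r;1;-c;2" \
          -DOUTPUT=pztest_1x2.out \
          -DALL_OUTPUT=pztest.out \
          -DHEADING="nrhs=1, process grid 1 X 2" \
          -P runtest.cmake
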
diff --git a/EXAMPLE/zcreate_matrix.c b/TEST/zcreate_matrix.c
similarity index 97%
copy from EXAMPLE/zcreate_matrix.c
copy to TEST/zcreate_matrix.c
index 87774cf..8660143 100644
--- a/EXAMPLE/zcreate_matrix.c
+++ b/TEST/zcreate_matrix.c
@@ -88,9 +88,14 @@ int zcreate_matrix(SuperMatrix *A, int nrhs, doublecomplex **rhs,
 #endif
 
     if ( !iam ) {
+        double t = SuperLU_timer_();
+
         /* Read the matrix stored on disk in Harwell-Boeing format. */
         zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
 
+	printf("Time to read and distribute matrix %.2f\n", 
+	        SuperLU_timer_() - t);  fflush(stdout);
+
 	/* Broadcast matrix A to the other PEs. */
 	MPI_Bcast( &m,     1,   mpi_int_t,  0, grid->comm );
 	MPI_Bcast( &n,     1,   mpi_int_t,  0, grid->comm );
diff --git a/compile.out b/compile.out
new file mode 100644
index 0000000..590cd80
--- /dev/null
+++ b/compile.out
@@ -0,0 +1,62 @@
+( cd SRC; make )
+make[1]: Entering directory `/home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_5.2.2/SRC'
+printf "/* #define XSDK_INDEX_SIZE 64 */\n" > superlu_dist_config.h
+printf "#if (XSDK_INDEX_SIZE == 64)\n#define _LONGINT 1\n#endif\n" >> superlu_dist_config.h
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dlangs_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dgsequ_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dlaqgs_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dutil_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dmemory_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dmyblas2_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dsp_blas2_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dsp_blas3_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgssvx.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgssvx_ABglobal.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dreadhb.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dreadrb.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dreadtriple.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dreadMM.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgsequ.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdlaqgs.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dldperm_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdlangs.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdutil.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdsymbfact_distdata.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c ddistribute.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pddistribute.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgstrf.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgstrf2.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdGetDiagU.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgstrs.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgstrs1.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgstrs_lsum.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgstrs_Bglobal.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgsrfs.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgsmv.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgsrfs_ABXglobal.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pdgsmv_AXglobal.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c sp_ienv.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c etree.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c sp_colorder.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c get_perm_c.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c colamd.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c mmd.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c comm.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c memory.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c util.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c superlu_grid.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c pxerr_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c superlu_timer.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c symbfact.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c psymbfact.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c psymbfact_util.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c get_perm_c_parmetis.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c mc64ad_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c static_schedule.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c xerr_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c smach_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c dmach_dist.c 
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64   -c superlu_dist_version.c 
+/usr/bin/ar cr /home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_5.2.2/ssg1-build/SRC/libsuperlu_dist.a \
+		dlangs_dist.o dgsequ_dist.o dlaqgs_dist.o dutil_dist.o dmemory_dist.o dmyblas2_dist.o dsp_blas2_dist.o dsp_blas3_dist.o pdgssvx.o pdgssvx_ABglobal.o dreadhb.o dreadrb.o dreadtriple.o dreadMM.o pdgsequ.o pdlaqgs.o dldperm_dist.o pdlangs.o pdutil.o pdsymbfact_distdata.o ddistribute.o pddistribute.o pdgstrf.o pdgstrf2.o pdGetDiagU.o pdgstrs.o pdgstrs1.o pdgstrs_lsum.o pdgstrs_Bglobal.o pdgsrfs.o pdgsmv.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o  sp_ienv.o etree.o sp_colorder.o get_perm_c.o c [...]
+make[1]: Leaving directory `/home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_5.2.2/SRC'
diff --git a/MAKE_INC/make.mpich b/make.inc
similarity index 51%
copy from MAKE_INC/make.mpich
copy to make.inc
index 559a086..4ca856f 100644
--- a/MAKE_INC/make.mpich
+++ b/make.inc
@@ -8,26 +8,17 @@
 #
 #  Creation date:   March 1, 2016	version 5.0.0
 #
-#  Modified:	    
+#  Modified:	    October 13, 2017    version 5.2.1
 #		    
 #
 ############################################################################
 #
 #  The name of the libraries to be created/linked to
 #
-VERSION		= 5.1.3
-SuperLUroot	= /home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_${VERSION}
-DSUPERLULIB   	= $(SuperLUroot)/lib/libsuperlu_dist.a
+SuperLUroot	= /home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_5.2.2
+DSUPERLULIB   	= $(SuperLUroot)/SRC/libsuperlu_dist.a
 
-# BLASDEF 	= -DUSE_VENDOR_BLAS
-
-PARMETIS_DIR	:= ${HOME}/lib/static/parmetis-4.0.3
-I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
-METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
-PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
-
-LIBS		= $(DSUPERLULIB) /usr/lib/libf77blas.so /usr/lib/libatlas.so \
-		${PARMETISLIB} ${METISLIB}
+LIBS		= $(DSUPERLULIB) /usr/lib/libf77blas.so /usr/lib/libatlas.so /home/xiaoye/lib/static/parmetis-4.0.3/build/Linux-x86_64/libparmetis/libparmetis.a /home/xiaoye/lib/static/parmetis-4.0.3/build/Linux-x86_64/libmetis/libmetis.a
 
 #
 #  The archiver and the flag(s) to use when building archive (library)
@@ -38,11 +29,12 @@ ARCHFLAGS    = cr
 RANLIB       = /usr/bin/ranlib
 
 CC           = /home/xiaoye/mpich-install/bin/mpicc
-CFLAGS 	     = -DNDEBUG -DUSE_VENDOR_BLAS -DAdd_ -DDEBUGlevel=0 -DPRNTlevel=0 -std=c99 -fPIC -g ${I_PARMETIS}
-# CFLAGS       += -D_LONGINT
+CFLAGS 	     = -O3 -DNDEBUG -I/home/xiaoye/lib/static/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp  -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 
+# CFLAGS       += -DXSDK_INDEX_SIZE=64
 # CFLAGS       +=  
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
 NOOPTS       = -O0
 FORTRAN	     = /usr/bin/gfortran
 
 LOADER       = $(CC)
-LOADOPTS     = -Wl,-rpath=/home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_${VERSION}/lib -g # -Wl,-Bdynamic
+LOADOPTS     = -Wl,-rpath,    -Wl,-rpath  -Wl,/home/xiaoye/mpich-install/lib  -Wl,--enable-new-dtags -fopenmp
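
A top-level make.inc like this is what the bundled application Makefiles (for
instance EXAMPLE/Makefile) include to pick up CC, CFLAGS, LOADER and LIBS.
Assuming the library in SRC/ has already been built, rebuilding the examples
against it is then just:

    cd EXAMPLE && make
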
diff --git a/make.inc.in b/make.inc.in
index 15383ac..c4526cb 100644
--- a/make.inc.in
+++ b/make.inc.in
@@ -8,14 +8,14 @@
 #
 #  Creation date:   March 1, 2016	version 5.0.0
 #
-#  Modified:	    
+#  Modified:	    October 13, 2017    version 5.2.1
 #		    
 #
 ############################################################################
 #
 #  The name of the libraries to be created/linked to
 #
-SuperLUroot	= ${CMAKE_SOURCE_DIR}/build
+SuperLUroot	= ${CMAKE_INSTALL_PREFIX}
 DSUPERLULIB   	= $(SuperLUroot)/SRC/${PROJECT_NAME_LIB_EXPORT}
 
 LIBS		= $(DSUPERLULIB) ${BLAS_LIB_EXPORT} ${PARMETIS_LIB_EXPORT}
@@ -32,8 +32,9 @@ CC           = @CMAKE_C_COMPILER@
 CFLAGS 	     = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@
 # CFLAGS       += -D${DirDefs}
 # CFLAGS       += @COMPILE_DEFINITIONS@ 
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
 NOOPTS       = -O0
 FORTRAN	     = @CMAKE_Fortran_COMPILER@
 
 LOADER       = $(CC)
-LOADOPTS     = -Wl,-rpath,@CMAKE_INSTALL_RPATH@ @CMAKE_EXE_LINKER_FLAGS@
+LOADOPTS     = -Wl,-rpath,@OpenMP_CXX_FLAGS@ @CMAKE_EXE_LINKER_FLAGS@
diff --git a/run_cmake_build.csh b/run_cmake_build.csh
old mode 100644
new mode 100755
index 42b6482..c003716
--- a/run_cmake_build.csh
+++ b/run_cmake_build.csh
@@ -4,7 +4,7 @@ if ( ! $?NERSC_HOST ) then
     echo "NERSC_HOST undefined"
 else
   if ( "$NERSC_HOST" == "edison" ) then
-    setenv PARMETIS_ROOT ~/Edison/lib/parmetis-4.0.3 
+    setenv PARMETIS_ROOT ~/Edison/lib/parmetis-4.0.3
 #    setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/shared-build
     setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/static-build/Linux-x86_64
     cmake .. \
diff --git a/run_cmake_build.csh b/run_cmake_build.sh
old mode 100644
new mode 100755
similarity index 57%
copy from run_cmake_build.csh
copy to run_cmake_build.sh
index 42b6482..9b1e422
--- a/run_cmake_build.csh
+++ b/run_cmake_build.sh
@@ -1,12 +1,13 @@
-#!/bin/csh
+#!/bin/bash
 
-if ( ! $?NERSC_HOST ) then
+if [ -z "$NERSC_HOST" ]
+then
     echo "NERSC_HOST undefined"
-else
-  if ( "$NERSC_HOST" == "edison" ) then
-    setenv PARMETIS_ROOT ~/Edison/lib/parmetis-4.0.3 
+elif [ "$NERSC_HOST" == "edison" ]
+then
+    export PARMETIS_ROOT=~/Edison/lib/parmetis-4.0.3
 #    setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/shared-build
-    setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/static-build/Linux-x86_64
+    export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/static-build/Linux-x86_64
     cmake .. \
     -DUSE_XSDK_DEFAULTS=FALSE\
     -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
@@ -17,40 +18,40 @@ else
     -Denable_blaslib=OFF \
 #    -DTPL_BLAS_LIBRARIES=" " \
     -DBUILD_SHARED_LIBS=OFF \
-    -DCMAKE_INSTALL_PREFIX=..
-  endif
-
-  if ( "$NERSC_HOST" == "cori" ) then
-    setenv PARMETIS_ROOT ~/Cori/lib/parmetis-4.0.3
-    setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/shared-build
-#    setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/static-build/Linux-x86_64
+    -DCMAKE_INSTALL_PREFIX=.
+elif [ "$NERSC_HOST" == "cori" ]
+then
+    export PARMETIS_ROOT=~/Cori/lib/parmetis-4.0.3
+#    export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/shared-build
+    export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/static-build/Linux-x86_64
     cmake .. \
     -DUSE_XSDK_DEFAULTS=TRUE\
     -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
-    -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.so;${PARMETIS_BUILD_DIR}/libmetis/libmetis.so" \
+    -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
     -Denable_blaslib=OFF \
     -DCMAKE_Fortran_COMPILER=ftn \
     -DCMAKE_C_FLAGS="-std=c99 -fPIC" \
-    -DCMAKE_EXE_LINKER_FLAGS="-shared" \
-    -DCMAKE_INSTALL_PREFIX=..
-  endif
-endif
+    -DCMAKE_INSTALL_PREFIX=.
+#    -DCMAKE_EXE_LINKER_FLAGS="-shared"
+fi
 
-set THISHOST=`hostname -s`
-#echo $THISHOST
-if ( "$THISHOST" == "ssg1" ) then
-  setenv PARMETIS_ROOT ~/lib/static/parmetis-4.0.3 
-  setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/build/Linux-x86_64
-    echo $PARMETIS_ROOT
+THISHOST=`hostname -s`
+echo "host: $THISHOST"
+if [ "$THISHOST" == "ssg1" ]
+then
+  rm -fr ssg1-build; mkdir ssg1-build; cd ssg1-build;
+  export PARMETIS_ROOT=~/lib/static/64-bit/parmetis-4.0.3 
+  export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64
+  echo "ParMetis root: $PARMETIS_ROOT"
   cmake .. \
     -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
     -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
-    -DCMAKE_C_FLAGS="-std=c99 -g" \
+    -DCMAKE_C_FLAGS="-std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64" \
     -Denable_blaslib=OFF \
     -DBUILD_SHARED_LIBS=OFF \
     -DCMAKE_C_COMPILER=mpicc \
-    -DCMAKE_INSTALL_PREFIX=..
-endif
+    -DCMAKE_INSTALL_PREFIX=.
+fi
 
 # make VERBOSE=1
 # make test
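
After configuring with either of these scripts, the build and the tests are run
from the generated build directory; on the ssg1 host above that is roughly (the
directory name comes from the script, the targets from the comments just above):

    cd ssg1-build
    make VERBOSE=1
    make test        # or: ctest
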
diff --git a/superlu_dist.pc.in b/superlu_dist.pc.in
new file mode 100644
index 0000000..2de05e0
--- /dev/null
+++ b/superlu_dist.pc.in
@@ -0,0 +1,12 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+libdir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@
+includedir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: @CMAKE_PROJECT_NAME@
+Description: Distributed-memory direct solution of sparse systems of linear equations
+Version: @PROJECT_VERSION@
+URL: http://crd-legacy.lbl.gov/~xiaoye/SuperLU/
+
+Libs: -L${libdir} -lsuperlu_dist
+Libs.private: @BLAS_LIB@ -lm
+Cflags: -I${includedir}
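
Once this template is configured and installed, downstream builds can obtain the
compile and link flags through pkg-config. Assuming the generated superlu_dist.pc
lands in <prefix>/lib/pkgconfig (the exact libdir follows CMAKE_INSTALL_LIBDIR),
usage would look something like the following, with my_solver.c a placeholder:

    export PKG_CONFIG_PATH=<prefix>/lib/pkgconfig:$PKG_CONFIG_PATH
    mpicc my_solver.c $(pkg-config --cflags --libs superlu_dist) -o my_solver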

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/superlu-dist.git


