[linbox] 01/02: Imported Upstream version 1.4.2

Wed Aug 10 03:51:15 UTC 2016

This is an automated email from the git hooks/post-receive script.

dtorrance-guest pushed a commit to branch master
in repository linbox.

commit db70f946ee9677a548fb5d7a1ad472b4411e63fa
Author: Doug Torrance <dtorrance at piedmont.edu>
Date:   Tue Aug 9 23:51:01 2016 -0400

    Imported Upstream version 1.4.2
---
 .gitignore                                         |    2 +-
 AUTHORS                                            |    2 +
 ChangeLog                                          |    4 +
 Makefile.am                                        |    2 +-
 README                                             |   34 -
 README.md                                          |   46 +
 auto-install.sh                                    |   34 +-
 benchmarks/benchmark-order-basis.C                 |   87 +-
 benchmarks/perfpublisher.sh                        |   22 +-
 configure.ac                                       |   35 +-
 examples/Makefile.am                               |   16 +-
 examples/bench-fft.C                               |   81 +-
 examples/bench-matpoly-mult.C                      |   29 +-
 examples/bench-new-fft.C                           |  333 ++++++
 examples/smith.C                                   |    2 +-
 examples/smithvalence.h                            |    4 +-
 examples/solve.C                                   |  207 +---
 examples/test.sh                                   |    8 +-
 interfaces/sage/Makefile.am                        |    2 +-
 linbox.pc.in                                       |    6 +-
 linbox/Makefile.am                                 |    2 +-
 linbox/algorithms/Makefile.am                      |    4 +-
 linbox/algorithms/block-wiedemann.h                |    7 +-
 .../algorithms/classic-rational-reconstruction.h   |    2 +-
 linbox/algorithms/coppersmith.h                    |    6 +-
 linbox/algorithms/matpoly-mult.h                   |   16 +-
 linbox/algorithms/polynomial-matrix/Makefile.am    |    1 +
 .../matpoly-mult-fft-multiprecision.inl            |  191 ++--
 ...tiprecision.inl => matpoly-mult-fft-recint.inl} |  613 +++++-----
 .../matpoly-mult-fft-wordsize-fast.inl             |   16 +-
 .../matpoly-mult-fft-wordsize-three-primes.inl     |   69 +-
 .../matpoly-mult-fft-wordsize.inl                  |   19 +-
 .../polynomial-matrix/matpoly-mult-fft.h           |  115 +-
 linbox/algorithms/polynomial-matrix/order-basis.h  |  229 +++-
 .../polynomial-matrix/polynomial-fft-algorithms.h  |  401 +++++++
 .../polynomial-matrix/polynomial-fft-butterflies.h |  492 ++++++++
 .../polynomial-matrix/polynomial-fft-init.h        |  299 +++++
 .../polynomial-fft-transform-simd.inl              |  804 +++++++-------
 .../polynomial-matrix/polynomial-fft-transform.h   |  211 +++-
 .../polynomial-matrix/polynomial-fft-transform.inl |   28 +-
 .../polynomial-matrix/polynomial-matrix-domain.h   |   10 +-
 .../polynomial-matrix/simd-additional-functions.h  |  474 ++++++++
 linbox/algorithms/polynomial-matrix/simd.h         |    7 +-
 linbox/algorithms/rational-reconstruction.h        |    4 +-
 linbox/algorithms/rational-solver.inl              |    2 +-
 .../algorithms/smith-form-sparseelim-poweroftwo.h  |    2 +-
 linbox/algorithms/vector-fraction.h                |    4 +-
 linbox/blackbox/apply.h                            |    6 +-
 linbox/linbox-config.h                             |   11 +-
 linbox/matrix/polynomial-matrix.h                  |   45 +-
 linbox/matrix/sparsematrix/sparse-csr-matrix.h     |    6 +-
 linbox/randiter/givaro-poly.h                      |    2 +-
 linbox/randiter/mersenne-twister.h                 |    6 +-
 linbox/randiter/random-fftprime.h                  |  237 ++--
 linbox/ring/modular/Makefile.am                    |    1 -
 linbox/ring/modular/modular-int32.h                |  271 ++---
 linbox/ring/modular/modular-int64.h                |  288 +++--
 linbox/ring/modular/modular-unsigned.h             | 1172 +++++++++++++++-----
 linbox/ring/modular/modular-unsigned.inl           |  828 --------------
 linbox/ring/ntl/ntl-gf2e.h                         |    2 +-
 linbox/ring/ntl/ntl-lzz_pex.h                      |    2 +-
 linbox/ring/ntl/ntl-zz_p.h                         |    4 +-
 linbox/ring/ntl/ntl-zz_pe.h                        |    6 +-
 linbox/ring/pir-modular-int32.h                    |    2 +
 linbox/ring/pir-ntl-zz_p.h                         |    2 +-
 linbox/solutions/smith-form.h                      |  103 +-
 linbox/util/Makefile.am                            |    2 +-
 linbox/vector/blas-vector.h                        |    6 +-
 macros/fflas-ffpack-check.m4                       |    4 +-
 tests/.gitignore                                   |    2 +-
 tests/jenkins-maker.sh                             |   87 ++
 tests/perfpublisher.sh                             |   26 +-
 tests/test-charpoly.C                              |    6 +-
 tests/test-field.h                                 |    8 +-
 tests/test-order-basis.C                           |   38 +-
 tests/test-smith-form-adaptive.C                   |  203 +---
 tests/test-smith-form-binary.C                     |  211 +---
 tests/test-smith-form-iliopoulos.C                 |    2 +
 tests/test-smith-form.C                            |  214 +---
 tests/test-smith-form.h                            |  166 +++
 80 files changed, 5482 insertions(+), 3471 deletions(-)

diff --git a/.gitignore b/.gitignore
index 673faa8..ed889a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -79,4 +79,4 @@ macros/ltsugar.m4
 macros/ltversion.m4
 macros/lt~obsolete.m4
 stamp-h1
-linbox.pc
+linbox.pc
\ No newline at end of file
diff --git a/AUTHORS b/AUTHORS
index d259730..495d6b3 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -6,10 +6,12 @@ Mark Giesbrecht <mwg at csd.uwo.ca>
 Pascal Giorgi <Pascal.Giorgi at lirmm.fr>
 Bradford Hovinen <hovinen at cis.udel.edu>
 Erich Kaltofen <kaltofen at math.ncsu.edu>
+Romain Lebreton
 Clement Pernet <Clement.Pernet at imag.fr>
 Daniel Roche <roche at cis.udel.edu>
 B. David Saunders <saunders at cis.udel.edu>
 Arne Storjohann <storjoha at inf.ethz.ch>
 William Turner <turnerw at wabash.edu>
+Bastien Vialla
 Gilles Villard <Gilles.Villard at ens-lyon.fr>
 Zhendong Wan <wan at cis.udel.edu>
diff --git a/ChangeLog b/ChangeLog
index f6d60cc..c93de72 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2016-07-30 cpernet v1.4.2
+	* cleanup and new features on polynomial matrices
+	* many bug fixes ensuring support of gcc-4.8, 5.3, 6.1, clang-3.4 and
+	icpc on i386, x86_64, ubuntu, osx, fedora and ppcle
 2016-02-24 cpernet v1.4.1
 	* update the build system (add pkgconfig file, and a more consistent way
 	of dealing with dependencies)
diff --git a/Makefile.am b/Makefile.am
index 4d4e4d6..b2d457f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -106,7 +106,7 @@ git:
 	git commit -a; git pull; git push
 
 
-VERSION=1.4.1
+VERSION=1.4.2
 
 EXTRA_DIST=auto-install.sh
 #incremente-versions
diff --git a/README b/README
deleted file mode 100644
index b2b4705..0000000
--- a/README
+++ /dev/null
@@ -1,34 +0,0 @@
-  ******  The Linbox Library  ******  
-
-PURPOSE:
-
-The Linbox library provides functionality for exact linear algebra.
-See doc/mainpage.doxy for more info.
-
-INSTALLATION:
-
-See doc/install-dev.html for installation from the git lastest version.
-See doc/install-dist.html for installation from a release tarball.
-See INSTALL for generic installation information.
-
-
-AVAILABILITY: from linalg.org and from github.com/linbox-team
-
-
-REQUIREMENTS:  GMP, ATLAS (or other cblas, lapack), Givaro, fflas-ffpack
-OPTIONAL Dependencies: NTL, IML, FLINT, M4RI, M4RIE 
-See  doc/install*html for details.
-
-This library requires the GNU C++ compiler (gcc-4.3 or newer) or any 
-compiler supporting advanced template features.
-
-
-==========================================================
-The linbox website is http://linalg.org
-
-Corrections, suggestions and comments to : 
-linbox-use at googlegroups.com
- 
-Last update : 2015 July
- 
-
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2213a6c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,46 @@
+# The Linbox Library
+
+[![Build Status](https://ci.inria.fr/linbox/buildStatus/icon?job=LinBox)](https://ci.inria.fr/linbox/job/LinBox/)
+
+## Purpose
+
+The Linbox library provides functionality for exact linear algebra.
+See doc/mainpage.doxy for more info.
+
+## Installation
+
+See doc/install-dev.html for installation from the latest git version.
+See doc/install-dist.html for installation from a release tarball.
+See INSTALL for generic installation information.
+
+
+## Availability
+
+From github.com/linbox-team
+
+
+## Requirements
+
+- GMP
+- any BLAS (Fortran or C): e.g. ATLAS, OpenBLAS, ...
+- Givaro
+- fflas-ffpack
+
+## Optional Dependencies
+- NTL
+- IML
+- FLINT
+- M4RI
+- M4RIE
+
+See  doc/install*html for details.
+
+This library requires the GNU C++ compiler (gcc-4.3 or newer) or any 
+compiler supporting advanced template features.
+
+## Contact and discussions
+
+Corrections, suggestions and comments to linbox-use at googlegroups.com
+
+ 
+
diff --git a/auto-install.sh b/auto-install.sh
index 695c825..7241bb7 100755
--- a/auto-install.sh
+++ b/auto-install.sh
@@ -35,7 +35,7 @@ OPTIM="--enable-optimization"
 OPTIM_VAR=""
 CHECK_VAR=""
 #options
-PREFIX_LOC="/usr/local"
+PREFIX_LOC="/tmp"
 PREFIX_VAR=""
 PREFIX="--prefix=$PREFIX_LOC"
 BLAS=""
@@ -86,7 +86,7 @@ help() {
 	echo " * usage :"
 	echo
 	echo " --stable=[yes,no]     : install latest stable versions or latest git versions."
-	echo "                         Default : yes, even if switch ommitted. No argument means yes"
+	echo "                         Default : no, even if switch ommitted. No argument means no"
 
 	echo " --prefix=MY/PATH      : install all libraries under MY/PATH."
 	echo "                         Default : /tmp/"
@@ -97,7 +97,7 @@ help() {
 	echo 
 	echo " --with-gmp=GMP/PATH   : tell where gmp is."
 	echo "                         Default : /usr, /usr/local. No argument is Default"
-	echo " --with-blas=BLAS/PATH : same as GMP for BLAS. (will check anyway)"
+	echo " --with-blas-libs=BLAS/PATH : same as GMP for BLAS. (will check anyway)"
 	echo " --with-ntl=NTL/PATH   : same as GMP for NTL. (default)"
 	echo " --with-iml=IML/PATH   : same as GMP for IML. (default)"
 	echo " --extra-flags=\"\"      : give extra compiler flags."
@@ -254,8 +254,8 @@ for i in "$@" ; do
 				GMP="$i"
 				GMP_VAR="true"
 				;;
-			"--with-blas")
-				if	[ "x$BLAS_VAR" = "xtrue" ] ; then  echo "GMP path already set ?" ;      help ; exit -1; fi
+			"--with-blas-libs")
+				if	[ "x$BLAS_VAR" = "xtrue" ] ; then  echo "BLAS path already set ?" ;      help ; exit -1; fi
 				BLAS=$QUI=\"$QUOI\"
 				BLAS_VAR="true"
 				;;
@@ -341,6 +341,10 @@ esac
 done
 
 MAKEPROG="make ${MAKEOPT}"
+export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$PREFIX_LOC/lib/pkgconfig
+echo "PKG_CONFIG_PATH=$PKG_CONFIG_PATH"
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PREFIX_LOC/lib
+echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
 
 ######################
 #  create build dir  #
@@ -505,9 +509,6 @@ ${MAKEPROG} install | tee -a ../../auto-install.log|| die
 #return in build
 cd ..
 
-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PREFIX_LOC}/lib
-
-
 cool| tee -a ../auto-install.log
 
 ##########################
@@ -537,14 +538,14 @@ if [ "$STABLE_VAR" = "true" ]; then
 	chmod +x configure.fflas.exe
 	./configure.fflas.exe| tee -a ../../auto-install.log
 	rm -rf configure.fflas.exe
-	#./configure  "$PREFIX" "$DEBUG" "$OPTIM" "$BLAS" "$GIVARO" "$WARNINGS" || die
+	#./configure  "$PREFIX" "$DEBUG" "$OPTIM" "$BLAS"  "$WARNINGS" || die
 else
 	echo "./autogen.sh $PREFIX $DEBUG $OPTIM $BLAS $WARNINGS"| tee -a ../../auto-install.log
 	echo "./autogen.sh $PREFIX $DEBUG $OPTIM $BLAS $WARNINGS" > configure.fflas.exe
 	chmod +x configure.fflas.exe
 	./configure.fflas.exe| tee -a ../../auto-install.log
 	rm -rf configure.fflas.exe
-	#./autogen.sh "$PREFIX" "$DEBUG" "$OPTIM" "$BLAS" "$GIVARO" "$WARNINGS" || die
+	#./autogen.sh "$PREFIX" "$DEBUG" "$OPTIM" "$BLAS"  "$WARNINGS" || die
 fi
 
 echo -e "${BEG}building Fflas-Ffpack..."| tee -a ../../auto-install.log
@@ -592,16 +593,13 @@ echo -e " * to ensure you don't get undefined symbols !"| tee -a ./auto-install.
 echo  ""| tee -a ./auto-install.log
 
 
-GIVARO="--with-givaro=$PREFIX_LOC"
-FFLAFLAS="--with-fflas-ffpack=$PREFIX_LOC"
-
 if [ -x autogen.sh ] ;  then 
-	echo "./autogen.sh $PREFIX $DEBUG $OPTIM $GMP $BLAS $NTL $GIVARO $FFLAFLAS $WARNINGS $IML $SAGE $DRIV"| tee -a ./auto-install.log
-	./autogen.sh "$PREFIX" "$DEBUG" "$OPTIM" "$GMP" "$BLAS" "$NTL" "$GIVARO" "$FFLAFLAS" "$WARNINGS" "$IML" "$SAGE" "$DRIV" | tee -a ./auto-install.log|| die
+	echo "./autogen.sh $PREFIX $DEBUG $OPTIM $GMP $BLAS $NTL $WARNINGS $IML $SAGE $DRIV"| tee -a ./auto-install.log
+	./autogen.sh "$PREFIX" "$DEBUG" "$OPTIM" "$GMP" "$BLAS" "$NTL" "$WARNINGS" "$IML" "$SAGE" "$DRIV" | tee -a ./auto-install.log|| die
 else
-	echo "./configure $PREFIX $DEBUG $OPTIM $GMP $BLAS $NTL $GIVARO $FFLAFLAS $WARNINGS $IML $SAGE $DRIV"| tee -a ./auto-install.log
-	# ./configure $PREFIX $DEBUG $OPTIM $GMP $BLAS $NTL $GIVARO $FFLAFLAS $WARNINGS  $IML $SAGE $DRIV || die
-	./configure "$PREFIX" "$DEBUG" "$OPTIM" "$GMP" "$BLAS" "$NTL" "$GIVARO" "$FFLAFLAS" "$WARNINGS" "$IML" "$SAGE" "$DRIV" | tee -a ./auto-install.log|| die
+	echo "./configure $PREFIX $DEBUG $OPTIM $GMP $BLAS $NTL $WARNINGS $IML $SAGE $DRIV"| tee -a ./auto-install.log
+	# ./configure $PREFIX $DEBUG $OPTIM $GMP $BLAS $NTL $WARNINGS  $IML $SAGE $DRIV || die
+	./configure "$PREFIX" "$DEBUG" "$OPTIM" "$GMP" "$BLAS" "$NTL" "$WARNINGS" "$IML" "$SAGE" "$DRIV" | tee -a ./auto-install.log|| die
 fi
 
 echo -e "${BEG}building LinBox..."| tee -a ./auto-install.log
diff --git a/benchmarks/benchmark-order-basis.C b/benchmarks/benchmark-order-basis.C
index 5b68e77..e0ffa94 100755
--- a/benchmarks/benchmark-order-basis.C
+++ b/benchmarks/benchmark-order-basis.C
@@ -1,4 +1,6 @@
 /* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+//#define __FFLASFFPACK_SEQUENTIAL
+
 #include <iostream>
 #include <iomanip>
 size_t getPeakRSS( );
@@ -6,7 +8,7 @@ size_t getCurrentRSS( );
 //#define MEMINFO std::right<<std::setw(20)<<"                     ---->   Max Mem: "<<getPeakRSS()/1000000.<<"Mo"
 #define MB(x) ((x)/(double)(1<<20))
 //#define MB(x) ((x)>>20)
-#define MEMINFO std::right<<std::setw(20)<<"                     ---->   Mem: "<<MB(getCurrentRSS())<<" Mo  (Max: "<<MB(getPeakRSS())<<" Mo)"  
+#define MEMINFO std::right<<" ---->   Mem: "<<MB(getCurrentRSS())<<" Mo  (Max: "<<MB(getPeakRSS())<<" Mo)"  
 #include "linbox/matrix/polynomial-matrix.h"
 #include "linbox/randiter/random-fftprime.h"
 #include "linbox/randiter/random-prime.h"
@@ -262,42 +264,33 @@ void bench_sigma(const Field& F,  RandIter& Gen, size_t m, size_t n, size_t d, s
 	//typedef typename Field::Element Element;
 	//typedef PolynomialMatrix<PMType::matfirst,PMStorage::plain,Field> MatrixP;
 	typedef PolynomialMatrix<PMType::polfirst,PMStorage::plain,Field> MatrixP;
-	
+	std::cout<<"Order Basis computation over ";F.write(cout)<<endl;
 	integer p;
 	F.characteristic(p);
-	size_t memp=length(p)+(p.bitsize()>=26?8:0);
-	size_t data_in=3*m*n*d*memp;
-	size_t data_out=2*m*m*(d+1)*memp;
-	size_t data_comp= 2*m*m*d*(length(uint64_t(m*d)*p*p)+(p.bitsize()>26?8:0));
+	size_t memp=length(p)+(p.bitsize()>=64?16:0);
+	//size_t data_in=3*m*n*d*memp;
+	//size_t data_out=2*m*m*(d+1)*memp;
+	//size_t data_comp= 2*m*m*d*(length(uint64_t(m*d)*p*p)+(p.bitsize()>26?8:0));
 	std::cout<<"**************************"<<std::endl;
 	std::cout<<"mem(p)        : "<<memp<<std::endl;
-	std::cout<<"mem(p)        : "<<sizeof(p)<<std::endl;
-	std::cout<<"Projected Memory : "<< MB(data_in+data_out+data_comp)<<"Mo"<<std::endl;
+	//std::cout<<"Projected Memory : "<< MB(data_in+data_out+data_comp)<<"Mo"<<std::endl;
 	std::cout<<"Available memory : "<<MB(getMemorySize())<<std::endl;
 	std::cout<<"**************************"<<std::endl;
 	std::cout<<"**************************"<<std::endl<<std::endl<<std::endl;
 	std::cout<<"[begin ] : "<<MEMINFO<<std::endl; 
 
 	
-	MatrixP Serie(F, m, n, d);	
-	std::cout<<"[initial sequence] : "<<MB(m*n*d*memp)<<"Mo"<<MEMINFO<<std::endl;
-	std::cout<<"--> " <<MB(Serie.realmeminfo())<<std::endl;
-	std::cout<<"--> " <<MB(Serie.meminfo())<<" "<<std::endl;
+	MatrixP *Serie = new MatrixP(F, m, n, d);	
 	// set the Serie at random
 	for (size_t k=0;k<d;++k)
 		for (size_t i=0;i<m;++i)
 			for (size_t j=0;j<n;++j)
-				Gen.random(Serie.ref(i,j,k));
-	std::cout<<"[initial sequence] : "<<MB(m*n*d*memp)<<"Mo"<<MEMINFO<<std::endl;
-	
-	MatrixP Sigma2(F, m, m, d+1);
-	std::cout<<"[output sigma    ] : "<<MB(m*m*(d+1)*memp)<<"Mo"<<MEMINFO<<std::endl;
-	std::cout<<"--> " <<MB(Sigma2.meminfo())<<std::endl;
-	std::cout<<"--> " <<MB(Sigma2.realmeminfo())<<std::endl;
-		
+				Gen.random(Serie->ref(i,j,k));
+	std::cout<<"[initial sequence] : "<<MB(Serie->realmeminfo())<<"Mo"<<MEMINFO<<std::endl;
+			
 	// define the shift
 	vector<size_t> shift(m,0);
-
+	
 	OrderBasis<Field> SB(F);
 	Timer chrono;
 #ifdef BENCH_MBASIS
@@ -305,18 +298,36 @@ void bench_sigma(const Field& F,  RandIter& Gen, size_t m, size_t n, size_t d, s
 		MatrixP Sigma1(F, m, m, d+1);
 		vector<size_t> shift2(m,0);
 		chrono.start();
-		SB.M_Basis(Sigma1, Serie, d, shift2);
+		SB.M_Basis(Sigma1, *Serie, d, shift2);
 		chrono.stop();
 		std::cout << "M-Basis       : " <<chrono.usertime()<<" s"<<std::endl;
 	}
 #endif
+
+
+#ifndef  LOW_MEMORY_PMBASIS
+	MatrixP Sigma2(F, m, m, d+1);
+	std::cout<<"[output sigma    ] : "<<MB(Sigma2.realmeminfo())<<"Mo"<<MEMINFO<<std::endl;	
 	chrono.clear();		
 	chrono.start();
-	SB.PM_Basis(Sigma2, Serie, d, shift);
+	SB.PM_Basis(Sigma2, *Serie, d, shift);
 	chrono.stop();
 	std::cout << "PM-Basis      : " <<chrono.usertime()<<" s"<<std::endl;
 	chrono.clear();
+	delete Serie;
+#else
+	MatrixP* sigma_ptr;
+	chrono.clear();		
+	chrono.start();
+	SB.PM_Basis_low(sigma_ptr, Serie, d, shift);
+	// Serie is deleted within PM_Basis_low
+	chrono.stop();
+	std::cout << "PM-Basis      : " <<chrono.usertime()<<" s"<<std::endl;
+	chrono.clear();
+	delete sigma_ptr;
+#endif
 
+	
 	// MatrixP Sigma3(F, m, m, d+1);
 	//vector<size_t> shift3(m,0);
 	// chrono.start();
@@ -328,18 +339,6 @@ void bench_sigma(const Field& F,  RandIter& Gen, size_t m, size_t n, size_t d, s
 }
 
 int main(int argc, char** argv){
-
-	// std::cout<<"Real memory usage: "<<MEMINFO<<std::endl;
-	// const size_t N=32<<20;
-	// double * T= new double[N];
-	// std::cout<<"allocating :"<<((N*sizeof(double))>>20)<<"Mo"<<std::endl;
-	// T[0]=1;
-	// for (size_t i=1;i<N;i++)
-	// 	T[i]=T[i-1];
-	// std::cout<<"Real memory usage: "<<MEMINFO<<std::endl;
-	// delete[] T;
-	// std::cout<<"Real memory usage: "<<MEMINFO<<std::endl;
-	
 	
 	static size_t  m = 64; // matrix dimension
 	static size_t  n = 32; // matrix dimension
@@ -361,12 +360,18 @@ int main(int argc, char** argv){
 	parseArguments (argc, argv, args);
 	
 	typedef Givaro::Modular<double>              SmallField;	
-	typedef Givaro::Modular<Givaro::Integer>      LargeField;
+	//typedef Givaro::Modular<Givaro::Integer>      LargeField;
+	typedef Givaro::Modular<RecInt::ruint128,RecInt::ruint256>  LargeField;
+
+	size_t logd=integer((uint64_t)d).bitsize();
 
-	size_t logd=integer((uint64_t)d).bitsize();	
 	
 	std::cout<<"###  matrix series is of size "<<m<<" x "<<n<<" of degree "<<d<<std::endl;
 	if (b < 26){
+#ifdef FFT_PROFILER		
+		FFT_PROF_LEVEL=1;
+#endif
+
 		if (logd>b-4){
 			std::cout<<"degree is to large for field bitsize: "<<b<<std::endl;
 			exit(0);
@@ -379,11 +384,15 @@ int main(int argc, char** argv){
 		bench_sigma(F,G,m,n,d,target);
 	}
 	else {
+#ifdef FFT_PROFILER		
+			FFT_PROF_LEVEL=2;
+#endif
+
 		RandomPrimeIterator Rd(b,seed);	
 		integer p = Rd.randomPrime();
 		std::cout<<"# starting sigma basis computation over Fp[x] with p="<<p<<endl;;		
-		LargeField F(p);
-		typename LargeField::RandIter G(F,0,seed);
+		LargeField F(p);		
+		typename LargeField::RandIter G(F,b,seed);
 
 		
 		bench_sigma(F,G,m,n,d,target);
diff --git a/benchmarks/perfpublisher.sh b/benchmarks/perfpublisher.sh
index 8be3168..9be8431 100755
--- a/benchmarks/perfpublisher.sh
+++ b/benchmarks/perfpublisher.sh
@@ -8,12 +8,24 @@ XMLFILE=$1
 benchmarks=$2
 COMPILER=$3
 
+# choose gdate on OS X
+if command -v "gdate" >/dev/null; then
+    DATE=gdate
+else
+    DATE=date
+fi
 #=================#
 # Plateform infos #
 #=================#
 
 COMPILERVERSION=$($COMPILER --version 2>&1 | head -1)
-CPUFREQ=$(lscpu | grep "MHz" | rev | cut -f1 -d' ' | rev)
+
+if command -v "lscpu" >/dev/null; then
+    CPUFREQ=$(lscpu | grep "MHz" | rev | cut -f1 -d' ' | rev)
+else
+    CPUFREQ=$((`sysctl -n hw.cpufrequency`/1000000))
+fi
+
 ARCH=$(uname -m)
 OSNAME=$(uname -s)
 OSVERSION=$(uname -r)
@@ -45,8 +57,8 @@ echo '<report name="benchmarks-report" categ="benchmarks">' >> $XMLFILE
 #=======#
 
 echo '<start>' >> $XMLFILE
-echo '<date format="YYYYMMDD" val="'$(date +%Y%m%d)'" />' >> $XMLFILE
-echo '<time format="HHMMSS" val="'$(date +%H%M%S)'" />' >> $XMLFILE
+echo '<date format="YYYYMMDD" val="'$($DATE +%Y%m%d)'" />' >> $XMLFILE
+echo '<time format="HHMMSS" val="'$($DATE +%H%M%S)'" />' >> $XMLFILE
 echo '</start>' >> $XMLFILE
 
 #============#
@@ -59,9 +71,9 @@ do
 	then
 		#File does not exist: compile it
 		echo '[Compiling]' $benchmark
-		COMPILESTART=$(date +%s%3N)
+		COMPILESTART=$($DATE +%s%3N)
 		COMPILELOG=$(make $benchmark 2>&1; echo 'Returned state: '$?)
-		COMPILEEND=$(date +%s%3N)
+		COMPILEEND=$($DATE +%s%3N)
 		COMPILETIME=$(($COMPILEEND - $COMPILESTART))
 		COMPILECHECK=$(echo $COMPILELOG | grep -o '[^ ]*$')
 		COMPILETIMERELEVANT='true'
diff --git a/configure.ac b/configure.ac
index 12c8981..ec1764c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -21,12 +21,12 @@
 
 AC_PREREQ([2.61])
 
-AC_INIT([LinBox], [1.4.1],[linbox-use at googlegroups.com],[linbox],
+AC_INIT([LinBox], [1.4.2],[linbox-use at googlegroups.com],[linbox],
 		[http://www.linalg.org/])
 
 AC_CONFIG_MACRO_DIR([macros])
 AC_CONFIG_AUX_DIR([build-aux])
-AM_INIT_AUTOMAKE([1.8 gnu no-dependencies -Wall -Wno-portability])
+AM_INIT_AUTOMAKE([1.8 gnu no-dependencies -Wall -Wno-portability foreign])
 AC_CONFIG_HEADERS([config.h])
 AX_PREFIX_CONFIG_H(linbox/config.h, __LINBOX)
 AC_PATH_PROG(RM, rm, $FALSE)
@@ -79,7 +79,7 @@ AC_SUBST([DEFAULT_CFLAGS])
 AC_SUBST([DEBUG_CFLAGS])
 AC_SUBST([TESTS_CFLAGS])
 
-TESTS_CFLAGS="-O0"
+TESTS_CFLAGS="-O2"
 DEBUG_CFLAGS="-g"
 DEFAULT_CFLAGS=""
 WARN_CFLAGS="-Wall"
@@ -207,19 +207,20 @@ LB_DRIVER
 echo "-----------------------------------------------"
 
 # Now getting GMP and Givaro from FFLAS-FFPACK - AB 2014-12-10
-
-LB_CHECK_FFLAS_FFPACK(,,[
-echo ''
-echo '*******************************************************************************'
-echo ' ERROR: Fflas-Ffpack not found!'
-echo
-echo ' Fflas-Ffpack routines are required for this library to compile. Please'
-echo ' make sure they are installed and specify its location with the option'
-echo ' --with-fflas-ffpack=<lib> when running configure.'
-echo ' Also make sure your compiler supports cxx-11...'
-echo '*******************************************************************************'
-exit 1
-])
+PKG_CHECK_MODULES([FFLAS_FFPACK], [fflas-ffpack])
+
+dnl LB_CHECK_FFLAS_FFPACK(,,[
+dnl echo ''
+dnl echo '*******************************************************************************'
+dnl echo ' ERROR: Fflas-Ffpack not found!'
+dnl echo
+dnl echo ' Fflas-Ffpack routines are required for this library to compile. Please'
+dnl echo ' make sure they are installed and specify its location with the option'
+dnl echo ' --with-fflas-ffpack=<lib> when running configure.'
+dnl echo ' Also make sure your compiler supports cxx-11...'
+dnl echo '*******************************************************************************'
+dnl exit 1
+dnl ])
 
 LB_CHECK_LAPACK
 
@@ -256,7 +257,7 @@ if test ! -d ./benchmarks/data ; then
 fi
 
 DEPS_CFLAGS="${FFLAS_FFPACK_CFLAGS} ${NTL_CFLAGS} ${MPFR_CFLAGS} ${FPLLL_CFLAGS} ${IML_CFLAGS} ${FLINT_CFLAGS}"
-DEPS_LIBS="${FFLAS_FFPACK_LIBS} ${NTL_LIBS} ${MPFR_LIBS} ${FPLLL_LIBS} ${IML_LIBS} ${FLINT_LIBS} ${OCL_LIBS}"
+DEPS_LIBS=" ${NTL_LIBS} ${MPFR_LIBS} ${FPLLL_LIBS} ${IML_LIBS} ${FLINT_LIBS} ${OCL_LIBS} ${FFLAS_FFPACK_LIBS} ${XML_LIBS}"
 
 CXXFLAGS="${CXXFLAGS} ${STDFLAG}"
 
diff --git a/examples/Makefile.am b/examples/Makefile.am
index 11b459d..dc206a2 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -42,7 +42,7 @@ LDADD += $(top_builddir)/linbox/liblinbox.la
 
 #  SUBDIRS=fields solver data blackbox
 
-EXAMPLES=rank det minpoly valence solve dot-product echelon sparseelimdet sparseelimrank checksolve doubledet smithvalence charpoly polysmith benchfft benchmatpolymult 
+EXAMPLES=rank det minpoly valence solve dot-product echelon sparseelimdet sparseelimrank checksolve doubledet smithvalence charpoly polysmith #bench-fft bench-matpoly-mult
 # EXAMPLES+=nulp yabla 
 GIVARONTL_EXAMPLES=smith graph-charpoly
 if LINBOX_HAVE_NTL
@@ -77,18 +77,22 @@ smithvalence_SOURCES   = smithvalence.C
 sparseelimdet_SOURCES  = sparseelimdet.C
 sparseelimrank_SOURCES = sparseelimrank.C
 polysmith_SOURCES      = poly-smith.C
-benchfft_SOURCES       = bench-fft.C
-benchmatpolymult_SOURCES = bench-matpoly-mult.C
+#bench_fft_SOURCES       = bench-fft.C
+#bench_matpoly_mult_SOURCES = bench-matpoly-mult.C
 LINBOX=@prefix@
 
 LINBOX_BIN=@bindir@
 
 
 # for compilation of new examples
-new_examp_comp = $(CXX) -I at includedir@ $(CXXFLAGS) $(AM_CPPFLAGS) $(OPTFLAGS) ${INCLUDES}  $< -o $@ -L at libdir@ -llinbox $(AM_LDFLAGS) $(LDADD) $(LIBS)
+#new_examp_comp = $(CXX) -I at includedir@ $(CXXFLAGS) $(AM_CPPFLAGS) $(OPTFLAGS) ${INCLUDES}  $< -o $@ -L at libdir@ -llinbox $(AM_LDFLAGS) $(LDADD) $(LIBS)
 
 %:%.C
-	$(new_examp_comp)
+#	$(LTCXXCOMPILE) -c -o $@.$(OBJEXT) $<
+	$(AM_V_CXX)$(CXXCOMPILE) -c -o $@.$(OBJEXT) $<
+	$(AM_V_CXXLD)$(CXXLINK) $@.$(OBJEXT) $(LDADD)
 
 %:%.cpp
-	$(new_examp_comp)
+#	$(LTCXXCOMPILE) -c -o $@.$(OBJEXT) $<
+	$(AM_V_CXX)$(CXXCOMPILE) -c -o $@.$(OBJEXT) $<
+	$(AM_V_CXXLD)$(CXXLINK) $@.$(OBJEXT) $(LDADD)
diff --git a/examples/bench-fft.C b/examples/bench-fft.C
index 9c8583f..b06d6c7 100755
--- a/examples/bench-fft.C
+++ b/examples/bench-fft.C
@@ -62,7 +62,7 @@ struct congruent{
 	bool operator()(T a, T b) const { return ((uint64_t)a%(uint64_t)p) == ((uint64_t)b%(uint64_t)p);}
 };
 template<typename Funct, typename FFT, typename Vect>
-void DFT_sanity_check(FFT& FFTDom, Funct f, const Vect& x, const Vect& y, string msg){
+bool DFT_sanity_check(FFT& FFTDom, Funct f, const Vect& x, const Vect& y, string msg){
 	typedef typename FFT::Element Element ;
 	Vect z(x);
 	auto Functor = bind(f, &FFTDom, &z[0]);
@@ -72,19 +72,21 @@ void DFT_sanity_check(FFT& FFTDom, Funct f, const Vect& x, const Vect& y, string
 	cout<<"  Checking ... "<<msg
 	    << (equal(y.begin(),y.end(),z.begin(),congruent<Element>(FFTDom._p))?" done":" error")<<endl;
 
-	// if (!(equal(y.begin(),y.end(),z.begin(),congruent<Element>(FFTDom._p)))){
-	// 	std::ostream_iterator<Element> out_it (std::cout,", ");
-	// 	std::copy ( z.begin(), z.end(), out_it );
-	// 	std::cout<<std::endl;
-	// 	std::copy ( y.begin(), y.end(), out_it );
-	// 	std::cout<<std::endl; 
-	// }
-
+	if (!(equal(y.begin(),y.end(),z.begin(),congruent<Element>(FFTDom._p)))){
+		std::ostream_iterator<Element> out_it (std::cout,", ");
+		std::copy ( z.begin(), z.end(), out_it );
+		std::cout<<std::endl;
+		std::copy ( y.begin(), y.end(), out_it );
+		std::cout<<std::endl;
+		return false;
+	}
+	return true;
 }
 
 template<typename Field>
-void check_DIF(const Field& fld, size_t kmax, long seed) {  
+bool check_DIF(const Field& fld, size_t kmax, long seed) {
 	typedef typename Field::Element Element;
+	bool passed = true;
 	for (size_t lpts = 1; lpts < kmax ; lpts++){
 		size_t pts = 1 << lpts;
 		cout<<"********************************************************"<<endl;
@@ -106,39 +108,40 @@ void check_DIF(const Field& fld, size_t kmax, long seed) {
 		// compute the correct result
 		MulDom.FFT_DIF_Harvey_mod2p_iterative(y.data()); 
 		// check 2x2 		
-		DFT_sanity_check(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative2x2,x,y, "DIF_Harvey_mod2p_iterative2x2");
+		passed &= DFT_sanity_check(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative2x2,x,y, "DIF_Harvey_mod2p_iterative2x2");
 		// check 3x3 		
-		DFT_sanity_check(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative3x3,x,y, "DIF_Harvey_mod2p_iterative3x3");
+		passed &= DFT_sanity_check(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative3x3,x,y, "DIF_Harvey_mod2p_iterative3x3");
 		// check 4x1 SSE		
-		//DFT_sanity_check(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative4x1_SSE,x,y, "DIF_Harvey_mod2p_iterative4x1_SSE");
+		//passed &= DFT_sanity_check(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative4x1_SSE,x,y, "DIF_Harvey_mod2p_iterative4x1_SSE");
 		// check 4x2 SSE		
-		//DFT_sanity_check(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative4x2_SSE,x,y, "DIF_Harvey_mod2p_iterative4x2_SSE");
-#ifdef __AVX2__
+		//passed &= DFT_sanity_check(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative4x2_SSE,x,y, "DIF_Harvey_mod2p_iterative4x2_SSE");
+#ifdef __LINBOX_HAVE_AVX_INSTRUCTIONS2
 		// check 8x1 AVX		
-		//DFT_sanity_check(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative8x1_AVX,x,y, "DIF_Harvey_mod2p_iterative8x1_AVX");
+		//passed &= DFT_sanity_check(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative8x1_AVX,x,y, "DIF_Harvey_mod2p_iterative8x1_AVX");
 #endif
 		// check Harvey SSE		
-		DFT_sanity_check(MulDom,&FFT_t::template FFT_DIF<Element>,x,y, "DIF_Harvey_SSE");
-		cout<<"---------------------------------------------------------------"<<endl;
+		passed &= DFT_sanity_check(MulDom,&FFT_t::template FFT_DIF<Element>,x,y, "DIF_Harvey_SSE");
+//		cout<<"---------------------------------------------------------------"<<endl;
+
 		/* CHECK DIT */
 		// compute the correct result
 		y=x;
 		MulDom.FFT_DIT_Harvey_mod4p_iterative2x2(y.data());
 		// check 2x2 		
-		DFT_sanity_check(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative2x2,x,y, "DIT_Harvey_mod4p_iterative2x2");
+		passed &= DFT_sanity_check(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative2x2,x,y, "DIT_Harvey_mod4p_iterative2x2");
 		// check 3x3 		
-		DFT_sanity_check(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative3x3,x,y, "DIT_Harvey_mod4p_iterative3x3");
+		passed &= DFT_sanity_check(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative3x3,x,y, "DIT_Harvey_mod4p_iterative3x3");
 		// check 4x1 SSE		
-		//DFT_sanity_check(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative4x1_SSE,x,y, "DIT_Harvey_mod4p_iterative4x1_SSE");
-#ifdef __AVX2__
+		//passed &= DFT_sanity_check(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative4x1_SSE,x,y, "DIT_Harvey_mod4p_iterative4x1_SSE");
+#ifdef __LINBOX_HAVE_AVX_INSTRUCTIONS2
 		// check 8x1 AVX		
-		//DFT_sanity_check(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative8x1_AVX,x,y, "DIT_Harvey_mod4p_iterative8x1_AVX");
+		//passed &= DFT_sanity_check(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative8x1_AVX,x,y, "DIT_Harvey_mod4p_iterative8x1_AVX");
 #endif
 		// check Harvey SSE		
-		DFT_sanity_check(MulDom,&FFT_t::template FFT_DIT<Element>,x,y, "DIT_Harvey_SSE");
-		
-		cout<<endl;
+		passed &= DFT_sanity_check(MulDom,&FFT_t::template FFT_DIT<Element>,x,y, "DIT_Harvey_SSE");
+//		cout<<endl;
 	}
+	return passed;
 }
 
 /**************************************
@@ -179,9 +182,9 @@ void bench_DIF(const Field& fld, size_t kmax, long seed) {
 	typedef typename Field::Element Element;
 	for (size_t lpts = 5; lpts < kmax ; lpts++){
 		size_t pts = 1 << lpts;
-		cout<<"********************************************************"<<endl;
-		cout<<"*** Testing polynomials of size 2^" << lpts <<endl;
-		cout<<"********************************************************"<<endl;
+		cout<<"*********************************************************"<<endl;
+		cout<<"*** Benching polynomials of size 2^" << lpts <<endl;
+		cout<<"*********************************************************"<<endl;
 		vector<Element> x(pts);
 
 		// Generate random inputs
@@ -190,6 +193,8 @@ void bench_DIF(const Field& fld, size_t kmax, long seed) {
 		FFT_transform<Field> MulDom(fld,lpts);
 		typedef FFT_transform<Field> FFT_t; 
 
+		// check 1x1
+		DFT_performance(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative,lpts, x,     "DIF_Harvey_mod2p_iterative");
 		// check 2x2 		
 		DFT_performance(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative2x2,lpts, x,     "DIF_Harvey_mod2p_iterative2x2");
 		// check 3x3 		
@@ -198,20 +203,23 @@ void bench_DIF(const Field& fld, size_t kmax, long seed) {
 		//DFT_performance(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative4x1_SSE,lpts, x, "DIF_Harvey_mod2p_iterative4x1_SSE");
 		// check 4x2 SSE		
 		//DFT_performance(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative4x2_SSE,lpts, x, "DIF_Harvey_mod2p_iterative4x2_SSE");
-#ifdef __AVX2__
+#ifdef __LINBOX_HAVE_AVX_INSTRUCTIONS2
 		// check 8x1 AVX		
 		//DFT_performance(MulDom,&FFT_t::FFT_DIF_Harvey_mod2p_iterative8x1_AVX,lpts, x, "DIF_Harvey_mod2p_iterative8x1_AVX");
 #endif
 		// check Harvey SSE		
 		DFT_performance(MulDom,&FFT_t::template FFT_DIF<Element>,lpts, x, "DIF_Harvey_SSE");
 		cout<<"---------------------------------------------------------------"<<endl;
-		// check 2x2 		
+
+		// check 1x1
+		DFT_performance(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative,lpts, x,     "DIT_Harvey_mod4p_iterative");
+		// check 2x2
 		DFT_performance(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative2x2,lpts, x,     "DIT_Harvey_mod4p_iterative2x2");
 		// check 3x3 		
 		DFT_performance(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative3x3,lpts, x,     "DIT_Harvey_mod4p_iterative3x3");
 		// check 4x1 SSE		
 		//DFT_performance(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative4x1_SSE,lpts, x, "DIT_Harvey_mod4p_iterative4x1_SSE");
-#ifdef __AVX2__
+#ifdef __LINBOX_HAVE_AVX_INSTRUCTIONS2
 		// check 8x1 AVX		
 		//DFT_performance(MulDom,&FFT_t::FFT_DIT_Harvey_mod4p_iterative8x1_AVX,lpts, x, "DIT_Harvey_mod4p_iterative8x1_AVX");
 #endif
@@ -237,10 +245,11 @@ int main(int argc, char** argv){
 	cout<<"prime : "<<p<<endl;
 	cout<<endl;
 	
-	//Givaro::Modular<uint32_t> F(p);
-	Givaro::Modular<double> F(p);
-	check_DIF(F,k,seed);
-	bench_DIF(F,k,seed);
+	// No need to test on Modular<double> since the implementation will convert to uint32
+	// and use the uint32 implementation
+	Givaro::Modular<uint32_t,uint64_t> Fi(p);
+	cout << "Test : " << ((check_DIF(Fi,k,seed))?"OK":"KO!!!!") << endl;
+	bench_DIF(Fi,k,seed);
 
 
 	return 0;
diff --git a/examples/bench-matpoly-mult.C b/examples/bench-matpoly-mult.C
index 6779c8d..eefe235 100755
--- a/examples/bench-matpoly-mult.C
+++ b/examples/bench-matpoly-mult.C
@@ -91,9 +91,9 @@ using namespace LinBox;
 
 template <typename Rand, typename Vect>
 void randomVect (Rand& r, Vect& v) {
-	size_t s = v.size();
+	size_t s = v.size();				   
 	for (size_t i = 0; i < s; ++i)
-		r.random(v[i]);
+		r.random(v[i]); 
 }
 
 template <typename Rand, typename Mat>
@@ -149,8 +149,12 @@ template<typename Field, typename RandIter>
 void check_matpol_mul(const Field& fld,  RandIter& Gen, size_t n, size_t d) {
 	typedef PolynomialMatrix<PMType::polfirst,PMStorage::plain,Field> MatrixP;
 	typedef PolynomialMatrix<PMType::matfirst,PMStorage::plain,Field> PMatrix;
-	PMatrix A(fld,n,n,d),B(fld,n,n,d),C(fld,n,n,2*d-1);
-	MatrixP AA(fld,n,n,d),BB(fld,n,n,d),CC(fld,n,n,2*d-1);
+
+	// product m*n n*m
+	size_t m=n;
+	
+	PMatrix A(fld,m,n,d),B(fld,n,m,d),C(fld,m,m,2*d-1);
+	MatrixP AA(fld,m,n,d),BB(fld,n,m,d),CC(fld,m,m,2*d-1);
 	// Generate random matrix of polynomial
 	for (size_t i=0;i<d;i++){
 		randomMat(Gen,A[i]);
@@ -164,10 +168,10 @@ void check_matpol_mul(const Field& fld,  RandIter& Gen, size_t n, size_t d) {
 	Naive NMD(fld);
 	Kara PMKD(fld);
 	FFT  PMFFT(fld);
-
+	
 	// compute the correct result
-	for (size_t r=0;r<n;r++)
-		for (size_t c=0;c<n;c++)
+	for (size_t r=0;r<m;r++)
+		for (size_t c=0;c<m;c++)
 			for (size_t k=0;k<n;k++)
 				for (size_t i=0;i<A.size();i++)
 					for (size_t j=0;j<B.size();j++)
@@ -182,6 +186,7 @@ void check_matpol_mul(const Field& fld,  RandIter& Gen, size_t n, size_t d) {
 	AA.copy(A);
 	BB.copy(B);
 	CC.copy(C);
+	
 	// check fft
 	MATPOLMUL_sanity_check(PMFFT,CC,AA,BB, "FFT Multiplication");
 
@@ -298,7 +303,6 @@ template<typename Field, typename RandIter>
 void profile_matpol_mulfft(const Field& fld,  RandIter& Gen, size_t n, size_t d) {
 	typedef PolynomialMatrix<PMType::polfirst,PMStorage::plain,Field> MatrixP;
 	MatrixP A(fld,n,n,d),B(fld,n,n,d),C(fld,n,n,2*d-1);
-
 	// Generate random matrix of polynomial
 	for (size_t i=0;i<n*n;i++){
 		randomVect(Gen,A(i));
@@ -415,8 +419,9 @@ void profile_matpol_mulkara(const Field& fld,  RandIter& Gen, size_t n, size_t d
 
 template<typename Field>
 void runTest(const Field& F, size_t n, long b, long d, long seed, std::string test){
-	//typename Field::RandIter G(F,b,seed);
-	typename Field::RandIter G(F,seed);
+	
+	typename Field::RandIter G(F,b,seed);
+	//typename Field::RandIter G(F,seed);	
 	if (test == "check"|| test == "all")
 		check_matpol_mul(F,G,n,d);
 	if (test == "bench" || test == "all")
@@ -468,8 +473,8 @@ int main(int argc, char** argv){
 #endif
 			RandomPrimeIter Rd(b,seed);
 			integer p= Rd.random();
-			Givaro::Modular<integer> F(p);
-			//Givaro::Modular<RecInt::ruint128,RecInt::ruint512> F(p);
+			Givaro::Modular<integer> F(p);			
+			//Givaro::Modular<RecInt::ruint128,RecInt::ruint256> F(p);
 			cout<<"Computation over Fp[x] with p=  "<<p<<" (Generic prime)"<<endl;
 			cout<<"++++++++++++++++++++++++++++++++++++"<<endl;
 			runTest (F,n,b,d,seed,test);
diff --git a/examples/bench-new-fft.C b/examples/bench-new-fft.C
new file mode 100755
index 0000000..af40ae8
--- /dev/null
+++ b/examples/bench-new-fft.C
@@ -0,0 +1,333 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/*
+ * Copyright (C) 2013  Pascal Giorgi
+ *
+ * Written by Pascal Giorgi <pascal.giorgi at lirmm.fr>
+ *
+ * ========LICENCE========
+ * This file is part of the library LinBox.
+ *
+ * LinBox is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ */
+
+#include <linbox/linbox-config.h>
+
+#include <functional>
+#include <iostream>
+#include <vector>
+
+#include <givaro/modular.h>
+#include <givaro/givranditer.h>
+
+using namespace std; 
+
+#include "linbox/algorithms/polynomial-matrix/polynomial-fft-butterflies.h"
+#include "linbox/algorithms/polynomial-matrix/polynomial-fft-algorithms.h"
+
+#include "linbox/algorithms/polynomial-matrix/polynomial-fft-transform.h"
+#include "linbox/randiter/random-fftprime.h"
+#include "linbox/ring/modular.h"
+#include "fflas-ffpack/utils/align-allocator.h"
+
+using namespace LinBox;
+
+
+
+template <typename Rand, typename Vect>
+void randomVect (Rand& r, Vect& v) {
+	size_t s = v.size();
+	for (size_t i = 0; i < s; ++i)
+		r.random(v[i]);
+}
+
+
+/**********************************
+ ****** DFT CHECKING FUNCTION *****
+ *********************************/
+template<typename T>
+struct congruent{
+	T p;
+	congruent(T _p): p(_p){}
+	bool operator()(T a, T b) const { return ((uint64_t)a%(uint64_t)p) == ((uint64_t)b%(uint64_t)p);}
+};
+template<typename Funct, typename FFT, typename Vect>
+bool DFT_sanity_check(FFT& FFTDom, Funct f, const Vect& x, const Vect& y, string msg){
+	typedef typename FFT::Element Element ;
+	Vect z(x);
+	auto Functor = bind(f, &FFTDom, &z[0]);
+	Functor();
+	msg+="  ";
+	msg.resize(45,'.');
+	cout<<"  Checking ... "<<msg
+	   << (equal(y.begin(),y.end(),z.begin(),congruent<Element>(FFTDom._p))?" done":" error")<<endl;
+
+	if (!(equal(y.begin(),y.end(),z.begin(),congruent<Element>(FFTDom._p)))){
+//		std::ostream_iterator<Element> out_it (std::cout,", ");
+//		std::copy ( z.begin(), z.end(), out_it );
+//		std::cout<<std::endl;
+//		std::copy ( y.begin(), y.end(), out_it );
+//		std::cout<<std::endl;
+		return false;
+	}
+	return true;
+}
+
+template<typename Field>
+bool check_DIF(const Field& fld, size_t kmax, long seed) {
+	typedef typename Field::Element Element;
+	bool passed = true;
+	for (size_t lpts = 1; lpts < kmax ; lpts++){
+		size_t pts = 1 << lpts;
+		cout<<"********************************************************"<<endl;
+		cout<<"*** Testing polynomials of size 2^" << lpts <<endl;
+		cout<<"********************************************************"<<endl;
+		//vector<Element> x(pts),y(pts);
+		std::vector<Element,AlignedAllocator<Element, Alignment::DEFAULT>> x(pts),y(pts);
+
+		// Generate random inputs
+		typename Field::RandIter Gen(fld);//,fld.characteristic(),seed);
+		randomVect(Gen,y);
+		x=y;
+		
+//		FFT_transform<Field> MulDom(fld,lpts);
+//		typedef FFT_transform<Field> FFT_t;
+
+		FFT_init<Field> fft_init (fld, lpts);
+
+		FFT_algorithms<Field,NoSimd<typename Field::Element> > fft_algo_nosimd (fft_init);
+//		using FFT_a = FFT_algorithms<Field,NoSimd<typename Field::Element> >;
+
+
+		/* CHECK DIF */
+		// compute the correct result
+		fft_algo_nosimd.DIF(y.data());
+
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
+		// check FFT_algorithms::DIF
+		if (Simd128<typename Field::Element>::vect_size == 4 || Simd128<typename Field::Element>::vect_size == 8){
+			FFT_algorithms<Field,Simd128<typename Field::Element> > fft_algo_simd128 (fft_init);
+			using FFT_a128 = FFT_algorithms<Field,Simd128<typename Field::Element> >;
+			passed &= DFT_sanity_check(fft_algo_simd128,&FFT_a128::DIF,x,y, "FFT_algorithms<Field,Simd128>::DIF");
+		}
+#endif
+
+#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
+		// check FFT_algorithms::DIF
+		if (Simd256<typename Field::Element>::vect_size == 4 || Simd256<typename Field::Element>::vect_size == 8){
+			FFT_algorithms<Field,Simd256<typename Field::Element> > fft_algo_simd256 (fft_init);
+			using FFT_a256 = FFT_algorithms<Field,Simd256<typename Field::Element> >;
+			passed &= DFT_sanity_check(fft_algo_simd256,&FFT_a256::DIF,x,y, "FFT_algorithms<Field,Simd256>::DIF");
+		}
+#endif
+		cout<<"---------------------------------------------------------------"<<endl;
+
+		/* CHECK DIT */
+		// compute the correct result
+		y=x;
+		fft_algo_nosimd.DIT(y.data());
+
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
+		// check FFT_algorithms::DIT
+		if (Simd128<typename Field::Element>::vect_size == 4 || Simd128<typename Field::Element>::vect_size == 8){
+			FFT_algorithms<Field,Simd128<typename Field::Element> > fft_algo_simd128 (fft_init);
+			using FFT_a128 = FFT_algorithms<Field,Simd128<typename Field::Element> >;
+			passed &= DFT_sanity_check(fft_algo_simd128,&FFT_a128::DIT,x,y, "FFT_algorithms<Field,Simd128>::DIT");
+		}
+#endif
+
+#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
+		// check FFT_algorithms::DIT
+		if (Simd256<typename Field::Element>::vect_size == 4 || Simd256<typename Field::Element>::vect_size == 8){
+			FFT_algorithms<Field,Simd256<typename Field::Element> > fft_algo_simd256 (fft_init);
+			using FFT_a256 = FFT_algorithms<Field,Simd256<typename Field::Element> >;
+			passed &= DFT_sanity_check(fft_algo_simd256,&FFT_a256::DIT,x,y, "FFT_algorithms<Field,Simd256>::DIT");
+		}
+#endif
+
+		cout<<endl;
+	}
+	return passed;
+}
+
+/**************************************
+ ****** DFT PERFORMANCE FUNCTION ******
+ **************************************/
+template<typename Funct, typename FFT, typename Vect>
+void DFT_performance(FFT& FFTDom, Funct f, size_t lpts, const Vect& x, string msg){
+	Vect z(x);
+	auto Functor = bind(f, &FFTDom, &z[0]);
+	Timer chrono;
+	double time;
+	double Miops;
+	size_t ct,minct=4;
+	ct = 0;
+	chrono.start();
+	while (chrono.realElapsedTime() < 1){
+		for (size_t k=0;k<minct;k++)
+			Functor();
+		ct+=minct;
+	}
+	time = chrono.userElapsedTime()/ct;
+	Miops = 17 * (lpts<<(lpts-1)) /(1e6 * time); // 3/2 n log n
+	msg+="  ";
+	msg.resize(45,'.');
+	cout << "Timings ... " << msg <<" : ";
+	cout.precision(2);
+	cout.width(10);
+	cout<<scientific<<time << " s, ";
+	cout.precision(2);
+	cout.width(10);
+	cout<<fixed<<Miops << " Miops\n";
+}
+
+
+
+template<typename Field>
+void bench_DIF(const Field& fld, size_t kmax, long seed) { 
+	typedef typename Field::Element Element;
+	for (size_t lpts = 5; lpts < kmax ; lpts++){
+		uint64_t pts = 1UL << lpts;
+		cout<<"*********************************************************"<<endl;
+		cout<<"*** Benching polynomials of size 2^" << lpts <<endl;
+		cout<<"*********************************************************"<<endl;
+		vector<Element> x(pts);
+
+		// Generate random inputs
+		typename Field::RandIter Gen(fld,seed);
+		randomVect(Gen,x);
+
+		FFT_init<Field> fft_init (fld, lpts);
+
+		FFT_algorithms<Field,NoSimd<typename Field::Element> > fft_algo_nosimd (fft_init);
+		using FFT_a = FFT_algorithms<Field,NoSimd<typename Field::Element> >;
+		DFT_performance(fft_algo_nosimd,&FFT_a::DIF, lpts, x, "FFT_algorithms<Field,NoSimd>::DIF");
+
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
+		if (Simd128<typename Field::Element>::vect_size == 4 || Simd128<typename Field::Element>::vect_size == 8){
+			FFT_algorithms<Field,Simd128<typename Field::Element> > fft_algo_simd128 (fft_init);
+			using FFT_a128 = FFT_algorithms<Field,Simd128<typename Field::Element> >;
+			DFT_performance(fft_algo_simd128,&FFT_a128::DIF, lpts, x, "FFT_algorithms<Field,Simd128>::DIF");
+		}
+#endif
+
+#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
+		if (Simd256<typename Field::Element>::vect_size == 4 || Simd256<typename Field::Element>::vect_size == 8){
+			FFT_algorithms<Field,Simd256<typename Field::Element> > fft_algo_simd256 (fft_init);
+			using FFT_a256 = FFT_algorithms<Field,Simd256<typename Field::Element> >;
+			DFT_performance(fft_algo_simd256,&FFT_a256::DIF, lpts, x, "FFT_algorithms<Field,Simd256>::DIF");
+		}
+#endif
+		cout<<"---------------------------------------------------------------"<<endl;
+
+		DFT_performance(fft_algo_nosimd,&FFT_a::DIT, lpts, x, "FFT_algorithms<Field,NoSimd>::DIT");
+
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
+		if (Simd128<typename Field::Element>::vect_size == 4 || Simd128<typename Field::Element>::vect_size == 8){
+			FFT_algorithms<Field,Simd128<typename Field::Element> > fft_algo_simd128 (fft_init);
+			using FFT_a128 = FFT_algorithms<Field,Simd128<typename Field::Element> >;
+			DFT_performance(fft_algo_simd128,&FFT_a128::DIT, lpts, x, "FFT_algorithms<Field,Simd128>::DIT");
+		}
+#endif
+
+#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
+		if (Simd256<typename Field::Element>::vect_size == 4 || Simd256<typename Field::Element>::vect_size == 8){
+			FFT_algorithms<Field,Simd256<typename Field::Element> > fft_algo_simd256 (fft_init);
+			using FFT_a256 = FFT_algorithms<Field,Simd256<typename Field::Element> >;
+			DFT_performance(fft_algo_simd256,&FFT_a256::DIT, lpts, x, "FFT_algorithms<Field,Simd256>::DIT");
+		}
+#endif
+
+		cout<<endl;
+	}
+}
+
+
+int main(int argc, char** argv){
+	//	if (argc < 2 || argc >3){
+	//		cerr<<"usage : prime_bitsize , (seed)"<<endl;
+	//		exit(0);
+	//	}
+	uint64_t bits = 0; //atoi(argv[1]);
+	long seed=((argc>2)?atoi(argv[2]):time(NULL));
+	size_t l2n = 12;
+	size_t k = l2n;
+	RandomFFTPrime Rd;
+	uint32_t p;
+
+	//Modular<double,double>
+	bits = 22;
+	Rd = RandomFFTPrime (1<<bits,seed);
+	p = (double)Rd.randomPrime(l2n);
+
+	cout<<"prime : "<<p<<endl;
+	cout<<endl;
+
+	Givaro::Modular<double,double> Fd(p);
+//	cout << "Test Modular<double,double>: " << ((check_DIF(Fd,k,seed))?"OK":"KO!!!!") << endl;
+
+#ifdef __FFLASFFPACK_HAVE_INT128
+	//Modular<int64_t,uint128_t>
+	bits = 59;
+	Rd = RandomFFTPrime (1ul<<bits,seed);
+	p = (uint64_t)Rd.randomPrime(l2n);
+
+	cout<<"prime : "<<p<<endl;
+	cout<<endl;
+
+	Givaro::Modular<uint64_t,uint128_t> Fi64(p);
+	cout << "Test Modular<int64_t,uint128_t> : " << ((check_DIF(Fi64,k,seed))?"OK":"KO!!!!") << endl;
+#endif
+
+	//Modular<uint32_t,uint64_t>
+	bits = 28;
+	Rd = RandomFFTPrime (1<<bits,seed);
+	p = (uint32_t)Rd.randomPrime(l2n);
+
+	cout<<"prime : "<<p<<endl;
+	cout<<endl;
+
+	Givaro::Modular<uint32_t,uint64_t> Fi32(p);
+	cout << "Test Modular<uint32_t,uint64_t>: " << ((check_DIF(Fi32,k,seed))?"OK":"KO!!!!") << endl;
+
+//	bench_DIF(Fi32,k,seed);
+
+
+	//Modular<uint16_t,uint32_t>
+	bits = 12;
+	k = l2n = 8;
+	Rd = RandomFFTPrime (1<<bits,seed);
+	p = (uint16_t)Rd.randomPrime(l2n);
+
+	cout<<"prime : "<<p<<endl;
+	cout<<endl;
+
+	Givaro::Modular<uint16_t,uint32_t> Fi16(p);
+	cout << "Test Modular<uint16_t,uint32_t> : " << ((check_DIF(Fi16,k,seed))?"OK":"KO!!!!") << endl;
+
+
+	// Bench FFT
+
+	//	cout << "Test : " << ((check_DIF(Fi16,k,seed))?"OK":"KO!!!!") << endl;
+	//	cout << "Test : " << ((check_DIF(Fd,k,seed))?"OK":"KO!!!!") << endl;
+	//	bench_DIF(Fi,k,seed);
+	//	bench_DIF(Fd,k,seed);
+
+
+	return 0;
+}
+
+
diff --git a/examples/smith.C b/examples/smith.C
index f359bb4..c4344c6 100644
--- a/examples/smith.C
+++ b/examples/smith.C
@@ -64,7 +64,7 @@ using namespace std;
 #include <linbox/util/timer.h>
 
 #include <linbox/ring/local2_32.h>
-#include <linbox/ring/PIR-modular-int32.h>
+#include <linbox/ring/pir-modular-int32.h>
 #include <linbox/algorithms/smith-form-local.h>
 #include <linbox/algorithms/smith-form-iliopoulos.h>
 #include <linbox/algorithms/smith-form-adaptive.h>
diff --git a/examples/smithvalence.h b/examples/smithvalence.h
index 91cdc0d..eb606ef 100644
--- a/examples/smithvalence.h
+++ b/examples/smithvalence.h
@@ -116,7 +116,7 @@ std::vector<size_t>& PRank(std::vector<size_t>& ranks, size_t& effective_exponen
 	if (p <= maxmod) {
 		typedef Givaro::Modular<int64_t> Ring;
 		int64_t lp(p);
-		Givaro::Integer q = pow(p,e); int64_t lq(q);
+		Givaro::Integer q = pow(p,uint64_t(e)); int64_t lq(q);
 		if (q >Givaro::Integer(lq)) {
 			std::cerr << "Power rank might need extra large composite (" << p << '^' << e << ")." << std::endl;
 			q = p;
@@ -183,7 +183,7 @@ std::vector<size_t>& PRankPowerOfTwo(std::vector<size_t>& ranks, size_t& effecti
 std::vector<size_t>& PRankInteger(std::vector<size_t>& ranks, char * filename,Givaro::Integer p, size_t e, size_t intr)
 {
 	typedef Givaro::Modular<Givaro::Integer> Ring;
-	Givaro::Integer q = pow(p,e);
+	Givaro::Integer q = pow(p,uint64_t(e));
 	Ring F(q);
 	std::ifstream input(filename);
 	LinBox::MatrixStream<Ring> ms( F, input );
diff --git a/examples/solve.C b/examples/solve.C
index b7d0dd8..dcff6b5 100644
--- a/examples/solve.C
+++ b/examples/solve.C
@@ -33,11 +33,9 @@
 #include <iostream>
 
 #include <givaro/modular.h>
-#include <givaro/zring.h>
 #include <linbox/matrix/sparse-matrix.h>
-#include <linbox/solutions/solve.h>
-#include <linbox/util/matrix-stream.h>
-#include <linbox/solutions/methods.h>
+#include <linbox/algorithms/gauss.h>
+#include <linbox/util/timer.h>
 
 using namespace LinBox;
 using namespace std;
@@ -45,110 +43,45 @@ using namespace std;
 int main (int argc, char **argv)
 {
 
-	commentator().setMaxDetailLevel (-1);
-	commentator().setMaxDepth (-1);
-	commentator().setReportStream (std::cerr);
 
-
-	if (argc < 2 || argc > 4) {
-		cerr << "Usage: solve <matrix-file-in-supported-format> [<dense-vector-file>] [<p>]" << endl;
+	if (argc != 4) {
+		cerr << "Usage: solve <matrix-file-in-supported-format> <output-file> <p>" << endl;
 		return 0;
 	}
 	srand48( BaseTimer::seed() );
 
 	std::ifstream input (argv[1]);
 	if (!input) { cerr << "Error opening matrix file " << argv[1] << endl; return -1; }
-	std::ifstream invect;
 
-	bool createB = false;
-	int ModComp = 0;
-	if (argc == 2) {
-		createB = true;
-		ModComp = 0;
-	}
+	std::ofstream nsb;
+	nsb.open (argv[2], std::ofstream::out);
+	if (!nsb) { cerr << "Error opening nullspace output file " << argv[2] << endl; return -1; }
 
-	if (argc == 3) {
-		invect.open (argv[2], std::ifstream::in);
-		if (!invect) {
-			createB = true;
-			ModComp = 2;
-		}
-		else {
-			createB = false;
-			ModComp = 0;
-		}
-	}
-
-	if (argc == 4) {
-		ModComp = 3;
-		invect.open (argv[2], std::ifstream::in);
-		if (!invect) {
-			createB = true;
-		}
-		else
-			createB = false;
-	}
 
-
-	if (ModComp) {
-                cout<<"Computation is done over Z/("<<atoi(argv[ModComp])<<")"<<endl;
-		typedef Givaro::Modular<double> Field;
-		double q = atof(argv[ModComp]);
+        cout<<"Computation is done over Z/("<<atoi(argv[3])<<")"<<endl;
+		typedef Givaro::Modular<int64_t> Field;
+		double q = atof(argv[3]);
 		typedef DenseVector<Field> DenseVector ;
 		Field F(q);
 		MatrixStream< Field > ms ( F, input );
 		SparseMatrix<Field> A (ms);  // A.write(std::cout);
 		cout << "A is " << A.rowdim() << " by " << A.coldim() << endl;
                 if (A.rowdim() <= 20 && A.coldim() <= 20) A.write(std::cerr << "A:=",Tag::FileFormat::Maple) << ';' << std::endl;
-		DenseVector X(F, A.coldim()),B(F, A.rowdim());
-		if (createB) {
-			cerr << "Creating a random {-1,1} vector U, B is AU (to have a consistent system)" << endl;
-			DenseVector U(F, A.coldim() );
-			for(DenseVector::iterator it=U.begin();
-			    it != U.end(); ++it)
-				if (drand48() <0.5)
-					F.assign(*it,F.mOne);
-				else
-					F.assign(*it,F.one);
-			A.apply(B,U);
-		}
-		else {
-			for(DenseVector::iterator it=B.begin();
-			    it != B.end(); ++it)
-				F.read(invect,*it);
-		}
-
-		//         A.write(std::cout << "A: ") << std::endl;
-
-		std::cout << "B is " << B << std::endl;
-
+		DenseMatrix<Field> N(F, A.rowdim(), 15);
 		Timer chrono;
 
 		// Sparse Elimination
-		std::cout << "Sparse Elimination" << std::endl;
 		chrono.clear();
 		chrono.start();
-		solve (X, A, B, Method::SparseElimination());
-		chrono.stop();
-
-		std::cout << "(Sparse Gauss) Solution is [";
-		for(DenseVector::const_iterator it=X.begin();it != X.end(); ++it)
-			F.write(cout, *it) << " ";
-		std::cout << "]" << std::endl;
-		std::cout << "CPU time (seconds): " << chrono.usertime() << std::endl<<std::endl;;
+		GaussDomain<Field> GD ( A.field() );
+		GD.nullspacebasisin(N, A);
 
-		// BlasElimination
-		std::cout << "BlasElimination" << std::endl;
-		chrono.start();
-		solve (X, A, B, Method::BlasElimination());
 		chrono.stop();
 
-		std::cout << "(BlasElimination) Solution is [";
-		for(DenseVector::const_iterator it=X.begin();it != X.end(); ++it)
-			F.write(cout, *it) << " ";
-		std::cout << "]" << std::endl;
-		std::cout << "CPU time (seconds): " << chrono.usertime() << std::endl<< std::endl;
+		N.write(nsb) << std::endl;
+		std::cout << "CPU time (seconds): " << chrono.usertime() << std::endl<<std::endl;;
 
+#if 0
 		// Wiedemann
 		std::cout << "Blackbox" << std::endl;
 		chrono.clear();
@@ -161,6 +94,7 @@ int main (int argc, char **argv)
 			F.write(cout, *it) << " ";
 		std::cout << "]" << std::endl;
 		std::cout << "CPU time (seconds): " << chrono.usertime() << std::endl<<std::endl;;
+#endif
 #if 0
 		// Lanczos
 		std::cout << "Lanczos" << std::endl;
@@ -192,113 +126,6 @@ int main (int argc, char **argv)
 		std::cout << "CPU time (seconds): " << chrono.usertime() << std::endl<< std::endl;
 #endif
 
-	}
-	else {
-                cout<<"Computation is done over Q"<<endl;
-		Givaro::ZRing<Integer> ZZ;
-		typedef DenseVector<Givaro::ZRing<Integer> > DenseVector ;
-		MatrixStream< Givaro::ZRing<Integer> > ms( ZZ, input );
-		SparseMatrix<Givaro::ZRing<Integer> > A (ms);
-		Givaro::ZRing<Integer>::Element d;
-		std::cout << "A is " << A.rowdim() << " by " << A.coldim() << std::endl;
-                if (A.rowdim() <= 20 && A.coldim() <= 20) A.write(std::cerr << "A:=",Tag::FileFormat::Maple) << ';' << std::endl;
-		DenseVector X(ZZ, A.coldim()),B(ZZ, A.rowdim());
-
-		if (createB) {
-			cerr << "Creating a random {-1,1} vector U, B is AU" << endl;
-			DenseVector U(ZZ, A.coldim() );
-			for(DenseVector::iterator it=U.begin();
-			    it != U.end(); ++it)
-				if (drand48() <0.5)
-					*it = -1;
-				else
-					*it = 1;
-			A.apply(B,U);
-		}
-		else {
-			for(DenseVector::iterator it=B.begin();
-			    it != B.end(); ++it)
-				invect >> *it;
-		}
-
-
-		std::cout << "B is " << B << std::endl;
-
-
-		Timer chrono;
-
-		// BlasElimination
-                std::cout << "BlasElimination" << std::endl;
-                chrono.start();
-                solve (X, d, A, B, Method::BlasElimination());
-                chrono.stop();
-
- 		std::cout << "(BlasElimination) Solution is [";
-                for(DenseVector::const_iterator it=X.begin();it != X.end(); ++it)
- 			ZZ.write(cout, *it) << " ";
-                std::cout << "] / ";
-                ZZ.write(std::cout, d)<< std::endl;
-                std::cout << "CPU time (seconds): " << chrono.usertime() << std::endl;
-
-		// Sparse Elimination
-		std::cout << "Sparse Elimination" << std::endl;
-		chrono.start();
-		solve (X, d, A, B, Method::SparseElimination());
-		chrono.stop();
-
-		std::cout << "(SparseElimination) Solution is [";
-		for(DenseVector::const_iterator it=X.begin();it != X.end(); ++it)
-			ZZ.write(cout, *it) << " ";
-		std::cout << "] / ";
-		ZZ.write(std::cout, d)<< std::endl;
-		std::cout << "CPU time (seconds): " << chrono.usertime() << std::endl;
-
-                		// Wiedemann
-		std::cout << "Wiedemann" << std::endl;
-		chrono.start();
-		solve (X, d, A, B, Method::Wiedemann());
-		chrono.stop();
-
-		std::cout << "(Wiedemann) Solution is [";
-		for(DenseVector::const_iterator it=X.begin();it != X.end(); ++it)
-			ZZ.write(cout, *it) << " ";
-		std::cout << "] / ";
-		ZZ.write(std::cout, d) << std::endl;
-		std::cout << "CPU time (seconds): " << chrono.usertime() << std::endl;
-
-
-                
-#if 0
-		// Lanczos
-		std::cout << "Lanczos" << std::endl;
-		chrono.start();
-		solve (X, d, A, B, Method::Lanczos());
-		chrono.stop();
-
-		std::cout << "(Lanczos) Solution is [";
-		for(DenseVector::const_iterator it=X.begin();it != X.end(); ++it)
-			ZZ.write(cout, *it) << " ";
-		std::cout << "] / ";
-		ZZ.write(std::cout, d) << std::endl;
-		std::cout << "CPU time (seconds): " << chrono.usertime() << std::endl;
-
-
-		// Block Lanczos
-		std::cout << "Block Lanczos" << std::endl;
-		chrono.clear();
-		chrono.start();
-		solve (X, d, A, B, Method::BlockLanczos());
-		chrono.stop();
-
-		std::cout << "(Block Lanczos) Solution is [";
-		for(DenseVector::const_iterator it=X.begin();it != X.end(); ++it)
-			ZZ.write(cout, *it) << " ";
-		std::cout << "] / ";
-		ZZ.write(std::cout, d) << std::endl;
-		std::cout << "CPU time (seconds): " << chrono.usertime() << std::endl;
-#endif
-	}
-
 	return 0;
 }
 
diff --git a/examples/test.sh b/examples/test.sh
index 5ec2367..18fc03d 100755
--- a/examples/test.sh
+++ b/examples/test.sh
@@ -2,6 +2,10 @@
 
 # written by Brice Boyer (briceboyer) <boyer.brice at gmail.com>
 # part of LinBox, see COPYING
+SED="sed"
+case "`uname`" in
+  Darwin*) SED="gsed" ;;
+esac
 
 set -o nounset                              # Treat unset variables as an error
 
@@ -19,12 +23,12 @@ pass="true"
 echo -n "check rank ... "
 rank_cmd="Rank\sis\s"
 ./rank  data/test.matrix 7 > linbox-tmp.data
-result=`cat linbox-tmp.data | grep ${rank_cmd} | sed 's/'"$rank_cmd"'\([0-9]*\).*/\1/'`
+result=`cat linbox-tmp.data | grep ${rank_cmd} | $SED 's/'"$rank_cmd"'\([0-9]*\).*/\1/'`
 [ "$result" -eq "9" ] && success || { fail ;  pass="false" ; }
 
 echo -n "check rank ... "
 ./rank  data/test.matrix > linbox-tmp.data
-result=`cat linbox-tmp.data | grep ${rank_cmd} | sed 's/'"$rank_cmd"'\([0-9]*\).*/\1/'`
+result=`cat linbox-tmp.data | grep ${rank_cmd} | $SED 's/'"$rank_cmd"'\([0-9]*\).*/\1/'`
 [ "$result" -eq "10" ] && success || { fail ; pass="false" ; }
 
 
diff --git a/interfaces/sage/Makefile.am b/interfaces/sage/Makefile.am
index a3c6f47..c8a7bf4 100644
--- a/interfaces/sage/Makefile.am
+++ b/interfaces/sage/Makefile.am
@@ -23,7 +23,7 @@ if LINBOX_HAVE_SAGE
 
 #AM_CPPFLAGS=-I$(top_srcdir) -I. -I../../linbox
 
-AM_CPPFLAGS = -DDISABLE_COMMENTATOR -I$(top_srcdir)/linbox $(DEPS_CFLAGS)
+AM_CPPFLAGS = -DDISABLE_COMMENTATOR -I$(top_srcdir)/linbox $(DEPS_CFLAGS) $(DEFAULT_CFLAGS)
 LDADD = $(DEPS_LIBS) $(LDFLAGS)
 
 #AM_CXXFLAGS = @DEFAULT_CFLAGS@ -DDISABLE_COMMENTATOR  $(NTL_CFLAGS) $(OPTFLAGS) $(PARFLAGS)
diff --git a/linbox.pc.in b/linbox.pc.in
index 6a95e29..b37f9c2 100644
--- a/linbox.pc.in
+++ b/linbox.pc.in
@@ -1,14 +1,14 @@
 /------------------ linbox.pc ------------------------
 prefix=@prefix@
-exec_prefix=@prefix@/bin
+exec_prefix=@prefix@
 libdir=@prefix@/lib
 includedir=@prefix@/include
 
 Name: linbox
 Description: Exact Linear Algebra library
-URL: http://linbox-team.github.io/linbox/
+URL: http://github.com/linbox-team/linbox
 Version: @VERSION@
-Requires: fflas-ffpack >= 2.2.0
+Requires: fflas-ffpack >= 2.2.2
 Libs: -L${libdir} -llinbox @LINBOXSAGE_LIBS@ @NTL_LIBS@ @MPFR_LIBS@ @FPLLL_LIBS@ @IML_LIBS@ @FLINT_LIBS@ @OCL_LIBS@
 Cflags: @DEFAULT_CFLAGS@ -DDISABLE_COMMENTATOR -I${includedir}/linbox @NTL_CFLAGS@ @MPFR_CFLAGS@ @FPLLL_CFLAGS@  @IML_CFLAGS@ @FLINT_CFLAGS@
 \-------------------------------------------------------
diff --git a/linbox/Makefile.am b/linbox/Makefile.am
index b8c3d59..7232401 100644
--- a/linbox/Makefile.am
+++ b/linbox/Makefile.am
@@ -19,7 +19,7 @@
 # ========LICENCE========
 #/
 
-AM_CPPFLAGS= -I$(top_srcdir)/linbox $(DEPS_CFLAGS)
+AM_CPPFLAGS= -I$(top_srcdir)/linbox $(DEPS_CFLAGS) $(DEFAULT_CFLAGS)
 LDADD = $(DEPS_LIBS) $(LDFLAGS)
 
 #AM_CPPFLAGS = -I@includedir@ -I$(top_srcdir)/linbox -I$(top_srcdir) $(DEFCPPFLAGS) $(OPTFLAGS) $(FFLAS_FFPACK_CFLAGS) $(NTL_CFLAGS) $(MPFR_CFLAGS) $(FPLLL_CFLAGS) $(IML_CFLAGS) $(FLINT_CFLAGS) $(PARFLAGS)
diff --git a/linbox/algorithms/Makefile.am b/linbox/algorithms/Makefile.am
index 6a7b999..e7b343e 100644
--- a/linbox/algorithms/Makefile.am
+++ b/linbox/algorithms/Makefile.am
@@ -30,7 +30,7 @@ libalgorithms_la_SOURCES= diophantine-solver.C
 
 #  AM_CPPFLAGS= $(CBLAS_FLAG) $(GMP_CFLAGS) $(NTL_CFLAGS)  
 
-AM_CPPFLAGS = -I$(top_srcdir)/linbox $(DEPS_CFLAGS)
+AM_CPPFLAGS= -I$(top_srcdir)/linbox $(DEPS_CFLAGS) $(DEFAULT_CFLAGS)
 LDADD = $(DEPS_LIBS) $(LDFLAGS)
 
 #AM_CXXFLAGS = @DEFAULT_CFLAGS@ -DDISABLE_COMMENTATOR   $(NTL_CFLAGS) $(ATLAS_CFLAGS)  $(FPLLL_CFLAGS) $(OCL_CFLAGS) $(OMPFLAGS)
@@ -148,6 +148,8 @@ pkgincludesub_HEADERS =            \
 	short-vector.h                     \
 	rns.h                              \
 	rns.inl                            \
+	invariant-factors.h                \
+	smith-form-iliopoulos2.h           \
 	$(USE_OCL_HDRS)
 
 #  iml.h                              \
diff --git a/linbox/algorithms/block-wiedemann.h b/linbox/algorithms/block-wiedemann.h
index e19a073..384bcf6 100644
--- a/linbox/algorithms/block-wiedemann.h
+++ b/linbox/algorithms/block-wiedemann.h
@@ -79,12 +79,13 @@ namespace LinBox
 			m = A.rowdim();
 			n = A.coldim();
 
-			size_t p,q;
+			uint32_t p,q;
+                        // CP : converting to GMP int to get the bitsize is unsane ! Should be replaced by a tablelookup
 			integer tmp;
-			tmp = m;
+			tmp = uint32_t(m);
 			p = tmp.bitsize()-1;
 			//p=sqrt(tmp);
-			tmp = n;
+			tmp = uint32_t(n);
 			q = tmp.bitsize()-1;
 			//q=sqrt(tmp);
 			//std::cout<<"row block: "<<p<<std::endl;
diff --git a/linbox/algorithms/classic-rational-reconstruction.h b/linbox/algorithms/classic-rational-reconstruction.h
index 1de07be..0ed9271 100644
--- a/linbox/algorithms/classic-rational-reconstruction.h
+++ b/linbox/algorithms/classic-rational-reconstruction.h
@@ -260,7 +260,7 @@ namespace LinBox
 				b=1;
 				//Element s0,s; s0=1,s=0;//test time gcdex;
 
-				Element T = m.bitsize();
+				Element T = (uint32_t) m.bitsize();
 				int c = 5;	//should be changed here to enhance probability of correctness
 
 				while((a>0) && (r0.bitsize() > T.bitsize() + (unsigned long)c))
diff --git a/linbox/algorithms/coppersmith.h b/linbox/algorithms/coppersmith.h
index 53acf26..11664c5 100644
--- a/linbox/algorithms/coppersmith.h
+++ b/linbox/algorithms/coppersmith.h
@@ -76,7 +76,7 @@ namespace LinBox
 			//Set up the projection matrices and their dimensions
 			size_t d = B.coldim();
 			size_t r,c;
-			integer tmp = d;
+			integer tmp = uint64_t(d);
 
 			//Set the blocking size, Using Pascal Giorgi's convention
 			if(blocking==0){
@@ -256,7 +256,7 @@ namespace LinBox
 			//Set up the projection matrices and their dimensions
 			size_t d = B.coldim();
 			size_t r,c;
-			integer tmp = d;
+			integer tmp = uint64_t(d);
 
 			//Set the blocking size, Using Pascal Giorgi's convention
 			if(blocking==0){
@@ -394,7 +394,7 @@ namespace LinBox
 			//Set up the projection matrices and their dimensions
 			size_t d = B.coldim();
 			size_t r,c;
-			integer tmp = d;
+			integer tmp = uint64_t(d);
 
 			//Use given blocking size, if not given use Pascal Giorgi's convention
 			if(blocking==0){
diff --git a/linbox/algorithms/matpoly-mult.h b/linbox/algorithms/matpoly-mult.h
index 93ce6d4..e77ae1d 100644
--- a/linbox/algorithms/matpoly-mult.h
+++ b/linbox/algorithms/matpoly-mult.h
@@ -485,7 +485,7 @@ namespace LinBox
 
 			_fftsize=0;
 			//check if field is based on fft prime
-			size_t p = _p;
+			uint64_t p = _p;
 			if (p&1){
 				p-=1;
 				do { p=p>>1; _fftsize++;} while(!(p&0x0001));
@@ -519,13 +519,13 @@ namespace LinBox
 				while ( k ) {k>>=1; ++ln;}
 
 				// taking primes greater than current prime
-				size_t bit = std::max((53-ln)>>1, _p.bitsize());
+				uint64_t bit = std::max((53-ln)>>1, _p.bitsize());
 
 				// get number of necessary primes
-				integer ibound = uint64_t(n) * _p * _p * std::max(b.size(), c.size());
+				integer ibound = uint64_t(n) * _p * _p * uint64_t(std::max(b.size(), c.size()));
 				integer primesprod;
 				size_t nbrprimes=1;
-				RandomFFTPrime fftprime((size_t)bit, FFT_PRIME_SEED);
+				RandomFFTPrime fftprime(bit, FFT_PRIME_SEED);
 				std::vector<integer> lprimes(10); lprimes.resize(nbrprimes);
 				lprimes[0] = fftprime.generatePrime();
 				primesprod = lprimes[0];
@@ -635,10 +635,10 @@ namespace LinBox
 				}
 
 				// taking primes greater than current prime
-				size_t bit = std::max((53-ln)>>1, _p.bitsize());
+				uint64_t bit = std::max((53-ln)>>1, _p.bitsize());
 
 				// get number of necessary primes
-				integer ibound = uint64_t(n) * _p * _p * std::max(b.size(), c.size());
+				integer ibound = uint64_t(n) * _p * _p * uint64_t(std::max(b.size(), c.size()));
 				integer primesprod;
 				size_t nbrprimes=1;
 				RandomFFTPrime fftprime(bit, FFT_PRIME_SEED);
@@ -1044,7 +1044,7 @@ namespace LinBox
 
 			size_t deg     = b.size()+c.size()-1;
 			size_t lpts = 0;
-			size_t pts =1; while (pts < deg) { pts= pts<<1; ++lpts; }
+			uint64_t pts =1; while (pts < deg) { pts= pts<<1; ++lpts; }
 
 #ifdef FFT_TIMING
 			std::cout<<"FFT: points "<<pts<<"\n";
@@ -1266,7 +1266,7 @@ namespace LinBox
 
 			size_t deg  = c.size()+1;
 			size_t lpts = 0;
-			size_t pts =1; while (pts < deg) { pts= pts<<1; ++lpts; }
+			uint64_t pts =1; while (pts < deg) { pts= pts<<1; ++lpts; }
 
 			if (_p%pts != 1) {
 				std::cout<<"Error the prime is not a FFTPrime or it has too small power of 2\n";
diff --git a/linbox/algorithms/polynomial-matrix/Makefile.am b/linbox/algorithms/polynomial-matrix/Makefile.am
index a58ec59..dfdc3d5 100644
--- a/linbox/algorithms/polynomial-matrix/Makefile.am
+++ b/linbox/algorithms/polynomial-matrix/Makefile.am
@@ -31,6 +31,7 @@ pkgincludesub_HEADERS =         \
 	matpoly-mult-fft-wordsize-fast.inl	\
 	matpoly-mult-fft-wordsize-three-primes.inl	\
 	matpoly-mult-fft-multiprecision.inl	\
+	matpoly-mult-fft-recint.inl	\
 	polynomial-fft-transform-simd.inl	\
 	polynomial-fft-transform.h	\
 	polynomial-fft-transform.inl	\
diff --git a/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-multiprecision.inl b/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-multiprecision.inl
index 743d14b..549c7f7 100644
--- a/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-multiprecision.inl
+++ b/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-multiprecision.inl
@@ -56,7 +56,7 @@ namespace LinBox{
     integer           _maxnorm;
 
     template<typename PMatrix1>
-    size_t logmax(const PMatrix1 A) const {
+    size_t logmax(const PMatrix1& A) const {
       size_t mm=A.get(0,0,0).bitsize();
       for(size_t k=0;k<A.size();k++)
 	for (size_t i=0;i<A.rowdim()*A.coldim();i++){
@@ -66,31 +66,7 @@ namespace LinBox{
       return mm;
     }
 
-  public:
-    void getFFTPrime(uint64_t prime_max, size_t lpts, integer bound, std::vector<integer> &bas){
-
-      RandomFFTPrime RdFFT(prime_max);
-      size_t nbp=0;
-      if (!RdFFT.generatePrimes(lpts,bound,bas)){
-	integer MM=1;
-	for(std::vector<integer>::size_type i=0;i<bas.size();i++)
-	  MM*=bas[i];
-	RandomPrimeIter Rd(integer(prime_max).bitsize());
-	integer tmp;
-	do {
-	  do {Rd.random(tmp);}
-	  while (MM%tmp==0);
-	  bas.push_back(tmp);
-	  nbp++;
-	  MM*=tmp;
-	} while (MM<bound);	
-      }
-#ifdef VERBOSE_FFT
-      std::cout<<"MatPoly Multiprecision FFT : using "<<bas.size()-nbp<<" FFT primes and "<<nbp<<" normal primes "<<std::endl;
-#endif
-    }
-
-    
+  public: 
 
     inline const IntField & field() const { return *_field; }
 
@@ -99,7 +75,7 @@ namespace LinBox{
       _field(&F), _maxnorm(maxnorm) {}
 
     template<typename PMatrix1, typename PMatrix2, typename PMatrix3>
-    void mul (PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b) {
+    void mul (PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b, size_t max_rowdeg=0) {
       //compute a bound on the entry of the input matrix a and b
       FFT_PROFILE_START(2);
       integer maxA,maxB;
@@ -108,10 +84,11 @@ namespace LinBox{
 	maxA=1;maxA<<=uint64_t(logmax(a));
 	maxB=1;maxB<<=uint64_t(logmax(b));
       }
-      integer bound=2*maxA*maxB*uint64_t(a.coldim())*uint64_t(std::min(a.size(),b.size()));
+      integer bound=maxA*maxB*uint64_t(a.coldim())*uint64_t(std::min(a.size(),b.size()));
+      if (_maxnorm==0) bound*=2; //seems to compute over Z, need to double to handle possible negative value
       FFT_PROFILING(2,"max norm computation");
 
-      mul_crtla(c,a,b,maxA,maxB,bound);
+      mul_crtla(c,a,b,maxA,maxB,bound,max_rowdeg);
     }
 
     template<typename PMatrix1, typename PMatrix2, typename PMatrix3>
@@ -125,7 +102,12 @@ namespace LinBox{
 	maxA=1;maxA<<=uint64_t(logmax(a));
 	maxB=1;maxB<<=uint64_t(logmax(b));
       }
-      integer bound=2*maxA*maxB*integer((uint64_t)a.coldim())*integer((uint64_t)std::min(a.size(),b.size()));;
+      //std::cout<<"MIDP RNS bound: "<<maxA<<" "<<maxB<<" "<<a.coldim()<<" "<<a.size()<<" "<<b.size()<<std::endl;
+      
+      //integer bound=2*maxA*maxB*integer((uint64_t)a.coldim())*integer((uint64_t)std::min(a.size(),b.size()));;
+      integer bound=maxA*maxB*integer((uint64_t)a.coldim());
+      if (_maxnorm==0) bound*=2; //seems to compute over Z, need to double to handle possible negative value
+      
       if (smallLeft)
 	bound*= (uint64_t)a.size();
       else
@@ -160,7 +142,7 @@ namespace LinBox{
     // WARNING: Polynomial Matrix should stored as matrix of polynomial with integer coefficient 
     template< typename PMatrix1,typename PMatrix2, typename PMatrix3>
     void mul_crtla(PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b,
-    		   const integer& maxA, const integer& maxB, const integer& bound) {
+    		   const integer& maxA, const integer& maxB, const integer& bound, size_t max_rowdeg=0) {
 
       FFT_PROFILE_START(2);
       linbox_check(a.coldim() == b.rowdim());
@@ -168,6 +150,7 @@ namespace LinBox{
       size_t k = a.coldim();
       size_t n = b.coldim();
       size_t s= a.size()+b.size()-1;
+      if (max_rowdeg!=0) s = max_rowdeg+1;
       c.resize(s);
       size_t lpts=0;
       size_t pts  = 1; while (pts < s) { pts= pts<<1; ++lpts; }
@@ -178,9 +161,9 @@ namespace LinBox{
       //size_t prime_bitsize= (53-lk)>>1;
 
       // compute max prime value for FFLAS      
-      uint64_t prime_max= std::sqrt( (1ULL<<53) / k)+1;
+      uint64_t prime_max= std::min(uint64_t(std::sqrt( (1ULL<<53) / k)+1), uint64_t(Givaro::Modular<double>::maxCardinality()));
       std::vector<integer> bas;
-      getFFTPrime(prime_max,lpts,bound,bas);
+      getFFTPrime(prime_max,lpts,bound,bas,k,s);
       // RandomFFTPrime RdFFT(prime_bitsize);
       // if (!RdFFT.generatePrimes(lpts,bound,bas)){
       // 	std::cout<<"COULD NOT FIND ENOUGH FFT PRIME in MatPoly FFTMUL taking normal primes..."<<std::endl;
@@ -194,6 +177,7 @@ namespace LinBox{
 #ifdef FFT_PROFILER
       //double tMul=0.,tCopy=0;;
       if (FFT_PROF_LEVEL<3){
+	std::cout << "*** MatPoly FFT - MUL ***"<<std::endl;
 	std::cout << "number of FFT primes :" << num_primes << std::endl;
 	std::cout << "max prime            : "<<prime_max<<" ("<<integer(prime_max).bitsize()<<")"<<std::endl;
 	std::cout << "bitsize of the output: "<<bound.bitsize()
@@ -204,33 +188,31 @@ namespace LinBox{
       FFT_PROFILING(2,"init of CRT approach");
       // reduce t_a and t_b modulo each FFT primes
       size_t n_ta=m*k*a.size(), n_tb=k*n*b.size();
-      //size_t n_ta=m*k*pts, n_tb=k*n*pts;
-      //std::cout<<"----------------------------------------------"<<std::endl;
-      //std::cout<<"MUL FFT RNS: "<<MEMINFO<<std::endl;
-      std::cout<<"MUL FFT RNS: need "<<MB((m*n*pts+n_ta+n_tb)*num_primes*8 + 2*(m*k+k*n)*pts*8)<<"Mo"<<std::endl;
-      
-      //std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB((n_ta+n_tb)*num_primes*8)<<"Mo"<<std::endl;
+      ADD_MEM(8*(n_ta+n_tb)*num_primes);
       double* t_a_mod= new double[n_ta*num_primes];
       double* t_b_mod= new double[n_tb*num_primes];
       RNS.init(1, n_ta, t_a_mod, n_ta, a.getPointer(), n_ta, maxA);
       RNS.init(1, n_tb, t_b_mod, n_tb, b.getPointer(), n_tb, maxB);
+      ADD_MEM(n_ta* (maxA.bitsize()/16 + (maxA.bitsize()%16?1:0)) *8); // needed by RNS init
+      DEL_MEM(n_ta* (maxA.bitsize()/16 + (maxA.bitsize()%16?1:0)) *8);
+      ADD_MEM(n_tb* (maxB.bitsize()/16 + (maxB.bitsize()%16?1:0)) *8); // needed by RNS init
+      DEL_MEM(n_tb* (maxB.bitsize()/16 + (maxB.bitsize()%16?1:0)) *8);
+
       FFT_PROFILING(2,"reduction mod pi of input matrices");
 
       std::vector<MatrixP_F*> c_i (num_primes);
-      //std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB((m*n*pts)*num_primes*8)<<"Mo"<<std::endl;
-      //std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB((2*(m*k+k*n)*pts)*8)<<"Mo"<<std::endl;
-
-      // FFT_PROFILE_START(2);
-      // auto sp=SPLITTER();
-      // PARFOR1D(l,num_primes,sp,
+      
       for (size_t l=0;l<num_primes;l++)
 	{
 	  //FFT_PROFILE_START;
 	  ModField f(RNS._basis[l]);
 	  MatrixP_F a_i (f, m, k, pts);
 	  MatrixP_F b_i (f, k, n, pts);
-	
+	  //a_i.changeField(f);
+	  //b_i.changeField(f);
+
 	  c_i[l] = new MatrixP_F(f, m, n, pts);
+
 	  // copy reduced data
 	  for (size_t i=0;i<m*k;i++)
 	    for (size_t j=0;j<a.size();j++)
@@ -238,14 +220,24 @@ namespace LinBox{
 	  for (size_t i=0;i<k*n;i++)
 	    for (size_t j=0;j<b.size();j++)
 	      b_i.ref(i,j)=t_b_mod[l*n_tb+j+i*b.size()];	
+	  //std::cout<<"a"<<l<<":="<<a_i<<";\n";
+	  //std::cout<<"b"<<l<<":="<<b_i<<";\n";
+	  
 	  //FFT_PROFILE_GET(tCopy);
 	  //PolynomialMatrixFFTPrimeMulDomain<ModField> fftdomain (f);
-	  PolynomialMatrixThreePrimesFFTMulDomain<ModField> fftdomain (f);       
-	  fftdomain.mul_fft(lpts, *c_i[l], a_i, b_i);	
+	  PolynomialMatrixThreePrimesFFTMulDomain<ModField> fftdomain (f);
+	  integer bound=integer(RNS._basis[l]-1)*integer(RNS._basis[l]-1)
+	    *integer((uint64_t) k)*integer((uint64_t)std::min(a.size(),b.size()));
+
+	  fftdomain.mul_fft(lpts, *c_i[l], a_i, b_i, bound);
+	  //std::cout<<"c"<<l<<":="<<*c_i[l]<<";\n";
+	  //std::cout<<"p"<<l<<":="<<uint64_t(RNS._basis[l])<<";\n";
 	  //FFT_PROFILE_GET(tMul);
 	}
+      //std::cout<<"MUL FFT RNS: output polmat -> allocating "<<MB(num_primes*c_i[0]->realmeminfo())<<"Mo"<<std::endl;
       //)
       FFT_PROFILING(2,"FFTprime mult+copying");
+      DEL_MEM(8*(n_ta+n_tb)*num_primes);
       delete[] t_a_mod;
       delete[] t_b_mod;
       //FFT_PROFILE(2,"copying linear reduced matrix",tCopy);
@@ -259,7 +251,9 @@ namespace LinBox{
 	// construct contiguous storage for c_i
 	size_t n_tc=m*n*s;
 	//std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB(n_tc*num_primes*8)<<"Mo"<<std::endl;
+	ADD_MEM(8*n_tc*num_primes);
 	double *t_c_mod = new double[n_tc*num_primes];
+	//std::cout<<"MUL FFT RNS: output RNS -> allocating "<<MB((n_tc)*num_primes*8)<<"Mo"<<std::endl;
 	for (size_t l=0;l<num_primes;l++){
 	  for (size_t i=0;i<m*n;i++)
 	    for (size_t j=0;j<s;j++)
@@ -270,8 +264,12 @@ namespace LinBox{
 
 	// reconstruct the result in C
 	RNS.convert(1,n_tc,0,c.getWritePointer(),n_tc, t_c_mod, n_tc);
+	ADD_MEM(n_tc*RNS._ldm*8);
+	DEL_MEM(n_tc*RNS._ldm*8);
+
 	//std::cout<<"MUL FFT RNS: "<<MEMINFO<<std::endl;
 	//std::cout<<"----------------------------------------------"<<std::endl;
+	DEL_MEM(8*n_tc*num_primes);
 	delete[] t_c_mod;
 
       }
@@ -296,9 +294,9 @@ namespace LinBox{
       size_t pts  = 1; while (pts < s) { pts= pts<<1; ++lpts; }
 
       // compute max prime value for FFLAS      
-      uint64_t prime_max= std::sqrt( (1ULL<<53) / k)+1;
+      uint64_t prime_max= std::min(uint64_t(std::sqrt( (1ULL<<53) / k)+1), uint64_t(Givaro::Modular<double>::maxCardinality()));
       std::vector<integer> bas;
-      getFFTPrime(prime_max,lpts,bound,bas);
+      getFFTPrime(prime_max,lpts,bound,bas,k,s);
       
       std::vector<double> basis(bas.size());
       std::copy(bas.begin(),bas.end(),basis.begin());
@@ -326,14 +324,10 @@ namespace LinBox{
  
       // loop for memory saving
       size_t CRT_NBPRIME=4;
+      ADD_MEM(8*(n_ta+n_tb)*CRT_NBPRIME);
       double* t_a_mod= new double[n_ta*CRT_NBPRIME];
       double* t_b_mod= new double[n_tb*CRT_NBPRIME];
-      std::cout<<"MUL FFT RNS: input/output data: "<< MB((n_ta*(maxA.bitsize()+128) +n_tb*(maxB.bitsize()+128) +m*k*s*(bound.bitsize()+128))/8)<<"Mo"<<std::endl;
-      std::cout<<"MUL FFT RNS: initial need "<<MB((m*n*pts+n_ta+n_tb)*num_primes*8 + 2*(m*k+k*n)*pts*8)<<"Mo"<<std::endl;
-      std::cout<<"MUL FFT RNS: RNS  in: "<<MB( (n_ta+n_tb)*CRT_NBPRIME*8)<<"Mo"<<std::endl;
-      std::cout<<"MUL FFT RNS: RNC com: "<<MB(2*(m*k+k*n)*pts*8)<<"Mo"<<std::endl;
-      std::cout<<"MUL FFT RNS: RNS out: "<<MB((m*n*pts)*num_primes*8 )<<"Mo"<<std::endl;
-      
+            
       for(size_t loop=0;loop<num_primes;loop+=CRT_NBPRIME){
 	
 	// create chunk of RNS
@@ -343,14 +337,10 @@ namespace LinBox{
 	FFPACK::rns_double smallRNS(smallBasis);
 	smallRNS.precompute_cst(RNS._ldm);
 
-	//std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB((n_ta+n_tb)*num_primes*8)<<"Mo"<<std::endl;
 	smallRNS.init(1, n_ta, t_a_mod, n_ta, a.getPointer(), n_ta, maxA);
 	smallRNS.init(1, n_tb, t_b_mod, n_tb, b.getPointer(), n_tb, maxB);
 	FFT_PROFILING(2,"reduction mod pi of input matrices");
 
-	//std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB((m*n*pts)*num_primes*8)<<"Mo"<<std::endl;
-	//std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB((2*(m*k+k*n)*pts)*8)<<"Mo"<<std::endl;
-
 	for (size_t l=0;l<rns_chunk;l++)
 	  {	    
 	    //FFT_PROFILE_START;
@@ -368,8 +358,11 @@ namespace LinBox{
 		b_i.ref(i,j)=t_b_mod[l*n_tb+j+i*b.size()];	
 	    //FFT_PROFILE_GET(tCopy);
 	    //PolynomialMatrixFFTPrimeMulDomain<ModField> fftdomain (f);
-	    PolynomialMatrixThreePrimesFFTMulDomain<ModField> fftdomain (f);       
-	    fftdomain.mul_fft(lpts, *c_i[loop+l], a_i, b_i);	
+	    PolynomialMatrixThreePrimesFFTMulDomain<ModField> fftdomain (f);
+	    integer bound=integer(smallRNS._basis[l]-1)*integer(smallRNS._basis[l]-1)
+	      *integer(k)*integer((uint64_t)std::min(a.size(),b.size()));
+	    
+	    fftdomain.mul_fft(lpts, *c_i[loop+l], a_i, b_i, bound);	
 	    //FFT_PROFILE_GET(tMul);
 	  }      
 	FFT_PROFILING(2,"FFTprime mult+copying");
@@ -377,9 +370,9 @@ namespace LinBox{
 	//FFT_PROFILE(2,"FFTprime multiplication",tMul);
 
       } // end of loop for memory saving
-	delete[] t_a_mod;
-	delete[] t_b_mod;
-	
+      DEL_MEM(8*(n_ta+n_tb)*CRT_NBPRIME);
+      delete[] t_a_mod;
+      delete[] t_b_mod;
       
       if (num_primes < 2) {
 	FFT_PROFILE_START(2);	
@@ -389,6 +382,7 @@ namespace LinBox{
 	// construct contiguous storage for c_i
 	size_t n_tc=m*n*s;
 	//std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB(n_tc*num_primes*8)<<"Mo"<<std::endl;
+	ADD_MEM(8*(n_tc)*num_primes);
 	double *t_c_mod = new double[n_tc*num_primes];
 	for (size_t l=0;l<num_primes;l++){
 	  for (size_t i=0;i<m*n;i++)
@@ -400,10 +394,13 @@ namespace LinBox{
 
 	// reconstruct the result in C
 	RNS.convert(1,n_tc,0,c.getWritePointer(),n_tc, t_c_mod, n_tc);
+	ADD_MEM(n_tc*RNS._ldm*8);
+	DEL_MEM(n_tc*RNS._ldm*8);
+	
 	//std::cout<<"MUL FFT RNS: "<<MEMINFO<<std::endl;
 	//std::cout<<"----------------------------------------------"<<std::endl;
+	DEL_MEM(8*n_tc*num_primes);
 	delete[] t_c_mod;
-
       }
       FFT_PROFILING(2,"k prime reconstruction");
       // std::cout<<"CC:="<<c<<std::endl;
@@ -412,6 +409,7 @@ namespace LinBox{
 
 
 
+
     // template< typename PMatrix1,typename PMatrix2, typename PMatrix3>
     // void midproduct_crtla(PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b,
     // 			  const integer& maxA, const integer& maxB, const integer& bound,
@@ -464,12 +462,7 @@ namespace LinBox{
       // compute max prime value for FFLAS
       uint64_t prime_max= std::sqrt( (1ULL<<53) / k)+1;
       std::vector<integer> bas;
-      getFFTPrime(prime_max,lpts,bound,bas);
-      //RandomFFTPrime RdFFT(prime_bitsize);
-      // if (!RdFFT.generatePrimes(bound,bas)){
-      // 	std::cout<<"COULD NOT FIND ENOUGH FFT PRIME in MatPoly FFTMUL exiting..."<<std::endl;
-      // 	throw LinboxError("LinBox ERROR: not enough FFT Prime\n");
-      // }
+      getFFTPrime(prime_max,lpts,bound,bas,k,deg);
 
       std::vector<double> basis(bas.size());
       std::copy(bas.begin(),bas.end(),basis.begin());
@@ -478,7 +471,8 @@ namespace LinBox{
 #ifdef FFT_PROFILER
       double tMul=0.,tCopy=0;;
       if (FFT_PROF_LEVEL<3){
-	std::cout << "number of FFT primes :" << num_primes << std::endl;
+	std::cout << "*** MatPoly FFT - MIDP ***"<<std::endl;
+ 	std::cout << "number of FFT primes :" << num_primes << std::endl;
 	std::cout << "max prime            : "<<prime_max<<" ("<<integer(prime_max).bitsize()<<")"<<std::endl;
 	std::cout << "bitsize of the output: "<<bound.bitsize()
 		  <<"( "<< RNS._M.bitsize()<<" )"<<std::endl;
@@ -488,20 +482,19 @@ namespace LinBox{
       FFT_PROFILING(2,"init of CRT approach");
       // reduce t_a and t_b modulo each FFT primes
       size_t n_ta=m*k*a.size(), n_tb=k*n*b.size();
+      ADD_MEM(8*(n_ta+n_tb)*num_primes);
       double* t_a_mod= new double[n_ta*num_primes];
       double* t_b_mod= new double[n_tb*num_primes];
+      //std::cout<<"MIDP FFT RNS: input RNS -> allocating "<<MB((n_ta+n_tb)*num_primes*8)<<"Mo"<<std::endl;
+
       RNS.init(1, n_ta, t_a_mod, n_ta, a.getPointer(), n_ta, maxA);
       RNS.init(1, n_tb, t_b_mod, n_tb, b.getPointer(), n_tb, maxB);
-      FFT_PROFILING(2,"reduction mod pi of input matrices");
-
-      //std::cout<<"----------------------------------------------"<<std::endl;
-      //std::cout<<"MIDP FFT RNS: "<<MEMINFO<<std::endl;
-      //std::cout<<"MIDP FFT RNS: need "<<MB((m*n*pts+n_ta+n_tb)*num_primes*8 + 2*(m*k+k*n)*pts*8)<<"Mo"<<std::endl;
+      ADD_MEM(n_ta* (maxA.bitsize()/16 + (maxA.bitsize()%16?1:0)) *8); // needed by RNS init
+      DEL_MEM(n_ta* (maxA.bitsize()/16 + (maxA.bitsize()%16?1:0)) *8);
+      ADD_MEM(n_tb* (maxB.bitsize()/16 + (maxB.bitsize()%16?1:0)) *8); // needed by RNS init
+      DEL_MEM(n_tb* (maxB.bitsize()/16 + (maxB.bitsize()%16?1:0)) *8);
 
-      
-      //std::cout<<"MIDP FFT RNS: RNS -> allocating "<<MB((n_ta+n_tb)*num_primes*8)<<"Mo"<<std::endl;
-      //std::cout<<"MIDP FFT RNS: RNS -> allocating "<<MB((m*n)*pts*num_primes*8)<<"Mo"<<std::endl;
-      //std::cout<<"MIDP FFT RNS: "<<MEMINFO<<std::endl;
+      FFT_PROFILING(2,"reduction mod pi of input matrices");
 
       std::vector<MatrixP_F*> c_i (num_primes);
 
@@ -532,8 +525,11 @@ namespace LinBox{
 				
 	FFT_PROFILE_GET(2,tMul);
       }
+
+      DEL_MEM(8*(n_ta+n_tb)*num_primes);
       delete[] t_a_mod;
       delete[] t_b_mod;
+
       FFT_PROFILE(2,"copying linear reduced matrix",tCopy);
       FFT_PROFILE(2,"FFTprime multiplication",tMul);
 
@@ -545,7 +541,9 @@ namespace LinBox{
 	// construct contiguous storage for c_i
 	double *t_c_mod;
 	size_t n_tc=m*n*c.size();
+	ADD_MEM(8*(n_tc)*num_primes);
 	t_c_mod = new double[n_tc*num_primes];
+	//std::cout<<"MIDP FFT RNS: output RNS -> allocating "<<MB((n_tc)*num_primes*8)<<"Mo"<<std::endl;
 	for (size_t l=0;l<num_primes;l++){
 	  for (size_t i=0;i<m*n;i++)
 	    for (size_t j=0;j<c.size();j++)
@@ -556,12 +554,12 @@ namespace LinBox{
 
 	// reconstruct the result in C
 	RNS.convert(1,n_tc,0,c.getWritePointer(),n_tc, t_c_mod, n_tc);
-	//std::cout<<"MIDP FFT RNS: "<<MEMINFO<<std::endl;
-	delete[] t_c_mod;
-
-	//std::cout<<"MUL FFT RNS: "<<MEMINFO<<std::endl;
-	//std::cout<<"----------------------------------------------"<<std::endl;
+	ADD_MEM(n_tc*RNS._ldm*8); // needed by RNS
+	DEL_MEM(n_tc*RNS._ldm*8);
 
+	DEL_MEM(8*n_tc*num_primes);
+	delete[] t_c_mod;
+	
 	FFT_PROFILING(2,"k prime reconstruction");
       }
     }
@@ -594,7 +592,7 @@ namespace LinBox{
     }
 
     template<typename Matrix1, typename Matrix2, typename Matrix3>
-    void mul (Matrix1 &c, const Matrix2 &a, const Matrix3 &b) {
+    void mul (Matrix1 &c, const Matrix2 &a, const Matrix3 &b, size_t max_rowdeg=0) {
       FFT_PROFILE_START(2);
       MatrixP_F a2(field(),a.rowdim(),a.coldim(),a.size());
       MatrixP_F b2(field(),b.rowdim(),b.coldim(),b.size());
@@ -602,7 +600,7 @@ namespace LinBox{
       a2.copy(a,0,a.size()-1);
       b2.copy(b,0,b.size()-1);
       FFT_PROFILING(2,"converting rep of input");
-      mul(c2,a2,b2);
+      mul(c2,a2,b2, max_rowdeg);
       FFT_PROFILE_START(2);
       c.copy(c2,0,c.size()-1);
       FFT_PROFILING(2,"converting rep of output");
@@ -610,14 +608,18 @@ namespace LinBox{
     }
     
     // Matrix with polynomials  
-    void mul (MatrixP_F &c, const MatrixP_F &a, const MatrixP_F &b) {
-      
+    void mul (MatrixP_F &c, const MatrixP_F &a, const MatrixP_F &b, size_t max_rowdeg=0) {
+
       FFT_PROFILE_START(2);
       IntField Z;      
       PolynomialMatrixFFTMulDomain<IntField> Zmul(Z,_p);
       integer bound=2*_p*_p*integer((uint64_t)a.coldim())*integer((uint64_t)std::min(a.size(),b.size()));
-      //Zmul.mul_crtla(c,a,b,_p,_p,bound);
+#ifdef TRY1
       Zmul.mul_crtla2(c,a,b,_p,_p,bound); 
+#else
+      Zmul.mul_crtla(c,a,b,_p,_p,bound, max_rowdeg);
+#endif
+      
       
       // reduce the result mod p
       FFT_PROFILE_START(2);
@@ -658,11 +660,6 @@ namespace LinBox{
     }
   };
 
-
-
-
-
-
 } // end of namespace LinBox
 
 #endif // __LINBOX_matpoly_mult_ftt_multiprecision_INL
diff --git a/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-multiprecision.inl b/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-recint.inl
similarity index 52%
copy from linbox/algorithms/polynomial-matrix/matpoly-mult-fft-multiprecision.inl
copy to linbox/algorithms/polynomial-matrix/matpoly-mult-fft-recint.inl
index 743d14b..447cc65 100644
--- a/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-multiprecision.inl
+++ b/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-recint.inl
@@ -1,11 +1,9 @@
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
 /*
- * Copyright (C) 2013  Pascal Giorgi
- *                     Romain Lebreton
+ * Copyright (C) 2015  Pascal Giorgi
  *
  * Written by Pascal Giorgi   <pascal.giorgi@lirmm.fr>
- *            Romain Lebreton <lebreton@lirmm.fr>
  *
  * ========LICENCE========
  * This file is part of the library LinBox.
@@ -25,10 +23,11 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  * ========LICENCE========
  */
-#ifndef __LINBOX_matpoly_mult_ftt_multiprecision_INL
-#define __LINBOX_matpoly_mult_ftt_multiprecision_INL
+#ifndef __LINBOX_matpoly_mult_ftt_recint_INL
+#define __LINBOX_matpoly_mult_ftt_recint_INL
 
 #include <givaro/zring.h>
+#include <recint/rint.h>
 #include "linbox/ring/modular.h"
 #include "linbox/randiter/random-fftprime.h"
 #include "linbox/randiter/random-prime.h"
@@ -37,17 +36,24 @@
 #ifndef MEMINFO
 #define MEMINFO ""
 #endif
+
+#ifdef LOW_MEMORY_PMBASIS
+#define MEMFACTOR 4
+#define CRT_SIZE 3
+#endif
+
 namespace LinBox{
 
   /***************************************************
    **** Polynomial Matrix Multiplication over Z[x] ***
    ***************************************************/
-  template<>
-  class PolynomialMatrixFFTMulDomain<Givaro::ZRing<integer> > {
+  template<size_t K >
+  class PolynomialMatrixFFTMulDomain<Givaro::ZRing<RecInt::ruint<K> > > {
   public:
-    typedef Givaro::ZRing<integer>       IntField;
+    typedef Givaro::ZRing<RecInt::ruint<K> >   IntField;
+    typedef RecInt::ruint<K>    Element;
     //typedef Givaro::Modular<uint32_t>     ModField;
-    typedef Givaro::Modular<double>                ModField;
+    typedef Givaro::Modular<double>         ModField;
     typedef PolynomialMatrix<PMType::polfirst,PMStorage::plain,ModField> MatrixP_F; // Polynomial matrix stored as a matrix of polynomials
     typedef PolynomialMatrix<PMType::polfirst,PMStorage::plain,IntField> MatrixP_I; // Polynomial matrix stored as a matrix of polynomials
 
@@ -56,39 +62,11 @@ namespace LinBox{
     integer           _maxnorm;
 
     template<typename PMatrix1>
-    size_t logmax(const PMatrix1 A) const {
-      size_t mm=A.get(0,0,0).bitsize();
-      for(size_t k=0;k<A.size();k++)
-	for (size_t i=0;i<A.rowdim()*A.coldim();i++){
-	  size_t tmp=A.get(i,k).bitsize();
-	  mm=std::max(mm,tmp);
-	}
-      return mm;
+    size_t logmax(const PMatrix1& A) const {
+      return size_t(1)<<K;
     }
 
   public:
-    void getFFTPrime(uint64_t prime_max, size_t lpts, integer bound, std::vector<integer> &bas){
-
-      RandomFFTPrime RdFFT(prime_max);
-      size_t nbp=0;
-      if (!RdFFT.generatePrimes(lpts,bound,bas)){
-	integer MM=1;
-	for(std::vector<integer>::size_type i=0;i<bas.size();i++)
-	  MM*=bas[i];
-	RandomPrimeIter Rd(integer(prime_max).bitsize());
-	integer tmp;
-	do {
-	  do {Rd.random(tmp);}
-	  while (MM%tmp==0);
-	  bas.push_back(tmp);
-	  nbp++;
-	  MM*=tmp;
-	} while (MM<bound);	
-      }
-#ifdef VERBOSE_FFT
-      std::cout<<"MatPoly Multiprecision FFT : using "<<bas.size()-nbp<<" FFT primes and "<<nbp<<" normal primes "<<std::endl;
-#endif
-    }
 
     
 
@@ -99,7 +77,7 @@ namespace LinBox{
       _field(&F), _maxnorm(maxnorm) {}
 
     template<typename PMatrix1, typename PMatrix2, typename PMatrix3>
-    void mul (PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b) {
+    void mul (PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b, size_t max_rowdeg=0) {
       //compute a bound on the entry of the input matrix a and b
       FFT_PROFILE_START(2);
       integer maxA,maxB;
@@ -108,11 +86,13 @@ namespace LinBox{
 	maxA=1;maxA<<=uint64_t(logmax(a));
 	maxB=1;maxB<<=uint64_t(logmax(b));
       }
-      integer bound=2*maxA*maxB*uint64_t(a.coldim())*uint64_t(std::min(a.size(),b.size()));
+      integer bound=maxA*maxB*uint64_t(a.coldim())*uint64_t(std::min(a.size(),b.size()));
+      if (_maxnorm==0) bound*=2; //seems to compute over Z, need to double to handle possible negative value
       FFT_PROFILING(2,"max norm computation");
 
-      mul_crtla(c,a,b,maxA,maxB,bound);
+      mul_crtla(c,a,b,maxA,maxB,bound, max_rowdeg);
     }
+    
 
     template<typename PMatrix1, typename PMatrix2, typename PMatrix3>
     void midproduct (PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b,
@@ -125,7 +105,8 @@ namespace LinBox{
 	maxA=1;maxA<<=uint64_t(logmax(a));
 	maxB=1;maxB<<=uint64_t(logmax(b));
       }
-      integer bound=2*maxA*maxB*integer((uint64_t)a.coldim())*integer((uint64_t)std::min(a.size(),b.size()));;
+      integer bound=maxA*maxB*integer((uint64_t)a.coldim());
+      if (_maxnorm==0) bound*=2; //seems to compute over Z, need to double to handle possible negative value
       if (smallLeft)
 	bound*= (uint64_t)a.size();
       else
@@ -137,56 +118,33 @@ namespace LinBox{
     }
 
 
-    // template< typename PMatrix1,typename PMatrix2, typename PMatrix3>
-    // void mul_crtla(PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b,
-    // 		   const integer& maxA, const integer& maxB, const integer& bound) {
-    //   // (convert to MatrixP representation)
-    //   FFT_PROFILE_START;
-    //   MatrixP_I a2(field(),a.rowdim(),a.coldim(),a.size());
-    //   MatrixP_I b2(field(),b.rowdim(),b.coldim(),b.size());
-    //   a2.copy(a,0,a.size()-1);
-    //   b2.copy(b,0,b.size()-1);
-    //   MatrixP_I c2(field(),c.rowdim(),c.coldim(),c.size());
-    //   FFT_PROFILING(2,"converting rep of input matrices");
-    //   mul_crtla(c2,a2,b2,maxA,maxB,bound);
-    //   c.copy(c2,0,c.size()-1);
-    //   FFT_PROFILING(2,"converting rep of output matrices");
-    // }
-
-
-    // void mul_crtla(MatrixP_I &c, const MatrixP_I &a, const MatrixP_I &b,
-    // 		   const integer& maxA, const integer& maxB, const integer& bound){
-
-    // WARNING: Polynomial Matrix should stored as matrix of polynomial with integer coefficient 
+    
+    // WARNING: Polynomial Matrix should stored as matrix of polynomial with integer coefficient
+    // outputsize -> its the size of the output if known in advance and less than a.size()+b.size()-1
     template< typename PMatrix1,typename PMatrix2, typename PMatrix3>
     void mul_crtla(PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b,
-    		   const integer& maxA, const integer& maxB, const integer& bound) {
-
+		   const integer& maxA, const integer& maxB, const integer& bound, size_t max_rowdeg=0) {
+      //std::cout<<"MUL CRT LA STARTING: "<<STR_MEMINFO<<std::endl;      
       FFT_PROFILE_START(2);
       linbox_check(a.coldim() == b.rowdim());
       size_t m = a.rowdim();
       size_t k = a.coldim();
       size_t n = b.coldim();
-      size_t s= a.size()+b.size()-1;
+      size_t s= a.size()+b.size()-1; // MUST BE CHANGED TO the 0-rowdeg of (a.b)
+      if (max_rowdeg!=0) s = max_rowdeg+1;
       c.resize(s);
       size_t lpts=0;
       size_t pts  = 1; while (pts < s) { pts= pts<<1; ++lpts; }
 
-      // compute bit size of feasible prime for FFLAS
-      // size_t _k=k,lk=0;
-      //while ( _k ) {_k>>=1; ++lk;}
-      //size_t prime_bitsize= (53-lk)>>1;
-
+      //std::cout<<"MULCRT_LA: "<<c.size()<<" -> "<<a.size()<<"x"<<b.size()<<" (nb pts=2^"<<lpts<<")\n";
+      
       // compute max prime value for FFLAS      
-      uint64_t prime_max= std::sqrt( (1ULL<<53) / k)+1;
+      //uint64_t prime_max=std::sqrt( (1ULL<<53) /k)+1;
+      uint64_t prime_max=maxFFTPrimeValue(k,pts); // CAREFUL: only for Modular<double>
+      
       std::vector<integer> bas;
-      getFFTPrime(prime_max,lpts,bound,bas);
-      // RandomFFTPrime RdFFT(prime_bitsize);
-      // if (!RdFFT.generatePrimes(lpts,bound,bas)){
-      // 	std::cout<<"COULD NOT FIND ENOUGH FFT PRIME in MatPoly FFTMUL taking normal primes..."<<std::endl;
-      //        exit(1);
-      // }
-	
+      getFFTPrime(prime_max,lpts,bound,bas,k,s);
+      
       std::vector<double> basis(bas.size());
       std::copy(bas.begin(),bas.end(),basis.begin());
       FFPACK::rns_double RNS(basis);
@@ -203,158 +161,61 @@ namespace LinBox{
 #endif
       FFT_PROFILING(2,"init of CRT approach");
       // reduce t_a and t_b modulo each FFT primes
-      size_t n_ta=m*k*a.size(), n_tb=k*n*b.size();
-      //size_t n_ta=m*k*pts, n_tb=k*n*pts;
-      //std::cout<<"----------------------------------------------"<<std::endl;
-      //std::cout<<"MUL FFT RNS: "<<MEMINFO<<std::endl;
-      std::cout<<"MUL FFT RNS: need "<<MB((m*n*pts+n_ta+n_tb)*num_primes*8 + 2*(m*k+k*n)*pts*8)<<"Mo"<<std::endl;
-      
-      //std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB((n_ta+n_tb)*num_primes*8)<<"Mo"<<std::endl;
+      size_t n_ta=m*k*a.size(), n_tb=k*n*b.size();      
+      std::vector<MatrixP_F*> c_i (num_primes);
+
+#ifndef LOW_MEMORY_PMBASIS 
+      ADD_MEM(8*(n_ta+n_tb)*num_primes);      
       double* t_a_mod= new double[n_ta*num_primes];
       double* t_b_mod= new double[n_tb*num_primes];
       RNS.init(1, n_ta, t_a_mod, n_ta, a.getPointer(), n_ta, maxA);
       RNS.init(1, n_tb, t_b_mod, n_tb, b.getPointer(), n_tb, maxB);
       FFT_PROFILING(2,"reduction mod pi of input matrices");
-
-      std::vector<MatrixP_F*> c_i (num_primes);
-      //std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB((m*n*pts)*num_primes*8)<<"Mo"<<std::endl;
-      //std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB((2*(m*k+k*n)*pts)*8)<<"Mo"<<std::endl;
-
-      // FFT_PROFILE_START(2);
-      // auto sp=SPLITTER();
-      // PARFOR1D(l,num_primes,sp,
+      
+      FFT_PROFILE_START(2);
       for (size_t l=0;l<num_primes;l++)
-	{
-	  //FFT_PROFILE_START;
-	  ModField f(RNS._basis[l]);
-	  MatrixP_F a_i (f, m, k, pts);
-	  MatrixP_F b_i (f, k, n, pts);
-	
-	  c_i[l] = new MatrixP_F(f, m, n, pts);
-	  // copy reduced data
-	  for (size_t i=0;i<m*k;i++)
-	    for (size_t j=0;j<a.size();j++)
-	      a_i.ref(i,j)=t_a_mod[l*n_ta+j+i*a.size()];
-	  for (size_t i=0;i<k*n;i++)
-	    for (size_t j=0;j<b.size();j++)
-	      b_i.ref(i,j)=t_b_mod[l*n_tb+j+i*b.size()];	
-	  //FFT_PROFILE_GET(tCopy);
-	  //PolynomialMatrixFFTPrimeMulDomain<ModField> fftdomain (f);
-	  PolynomialMatrixThreePrimesFFTMulDomain<ModField> fftdomain (f);       
-	  fftdomain.mul_fft(lpts, *c_i[l], a_i, b_i);	
-	  //FFT_PROFILE_GET(tMul);
-	}
-      //)
+	       {
+		 //FFT_PROFILE_START;
+		 ModField f(RNS._basis[l]);
+		 MatrixP_F a_i (f, m, k, pts);
+		 MatrixP_F b_i (f, k, n, pts);
+		 
+		 c_i[l] = new MatrixP_F(f, m, n, pts);
+		 // copy reduced data
+		 for (size_t i=0;i<m*k;i++)
+		   for (size_t j=0;j<a.size();j++)
+		     a_i.ref(i,j)=t_a_mod[l*n_ta+j+i*a.size()];
+		 for (size_t i=0;i<k*n;i++)
+		   for (size_t j=0;j<b.size();j++)
+		     b_i.ref(i,j)=t_b_mod[l*n_tb+j+i*b.size()];	
+		 PolynomialMatrixThreePrimesFFTMulDomain<ModField> fftdomain (f);
+		 integer bound=integer(RNS._basis[l]-1)*integer(RNS._basis[l]-1)
+		   *integer((uint64_t) k)*integer((uint64_t)std::min(a.size(),b.size()));
+		 
+		 fftdomain.mul_fft(lpts, *c_i[l], a_i, b_i, bound);
+	       }      
       FFT_PROFILING(2,"FFTprime mult+copying");
+      DEL_MEM(8*(n_ta+n_tb)*num_primes);
       delete[] t_a_mod;
       delete[] t_b_mod;
-      //FFT_PROFILE(2,"copying linear reduced matrix",tCopy);
-      //FFT_PROFILE(2,"FFTprime multiplication",tMul);
-      
-      if (num_primes < 2) {
-	FFT_PROFILE_START(2);	
-	c.copy(*c_i[0],0,s-1);
-      } else {
-	FFT_PROFILE_START(2);
-	// construct contiguous storage for c_i
-	size_t n_tc=m*n*s;
-	//std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB(n_tc*num_primes*8)<<"Mo"<<std::endl;
-	double *t_c_mod = new double[n_tc*num_primes];
-	for (size_t l=0;l<num_primes;l++){
-	  for (size_t i=0;i<m*n;i++)
-	    for (size_t j=0;j<s;j++)
-	      t_c_mod[l*n_tc + (j+i*s)]= c_i[l]->get(i,j);
-	  delete c_i[l];
-	}
-	FFT_PROFILING(2,"linearization of results mod pi");
-
-	// reconstruct the result in C
-	RNS.convert(1,n_tc,0,c.getWritePointer(),n_tc, t_c_mod, n_tc);
-	//std::cout<<"MUL FFT RNS: "<<MEMINFO<<std::endl;
-	//std::cout<<"----------------------------------------------"<<std::endl;
-	delete[] t_c_mod;
-
-      }
-      FFT_PROFILING(2,"k prime reconstruction");
-      // std::cout<<"CC:="<<c<<std::endl;
-      // std::cout<<"<-----------------: "<<std::endl;;
-    }
-
-    // WARNING: Polynomial Matrix should stored as matrix of polynomial with integer coefficient 
-    template< typename PMatrix1,typename PMatrix2, typename PMatrix3>
-    void mul_crtla2(PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b,
-		    const integer& maxA, const integer& maxB, const integer& bound) {
-
-      FFT_PROFILE_START(2);
-      linbox_check(a.coldim() == b.rowdim());
-      size_t m = a.rowdim();
-      size_t k = a.coldim();
-      size_t n = b.coldim();
-      size_t s= a.size()+b.size()-1;
-      c.resize(s);
-      size_t lpts=0;
-      size_t pts  = 1; while (pts < s) { pts= pts<<1; ++lpts; }
-
-      // compute max prime value for FFLAS      
-      uint64_t prime_max= std::sqrt( (1ULL<<53) / k)+1;
-      std::vector<integer> bas;
-      getFFTPrime(prime_max,lpts,bound,bas);
-      
-      std::vector<double> basis(bas.size());
-      std::copy(bas.begin(),bas.end(),basis.begin());
-      FFPACK::rns_double RNS(basis);
-      size_t num_primes = RNS._size;
-
-
-#ifdef FFT_PROFILER
-      //double tMul=0.,tCopy=0;;
-      if (FFT_PROF_LEVEL<3){
-	std::cout << "number of FFT primes :" << num_primes << std::endl;
-	std::cout << "max prime            : "<<prime_max<<" ("<<integer(prime_max).bitsize()<<")"<<std::endl;
-	std::cout << "bitsize of the output: "<<bound.bitsize()
-		  <<"( "<< RNS._M.bitsize()<<" )"<<std::endl;
-	std::cout <<" +++++++++++++++++++++++++++++++"<<std::endl;
-      }
-#endif
-
-
-      FFT_PROFILING(2,"init of CRT approach");
-      // reduce t_a and t_b modulo each FFT primes
-      size_t n_ta=m*k*a.size(), n_tb=k*n*b.size();
-      std::vector<MatrixP_F*> c_i (num_primes);
-
- 
-      // loop for memory saving
-      size_t CRT_NBPRIME=4;
+#else
+      size_t CRT_NBPRIME=CRT_SIZE;
+      ADD_MEM(8*(n_ta+n_tb)*CRT_NBPRIME);
       double* t_a_mod= new double[n_ta*CRT_NBPRIME];
       double* t_b_mod= new double[n_tb*CRT_NBPRIME];
-      std::cout<<"MUL FFT RNS: input/output data: "<< MB((n_ta*(maxA.bitsize()+128) +n_tb*(maxB.bitsize()+128) +m*k*s*(bound.bitsize()+128))/8)<<"Mo"<<std::endl;
-      std::cout<<"MUL FFT RNS: initial need "<<MB((m*n*pts+n_ta+n_tb)*num_primes*8 + 2*(m*k+k*n)*pts*8)<<"Mo"<<std::endl;
-      std::cout<<"MUL FFT RNS: RNS  in: "<<MB( (n_ta+n_tb)*CRT_NBPRIME*8)<<"Mo"<<std::endl;
-      std::cout<<"MUL FFT RNS: RNC com: "<<MB(2*(m*k+k*n)*pts*8)<<"Mo"<<std::endl;
-      std::cout<<"MUL FFT RNS: RNS out: "<<MB((m*n*pts)*num_primes*8 )<<"Mo"<<std::endl;
       
-      for(size_t loop=0;loop<num_primes;loop+=CRT_NBPRIME){
-	
+      for(size_t loop=0;loop<num_primes;loop+=CRT_NBPRIME){	
 	// create chunk of RNS
 	size_t rns_chunk=std::min(CRT_NBPRIME,num_primes-loop); // nbr of primes in the current smallRNS basis
 	std::vector<double> smallBasis(rns_chunk);
 	std::copy(basis.begin()+loop,basis.begin()+loop+rns_chunk,smallBasis.begin());
 	FFPACK::rns_double smallRNS(smallBasis);
-	smallRNS.precompute_cst(RNS._ldm);
-
-	//std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB((n_ta+n_tb)*num_primes*8)<<"Mo"<<std::endl;
+	smallRNS.precompute_cst(RNS._ldm);	
 	smallRNS.init(1, n_ta, t_a_mod, n_ta, a.getPointer(), n_ta, maxA);
 	smallRNS.init(1, n_tb, t_b_mod, n_tb, b.getPointer(), n_tb, maxB);
 	FFT_PROFILING(2,"reduction mod pi of input matrices");
-
-	//std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB((m*n*pts)*num_primes*8)<<"Mo"<<std::endl;
-	//std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB((2*(m*k+k*n)*pts)*8)<<"Mo"<<std::endl;
-
 	for (size_t l=0;l<rns_chunk;l++)
-	  {	    
-	    //FFT_PROFILE_START;
-	    //std::cout<<"prime: "<<(long)smallRNS._basis[l]<<std::endl;
+	  {
 	    ModField f(smallRNS._basis[l]);
 	    MatrixP_F a_i (f, m, k, pts);
 	    MatrixP_F b_i (f, k, n, pts);	
@@ -366,30 +227,32 @@ namespace LinBox{
 	    for (size_t i=0;i<k*n;i++)
 	      for (size_t j=0;j<b.size();j++)
 		b_i.ref(i,j)=t_b_mod[l*n_tb+j+i*b.size()];	
-	    //FFT_PROFILE_GET(tCopy);
-	    //PolynomialMatrixFFTPrimeMulDomain<ModField> fftdomain (f);
-	    PolynomialMatrixThreePrimesFFTMulDomain<ModField> fftdomain (f);       
-	    fftdomain.mul_fft(lpts, *c_i[loop+l], a_i, b_i);	
-	    //FFT_PROFILE_GET(tMul);
+	    
+	    PolynomialMatrixThreePrimesFFTMulDomain<ModField> fftdomain (f);
+	    integer bound=integer(smallRNS._basis[l]-1)*integer(smallRNS._basis[l]-1)
+	      *integer((int64_t)k)*integer((uint64_t)std::min(a.size(),b.size()));
+	    
+	    fftdomain.mul_fft(lpts, *c_i[loop+l], a_i, b_i, bound);		    
 	  }      
 	FFT_PROFILING(2,"FFTprime mult+copying");
-	//FFT_PROFILE(2,"copying linear reduced matrix",tCopy);
-	//FFT_PROFILE(2,"FFTprime multiplication",tMul);
-
       } // end of loop for memory saving
-	delete[] t_a_mod;
-	delete[] t_b_mod;
-	
+      DEL_MEM(8*(n_ta+n_tb)*CRT_NBPRIME);
+      delete[] t_a_mod;
+      delete[] t_b_mod;
+#endif
       
-      if (num_primes < 2) {
+      if (false && num_primes < 2) {
 	FFT_PROFILE_START(2);	
-	c.copy(*c_i[0],0,s-1);
+	//c.copy(*c_i[0],0,s-1);
       } else {
 	FFT_PROFILE_START(2);
+
+#ifndef LOW_MEMORY_PMBASIS
 	// construct contiguous storage for c_i
 	size_t n_tc=m*n*s;
-	//std::cout<<"MUL FFT RNS: RNS -> allocating "<<MB(n_tc*num_primes*8)<<"Mo"<<std::endl;
+	ADD_MEM(8*n_tc*num_primes);
 	double *t_c_mod = new double[n_tc*num_primes];
+	//std::cout<<"RNS OUT ALLOC done: "<<STR_MEMINFO<<std::endl;      
 	for (size_t l=0;l<num_primes;l++){
 	  for (size_t i=0;i<m*n;i++)
 	    for (size_t j=0;j<s;j++)
@@ -399,32 +262,57 @@ namespace LinBox{
 	FFT_PROFILING(2,"linearization of results mod pi");
 
 	// reconstruct the result in C
-	RNS.convert(1,n_tc,0,c.getWritePointer(),n_tc, t_c_mod, n_tc);
-	//std::cout<<"MUL FFT RNS: "<<MEMINFO<<std::endl;
-	//std::cout<<"----------------------------------------------"<<std::endl;
+	RNS.convert(1,n_tc,0,c.getWritePointer(),n_tc, t_c_mod, n_tc, _maxnorm);
+	//std::cout<<"RNS OUT COMP done: "<<STR_MEMINFO<<std::endl;      
+	DEL_MEM(8*n_tc*num_primes);
 	delete[] t_c_mod;
-
+#else
+	size_t s_small= s/MEMFACTOR + 1;
+	size_t s_last = s- s_small*(MEMFACTOR-1);
+	size_t n_tc_small= m*n*s_small;
+	size_t n_tc_last = m*n*s_last;
+	{
+	  ADD_MEM(8*n_tc_small*num_primes);
+	  //std::cout<<"RNS OUT ALLOC done: "<<STR_MEMINFO<<std::endl;      
+	  double *t_c_mod = new double[n_tc_small*num_primes];
+	  for (size_t memiter=0;memiter<MEMFACTOR-1;memiter++){	 
+	    for (size_t l=0;l<num_primes;l++){
+	      for (size_t i=0;i<m*n;i++)
+		for (size_t j=0;j<s_small;j++)
+		  t_c_mod[l*n_tc_small + (j+i*s_small)]= c_i[l]->get(i,memiter*s_small+j);
+	    }	
+	    // reconstruct the result in C
+	    RNS.convert(m*n,s_small,0,c.getWritePointer()+memiter*s_small,s, t_c_mod, n_tc_small, _maxnorm);
+	    //std::cout<<"RNS OUT COMP done: "<<STR_MEMINFO<<std::endl;      
+	  }
+	  DEL_MEM(8*n_tc_small*num_primes);
+	  delete[] t_c_mod;
+	}	
+	{
+	  ADD_MEM(8*n_tc_last*num_primes);
+	  double *t_c_mod = new double[n_tc_last*num_primes];
+	  // perform the last step
+	  for (size_t l=0;l<num_primes;l++){
+	    for (size_t i=0;i<m*n;i++)
+	      for (size_t j=0;j<s_last;j++)
+		t_c_mod[l*n_tc_last + (j+i*s_last)]= c_i[l]->get(i,(MEMFACTOR-1)*s_small+j);
+	    delete c_i[l];
+	  }	  
+	  // reconstruct the result in C
+	  RNS.convert(m*n,s_last,0,c.getWritePointer()+(MEMFACTOR-1)*s_small,s, t_c_mod, n_tc_last, _maxnorm);
+	  DEL_MEM(8*n_tc_last*num_primes);
+	  delete[] t_c_mod;
+	}
+	
+#endif
       }
+      
+      //      std::cout<<"c"<<":="<<c<<";\n";
       FFT_PROFILING(2,"k prime reconstruction");
       // std::cout<<"CC:="<<c<<std::endl;
       // std::cout<<"<-----------------: "<<std::endl;;
     }
-
-
-
-    // template< typename PMatrix1,typename PMatrix2, typename PMatrix3>
-    // void midproduct_crtla(PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b,
-    // 			  const integer& maxA, const integer& maxB, const integer& bound,
-    // 			  bool smallLeft=true, size_t n0=0, size_t n1=0) {
-    //   // (convert to MatrixP representation)
-    //   MatrixP_I a2(field(),a.rowdim(),a.coldim(),a.size());
-    //   MatrixP_I b2(field(),b.rowdim(),b.coldim(),b.size());
-    //   a2.copy(a,0,a.size()-1);
-    //   b2.copy(b,0,b.size()-1);
-    //   MatrixP_I c2(field(),c.rowdim(),c.coldim(),c.size());
-    //   midproduct_crtla(c2,a2,b2,maxA,maxB,bound,smallLeft,n0,n1);
-    //   c.copy(c2,0,c2.size()-1);
-    // }
+    
 
     // WARNING: Polynomial Matrix should stored as matrix of polynomial with integer coefficient 
     template< typename PMatrix1,typename PMatrix2, typename PMatrix3>
@@ -444,10 +332,10 @@ namespace LinBox{
       linbox_check(c.size()>=deg-hdeg);
       
       if (smallLeft){
-      	linbox_check(b.size()<hdeg+deg);
+	linbox_check(b.size()<hdeg+deg);
       }
       else
-      	linbox_check(a.size()<hdeg+deg);
+	linbox_check(a.size()<hdeg+deg);
 
       //linbox_check(2*c.size()-1 == b.size());
       //size_t deg= b.size()+1;
@@ -455,22 +343,11 @@ namespace LinBox{
       size_t lpts=0;
       size_t pts  = 1; while (pts < deg) { pts= pts<<1; ++lpts; }
 
-
-      // compute bit size of feasible prime for FFLAS
-      // size_t _k=k,lk=0;
-      //while ( _k ) {_k>>=1; ++lk;}
-      //size_t prime_bitsize= (53-lk)>>1;
-
       // compute max prime value for FFLAS
       uint64_t prime_max= std::sqrt( (1ULL<<53) / k)+1;
       std::vector<integer> bas;
-      getFFTPrime(prime_max,lpts,bound,bas);
-      //RandomFFTPrime RdFFT(prime_bitsize);
-      // if (!RdFFT.generatePrimes(bound,bas)){
-      // 	std::cout<<"COULD NOT FIND ENOUGH FFT PRIME in MatPoly FFTMUL exiting..."<<std::endl;
-      // 	throw LinboxError("LinBox ERROR: not enough FFT Prime\n");
-      // }
-
+      getFFTPrime(prime_max,lpts,bound,bas,k,deg);
+      
       std::vector<double> basis(bas.size());
       std::copy(bas.begin(),bas.end(),basis.begin());
       FFPACK::rns_double RNS(basis);
@@ -486,24 +363,21 @@ namespace LinBox{
       }
 #endif
       FFT_PROFILING(2,"init of CRT approach");
-      // reduce t_a and t_b modulo each FFT primes
+
       size_t n_ta=m*k*a.size(), n_tb=k*n*b.size();
+      std::vector<MatrixP_F*> c_i (num_primes);
+      
+#ifndef LOW_MEMORY_PMBASIS 
+      // reduce t_a and t_b modulo each FFT primes
+      ADD_MEM(8*(n_ta+n_tb)*num_primes);
       double* t_a_mod= new double[n_ta*num_primes];
       double* t_b_mod= new double[n_tb*num_primes];
+
       RNS.init(1, n_ta, t_a_mod, n_ta, a.getPointer(), n_ta, maxA);
       RNS.init(1, n_tb, t_b_mod, n_tb, b.getPointer(), n_tb, maxB);
       FFT_PROFILING(2,"reduction mod pi of input matrices");
-
-      //std::cout<<"----------------------------------------------"<<std::endl;
-      //std::cout<<"MIDP FFT RNS: "<<MEMINFO<<std::endl;
-      //std::cout<<"MIDP FFT RNS: need "<<MB((m*n*pts+n_ta+n_tb)*num_primes*8 + 2*(m*k+k*n)*pts*8)<<"Mo"<<std::endl;
-
       
-      //std::cout<<"MIDP FFT RNS: RNS -> allocating "<<MB((n_ta+n_tb)*num_primes*8)<<"Mo"<<std::endl;
-      //std::cout<<"MIDP FFT RNS: RNS -> allocating "<<MB((m*n)*pts*num_primes*8)<<"Mo"<<std::endl;
-      //std::cout<<"MIDP FFT RNS: "<<MEMINFO<<std::endl;
 
-      std::vector<MatrixP_F*> c_i (num_primes);
 
       for (size_t l=0;l<num_primes;l++){
 	FFT_PROFILE_START(2);
@@ -526,14 +400,76 @@ namespace LinBox{
 	    else
 	      b_i.ref(i,hdeg-1-j)=t_b_mod[l*n_tb+j+i*b.size()];
 	FFT_PROFILE_GET(2,tCopy);
-	//PolynomialMatrixFFTPrimeMulDomain<ModField> fftdomain (f);
+	
 	PolynomialMatrixThreePrimesFFTMulDomain<ModField> fftdomain (f);       
 	fftdomain.midproduct_fft(lpts, *(c_i[l]), a_i, b_i, smallLeft);
-				
+	
 	FFT_PROFILE_GET(2,tMul);
-      }
+      }      
+      DEL_MEM(8*(n_ta+n_tb)*num_primes);
       delete[] t_a_mod;
       delete[] t_b_mod;
+#else
+      // loop for memory saving
+      size_t CRT_NBPRIME=CRT_SIZE;
+      ADD_MEM(8*(n_ta+n_tb)*CRT_NBPRIME);
+      double* t_a_mod= new double[n_ta*CRT_NBPRIME];
+      double* t_b_mod= new double[n_tb*CRT_NBPRIME];
+            
+      for(size_t loop=0;loop<num_primes;loop+=CRT_NBPRIME){	
+	// create chunk of RNS
+	size_t rns_chunk=std::min(CRT_NBPRIME,num_primes-loop); // nbr of primes in the current smallRNS basis
+	std::vector<double> smallBasis(rns_chunk);
+	std::copy(basis.begin()+loop,basis.begin()+loop+rns_chunk,smallBasis.begin());
+	FFPACK::rns_double smallRNS(smallBasis);
+	smallRNS.precompute_cst(RNS._ldm);
+	smallRNS.init(1, n_ta, t_a_mod, n_ta, a.getPointer(), n_ta, maxA);
+	smallRNS.init(1, n_tb, t_b_mod, n_tb, b.getPointer(), n_tb, maxB);
+	FFT_PROFILING(2,"reduction mod pi of input matrices");
+
+	for (size_t l=0;l<rns_chunk;l++)
+	  {	    
+	    //FFT_PROFILE_START;
+	    //std::cout<<"prime: "<<(long)smallRNS._basis[l]<<std::endl;
+	    ModField f(smallRNS._basis[l]);
+	    MatrixP_F a_i (f, m, k, pts);
+	    MatrixP_F b_i (f, k, n, pts);	
+	    c_i[loop+l] = new MatrixP_F(f, m, n, pts);
+
+	    // copy reduced data
+	    for (size_t i=0;i<m*k;i++)
+	      for (size_t j=0;j<a.size();j++)
+		if (smallLeft)
+		  a_i.ref(i,hdeg-1-j)=t_a_mod[l*n_ta+j+i*a.size()];
+		else
+		  a_i.ref(i,j)=t_a_mod[l*n_ta+j+i*a.size()];
+	    for (size_t i=0;i<k*n;i++)
+	      for (size_t j=0;j<b.size();j++)
+		if (smallLeft)
+		  b_i.ref(i,j)=t_b_mod[l*n_tb+j+i*b.size()];
+		else
+		  b_i.ref(i,hdeg-1-j)=t_b_mod[l*n_tb+j+i*b.size()];
+	    FFT_PROFILE_GET(2,tCopy);
+
+	    PolynomialMatrixThreePrimesFFTMulDomain<ModField> fftdomain (f);       
+	    fftdomain.midproduct_fft(lpts, *(c_i[loop+l]), a_i, b_i, smallLeft);	    
+	    FFT_PROFILE_GET(2,tMul);
+
+	  }      
+	FFT_PROFILING(2,"FFTprime mult+copying");
+	//FFT_PROFILE(2,"copying linear reduced matrix",tCopy);
+	//FFT_PROFILE(2,"FFTprime multiplication",tMul);
+
+      } // end of loop for memory saving
+      DEL_MEM(8*(n_ta+n_tb)*CRT_NBPRIME);
+      delete[] t_a_mod;
+      delete[] t_b_mod;
+
+#endif
+
+
+
+
       FFT_PROFILE(2,"copying linear reduced matrix",tCopy);
       FFT_PROFILE(2,"FFTprime multiplication",tMul);
 
@@ -542,41 +478,78 @@ namespace LinBox{
 	c.copy(*(c_i[0]),0,c.size()-1);
       } else {
 	FFT_PROFILE_START(2);
+
+	size_t s=c.size();
+#ifndef LOW_MEMORY_PMBASIS
 	// construct contiguous storage for c_i
-	double *t_c_mod;
-	size_t n_tc=m*n*c.size();
-	t_c_mod = new double[n_tc*num_primes];
+	size_t n_tc=m*n*s;
+	ADD_MEM(8*n_tc*num_primes);
+	double *t_c_mod = new double[n_tc*num_primes];
 	for (size_t l=0;l<num_primes;l++){
 	  for (size_t i=0;i<m*n;i++)
-	    for (size_t j=0;j<c.size();j++)
-	      t_c_mod[l*n_tc + (j+i*c.size())]= c_i[l]->get(i,j);
+	    for (size_t j=0;j<s;j++)
+	      t_c_mod[l*n_tc + (j+i*s)]= c_i[l]->get(i,j);
 	  delete c_i[l];
 	}
 	FFT_PROFILING(2,"linearization of results mod pi");
 
 	// reconstruct the result in C
-	RNS.convert(1,n_tc,0,c.getWritePointer(),n_tc, t_c_mod, n_tc);
-	//std::cout<<"MIDP FFT RNS: "<<MEMINFO<<std::endl;
+	RNS.convert(1,n_tc,0,c.getWritePointer(),n_tc, t_c_mod, n_tc, _maxnorm);
+	DEL_MEM(8*n_tc*num_primes);
 	delete[] t_c_mod;
-
-	//std::cout<<"MUL FFT RNS: "<<MEMINFO<<std::endl;
-	//std::cout<<"----------------------------------------------"<<std::endl;
-
-	FFT_PROFILING(2,"k prime reconstruction");
+#else
+	size_t s_small= s/MEMFACTOR + 1;
+	size_t s_last = s- s_small*(MEMFACTOR-1);
+	size_t n_tc_small= m*n*s_small;
+	size_t n_tc_last = m*n*s_last;
+	{
+	  ADD_MEM(8*n_tc_small*num_primes);
+	  double *t_c_mod = new double[n_tc_small*num_primes];
+	  for (size_t memiter=0;memiter<MEMFACTOR-1;memiter++){	 
+	    for (size_t l=0;l<num_primes;l++){
+	      for (size_t i=0;i<m*n;i++)
+		for (size_t j=0;j<s_small;j++)
+		  t_c_mod[l*n_tc_small + (j+i*s_small)]= c_i[l]->get(i,memiter*s_small+j);
+	    }	
+	    // reconstruct the result in C
+	    RNS.convert(m*n,s_small,0,c.getWritePointer()+memiter*s_small,s, t_c_mod, n_tc_small, _maxnorm);
+	  }
+	  DEL_MEM(8*n_tc_small*num_primes);
+	  delete[] t_c_mod;
+	}	
+	{
+	  ADD_MEM(8*n_tc_last*num_primes);
+	  double *t_c_mod = new double[n_tc_last*num_primes];
+	  // perform the last step
+	  for (size_t l=0;l<num_primes;l++){
+	    for (size_t i=0;i<m*n;i++)
+	      for (size_t j=0;j<s_last;j++)
+		t_c_mod[l*n_tc_last + (j+i*s_last)]= c_i[l]->get(i,(MEMFACTOR-1)*s_small+j);
+	    delete c_i[l];
+	  }	  
+	  // reconstruct the result in C
+	  RNS.convert(m*n,s_last,0,c.getWritePointer()+(MEMFACTOR-1)*s_small,s, t_c_mod, n_tc_last, _maxnorm);
+	  DEL_MEM(8*n_tc_last*num_primes);
+	  delete[] t_c_mod;
+	}
+	
+#endif
       }
     }
+      
   };
 
 
   /***************************************************************************
    **** Polynomial Matrix Multiplication over Fp[x], with p multiprecision ***
    ***************************************************************************/
-  template <>
-  class PolynomialMatrixFFTMulDomain<Givaro::Modular<integer> > {
+  template <size_t K, size_t L>
+  class PolynomialMatrixFFTMulDomain<Givaro::Modular<RecInt::ruint<K>,RecInt::ruint<L> > > {
   public:
-    typedef Givaro::Modular<integer>              Field;
+    typedef Givaro::Modular<RecInt::ruint<K>,RecInt::ruint<L> >       Field;
     typedef typename Field::Element     Element;
-    typedef Givaro::ZRing<integer>  IntField;
+    typedef Givaro::ZRing<RecInt::ruint<L>>  IntField;
+
     // Polynomial matrix stored as a polynomial of matrix
     typedef PolynomialMatrix<PMType::polfirst,PMStorage::plain,Field> MatrixP_F;
     // Polynomial matrix stored as a polynomial of matrix
@@ -584,17 +557,17 @@ namespace LinBox{
 
   private:
     const Field            *_field;  // Read only
-    integer                     _p;
-
+    RecInt::ruint<K>         _p;
+    
   public:
     inline const Field & field() const { return *_field; }
-
+    
     PolynomialMatrixFFTMulDomain(const Field &F) : _field(&F) {
-      field().cardinality(_p);
+      _p=field().cardinality();
     }
 
     template<typename Matrix1, typename Matrix2, typename Matrix3>
-    void mul (Matrix1 &c, const Matrix2 &a, const Matrix3 &b) {
+    void mul (Matrix1 &c, const Matrix2 &a, const Matrix3 &b, size_t max_rowdeg=0) {
       FFT_PROFILE_START(2);
       MatrixP_F a2(field(),a.rowdim(),a.coldim(),a.size());
       MatrixP_F b2(field(),b.rowdim(),b.coldim(),b.size());
@@ -602,7 +575,7 @@ namespace LinBox{
       a2.copy(a,0,a.size()-1);
       b2.copy(b,0,b.size()-1);
       FFT_PROFILING(2,"converting rep of input");
-      mul(c2,a2,b2);
+      mul(c2,a2,b2, max_rowdeg);
       FFT_PROFILE_START(2);
       c.copy(c2,0,c.size()-1);
       FFT_PROFILING(2,"converting rep of output");
@@ -610,23 +583,21 @@ namespace LinBox{
     }
     
     // Matrix with polynomials  
-    void mul (MatrixP_F &c, const MatrixP_F &a, const MatrixP_F &b) {
-      
+    void mul (MatrixP_F &c, const MatrixP_F &a, const MatrixP_F &b, size_t max_rowdeg=0) {
       FFT_PROFILE_START(2);
-      IntField Z;      
-      PolynomialMatrixFFTMulDomain<IntField> Zmul(Z,_p);
-      integer bound=2*_p*_p*integer((uint64_t)a.coldim())*integer((uint64_t)std::min(a.size(),b.size()));
-      //Zmul.mul_crtla(c,a,b,_p,_p,bound);
-      Zmul.mul_crtla2(c,a,b,_p,_p,bound); 
+      IntField Z;
+      Givaro::Integer pp(_p);
+      //std::cerr<<"FFT RECINT MUL 1: "<<c.size()<<" -> "<<a.size()<<"x"<<b.size()<<"  "<<STR_MEMINFO<<MEMINFO<<std::endl;
+      PolynomialMatrixFFTMulDomain<IntField> Zmul(Z,pp);
+      integer bound=pp*pp*integer((uint64_t)a.coldim())*integer((uint64_t)std::min(a.size(),b.size()));
+      Zmul.mul_crtla(c,a,b,_p,_p,bound, max_rowdeg);
+      //std::cerr<<"FFT RECINT MUL 2: "<<c.size()<<" -- "<<STR_MEMINFO<<MEMINFO<<std::endl;
       
-      // reduce the result mod p
-      FFT_PROFILE_START(2);
-      for (size_t i=0;i<c.rowdim()*c.coldim();i++)
-	for (size_t j=0;j<c.size();j++)
-	  c.ref(i,j)%=_p;
       FFT_PROFILING(2,"reduction mod p of output");
     }
 
+
+
     template<typename Matrix1, typename Matrix2, typename Matrix3>
     void midproduct (Matrix1 &c, const Matrix2 &a, const Matrix3 &b,
 		     bool smallLeft=true, size_t n0=0, size_t n1=0) {
@@ -638,26 +609,30 @@ namespace LinBox{
       MatrixP_F c2(field(),c.rowdim(),c.coldim(),c.size());
       midproduct(c2,a2,b2,smallLeft,n0,n1);
       c.copy(c2,0,c.size()-1);
-    }
+    } 
 
     void midproduct (MatrixP_F &c, const MatrixP_F &a, const MatrixP_F &b,
 		     bool smallLeft=true, size_t n0=0, size_t n1=0) {
+      FFT_PROFILE_START(2);
       IntField Z;
-      PolynomialMatrixFFTMulDomain<IntField> Zmul(Z,_p);
-      //const MatrixP_I* a2 = reinterpret_cast<const MatrixP_I*>(&a);
-      //const MatrixP_I* b2 = reinterpret_cast<const MatrixP_I*>(&b);
-      //MatrixP_I* c2       = reinterpret_cast<MatrixP_I*>(&c);
-      //Zmul.midproduct(*c2,*a2,*b2,smallLeft,n0,n1);
+      Givaro::Integer pp(_p);
+      PolynomialMatrixFFTMulDomain<IntField> Zmul(Z,pp);
+      //MatrixP_I c2(Zmul,c.rowdim(),c.coldim(),c.size());
+      //Zmul.midproduct(c2,a,b,smallLeft,n0,n1);
       Zmul.midproduct(c,a,b,smallLeft,n0,n1);
+      
       // reduce the result mod p
-      FFT_PROFILE_START(2);
-      for (size_t i=0;i<c.rowdim()*c.coldim();i++)
-	for (size_t j=0;j<c.size();j++)
-	  c.ref(i,j)%=_p;
+      // FFT_PROFILE_START(2);
+      // for (size_t i=0;i<c.rowdim()*c.coldim();i++)
+      // 	for (size_t j=0;j<c.size();j++)
+      // 	  c.ref(i,j)=integer(c2.ref(i,j))%pp;
       FFT_PROFILING(2,"reduction mod p of output");
     }
   };
 
+  
+
+
 
 
 
diff --git a/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize-fast.inl b/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize-fast.inl
index 50e260d..0136bf4 100644
--- a/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize-fast.inl
+++ b/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize-fast.inl
@@ -60,11 +60,11 @@ namespace LinBox {
 			: _field(&F), _p(field().cardinality()),  _BMD(F){}
 
 		template<typename Matrix1, typename Matrix2, typename Matrix3>
-		void mul (Matrix1 &c, const Matrix2 &a, const Matrix3 &b) {
+		void mul (Matrix1 &c, const Matrix2 &a, const Matrix3 &b, size_t max_rowdeg=0) {
 			linbox_check(a.coldim()==b.rowdim());
-			size_t deg  = a.size()+b.size()-1;
+			size_t deg  = (max_rowdeg?max_rowdeg:a.size()+b.size()-2); //size_t deg  = a.size()+b.size()-1;
 			size_t lpts = 0;
-			size_t pts  = 1; while (pts < deg) { pts= pts<<1; ++lpts; }
+			size_t pts  = 1; while (pts <= deg) { pts= pts<<1; ++lpts; }
 			// padd the input a and b to 2^lpts (convert to MatrixP representation)
 			MatrixP a2(field(),a.rowdim(),a.coldim(),pts);
 			MatrixP b2(field(),b.rowdim(),b.coldim(),pts);
@@ -72,14 +72,14 @@ namespace LinBox {
 			b2.copy(b,0,b.size()-1);
 			MatrixP c2(field(),c.rowdim(),c.coldim(),pts);
 			mul_fft (lpts,c2, a2, b2);
-			c.copy(c2,0,deg-1);
+			c.copy(c2,0,deg);
 		}
 
-		void mul (MatrixP &c, const MatrixP &a, const MatrixP &b) {
+		void mul (MatrixP &c, const MatrixP &a, const MatrixP &b, size_t max_rowdeg=0) {
 			linbox_check(a.coldim()==b.rowdim());
-			size_t deg  = a.size()+b.size()-1;
+			size_t deg  = (max_rowdeg?max_rowdeg:a.size()+b.size()-2); //size_t deg  = a.size()+b.size()-1;
 			size_t lpts = 0;
-			size_t pts  = 1; while (pts < deg) { pts= pts<<1; ++lpts; }
+			size_t pts  = 1; while (pts <= deg) { pts= pts<<1; ++lpts; }
 			
 			// padd the input a and b to 2^lpts
 			MatrixP a2(field(),a.rowdim(),a.coldim(),pts);
@@ -89,7 +89,7 @@ namespace LinBox {
 			// resize c to 2^lpts
 			c.resize(pts);
 			mul_fft (lpts,c, a2, b2);
-			c.resize(deg);
+			c.resize(deg+1);
 		}
 
 		// a,b and c must have size: 2^lpts
diff --git a/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize-three-primes.inl b/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize-three-primes.inl
index e68d0dc..daed2de 100644
--- a/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize-three-primes.inl
+++ b/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize-three-primes.inl
@@ -38,7 +38,7 @@ namespace LinBox {
 
 	/***********************************************************************************
 	 **** Polynomial Matrix Multiplication over Zp[x] with p (FFLAS prime) ***
-	 ***********************************************************************************/
+	 ************************************************************************************/
 	template<class Field>
 	class PolynomialMatrixThreePrimesFFTMulDomain {
 	public:
@@ -65,39 +65,48 @@ namespace LinBox {
 		}
 
 		template<typename Matrix1, typename Matrix2, typename Matrix3>
-		void mul (Matrix1 &c, const Matrix2 &a, const Matrix3 &b) {
+		void mul (Matrix1 &c, const Matrix2 &a, const Matrix3 &b, size_t max_rowdeg=0) {
 			linbox_check(a.coldim()==b.rowdim());
-			size_t deg  = a.size()+b.size()-1;
+			// deg is the max rowdegree of the product
+			size_t deg  = (max_rowdeg?max_rowdeg:a.size()+b.size()-2); //size_t deg  = a.size()+b.size()-1;
+			c.resize(deg+1);
 			size_t lpts = 0;
-			size_t pts  = 1; while (pts < deg) { pts= pts<<1; ++lpts; }
+			size_t pts  = 1; while (pts <= deg) { pts= pts<<1; ++lpts; }
 			// padd the input a and b to 2^lpts (convert to MatrixP representation)
 			MatrixP a2(field(),a.rowdim(),a.coldim(),pts);
 			MatrixP b2(field(),b.rowdim(),b.coldim(),pts);
-			a2.copy(a,0,a.size()-1);
-			b2.copy(b,0,b.size()-1);
+			a2.copy(a,0,a.degree());
+			b2.copy(b,0,b.degree());
 			MatrixP c2(field(),c.rowdim(),c.coldim(),pts);
-			mul_fft (lpts,c2, a2, b2);
-			c.copy(c2,0,deg-1);
+			integer bound=integer(_p-1)*integer(_p-1)
+				*integer((uint64_t)a.coldim())*integer((uint64_t)std::min(a.size(),b.size()));
+			mul_fft (lpts,c2, a2, b2, bound);
+			c.copy(c2,0,deg);
 		}
 
-		void mul (MatrixP &c, const MatrixP &a, const MatrixP &b) {
+		void mul (MatrixP &c, const MatrixP &a, const MatrixP &b, size_t max_rowdeg=0) {
 			linbox_check(a.coldim()==b.rowdim());
-			size_t deg  = a.size()+b.size()-1;
+			// deg is the max rowdegree of the product
+			size_t deg  = (max_rowdeg?max_rowdeg:a.size()+b.size()-2); //size_t deg  = a.size()+b.size()-1;
 			size_t lpts = 0;
-			size_t pts  = 1; while (pts < deg) { pts= pts<<1; ++lpts; }
+			size_t pts  = 1; while (pts <= deg) { pts= pts<<1; ++lpts; }
 			// padd the input a and b to 2^lpts
 			MatrixP a2(field(),a.rowdim(),a.coldim(),pts);
 			MatrixP b2(field(),b.rowdim(),b.coldim(),pts);
-			a2.copy(a,0,a.size()-1);
-			b2.copy(b,0,b.size()-1);
+			a2.copy(a,0,a.degree());
+			b2.copy(b,0,b.degree());
 			// resize c to 2^lpts
 			c.resize(pts);
-			mul_fft (lpts,c, a2, b2);
-			c.resize(deg);
-		}
+			integer bound=integer(_p-1)*integer(_p-1)
+				*integer((uint64_t)a.coldim())*integer((uint64_t)std::min(a.size(),b.size()));
 
-		void mul_fft (size_t lpts, MatrixP &c, MatrixP &a, MatrixP &b) {
-			size_t pts=c.size();
+			mul_fft (lpts,c, a2, b2, bound);
+			c.resize(deg+1);
+		}
+		
+		// a,b and c must have size: 2^lpts
+		void mul_fft (size_t lpts, MatrixP &c, MatrixP &a, MatrixP &b, const integer& bound) {
+			size_t pts=c.size();			
 			if ((_p-1) % pts == 0){
 				PolynomialMatrixFFTPrimeMulDomain<ModField> fftprime_domain (field());
 				fftprime_domain.mul_fft(lpts,c,a,b);
@@ -110,15 +119,7 @@ namespace LinBox {
 			size_t k = a.coldim();
 			size_t n = b.coldim();
 			
-
-			integer bound=integer((uint64_t)_p)*integer((uint64_t)_p)*integer((uint64_t)k)*integer((uint64_t)pts);
-			// compute bit size of feasible prime for FFLAS
-			// size_t _k=k,lk=0;
-			// while ( _k ) {_k>>=1; ++lk;}
-			// size_t prime_bitsize= (53-lk)>>1;
-
-			// compute max prime value for FFLAS
-			uint64_t prime_max= std::sqrt( (1ULL<<53) / k)+1;
+			uint64_t prime_max=maxFFTPrimeValue(k,pts); // CAREFUL: only for Modular<double>;
 			RandomFFTPrime RdFFT(prime_max);
 			std::vector<integer> bas;
 			if (!RdFFT.generatePrimes(lpts,bound,bas)){
@@ -202,8 +203,10 @@ namespace LinBox {
 				for (size_t j=0;j<b2.rowdim()*b2.coldim();j++)
 					for (size_t i=0;i<hdeg/2;i++)
 						std::swap(b2.ref(j,i),b2.ref(j,hdeg-1-i));
-
-			midproduct_fft (lpts,c2, a2, b2, smallLeft);
+			integer bound=integer(_p-1)*integer(_p-1)
+				*integer((uint64_t)a.coldim())*integer((uint64_t)std::min(a.size(),b.size()));
+			
+			midproduct_fft (lpts,c2, a2, b2, bound, smallLeft);
 			c.copy(c2,0,c.size()-1);
 		}
 
@@ -211,7 +214,7 @@ namespace LinBox {
 		// a,b and c must have size: 2^lpts
 		// -> a must have been already reversed according to the midproduct algorithm
 		void midproduct_fft (size_t lpts, MatrixP &c, MatrixP &a, MatrixP &b,
-				     bool smallLeft=true) {
+				     const integer& bound, bool smallLeft=true) {
 			size_t pts=c.size();			
 			if ((_p-1) % pts == 0){
 				PolynomialMatrixFFTPrimeMulDomain<ModField> fftprime_domain (field());
@@ -222,15 +225,15 @@ namespace LinBox {
 			size_t k = a.coldim();
 			size_t n = b.coldim();
 
-			integer bound=integer(_p)*integer(_p)*integer((uint64_t)k)*integer((uint64_t)pts);
-
 			// compute bit size of feasible prime for FFLAS
 			// size_t _k=k,lk=0;
 			// while ( _k ) {_k>>=1; ++lk;}
 			// size_t prime_bitsize= (53-lk)>>1;
 
 			// compute max prime value for FFLAS
-			uint64_t prime_max= std::sqrt( (1ULL<<53) / k)+1;
+			//uint64_t prime_max= std::min(uint64_t(std::sqrt( (1ULL<<53) / k)+1), uint64_t(Givaro::Modular<double>::maxCardinality())) 
+			uint64_t prime_max=maxFFTPrimeValue(k,pts); // CAREFUL: only for Modular<double>;
+			
 			RandomFFTPrime RdFFT(prime_max);
 
 			std::vector<integer> bas;
diff --git a/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize.inl b/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize.inl
index c09f31e..f0f4338 100644
--- a/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize.inl
+++ b/linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize.inl
@@ -55,16 +55,19 @@ namespace LinBox {
                 PolynomialMatrixFFTMulDomain (const Field& F) : _field(&F), _p(F.cardinality()) {}
 
                 template<typename Matrix1, typename Matrix2, typename Matrix3>
-                void mul (Matrix1 &c, const Matrix2 &a, const Matrix3 &b) {
-                        uint64_t pts= 1<<(integer((uint64_t)a.size()+b.size()-1).bitsize());
+                void mul (Matrix1 &c, const Matrix2 &a, const Matrix3 &b, size_t max_rowdeg=0) {
+			size_t deg  = (max_rowdeg?max_rowdeg:a.size()+b.size()-2); //size_t deg  = a.size()+b.size()-1;
+			c.resize(deg+1);
+			size_t lpts = 0;
+			size_t pts  = 1; while (pts <= deg) { pts= pts<<1; ++lpts; }
                         if ( _p< 536870912ULL  &&  ((_p-1) % pts)==0){
 				PolynomialMatrixFFTPrimeMulDomain<Field> MulDom(field());
-                                MulDom.mul(c,a,b);
+                                MulDom.mul(c,a,b, max_rowdeg);
                         }
                         else {
 				if (_p< 536870912ULL){
 					PolynomialMatrixThreePrimesFFTMulDomain<Field> MulDom(field());
-					MulDom.mul(c,a,b);
+					MulDom.mul(c,a,b, max_rowdeg);
 				}
 				else {
 					// use computation with Givaro::Modular<integer>
@@ -75,11 +78,11 @@ namespace LinBox {
 					MatrixP_L a2(Fp,a.rowdim(),a.coldim(),a.size());
 					MatrixP_L b2(Fp,b.rowdim(),b.coldim(),b.size());
 					MatrixP_L c2(Fp,c.rowdim(),c.coldim(),c.size());
-					a2.copy(a,0,a.size()-1);
-					b2.copy(b,0,b.size()-1);
+					a2.copy(a,0,a.degree());
+					b2.copy(b,0,b.degree());
 					FFT_PROFILING(2,"converting rep of polynomial matrix input");
-					MulDom.mul(c2,a2,b2);
-					c.copy(c2,0,c.size()-1);
+					MulDom.mul(c2,a2,b2, max_rowdeg);
+					c.copy(c2,0,c.degree());
 					FFT_PROFILING(2,"converting rep of polynomial matrix output");
 				}
                         }
diff --git a/linbox/algorithms/polynomial-matrix/matpoly-mult-fft.h b/linbox/algorithms/polynomial-matrix/matpoly-mult-fft.h
index 8e7a06d..772ff88 100755
--- a/linbox/algorithms/polynomial-matrix/matpoly-mult-fft.h
+++ b/linbox/algorithms/polynomial-matrix/matpoly-mult-fft.h
@@ -52,8 +52,8 @@ Givaro::Timer mychrono[3];
     mychrono[lvl].stop();std::cout<<"FFT("<<lvl<<"):";			\
     std::cout.width(FFT_PROF_MSG_SIZE);std::cout<<std::left<<msg<<" : "; \
     std::cout.precision(6);std::cout<<mychrono[lvl]<<std::endl;		\
-    mychrono[lvl].clear();mychrono[lvl].start();					\
-}
+    mychrono[lvl].clear();mychrono[lvl].start();			\
+  }
   
 #ifdef HAVE_OPENMP								
 #define FFT_PROFILE_GET(lvl,x)						\
@@ -63,11 +63,11 @@ Givaro::Timer mychrono[3];
   mychrono[lvl].stop();(x)+=mychrono[lvl].usertime();mychrono[lvl].clear();mychrono[lvl].start();
 #endif
 #define FFT_PROFILE(lvl,msg,x)						\
-if ((lvl)>=FFT_PROF_LEVEL) {					\
-  std::cout<<"FFT: ";						   \
-  std::cout.width(FFT_PROF_MSG_SIZE);std::cout<<std::left<<msg<<" : ";	\
-  std::cout.precision(6);std::cout<<x<<" s"<<std::endl;			\
-}
+  if ((lvl)>=FFT_PROF_LEVEL) {						\
+			      std::cout<<"FFT: ";			\
+			      std::cout.width(FFT_PROF_MSG_SIZE);std::cout<<std::left<<msg<<" : "; \
+			      std::cout.precision(6);std::cout<<x<<" s"<<std::endl; \
+  }
 #else
 #define FFT_PROFILE_START(lvl)
 #define FFT_PROFILING(lvl,msg)
@@ -82,33 +82,94 @@ if ((lvl)>=FFT_PROF_LEVEL) {					\
 
 namespace LinBox
 {
-	// generic handler for multiplication using FFT
-	template <class Field>
-	class PolynomialMatrixFFTMulDomain {
-	public:
-		inline const Field & field() const;
+// generic handler for multiplication using FFT
+  template <class Field>
+    class PolynomialMatrixFFTMulDomain {
+  public:
+    inline const Field & field() const;
 
-		PolynomialMatrixFFTMulDomain (const Field& F);
+    PolynomialMatrixFFTMulDomain (const Field& F);
 
-		template<typename Matrix1, typename Matrix2, typename Matrix3>
-		void mul (Matrix1 &c, const Matrix2 &a, const Matrix3 &b);
+    template<typename Matrix1, typename Matrix2, typename Matrix3>
+      void mul (Matrix1 &c, const Matrix2 &a, const Matrix3 &b);
 
-		template<typename Matrix1, typename Matrix2, typename Matrix3>
-		void midproduct (Matrix1 &c, const Matrix2 &a, const Matrix3 &b, bool smallLeft=true, size_t n0=0,size_t n1=0);
-	};
+    template<typename Matrix1, typename Matrix2, typename Matrix3>
+      void midproduct (Matrix1 &c, const Matrix2 &a, const Matrix3 &b, bool smallLeft=true, size_t n0=0,size_t n1=0);
+  };
 		
 	
-	//class PolynomialMatrixFFTPrimeMulDomain ;                         // Mul in Zp[x] with p <2^32, (fflas, fourier)
+  //class PolynomialMatrixFFTPrimeMulDomain ;                         // Mul in Zp[x] with p <2^32, (fflas, fourier)
 		
-	// template <class T>
-	// class PolynomialMatrixFFTMulDomain<Givaro::Modular<T> > ;        // Mul in Zp[x] with p^2 storable in type T
-
-	// template<>
-	// class PolynomialMatrixFFTMulDomain<Givaro::ZRing<integer> >;  // Mul in Z[x]
-
-	// template <>
-	// class PolynomialMatrixFFTMulDomain<Givaro::Modular<integer> > ;           // Mul in Zp[x] with p multiprecision
+  // template <class T>
+  // class PolynomialMatrixFFTMulDomain<Givaro::Modular<T> > ;        // Mul in Zp[x] with p^2 storable in type T
+
+  // template<>
+  // class PolynomialMatrixFFTMulDomain<Givaro::ZRing<integer> >;  // Mul in Z[x]
+
+  // template <>
+  // class PolynomialMatrixFFTMulDomain<Givaro::Modular<integer> > ;           // Mul in Zp[x] with p multiprecision
+
+  // get the maximum prime for FFT with Modular<double> (matrix dimension = k, number of points = pts)
+  uint64_t maxFFTPrimeValue(uint64_t k, uint64_t pts) {
+    uint64_t prime_max=std::sqrt( (1ULL<<53) /k)+1;
+    size_t c=1;
+    const int fct=24;
+    while (c<k && prime_max < (1UL<<26) && prime_max< pts*fct){
+      prime_max=std::sqrt( (1ULL<<53) /(k/c))+1;
+      c<<=1;
+    }
+
+    //std::cout<<"maxFFTPrime: pts -> "<<pts<<std::endl;
+    //std::cout<<"maxFFTPrime: replacing "<<k<<" -> "<<k/c<<std::endl;
+	  
+    if (c>=k){
+      std::cout<<"MatPoly FFT (maxPrimeValue): impossible to find enough FFT Prime\n";
+      std::terminate();
+    }
+	  
+    return std::min(prime_max, uint64_t(Givaro::Modular<double>::maxCardinality()));
+  }
+
+  void getFFTPrime(uint64_t prime_max, size_t lpts, integer bound, std::vector<integer> &bas, size_t k, size_t d){
+	  
+    RandomFFTPrime RdFFT(prime_max);
+    size_t nbp=0;
+	  
+    if (!RdFFT.generatePrimes(lpts,bound,bas)){ // not enough FFT prime found 
+      integer MM=1;
+      for(std::vector<integer>::size_type i=0;i<bas.size();i++){
+	MM*=bas[i];
+	//std::cout<<bas[i]<<std::endl;
+      }
+	    
+      // compute max bitsize for prime allowing three prime fft
+      integer prime_max_tp=MM/uint64_t(d*k);
+      while (k>1 && prime_max_tp<100) {k/=2;prime_max_tp*=2;}
+      if (k<=1) {std::cout<<"getFFTPrime error: impossible to have enough primes satisfying constraints: FFLAS prime (<2^26) and FFT (2^"<<lpts<<")\n";}
+	
+      RandomPrimeIter Rd(std::min(prime_max_tp.bitsize()/2,integer(prime_max).bitsize())-1);
+#ifdef VERBOSE_FFT
+      std::cout<<"MM="<<MM<<std::endl;
+      std::cout<<"normal primemax: "<<prime_max_tp<<" "<<prime_max<<std::endl;
+      std::cout<<"normal prime bitmax: "<<std::min(prime_max_tp.bitsize()/2,integer(prime_max).bitsize()-1)<<std::endl;
+#endif
+      integer tmp;
+      do {
+	do {Rd.random(tmp);}
+	while (MM%tmp==0 || tmp>prime_max);
+	bas.push_back(tmp);
+	nbp++;
+	MM*=tmp;
+      } while (MM<bound);	
+    }
+#ifdef VERBOSE_FFT      
+    std::cout<<"MatPoly Multiprecision FFT : using "<<bas.size()-nbp<<" FFT primes and "<<nbp<<" normal primes "<<std::endl;
+#endif
+    for(auto i: bas)
+      if (i>prime_max) std::cout<<"ERROR\n";
+  }
 
+	
 } // end of namespace LinBox
 
 #include "linbox/algorithms/polynomial-matrix/matpoly-mult-fft-wordsize-fast.inl"
diff --git a/linbox/algorithms/polynomial-matrix/order-basis.h b/linbox/algorithms/polynomial-matrix/order-basis.h
index 4affeb2..55f2744 100755
--- a/linbox/algorithms/polynomial-matrix/order-basis.h
+++ b/linbox/algorithms/polynomial-matrix/order-basis.h
@@ -22,8 +22,17 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  * ========LICENCE========
  */
+
+
 #include "linbox/matrix/dense-matrix.h"
 #include "linbox/matrix/polynomial-matrix.h"
+
+
+#ifdef TRACK_MEMORY_MATPOL
+#define MEMINFO2 STR_MEMINFO<<MEMINFO
+#else
+#define MEMINFO2 ""
+#endif
 #include "linbox/algorithms/polynomial-matrix/polynomial-matrix-domain.h"
 #include <vector>
 #include <algorithm>
@@ -32,14 +41,16 @@
 #define MBASIS_THRESHOLD_LOG 5
 #define MBASIS_THRESHOLD (1<<MBASIS_THRESHOLD_LOG)
 
+
 namespace LinBox {
 
 #ifdef __CHECK_ORDERBASIS
 #define __CHECK_MBASIS
 #define __CHECK_PMBASIS
 #endif
-        
-#if (__CHECK_MBASIS) or (__CHECK_PMBASIS)
+
+
+#if defined (__CHECK_MBASIS) or defined (__CHECK_PMBASIS)
 #include <string>
         template<typename Field, typename Mat>
         std::string check_orderbasis(const Field& F, const Mat& sigma,  const Mat& serie, size_t ord){
@@ -51,15 +62,15 @@ namespace LinBox {
                 std::string msg(".....");
                 bool nul_sigma=true;
                 while(i<ord && MD.isZero(T[i])){
-                        if (!MD.isZero(sigma[i])) nul_sigma=false;		
+                        if (i<sigma.size() && !MD.isZero(sigma[i])) nul_sigma=false;		
                         i++;
                 }
                 if (i<ord){
                         std::cout<<"error at degree="<<i<<std::endl;
                         T[i].write(std::cout, Tag::FileFormat::Plain);
                         std::cout<<"***"<<std::endl;
-                        std::cout<<serie<<std::endl;
-                        std::cout<<sigma<<std::endl;
+                        //std::cout<<serie<<std::endl;
+                        //std::cout<<sigma<<std::endl;
                         exit(1);
                 }
 	
@@ -99,7 +110,7 @@ namespace LinBox {
                 void reset() {_count=0;_val=0;}
         };
 
-        template<class Field, class ET=EarlyTerm<-1> >
+		template<class Field, class ET=EarlyTerm<(size_t) -1> >
         class OrderBasis {
         public:
                 typedef PolynomialMatrix<PMType::polfirst,PMStorage::plain,Field> MatrixP;
@@ -110,7 +121,7 @@ namespace LinBox {
                 BlasMatrixDomain<Field>            _BMD;
                 ET                           _EarlyStop;
         public:
-#if 1 or (PROFILE_PMBASIS) or (__CHECK_MBASIS)or (__CHECK_PMBASIS)
+#if  defined(PROFILE_PMBASIS) or defined(__CHECK_MBASIS) or defined(__CHECK_PMBASIS)
                 size_t _idx=0;
                 size_t _target=0;
                 double  _eta=0.;
@@ -138,7 +149,6 @@ namespace LinBox {
                 
                 // serie must have exactly order elements (i.e. its degree = order-1)
                 // sigma can have at most order+1 elements (i.e. its degree = order)
-                // BEWARE: serie can be modified
                 template<typename PMatrix1, typename PMatrix2>
                 size_t PM_Basis(PMatrix1                 &sigma,
                                 const PMatrix2           &serie,
@@ -146,15 +156,15 @@ namespace LinBox {
                                 std::vector<size_t>       &shift)
                 {
 
-#if 1 or (PROFILE_PMBASIS)
-                        //std::cout<<"Start PM-Basis : "<<order<<" ("<<_idx<<"/"<<_target<<")] : "<<std::endl;//MEMINFO<<std::endl;
+#ifdef PROFILE_PMBASIS
+                        //std::cout<<"Start PM-Basis : "<<order<<" ("<<_idx<<"/"<<_target<<")] : "<<std::endl;//MEMINFO2<<std::endl;
                         if (_target==0) _target=order;
                         if (!_started) {_started=true; _start = std::chrono::system_clock::now();}
                         std::chrono::time_point<std::chrono::system_clock> _chrono_start=std::chrono::system_clock::now();
 #endif
                         
                         if (order <= MBASIS_THRESHOLD) {
-#if 1 or (PROFILE_PMBASIS) or (__CHECK_PMBASIS)
+#if defined (PROFILE_PMBASIS) or defined(__CHECK_PMBASIS)
                                 _idx+=order;
 #endif
                                 return M_Basis(sigma, serie, order, shift);                            
@@ -174,52 +184,53 @@ namespace LinBox {
                                 integer p;
 
                                 // first recursive call
-                                PMatrix1 sigma1(field(),m,n,ord1+1);                                
-
+                                PMatrix1 sigma1(field(),m,n,ord1+1);
+                                
 #ifdef MEM_PMBASIS
-                                std::cerr<<"[PM-Basis ("<<order<<") "<<_idx<<"/"<<_target<<"] [Sigma1] -> "<<MB(m*n*(ord1+1)*length(field().characteristic(p)))<<"Mo"<<MEMINFO<<std::endl;
+                                std::cerr<<"[PM-Basis ("<<order<<") "<<_idx<<"/"<<_target<<"] [Sigma1] -> "<<MB(sigma1.realmeminfo())<<"Mo"<<MEMINFO2<<std::endl;
 #endif
-                                //typename PMatrix2::const_view serie1=serie.at(0,ord1-1);
-                                PMatrix2 *serie1=new PMatrix2(field(),n,k,ord1);
+                                PMatrix2 *serie1 = new PMatrix2(field(),n,k,ord1);
 #ifdef MEM_PMBASIS
-                                std::cerr<<"[PM-Basis ("<<order<<") "<<_idx<<"/"<<_target<<"] [Serie1] -> "<<MB(n*k*ord1*length(field().characteristic(p)))<<"Mo"<<MEMINFO<<std::endl;
+                                std::cerr<<"[PM-Basis ("<<order<<") "<<_idx<<"/"<<_target<<"] [Serie1] -> "<<MB(serie1->realmeminfo())<<"Mo"<<MEMINFO2<<std::endl;
 #endif
                                 serie1->copy(serie,0,ord1-1);
                                 d1 = PM_Basis(sigma1, *serie1, ord1, shift);
+                                //DEL_MEM(serie1->realmeminfo())
                                 delete serie1;                                
                                 if (_EarlyStop.terminated()){
                                         sigma=sigma1;
                                         return d1;
                                 }
-                                
+
                                 // compute the serie update
                                 // TODO: for Block Wiedemann, this step can use only the first column of sigma
                                 PMatrix2 *serie2=new PMatrix2(field(),n,k,ord2);//serie2 size=ord1+1 -> midproduct)
+                                //ADD_MEM(serie2->realmeminfo());
 #ifdef MEM_PMBASIS
-                                std::cerr<<"[PM-Basis ("<<order<<") "<<_idx<<"/"<<_target<<"] [Serie2] -> "<<MB(n*k*ord2*length(field().characteristic(p)))<<"Mo"<<MEMINFO<<std::endl;
+                                std::cerr<<"[PM-Basis ("<<order<<") "<<_idx<<"/"<<_target<<"] [Serie2] -> "<<MB(serie2->realmeminfo())<<"Mo"<<MEMINFO2<<std::endl;
 #endif              
                                 _PMD.midproductgen(*serie2, sigma1, serie, true, ord1+1,ord1+ord2);
+                                
 #ifdef PROFILE_PMBASIS
                                 //chrono.stop();
-                                //std::cout<<"      -> serie update "<<sigma1.size()<<"x"<<order<<" --> "<<chrono.usertime()<<std::endl;//MEMINFO<<std::endl;
+                                //std::cout<<"      -> serie update "<<sigma1.size()<<"x"<<order<<" --> "<<chrono.usertime()<<std::endl;//MEMINFO2<<std::endl;
                                 //chrono.clear();chrono.start();
 #endif
                                 // second recursive call
-
+                                
                                 PMatrix1 sigma2(field(),m,n,ord2+1);
 #ifdef MEM_PMBASIS
-                                std::cerr<<"[PM-Basis("<<order<<") "<<_idx<<"/"<<_target<<"] [Sigma2] -> "<<MB(m*n*(ord1+1)*length(field().characteristic(p)))<<"Mo"<<MEMINFO<<std::endl;
+                                std::cerr<<"[PM-Basis("<<order<<") "<<_idx<<"/"<<_target<<"] [Sigma2] -> "<<MB(sigma2.realmeminfo())<<"Mo"<<MEMINFO2<<std::endl;
 #endif
                                 d2 = PM_Basis(sigma2, *serie2, ord2, shift);
                                 delete serie2;                                 
 
                                 // compute the result
                                 _PMD.mul(sigma, sigma2, sigma1);
-                                //sigma.resize(d1+d2+1);
-                                sigma.setsize(d1+d2+1);                               
+                                sigma.resize(d1+d2+1);                                
 #ifdef PROFILE_PMBASIS
                                 //chrono.stop();
-                                //std::cout<<"      -> basis product "<<sigma1.size()<<"x"<<sigma2.size()<<" = "<<d1+d2+1<<" -->"<<chrono.usertime()<<MEMINFO<<std::endl;
+                                //std::cout<<"      -> basis product "<<sigma1.size()<<"x"<<sigma2.size()<<" = "<<d1+d2+1<<" -->"<<chrono.usertime()<<MEMINFO2<<std::endl;
 #endif
 
 #ifdef __CHECK_PMBASIS
@@ -237,7 +248,7 @@ namespace LinBox {
                                 
                                 _eta=(_eta!=0.0?std::min(_eta,tcomp*magicnumber):tcomp*magicnumber);
                                 std::cerr<<"[PM-Basis : "<<order<<" ("<<_idx<<"/"<<_target<<")] : "<<chrono.usertime()
-                                         << " (ETA: "<< telap<<"s / "<<_eta<<"s)"<<MEMINFO<<std::endl;
+                                         << " (ETA: "<< telap<<"s / "<<_eta<<"s)"<<MEMINFO2<<std::endl;
                                 chrono.clear();chrono.start();
 #endif
 
@@ -261,7 +272,9 @@ namespace LinBox {
                         return d;
 
                 }
-
+                
+          
+                
                 // serie must have exactly order elements (i.e. its degree = order-1)
                 template<typename PMatrix1, typename PMatrix2>
                 size_t M_Basis(PMatrix1              &sigma,
@@ -628,8 +641,170 @@ namespace LinBox {
                         //        cout<<"Early termination at order "<<sss<<" ("<<order<<")"<<endl;
                 }
 
+#ifdef LOW_MEMORY_PMBASIS
+                // serie must have exactly order elements (i.e. its degree = order-1)
+                // sigma can have at most order+1 elements (i.e. its degree = order)
+                // !!! sigma is not allocated a priori: it is allocated by this function !!!
+                template<typename PMatrix1, typename PMatrix2>
+                size_t PM_Basis_low(PMatrix1*                &sigma_ptr,
+                                    const PMatrix2           *serie_ptr,
+                                    size_t                    order,
+                                    std::vector<size_t>       &shift)
+                {
+
+#ifdef PROFILE_PMBASIS
+                        //std::cout<<"Start PM-Basis : "<<order<<" ("<<_idx<<"/"<<_target<<")] : "<<std::endl;//MEMINFO2<<std::endl;
+                        if (_target==0) _target=order;
+                        if (!_started) {_started=true; _start = std::chrono::system_clock::now();}
+                        std::chrono::time_point<std::chrono::system_clock> _chrono_start=std::chrono::system_clock::now();
+#endif
+                        
+                        if (order <= MBASIS_THRESHOLD) {
+#if defined (PROFILE_PMBASIS) or defined(__CHECK_PMBASIS)
+                                _idx+=order;
+#endif
+                                sigma_ptr = new PMatrix1(field(),serie_ptr->rowdim(),serie_ptr->rowdim(),order+1);
+                                size_t res= M_Basis(*sigma_ptr, *serie_ptr, order, shift);
+                                delete serie_ptr;
+                                return res;
+                        }
+                        else {
+#ifdef PROFILE_PMBASIS
+                                Timer chrono;
+                                chrono.start();
+#endif
+                                size_t ord1,ord2,d1,d2;
+                                ord1 = order>>1;
+                                ord2 = order-ord1; // ord1+ord2=order
+                                size_t m,n,k;
+                                m=serie_ptr->rowdim();
+                                n=serie_ptr->rowdim();
+                                k=serie_ptr->coldim();
+                                integer p;
+
+                                // first recursive call
+                                PMatrix1 *sigma1_ptr, *sigma2_ptr;
+                                PMatrix2 *serie1_ptr, *serie2_ptr;
+
+                                // Allocate serie1
+                                serie1_ptr= new PMatrix2(field(),n,k,ord1);                                
+#ifdef MEM_PMBASIS
+                                std::cerr<<"[PM-Basis ("<<order<<") "<<_idx<<"/"<<_target<<"] [ALLOC Serie1] -> "<<MB(serie1_ptr->realmeminfo())<<"Mo"<<MEMINFO2<<std::endl;
+#endif
+                                serie1_ptr->copy(*serie_ptr,0,ord1-1);
+                                d1 = PM_Basis_low(sigma1_ptr, serie1_ptr, ord1, shift);
+                                // no longer needed: the recursive call has already deleted serie1_ptr
+                                // delete serie1_ptr; 
+                                
+#ifdef MEM_PMBASIS
+                                std::cerr<<"[PM-Basis ("<<order<<") "<<_idx<<"/"<<_target<<"] [DEL Serie1] -> "<<MEMINFO2<<std::endl;
+#endif
+
+
+                                if (_EarlyStop.terminated()){
+                                        sigma_ptr=sigma1_ptr;
+                                        delete serie_ptr;
+                                        return d1;
+                                }
+
+                                // Allocate serie2
+                                serie2_ptr=new PMatrix2(field(),n,k,ord2);//serie2 size=ord1+1 -> midproduct)
+#ifdef MEM_PMBASIS
+                                std::cerr<<"[PM-Basis ("<<order<<") "<<_idx<<"/"<<_target<<"] [ALLOC Serie2] -> "<<MB(serie2_ptr->realmeminfo())<<"Mo"<<MEMINFO2<<std::endl;
+#endif
+                                
+                                _PMD.midproductgen(*serie2_ptr, *sigma1_ptr, *serie_ptr, true, ord1+1,ord1+ord2);
+#ifndef __CHECK_PMBASIS
+                                delete serie_ptr; // the initial serie is no more needed (except with checking pmbasis)
+#endif         
+                                // second recursive call                                                                
+                                d2 = PM_Basis_low(sigma2_ptr, serie2_ptr, ord2, shift);
+                                // no longer needed: the recursive call has already deleted serie2_ptr
+                                // delete serie2_ptr;
+#ifdef MEM_PMBASIS
+                                std::cerr<<"[PM-Basis ("<<order<<") "<<_idx<<"/"<<_target<<"] [DEL Serie2] -> "<<MEMINFO2<<std::endl;
+#endif                                
+                                // compute the result
+                                sigma_ptr = new PMatrix1(field(),m,n,d1+d2+1);
+                                //sigma_ptr = new PMatrix1(field(),m,n,order+1);                                
+#ifdef MEM_PMBASIS
+                                std::cerr<<"[PM-Basis ("<<order<<") "<<_idx<<"/"<<_target<<"] [ALLOC Sigma] -> "<<MB(sigma_ptr->realmeminfo())<<"Mo"<<MEMINFO2<<std::endl;
+#endif                                
+                                _PMD.mul(*sigma_ptr, *sigma2_ptr, *sigma1_ptr, d1+d2);
+                                //sigma_ptr->resize(d1+d2+1);                                
+                                delete sigma1_ptr;
+                                delete sigma2_ptr;
+#ifdef MEM_PMBASIS
+                                std::cerr<<"[PM-Basis ("<<order<<") "<<_idx<<"/"<<_target<<"] [DEL Sigma 1/2] -> "<<MEMINFO2<<std::endl;
+#endif
+
+                                
+#ifdef PROFILE_PMBASIS
+                                //chrono.stop();
+                                //std::cout<<"      -> basis product "<<sigma1.size()<<"x"<<sigma2.size()<<" = "<<d1+d2+1<<" -->"<<chrono.usertime()<<MEMINFO2<<std::endl;
+#endif
+
+#ifdef __CHECK_PMBASIS
+                                std::cout<<"PMBASIS: order "<<_idx<<check_orderbasis(field(),*sigma_ptr,*serie_ptr,order)<<std::endl;
+                                delete serie_ptr;
+#endif
+#ifdef PROFILE_PMBASIS
+                                chrono.stop();
+                                _end = std::chrono::system_clock::now();                                
+                                std::chrono::duration<double> elapsed_beginning = _end-_start;
+                                std::chrono::duration<double> elapsed_comp      = _end-_chrono_start;
+
+                                double magicnumber=double(_target)/double(order)*log(double(_target)/double(order))/log(2.);
+                                double tcomp = elapsed_comp.count();
+                                double telap = elapsed_beginning.count();
+                                
+                                _eta=(_eta!=0.0?std::min(_eta,tcomp*magicnumber):tcomp*magicnumber);
+                                std::cerr<<"[PM-Basis : "<<order<<" ("<<_idx<<"/"<<_target<<")] : "<<chrono.usertime()
+                                         << " (ETA: "<< telap<<"s / "<<_eta<<"s)"<<MEMINFO2<<std::endl;
+                                chrono.clear();chrono.start();
+#endif
+
+
+                                return d1+d2;
+                        }
+                }
+#endif // LOW_MEMORY_PMBASIS
+
+
         };
 
+        
+        typedef Givaro::Modular<RecInt::ruint128,RecInt::ruint256>   MYRECINT;
+        template<>
+		size_t OrderBasis<MYRECINT,EarlyTerm<(size_t) -1> >::M_Basis(PolynomialMatrix<PMType::polfirst,PMStorage::plain, MYRECINT>            &sigma,
+                                                            const PolynomialMatrix<PMType::polfirst,PMStorage::plain, MYRECINT>      &serie,
+                                                            size_t                 order,
+                                                            std::vector<size_t>   &shift)
+        {
+                Givaro::Integer p; field().cardinality(p);
+                typedef Givaro::Modular<Givaro::Integer> NewField;
+                NewField F(p);
+                OrderBasis<NewField > SB(F);
+                typedef PolynomialMatrix<PMType::matfirst,PMStorage::plain, NewField> NewMatrix;
+                
+                NewMatrix sigma1(F,sigma.rowdim(),sigma.coldim(),order+1);
+                NewMatrix serie1(F,serie.rowdim(),serie.coldim(),order);
+                serie1.copy(serie,0,order-1);
+
+                //std::cout<<"Serie: "<<serie<<std::endl;
+                //std::cout<<"Serie1: "<<serie1<<std::endl;
+
+                size_t d= SB.M_Basis(sigma1,serie1,order,shift);
+                sigma.copy(sigma1,0,d);
+                
+                //std::cout<<"Sigma1: "<<sigma1<<std::endl;
+                //std::cout<<"Sigma: "<<sigma<<std::endl;
+
+
+                return d;
+        }
+        
+        
 } // end of namespace LinBox
 
 // Local Variables:
diff --git a/linbox/algorithms/polynomial-matrix/polynomial-fft-algorithms.h b/linbox/algorithms/polynomial-matrix/polynomial-fft-algorithms.h
new file mode 100644
index 0000000..65ca574
--- /dev/null
+++ b/linbox/algorithms/polynomial-matrix/polynomial-fft-algorithms.h
@@ -0,0 +1,401 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/*
+ * Copyright (C) 2016 Romain Lebreton, Pascal Giorgi
+ *
+ * Written by Pascal Giorgi <pascal.giorgi at lirmm.fr>
+ *            Romain Lebreton <romain.lebreton at lirmm.fr>
+ *
+ * ========LICENCE========
+ * This file is part of the library LinBox.
+ *
+ * LinBox is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ */
+
+
+#ifndef __LINBOX_polynomial_fft_algorithms_H
+#define __LINBOX_polynomial_fft_algorithms_H
+
+#include <iostream>
+#include "linbox/linbox-config.h"
+#include "fflas-ffpack/fflas/fflas_simd.h"
+#include "linbox/algorithms/polynomial-matrix/simd-additional-functions.h"
+#include "linbox/algorithms/polynomial-matrix/polynomial-fft-init.h"
+#include "linbox/algorithms/polynomial-matrix/polynomial-fft-butterflies.h"
+
+namespace LinBox {
+
+	template<typename Field, typename simd = Simd<typename Field::Element>, uint8_t vect_size = simd::vect_size>
+	class FFT_algorithms : public FFT_butterflies<Field, simd, vect_size> {
+	public:
+		using Element = typename Field::Element;
+		FFT_algorithms(const FFT_init<Field>& f_i) : FFT_butterflies<Field, simd, vect_size>(f_i) {}
+		void DIF_mod2p (Element *fft);
+		void DIT_mod4p (Element *fft);
+		void DIF (Element *fft);
+		void DIT (Element *fft);
+	}; // FFT_algorithms
+
+	template<typename Field>
+	class FFT_algorithms<Field, NoSimd<typename Field::Element>, 1> : public FFT_butterflies<Field, NoSimd<typename Field::Element>, 1> {
+	public:
+		using Element = typename Field::Element;
+
+		FFT_algorithms(const FFT_init<Field>& f_i) : FFT_butterflies<Field, NoSimd<typename Field::Element>, 1>(f_i) {}
+
+		void DIF_mod2p (Element *fft) {
+			for (size_t w = this->n >> 1, f = 1; w != 0; f <<= 1, w >>= 1){
+				// w : width of butterflies
+				// f : # families of butterflies
+				for (size_t i = 0; i < f; i++)
+					for (size_t j = 0; j < w; j++)
+						this->Butterfly_DIF_mod2p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], (this->pow_w)[j*f], (this->pow_wp)[j*f]);
+			}
+		}
+
+		void DIT_mod4p (Element *fft) {
+			for (size_t w = 1, f = this->n >> 1; f >= 1; w <<= 1, f >>= 1)
+				for (size_t i = 0; i < f; i++)
+					for (size_t j = 0; j < w; j++)
+						this->Butterfly_DIT_mod4p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], (this->pow_w)[j*f], (this->pow_wp)[j*f]);
+		}
+
+		void DIF (Element *fft) {
+			DIF_mod2p(fft);
+			//DIF_mod2p_iterative(fft);
+			for (uint64_t i = 0; i < this->n; i++) {
+				//				if (fft[i] >= (_pl << 1)) fft[i] -= (_pl << 1);
+				if (fft[i] >= this->_pl) fft[i] -= this->_pl;
+			}
+		}
+
+		void DIT (Element *fft) {
+			DIT_mod4p(fft);
+			//DIF_mod2p_iterative(fft);
+			for (uint64_t i = 0; i < this->n; i++) {
+				if (fft[i] >= (this->_pl << 1)) fft[i] -= (this->_pl << 1);
+				if (fft[i] >= this->_pl) fft[i] -= this->_pl;
+			}
+		}
+
+	}; // FFT_algorithms<Field, NoSimd<typename Field::Element>, 1>
+
+	template<typename Field, typename simd>
+	class FFT_algorithms<Field, simd, 4> : public FFT_butterflies<Field, simd, 4> {
+	public:
+		using Element = typename Field::Element;
+		using Compute_t = typename Field::Compute_t;
+		using Residu_t = typename Field::Residu_t;
+		using vect_t = typename simd::vect_t;
+
+		FFT_algorithms(const FFT_init<Field>& f_i) : FFT_butterflies<Field, simd, 4>(f_i) {
+			linbox_check(simd::vect_size == 4);
+		}
+
+		void DIF_mod2p (Element *fft) {
+			const uint64_t& n = this->n;
+			const Residu_t& _pl = this->_pl;
+			const Residu_t& _dpl = this->_dpl;
+
+			vect_t P,P2;
+			P  = simd::set1(_pl);
+			P2 = simd::set1(_dpl);
+			Element * tab_w = &(this->pow_w) [0];
+			Element * tab_wp= &(this->pow_wp)[0];
+			size_t w, f;
+			for (w = n >> 1, f = 1; w >= 4; tab_w+=w, tab_wp+=w, w >>= 1, f <<= 1){
+				// w : width of butterflies
+				// f : # families of butterflies
+				for (size_t i = 0; i < f; i++)
+					for (size_t j = 0; j < w; j+=4)
+
+#define A0 &fft[0] +  (i << 1)   *w+ j
+#define A4 &fft[0] + ((i << 1)+1)*w+ j
+						this->Butterfly_DIF_mod2p(A0,A4, tab_w+j,tab_wp+j,P,P2);
+#undef A0
+#undef A4
+				//std::cout<<fft<<std::endl;
+			}
+			// Last two steps
+			if (n >= 8) {
+				vect_t W,Wp;
+				W = simd::set1 (tab_w [1]);
+				Wp= simd::set1 (tab_wp[1]);
+
+				for (size_t i = 0; i < f; i+=2)
+#define A0 &fft[0] +  (i << 2)
+#define A4 &fft[0] + ((i << 2)+4)
+					this->Butterfly_DIF_mod2p_laststeps(A0,A4,W,Wp,P,P2);
+				//std::cout<<fft<<std::endl;
+#undef A0
+#undef A4
+			} else {
+				FFT_algorithms<Field, NoSimd<Element>, 1> fft_algo_1 (FFT_init<Field> (this->field(),this->ln,this->getRoot()));
+
+				for (; w >= 1; tab_w+=w, tab_wp+=w, w >>= 1, f <<= 1)
+					for (size_t i = 0; i < f; i++)
+						for (size_t j = 0; j < w; j++)
+							fft_algo_1.Butterfly_DIF_mod2p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
+			}
+		}
+
+		void DIT_mod4p (Element *fft) {
+			const uint64_t& n = this->n;
+			const Residu_t& _pl = this->_pl;
+			const Residu_t& _dpl = this->_dpl;
+
+			vect_t P,P2;
+			P = simd::set1(_pl);
+			P2 = simd::set1(_dpl);
+			// First two steps
+			if (n >= 8) {
+				vect_t W,Wp;
+				W = simd::set1 ((this->pow_w) [n-3]);
+				Wp= simd::set1 ((this->pow_wp)[n-3]);
+
+				for (size_t i = 0; i < n; i+=8)
+					this->Butterfly_DIT_mod4p_firststeps(&fft[i],&fft[i+4],W,Wp,P,P2);
+
+				Element * tab_w = &(this->pow_w) [n-8];
+				Element * tab_wp= &(this->pow_wp)[n-8];
+				for (size_t w = 4, f = n >> 3; f >= 1; w <<= 1, f >>= 1, tab_w-=w, tab_wp-=w){
+					// w : width of butterflies
+					// f : # families of butterflies
+					for (size_t i = 0; i < f; i++)
+						for (size_t j = 0; j < w; j+=4)
+#define A0 &fft[0] +  (i << 1)   *w+ j
+#define A4 &fft[0] + ((i << 1)+1)*w+ j
+							this->Butterfly_DIT_mod4p(A0,A4, tab_w+j,tab_wp+j,P,P2);
+
+#undef A0
+#undef A4
+
+				}
+			} else {
+				FFT_algorithms<Field, NoSimd<Element>, 1> fft_algo_1 (FFT_init<Field> (this->field(),this->ln,this->getRoot()));
+
+				Element * tab_w = &(this->pow_w) [n-2];
+				Element * tab_wp= &(this->pow_wp)[n-2];
+				for (size_t w = 1, f = n >> 1; f >= 1; w <<= 1, f >>= 1, tab_w-=w, tab_wp-=w)
+					for (size_t i = 0; i < f; i++)
+						for (size_t j = 0; j < w; j++)
+							fft_algo_1.Butterfly_DIT_mod4p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
+			}
+		}
+
+		// Runs DIF_mod2p(), then applies the final correction by p to every entry.
+		void DIF (Element *fft) {
+			DIF_mod2p(fft);
+			if (this->n >= 4) {
+				// reduce() is expected to handle one simd vector per call; this class has
+				// vect_size == 4, so the stride is 4 (a stride of 8 skipped every second vector).
+				vect_t P = simd::set1(this->_pl);
+				for (uint64_t i = 0; i < this->n; i += 4)
+					reduce<Element,simd>(&fft[i],P);
+			} else {
+				for (uint64_t i = 0; i < this->n; i++)
+					if (fft[i] >= this->_pl) fft[i] -= this->_pl;
+			}
+		}
+
+		// Runs DIT_mod4p(), then applies the final correction: first by 2p, then by p
+		// (the scalar branch shows the intent: conditional subtraction of 2p, then of p).
+		void DIT (Element *fft) {
+			DIT_mod4p(fft);
+			if (this->n >= 4) {
+				// reduce() is expected to handle one simd vector per call; this class has
+				// vect_size == 4, so the stride is 4 (a stride of 8 skipped every second vector).
+				vect_t P  = simd::set1(this->_pl);
+				vect_t P2 = simd::set1(this->_dpl);
+				for (uint64_t i = 0; i < this->n; i += 4){
+					reduce<Element,simd>(&fft[i],P2);
+					reduce<Element,simd>(&fft[i],P);
+				}
+			} else {
+				for (uint64_t i = 0; i < this->n; i++) {
+					if (fft[i] >= (this->_pl << 1)) fft[i] -= (this->_pl << 1);
+					if (fft[i] >= this->_pl) fft[i] -= this->_pl;
+				}
+			}
+		}
+
+	}; // FFT_algorithms<Field, simd, 4>
+
+	template<typename Field, typename simd>
+	class FFT_algorithms<Field, simd, 8> : public FFT_butterflies<Field, simd, 8> {
+	public:
+		using Element = typename Field::Element;
+		using vect_t = typename simd::vect_t;
+
+		FFT_algorithms(const FFT_init<Field>& f_i) : FFT_butterflies<Field, simd, 8>(f_i) {
+			linbox_check(simd::vect_size == 8);
+		}
+
+		void DIF_mod2p (Element *fft) {
+			vect_t P,P2;
+			P = simd::set1(this->_pl);
+			P2 = simd::set1(this->_dpl);
+
+			Element * tab_w = &(this->pow_w) [0];
+			Element * tab_wp= &(this->pow_wp)[0];
+			size_t w, f;
+			for (w = this->n >> 1, f = 1; w >= 8; tab_w+=w, tab_wp+=w, w >>= 1, f <<= 1){
+				// w : width of butterflies
+				// f : # families of butterflies
+				for (size_t i = 0; i < f; i++)
+					for (size_t j = 0; j < w; j+=8)
+#define A0 &fft[0] +  (i << 1)   *w+ j
+#define A4 &fft[0] + ((i << 1)+1)*w+ j
+						this->Butterfly_DIF_mod2p(A0,A4, tab_w+j,tab_wp+j,P,P2);
+
+#undef A0
+#undef A4
+				//std::cout<<fft<<std::endl;
+			}
+			// Last three steps
+			if (this->n >= 16) {
+				vect_t alpha,alphap,beta,betap;
+				Element tmp[8];
+				tmp[0]=tmp[4]=tab_w[0];
+				tmp[1]=tmp[5]=tab_w[1];
+				tmp[2]=tmp[6]=tab_w[2];
+				tmp[3]=tmp[7]=tab_w[3];
+				alpha = MemoryOp<Element,simd>::load(tmp);
+				tmp[0]=tmp[4]=tab_wp[0];
+				tmp[1]=tmp[5]=tab_wp[1];
+				tmp[2]=tmp[6]=tab_wp[2];
+				tmp[3]=tmp[7]=tab_wp[3];
+				alphap = MemoryOp<Element,simd>::load(tmp);
+				beta = simd::set1(tab_w [5]);
+				betap = simd::set1(tab_wp [5]);
+
+				for (size_t i = 0; i < f; i+=2)
+#define A0 &fft[0] + (i << 3)
+#define A4 &fft[0] + (i << 3)+8
+					this->Butterfly_DIF_mod2p_laststeps(A0,A4,alpha,alphap,beta,betap,P,P2);
+#undef A0
+#undef A4
+				//std::cout<<fft<<std::endl;
+			} else {
+				// TODO : improve ?
+				//FFT_algorithms<Field, NoSimd<Element>, 1> fft_algo_1 ((FFT_init<Field>) *this);
+				FFT_algorithms<Field, NoSimd<Element>, 1> fft_algo_1 (FFT_init<Field> (this->field(),this->ln,this->getRoot()));
+
+				for (; w >= 1; tab_w+=w, tab_wp+=w, w >>= 1, f <<= 1)
+					for (size_t i = 0; i < f; i++)
+						for (size_t j = 0; j < w; j++)
+							fft_algo_1.Butterfly_DIF_mod2p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
+			}
+		}
+
+		void DIT_mod4p (Element *fft) {
+			const auto &pow_w = this->pow_w;
+			const auto &pow_wp = this->pow_wp;
+			const uint64_t &n = this->n;
+
+			vect_t P,P2;
+			P = simd::set1(this->_pl);
+			P2 = simd::set1(this->_dpl);
+
+			// first three steps
+			if (n >= 16) {
+				vect_t alpha,alphap,beta,betap;
+				alpha = simd::set1((pow_w)[n-3]);
+				alphap = simd::set1((pow_wp)[n-3]);
+				Element tmp[8];
+				tmp[0]=tmp[4]=(pow_w)[n-8];
+				tmp[1]=tmp[5]=(pow_w)[n-7];
+				tmp[2]=tmp[6]=(pow_w)[n-6];
+				tmp[3]=tmp[7]=(pow_w)[n-5];
+				beta = MemoryOp<Element,simd>::load(tmp);
+				tmp[0]=tmp[4]=(pow_wp)[n-8];
+				tmp[1]=tmp[5]=(pow_wp)[n-7];
+				tmp[2]=tmp[6]=(pow_wp)[n-6];
+				tmp[3]=tmp[7]=(pow_wp)[n-5];
+				betap = MemoryOp<Element,simd>::load(tmp);
+				for (uint64_t i = 0; i < n; i+=16) {
+					this->Butterfly_DIT_mod4p_firststeps(&fft[i],&fft[i+8],alpha,alphap,beta,betap,P,P2);
+				}
+				const Element * tab_w = &(pow_w) [n-16];
+				const Element * tab_wp= &(pow_wp)[n-16];
+				for (size_t w = 8, f = n >> 4; f >= 1; w <<= 1, f >>= 1, tab_w-=w, tab_wp-=w){
+					// w : width of butterflies
+					// f : # families of butterflies
+					for (size_t i = 0; i < f; i++)
+						for (size_t j = 0; j < w; j+=8) {
+#define A0 &fft[0] +  (i << 1)   *w+ j
+#define A4 &fft[0] + ((i << 1)+1)*w+ j
+							this->Butterfly_DIT_mod4p(A0,A4, tab_w+j,tab_wp+j,P,P2);
+#undef A0
+#undef A4
+						}
+				}
+			} else {
+
+				FFT_algorithms<Field, NoSimd<Element>, 1> fft_algo_1 (FFT_init<Field> (this->field(),this->ln,this->getRoot()));
+
+				const Element * tab_w = &(pow_w) [n-2];
+				const Element * tab_wp= &(pow_wp)[n-2];
+				for (size_t w = 1, f = n >> 1; f >= 1; w <<= 1, f >>= 1, tab_w-=w, tab_wp-=w)
+					for (size_t i = 0; i < f; i++)
+						for (size_t j = 0; j < w; j++)
+							fft_algo_1.Butterfly_DIT_mod4p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
+			}
+		}
+
+		void DIF (Element *fft) {
+			DIF_mod2p(fft);
+
+			if (this->n >= 8) {
+				vect_t P;
+				P  = simd::set1(this->_pl);
+				for (uint64_t i = 0; i < this->n; i += 8){
+					reduce<Element,simd>(&fft[i],P);
+				}
+				return;
+
+			} else {
+				for (uint64_t i = 0; i < this->n; i++)
+					if (fft[i] >= this->_pl) fft[i] -= this->_pl;
+			}
+		}
+
+		void DIT (Element *fft) {
+			DIT_mod4p(fft);
+
+			if (this->n >= 8) {
+				vect_t P,P2;
+				P  = simd::set1(this->_pl);
+				P2 = simd::set1(this->_dpl);
+				for (uint64_t i = 0; i < this->n; i += 8){
+					reduce<Element,simd>(&fft[i],P2);
+					reduce<Element,simd>(&fft[i],P);
+				}
+				return;
+
+			} else {
+				for (uint64_t i = 0; i < this->n; i++) {
+					if (fft[i] >= (this->_pl << 1)) fft[i] -= (this->_pl << 1);
+					if (fft[i] >= this->_pl) fft[i] -= this->_pl;
+				}
+			}
+		}
+
+	}; // FFT_algorithms<Field, simd, 8>
+
+}
+
+#endif // __LINBOX_polynomial_fft_algorithms_H
diff --git a/linbox/algorithms/polynomial-matrix/polynomial-fft-butterflies.h b/linbox/algorithms/polynomial-matrix/polynomial-fft-butterflies.h
new file mode 100644
index 0000000..8afc717
--- /dev/null
+++ b/linbox/algorithms/polynomial-matrix/polynomial-fft-butterflies.h
@@ -0,0 +1,492 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/*
+ * Copyright (C) 2016 Romain Lebreton, Pascal Giorgi
+ *
+ * Written by Pascal Giorgi <pascal.giorgi at lirmm.fr>
+ *            Romain Lebreton <romain.lebreton at lirmm.fr>
+ *
+ * ========LICENCE========
+ * This file is part of the library LinBox.
+ *
+ * LinBox is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ */
+
+
+#ifndef __LINBOX_polynomial_fft_butterflies_H
+#define __LINBOX_polynomial_fft_butterflies_H
+
+#include <iostream>
+#include "linbox/util/debug.h"
+#include "linbox/linbox-config.h"
+#include "fflas-ffpack/fflas/fflas_simd.h"
+#include "linbox/algorithms/polynomial-matrix/polynomial-fft-init.h"
+#include "linbox/algorithms/polynomial-matrix/simd-additional-functions.h"
+
+namespace LinBox {
+
+	template<typename Field, typename simd = Simd<typename Field::Element>, uint8_t byn = simd::vect_size>
+	class FFT_butterflies : public FFT_init<Field> {
+	public:
+		FFT_butterflies(const FFT_init<Field>& f_i) : FFT_init<Field>(f_i) {
+			std::cerr<<"Not implemented !\n";
+		}
+	}; // FFT_butterflies
+
+	template<typename Field>
+	class FFT_butterflies<Field, NoSimd<typename Field::Element>, 1> : public FFT_init<Field> {
+	public:
+
+		using Element = typename Field::Element;
+
+		FFT_butterflies(const FFT_init<Field>& f_i) : FFT_init<Field>(f_i) {}
+
+		inline void Butterfly_DIT_mod4p (Element& A, Element& B, const Element& alpha, const Element& alphap) {
+			using Compute_t = typename Field::Compute_t;
+			// Harvey's algorithm
+			// 0 <= A,B < 4*p, p < 2^32 / 4
+			// alphap = Floor(alpha * 2^32 / p)
+
+			// TODO : replace by subtract if greater
+			if (A >= this->_dpl) A -= this->_dpl;
+
+			// TODO : replace by mul_mod_shoup
+			Element tmp = ((Element) alphap * (Compute_t)B) >> (8*sizeof(Element));
+			tmp = alpha * B - tmp * this->_pl;
+
+			// TODO : replace by add_r and sub_r
+			B = A + (this->_dpl - tmp);
+			//        B &= 0XFFFFFFFF;
+			A += tmp;
+		}
+
+		inline void Butterfly_DIF_mod2p (Element& A, Element& B, const Element& alpha, const Element& alphap) {
+			//std::cout<<A<<" $$ "<<B<<"("<<alpha<<","<<alphap<<" ) -> ";
+			using Compute_t = typename Field::Compute_t;
+			// Harvey's algorithm
+			// 0 <= A,B < 2*p, p < 2^32 / 4
+			// alphap = Floor(alpha * 2^32 / p)
+
+			Element tmp = A;
+
+			A += B;
+
+			if (A >= this->_dpl) A -= this->_dpl;
+
+			B = tmp + (this->_dpl - B);
+
+			tmp = ((Element) alphap * (Compute_t) B) >> (8*sizeof(Element));
+			B = alpha * B - tmp * this->_pl;
+			//B &= 0xFFFFFFFF;
+			//std::cout<<A<<" $$ "<<B<<"\n ";
+		}
+
+	}; // FFT_butterflies<Field, 1>
+
+	// WARNING: watch out for all the remaining uint64_t and SimdComp uses !!!!
+
+	template<typename Field, typename simd>
+	class FFT_butterflies<Field, simd, 4> : public FFT_init<Field> {
+	public:
+
+		using Element = typename Field::Element;
+		using vect_t = typename simd::vect_t;
+		using SimdComp = typename SimdCompute_t<simd,Field>::Compute_t;
+
+		FFT_butterflies(const FFT_init<Field>& f_i) : FFT_init<Field>(f_i) {
+			linbox_check(simd::vect_size == 4);
+		}
+
+		// TODO include P, P2 in precomp
+		// TODO : Same functions Butterfly_DIT_mod4p Butterfly_DIF_mod2p in FFT_butterflies<Field, 8>
+		inline void Butterfly_DIT_mod4p (Element* ABCD, Element* EFGH,
+										 const Element* alpha, const Element* alphap,
+										 const vect_t& P, const vect_t& P2) {
+			vect_t V1,V2,V3,V4,W,Wp,T1;
+			// V1=[A B C D], V2=[E F G H]
+			V1 = MemoryOp<Element,simd>::load(ABCD);
+			V2 = MemoryOp<Element,simd>::load(EFGH);
+			W  = MemoryOp<Element,simd>::load(alpha);
+			Wp = MemoryOp<Element,simd>::load(alphap);
+
+			// V3 = V1 mod 2P
+			V3 = reduce<simd>(V1, P2);
+
+			// V4 = V2 * W mod P
+			V4 = mul_mod<simd>(V2,W,P,Wp);
+
+			// V1 = V3 + V4
+			V1 = simd::add(V3,V4);
+			MemoryOp<Element,simd>::store(ABCD,V1);
+
+			// V2 = V3 - (V4 - 2P)
+			T1 = simd::sub(V4,P2);
+			V2 = simd::sub(V3,T1);
+			MemoryOp<Element,simd>::store(EFGH,V2);
+		}
+
+		inline void Butterfly_DIT_mod4p_firststeps (Element* ABCD, Element* EFGH,
+													const vect_t& W,
+													const vect_t& Wp,
+													const vect_t& P, const vect_t& P2) {
+			// First 2 steps
+			// First step
+			vect_t V1,V2,V3,V4,T1,T2,T3,T4;
+			// T1=[A B C D], T2=[E F G H]
+			T1 = MemoryOp<Element,simd>::load(ABCD);
+			T2 = MemoryOp<Element,simd>::load(EFGH);
+
+			// V1=[AECG], V2=[BFDH]
+			MemoryOp<Element,simd>::unpacklohi_twice4(V1,V2,T1,T2);
+
+			// V3 = V1 + V2
+			// Rk: No need for (. mod 2P) since entries are <P
+			V3 = simd::add(V1,V2);
+			// V4 = V1 + (P - V2)
+			// Rk: No need for (. mod 2P) since entries are <P
+			T1 = simd::sub(V2,P);
+			V4 = simd::sub(V1,T1);
+
+			MemoryOp<Element,simd>::unpacklohi4(V1,V2,V3,V4);
+
+			// Second step
+			// T1 = [D D H H]
+			T1 = MemoryOp<Element,simd>::unpackhi4(V4,V4);
+
+			T2 = mul_mod_half<simd, SimdComp>(T1, W, P, Wp);
+
+			T2 = simd::template shuffle<0xDD>(T2);
+			//T2 = simd::template shuffle<0xDD>(T2);
+
+			//At this point, T2 = [D*Wmodp H*Wmodp D*Wmodp H*Wmodp]
+
+			// At this time I have V3=[A E C G], V4=[B F ? ?], T2=[? ? D H]
+			// I need V1 = [A B E F], V2 = [C D G H]
+			// This is not refactored in MemoryOp::... because of different arguments (V3,V4) and (V3,T2)
+			V1 = MemoryOp<Element,simd>::unpacklo4(V3,V4);
+			V2 = MemoryOp<Element,simd>::unpackhi4(V3,T2);
+
+			// T1 = V1 + V2
+			T1 = simd::add(V1,V2);
+			// T2 = V1 - (V2 - 2P)
+			T3 = simd::sub(V2,P2);
+			T2 = simd::sub(V1,T3);
+
+			MemoryOp<Element,simd>::unpacklohi2(V1,V2,T1,T2);
+
+			// Store
+			MemoryOp<Element,simd>::store(ABCD,V1);
+			MemoryOp<Element,simd>::store(EFGH,V2);
+		}
+
+		inline void Butterfly_DIF_mod2p (Element* ABCD, Element* EFGH,
+										 const Element* alpha, const Element* alphap,
+										 const vect_t& P, const vect_t& P2) {
+			vect_t V1,V2,V3,V4,W,Wp,T;
+			// V1=[A B C D], V2=[E F G H]
+			V1 = MemoryOp<Element,simd>::load(ABCD);
+			V2 = MemoryOp<Element,simd>::load(EFGH);
+			W  = MemoryOp<Element,simd>::load(alpha);
+			Wp = MemoryOp<Element,simd>::load(alphap);
+			// V3 = V1 + V2 mod
+			V3 = add_mod<simd >(V1,V2,P2);
+			MemoryOp<Element,simd>::store(ABCD,V3);
+			// V4 = (V1+(2P-V2))alpha mod 2P
+			T = simd::sub(V2,P2);
+			V4 = simd::sub(V1,T);
+
+			T = mul_mod<simd >(V4,W,P,Wp);// T is the result
+			MemoryOp<Element,simd>::store(EFGH,T);
+		}
+
+		inline void Butterfly_DIF_mod2p_laststeps(Element* ABCD, Element* EFGH,
+												  const vect_t& W,
+												  const vect_t& Wp,
+												  const vect_t& P, const vect_t& P2) {
+			vect_t V1,V2,V3,V4,V5,V6,V7;
+			// V1=[A B C D], V2=[E F G H]
+			V1 = MemoryOp<Element,simd>::load(ABCD);
+			V2 = MemoryOp<Element,simd>::load(EFGH);
+
+			/* 1st step */
+			// V3=[A E B F], V4=[C G D H]
+			MemoryOp<Element,simd>::unpacklohi4(V3,V4,V1,V2);
+
+			// V1 = V3 + V4 mod 2P
+			// P2 = [2p 2p 2p 2p]
+			V1 = add_mod<simd >(V3,V4,P2);
+			// V2 = (V3+(2P-V4))alpha mod 2P
+			V5 = simd::sub(V4,P2);
+			V6 = simd::sub(V3,V5);
+			V2 = reduce<simd >(V6, P2);
+			// V4 = [D D H H]
+			V4 = MemoryOp<Element,simd>::unpackhi4(V2,V2);
+
+			// V3 = [* D * H]
+			V3 = mul_mod_half<simd, SimdComp>(V4, W, P, Wp);
+
+			//At this point, V3 = [D*Wmodp H*Wmodp D*Wmodp H*Wmodp]
+			V3 = simd::template shuffle<0xDD>(V3);
+			//V3 = simd::template shuffle<0xDD>(V3); // 0xDD = [3 1 3 1]_base4
+
+			// At this time I have V1=[A E B F], V2=[C G ? ?], V3=[? ? D H]
+			// I need V3 = [A C E G], V4 = [B D F H]
+			// This is not refactored in MemoryOp::... because of different arguments (V1,V3) and (V1,V2)
+			V4 = MemoryOp<Element,simd>::unpackhi4(V1,V3);
+			V3 = MemoryOp<Element,simd>::unpacklo4(V1,V2);
+
+			/* 2nd step */
+			// V1 = V3 + V4 mod 2P
+			V1 = add_mod<simd >(V3,V4,P2);
+			// V2 = V3 + (2P - V4) mod 2P
+			V5 = simd::sub(V4,P2);
+			V6 = simd::sub(V3,V5);
+			V2 = reduce<simd >(V6, P2);
+			// Result in V1 = [A C E G]  and V2 = [B D F H]
+			// Transform to V3=[A B C D], V4=[E F G H]
+			MemoryOp<Element,simd>::unpacklohi4(V3,V4,V1,V2);
+			// Store
+			MemoryOp<Element,simd>::store(ABCD,V3);
+			MemoryOp<Element,simd>::store(EFGH,V4);
+		}
+
+	}; // FFT_butterflies<Field, 4>
+
+
+	template<typename Field, typename simd>
+	class FFT_butterflies<Field, simd, 8> : public FFT_init<Field> {
+	public:
+
+		using Element = typename Field::Element;
+		using vect_t = typename simd::vect_t;
+		using SimdComp = typename SimdCompute_t<simd,Field>::Compute_t;
+
+		FFT_butterflies(const FFT_init<Field>& f_i) : FFT_init<Field>(f_i) {
+			linbox_check(simd::vect_size == 8);
+		}
+
+		// TODO include P, P2 in precomp
+		inline void Butterfly_DIT_mod4p (Element* ABCDEFGH, Element* IJKLMNOP,
+										 const Element* alpha, const Element* alphap,
+										 const vect_t& P, const vect_t& P2) {
+			vect_t V1,V2,V3,V4,W,Wp,T1;
+			// V1=[A B C D E F G H], V2=[I J K L M N O P]
+			V1 = MemoryOp<Element,simd>::load(ABCDEFGH);
+			V2 = MemoryOp<Element,simd>::load(IJKLMNOP);
+			W  = MemoryOp<Element,simd>::load(alpha);
+			Wp = MemoryOp<Element,simd>::load(alphap);
+
+			// V3 = V1 mod 2P
+			V3 = reduce<simd>(V1, P2);
+
+			// V4 = V2 * W mod P
+			V4 = mul_mod<simd>(V2,W,P,Wp);
+
+			// V1 = V3 + V4
+			V1 = simd::add(V3,V4);
+			MemoryOp<Element,simd>::store(ABCDEFGH,V1);
+
+			// V2 = V3 - (V4 - 2P)
+			T1 = simd::sub(V4,P2);
+			V2 = simd::sub(V3,T1);
+			MemoryOp<Element,simd>::store(IJKLMNOP,V2);
+		}
+
+		inline void Butterfly_DIT_mod4p_firststeps (Element* ABCDEFGH, Element* IJKLMNOP,
+													const vect_t& alpha,const vect_t& alphap,
+													const vect_t& beta ,const vect_t& betap,
+													const vect_t& P    ,const vect_t& P2) {
+			// First 3 steps
+			vect_t V1,V2,V3,V4,V5,V6,V7,Q;
+			// V1=[A B C D E F G H], V2=[I J K L M N O P]
+			V1 = MemoryOp<Element,simd>::load(ABCDEFGH);
+			V2 = MemoryOp<Element,simd>::load(IJKLMNOP);
+
+			/*********************************************/
+			/* 1st STEP */
+			/*********************************************/
+			// Transform to V3=[A I C K E M G O], V4=[B J D L F N H P]
+			MemoryOp<Element,simd>::unpacklohi_twice8(V6,V7,V1,V2);
+			MemoryOp<Element,simd>::unpacklohi_twice4(V3,V4,V6,V7);
+
+			// V1 = V3 + V4;       V1 = [A I C K E M G O]
+			// Rk: No need for (. mod 2P) since entries are <P
+			V1 = simd::add(V3,V4);
+
+			// V2 = V3 + (P - V4); V2 = [B J D L F N H P]
+			// Rk: No need for (. mod 2P) since entries are <P
+			V6 = simd::sub(V4,P);
+			V2 = simd::sub(V3,V6);
+
+			/*********************************************/
+			/* 2nd STEP */
+			/*********************************************/
+			// V5 = [D D L L H H P P]
+			V5 = MemoryOp<Element,simd>::unpackhi_twice8(V2,V2);
+
+			// V3 = [* D * L * H * P]
+			V3 = mul_mod_half<simd,SimdComp>(V5,alpha,P,alphap);
+
+			// V7 = [D L D L H P H P]
+			V7 = MemoryOp<Element,simd>::shuffletwice8_DD(V3);
+			//V7 = simd::template shuffle_twice<0xDD>(V3); // 0xDD = 221 = [3 1 3 1]_base4
+
+			// V3= [A B I J E F M N], V4=[C D K L G H O P]
+			V3 = MemoryOp<Element,simd>::unpacklo_twice8(V1,V2);
+			V4 = MemoryOp<Element,simd>::unpackhi_twice8(V1,V7);
+
+			// V1 = V3+V4
+			V1 = simd::add(V3,V4);
+			// V2 = V3 - (V4 - 2P)
+			V7 = simd::sub(V4,P2);
+			V2 = simd::sub(V3,V7);
+
+			/*********************************************/
+			/* 3rd STEP */
+			/*********************************************/
+			// V3= [A B C D I J K L] V4= [E F G H M N O P]
+			MemoryOp<Element,simd>::unpacklohi_twice4(V6,V7,V1,V2);
+			MemoryOp<Element,simd>::unpacklohi2(V3,V4,V6,V7);
+
+			// V6= V3 mod 2P
+			V6 = reduce<simd >(V3, P2);
+
+			// V7= V4.beta mod p
+			V7 = mul_mod<simd >(V4,beta,P,betap);
+
+			// V1 = V6+V7
+			V1 = simd::add(V6,V7);
+
+			// V2 = V6 - (V7 - 2P)
+			V5 = simd::sub(V7,P2);
+			V2 = simd::sub(V6,V5);
+
+			/*********************************************/
+			// V3=[A B C D E F G H] V4=[I J K L M N O P]
+			MemoryOp<Element,simd>::unpacklohi2(V3,V4,V1,V2);
+
+			// Store
+			MemoryOp<Element,simd>::store(ABCDEFGH,V3);
+			MemoryOp<Element,simd>::store(IJKLMNOP,V4);
+		}
+
+		inline void Butterfly_DIF_mod2p (Element* ABCDEFGH, Element* IJKLMNOP,
+										 const Element* alpha, const Element* alphap,
+										 const vect_t& P, const vect_t& P2) {
+			vect_t V1,V2,V3,V4,W,Wp,T;
+			// V1=[A B C D E F G H], V2=[I J K L M N O P]
+			V1 = MemoryOp<Element,simd>::load(ABCDEFGH);
+			V2 = MemoryOp<Element,simd>::load(IJKLMNOP);
+			W  = MemoryOp<Element,simd>::load(alpha);
+			Wp = MemoryOp<Element,simd>::load(alphap);
+
+			// V3 = V1 + V2 mod
+
+			V3 = add_mod<simd >(V1,V2,P2);
+
+			MemoryOp<Element,simd>::store(ABCDEFGH,V3);
+
+			// V4 = (V1+(2P-V2))alpha mod 2P
+			T = simd::sub(V2,P2);
+			V4 = simd::sub(V1,T);
+			T = mul_mod<simd >(V4,W,P,Wp);// T is the result
+			MemoryOp<Element,simd>::store(IJKLMNOP,T);
+		}
+
+		inline void Butterfly_DIF_mod2p_laststeps(Element* ABCDEFGH, Element* IJKLMNOP,
+												  const vect_t& alpha,const vect_t& alphap,
+												  const vect_t& beta ,const vect_t& betap,
+												  const vect_t& P, const vect_t& P2) {
+			// Last 3 steps
+			vect_t V1,V2,V3,V4,V5,V6,V7,Q;
+
+			// V1=[A B C D E F G H], V2=[I J K L M N O P]
+			V1 = MemoryOp<Element,simd>::load(ABCDEFGH);
+			V2 = MemoryOp<Element,simd>::load(IJKLMNOP);
+
+			/* 1st step */
+			// V3=[A B C D I J K L] V4=[E F G H M N O P]
+			MemoryOp<Element,simd>::unpacklohi2(V3,V4,V1,V2);
+
+			// V1 = V3 + V4 mod 2P
+			// P2 = [2p 2p 2p 2p]
+			V1 = add_mod<simd >(V3,V4,P2);
+
+			// V2 = (V3+(2P-V4))alpha mod 2P
+			V5 = simd::sub(V4,P2);
+			V6 = simd::sub(V3,V5);
+			V7 = reduce<simd >(V6, P2);
+			V2 = mul_mod<simd >(V7,alpha,P,alphap);
+
+			/* 2nd step */
+
+			// V3=[A E B F I M J N] V4=[C G D H K O L P]
+			MemoryOp<Element,simd>::unpacklohi_twice8(V3,V4,V1,V2);
+
+			// V1 = V3 + V4 mod 2P
+			// P2 = [2p 2p 2p 2p]
+			V1 = add_mod<simd >(V3,V4,P2);
+
+			// V2 = (V3+(2P-V4))alpha mod 2P
+			// V7 =  (V3+(2P-V4)) mod 2P
+			V5 = simd::sub(V4,P2);
+			V6 = simd::sub(V3,V5);
+			V7 = reduce<simd >(V6, P2);
+
+			// V4 = [D D H H L L P P ]
+			V4 = MemoryOp<Element,simd>::unpackhi_twice8(V7,V7);
+
+			// V3 = [ * D * H * L * P]
+			V3 = mul_mod_half<simd,SimdComp>(V4,beta,P,betap);
+
+			// V2=[D H D H L P L P] but only [* * D H * * L P] matters
+			V2 = MemoryOp<Element,simd>::shuffletwice8_DD(V3);
+			//V2 = simd::template shuffle_twice<0xDD>(V3); // 0xDD = 221 = [3 1 3 1]_base4
+
+			/* 3rd step */
+			// At this time I have V1=[A B E F I J M N], V7=[C G * * K O * *], V2=[* * D H * * L P]
+			// I need V3 = [A C E G I K M O], V4=[B D F H J L N P]
+			V3 = MemoryOp<Element,simd>::unpacklo_twice8(V1,V7);
+			V4 = MemoryOp<Element,simd>::unpackhi_twice8(V1,V2);
+
+			// V1 = V3 + V4 mod 2P
+			V1 = add_mod<simd >(V3,V4,P2);
+
+			// V2 = V3 + (2P - V4) mod 2P
+			V5 = simd::sub(V4,P2);
+			V6 = simd::sub(V3,V5);
+			V2 = reduce<simd >(V6, P2);
+
+			// Result in    V1=[A C E G I K M O] V2=[B D F H J L N P]
+			// Transform to V3=[A B C D I J K L],V4=[E F G H M N O P]
+			MemoryOp<Element,simd>::unpacklohi_twice8(V3,V4,V1,V2);
+
+			// Transform to V1=[A B C D E F G H], V2=[I J K L M N O P]
+			MemoryOp<Element,simd>::unpacklohi2(V1,V2,V3,V4);
+
+			// Store
+			MemoryOp<Element,simd>::store(ABCDEFGH,V1);
+			MemoryOp<Element,simd>::store(IJKLMNOP,V2);
+
+
+		}
+
+
+	}; // FFT_butterflies<Field, 8>
+
+}
+
+#endif // __LINBOX_polynomial_fft_butterflies_H
diff --git a/linbox/algorithms/polynomial-matrix/polynomial-fft-init.h b/linbox/algorithms/polynomial-matrix/polynomial-fft-init.h
new file mode 100644
index 0000000..19dcd66
--- /dev/null
+++ b/linbox/algorithms/polynomial-matrix/polynomial-fft-init.h
@@ -0,0 +1,299 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/*
+ * Copyright (C) 2016 Romain Lebreton, Pascal Giorgi
+ *
+ * Written by Pascal Giorgi <pascal.giorgi at lirmm.fr>
+ *            Romain Lebreton <romain.lebreton at lirmm.fr>
+ *
+ * ========LICENCE========
+ * This file is part of the library LinBox.
+ *
+ * LinBox is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ */
+
+
+#ifndef __LINBOX_polynomial_fft_init_H
+#define __LINBOX_polynomial_fft_init_H
+
+
+#include <iostream>
+#include "linbox/linbox-config.h"
+#include "linbox/util/debug.h"
+#include "givaro/givinteger.h"
+#include <fflas-ffpack/fflas/fflas_simd.h>
+
+#ifndef ROUND_DOWN
+#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
+#endif
+
+// template<typename T>
+// std::ostream& operator<<(std::ostream& os, const std::vector<T> &x){
+// 	std::ostream_iterator<T> out_it (os,", ");
+// 	std::copy ( x.begin(), x.end(), out_it );
+// 	return os;
+// }
+
+#include "fflas-ffpack/utils/align-allocator.h"
+
+#ifdef __LINBOX_HAVE_SSE4_1_INSTRUCTIONS
+
+//#include "linbox/algorithms/polynomial-matrix/simd.h"
+
+#include "fflas-ffpack/fflas/fflas_simd.h"
+
+#ifdef __LINBOX_USE_AVX2
+/* 256 bits CODE HERE */
+#define __LINBOX_USE_AVX2
+// define 256 bits simd vector type
+typedef __m256i  _vect256_t;
+#endif
+// define 128 bits simd vector type
+typedef __m128i  _vect128_t;
+#endif
+
+namespace LinBox {
+
+	enum SimdLevel {NOSIMD,SSE41,AVX,AVX2}; // SIMD levels; the active one is chosen at compile time by SimdLevelFinder
+
+	// Compile-time selection of the SIMD level from the build configuration macros.
+	// The first macro that is defined wins, in this order:
+	//   __LINBOX_USE_AVX2 -> AVX2, __LINBOX_USE_AVX -> AVX, __LINBOX_USE_SIMD -> SSE41.
+	// NOSIMD is reported when none of them is defined.
+	struct SimdLevelFinder {
+#if defined(__LINBOX_USE_AVX2)
+		static const SimdLevel simdlevel = AVX2;
+#elif defined(__LINBOX_USE_AVX)
+		static const SimdLevel simdlevel = AVX;
+#elif defined(__LINBOX_USE_SIMD)
+		static const SimdLevel simdlevel = SSE41;
+#else
+		static const SimdLevel simdlevel = NOSIMD;
+#endif
+	};
+
+	// class to handle FFT transform over wordsize prime field Fp (p < 2^29)
+	//	template <class Field, int SL = SimdLevelFinder::simdlevel>
+	// TODO: make this generic over Simd if precomputations have to be stored in Simd::vect_t
+	template <class Field>
+	class FFT_init {
+	public:
+		using Element = typename Field::Element;
+		using Compute_t = typename Field::Compute_t;
+		using Residu_t = typename Field::Residu_t;
+
+		const Field                *fld;
+		Residu_t              _pl, _dpl;
+		uint64_t                      n;
+		size_t                       ln;
+		//Compute_t                  _logp;
+		//Compute_t                     _I;
+		//double                    _pinv;
+		Element                      _w;
+		Element                   _invw;
+		// Container type that is handed to the Butterfly routines
+		typedef std::vector<Element,AlignedAllocator<Element, Alignment::DEFAULT> > VECT;
+		VECT    pow_w;
+		VECT   pow_wp; // Precomputations in shoup
+		VECT    _data;
+		Element                      _p;
+		//   pow_w = table of roots of unity. If w = primitive K-th root, then the table is:
+		//           1, w, w^2, ..., w^{K/2-1},
+		//           1, w^2, w^4, ..., w^{K/2-2},
+		//           1, w^4, w^8, ..., w^{K/2-4}
+		//           ...
+		//           1, w^{K/8}, w^{K/4}, w^{3K/8},
+		//           1, w^{K/4},
+		//           1.
+
+		inline const Field & field() const { return *fld; } // field given to the constructor (stored as a pointer)
+
+		// Returns an element of multiplicative order exactly 2^_val2p, assuming
+		// p - 1 = 2^_val2p * _m with _m odd. Side effect: reseeds the C RNG via srand().
+		Element find_gen (Residu_t _m, uint64_t _val2p) {
+			srand((unsigned int) time(NULL));
+			Element y,z;
+			uint64_t j;
+			Element _gen;
+			for (;;) {
+				fld->init(_gen,rand());
+				fld->init(z, 1);
+				for (Residu_t i=0; i < _m; ++i) fld->mulin(z,_gen); // z = _gen^_m
+				if (z == 1) continue; // _gen^_m == 1: nothing to extract from this draw
+				_gen = z;
+				j = 0;
+				do { // after j rounds, z = _gen^(2^j)
+					y = z;
+					fld->mul(z,y,y); // z = y * y;
+					j++;
+				} while (j != _val2p && z != 1);
+				// Accept only if 1 is reached after exactly _val2p squarings; requiring z == 1
+				// also rejects a zero draw, which stays 0 through every squaring.
+				if (j == _val2p && z == 1) break;
+			}
+			return _gen;
+		}
+
+		// Integral-Element overload: fills pow_w (layout described with the pow_w member)
+		// and, for every stored power, the matching precomp_b() value in pow_wp.
+		// Does nothing when ln == 0.
+		template<typename T=Element>
+		typename std::enable_if<std::is_integral<T>::value>::type init_powers () {
+			size_t pos = 0;
+			Element wi = 1;
+			// Precomp Quo(2^32,p); invp is not read again in this function
+			Compute_t invp; fld->precomp_p(invp);
+
+			if (ln>0){
+//				using simd=Simd<uint32_t>;
+//				using vect_t =typename simd::vect_t;
+
+				size_t tpts = size_t(1) << (ln - 1); // shift in size_t: an int shift is undefined for ln > 31
+				size_t i=0;
+//				for( ;i<std::min(simd::vect_size+1, tpts);i++,pos++){
+				// Precompute pow_wp[1] for faster mult by pow_w[1]
+				for( ;i<std::min((size_t) 2, tpts);i++,pos++){
+					pow_w[pos] = wi;
+
+					// Fake conversion since precomp_b will be used as a Compute_t in mul_precomp_b
+					Compute_t temp;
+					fld->precomp_b(temp, wi); //(((Compute_t)wi*invp)>>(fld->_bitsizep));
+					pow_wp[pos] = static_cast<Element>(temp);
+
+					fld->mulin(wi, _w);
+				}
+				/*
+				vect_t wp_vect, Q_vect,BAR_vect,w_vect,pow_w_vect,pow_wp_vect, pl_vect;
+				BAR_vect= simd::set1(BAR);
+				wp_vect = simd::set1(pow_wp[simd::vect_size]);
+				w_vect  = simd::set1(pow_w[simd::vect_size]);
+				pl_vect = simd::set1(_pl);
+				for (; i < ROUND_DOWN(tpts,simd::vect_size);
+					 i+=simd::vect_size,pos+=simd::vect_size) {
+					pow_w_vect  = simd::loadu((int32_t*)pow_w.data()+pos-simd::vect_size);
+					Q_vect=simd::mulhi(pow_w_vect,wp_vect);
+					pow_w_vect = simd::sub(simd::mullo(pow_w_vect,w_vect),simd::mullo(Q_vect,pl_vect));
+					pow_w_vect=simd::sub(pow_w_vect, simd::vandnot(simd::greater(pow_w_vect,pl_vect),pl_vect));
+					simd::storeu((int32_t*)pow_w.data()+pos,pow_w_vect);
+					pow_wp_vect= simd::mulhi(simd::sll(pow_w_vect,32-_logp),BAR_vect);
+					simd::storeu((int32_t*)pow_wp.data()+pos,pow_wp_vect);
+				}
+				*/
+				// Use pow_wp[1] for speed-up mult by pow_w[1]
+				for( ;i<tpts;i++,pos++){
+					pow_w[pos] = wi;
+
+					// Fake conversion since precomp_b will be used as a Compute_t in mul_precomp_b
+					Compute_t temp;
+					fld->precomp_b(temp, wi); //(((Compute_t)wi*invp)>>(fld->_bitsizep));
+					pow_wp[pos] = static_cast<Element>(temp);
+
+					fld->mul_precomp_b(wi, wi, _w, static_cast<Compute_t>(pow_wp[1]));
+				}
+
+				// Other pow_w elements can be read from previously computed pow_w
+				for(size_t k=2;k<=tpts;k<<=1)
+					for(size_t i=0;i<tpts;i+=k,pos++){
+						pow_w[pos]  = pow_w[i];
+						pow_wp[pos] = pow_wp[i];
+					}
+
+//				std::cout << "Check precomputations : pow_w, pow_wp \n";
+//				std::cout << "[";
+//				for (size_t i=0; i < tpts; i++) std::cout << pow_w[i] << ", ";
+//				std::cout << "]\n";
+//				std::cout << "[";
+//				for (size_t i=0; i < tpts; i++) std::cout << pow_wp[i] << ", ";
+//				std::cout << "]\n\n";
+			}
+		}
+
+		// Floating-point-Element overload: fills pow_w only; pow_wp is not written here.
+		// Does nothing when ln == 0.
+		template<typename T=Element>
+		typename std::enable_if<std::is_floating_point<T>::value>::type init_powers () {
+			size_t pos = 0;
+			Element wi = 1;
+
+			if (ln>0){
+				size_t tpts = size_t(1) << (ln - 1); // shift in size_t: an int shift is undefined for ln > 31
+				// First level: successive powers 1, _w, _w^2, ... (tpts entries)
+				for(size_t i=0; i<tpts;i++,pos++){
+					pow_w[pos] = wi;
+					fld->mulin(wi,_w);
+				}
+
+				// Other pow_w elements can be read from previously computed pow_w
+				for(size_t k=2;k<=tpts;k<<=1)
+					for(size_t i=0;i<tpts;i+=k,pos++){
+						pow_w[pos]  = pow_w[i];
+					}
+
+			}
+		}
+
+		// Builds the tables for transforms of size n = 2^ln2 over fld2 (kept by pointer).
+		// w: root of unity to use; when 0 (the default) one is derived from find_gen().
+		// linbox_check enforces 8*p <= maxCardinality() and ln2 <= 2-adic valuation of p-1.
+		FFT_init (const Field& fld2, size_t ln2, Element w = 0)
+			: fld (&fld2), n ((1UL << ln2)), ln (ln2), pow_w(n - 1), pow_wp(n - 1), _data(n) {
+			_pl = fld->characteristic();
+			_p  = fld->characteristic();
+			linbox_check(_pl <= (field().maxCardinality() >> 3)); // 8*p <= field().maxCardinality() for Harvey's butterflies
+			_dpl = (_pl << 1);
+
+			Givaro::Timer chrono;
+			chrono.start();
+
+			uint64_t _val2p = 0;
+			Residu_t     _m = _pl - 1; // write p - 1 = 2^_val2p * _m with _m odd
+			while ((_m & 1) == 0) {
+				_m >>= 1;
+				_val2p++;
+			}
+
+			linbox_check(ln <= _val2p);      // Otherwise there is no primitive 2^ln-th root of unity
+
+			if (w == 0){   // find a pseudo 2^lpts-th primitive root of unity
+				//_I = (1L << (_logp << 1)) / _pl;
+				Element _gen = find_gen (_m, _val2p);
+				_w = Givaro::powmod(_gen, 1UL<<(_val2p-ln), _pl);
+			}
+			else {
+				_w = w;
+			}
+
+			// compute w^(-1) mod p = w^(2^lpts - 1)
+			_invw = Givaro::powmod(_w, (1UL<<ln) - 1, _pl);
+
+			chrono.clear();
+			chrono.start();
+
+			init_powers();
+
+			chrono.stop();
+			//cout<<"FFT: table="<<chrono<<endl;
+		}
+
+
+		Element getRoot() const {return _w;} // root of unity supplied to, or derived by, the constructor
+		Element getInvRoot() const {return _invw;} // _w^(2^ln - 1) mod p, as computed in the constructor
+
+	}; //FFT_init
+
+}
+
+#endif // __LINBOX_polynomial_fft_init_H
diff --git a/linbox/algorithms/polynomial-matrix/polynomial-fft-transform-simd.inl b/linbox/algorithms/polynomial-matrix/polynomial-fft-transform-simd.inl
index 6c5e0f8..d986e16 100644
--- a/linbox/algorithms/polynomial-matrix/polynomial-fft-transform-simd.inl
+++ b/linbox/algorithms/polynomial-matrix/polynomial-fft-transform-simd.inl
@@ -1,4 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014  Pascal Giorgi, Romain Lebreton
  *
@@ -28,7 +29,36 @@
 #ifndef __LINBOX_polynomial_fft_transform_simd_INL
 #define __LINBOX_polynomial_fft_transform_simd_INL
 
-#include "linbox/algorithms/polynomial-matrix/simd.h"
+#include "fflas-ffpack/fflas/fflas_simd.h"
+
+//#include "linbox/algorithms/polynomial-matrix/simd.h"
+
+#ifndef additional_modular_simd_functions
+#define additional_modular_simd_functions
+
+#define Simd_vect typename Simd::vect_t
+
+// Lane-wise a - vandnot(p, greater(p,a)); presumably removes p from lanes with a >= p (TODO confirm vandnot operand order).
+template <class Simd>
+inline Simd_vect reduce (const Simd_vect a, const Simd_vect p) {
+	return Simd::sub(a, Simd::vandnot(p, Simd::greater(p,a)));
+}
+
+// Lane-wise a + b followed by a single reduce() against p.
+template <class Simd>
+inline Simd_vect add_mod (const Simd_vect a, const Simd_vect b, const Simd_vect p) {
+	return reduce<Simd>(Simd::add(a,b), p);
+}
+
+// Lane-wise mullo(a,b) - mullo(mulhi(a,bp), p); no final correction step is applied here.
+// bp is presumably the precomputed quotient that goes with b (cf. the pow_wp tables) -- TODO confirm.
+template <class Simd>
+inline Simd_vect mul_mod (const Simd_vect a, const Simd_vect b, const Simd_vect p, const Simd_vect bp) {
+	const Simd_vect quotient_times_p = Simd::mullo(Simd::mulhi(a,bp), p);
+	return Simd::sub(Simd::mullo(a,b), quotient_times_p);
+}
+#undef Simd_vect
+#endif
 
 namespace LinBox {
 
@@ -41,11 +71,11 @@ namespace LinBox {
 
 	template <class Field>
 	inline void FFT_transform<Field>::reduce128_modp(uint32_t* ABCD, const _vect128_t& P) {
-		_vect128_t V1,T;
+		_vect128_t V1;
 		// V1=[A B C D], V2=[E F G H]
-		VEC128_LOAD(V1,ABCD);
-		VEC128_MOD_P(V1,V1,P,T);
-		VEC128_STORE(ABCD,V1);
+		V1 = Simd128<uint32_t>::load(ABCD);
+		V1 = reduce<Simd128<uint32_t> >(V1, P);
+		Simd128<uint32_t>::store(ABCD,V1);
 	}
 
 	/*-----------------------------------*/
@@ -55,161 +85,162 @@ namespace LinBox {
 
 	template <class Field>
 	inline void FFT_transform<Field>::Butterfly_DIF_mod2p_4x1_SSE(uint32_t* ABCD, uint32_t* EFGH,
-								      const uint32_t* alpha,
-								      const uint32_t* alphap,
-								      const _vect128_t& P, const _vect128_t& P2)
+																  const uint32_t* alpha,
+																  const uint32_t* alphap,
+																  const _vect128_t& P, const _vect128_t& P2)
 	{
 		_vect128_t V1,V2,V3,V4,W,Wp,T;
 		// V1=[A B C D], V2=[E F G H]
-		VEC128_LOAD(V1,ABCD);
-		VEC128_LOAD(V2,EFGH);
-		VEC128_LOAD(W ,alpha);
-		VEC128_LOAD(Wp,alphap);
-		// V3 = V1 + V2 mod
-		VEC128_ADD_MOD(V3,V1,V2,P2,T);
-		VEC128_STORE(ABCD,V3);
+		V1 = Simd128<uint32_t>::load(ABCD);
+		V2 = Simd128<uint32_t>::load(EFGH);
+		W  = Simd128<uint32_t>::load(alpha);
+		Wp = Simd128<uint32_t>::load(alphap);
+		// V3 = V1 + V2 mod 2P
+		V3 = add_mod<Simd128<uint32_t> >(V1,V2,P2);
+		Simd128<uint32_t>::store(ABCD,V3);
 		// V4 = (V1+(2P-V2))alpha mod 2P
-		VEC128_SUB_32(T,V2,P2);
-		VEC128_SUB_32(V4,V1,T);
-		VEC128_MUL_MOD(T,V4,W,P,Wp,V1,V2,V3);// V3 is the result
-		VEC128_STORE(EFGH,T);
+		T = Simd128<uint32_t>::sub(V2,P2);
+		V4 = Simd128<uint32_t>::sub(V1,T);
+		T = mul_mod<Simd128<uint32_t> >(V4,W,P,Wp);// T is the result
+		Simd128<uint32_t>::store(EFGH,T);
 	}
 
-
+/*
 	template <class Field>
 	inline void FFT_transform<Field>::Butterfly_DIF_mod2p_4x1_SSE_laststep(uint32_t* ABCD, uint32_t* EFGH, const _vect128_t& P2) {
 		_vect128_t V1,V2,V3,V4,V5;
 		// V1=[A B C D], V2=[E F G H]
-		VEC128_LOAD(V1,ABCD);
-		VEC128_LOAD(V2,EFGH);
+		V1 = Simd128<uint32_t>::load(ABCD);
+		V2 = Simd128<uint32_t>::load(EFGH);
 		// V3 = [A C B D], V4 = [E G F H]
-		VEC128_SHUFFLE_32(V3,V1,0xD8);
-		VEC128_SHUFFLE_32(V4,V2,0xD8);
+		V3 = Simd128<uint32_t>::shuffle<0xD8>(V1);
+		V4 = Simd128<uint32_t>::shuffle<0xD8>(V2);
 		// V1 = [A E C G], V2 = [B F D H]
-		VEC128_UNPACK_LO_32(V1,V3,V4);
-		VEC128_UNPACK_HI_32(V2,V3,V4);
+		V1 = Simd128<uint32_t>::unpacklo(V3,V4);
+		V2 = Simd128<uint32_t>::unpackhi(V3,V4);
 		// V3 = V1 + V2 mod 2P
-		VEC128_ADD_MOD(V3,V1,V2,P2,V5);
+		V3 = add_mod<Simd128<uint32_t> >(V1,V2,P2);
 		// V4 = V1 + (2P - V2) mod 2P
-		VEC128_SUB_32(V5,V2,P2);
-		VEC128_SUB_32(V2,V1,V5);
-		VEC128_MOD_P(V4,V2,P2,V5);
+		V5 = Simd128<uint32_t>::sub(V2,P2);
+		V2 = Simd128<uint32_t>::sub(V1,V5);
+		V4 = reduce<Simd128<uint32_t> >(V2, P2);
 		// V1 = [A C E G], V2 = [B D F H]
-		VEC128_SHUFFLE_32(V1,V3,0xD8);
-		VEC128_SHUFFLE_32(V2,V4,0xD8);
+		V1 = Simd128<uint32_t>::shuffle<0xD8>(V3);
+		V2 = Simd128<uint32_t>::shuffle<0xD8>(V4);
 		// V3 = [A B C D], V4 = [E F G H]
-		VEC128_UNPACK_LO_32(V3,V1,V2);
-		VEC128_UNPACK_HI_32(V4,V1,V2);
+		V3 = Simd128<uint32_t>::unpacklo(V1,V2);
+		V4 = Simd128<uint32_t>::unpackhi(V1,V2);
 		// Store
-		VEC128_STORE(ABCD,V3);
-		VEC128_STORE(EFGH,V4);
+		Simd128<uint32_t>::store(ABCD,V3);
+		Simd128<uint32_t>::store(EFGH,V4);
 	}
+*/
 
 	template <class Field>
 	inline void FFT_transform<Field>::Butterfly_DIF_mod2p_4x2_SSE(uint32_t* ABCD, uint32_t* EFGH, uint32_t* IJKL, uint32_t* MNOP,
-								      const uint32_t* alpha,  const uint32_t*beta ,  const uint32_t* gamma,
-								      const uint32_t* alphap, const uint32_t*betap , const uint32_t* gammap,
-								      const _vect128_t& P, const _vect128_t& P2) {
+																  const uint32_t* alpha,  const uint32_t*beta ,  const uint32_t* gamma,
+																  const uint32_t* alphap, const uint32_t*betap , const uint32_t* gammap,
+																  const _vect128_t& P, const _vect128_t& P2) {
 		_vect128_t V1,V2,V3,V4,W,Wp,T1,T2,T3,T4,T5,T6,T7,T8;
 
 		// V1=[A B C D], V2=[E F G H], V3=[I J K L], V4=[M N O P]
-		VEC128_LOAD(V1,ABCD);
-		VEC128_LOAD(V2,IJKL);
-		VEC128_LOAD(W ,alpha);
-		VEC128_LOAD(Wp,alphap);
+		V1 = Simd128<uint32_t>::load(ABCD);
+		V2 = Simd128<uint32_t>::load(IJKL);
+		W  = Simd128<uint32_t>::load(alpha);
+		Wp = Simd128<uint32_t>::load(alphap);
 		/**************/
 		// T1 = V1 + V2 mod 2P
-		VEC128_ADD_MOD(T1,V1,V2,P2,T8);
+		T1 = add_mod<Simd128<uint32_t> >(V1,V2,P2);
 		// T2 = (V1+(2P-V2))alpha mod 2P
-		VEC128_SUB_32(T7,V2,P2);
-		VEC128_SUB_32(T6,V1,T7);
-		VEC128_MUL_MOD(T2,T6,W,P,Wp,T3,T4,T5);
+		T7 = Simd128<uint32_t>::sub(V2,P2);
+		T6 = Simd128<uint32_t>::sub(V1,T7);
+		T2 = mul_mod<Simd128<uint32_t> >(T6,W,P,Wp);
 		/**************/
-		VEC128_LOAD(V3,EFGH);
-		VEC128_LOAD(V4,MNOP);
-		VEC128_LOAD(W ,beta);
-		VEC128_LOAD(Wp,betap);
+		V3 = Simd128<uint32_t>::load(EFGH);
+		V4 = Simd128<uint32_t>::load(MNOP);
+		W  = Simd128<uint32_t>::load(beta);
+		Wp = Simd128<uint32_t>::load(betap);
 		/**************/
 		// T3 = V3 + V4 mod 2P
-		VEC128_ADD_MOD(T3,V3,V4,P2,T8);
+		T3 = add_mod<Simd128<uint32_t> >(V3,V4,P2);
 		// T4 = (V3+(2P-V4))beta mod 2P
-		VEC128_SUB_32(T7,V4,P2);
-		VEC128_SUB_32(T6,V3,T7);
-		VEC128_MUL_MOD(T4,T6,W,P,Wp,V1,V2,T8);// T1 is the result
+		T7 = Simd128<uint32_t>::sub(V4,P2);
+		T6 = Simd128<uint32_t>::sub(V3,T7);
+		T4 = mul_mod<Simd128<uint32_t> >(T6,W,P,Wp);// T4 is the result
 		/**************/
-		VEC128_LOAD(W ,gamma);
-		VEC128_LOAD(Wp,gammap);
+		W  = Simd128<uint32_t>::load(gamma);
+		Wp = Simd128<uint32_t>::load(gammap);
 		/**************/
 		// V1 = T1 + T3 mod 2P
-		VEC128_ADD_MOD(V1,T1,T3,P2,T8);
+		V1 = add_mod<Simd128<uint32_t> >(T1,T3,P2);
 		// V3 = (T1+(2P-T3))gamma mod 2P
-		VEC128_SUB_32(T7,T3,P2);
-		VEC128_SUB_32(T6,T1,T7);
-		VEC128_MUL_MOD(V3,T6,W,P,Wp,T3,T5,T8);// T1 is the result
+		T7 = Simd128<uint32_t>::sub(T3,P2);
+		T6 = Simd128<uint32_t>::sub(T1,T7);
+		V3 = mul_mod<Simd128<uint32_t> >(T6,W,P,Wp);// V3 is the result
 		/**************/
 		// V2 = T2 + T4 mod 2P
-		VEC128_ADD_MOD(V2,T2,T4,P2,T8);
+		V2 = add_mod<Simd128<uint32_t> >(T2,T4,P2);
 		// V4 = (T2+(2P-T4))gamma mod 2P
-		VEC128_SUB_32(T7,T4,P2);
-		VEC128_SUB_32(T6,T2,T7);
-		VEC128_MUL_MOD(V4,T6,W,P,Wp,T1,T3,T8);// T1 is the result
+		T7 = Simd128<uint32_t>::sub(T4,P2);
+		T6 = Simd128<uint32_t>::sub(T2,T7);
+		V4 = mul_mod<Simd128<uint32_t> >(T6,W,P,Wp);// V4 is the result
 		/**************/
-		VEC128_STORE(ABCD,V1);
-		VEC128_STORE(EFGH,V3);
-		VEC128_STORE(IJKL,V2);
-		VEC128_STORE(MNOP,V4);
+		Simd128<uint32_t>::store(ABCD,V1);
+		Simd128<uint32_t>::store(EFGH,V3);
+		Simd128<uint32_t>::store(IJKL,V2);
+		Simd128<uint32_t>::store(MNOP,V4);
 	}
 
 
 
 	template <class Field>
 	inline void FFT_transform<Field>::Butterfly_DIF_mod2p_4x2_SSE_last2step(uint32_t* ABCD, uint32_t* EFGH,
-										const _vect128_t& W,
-										const _vect128_t& Wp,
-										const _vect128_t& P, const _vect128_t& P2) {
+																			const _vect128_t& W,
+																			const _vect128_t& Wp,
+																			const _vect128_t& P, const _vect128_t& P2) {
 		_vect128_t V1,V2,V3,V4,V5,V6,V7;
 		// V1=[A B C D], V2=[E F G H]
-		VEC128_LOAD(V1,ABCD);
-		VEC128_LOAD(V2,EFGH);
+		V1 = Simd128<uint32_t>::load(ABCD);
+		V2 = Simd128<uint32_t>::load(EFGH);
 		// V3=[A E B F], V4=[C G D H]
-		VEC128_UNPACK_LO_32(V3,V1,V2);
-		VEC128_UNPACK_HI_32(V4,V1,V2);
+		V3 = Simd128<uint32_t>::unpacklo(V1,V2);
+		V4 = Simd128<uint32_t>::unpackhi(V1,V2);
 		// V1 = V3 + V4 mod 2P
 		// P2 = [2p 2p 2p 2p]
-		VEC128_ADD_MOD(V1,V3,V4,P2,V5);
+		V1 = add_mod<Simd128<uint32_t> >(V3,V4,P2);
 		// V2 = (V3+(2P-V4))alpha mod 2P
-		VEC128_SUB_32(V5,V4,P2);
-		VEC128_SUB_32(V6,V3,V5);
-		VEC128_MOD_P(V2,V6,P2,V2);
+		V5 = Simd128<uint32_t>::sub(V4,P2);
+		V6 = Simd128<uint32_t>::sub(V3,V5);
+		V2 = reduce<Simd128<uint32_t> >(V6, P2);
 		// V4 = [D D H H]
-		VEC128_UNPACK_HI_32(V4,V2,V2);
+		V4 = Simd128<uint32_t>::unpackhi(V2,V2);
 		// V6 = V4 * Wp mod 2^64
 		// Wp = [Wp ? Wp ?]
-		VEC128_MUL_32(V7,V4,Wp);
-		VEC128_MUL_LO_32(V5,V7,P);
+		V7 = Simd128<uint64_t>::mulx(V4,Wp);
+		V5 = Simd128<uint32_t>::mullo(V7,P);
 		// At this point V4= [? Q_D*p ? Q_H*p]
 		// V5 = [D D H H] * [W W W W] mod 2^32
-		VEC128_MUL_LO_32(V6,V4,W);
-		VEC128_SUB_32(V4,V6,V5);
-		VEC128_SHUFFLE_32(V3,V4,0xDD);
+		V6 = Simd128<uint32_t>::mullo(V4,W);
+		V4 = Simd128<uint32_t>::sub(V6,V5);
+		V3 = Simd128<uint32_t>::shuffle<0xDD>(V4);
 		//At this point, V2 = [D*Wmodp H*Wmodp D*Wmodp H*Wmodp]
 		// At this time I have V1=[A E B F], V2=[C G ? ?], V3=[? ? D H]
 		// I need V3 = [A C E G], V4 = [B D F H]
-		VEC128_UNPACK_HI_32(V4,V1,V3);
-		VEC128_UNPACK_LO_32(V3,V1,V2);
+		V4 = Simd128<uint32_t>::unpackhi(V1,V3);
+		V3 = Simd128<uint32_t>::unpacklo(V1,V2);
 		// V1 = V3 + V4 mod 2P
-		VEC128_ADD_MOD(V1,V3,V4,P2,V5);
+		V1 = add_mod<Simd128<uint32_t> >(V3,V4,P2);
 		// V2 = V3 + (2P - V4) mod 2P
-		VEC128_SUB_32(V5,V4,P2);
-		VEC128_SUB_32(V6,V3,V5);
-		VEC128_MOD_P(V2,V6,P2,V2);
+		V5 = Simd128<uint32_t>::sub(V4,P2);
+		V6 = Simd128<uint32_t>::sub(V3,V5);
+		V2 = reduce<Simd128<uint32_t> >(V6, P2);
 		// Result in V1 = [A C E G]  and V2 = [B D F H]
 		// Transform to V3=[A B C D], V4=[E F G H]
-		VEC128_UNPACK_LO_32(V3,V1,V2);
-		VEC128_UNPACK_HI_32(V4,V1,V2);
+		V3 = Simd128<uint32_t>::unpacklo(V1,V2);
+		V4 = Simd128<uint32_t>::unpackhi(V1,V2);
 		// Store
-		VEC128_STORE(ABCD,V3);
-		VEC128_STORE(EFGH,V4);
+		Simd128<uint32_t>::store(ABCD,V3);
+		Simd128<uint32_t>::store(EFGH,V4);
 	}
 
 
@@ -220,81 +251,81 @@ namespace LinBox {
 
 	template <class Field>
 	inline void FFT_transform<Field>::Butterfly_DIT_mod4p_4x1_SSE(uint32_t* ABCD, uint32_t* EFGH,
-								      const uint32_t* alpha,
-								      const uint32_t* alphap,
-								      const _vect128_t& P, const _vect128_t& P2) {
+																  const uint32_t* alpha,
+																  const uint32_t* alphap,
+																  const _vect128_t& P, const _vect128_t& P2) {
 		_vect128_t V1,V2,V3,V4,W,Wp,T1,T2;
 		// V1=[A B C D], V2=[E F G H]
-		VEC128_LOAD(V1,ABCD);
-		VEC128_LOAD(V2,EFGH);
-		VEC128_LOAD(W ,alpha);
-		VEC128_LOAD(Wp,alphap);
+		V1 = Simd128<uint32_t>::load(ABCD);
+		V2 = Simd128<uint32_t>::load(EFGH);
+		W  = Simd128<uint32_t>::load(alpha);
+		Wp = Simd128<uint32_t>::load(alphap);
 		// V3 = V1 mod 2P
-		VEC128_MOD_P (V3,V1,P2,T1);
+		V3 = reduce<Simd128<uint32_t> >(V1, P2);
 		// V4 = V2 * W mod P
-		VEC128_MUL_MOD(V4,V2,W,P,Wp,V1,T1,T2);
+		V4 = mul_mod<Simd128<uint32_t> >(V2,W,P,Wp);
 		// V1 = V3 + V4
-		VEC128_ADD_32(V1,V3,V4);
-		VEC128_STORE(ABCD,V1);
+		V1 = Simd128<uint32_t>::add(V3,V4);
+		Simd128<uint32_t>::store(ABCD,V1);
 		// V2 = V3 - (V4 - 2P)
-		VEC128_SUB_32(T1,V4,P2);
-		VEC128_SUB_32(V2,V3,T1);
-		VEC128_STORE(EFGH,V2);
+		T1 = Simd128<uint32_t>::sub(V4,P2);
+		V2 = Simd128<uint32_t>::sub(V3,T1);
+		Simd128<uint32_t>::store(EFGH,V2);
 	}
 
 	template <class Field>
 	inline void FFT_transform<Field>::Butterfly_DIT_mod4p_4x2_SSE_first2step(uint32_t* ABCD, uint32_t* EFGH,
-										 const _vect128_t& W,
-										 const _vect128_t& Wp,
-										 const _vect128_t& P, const _vect128_t& P2) {
+																			 const _vect128_t& W,
+																			 const _vect128_t& Wp,
+																			 const _vect128_t& P, const _vect128_t& P2) {
 		_vect128_t V1,V2,V3,V4,T1,T2,T3,T4;
 		// V1=[A B C D], V2=[E F G H]
-		VEC128_LOAD(V1,ABCD);
-		VEC128_LOAD(V2,EFGH);
+		V1 = Simd128<uint32_t>::load(ABCD);
+		V2 = Simd128<uint32_t>::load(EFGH);
 		// T1 = [A C B D], T2 = [E G F H]
-		VEC128_SHUFFLE_32(T1,V1,0xD8);
-		VEC128_SHUFFLE_32(T2,V2,0xD8);
+		T1 = Simd128<uint32_t>::shuffle<0xD8>(V1);
+		T2 = Simd128<uint32_t>::shuffle<0xD8>(V2);
 		// V1 = [A E C G], V2 = [B F D H]
-		VEC128_UNPACK_LO_32(V1,T1,T2);
-		VEC128_UNPACK_HI_32(V2,T1,T2);
+		V1 = Simd128<uint32_t>::unpacklo(T1,T2);
+		V2 = Simd128<uint32_t>::unpackhi(T1,T2);
 		// V3 = V1 + V2
 		// Rk: No need for (. mod 2P) since entries are <P
-		VEC128_ADD_32(V3,V1,V2);
+		V3 = Simd128<uint32_t>::add(V1,V2);
 		// V4 = V1 + (P - V2)
 		// Rk: No need for (. mod 2P) since entries are <P
-		VEC128_SUB_32(T1,V2,P);
-		VEC128_SUB_32(V4,V1,T1);
+		T1 = Simd128<uint32_t>::sub(V2,P);
+		V4 = Simd128<uint32_t>::sub(V1,T1);
 		// T1 = [D D H H]
-		VEC128_UNPACK_HI_32(T1,V4,V4);
+		T1 = Simd128<uint32_t>::unpackhi(V4,V4);
 		// T2 = T1 * Wp mod 2^64
 		// Wp = [Wp ? Wp ?]
-		VEC128_MUL_32(T2,T1,Wp);
-		VEC128_MUL_LO_32(T3,T2,P);
+		T2 = Simd128<uint64_t>::mulx(T1,Wp);
+		T3 = Simd128<uint32_t>::mullo(T2,P);
 		// At this point T3= [? Q_D*p ? Q_H*p]
 		// T4 = [D D H H] * [W W W W] mod 2^32
-		VEC128_MUL_LO_32(T4,T1,W);
-		VEC128_SUB_32(T1,T4,T3);
-		VEC128_SHUFFLE_32(T2,T1,0XDD);
+		T4 = Simd128<uint32_t>::mullo(T1,W);
+		T1 = Simd128<uint32_t>::sub(T4,T3);
+		T2 = Simd128<uint32_t>::shuffle<0xDD>(T1);
 		//At this point, T2 = [D*Wmodp H*Wmodp D*Wmodp H*Wmodp]
 		// At this time I have V3=[A E C G], V4=[B F ? ?], T2=[? ? D H]
 		// I need V1 = [A B E F], V2 = [C D G H]
-		VEC128_UNPACK_LO_32(V1,V3,V4);
-		VEC128_UNPACK_HI_32(V2,V3,T2);
+		V1 = Simd128<uint32_t>::unpacklo(V3,V4);
+		V2 = Simd128<uint32_t>::unpackhi(V3,T2);
 		// T1 = V1 + V2
-		VEC128_ADD_32(T1,V1,V2);
+		T1 = Simd128<uint32_t>::add(V1,V2);
 		// T2 = V1 - (V2 - 2P)
-		VEC128_SUB_32(T3,V2,P2);
-		VEC128_SUB_32(T2,V1,T3);
+		T3 = Simd128<uint32_t>::sub(V2,P2);
+		T2 = Simd128<uint32_t>::sub(V1,T3);
 		// Result in T1 = [A B E F]  and T2 = [C D G H]
 		// Transform to V1=[A C B D], V2=[E G F H]
-		VEC128_UNPACK_LO_32(V1,T1,T2);
-		VEC128_UNPACK_HI_32(V2,T1,T2);
+		V1 = Simd128<uint32_t>::unpacklo(T1,T2);
+		V2 = Simd128<uint32_t>::unpackhi(T1,T2);
 		// Then T1=[A B C D], T2=[E F G H]
-		VEC128_SHUFFLE_32(T1,V1,0xD8);
-		VEC128_SHUFFLE_32(T2,V2,0xD8);
+		T1 = Simd128<uint32_t>::shuffle<0xD8>(V1);
+		T2 = Simd128<uint32_t>::shuffle<0xD8>(V2);
 		// Store
-		VEC128_STORE(ABCD,T1);
-		VEC128_STORE(EFGH,T2);
+		Simd128<uint32_t>::store(ABCD,T1);
+		Simd128<uint32_t>::store(EFGH,T2);
 	}
 
 	/*-----------------------------------*/
@@ -304,51 +335,51 @@ namespace LinBox {
 	template <class Field>
 	void FFT_transform<Field>::FFT_DIF_Harvey_mod2p_iterative4x1_SSE (uint32_t *fft) {
 		_vect128_t P,P2;
-		P  = _mm_set1_epi32(_pl);
-		P2 = _mm_set1_epi32(_dpl);
+		P  = Simd128<uint32_t>::set1(_pl);
+		P2 = Simd128<uint32_t>::set1(_dpl);
 		uint32_t * tab_w = &pow_w [0];
 		uint32_t * tab_wp= &pow_wp[0];
 		size_t w, f;
 		for (w = n >> 1, f = 1; w >= 4; tab_w+=w, tab_wp+=w, w >>= 1, f <<= 1){
-			// w : witdh of butterflies
-			// f : # families of butterflies
-			for (size_t i = 0; i < f; i++)
-				for (size_t j = 0; j < w; j+=4)
+				// w : width of butterflies
+				// f : # families of butterflies
+				for (size_t i = 0; i < f; i++)
+					for (size_t j = 0; j < w; j+=4)
 
 #define A0 &fft[0] +  (i << 1)   *w+ j
 #define A4 &fft[0] + ((i << 1)+1)*w+ j
-					Butterfly_DIF_mod2p_4x1_SSE(A0,A4, tab_w+j,tab_wp+j,P,P2);
+						Butterfly_DIF_mod2p_4x1_SSE(A0,A4, tab_w+j,tab_wp+j,P,P2);
 #undef A0
 #undef A4
-			//std::cout<<fft<<std::endl;
-		}
+				//std::cout<<fft<<std::endl;
+			}
 		// Last two steps
 		if (n >= 8) {
-			_vect128_t W,Wp;
-			W = _mm_set1_epi32 ((int)tab_w [1]);
-			Wp= _mm_set1_epi32 ((int)tab_wp[1]);
+				_vect128_t W,Wp;
+				W = Simd128<uint32_t>::set1 ((int)tab_w [1]);
+				Wp= Simd128<uint32_t>::set1 ((int)tab_wp[1]);
 
-			for (size_t i = 0; i < f; i+=2)
+				for (size_t i = 0; i < f; i+=2)
 #define A0 &fft[0] +  (i << 2)
 #define A4 &fft[0] + ((i << 2)+4)
-				Butterfly_DIF_mod2p_4x2_SSE_last2step(A0,A4,W,Wp,P,P2);
-			//std::cout<<fft<<std::endl;
+					Butterfly_DIF_mod2p_4x2_SSE_last2step(A0,A4,W,Wp,P,P2);
+				//std::cout<<fft<<std::endl;
 #undef A0
 #undef A4
-		} else {
-			for (; w >= 1; tab_w+=w, tab_wp+=w, w >>= 1, f <<= 1)
-				for (size_t i = 0; i < f; i++)
-					for (size_t j = 0; j < w; j++)
-						Butterfly_DIF_mod2p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
-		}
+			} else {
+				for (; w >= 1; tab_w+=w, tab_wp+=w, w >>= 1, f <<= 1)
+					for (size_t i = 0; i < f; i++)
+						for (size_t j = 0; j < w; j++)
+							Butterfly_DIF_mod2p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
+			}
 	}
 
 	template <class Field>
 	void FFT_transform<Field>::FFT_DIF_Harvey_mod2p_iterative4x2_SSE (uint32_t *fft) {
 		size_t w, f;
 		_vect128_t P,P2;
-		P  = _mm_set1_epi32(_pl);
-		P2 = _mm_set1_epi32(_dpl);
+		P  = Simd128<uint32_t>::set1(_pl);
+		P2 = Simd128<uint32_t>::set1(_dpl);
 		uint32_t * tab_w =  &pow_w[0];
 		uint32_t * tab_wp= &pow_wp[0];
 		for (w = n >> 1, f = 1; w >= 8; tab_w+=w+(w>>1), tab_wp+=w+(w>>1), w >>= 2, f <<= 2)
@@ -361,87 +392,87 @@ namespace LinBox {
 #define A2 &fft[0] + ((i << 1)+1)*w+ j
 #define A3 &fft[0] + ((i << 1)+1)*w+(j+(w >> 1))
 
-					Butterfly_DIF_mod2p_4x2_SSE(A0, A1, A2, A3,
-								    tab_w +j, tab_w +j+(w >> 1), tab_w +j+w,
-								    tab_wp+j, tab_wp+j+(w >> 1), tab_wp+j+w,
-								    P,P2);
+						Butterfly_DIF_mod2p_4x2_SSE(A0, A1, A2, A3,
+													tab_w +j, tab_w +j+(w >> 1), tab_w +j+w,
+													tab_wp+j, tab_wp+j+(w >> 1), tab_wp+j+w,
+													P,P2);
 #undef A0
 #undef A1
 #undef A2
 #undef A3
-				}
+					}
 
 		// Last two steps
 		if (n >= 8) {
-			if (w == 4) {
-				for (size_t i = 0; i < f; i++)
+				if (w == 4) {
+						for (size_t i = 0; i < f; i++)
 #define A0 &fft[0] +  (i << 1)   *w
 #define A4 &fft[0] + ((i << 1)+1)*w
-					Butterfly_DIF_mod2p_4x1_SSE(A0,A4, tab_w,tab_wp,P,P2);
+							Butterfly_DIF_mod2p_4x1_SSE(A0,A4, tab_w,tab_wp,P,P2);
 #undef A0
 #undef A4
-				tab_w+=w;
-				tab_wp+=w;
-				w >>= 1;
-				f <<= 1;
-			}
+						tab_w+=w;
+						tab_wp+=w;
+						w >>= 1;
+						f <<= 1;
+					}
 
-			_vect128_t W,Wp;
-			W = _mm_set1_epi32 ((int)tab_w [1]);
-			Wp= _mm_set1_epi32 ((int)tab_wp[1]);
+				_vect128_t W,Wp;
+				W = Simd128<uint32_t>::set1 ((int)tab_w [1]);
+				Wp= Simd128<uint32_t>::set1 ((int)tab_wp[1]);
 
-			for (size_t i = 0; i < f; i+=2)
+				for (size_t i = 0; i < f; i+=2)
 #define A0 &fft[0] +  (i << 2)
 #define A4 &fft[0] + ((i << 2)+4)
-				Butterfly_DIF_mod2p_4x2_SSE_last2step(A0,A4,W,Wp,P,P2);
+					Butterfly_DIF_mod2p_4x2_SSE_last2step(A0,A4,W,Wp,P,P2);
 #undef A0
 #undef A4
-		} else {
-			for (; w >= 1; tab_w+=w, tab_wp+=w, w >>= 1, f <<= 1)
-				for (size_t i = 0; i < f; i++)
-					for (size_t j = 0; j < w; j++)
-						Butterfly_DIF_mod2p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
-		}
+			} else {
+				for (; w >= 1; tab_w+=w, tab_wp+=w, w >>= 1, f <<= 1)
+					for (size_t i = 0; i < f; i++)
+						for (size_t j = 0; j < w; j++)
+							Butterfly_DIF_mod2p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
+			}
 	}
 
 	template <class Field>
 	void FFT_transform<Field>::FFT_DIT_Harvey_mod4p_iterative4x1_SSE (uint32_t *fft)
 	{
 		_vect128_t P,P2;
-		VEC128_SET_32(P,_pl);
-		VEC128_SET_32(P2,_dpl);
+		P = Simd128<uint32_t>::set1(_pl);
+		P2 = Simd128<uint32_t>::set1(_dpl);
 		// Last two steps
 		if (n >= 8) {
-			_vect128_t W,Wp;
-			W = _mm_set1_epi32 ((int)pow_w [n-3]);
-			Wp= _mm_set1_epi32 ((int)pow_wp[n-3]);
-
-			for (size_t i = 0; i < n; i+=8)
-				Butterfly_DIT_mod4p_4x2_SSE_first2step(&fft[i],&fft[i+4],W,Wp,P,P2);
-
-			uint32_t * tab_w = &pow_w [n-8];
-			uint32_t * tab_wp= &pow_wp[n-8];
-			for (size_t w = 4, f = n >> 3; f >= 1; w <<= 1, f >>= 1, tab_w-=w, tab_wp-=w){
-				// w : witdh of butterflies
-				// f : # families of butterflies
-				for (size_t i = 0; i < f; i++)
-					for (size_t j = 0; j < w; j+=4)
+				_vect128_t W,Wp;
+				W = Simd128<uint32_t>::set1 ((int)pow_w [n-3]);
+				Wp= Simd128<uint32_t>::set1 ((int)pow_wp[n-3]);
+
+				for (size_t i = 0; i < n; i+=8)
+					Butterfly_DIT_mod4p_4x2_SSE_first2step(&fft[i],&fft[i+4],W,Wp,P,P2);
+
+				uint32_t * tab_w = &pow_w [n-8];
+				uint32_t * tab_wp= &pow_wp[n-8];
+				for (size_t w = 4, f = n >> 3; f >= 1; w <<= 1, f >>= 1, tab_w-=w, tab_wp-=w){
+						// w : width of butterflies
+						// f : # families of butterflies
+						for (size_t i = 0; i < f; i++)
+							for (size_t j = 0; j < w; j+=4)
 #define A0 &fft[0] +  (i << 1)   *w+ j
 #define A4 &fft[0] + ((i << 1)+1)*w+ j
-						Butterfly_DIT_mod4p_4x1_SSE(A0,A4, tab_w+j,tab_wp+j,P,P2);
+								Butterfly_DIT_mod4p_4x1_SSE(A0,A4, tab_w+j,tab_wp+j,P,P2);
 
 #undef A0
 #undef A4
 
+					}
+			} else {
+				uint32_t * tab_w = &pow_w [n-2];
+				uint32_t * tab_wp= &pow_wp[n-2];
+				for (size_t w = 1, f = n >> 1; f >= 1; w <<= 1, f >>= 1, tab_w-=w, tab_wp-=w)
+					for (size_t i = 0; i < f; i++)
+						for (size_t j = 0; j < w; j++)
+							Butterfly_DIT_mod4p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
 			}
-		} else {
-			uint32_t * tab_w = &pow_w [n-2];
-			uint32_t * tab_wp= &pow_wp[n-2];
-			for (size_t w = 1, f = n >> 1; f >= 1; w <<= 1, f >>= 1, tab_w-=w, tab_wp-=w)
-				for (size_t i = 0; i < f; i++)
-					for (size_t j = 0; j < w; j++)
-						Butterfly_DIT_mod4p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
-		}
 	}
 
 
@@ -452,14 +483,14 @@ namespace LinBox {
 	 ******************************************************************************************************************/
 
 
-#ifdef __LINBOX_HAVE_AVX2
+#ifdef __LINBOX_HAVE_AVX_INSTRUCTIONS2
 
 	template <class Field>
 	inline void FFT_transform<Field>::reduce256_modp(uint32_t* ABCD, const _vect256_t& P) {
-		_vect256_t V1,T;
-		VEC256_LOADU(V1,ABCD);
-		VEC256_MOD_P(V1,V1,P,T);
-		VEC256_STOREU(ABCD,V1);
+		_vect256_t V1;
+		V1 = Simd256<uint32_t>::loadu(ABCD);
+		V1 = reduce<Simd256<uint32_t> >(V1, P);
+		Simd256<uint32_t>::storeu(ABCD,V1);
 	}
 
 
@@ -469,111 +500,113 @@ namespace LinBox {
 
 	template <class Field>
 	inline void FFT_transform<Field>::Butterfly_DIF_mod2p_8x1_AVX(uint32_t* ABCDEFGH, uint32_t* IJKLMNOP,
-								      const uint32_t* alpha,
-								      const uint32_t* alphap,
-								      const _vect256_t& P, const _vect256_t& P2) {
+																  const uint32_t* alpha,
+																  const uint32_t* alphap,
+																  const _vect256_t& P, const _vect256_t& P2) {
 		_vect256_t V1,V2,V3,V4,W,Wp,T;
 		// V1=[A B C D E F G H], V2=[I J K L M N O P]
-		VEC256_LOADU(V1,ABCDEFGH);
-		VEC256_LOADU(V2,IJKLMNOP);
-		VEC256_LOADU(W ,alpha);
-		VEC256_LOADU(Wp,alphap);
+		V1 = Simd256<uint32_t>::loadu(ABCDEFGH);
+		V2 = Simd256<uint32_t>::loadu(IJKLMNOP);
+		W  = Simd256<uint32_t>::loadu(alpha);
+		Wp = Simd256<uint32_t>::loadu(alphap);
 
 		// V3 = V1 + V2 mod
-		VEC256_ADD_MOD(V3,V1,V2,P2,T);
-		VEC256_STOREU(ABCDEFGH,V3);
+
+		V3 = add_mod<Simd256<uint32_t> >(V1,V2,P2);
+
+		Simd256<uint32_t>::storeu(ABCDEFGH,V3);
 
 		// V4 = (V1+(2P-V2))alpha mod 2P
-		VEC256_SUB_32(T,V2,P2);
-		VEC256_SUB_32(V4,V1,T);
-		VEC256_MUL_MOD(T,V4,W,P,Wp,V1,V2,V3);// V3 is the result
-		VEC256_STOREU(IJKLMNOP,T);
+		T = Simd256<uint32_t>::sub(V2,P2);
+		V4 = Simd256<uint32_t>::sub(V1,T);
+		T = mul_mod<Simd256<uint32_t> >(V4,W,P,Wp);// T is the result
+		Simd256<uint32_t>::storeu(IJKLMNOP,T);
 	}
 
 
 	template <class Field>
 	inline void FFT_transform<Field>::Butterfly_DIF_mod2p_8x3_AVX_last3step(uint32_t* ABCDEFGH, uint32_t* IJKLMNOP,
-										const _vect256_t& alpha,const _vect256_t& alphap,
-										const _vect256_t& beta ,const _vect256_t& betap,
-										const _vect256_t& P    ,const _vect256_t& P2) {
+																			const _vect256_t& alpha,const _vect256_t& alphap,
+																			const _vect256_t& beta ,const _vect256_t& betap,
+																			const _vect256_t& P    ,const _vect256_t& P2) {
 		_vect256_t V1,V2,V3,V4,V5,V6,V7,Q;
 
 		// V1=[A B C D E F G H], V2=[I J K L M N O P]
-		VEC256_LOADU(V1,ABCDEFGH);
-		VEC256_LOADU(V2,IJKLMNOP);
+		V1 = Simd256<uint32_t>::loadu(ABCDEFGH);
+		V2 = Simd256<uint32_t>::loadu(IJKLMNOP);
 
 		/* 1st step */
 		// V3=[A B C D I J K L] V4=[E F G H M N O P]
-		VEC256_UNPACK_LO_128(V3,V1,V2);
-		VEC256_UNPACK_HI_128(V4,V1,V2);
+		V3 = Simd256<uint64_t>::unpacklo128(V1,V2);
+		V4 = Simd256<uint64_t>::unpackhi128(V1,V2);
 
 		// V1 = V3 + V4 mod 2P
 		// P2 = [2p 2p 2p 2p]
-		VEC256_ADD_MOD(V1,V3,V4,P2,V5);
+		V1 = add_mod<Simd256<uint32_t> >(V3,V4,P2);
 
 		// V2 = (V3+(2P-V4))alpha mod 2P
-		VEC256_SUB_32(V5,V4,P2);
-		VEC256_SUB_32(V6,V3,V5);
-		VEC256_MOD_P(V7,V6,P2,V2);
-		VEC256_MUL_MOD(V2,V7,alpha,P,alphap,V3,V4,V5);
+		V5 = Simd256<uint32_t>::sub(V4,P2);
+		V6 = Simd256<uint32_t>::sub(V3,V5);
+		V7 = reduce<Simd256<uint32_t> >(V6, P2);
+		V2 = mul_mod<Simd256<uint32_t> >(V7,alpha,P,alphap);
 
 		/* 2nd step */
 
 		// V3=[A E B F I M J N] V4=[C G D H K O L P]
-		VEC256_UNPACK_LO_32(V3,V1,V2);
-		VEC256_UNPACK_HI_32(V4,V1,V2);
+		V3 = Simd256<uint32_t>::unpacklo_twice(V1,V2);
+		V4 = Simd256<uint32_t>::unpackhi_twice(V1,V2);
 
 		// V1 = V3 + V4 mod 2P
 		// P2 = [2p 2p 2p 2p]
-		VEC256_ADD_MOD(V1,V3,V4,P2,V5);
+		V1 = add_mod<Simd256<uint32_t> >(V3,V4,P2);
 
 		// V2 = (V3+(2P-V4))alpha mod 2P
 		// V7 =  (V3+(2P-V4)) mod 2P
-		VEC256_SUB_32(V5,V4,P2);
-		VEC256_SUB_32(V6,V3,V5);
-		VEC256_MOD_P(V7,V6,P2,V2);
+		V5 = Simd256<uint32_t>::sub(V4,P2);
+		V6 = Simd256<uint32_t>::sub(V3,V5);
+		V7 = reduce<Simd256<uint32_t> >(V6, P2);
 
 		// V4 = [D D H H L L P P ]
-		VEC256_UNPACK_HI_32(V4,V7,V7);
+		V4 = Simd256<uint32_t>::unpackhi_twice(V7,V7);
 
 		// Q = V4 * beta mod 2^64 = [* Qd * Qh * Ql * Qp]
 		// with betap= [ betap * betap * betap * betap *]
-		VEC256_MUL_32(Q,V4,betap);
+		Q = Simd256<uint64_t>::mulx(V4,betap);
 		// V5 = [* Qd.P * Qh.P * Ql.P * Qp.P]
-		VEC256_MUL_LO_32(V5,Q,P);
+		V5 = Simd256<uint32_t>::mullo(Q,P);
 		// V6 = V4 * beta mod 2^32
-		VEC256_MUL_LO_32(V6,V4,beta);
+		V6 = Simd256<uint32_t>::mullo(V4,beta);
 		// V3 = V6 - V5 = [* (D.beta mod p) * (H.beta mod p) * (L.beta mod p) * (P.beta mod p)]
-		VEC256_SUB_32(V3,V6,V5);
+		V3 = Simd256<uint32_t>::sub(V6,V5);
 		// V2=[* * D H * * L P]
-		VEC256_SHUFFLE_32(V2,V3,0xDD);
+		V2 = Simd256<uint32_t>::shuffle_twice<0xDD>(V3);
 
 		/* 3nd step */
 		// At this time I have V1=[A B E F I J M N], V7=[C G * * K O * *], V2=[* * D H * * L P]
 		// I need V3 = [A C E G I K M O], V4=[B D F H J L N P]
-		VEC256_UNPACK_LO_32(V3,V1,V7);
-		VEC256_UNPACK_HI_32(V4,V1,V2);
+		V3 = Simd256<uint32_t>::unpacklo_twice(V1,V7);
+		V4 = Simd256<uint32_t>::unpackhi_twice(V1,V2);
 
 		// V1 = V3 + V4 mod 2P
-		VEC256_ADD_MOD(V1,V3,V4,P2,V5);
+		V1 = add_mod<Simd256<uint32_t> >(V3,V4,P2);
 
 		// V2 = V3 + (2P - V4) mod 2P
-		VEC256_SUB_32(V5,V4,P2);
-		VEC256_SUB_32(V6,V3,V5);
-		VEC256_MOD_P(V2,V6,P2,V2);
+		V5 = Simd256<uint32_t>::sub(V4,P2);
+		V6 = Simd256<uint32_t>::sub(V3,V5);
+		V2 = reduce<Simd256<uint32_t> >(V6, P2);
 
 		// Result in    V1=[A C E G I K M O] V2=[B D F H J L N P]
 		// Transform to V3=[A B C D I J K L],V4=[E F G H M N O P]
-		VEC256_UNPACK_LO_32(V3,V1,V2);
-		VEC256_UNPACK_HI_32(V4,V1,V2);
+		V3 = Simd256<uint32_t>::unpacklo_twice(V1,V2);
+		V4 = Simd256<uint32_t>::unpackhi_twice(V1,V2);
 
 		// Transform to V1=[A B C D E F G H], V2=[I J K L M N O P]
-		VEC256_UNPACK_LO_128(V1,V3,V4);
-		VEC256_UNPACK_HI_128(V2,V3,V4);
+		V1 = Simd256<uint64_t>::unpacklo128(V3,V4);
+		V2 = Simd256<uint64_t>::unpackhi128(V3,V4);
 
 		// Store
-		VEC256_STOREU(ABCDEFGH,V1);
-		VEC256_STOREU(IJKLMNOP,V2);
+		Simd256<uint32_t>::storeu(ABCDEFGH,V1);
+		Simd256<uint32_t>::storeu(IJKLMNOP,V2);
 	}
 
 
@@ -581,55 +614,56 @@ namespace LinBox {
 	template <class Field>
 	void FFT_transform<Field>::FFT_DIF_Harvey_mod2p_iterative8x1_AVX (uint32_t *fft) {
 		_vect256_t P,P2;
-		VEC256_SET_32(P,_pl);
-		VEC256_SET_32(P2,_dpl);
+		P = Simd256<uint32_t>::set1(_pl);
+		P2 = Simd256<uint32_t>::set1(_dpl);
 
 		uint32_t * tab_w = &pow_w [0];
 		uint32_t * tab_wp= &pow_wp[0];
 		size_t w, f;
 		for (w = n >> 1, f = 1; w >= 8; tab_w+=w, tab_wp+=w, w >>= 1, f <<= 1){
-			// w : witdh of butterflies
-			// f : # families of butterflies
-			for (size_t i = 0; i < f; i++)
-				for (size_t j = 0; j < w; j+=8)
+				// w : width of butterflies
+				// f : # families of butterflies
+				for (size_t i = 0; i < f; i++)
+					for (size_t j = 0; j < w; j+=8)
 #define A0 &fft[0] +  (i << 1)   *w+ j
 #define A4 &fft[0] + ((i << 1)+1)*w+ j
-					Butterfly_DIF_mod2p_8x1_AVX(A0,A4, tab_w+j,tab_wp+j,P,P2);
+						Butterfly_DIF_mod2p_8x1_AVX(A0,A4, tab_w+j,tab_wp+j,P,P2);
 
 #undef A0
 #undef A4
-			//std::cout<<fft<<std::endl;
-		}
+				//std::cout<<fft<<std::endl;
+			}
 		// Last three steps
 		if (n >= 16) {
-			_vect256_t alpha,alphap,beta,betap;
-			uint32_t tmp[8];
-			tmp[0]=tmp[4]=tab_w[0];
-			tmp[1]=tmp[5]=tab_w[1];
-			tmp[2]=tmp[6]=tab_w[2];
-			tmp[3]=tmp[7]=tab_w[3];
-			VEC256_LOADU(alpha,tmp);
-			tmp[0]=tmp[4]=tab_wp[0];
-			tmp[1]=tmp[5]=tab_wp[1];
-			tmp[2]=tmp[6]=tab_wp[2];
-			tmp[3]=tmp[7]=tab_wp[3];
-			VEC256_LOADU(alphap,tmp);
-			VEC256_SET_32(beta,tab_w [5]);
-			VEC256_SET_32(betap,tab_wp [5]);
-
-			for (size_t i = 0; i < f; i+=2)
+				_vect256_t alpha,alphap,beta,betap;
+				uint32_t tmp[8];
+				tmp[0]=tmp[4]=tab_w[0];
+				tmp[1]=tmp[5]=tab_w[1];
+				tmp[2]=tmp[6]=tab_w[2];
+				tmp[3]=tmp[7]=tab_w[3];
+				alpha = Simd256<uint32_t>::loadu(tmp);
+				tmp[0]=tmp[4]=tab_wp[0];
+				tmp[1]=tmp[5]=tab_wp[1];
+				tmp[2]=tmp[6]=tab_wp[2];
+				tmp[3]=tmp[7]=tab_wp[3];
+				alphap = Simd256<uint32_t>::loadu(tmp);
+				beta = Simd256<uint32_t>::set1(tab_w [5]);
+				betap = Simd256<uint32_t>::set1(tab_wp [5]);
+
+				for (size_t i = 0; i < f; i+=2)
 #define A0 &fft[0] + (i << 3)
 #define A4 &fft[0] + (i << 3)+8
-				Butterfly_DIF_mod2p_8x3_AVX_last3step(A0,A4,alpha,alphap,beta,betap,P,P2);
+					Butterfly_DIF_mod2p_8x3_AVX_last3step(A0,A4,alpha,alphap,beta,betap,P,P2);
 #undef A0
 #undef A4
-			//std::cout<<fft<<std::endl;
-		} else {
-			for (; w >= 1; tab_w+=w, tab_wp+=w, w >>= 1, f <<= 1)
-				for (size_t i = 0; i < f; i++)
-					for (size_t j = 0; j < w; j++)
-						Butterfly_DIF_mod2p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
-		}
+				//std::cout<<fft<<std::endl;
+			} else {
+				for (; w >= 1; tab_w+=w, tab_wp+=w, w >>= 1, f <<= 1)
+					for (size_t i = 0; i < f; i++)
+						for (size_t j = 0; j < w; j++)
+							Butterfly_DIF_mod2p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
+			}
+		
 	}
 
 
@@ -639,123 +673,123 @@ namespace LinBox {
 
 	template <class Field>
 	inline void FFT_transform<Field>::Butterfly_DIT_mod4p_8x1_AVX(uint32_t* ABCDEFGH, uint32_t* IJKLMNOP,
-								      const uint32_t* alpha,
-								      const uint32_t* alphap,
-								      const _vect256_t& P, const _vect256_t& P2) {
-		_vect256_t V1,V2,V3,V4,W,Wp,T1,T2;
+																  const uint32_t* alpha,
+																  const uint32_t* alphap,
+																  const _vect256_t& P, const _vect256_t& P2) {
+		_vect256_t V1,V2,V3,V4,W,Wp,T1;
 		// V1=[A B C D E F G H], V2=[I J K L M N O P]
-		VEC256_LOADU(V1,ABCDEFGH);
-		VEC256_LOADU(V2,IJKLMNOP);
-		VEC256_LOADU(W ,alpha);
-		VEC256_LOADU(Wp,alphap);
+		V1 = Simd256<uint32_t>::loadu(ABCDEFGH);
+		V2 = Simd256<uint32_t>::loadu(IJKLMNOP);
+		W  = Simd256<uint32_t>::loadu(alpha);
+		Wp = Simd256<uint32_t>::loadu(alphap);
 
 		// V3 = V1 mod 2P
-		VEC256_MOD_P (V3,V1,P2,T1);
+		V3 = reduce<Simd256<uint32_t> >(V1, P2);
 
 		// V4 = V2 * W mod P
-		VEC256_MUL_MOD(V4,V2,W,P,Wp,V1,T1,T2);
+		V4 = mul_mod<Simd256<uint32_t> >(V2,W,P,Wp);
 
 		// V1 = V3 + V4
-		VEC256_ADD_32(V1,V3,V4);
-		VEC256_STOREU(ABCDEFGH,V1);
+		V1 = Simd256<uint32_t>::add(V3,V4);
+		Simd256<uint32_t>::storeu(ABCDEFGH,V1);
 
 		// V2 = V3 - (V4 - 2P)
-		VEC256_SUB_32(T1,V4,P2);
-		VEC256_SUB_32(V2,V3,T1);
-		VEC256_STOREU(IJKLMNOP,V2);
+		T1 = Simd256<uint32_t>::sub(V4,P2);
+		V2 = Simd256<uint32_t>::sub(V3,T1);
+		Simd256<uint32_t>::storeu(IJKLMNOP,V2);
 	}
 
 
 	template <class Field>
 	inline void FFT_transform<Field>::Butterfly_DIT_mod4p_8x3_AVX_first3step(uint32_t* ABCDEFGH, uint32_t* IJKLMNOP,
-										 const _vect256_t& alpha,const _vect256_t& alphap,
-										 const _vect256_t& beta ,const _vect256_t& betap,
-										 const _vect256_t& P, const _vect256_t& P2) {
+																			 const _vect256_t& alpha,const _vect256_t& alphap,
+																			 const _vect256_t& beta ,const _vect256_t& betap,
+																			 const _vect256_t& P, const _vect256_t& P2) {
 		_vect256_t V1,V2,V3,V4,V5,V6,V7,Q;
 		// V1=[A B C D E F G H], V2=[I J K L M N O P]
-		VEC256_LOADU(V1,ABCDEFGH);
-		VEC256_LOADU(V2,IJKLMNOP);
+		V1 = Simd256<uint32_t>::loadu(ABCDEFGH);
+		V2 = Simd256<uint32_t>::loadu(IJKLMNOP);
 
 
 		/*********************************************/
 		/* 1st STEP */
 		/*********************************************/
 		// Transform to V3=[A I C K E M G O], V4=[B J D L F N H P]
-		VEC256_UNPACK_LO_32(V6,V1,V2); // V6=[A I B J E M F N]
-		VEC256_UNPACK_HI_32(V7,V1,V2); // V7=[C K D L G O H P]
-		VEC256_UNPACK_LO_64(V3,V6,V7); // V3=[A I C K E M G O]
-		VEC256_UNPACK_HI_64(V4,V6,V7); // V4=[B J D L F N H P]
+		V6 = Simd256<uint32_t>::unpacklo_twice(V1,V2); // V6=[A I B J E M F N]
+		V7 = Simd256<uint32_t>::unpackhi_twice(V1,V2); // V7=[C K D L G O H P]
+		V3 = Simd256<uint64_t>::unpacklo_twice(V6,V7); // V3=[A I C K E M G O]
+		V4 = Simd256<uint64_t>::unpackhi_twice(V6,V7); // V4=[B J D L F N H P]
 
 
 
 
 		// V1 = V3 + V4;       V1 = [A I C K E M G O]
 		// Rk: No need for (. mod 2P) since entries are <P
-		VEC256_ADD_32(V1,V3,V4);
+		V1 = Simd256<uint32_t>::add(V3,V4);
 
 		// V2 = V3 + (P - V4); V2 = [B J D L F N H P]
 		// Rk: No need for (. mod 2P) since entries are <P
-		VEC256_SUB_32(V6,V4,P);
-		VEC256_SUB_32(V2,V3,V6);
+		V6 = Simd256<uint32_t>::sub(V4,P);
+		V2 = Simd256<uint32_t>::sub(V3,V6);
 
 		/*********************************************/
 		/* 2nd STEP */
 		/*********************************************/
 		// V5 = [D D L L H H P P]
-		VEC256_UNPACK_HI_32(V5,V2,V2);
+		V5 = Simd256<uint32_t>::unpackhi_twice(V2,V2);
 		// Q = V5 * alpha mod 2^64 = [* Qd * Qh * Ql * Qp]
 		// with betap= [ alphap * alphap * alphap * alphap *]
-		VEC256_MUL_32(Q,V5,alphap);
+		Q = Simd256<uint64_t>::mulx(V5,alphap);
 		// V6 = [* Qd.P * Qh.P * Ql.P * Qp.P]
-		VEC256_MUL_LO_32(V6,Q,P);
+		V6 = Simd256<uint32_t>::mullo(Q,P);
 		// V7 = V5 * alpha mod 2^32
-		VEC256_MUL_LO_32(V7,V5,alpha);
+		V7 = Simd256<uint32_t>::mullo(V5,alpha);
 		// V3 = V7 - V6 = [* (D.alpha mod p) * (L.alpha mod p) * (H.alpha mod p) * (P.alpha mod p)]
-		VEC256_SUB_32(V3,V7,V6);
+		V3 = Simd256<uint32_t>::sub(V7,V6);
 		// V7=[D L * * H P * *]
-		VEC256_SHUFFLE_32(V7,V3,0xFD);
+		V7 = Simd256<uint32_t>::shuffle_twice<0xFD>(V3);
 		// V6 = [B J D L F N H P]
-		VEC256_UNPACK_LO_64(V6,V2,V7);
+		V6 = Simd256<uint64_t>::unpacklo_twice(V2,V7);
 		// V3= [A B I J E F M N], V4=[C D K L G H O P]
-		VEC256_UNPACK_LO_32(V3,V1,V6);
-		VEC256_UNPACK_HI_32(V4,V1,V6);
+		V3 = Simd256<uint32_t>::unpacklo_twice(V1,V6);
+		V4 = Simd256<uint32_t>::unpackhi_twice(V1,V6);
 
 		// V1 = V3+V4
-		VEC256_ADD_32(V1,V3,V4);
+		V1 = Simd256<uint32_t>::add(V3,V4);
 		// V2 = V3 - (V4 - 2P)
-		VEC256_SUB_32(V7,V4,P2);
-		VEC256_SUB_32(V2,V3,V7);
+		V7 = Simd256<uint32_t>::sub(V4,P2);
+		V2 = Simd256<uint32_t>::sub(V3,V7);
 
 		/*********************************************/
 		/* 3nd STEP */
 		/*********************************************/
 		// V3= [A B C D I J K L] V4= [E F G H M N O P]
-		VEC256_UNPACK_LO_64(V6,V1,V2);
-		VEC256_UNPACK_HI_64(V7,V1,V2);
-		VEC256_UNPACK_LO_128(V3,V6,V7);
-		VEC256_UNPACK_HI_128(V4,V6,V7);
+		V6 = Simd256<uint64_t>::unpacklo_twice(V1,V2);
+		V7 = Simd256<uint64_t>::unpackhi_twice(V1,V2);
+		V3 = Simd256<uint64_t>::unpacklo128(V6,V7);
+		V4 = Simd256<uint64_t>::unpackhi128(V6,V7);
 
 		// V6= V3 mod 2P
-		VEC256_MOD_P(V6,V3,P2,V7);
+		V6 = reduce<Simd256<uint32_t> >(V3, P2);
 
 		// V7= V4.beta mod p
-		VEC256_MUL_MOD(V7,V4,beta,P,betap,V1,V2,V5);
+		V7 = mul_mod<Simd256<uint32_t> >(V4,beta,P,betap);
 
 		// V1 = V6+V7
-		VEC256_ADD_32(V1,V6,V7);
+		V1 = Simd256<uint32_t>::add(V6,V7);
 
 		// V2 = V6 - (V7 - 2P)
-		VEC256_SUB_32(V5,V7,P2);
-		VEC256_SUB_32(V2,V6,V5);
+		V5 = Simd256<uint32_t>::sub(V7,P2);
+		V2 = Simd256<uint32_t>::sub(V6,V5);
 
 		/*********************************************/
 		// V3=[A B C D E F G H] V4=[I J K L M N O P]
-		VEC256_UNPACK_LO_128(V3,V1,V2);
-		VEC256_UNPACK_HI_128(V4,V1,V2);
+		V3 = Simd256<uint64_t>::unpacklo128(V1,V2);
+		V4 = Simd256<uint64_t>::unpackhi128(V1,V2);
 
 		// Store
-		VEC256_STOREU(ABCDEFGH,V3);
-		VEC256_STOREU(IJKLMNOP,V4);
+		Simd256<uint32_t>::storeu(ABCDEFGH,V3);
+		Simd256<uint32_t>::storeu(IJKLMNOP,V4);
 	}
 
 
@@ -763,48 +797,48 @@ namespace LinBox {
 	template <class Field>
 	void FFT_transform<Field>::FFT_DIT_Harvey_mod4p_iterative8x1_AVX (uint32_t *fft) {
 		_vect256_t P,P2;
-		VEC256_SET_32(P,_pl);
-		VEC256_SET_32(P2,_dpl);
+		P = Simd256<uint32_t>::set1(_pl);
+		P2 = Simd256<uint32_t>::set1(_dpl);
 
 		// first three steps
 		if (n >= 16) {
-			_vect256_t alpha,alphap,beta,betap;
-			VEC256_SET_32(alpha,pow_w[n-3]);
-			VEC256_SET_32(alphap,pow_wp[n-3]);
-			uint32_t tmp[8];
-			tmp[0]=tmp[4]=pow_w[n-8];
-			tmp[1]=tmp[5]=pow_w[n-7];
-			tmp[2]=tmp[6]=pow_w[n-6];
-			tmp[3]=tmp[7]=pow_w[n-5];
-			VEC256_LOADU(beta,tmp);
-			tmp[0]=tmp[4]=pow_wp[n-8];
-			tmp[1]=tmp[5]=pow_wp[n-7];
-			tmp[2]=tmp[6]=pow_wp[n-6];
-			tmp[3]=tmp[7]=pow_wp[n-5];
-			VEC256_LOADU(betap,tmp);
-			for (uint64_t i = 0; i < n; i+=16)
-				Butterfly_DIT_mod4p_8x3_AVX_first3step(&fft[i],&fft[i+8],alpha,alphap,beta,betap,P,P2);
-			uint32_t * tab_w = &pow_w [n-16];
-			uint32_t * tab_wp= &pow_wp[n-16];
-			for (size_t w = 8, f = n >> 4; f >= 1; w <<= 1, f >>= 1, tab_w-=w, tab_wp-=w){
-				// w : witdh of butterflies
-				// f : # families of butterflies
-				for (size_t i = 0; i < f; i++)
-					for (size_t j = 0; j < w; j+=8)
+				_vect256_t alpha,alphap,beta,betap;
+				alpha = Simd256<uint32_t>::set1(pow_w[n-3]);
+				alphap = Simd256<uint32_t>::set1(pow_wp[n-3]);
+				uint32_t tmp[8];
+				tmp[0]=tmp[4]=pow_w[n-8];
+				tmp[1]=tmp[5]=pow_w[n-7];
+				tmp[2]=tmp[6]=pow_w[n-6];
+				tmp[3]=tmp[7]=pow_w[n-5];
+				beta = Simd256<uint32_t>::loadu(tmp);
+				tmp[0]=tmp[4]=pow_wp[n-8];
+				tmp[1]=tmp[5]=pow_wp[n-7];
+				tmp[2]=tmp[6]=pow_wp[n-6];
+				tmp[3]=tmp[7]=pow_wp[n-5];
+				betap = Simd256<uint32_t>::loadu(tmp);
+				for (uint64_t i = 0; i < n; i+=16)
+					Butterfly_DIT_mod4p_8x3_AVX_first3step(&fft[i],&fft[i+8],alpha,alphap,beta,betap,P,P2);
+				uint32_t * tab_w = &pow_w [n-16];
+				uint32_t * tab_wp= &pow_wp[n-16];
+				for (size_t w = 8, f = n >> 4; f >= 1; w <<= 1, f >>= 1, tab_w-=w, tab_wp-=w){
+						// w : width of butterflies
+						// f : # families of butterflies
+						for (size_t i = 0; i < f; i++)
+							for (size_t j = 0; j < w; j+=8)
 #define A0 &fft[0] +  (i << 1)   *w+ j
 #define A4 &fft[0] + ((i << 1)+1)*w+ j
-						Butterfly_DIT_mod4p_8x1_AVX(A0,A4, tab_w+j,tab_wp+j,P,P2);
+								Butterfly_DIT_mod4p_8x1_AVX(A0,A4, tab_w+j,tab_wp+j,P,P2);
 #undef A0
 #undef A4
+					}
+			} else {
+				uint32_t * tab_w = &pow_w [n-2];
+				uint32_t * tab_wp= &pow_wp[n-2];
+				for (size_t w = 1, f = n >> 1; f >= 1; w <<= 1, f >>= 1, tab_w-=w, tab_wp-=w)
+					for (size_t i = 0; i < f; i++)
+						for (size_t j = 0; j < w; j++)
+							Butterfly_DIT_mod4p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
 			}
-		} else {
-			uint32_t * tab_w = &pow_w [n-2];
-			uint32_t * tab_wp= &pow_wp[n-2];
-			for (size_t w = 1, f = n >> 1; f >= 1; w <<= 1, f >>= 1, tab_w-=w, tab_wp-=w)
-				for (size_t i = 0; i < f; i++)
-					for (size_t j = 0; j < w; j++)
-						Butterfly_DIT_mod4p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], tab_w[j], tab_wp[j]);
-		}
 	}
 
 
diff --git a/linbox/algorithms/polynomial-matrix/polynomial-fft-transform.h b/linbox/algorithms/polynomial-matrix/polynomial-fft-transform.h
index bd19bff..1c98fdf 100755
--- a/linbox/algorithms/polynomial-matrix/polynomial-fft-transform.h
+++ b/linbox/algorithms/polynomial-matrix/polynomial-fft-transform.h
@@ -1,4 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014  Pascal Giorgi, Romain Lebreton
  *
@@ -30,9 +31,13 @@
 
 #include <iostream>
 #include "linbox/linbox-config.h"
-#include "linbox/algorithms/polynomial-matrix/simd.h"
 #include "linbox/util/debug.h"
 #include "givaro/givinteger.h"
+#include <fflas-ffpack/fflas/fflas_simd.h>
+
+#ifndef ROUND_DOWN
+#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
+#endif
 
 // template<typename T>
 // std::ostream& operator<<(std::ostream& os, const std::vector<T> &x){
@@ -43,6 +48,26 @@
 
 #include "fflas-ffpack/utils/align-allocator.h"
 
+#ifdef __LINBOX_HAVE_SSE4_1_INSTRUCTIONS
+
+//#include "linbox/algorithms/polynomial-matrix/simd.h"
+
+#include "fflas-ffpack/fflas/fflas_simd.h"
+
+#ifdef __LINBOX_HAVE_AVX_INSTRUCTIONS2
+/* 256 bits CODE */
+#define __LINBOX_HAVE_AVX_INSTRUCTIONS2
+
+// define 256 bits simd vector type
+typedef __m256i  _vect256_t;
+
+#endif
+
+// define 128 bits simd vector type
+typedef __m128i  _vect128_t;
+
+#endif
+
 namespace LinBox {
 
 
@@ -79,7 +104,7 @@ namespace LinBox {
 
 		uint64_t find_gen (uint64_t _m, uint64_t _val2p) {
 			// find a primitive 2^k root of unity where
-			// _p - 1 = 2^k * m
+			// _p - 1 = 2^val2p * m
 			srand((unsigned int) time(NULL));
 			uint64_t y,z,j;
 			uint64_t _gen;
@@ -103,22 +128,21 @@ namespace LinBox {
 		}
 
 		FFT_transform (const Field& fld2, size_t ln2, Element w = 0)
-			: fld (&fld2), n ((1U << ln2)), ln (ln2), pow_w(n - 1), pow_wp(n - 1), _data(n) {
+			: fld (&fld2), n ((1UL << ln2)), ln (ln2), pow_w(n - 1), pow_wp(n - 1), _data(n) {
 			_pl = fld->characteristic();
 			_p  = fld->characteristic();
 
 			linbox_check((_pl >> 29) == 0 ); // 8*p < 2^31 for Harvey's butterflies
 			_dpl = (_pl << 1);
 			//_pinv = 1 / (double) _pl;
-							
+			Givaro::Timer chrono;
+			
 			if (w == 0){   // find a pseudo 2^lpts-th primitive root of unity
+
+				chrono.start();
+
 				uint64_t _val2p = 0;
 				uint64_t     _m = _pl;
-				uint64_t  _logp = 0;
-				while (_m) {
-					_m >>= 1;
-					_logp++;
-				}
 				_m = _pl - 1;
 				while ((_m & 1) == 0) {
 					_m >>= 1;
@@ -127,32 +151,89 @@ namespace LinBox {
 				//_I = (1L << (_logp << 1)) / _pl;
 				uint64_t _gen = find_gen (_m, _val2p);
 				_w = Givaro::powmod(_gen, 1<<(_val2p-ln), _pl);
+		
 			}
 			else {
 				_w = (uint32_t)w;
 			}
+			chrono.clear();
+			chrono.start();
 			
 			// compute w^(-1) mod p = w^(2^lpts - 1)
-			_invw = Givaro::powmod(_w, (1<<ln) - 1, _pl);
+			_invw = Givaro::powmod(_w, ((uint64_t)1<<ln) - 1, _pl);
 			
 			size_t pos = 0;
-			uint64_t wi = 1;
-			uint64_t __w = _w;
+			//uint64_t wi = 1;
+			uint32_t wi = 1;
+			uint32_t __w = _w;
+			uint64_t  _logp = Givaro::Integer(_pl).bitsize() -1;
+			uint32_t BAR= (Givaro::Integer(1)<<(32+_logp))/Givaro::Integer(_pl);
+			uint32_t Q;
+			//cout<<"log Bar: "<<Integer(BAR).bitsize()<<endl;
 			if (ln>0){
+#ifdef MYOLD_FFTINIT
 				size_t tpts = 1 << (ln - 1);
 				while (tpts > 0) {
 					for (size_t i = 0; i < tpts; i++, pos++) {
 						pow_w[pos] = wi;
 						pow_wp[pos] = ((uint64_t) pow_w[pos] << 32UL) / _pl;
 						wi= (wi*__w)%_pl;
-						//field().mulin(wi, __w);
 					}
 					wi = 1;
 					__w = (__w * __w) % _pl;
 					//field().mulin(__w, __w);
 					tpts >>= 1;
 				}
-			}
+#else
+//				using simd=Simd<uint32_t>;
+//				using vect_t =typename simd::vect_t;
+				
+				size_t tpts = 1 << (ln - 1);
+				size_t i=0;
+//				for( ;i<std::min(simd::vect_size+1, tpts);i++,pos++){
+				// Precompute pow_wp[1] for faster mult by pow_w[1]
+				for( ;i<std::min((size_t) 2, tpts);i++,pos++){
+					pow_w[pos] = wi;
+					pow_wp[pos] = ((uint64_t) pow_w[pos] << 32UL) / _pl;
+					wi= ((uint64_t)wi*__w)%_pl;
+				}
+				/*
+				vect_t wp_vect, Q_vect,BAR_vect,w_vect,pow_w_vect,pow_wp_vect, pl_vect;
+				BAR_vect= simd::set1(BAR);
+				wp_vect = simd::set1(pow_wp[simd::vect_size]);
+				w_vect  = simd::set1(pow_w[simd::vect_size]);
+				pl_vect = simd::set1(_pl);
+				for (; i < ROUND_DOWN(tpts,simd::vect_size);
+					 i+=simd::vect_size,pos+=simd::vect_size) {
+					pow_w_vect  = simd::loadu((int32_t*)pow_w.data()+pos-simd::vect_size);
+					Q_vect=simd::mulhi(pow_w_vect,wp_vect);
+					pow_w_vect = simd::sub(simd::mullo(pow_w_vect,w_vect),simd::mullo(Q_vect,pl_vect));
+					pow_w_vect=simd::sub(pow_w_vect, simd::vandnot(simd::greater(pow_w_vect,pl_vect),pl_vect));
+					simd::storeu((int32_t*)pow_w.data()+pos,pow_w_vect);
+					pow_wp_vect= simd::mulhi(simd::sll(pow_w_vect,32-_logp),BAR_vect);
+					simd::storeu((int32_t*)pow_wp.data()+pos,pow_wp_vect);
+				}
+				*/
+				// Use pow_wp[1] to speed up multiplication by pow_w[1]
+				for( ;i<tpts;i++,pos++){
+					pow_w[pos] = wi;
+					pow_wp[pos]= (((uint64_t)wi*BAR)>>_logp);
+					Q= ((uint64_t)wi*pow_wp[1])>>32;
+					wi= (uint32_t)(wi*__w - Q*_pl);
+					wi-=(wi>=_pl?_pl:0);
+				}
+				
+				// Other pow_w elements can be read from previously computed pow_w
+				for(size_t k=2;k<=tpts;k<<=1)
+					for(size_t i=0;i<tpts;i+=k,pos++){
+						pow_w[pos]  = pow_w[i];
+						pow_wp[pos] = pow_wp[i];
+					}
+#endif	
+				
+			}	
+			chrono.stop();
+			//cout<<"FFT: table="<<chrono<<endl;
 		}
 
 
@@ -160,13 +241,13 @@ namespace LinBox {
 		Element getInvRoot() const {return _invw;}
 
 		
-		void FFT_DIF_Harvey (uint32_t *fft) {			
-#ifdef __LINBOX_USE_SIMD
-#ifdef __AVX2__
+		void FFT_DIF_Harvey (uint32_t *fft) {
+#ifdef __LINBOX_HAVE_SSE4_1_INSTRUCTIONS
+#ifdef __LINBOX_HAVE_AVX_INSTRUCTIONS2
 			FFT_DIF_Harvey_mod2p_iterative8x1_AVX(fft);
 			if (n>=8){
 				_vect256_t P;
-				VEC256_SET_32(P,_pl);
+				P = Simd256<uint32_t>::set1(_pl);
 				for (uint64_t i = 0; i < n; i += 8)
 					reduce256_modp(fft+i,P);
 				return;
@@ -176,7 +257,7 @@ namespace LinBox {
 #endif
 			if (n >=4) {
 				_vect128_t P;
-				VEC128_SET_32(P,_pl);
+				P = Simd128<uint32_t>::set1(_pl);
 				for (uint64_t i = 0; i < n; i += 4)
 					reduce128_modp(fft+i,P);
 			} else {
@@ -187,20 +268,20 @@ namespace LinBox {
 			// FALLBACK WHEN NO SIMD VERSION
 			FFT_DIF_Harvey_mod2p_iterative2x2(fft);
 			for (uint64_t i = 0; i < n; i++) {
-				if (fft[i] >= (_pl << 1)) fft[i] -= (_pl << 1);
+//				if (fft[i] >= (_pl << 1)) fft[i] -= (_pl << 1);
 				if (fft[i] >= _pl) fft[i] -= _pl;
-			}			
+			}
 #endif 
 		}
 		
 		void FFT_DIT_Harvey (uint32_t *fft) {
-#ifdef __LINBOX_USE_SIMD
-#ifdef __AVX2__
+#ifdef __LINBOX_HAVE_SSE4_1_INSTRUCTIONS
+#ifdef __LINBOX_HAVE_AVX_INSTRUCTIONS2
 			FFT_DIT_Harvey_mod4p_iterative8x1_AVX(fft);
 			if (n>=8){
 				_vect256_t P,P2;
-				VEC256_SET_32(P, _pl);
-				VEC256_SET_32(P2,_dpl);
+				P = Simd256<uint32_t>::set1( _pl);
+				P2 = Simd256<uint32_t>::set1(_dpl);
 				for (uint64_t i = 0; i < n; i += 8){
 					reduce256_modp(&fft[i],P2);
 					reduce256_modp(&fft[i],P);
@@ -212,8 +293,8 @@ namespace LinBox {
 #endif
 			if (n >=4) {
 				_vect128_t P,P2;
-				VEC128_SET_32(P,_pl);
-				VEC128_SET_32(P2,_dpl);
+				P = Simd128<uint32_t>::set1(_pl);
+				P2 = Simd128<uint32_t>::set1(_dpl);
 				for (uint64_t i = 0; i < n; i += 4){
 					reduce128_modp(&fft[i],P2);
 					reduce128_modp(&fft[i],P);
@@ -230,7 +311,7 @@ namespace LinBox {
 			for (uint64_t i = 0; i < n; i++) {
 				if (fft[i] >= (_pl << 1)) fft[i] -= (_pl << 1);
 				if (fft[i] >= _pl) fft[i] -= _pl;
-			}			
+			}
 #endif 
 		}
 		
@@ -246,27 +327,35 @@ namespace LinBox {
 			FFT_DIT_Harvey(fft);
 		}
 		
-		// FFT with conversion from Element to uint32_t		
+		// FFT with conversion from Element to uint32_t
 		template <typename T=Element>
 		typename std::enable_if<!std::is_same<T,uint32_t>::value>::type
 		FFT_DIF (T *fft) {
-			for(uint64_t i=0;i<n;i++)
-				_data[i]=fft[i];
-			FFT_DIF_Harvey(&_data[0]);
-			for(uint64_t i=0;i<n;i++)
-				fft[i]=_data[i];
-			
+			// for(uint64_t i=0;i<n;i++)
+			// 	_data[i]=fft[i];
+			// FFT_DIF_Harvey(&_data[0]);
+			// for(uint64_t i=0;i<n;i++)
+			// 	fft[i]=_data[i];
+			std::copy(fft,fft+n,_data.data());
+			FFT_DIF_Harvey(_data.data());
+			std::copy(_data.begin(),_data.end(),fft);
+
 		}
 		template <typename T=Element>
 		typename std::enable_if<!std::is_same<T,uint32_t>::value>::type
 		FFT_DIT (T *fft) {
-			for(uint64_t i=0;i<n;i++)
-				_data[i]=fft[i];
-			FFT_DIT_Harvey(&_data[0]);
-			for(uint64_t i=0;i<n;i++)
-				fft[i]=_data[i];			
+			// for(uint64_t i=0;i<n;i++)
+			// 	_data[i]=fft[i];
+			// FFT_DIT_Harvey(&_data[0]);
+			// for(uint64_t i=0;i<n;i++)
+			// 	fft[i]=_data[i];
+			std::copy(fft,fft+n,_data.data());
+			FFT_DIT_Harvey(_data.data());
+			std::copy(_data.begin(),_data.end(),fft);
+
+
 		}
-				
+
 		/*
 		 * Different implementations for the butterfly operations
 		 */
@@ -279,24 +368,24 @@ namespace LinBox {
 		inline void Butterfly_DIF_mod2p_4x1_SSE(uint32_t* ABCD, uint32_t* EFGH,const uint32_t* alpha, const uint32_t* alphap, const __m128i& P, const __m128i& P2);
 		inline void Butterfly_DIF_mod2p_4x1_SSE_laststep(uint32_t* ABCD, uint32_t* EFGH, const __m128i& P2);
 		inline void Butterfly_DIF_mod2p_4x2_SSE(uint32_t* , uint32_t* ,uint32_t* , uint32_t* ,
-							const uint32_t* ,const uint32_t* ,const uint32_t* ,
-							const uint32_t* ,const uint32_t* ,const uint32_t* ,
-							const __m128i& P, const __m128i& P2);
+												const uint32_t* ,const uint32_t* ,const uint32_t* ,
+												const uint32_t* ,const uint32_t* ,const uint32_t* ,
+												const __m128i& P, const __m128i& P2);
 		inline void Butterfly_DIF_mod2p_4x2_SSE_last2step(uint32_t* ABCD, uint32_t* EFGH, const __m128i& W,
-								  const __m128i& Wp, const __m128i& P, const __m128i& P2);
+														  const __m128i& Wp, const __m128i& P, const __m128i& P2);
 		inline void Butterfly_DIT_mod4p_4x1_SSE(uint32_t* ABCD, uint32_t* EFGH, const uint32_t* alpha,
-							const uint32_t* alphap,const __m128i& P, const __m128i& P2);
+												const uint32_t* alphap,const __m128i& P, const __m128i& P2);
 		inline void Butterfly_DIT_mod4p_4x2_SSE_first2step(uint32_t* ABCD, uint32_t* EFGH, const __m128i& W,
-								   const __m128i& Wp, const __m128i& P, const __m128i& P2);
-#ifdef __AVX2__
+														   const __m128i& Wp, const __m128i& P, const __m128i& P2);
+#ifdef __LINBOX_HAVE_AVX_INSTRUCTIONS2
 		inline void reduce256_modp(uint32_t*, const __m256i&);
 		inline void Butterfly_DIF_mod2p_8x1_AVX(uint32_t* ABCD, uint32_t* EFGH, const uint32_t* alpha,const uint32_t* alphap,const __m256i& P, const __m256i& P2);
 		inline void Butterfly_DIF_mod2p_8x3_AVX_last3step(uint32_t* ABCDEFGH, uint32_t* IJKLMNOP, const __m256i& alpha,const __m256i& alphap,
-								  const __m256i& beta ,const __m256i& betap, const __m256i& P    ,const __m256i& P2);
+														  const __m256i& beta ,const __m256i& betap, const __m256i& P    ,const __m256i& P2);
 		inline void Butterfly_DIT_mod4p_8x1_AVX(uint32_t* ABCD, uint32_t* EFGH, const uint32_t* alpha,const uint32_t* alphap,
-			const __m256i& P, const __m256i& P2);
+												const __m256i& P, const __m256i& P2);
 		inline void Butterfly_DIT_mod4p_8x3_AVX_first3step(uint32_t* ABCDEFGH, uint32_t* IJKLMNOP, const __m256i& alpha,const __m256i& alphap,
-								   const __m256i& beta ,const __m256i& betap, const __m256i& P    ,const __m256i& P2);
+														   const __m256i& beta ,const __m256i& betap, const __m256i& P    ,const __m256i& P2);
 
 
 #endif
@@ -305,25 +394,27 @@ namespace LinBox {
 		 * Different implementation of DIF/DIT with Harvey's trick
 		 */
 
-		void FFT_DIF_Harvey_mod2p_iterative    (Element *fft);
-		void FFT_DIF_Harvey_mod2p_iterative2x2 (Element *fft);
-		void FFT_DIF_Harvey_mod2p_iterative3x3 (Element *fft);
-		void FFT_DIT_Harvey_mod4p_iterative2x2 (Element *fft);
-		void FFT_DIT_Harvey_mod4p_iterative3x3 (Element *fft);
+		void FFT_DIF_Harvey_mod2p_iterative    (uint32_t *fft);
+		void FFT_DIF_Harvey_mod2p_iterative2x2 (uint32_t *fft);
+		void FFT_DIF_Harvey_mod2p_iterative3x3 (uint32_t *fft);
+		void FFT_DIT_Harvey_mod4p_iterative    (uint32_t *fft);
+		void FFT_DIT_Harvey_mod4p_iterative2x2 (uint32_t *fft);
+		void FFT_DIT_Harvey_mod4p_iterative3x3 (uint32_t *fft);
 		// SIMD implementations follow
 		void FFT_DIF_Harvey_mod2p_iterative4x1_SSE (uint32_t *fft);
 		void FFT_DIF_Harvey_mod2p_iterative4x2_SSE (uint32_t *fft);
 		void FFT_DIT_Harvey_mod4p_iterative4x1_SSE (uint32_t *fft);
-#ifdef __AVX2__
+#ifdef __LINBOX_HAVE_AVX_INSTRUCTIONS2
 		void FFT_DIF_Harvey_mod2p_iterative8x1_AVX (uint32_t *fft);
 		void FFT_DIT_Harvey_mod4p_iterative8x1_AVX (uint32_t *fft);
 #endif
 
-	};
+	}; // class FFT_transform
+
 } // end of namespace LinBox
 
 #include "linbox/algorithms/polynomial-matrix/polynomial-fft-transform.inl"
-#ifdef __LINBOX_USE_SIMD
+#ifdef __LINBOX_HAVE_SSE4_1_INSTRUCTIONS
 #include "linbox/algorithms/polynomial-matrix/polynomial-fft-transform-simd.inl"
 #endif
 #endif // __LINBOX_FFT_H
diff --git a/linbox/algorithms/polynomial-matrix/polynomial-fft-transform.inl b/linbox/algorithms/polynomial-matrix/polynomial-fft-transform.inl
index 84cc795..1ef4584 100644
--- a/linbox/algorithms/polynomial-matrix/polynomial-fft-transform.inl
+++ b/linbox/algorithms/polynomial-matrix/polynomial-fft-transform.inl
@@ -1,4 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014  Pascal Giorgi, Romain Lebreton
  *
@@ -59,18 +60,18 @@ namespace LinBox {
 
 
 	template <class Field>
-	void FFT_transform<Field>::FFT_DIF_Harvey_mod2p_iterative (Element *fft) {
-		for (size_t w = n >> 1, f = 1, pos_w = 0; w != 0; f <<= 1, pos_w += w, w >>= 1)
+	void FFT_transform<Field>::FFT_DIF_Harvey_mod2p_iterative (uint32_t *fft) {
+		for (size_t w = n >> 1, f = 1, pos_w = 0; w != 0; f <<= 1, pos_w += w, w >>= 1){
 			// w : witdh of butterflies
 			// f : # families of butterflies
-			for (size_t i = 0; i < f; i++){
+			for (size_t i = 0; i < f; i++)
 				for (size_t j = 0; j < w; j++)
-					Butterfly_DIF_mod2p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], pow_w[j*f], pow_wp[j*f]);				
-			}
+					Butterfly_DIF_mod2p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], pow_w[j*f], pow_wp[j*f]);
+		}
 	}
 
 	template <class Field>
-	void FFT_transform<Field>::FFT_DIF_Harvey_mod2p_iterative2x2 (Element *fft) {
+	void FFT_transform<Field>::FFT_DIF_Harvey_mod2p_iterative2x2 (uint32_t *fft) {
 		size_t w, f;
 		for (w = n >> 1, f = 1; w >= 2; w >>= 2, f <<= 2)
 			// w : witdh of butterflies
@@ -104,7 +105,7 @@ namespace LinBox {
 	}
 
 	template <class Field>
-	void FFT_transform<Field>::FFT_DIF_Harvey_mod2p_iterative3x3 (Element *fft) {
+	void FFT_transform<Field>::FFT_DIF_Harvey_mod2p_iterative3x3 (uint32_t *fft) {
 		size_t w, f;
 		for (w = n >> 1, f = 1; w >= 4; w >>= 3, f <<= 3)
 			// w : witdh of butterflies
@@ -154,9 +155,16 @@ namespace LinBox {
 					Butterfly_DIF_mod2p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], pow_w[j*f], pow_wp[j*f]);
 	}
 
+	template <class Field>
+	void FFT_transform<Field>::FFT_DIT_Harvey_mod4p_iterative (uint32_t *fft) {
+		for (size_t w = 1, f = n >> 1; f >= 1; w <<= 1, f >>= 1)
+			for (size_t i = 0; i < f; i++)
+				for (size_t j = 0; j < w; j++)
+					Butterfly_DIT_mod4p(fft[(i << 1)*w+j], fft[((i << 1)+1)*w+j], pow_w[j*f], pow_wp[j*f]);
+	}
 
 	template <class Field>
-	void FFT_transform<Field>::FFT_DIT_Harvey_mod4p_iterative2x2 (Element *fft) {
+	void FFT_transform<Field>::FFT_DIT_Harvey_mod4p_iterative2x2 (uint32_t *fft) {
 		size_t w, f;
 		for (w = 1, f = n >> 1; f >= 2; w <<= 2, f >>= 2)
 			// w : witdh of butterflies
@@ -187,7 +195,7 @@ namespace LinBox {
 	}
 
 	template <class Field>
-	void FFT_transform<Field>::FFT_DIT_Harvey_mod4p_iterative3x3 (Element *fft) {
+	void FFT_transform<Field>::FFT_DIT_Harvey_mod4p_iterative3x3 (uint32_t *fft) {
 		size_t w, f;
 		for (w = 1, f = n >> 1; f >= 4; w <<= 3, f >>= 3)
 			// w : witdh of butterflies
diff --git a/linbox/algorithms/polynomial-matrix/polynomial-matrix-domain.h b/linbox/algorithms/polynomial-matrix/polynomial-matrix-domain.h
index 7274dd3..830a808 100755
--- a/linbox/algorithms/polynomial-matrix/polynomial-matrix-domain.h
+++ b/linbox/algorithms/polynomial-matrix/polynomial-matrix-domain.h
@@ -34,6 +34,9 @@
 #include "linbox/algorithms/polynomial-matrix/matpoly-mult-kara.h"
 #include "linbox/algorithms/polynomial-matrix/matpoly-mult-fft.h"
 #include <algorithm>
+
+
+        
 namespace LinBox
 {
 
@@ -51,12 +54,12 @@ namespace LinBox
 		inline const Field& field() const {return *_field;}
 
 		template< class PMatrix1,class PMatrix2,class PMatrix3>
-		void mul(PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b)
+		void mul(PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b, size_t max_rowdeg=0)
 		{
 			size_t d = a.size()+b.size();
                         if (d > FFT_DEG_THRESHOLD){
                                 //std::cout<<"PolMul FFT"<<std::endl;
-				_fft.mul(c,a,b);
+				_fft.mul(c,a,b,max_rowdeg);
                         }
 			else
 				if ( d > KARA_DEG_THRESHOLD){
@@ -68,7 +71,8 @@ namespace LinBox
 					_naive.mul(c,a,b);
                                 }
 		}
-
+               
+                
 		template< class PMatrix1,class PMatrix2,class PMatrix3>
 		void midproduct (PMatrix1 &c, const PMatrix2 &a, const PMatrix3 &b)
 		{
diff --git a/linbox/algorithms/polynomial-matrix/simd-additional-functions.h b/linbox/algorithms/polynomial-matrix/simd-additional-functions.h
new file mode 100644
index 0000000..949240a
--- /dev/null
+++ b/linbox/algorithms/polynomial-matrix/simd-additional-functions.h
@@ -0,0 +1,474 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/*
+ * Copyright (C) 2016 Romain Lebreton
+ *
+ * Written by Romain Lebreton <romain.lebreton at lirmm.fr>
+ *
+ * ========LICENCE========
+ * This file is part of the library LinBox.
+ *
+ * LinBox is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ */
+
+#ifndef __LINBOX_simd_additional_functions_H
+#define __LINBOX_simd_additional_functions_H
+
+#include <iostream>
+#include "linbox/util/debug.h"
+#include "linbox/linbox-config.h"
+#include "fflas-ffpack/fflas/fflas_simd.h"
+
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+#define INLINE __attribute__((always_inline)) inline
+#else
+#define INLINE inline
+#endif
+
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+#define CONST __attribute__((const))
+#else
+#define CONST
+#endif
+
+
+namespace LinBox {
+
+
+	template <typename simd, typename Field>
+	struct SimdCompute_t {};
+
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
+	template <typename Field>
+	struct SimdCompute_t<Simd128<typename Field::Element>, Field> {
+		using Compute_t = Simd128<typename Field::Compute_t>;
+	};
+#endif
+
+#if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS)
+	template <typename Field>
+	struct SimdCompute_t<Simd256<typename Field::Element>, Field> {
+		using Compute_t = Simd256<typename Field::Compute_t>;
+	};
+#endif
+
+
+#define Simd_vect typename Simd::vect_t
+
+	/*
+	 * Generic memory operations
+	*/
+	template<class T, class Simd = Simd<T>>
+	struct MemoryOp {
+
+		// Call load /store  (16-byte alignment)         if Simd128
+		static INLINE Simd_vect load (const T* const p);
+
+		// Call loadu/storeu (no alignment requirement)  if Simd256
+		static INLINE void store(T *p, Simd_vect v);
+
+		static INLINE Simd_vect shuffletwice8_DD (Simd_vect& s1);
+
+		static INLINE Simd_vect unpacklo2 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpacklo4 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpacklo8 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpacklo16 (const Simd_vect& a, const Simd_vect& b);
+
+		static INLINE Simd_vect unpackhi2 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpackhi4 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpackhi8 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpackhi16 (const Simd_vect& a, const Simd_vect& b);
+
+		static INLINE Simd_vect unpacklo_twice2 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpacklo_twice4 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpacklo_twice8 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpacklo_twice16 (const Simd_vect& a, const Simd_vect& b);
+
+		static INLINE Simd_vect unpackhi_twice2 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpackhi_twice4 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpackhi_twice8 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpackhi_twice16 (const Simd_vect& a, const Simd_vect& b);
+
+		static INLINE Simd_vect unpacklohi_twice2 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpacklohi_twice4 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpacklohi_twice8 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpacklohi_twice16 (const Simd_vect& a, const Simd_vect& b);
+
+		static INLINE Simd_vect unpacklohi2 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpacklohi4 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpacklohi8 (const Simd_vect& a, const Simd_vect& b);
+		static INLINE Simd_vect unpacklohi16 (const Simd_vect& a, const Simd_vect& b);
+
+	}; // MemoryOp
+
+#undef Simd_vect
+
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
+	template<class T>
+	struct MemoryOp<T, Simd128<T>> {
+		using simd = Simd128<T>;
+		using simd_vect = typename simd::vect_t;
+
+		/**************/
+		/* load/store */
+		/**************/
+		static INLINE simd_vect load (const T* const p) {return simd::load(p);}
+		static INLINE void store(T *p, simd_vect v) {return simd::store(p, v);}
+
+		/*********************/
+		/* Specific shuffles */
+		/*********************/
+		static INLINE simd_vect shuffletwice8_DD (simd_vect& s1) {
+			using simd128_16 = Simd128<uint16_t>;
+			using simd128_64 = Simd128<uint64_t>;
+			//			std::cout << "Test shuffletwice8_DD :\n"; FFLAS::print<simd128_16>(std::cout,s1);
+			simd_vect s2 = simd128_64::sll(s1,16);
+			//			std::cout << "\n"; FFLAS::print<simd128_16>(std::cout,s2);
+			//			std::cout << "\n"; FFLAS::print<simd128_16>(std::cout,simd128_16::template blend<0x44>(s1,s2)); std::cout << "\n\n";
+			return simd128_16::template blend<0x44>(s1,s2); // 0x44 = [0 1 0 0 0 1 0 0]_base2
+		}
+
+		/********************/
+		/* unpacklo         */
+		/********************/
+		static INLINE simd_vect unpacklo2 (const simd_vect& a, const simd_vect& b) {return Simd128<uint64_t>::unpacklo(a,b); }
+		static INLINE simd_vect unpacklo4 (const simd_vect& a, const simd_vect& b) {return Simd128<uint32_t>::unpacklo(a,b); }
+		static INLINE simd_vect unpacklo8 (const simd_vect& a, const simd_vect& b) {return Simd128<uint16_t>::unpacklo(a,b); }
+
+		/********************/
+		/* unpackhi         */
+		/********************/
+		static INLINE simd_vect unpackhi2 (const simd_vect& a, const simd_vect& b) {return Simd128<uint64_t>::unpackhi(a,b); }
+		static INLINE simd_vect unpackhi4 (const simd_vect& a, const simd_vect& b) {return Simd128<uint32_t>::unpackhi(a,b); }
+		static INLINE simd_vect unpackhi8 (const simd_vect& a, const simd_vect& b) {return Simd128<uint16_t>::unpackhi(a,b); }
+
+		/**************/
+		/* unpacklohi */
+		/**************/
+		static INLINE void unpacklohi2 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			using simd128_64 = Simd128<uint64_t>;
+			s1 = simd128_64::unpacklo(a, b);
+			s2 = simd128_64::unpackhi(a, b);
+		}
+
+		static INLINE void unpacklohi4 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			using simd128_32 = Simd128<uint32_t>;
+			s1 = simd128_32::unpacklo(a, b);
+			s2 = simd128_32::unpackhi(a, b);
+		}
+
+		static INLINE void unpacklohi8 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			using simd128_16 = Simd128<uint16_t>;
+			s1 = simd128_16::unpacklo(a, b);
+			s2 = simd128_16::unpackhi(a, b);
+		}
+
+		/********************/
+		/* unpacklo_twice   */
+		/********************/
+		static INLINE simd_vect unpacklo_twice2 (const simd_vect& a, const simd_vect& b) { return unpacklo2(a,b); }
+
+		static INLINE simd_vect unpacklo_twice4 (const simd_vect& a, const simd_vect& b) {
+			using simd128_32 = Simd128<uint32_t>;
+			simd_vect a1 = simd128_32::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd128_32::template shuffle<0xD8>(b);
+			return simd128_32::unpacklo(a1,b1);
+		}
+
+		static INLINE simd_vect unpacklo_twice8 (const simd_vect& a, const simd_vect& b) {
+			using simd128_16 = Simd128<uint16_t>;
+			using simd128_32 = Simd128<uint32_t>;
+			simd_vect a1 = simd128_32::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd128_32::template shuffle<0xD8>(b);
+			return simd128_16::unpacklo(a1,b1);
+		}
+
+		/********************/
+		/* unpackhi_twice   */
+		/********************/
+		static INLINE simd_vect unpackhi_twice2 (const simd_vect& a, const simd_vect& b) { return unpackhi2(a,b); }
+
+		static INLINE simd_vect unpackhi_twice4 (const simd_vect& a, const simd_vect& b) {
+			using simd128_32 = Simd128<uint32_t>;
+			simd_vect a1 = simd128_32::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd128_32::template shuffle<0xD8>(b);
+			return simd128_32::unpackhi(a1,b1);
+		}
+
+		static INLINE simd_vect unpackhi_twice8 (const simd_vect& a, const simd_vect& b) {
+			using simd128_16 = Simd128<uint16_t>;
+			using simd128_32 = Simd128<uint32_t>;
+			simd_vect a1 = simd128_32::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd128_32::template shuffle<0xD8>(b);
+			return simd128_16::unpackhi(a1,b1);
+		}
+
+		/********************/
+		/* unpacklohi_twice */
+		/********************/
+		static INLINE void unpacklohi_twice2 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			unpacklohi2(s1, s2, a, b);
+		}
+
+		static INLINE void unpacklohi_twice4 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			using simd128_32 = Simd128<uint32_t>;
+			simd_vect a1 = simd128_32::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd128_32::template shuffle<0xD8>(b);
+			s1 = simd128_32::unpacklo(a1,b1);
+			s2 = simd128_32::unpackhi(a1,b1);
+		}
+
+		static INLINE void unpacklohi_twice8 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			using simd128_16 = Simd128<uint16_t>;
+			using simd128_32 = Simd128<uint32_t>;
+			simd_vect a1 = simd128_32::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd128_32::template shuffle<0xD8>(b);
+			s1 = simd128_16::unpacklo(a1,b1);
+			s2 = simd128_16::unpackhi(a1,b1);
+		}
+	}; // MemoryOp<T, Simd128<T>>
+#endif
+
+#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
+	template<class T>
+	struct MemoryOp<T, Simd256<T>> {
+		using simd = Simd256<T>;
+		using simd_vect = typename simd::vect_t;
+
+		/**************/
+		/* load/store */
+		/**************/
+		static INLINE simd_vect load (const T* const p) {return simd::loadu(p);}
+		static INLINE void store(T *p, simd_vect v) {return simd::storeu(p, v);}
+
+		/*********************/
+		/* Specific shuffles */
+		/*********************/
+		static INLINE simd_vect shuffletwice8_DD (simd_vect& s1) {
+			using simd256_32 = Simd256<uint32_t>;
+			return simd256_32::template shuffle_twice<0xDD>(s1);
+		}
+
+		/********************/
+		/* unpacklo         */
+		/********************/
+		static INLINE simd_vect unpacklo2 (const simd_vect& a, const simd_vect& b) {return simd::unpacklo128(a, b); }
+
+		static INLINE simd_vect unpacklo4 (const simd_vect& a, const simd_vect& b) {
+			using simd256_64 = Simd256<uint64_t>;
+			simd_vect a1 = simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd256_64::template shuffle<0xD8>(b);
+			return simd256_64::unpacklo_twice(a1,b1);
+		}
+
+		static INLINE simd_vect unpacklo8 (const simd_vect& a, const simd_vect& b) {
+			using simd256_32 = Simd256<uint32_t>;
+			using simd256_64 = Simd256<uint64_t>;
+			simd_vect a1 = simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd256_64::template shuffle<0xD8>(b);
+			return simd256_32::unpacklo_twice(a1, b1);
+		}
+
+		static INLINE simd_vect unpacklo16 (const simd_vect& a, const simd_vect& b) {
+			using simd256_16 = Simd256<uint16_t>;
+			using simd256_64 = Simd256<uint64_t>;
+			simd_vect a1 = simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd256_64::template shuffle<0xD8>(b);
+			return simd256_16::unpacklo_twice(a1, b1);
+		}
+
+		/********************/
+		/* unpackhi         */
+		/********************/
+		static INLINE simd_vect unpackhi2 (const simd_vect& a, const simd_vect& b) {return simd::unpackhi128(a, b); }
+
+		static INLINE simd_vect unpackhi4 (const simd_vect& a, const simd_vect& b) {
+			using simd256_64 = Simd256<uint64_t>;
+			simd_vect a1 = simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd256_64::template shuffle<0xD8>(b);
+			return simd256_64::unpackhi_twice(a1,b1);
+		}
+
+		static INLINE simd_vect unpackhi8 (const simd_vect& a, const simd_vect& b) {
+			using simd256_32 = Simd256<uint32_t>;
+			using simd256_64 = Simd256<uint64_t>;
+			simd_vect a1 = simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd256_64::template shuffle<0xD8>(b);
+			return simd256_32::unpackhi_twice(a1, b1);
+		}
+
+		static INLINE simd_vect unpackhi16 (const simd_vect& a, const simd_vect& b) {
+			using simd256_16 = Simd256<uint16_t>;
+			using simd256_64 = Simd256<uint64_t>;
+			simd_vect a1 = simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd256_64::template shuffle<0xD8>(b);
+			return simd256_16::unpackhi_twice(a1, b1);
+		}
+
+		/**************/
+		/* unpacklohi */
+		/**************/
+		static INLINE void unpacklohi2 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			s1 = simd::unpacklo128(a, b);
+			s2 = simd::unpackhi128(a, b);
+		}
+
+		static INLINE void unpacklohi4 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			using simd256_64 = Simd256<uint64_t>;
+			simd_vect a1 = simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd256_64::template shuffle<0xD8>(b);
+			s1 = simd256_64::unpacklo_twice(a1, b1);
+			s2 = simd256_64::unpackhi_twice(a1, b1);
+		}
+
+		static INLINE void unpacklohi8 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			using simd256_32 = Simd256<uint32_t>;
+			using simd256_64 = Simd256<uint64_t>;
+			simd_vect a1 = simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd256_64::template shuffle<0xD8>(b);
+			s1 = simd256_32::unpacklo_twice(a1, b1);
+			s2 = simd256_32::unpackhi_twice(a1, b1);
+		}
+
+		static INLINE void unpacklohi16 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			using simd256_16 = Simd256<uint16_t>;
+			using simd256_64 = Simd256<uint64_t>;
+			simd_vect a1 = simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+			simd_vect b1 = simd256_64::template shuffle<0xD8>(b);
+			s1 = simd256_16::unpacklo_twice(a1, b1);
+			s2 = simd256_16::unpackhi_twice(a1, b1);
+		}
+
+		/********************/
+		/* unpacklo_twice   */
+		/********************/
+		static INLINE simd_vect unpacklo_twice2 (const simd_vect& a, const simd_vect& b) { return unpacklo2(a,b); }
+
+		static INLINE simd_vect unpacklo_twice4 (const simd_vect& a, const simd_vect& b) { return Simd256<uint64_t>::unpacklo_twice(a, b); }
+
+		static INLINE simd_vect unpacklo_twice8 (const simd_vect& a, const simd_vect& b) { return Simd256<uint32_t>::unpacklo_twice(a, b); }
+
+		static INLINE simd_vect unpacklo_twice16 (const simd_vect& a, const simd_vect& b) { return Simd256<uint16_t>::unpacklo_twice(a, b); }
+
+		/********************/
+		/* unpackhi_twice   */
+		/********************/
+		static INLINE simd_vect unpackhi_twice2 (const simd_vect& a, const simd_vect& b) { return unpackhi2(a,b); }
+
+		static INLINE simd_vect unpackhi_twice4 (const simd_vect& a, const simd_vect& b) { return Simd256<uint64_t>::unpackhi_twice(a, b); }
+
+		static INLINE simd_vect unpackhi_twice8 (const simd_vect& a, const simd_vect& b) { return Simd256<uint32_t>::unpackhi_twice(a, b); }
+
+		static INLINE simd_vect unpackhi_twice16 (const simd_vect& a, const simd_vect& b) { return Simd256<uint16_t>::unpackhi_twice(a, b); }
+
+		/********************/
+		/* unpacklohi_twice */
+		/********************/
+		static INLINE void unpacklohi_twice2 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			unpacklohi2(s1, s2, a, b);
+		}
+
+		static INLINE void unpacklohi_twice4 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			using simd256_64 = Simd256<uint64_t>;
+			s1 = simd256_64::unpacklo_twice(a, b);
+			s2 = simd256_64::unpackhi_twice(a, b);
+		}
+
+		static INLINE void unpacklohi_twice8 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			using simd256_32 = Simd256<uint32_t>;
+			s1 = simd256_32::unpacklo_twice(a, b);
+			s2 = simd256_32::unpackhi_twice(a, b);
+		}
+
+		static INLINE void unpacklohi_twice16 (simd_vect& s1, simd_vect& s2, const simd_vect& a, const simd_vect& b) {
+			using simd256_16 = Simd256<uint16_t>;
+			s1 = simd256_16::unpacklo_twice(a, b);
+			s2 = simd256_16::unpackhi_twice(a, b);
+		}
+
+	};// MemoryOp<T, Simd256<T>>
+#endif
+
+#define Simd_vect typename Simd::vect_t
+
+	/*
+	 * Generic arithmetic operation
+	 */
+	template <class Simd>
+	INLINE Simd_vect reduce (const Simd_vect& a, const Simd_vect& p) {
+		Simd_vect t = Simd::greater(p,a);
+		return Simd::sub(a, Simd::vandnot(p,t));
+	}
+
+	template <class Element, class Simd>
+	INLINE void reduce (Element* a, const Simd_vect& p) {
+		Simd_vect V1;
+		V1 = MemoryOp<Element, Simd>::load(a);
+		V1 = reduce<Simd>(V1, p);
+		MemoryOp<Element, Simd>::store(a,V1);
+	}
+
+	template <class Simd>
+	INLINE Simd_vect add_mod (const Simd_vect& a, const Simd_vect& b, const Simd_vect& p) {
+		Simd_vect c = Simd::add(a,b);
+		return reduce<Simd>(c, p);
+	}
+
+	template <class Simd>
+	INLINE Simd_vect mul_mod (const Simd_vect& a, const Simd_vect& b, const Simd_vect& p, const Simd_vect& bp) {
+		//		std::cout << "Inputs of mul_mod : a, b, p, bp, q, c, t, c - t\n";
+		Simd_vect q = Simd::mulhi(a,bp);
+		Simd_vect c = Simd::mullo(a,b);
+		Simd_vect t = Simd::mullo(q,p);
+		//		FFLAS::print<Simd>(std::cout, a); std::cout << "\n";
+		//		FFLAS::print<Simd>(std::cout, b); std::cout << "\n";
+		//		FFLAS::print<Simd>(std::cout, p); std::cout << "\n";
+		//		FFLAS::print<Simd>(std::cout, bp); std::cout << "\n";
+		//		FFLAS::print<Simd>(std::cout, q); std::cout << "\n";
+		//		FFLAS::print<Simd>(std::cout, c); std::cout << "\n";
+		//		FFLAS::print<Simd>(std::cout, t); std::cout << "\n";
+		//		FFLAS::print<Simd>(std::cout, Simd::sub(c,t)); std::cout << "\n\n";
+		return Simd::sub(c,t);
+	}
+
+	/*
+	 * a = [a0, a0, a2, a2, ...]
+	 * b = [?, b0, ?, b2, ...] with bp its Shoup mul_mod precomputation [b0p ? b2p ?, ... ]
+	 * Return [?, (a0*b0) mod p, ?, (a2*b2) mod p, ... ]
+	 */
+	template <class Simd, class SimdCompute_t>
+	INLINE Simd_vect mul_mod_half (const Simd_vect& a, const Simd_vect& b, const Simd_vect& p, const Simd_vect& bp) {
+#if 1
+		return mul_mod<Simd>(a, b , p, bp);
+#else
+		// TODO : DO SOMETHING IF Modular<uint64, uint128> and no mulx exists
+
+		// T2 = a * bp mod 2^64 (for Modular<Element = uint32, Compute_t = uint64>)
+		// bp = [b0p ? b2p ?, ... ] is enough
+		Simd_vect T2 = SimdCompute_t::mulx(a,bp);
+		Simd_vect T3 = Simd::mullo(T2,p);
+		// At this point T3= [? quo(D)*p ? quo(H)*p] mod 2^32
+		// T4 = [D D H H] * [?, b0, ?, b2] mod 2^32
+		T2 = Simd::mullo(a,b);
+		return Simd::sub(T2,T3);
+#endif
+	}
+
+#undef Simd_vect
+
+}
+
+#endif // __LINBOX_simd_additional_functions_H
diff --git a/linbox/algorithms/polynomial-matrix/simd.h b/linbox/algorithms/polynomial-matrix/simd.h
index badcc2d..6bd7987 100644
--- a/linbox/algorithms/polynomial-matrix/simd.h
+++ b/linbox/algorithms/polynomial-matrix/simd.h
@@ -32,9 +32,8 @@
 #include <iostream>
 
 
-#ifdef __AVX2__
+#ifdef __LINBOX_HAVE_AVX_INSTRUCTIONS2
 /* 256 bits CODE HERE */
-#define __LINBOX_HAVE_AVX2
 
 // define 256 bits simd vector type
 typedef __m256i  _vect256_t; 
@@ -197,11 +196,11 @@ typedef __m128i  _vect128_t;
 #define VEC128_UNPACK_HI_32(C,A,B)			\
 	 C = _mm_unpackhi_epi32(A,B);
 
-// C = unpack_lo32(A,B)
+// C = unpack_lo64(A,B)
 #define VEC128_UNPACK_LO_64(C,A,B)			\
 	 C = _mm_unpacklo_epi64(A,B);
 
-// C = unpack_hi32(A,B)
+// C = unpack_hi64(A,B)
 #define VEC128_UNPACK_HI_64(C,A,B)			\
 	 C = _mm_unpackhi_epi64(A,B);
 
diff --git a/linbox/algorithms/rational-reconstruction.h b/linbox/algorithms/rational-reconstruction.h
index a14c2f0..7f1d808 100644
--- a/linbox/algorithms/rational-reconstruction.h
+++ b/linbox/algorithms/rational-reconstruction.h
@@ -445,7 +445,7 @@ namespace LinBox
 			std::vector<Integer> zz(_lcontainer.size(), modulus);   // stores each truncated p-adic approximation
 			_r.assign(modulus, _r.one);
 
-			size_t len = _lcontainer.length();
+			uint64_t len = _lcontainer.length();
 			/* should be ceil(log(2*numbound*denbound)/log(prime))
 			 *
 			 * should grow in rough proportion to overall
@@ -490,7 +490,7 @@ namespace LinBox
 					_r.convert(iD, _lcontainer.numbound());
 					_r.convert(iN, _lcontainer.denbound());
 					_r.convert(pPower, prime);
-					pPower = pow(pPower, uint64_t(len)-1);
+					pPower = Givaro::pow(pPower, uint64_t(len-1));
 
 					tmp = pPower * iN;
 					tmp /= iD;
diff --git a/linbox/algorithms/rational-solver.inl b/linbox/algorithms/rational-solver.inl
index 7e92d61..8f8e95d 100644
--- a/linbox/algorithms/rational-solver.inl
+++ b/linbox/algorithms/rational-solver.inl
@@ -523,7 +523,7 @@ namespace LinBox
 #endif
 		// m = n =
 		root(tmproot, tmp,3);
-		m = n = tmproot;
+		m = n = uint32_t(tmproot);
 		// 		std::cout<<"block factor= "<<m<<"\n";;
 		typedef SparseMatrix<Field> FMatrix;
 
diff --git a/linbox/algorithms/smith-form-sparseelim-poweroftwo.h b/linbox/algorithms/smith-form-sparseelim-poweroftwo.h
index 879558a..b668d00 100644
--- a/linbox/algorithms/smith-form-sparseelim-poweroftwo.h
+++ b/linbox/algorithms/smith-form-sparseelim-poweroftwo.h
@@ -396,7 +396,7 @@ namespace LinBox
                 ranks.resize(0);
 
                 typedef typename BB::Row Vecteur;
-                size_t EXPONENT = EXPONENTMAX;
+                uint64_t EXPONENT = EXPONENTMAX;
                 UInt_t TWOK(1U); TWOK <<= EXPONENT;
                 UInt_t TWOKMONE(TWOK); --TWOKMONE;
 ENSURE( TWOK == (UInt_t(1U) << EXPONENT) );
diff --git a/linbox/algorithms/vector-fraction.h b/linbox/algorithms/vector-fraction.h
index f128a09..89eca12 100644
--- a/linbox/algorithms/vector-fraction.h
+++ b/linbox/algorithms/vector-fraction.h
@@ -220,7 +220,7 @@ namespace LinBox
 
 			// find A s.t. gcd(denBound, denom + A*other.denom) = g
 			// strategy: pick random values of A <= d(y_0)
-			integer tmp;
+			uint64_t tmp;
 			_domain.convert(tmp, denBound);
 			typename Domain::RandIter randiter(_domain, tmp); //seed omitted
 			// TODO: I don't think this random iterator has high-quality low order bits, which are needed
@@ -274,7 +274,7 @@ namespace LinBox
 
 			// find A s.t. gcd(denBound, denom + A*other.denom) = g
 			// strategy: pick random values of A <= lcm(d(denom), d(other.denom))
-			integer tmp;
+			uint64_t tmp;
 			_domain.mul(tmpe, denom, other.denom);
 			_domain.convert(tmp, tmpe);
 			typename Domain::RandIter randiter(_domain, tmp); //seed omitted
diff --git a/linbox/blackbox/apply.h b/linbox/blackbox/apply.h
index 42cc7ce..9993864 100644
--- a/linbox/blackbox/apply.h
+++ b/linbox/blackbox/apply.h
@@ -158,7 +158,7 @@ namespace LinBox
 
 			integer tmp;
 			bool use_neg=false;
-			size_t maxword=0;
+			uint32_t maxword=0;
 			for (size_t i=0;i<n;++i){
 				_domain.convert(tmp,x[i]);
 				if (tmp <0)
@@ -424,7 +424,7 @@ namespace LinBox
 					maxBitSize+=1;
 				}
 				// compute the number of chunk
-				if (maxValue*prime*_matM.coldim() < integer("9007199254740992")){
+				if (maxValue*prime* uint32_t(_matM.coldim()) < integer("9007199254740992")){
 					num_chunks=1;
 					use_neg=false;
 				}
@@ -988,7 +988,7 @@ namespace LinBox
 						LinBox::integer result, tmp;
 						if (use_neg) {
 							result = -ctd[i];
-							result <<= (num_chunks-1)*16;
+							result <<= uint64_t((num_chunks-1)*16);
 #ifdef DEBUG_CHUNK_APPLYM
 							cout << "rcneg: " << result << endl;
 #endif
diff --git a/linbox/linbox-config.h b/linbox/linbox-config.h
index 2aadc0b..51629e4 100644
--- a/linbox/linbox-config.h
+++ b/linbox/linbox-config.h
@@ -58,12 +58,19 @@ using std::ptrdiff_t;
 #endif
 #endif
 
-#ifdef __FFLASFFPACK_USE_SIMD
-#define __LINBOX_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
+#define __LINBOX_HAVE_SSE4_1_INSTRUCTIONS
 #else
 #define __LINBOX_NO_SIMD
 #endif
 
+#ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS
+#define __LINBOX_HAVE_AVX_INSTRUCTIONS
+#endif
+
+#ifdef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS
+#define __LINBOX_HAVE_AVX_INSTRUCTIONS2
+#endif
 
 namespace LinBox {
 
diff --git a/linbox/matrix/polynomial-matrix.h b/linbox/matrix/polynomial-matrix.h
index afe31cc..69263e9 100755
--- a/linbox/matrix/polynomial-matrix.h
+++ b/linbox/matrix/polynomial-matrix.h
@@ -34,6 +34,16 @@
 #include "givaro/modular.h"
 #include <algorithm>
 
+#ifdef TRACK_MEMORY_MATPOL
+uint64_t max_memory=0, cur_memory=0;
+#define ADD_MEM(x) {cur_memory+=x; max_memory=std::max(max_memory,cur_memory);}
+#define DEL_MEM(x) {cur_memory-=x;}
+#define STR_MEMINFO std::right<<"\033[31m [ MEM: cur="<<cur_memory/1000000.<<" Mo --- max="<<max_memory/1000000.<<" Mo \033[0m]"
+#define PRINT_MEMINFO std::cerr<<"\033[31m[ MEM: cur="<<cur_memory/1000000.<<" Mo --- max="<<max_memory/1000000.<<" Mo ]\033[0m"<<std::endl;
+#else
+#define ADD_MEM(X) ;
+#define DEL_MEM(X) ;     
+#endif
 
 #define COPY_BLOCKSIZE 32
 
@@ -46,6 +56,9 @@ namespace LinBox{
 	template<size_t type, size_t storage, class Field>
 	class PolynomialMatrix;
 
+	template<typename Field> uint64_t element_storage(const Field& F)      { integer p;F.characteristic(p); return length(p);}
+	template<> uint64_t element_storage(const Givaro::Modular<Givaro::Integer> &F) { integer p;F.characteristic(p); return length(p)+sizeof(Givaro::Integer);}
+	
 	// Class for Polynomial Matrix stored as a Matrix of Polynomials
 	template<class _Field>
 	class PolynomialMatrix<PMType::polfirst,PMStorage::plain,_Field> {
@@ -73,9 +86,17 @@ namespace LinBox{
 					_repview[i*_col+j]= Polynomial(_rep.begin()+(i*_col+j)*_store,_size);
 			//integer p;
 			//std::cout<<"MatrixP allocating : "<<r*c*s*length(f.characteristic(p))/1000000.<<"Mo"<<std::endl;
+			//std::cout<<"(ALLOC) PolynomialMatrix<polfirst> at "<<this<<" : "<<r<<"x"<<c<<" - size= "<<s<<" ==> "<<MB(realmeminfo())<<" Mo   "<<STR_MEMINFO<<std::endl;
+			ADD_MEM(realmeminfo());
 		}
 
+		PolynomialMatrix(const Self_t&) = delete;
+		
 		~PolynomialMatrix(){
+			DEL_MEM(realmeminfo());
+			_rep.clear();
+			//std::cout<<"(FREE) PolynomialMatrix<polfirst> at "<<this<<" : "<<_row<<"x"<<_col<<" - size= "<<_store<<" ==> "<<MB(realmeminfo())<<" Mo   "<<STR_MEMINFO<<std::endl;
+
 			//integer p;
 			//std::cout<<"MatrixP Desallocating : "<<_row*_col*_store*length(_fld->characteristic(p))/1000000.<<"Mo"<<std::endl;
 			
@@ -106,6 +127,8 @@ namespace LinBox{
 
 		// resize the polynomial length of the polynomial matrix
 		void resize(size_t s){
+			if (s==_store) return;
+			//std::cout<<"MATPOL RESIZING : "<<_store<<" --> "<<s<<std::endl;
 			if (s>_store){
 				_rep.resize(s*_row*_col);
 				size_t k=s*_row*_col-1;
@@ -123,9 +146,14 @@ namespace LinBox{
 					for (size_t j=0;j<s;j++,k++)
 						_rep[k]=_rep[i*_store+j];
 				_rep.resize(s*_row*_col);
+				//_rep.shrink_to_fit();
 			}
+			integer p;_fld->characteristic(p); size_t bb=p.bitsize(); if(bb>64) bb+=128; bb/=8;
+			size_t mem=realmeminfo();
 			_store=s;
 			setsize(s);
+			ADD_MEM(realmeminfo());
+			DEL_MEM(mem);
 		}
 
 		void changesize(size_t s){
@@ -289,7 +317,10 @@ namespace LinBox{
 		Element* getWritePointer(){return &_rep[0];}
 		const Element* getPointer() const {return &_rep[0];}
 
-		size_t realmeminfo()const { return _rep.capacity()*sizeof(Element)+_repview.capacity()*sizeof(Polynomial);}
+		size_t realmeminfo()const {
+			return _row*_col*(_store*element_storage(field())+sizeof(Polynomial));}
+		// return _rep.capacity()*sizeof(Element)+_repview.capacity()*sizeof(Polynomial);}
+	
 		size_t meminfo()const { return _rep.size()*sizeof(Element);}
 
 		void changeField(const Field& F){_fld=&F;}
@@ -320,16 +351,21 @@ namespace LinBox{
 
 		PolynomialMatrix() {}
 
+		PolynomialMatrix(const Self_t&) = delete;
+		
 		PolynomialMatrix(const Field& f, size_t r, size_t c, size_t s) :
 			_rep(s,Matrix(f)), _row(r), _col(c), _size(s), _fld(&f) {
 			//_row(r), _col(c), _size(s), _fld(&f) {			
 			for(size_t i=0;i<s;i++)
 				_rep[i].init(f,r,c);
 			//integer p;
-			//std::cout<<"PMatrix allocating : "<<r*c*s*length(f.characteristic(p))/1000000.<<"Mo"<<std::endl;
+			//std::cout<<"(ALLOC) matfirst at "<<this<<" : "<<r<<"x"<<c<<" - size= "<<s<<" ==> "<<MB(realmeminfo())<<" Mo"<<std::endl;
+			ADD_MEM(realmeminfo());
 		}
 
 		~PolynomialMatrix(){
+			DEL_MEM(realmeminfo());
+			//std::cout<<"(FREE) matfirst at "<<this<<" : "<<_row<<"x"<<_col<<" - size= "<<_size<<" ==> "<<MB(realmeminfo())<<" Mo"<<std::endl;
 			//integer p;
 			//std::cout<<"PMatrix Desallocating : "<<_row*_col*_size*length(_fld->characteristic(p))/1000000.<<"Mo"<<std::endl;
 		}
@@ -511,6 +547,11 @@ namespace LinBox{
 			return os;
                 }
 
+		size_t realmeminfo()const {
+			
+			return _size*(_row*_col*element_storage(field())+sizeof(Matrix));
+		}
+		
 		// NEED FOR YUHASZ
 		typedef typename std::vector<Matrix>::const_iterator const_iterator;
 		const_iterator begin() const {return _rep.begin();}
diff --git a/linbox/matrix/sparsematrix/sparse-csr-matrix.h b/linbox/matrix/sparsematrix/sparse-csr-matrix.h
index f60d125..71cbd50 100644
--- a/linbox/matrix/sparsematrix/sparse-csr-matrix.h
+++ b/linbox/matrix/sparsematrix/sparse-csr-matrix.h
@@ -501,7 +501,7 @@ namespace LinBox {
 				S._start[i+1] +=  S._start[i] ;
 
 			{
-				size_t i = 0 ;
+				index_t i = 0 ;
 				svector_t done_col(S.rowdim(),0);
 				for (size_t nextlig = 1 ; nextlig <= rowdim() ; ++nextlig) {
 					// treating line before nextlig
@@ -635,7 +635,7 @@ namespace LinBox {
 		/// make matrix ready to use after a sequence of setEntry calls.
 		void finalize()
 		{
-			if (_start[rowdim()] != _nbnz) { /* if it is so, then all before are 0 and we are fine... */
+			if (_start[rowdim()] != (index_t)_nbnz) { /* if it is so, then all before are 0 and we are fine... */
 				for (size_t i = 2 ; i <= rowdim() ; ++i)
 					_start[i] += _start[i-1];
 				linbox_check(_start[rowdim()] == _nbnz);
@@ -685,7 +685,7 @@ namespace LinBox {
 			myIterator low = std::lower_bound (beg, end, j);
 			ibeg = (index_t)(low-_colid.begin());
 			// insert
-			if ( low == end || j != _colid[ibeg] ) {
+			if ( low == end || (index_t)j != _colid[ibeg] ) {
 				// std::cout << "# 2 insert " << i << ',' << j << ':' << e << std::endl;
 				for (size_t k = i+1 ; k <= _rownb ; ++k)
 					_start[k] += 1 ;
diff --git a/linbox/randiter/givaro-poly.h b/linbox/randiter/givaro-poly.h
index 48308c8..9ce0b0c 100644
--- a/linbox/randiter/givaro-poly.h
+++ b/linbox/randiter/givaro-poly.h
@@ -48,7 +48,7 @@ namespace LinBox
 		GivaroPolyRandIter(Field pd, 
                                    const integer& size = 0,
                                    const integer& seed = 0) :
-                        _randIter(Givaro::GIV_randIter<SubDomain,integer>(pd.subdomain(), size, seed))
+		_randIter(Givaro::GIV_randIter<SubDomain,integer>(pd.subdomain(), uint64_t(size), seed))
 		{_pd = pd;}
 		
 		GivaroPolyRandIter(const GivaroPolyRandIter &R)
diff --git a/linbox/randiter/mersenne-twister.h b/linbox/randiter/mersenne-twister.h
index 4687176..d3af870 100644
--- a/linbox/randiter/mersenne-twister.h
+++ b/linbox/randiter/mersenne-twister.h
@@ -93,9 +93,9 @@ namespace LinBox
 
 }
 
-#if defined(LinBoxSrcOnly) or defined(LinBoxTestOnly)
-#include "linbox/randiter/mersenne-twister.C"
-#endif
+//#if defined(LinBoxSrcOnly) or defined(LinBoxTestOnly)
+//#include "linbox/randiter/mersenne-twister.C"
+//#endif
 #endif // __LINBOX_mersenne_twister_H
 
 
diff --git a/linbox/randiter/random-fftprime.h b/linbox/randiter/random-fftprime.h
index bd4bb18..183ed67 100644
--- a/linbox/randiter/random-fftprime.h
+++ b/linbox/randiter/random-fftprime.h
@@ -1,3 +1,5 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /* linbox/algorithms/
  * Copyright (C) 2005  Pascal Giorgi
  *
@@ -35,12 +37,12 @@ namespace LinBox
 
 	class RandomFFTPrime {
 	public:
-                // define the prime type
+		// define the prime type
 		typedef integer Prime_Type;
-                
+
 		uint64_t           _bits;
-                Prime_Type  _prime_bound;
-                
+		Prime_Type  _prime_bound;
+
 		RandomFFTPrime(Prime_Type pbound=0x100000, unsigned long seed = 0) :
 			_bits(pbound.bitsize()), _prime_bound(pbound)
 		{
@@ -51,29 +53,29 @@ namespace LinBox
 		}
 
 		
-                 /** @brief randomPrime(size_t b)
+		/** @brief randomPrime(size_t b)
 		 *  return a random FFT prime with a 2-valuation larger than b in its order
-                 *  the randomness is on the FFT primes lying in the given range
-                 *  an error is thrown if no such prime exist
+				 *  the randomness is on the FFT primes lying in the given range
+				 *  an error is thrown if no such prime exists
 		 */
 		inline Prime_Type randomPrime (size_t b) const
 		{
-                        integer tmp;
-                        randomPrime(tmp,b);
+			integer tmp;
+			randomPrime(tmp,b);
 			return tmp;
-                }
+		}
 
-                /** @brief randomPrime(Prime_Type& p, size_t b)
+		/** @brief randomPrime(Prime_Type& p, size_t b)
 		 *  return a random FFT prime with a 2-valuation larger than b in its order
-                 *  the randomness is on the FFT primes lying in the given range
-                 *  an error is thrown if no such prime exist
+				 *  the randomness is on the FFT primes lying in the given range
+				 *  an error is thrown if no such prime exists
 		 */
 		inline Prime_Type randomPrime (Prime_Type& t, uint64_t b) const
 		{
-                        linbox_check(b<_bits);
+			linbox_check(b<_bits);
 			size_t tresh;
 			do {
-                                size_t cbits= (size_t)rand() %(_bits-b);
+				size_t cbits= (size_t)rand() %(_bits-b);
 				tresh = 1<<(cbits);
 				uint64_t p = 1<<((size_t)_bits-cbits);
 				do {
@@ -83,8 +85,8 @@ namespace LinBox
 				} while (!Givaro::Protected::probab_prime(t,25) && (tresh));
 			}
 			while(tresh==0);
-                        linbox_check(Givaro::Protected::probab_prime(t,25))
-			return t;
+			linbox_check(Givaro::Protected::probab_prime(t,25))
+					return t;
 		}
 
 		/** @brief generatePrime()
@@ -93,7 +95,7 @@ namespace LinBox
 		inline Prime_Type generatePrime() const
 		{
 			integer tmp;
-                        generatePrime(tmp);
+			generatePrime(tmp);
 			return tmp;
 		}
 
@@ -119,97 +121,112 @@ namespace LinBox
 			return t;
 		}
 
-                // generate a vector of distinct FFT primes with largest 2-valuation
-                // s.t. their product is larger than a given bound
-                inline std::vector<Prime_Type> generatePrimes (const Prime_Type & bound) const {
-                        std::vector<Prime_Type> primes;
-                        Prime_Type prod=1;
-                        integer tmp;
-                        for (int64_t b = _bits - 1; b >= 0; b--)
-                                for (int64_t l = ((int64_t)1 << (_bits - b - 1)) + 1; l < (1L << (_bits - b)); l +=2) {
-                                        tmp = ((int64_t)1 << b) * l + 1;
-                                        if (Givaro::Protected::probab_prime(tmp, 25) >= 1) {
-                                                primes.push_back(tmp);
-                                                prod*=tmp;
-                                                if (prod > bound)
-                                                        return primes;
-                                        }
-                                }
-                        linbox_check(prod > bound ); // Could not find enough primes
-                        return primes;
-                }
-
-                  // generate a vector of distinct FFT primes with largest 2-valuation
-                // s.t. their product is larger than a given bound
-                inline bool generatePrimes (const Prime_Type & bound, std::vector<Prime_Type> &primes) const {
-                        primes.clear();
-                        Prime_Type prod=1;
-                        integer tmp;
-                        for (int64_t b = (int64_t)_bits - 1; b >= 0; b--)
-                                for (int64_t l = (1L << ((int64_t)_bits - b - 1)) + 1; l < (1L << ((int64_t)_bits - b)); l +=2) {
-                                        tmp = (1L << b) * l + 1;
-                                        if (Givaro::Protected::probab_prime(tmp, 25) >= 1) {
-                                                primes.push_back(tmp);
-                                                prod*=tmp;
-                                                if (prod > bound){
-                                                        return true;
-                                                }
-                                        }
-                                }
-                        return false; // false -> Could not find enough primes
-                }
-
-                size_t twoVal(integer t) const {
-                        integer x=t;
-                        size_t v=0;
-                        while(x%2 == 0) {v++;x/=2;}
-                        return v;
-                }
-
-                // generate a vector of distinct FFT primes with  2-valuation largest than val
-                // s.t. their product is larger than a given bound
-                inline bool generatePrimes ( uint64_t val, const Prime_Type & bound, std::vector<Prime_Type> &primes) const {
-                        primes.clear();
-                        Prime_Type prod=1;
-                        integer tmp;
-                        // std::cout<<"rns bound: "<<bound<<std::endl;
-                        // std::cout<<"2 valuation: "<<val<<std::endl;
-                        // std::cout<<"prime bitmax: "<<_bits<<std::endl;
-                        // std::cout<<"prime max: "<<_prime_bound<<std::endl;
-
-                        if (val > _bits) return false;
+		// generate a vector of distinct FFT primes with largest 2-valuation
+		// s.t. their product is larger than a given bound
+		inline std::vector<Prime_Type> generatePrimes (const Prime_Type & bound) const {
+			std::vector<Prime_Type> primes;
+			Prime_Type prod=1;
+			integer tmp;
+			for (int64_t b = _bits - 1; b >= 0; b--)
+				for (int64_t l = ((int64_t)1 << (_bits - b - 1)) + 1; l < (1L << (_bits - b)); l +=2) {
+					tmp = ((int64_t)1 << b) * l + 1;
+					if (Givaro::Protected::probab_prime(tmp, 25) >= 1) {
+						primes.push_back(tmp);
+						prod*=tmp;
+						if (prod > bound)
+							return primes;
+					}
+				}
+			linbox_check(prod > bound ); // Could not find enough primes
+			return primes;
+		}
+
+		// generate a vector of distinct FFT primes with largest 2-valuation
+		// s.t. their product is larger than a given bound
+		inline bool generatePrimes (const Prime_Type & bound, std::vector<Prime_Type> &primes) const {
+			primes.clear();
+			Prime_Type prod=1;
+			integer tmp;
+			for (int64_t b = (int64_t)_bits - 1; b >= 0; b--)
+				for (int64_t l = (1L << ((int64_t)_bits - b - 1)) + 1; l < (1L << ((int64_t)_bits - b)); l +=2) {
+					tmp = (1L << b) * l + 1;
+					if (Givaro::Protected::probab_prime(tmp, 25) >= 1) {
+						primes.push_back(tmp);
+						prod*=tmp;
+						if (prod > bound){
+							return true;
+						}
+					}
+				}
+			return false; // false -> Could not find enough primes
+		}
+
+		size_t twoVal(integer t) const {
+			integer x=t;
+			size_t v=0;
+			while(x%2 == 0) {v++;x/=2;}
+			return v;
+		}
+
+		// generate a vector of distinct FFT primes with 2-valuation at least val
+		// s.t. their product is larger than a given bound
+		inline bool generatePrimes ( uint64_t val, const Prime_Type & bound, std::vector<Prime_Type> &primes) const {
+			primes.clear();
+			Prime_Type prod=1;
+			integer tmp;
+			// std::cout<<"rns bound: "<<bound<<std::endl;
+			// std::cout<<"2 valuation: "<<val<<std::endl;
+			// std::cout<<"prime bitmax: "<<_bits<<std::endl;
+			// std::cout<<"prime max: "<<_prime_bound<<std::endl;
+
+			if (val > _bits) return false;
 
 #if 0
-                        for (int64_t b = (int64_t)_bits; b >= (int64_t)val; b--)
-                                // for (uint64_t l = (1ULL << ((int64_t)_bits - b - 1)) + 1; l < (1ULL << ((int64_t)_bits - b)); l +=2) {
-                                for (int64_t l = ((int64_t)1 << ((int64_t)_bits - b)) - 1; l >=1; l -=2) {
-                                        tmp = ((int64_t)1 << b) * l + 1;                                        
-                                        if (Givaro::Protected::probab_prime(tmp, 25) >= 1) {
-                                                primes.push_back(tmp);
-                                                prod*=tmp;
-                                                //std::cout<<tmp<<" -> "<<tmp.bitsize()<<" (order="<<twoVal(tmp-1)<<") "<<prod<<std::endl;
-                                                if (prod > bound){
-                                                        return true;
-                                                }
-                                        }
-                                }
+			for (int64_t b = (int64_t)_bits; b >= (int64_t)val; b--)
+				// for (uint64_t l = (1ULL << ((int64_t)_bits - b - 1)) + 1; l < (1ULL << ((int64_t)_bits - b)); l +=2) {
+				for (int64_t l = ((int64_t)1 << ((int64_t)_bits - b)) - 1; l >=1; l -=2) {
+					tmp = ((int64_t)1 << b) * l + 1;
+					if (Givaro::Protected::probab_prime(tmp, 25) >= 1) {
+						primes.push_back(tmp);
+						prod*=tmp;
+						//std::cout<<tmp<<" -> "<<tmp.bitsize()<<" (order="<<twoVal(tmp-1)<<") "<<prod<<std::endl;
+						if (prod > bound){
+							return true;
+						}
+					}
+				}
 #else
-                        for (int64_t l = (_prime_bound -1) >>val ; l >=1; l -=1) {
-                                tmp = ((int64_t)1 << val) * l + 1;                                        
-                                if (Givaro::Protected::probab_prime(tmp, 25) >= 1) {
-                                        primes.push_back(tmp);
-                                        prod*=tmp;
-                                        //std::cout<<tmp<<" -> "<<tmp.bitsize()<<" (order="<<twoVal(tmp-1)<<") "<<prod<<std::endl;
-                                        if (prod > bound){
-                                                return true;
-                                        }
-                                }
-                        }
-                                
-                        
+			for (int64_t l = (_prime_bound -1) >>val ; l >=1; l -=1) {
+				tmp = ((int64_t)1 << val) * l + 1;
+				if (Givaro::Protected::probab_prime(tmp, 25) >= 1) {
+					primes.push_back(tmp);
+					prod*=tmp;
+					//std::cout<<tmp<<" -> "<<tmp.bitsize()<<" (order="<<twoVal(tmp-1)<<") "<<prod<<std::endl;
+					if (prod > bound){
+						// try to replace the last prime with a smaller one
+						for (int64_t k=1;k<l;k++){
+							tmp = ((int64_t)1 << val) * k + 1;
+							if (Givaro::Protected::probab_prime(tmp, 25) >= 1) {
+								if (prod*tmp > bound*primes.back()){
+									//std::cout<<"replacing prime "<<primes.back()<<" with "<<tmp<< " -> "<<tmp.bitsize()<<" (order="<<twoVal(tmp-1)<<") ";
+									prod/=primes.back();
+									primes.back()=tmp;
+									prod*=tmp;
+									//std::cout<<prod<<std::endl;
+									return true;
+								}
+							}
+						}
+
+						return true;
+					}
+				}
+			}
+
+
 #endif
-                        return false; // false -> Could not find enough primes
-                }
+			return false; // false -> Could not find enough primes
+		}
 
 		/** @brief setSeed (unsigned long ul)
 		 *  Set the random seed to be ul.
@@ -222,13 +239,3 @@ namespace LinBox
 }
 
 #endif //__LINBOX_random_fftprime_H
-
-
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,:0,t0,+0,=s
-// Local Variables:
-// mode: C++
-// tab-width: 8
-// indent-tabs-mode: nil
-// c-basic-offset: 8
-// End:
-
diff --git a/linbox/ring/modular/Makefile.am b/linbox/ring/modular/Makefile.am
index 7234edb..88f62fe 100644
--- a/linbox/ring/modular/Makefile.am
+++ b/linbox/ring/modular/Makefile.am
@@ -23,7 +23,6 @@ pkgincludesubdir=$(pkgincludedir)/ring/modular
 
 BASIC_HDRS =            \
     modular-unsigned.h  \
-    modular-unsigned.inl         \
     modular-int32.h     \
     modular-int64.h     \
     modular-short.h     \
diff --git a/linbox/ring/modular/modular-int32.h b/linbox/ring/modular/modular-int32.h
index 7c02cff..44889e3 100644
--- a/linbox/ring/modular/modular-int32.h
+++ b/linbox/ring/modular/modular-int32.h
@@ -57,8 +57,7 @@ namespace LinBox
 	template<class Field>
 	class MVProductDomain;
 
-	template <>
-	template<class Compute>
+        template<class Compute>
 	class FieldAXPY<Givaro::Modular<int32_t,Compute> > {
 	public:
 
@@ -133,7 +132,6 @@ namespace LinBox
 	};
 
 
-	template <>
 	template <class Compute>
 	class DotProductDomain<Givaro::Modular<int32_t,Compute> > : public VectorDomainBase<Givaro::Modular<int32_t,Compute> > {
 
@@ -201,7 +199,6 @@ namespace LinBox
 
 	// Specialization of MVProductDomain for int32_t modular field
 
-	template <>
 	template <class Compute>
 	class MVProductDomain<Givaro::Modular<int32_t,Compute> > {
 	public:
@@ -222,198 +219,176 @@ namespace LinBox
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::DenseVectorTag) const;
-		template <class Vector1, class Matrix, class Vector2>
-		Vector1 &mulColDenseSpecialized
-		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseSequenceVectorTag) const;
-		template <class Vector1, class Matrix, class Vector2>
-		Vector1 &mulColDenseSpecialized
-		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseAssociativeVectorTag) const;
-		template <class Vector1, class Matrix, class Vector2>
-		Vector1 &mulColDenseSpecialized
-		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseParallelVectorTag) const;
+		 VectorCategories::DenseVectorTag) const
+                {
 
-		mutable std::vector<uint64_t> _tmp;
-	};
+                        linbox_check (A.coldim () == v.size ());
+                        linbox_check (A.rowdim () == w.size ());
 
-	template <class Compute>
-	template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<int32_t,Compute> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<int32_t,Compute> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::DenseVectorTag) const
-	{
-
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j;
-		typename Matrix::Column::const_iterator k;
-		std::vector<uint64_t>::iterator l;
+                        typename Matrix::ConstColIterator i = A.colBegin ();
+                        typename Vector2::const_iterator j;
+                        typename Matrix::Column::const_iterator k;
+                        std::vector<uint64_t>::iterator l;
 
-		uint64_t t;
+                        uint64_t t;
 
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
+                        if (_tmp.size () < w.size ())
+                                _tmp.resize (w.size ());
 
-		std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
+                        std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
 
-		for (j = v.begin (); j != v.end (); ++j, ++i)
-		{
-			for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
-			{
-				t = ((uint64_t) *k) * ((uint64_t) *j);
+                        for (j = v.begin (); j != v.end (); ++j, ++i)
+                                {
+                                        for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
+                                                {
+                                                        t = ((uint64_t) *k) * ((uint64_t) *j);
 
-				*l += t;
+                                                        *l += t;
 
-				if (*l < t)
-					*l += (uint64_t) VD.faxpy ()._two_64;
-			}
-		}
+                                                        if (*l < t)
+                                                                *l += (uint64_t) VD.faxpy ()._two_64;
+                                                }
+                                }
 
-		typename Vector1::iterator w_j;
-		typedef typename Vector1::value_type elements ;
+                        typename Vector1::iterator w_j;
+                        typedef typename Vector1::value_type elements ;
 
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = elements(*l % VD.field ().characteristic());
+                        for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                                *w_j = elements(*l % VD.field ().characteristic());
 
-		return w;
-	}
+                        return w;
+                }
 
-	template <class Compute>
-	template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<int32_t,Compute> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<int32_t,Compute> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::SparseSequenceVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
+		template <class Vector1, class Matrix, class Vector2>
+		Vector1 &mulColDenseSpecialized
+		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
+		 VectorCategories::SparseSequenceVectorTag) const
+                {
+                        linbox_check (A.coldim () == v.size ());
+                        linbox_check (A.rowdim () == w.size ());
 
-		typename Matrix::ConstColIterator       i = A.colBegin ();
-		typename Vector2::const_iterator        j;
-		typename Matrix::Column::const_iterator k;
-		std::vector<uint64_t>::iterator         l;
+                        typename Matrix::ConstColIterator       i = A.colBegin ();
+                        typename Vector2::const_iterator        j;
+                        typename Matrix::Column::const_iterator k;
+                        std::vector<uint64_t>::iterator         l;
 
-		uint64_t t;
+                        uint64_t t;
 
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
+                        if (_tmp.size () < w.size ())
+                                _tmp.resize (w.size ());
 
-		std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
+                        std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
 
-		for (j = v.begin (); j != v.end (); ++j, ++i) {
-			for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l) {
-				t = ((uint64_t) k->second) * ((uint64_t) *j);
+                        for (j = v.begin (); j != v.end (); ++j, ++i) {
+                                for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l) {
+                                        t = ((uint64_t) k->second) * ((uint64_t) *j);
 
-				_tmp[k->first] += t;
+                                        _tmp[k->first] += t;
 
-				if (_tmp[k->first] < t)
-					_tmp[k->first] += (uint64_t)VD.faxpy ()._two_64;
-			}
-		}
+                                        if (_tmp[k->first] < t)
+                                                _tmp[k->first] += (uint64_t)VD.faxpy ()._two_64;
+                                }
+                        }
 
-		typename Vector1::iterator w_j;
-		typedef typename Vector1::value_type val_t;
+                        typename Vector1::iterator w_j;
+                        typedef typename Vector1::value_type val_t;
 
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = (val_t)( (int32_t)(*l) % VD.field ().characteristic() );
+                        for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                                *w_j = (val_t)( (int32_t)(*l) % VD.field ().characteristic() );
 
-		return w;
-	}
+                        return w;
+                }
 
-	template <class Compute>
-	template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<int32_t,Compute> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<int32_t,Compute> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::SparseAssociativeVectorTag) const
-	{
+		template <class Vector1, class Matrix, class Vector2>
+		Vector1 &mulColDenseSpecialized
+		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
+		 VectorCategories::SparseAssociativeVectorTag) const
+                {
 
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
+                        linbox_check (A.coldim () == v.size ());
+                        linbox_check (A.rowdim () == w.size ());
 
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j;
-		typename Matrix::Column::const_iterator k;
-		std::vector<uint64_t>::iterator l;
+                        typename Matrix::ConstColIterator i = A.colBegin ();
+                        typename Vector2::const_iterator j;
+                        typename Matrix::Column::const_iterator k;
+                        std::vector<uint64_t>::iterator l;
 
-		uint64_t t;
+                        uint64_t t;
 
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
+                        if (_tmp.size () < w.size ())
+                                _tmp.resize (w.size ());
 
-		std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
+                        std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
 
-		for (j = v.begin (); j != v.end (); ++j, ++i)
-		{
-			for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
-			{
-				t = ((uint64_t) k->second) * ((uint64_t) *j);
+                        for (j = v.begin (); j != v.end (); ++j, ++i)
+                                {
+                                        for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
+                                                {
+                                                        t = ((uint64_t) k->second) * ((uint64_t) *j);
 
-				_tmp[k->first] += t;
+                                                        _tmp[k->first] += t;
 
-				if (_tmp[k->first] < t)
-					_tmp[k->first] += VD.faxpy ()._two_64;
-			}
-		}
+                                                        if (_tmp[k->first] < t)
+                                                                _tmp[k->first] += VD.faxpy ()._two_64;
+                                                }
+                                }
 
-		typename Vector1::iterator w_j;
+                        typename Vector1::iterator w_j;
 
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l % VD.field ().characteristic();
+                        for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                                *w_j = *l % VD.field ().characteristic();
 
-		return w;
-	}
+                        return w;
+                }
 
-	template <class Compute>
-	template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<int32_t,Compute> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<int32_t,Compute> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::SparseParallelVectorTag) const
-	{
+		template <class Vector1, class Matrix, class Vector2>
+		Vector1 &mulColDenseSpecialized
+		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
+		 VectorCategories::SparseParallelVectorTag) const
+                {
 
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
+                        linbox_check (A.coldim () == v.size ());
+                        linbox_check (A.rowdim () == w.size ());
 
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j;
-		typename Matrix::Column::first_type::const_iterator k_idx;
-		typename Matrix::Column::second_type::const_iterator k_elt;
-		std::vector<uint64_t>::iterator l;
+                        typename Matrix::ConstColIterator i = A.colBegin ();
+                        typename Vector2::const_iterator j;
+                        typename Matrix::Column::first_type::const_iterator k_idx;
+                        typename Matrix::Column::second_type::const_iterator k_elt;
+                        std::vector<uint64_t>::iterator l;
 
-		uint64_t t;
+                        uint64_t t;
 
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
+                        if (_tmp.size () < w.size ())
+                                _tmp.resize (w.size ());
 
-		std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
+                        std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
 
-		for (j = v.begin (); j != v.end (); ++j, ++i)
-		{
-			for (k_idx = i->first.begin (), k_elt = i->second.begin (), l = _tmp.begin ();
-			     k_idx != i->first.end ();
-			     ++k_idx, ++k_elt, ++l)
-			{
-				t = ((uint64_t) *k_elt) * ((uint64_t) *j);
+                        for (j = v.begin (); j != v.end (); ++j, ++i)
+                                {
+                                        for (k_idx = i->first.begin (), k_elt = i->second.begin (), l = _tmp.begin ();
+                                             k_idx != i->first.end ();
+                                             ++k_idx, ++k_elt, ++l)
+                                                {
+                                                        t = ((uint64_t) *k_elt) * ((uint64_t) *j);
 
-				_tmp[*k_idx] += t;
+                                                        _tmp[*k_idx] += t;
 
-				if (_tmp[*k_idx] < t)
-					_tmp[*k_idx] += VD.faxpy()._two_64;
-			}
-		}
+                                                        if (_tmp[*k_idx] < t)
+                                                                _tmp[*k_idx] += VD.faxpy()._two_64;
+                                                }
+                                }
 
-		typename Vector1::iterator w_j;
+                        typename Vector1::iterator w_j;
 
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l % VD.field().characteristic();
+                        for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                                *w_j = *l % VD.field().characteristic();
 
-		return w;
-	}
+                        return w;
+                }
 
 
+		mutable std::vector<uint64_t> _tmp;
+	};
 }
 
 #endif //__LINBOX_modular_int32_H
diff --git a/linbox/ring/modular/modular-int64.h b/linbox/ring/modular/modular-int64.h
index 7a860a7..8c0c563 100644
--- a/linbox/ring/modular/modular-int64.h
+++ b/linbox/ring/modular/modular-int64.h
@@ -61,7 +61,6 @@ namespace LinBox
 	template<class Field>
 	class MVProductDomain;
 
-	template <>
 	template <typename Compute_t>
 	class FieldAXPY<Givaro::Modular<int64_t,Compute_t> > {
 	public:
@@ -134,7 +133,6 @@ namespace LinBox
 	};
 
 
-	template <>
 	template <typename Compute_t>
 	class DotProductDomain<Givaro::Modular<int64_t,Compute_t> > : public VectorDomainBase<Givaro::Modular<int64_t,Compute_t> > {
 
@@ -201,7 +199,6 @@ namespace LinBox
 
 	// Specialization of MVProductDomain for int64_t modular field
 
-	template <>
 	template <typename Compute_t>
 	class MVProductDomain<Givaro::Modular<int64_t,Compute_t> > {
 	public:
@@ -222,210 +219,175 @@ namespace LinBox
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field> &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::DenseVectorTag) const;
-		template <class Vector1, class Matrix, class Vector2>
-		Vector1 &mulColDenseSpecialized
-		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseSequenceVectorTag) const;
-		template <class Vector1, class Matrix, class Vector2>
-		Vector1 &mulColDenseSpecialized
-		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseAssociativeVectorTag) const;
-		template <class Vector1, class Matrix, class Vector2>
-		Vector1 &mulColDenseSpecialized
-		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseParallelVectorTag) const;
+		 VectorCategories::DenseVectorTag) const  // w <- A v; each column of A is walked as a dense vector
+                {
 
-		mutable std::vector<uint64_t> _tmp;
-	};
-	
-	template <typename Compute_t>
-	template <class Vector1, class Matrix, class Vector2>
-	Vector1 & MVProductDomain<Givaro::Modular<int64_t,Compute_t> >::
-	mulColDenseSpecialized (const VectorDomain<Givaro::Modular<int64_t,Compute_t> > &VD,
-				Vector1 &w,
-				const Matrix &A,
-				const Vector2 &v,
-				VectorCategories::DenseVectorTag) const
-	{
+                        linbox_check (A.coldim () == v.size ());
+                        linbox_check (A.rowdim () == w.size ());
 
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
+                        typename Matrix::ConstColIterator i = A.colBegin ();
+                        typename Vector2::const_iterator j;
+                        typename Matrix::Column::const_iterator k;
+                        std::vector<uint64_t>::iterator l;
 
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j;
-		typename Matrix::Column::const_iterator k;
-		std::vector<uint64_t>::iterator l;
+                        uint64_t t;
 
-		uint64_t t;
+                        if (_tmp.size () < w.size ())  // the scratch accumulator only ever grows
+                                _tmp.resize (w.size ());
 
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
+                        std::fill (_tmp.begin (), _tmp.begin () + w.size (), 0);  // clear just the entries used below
 
-		std::fill (_tmp.begin (), _tmp.begin () + w.size (), 0);
+                        for (j = v.begin (); j != v.end (); ++j, ++i)  // one column of A per entry of v
+                                {
+                                        for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
+                                                {
+                                                        t = ((uint64_t) *k) * ((uint64_t) *j);
 
-		for (j = v.begin (); j != v.end (); ++j, ++i)
-		{
-			for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
-			{
-				t = ((uint64_t) *k) * ((uint64_t) *j);
+                                                        *l += t;
 
-				*l += t;
+                                                        if (*l < t)  // the unsigned sum wrapped around
+                                                                *l += VD.faxpy()._two_64;  // presumably 2^64 mod p -- confirm in FieldAXPY
+                                                }
+                                }
 
-				if (*l < t)
-					*l += VD.faxpy()._two_64;
-			}
-		}
+                        typename Vector1::iterator w_j;
 
-		typename Vector1::iterator w_j;
+                        for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                                *w_j = *l % VD.field ().characteristic();  // final reduction into w
 
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l % VD.field ().characteristic();
+                        return w;
+                }
+		template <class Vector1, class Matrix, class Vector2>
+		Vector1 &mulColDenseSpecialized
+		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
+		 VectorCategories::SparseSequenceVectorTag) const  // w <- A v; column entries are read as (first, second) pairs
+                {
+                        linbox_check (A.coldim () == v.size ());
+                        linbox_check (A.rowdim () == w.size ());
 
-		return w;
-	}
+                        typename Matrix::ConstColIterator i = A.colBegin ();
+                        typename Vector2::const_iterator j;
+                        typename Matrix::Column::const_iterator k;
+                        std::vector<uint64_t>::iterator l;
 
-	template <typename Compute_t>
-	template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<int64_t,Compute_t> >::
-	mulColDenseSpecialized (const VectorDomain<Givaro::Modular<int64_t,Compute_t> > &VD,
-				Vector1 &w,
-				const Matrix &A,
-				const Vector2 &v,
-				VectorCategories::SparseSequenceVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j;
-		typename Matrix::Column::const_iterator k;
-		std::vector<uint64_t>::iterator l;
-
-		uint64_t t;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () + w.size (), 0);
-
-		for (j = v.begin (); j != v.end (); ++j, ++i)
-		{
-			for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
-			{
-				t = ((uint64_t) k->second) * ((uint64_t) *j);
+                        uint64_t t;
 
-				_tmp[k->first] += t;
+                        if (_tmp.size () < w.size ())  // the scratch accumulator only ever grows
+                                _tmp.resize (w.size ());
 
-				if (_tmp[k->first] < t)
-					_tmp[k->first] += VD.faxpy()._two_64;
-			}
-		}
+                        std::fill (_tmp.begin (), _tmp.begin () + w.size (), 0);  // clear just the entries used below
 
-		typename Vector1::iterator w_j;
+                        for (j = v.begin (); j != v.end (); ++j, ++i)  // one column of A per entry of v
+                                {
+                                        for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)  // l is advanced but not read in this loop
+                                                {
+                                                        t = ((uint64_t) k->second) * ((uint64_t) *j);
 
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l % VD.field ().characteristic();
+                                                        _tmp[k->first] += t;  // k->first selects the accumulator slot
 
-		return w;
-	}
+                                                        if (_tmp[k->first] < t)  // the unsigned sum wrapped around
+                                                                _tmp[k->first] += VD.faxpy()._two_64;  // presumably 2^64 mod p -- confirm in FieldAXPY
+                                                }
+                                }
 
-	template <typename Compute_t>
-	template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<int64_t,Compute_t> > ::
-	mulColDenseSpecialized(const VectorDomain<Givaro::Modular<int64_t,Compute_t> > &VD,
-			       Vector1 &w,
-			       const Matrix &A,
-			       const Vector2 &v,
-			       VectorCategories::SparseAssociativeVectorTag) const
-	{
+                        typename Vector1::iterator w_j;
 
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
+                        for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                                *w_j = *l % VD.field ().characteristic();  // final reduction into w
 
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j;
-		typename Matrix::Column::const_iterator k;
-		std::vector<uint64_t>::iterator l;
+                        return w;
+                }
 
-		uint64_t t;
+		template <class Vector1, class Matrix, class Vector2>
+		Vector1 &mulColDenseSpecialized
+		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
+		 VectorCategories::SparseAssociativeVectorTag) const  // w <- A v; column entries are read as (first, second) pairs
+                {
 
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
+                        linbox_check (A.coldim () == v.size ());
+                        linbox_check (A.rowdim () == w.size ());
 
-		std::fill (_tmp.begin (), _tmp.begin () + w.size (), 0);
+                        typename Matrix::ConstColIterator i = A.colBegin ();
+                        typename Vector2::const_iterator j;
+                        typename Matrix::Column::const_iterator k;
+                        std::vector<uint64_t>::iterator l;
 
-		for (j = v.begin (); j != v.end (); ++j, ++i)
-		{
-			for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
-			{
-				t = ((uint64_t) k->second) * ((uint64_t) *j);
+                        uint64_t t;
 
-				_tmp[k->first] += t;
+                        if (_tmp.size () < w.size ())  // the scratch accumulator only ever grows
+                                _tmp.resize (w.size ());
 
-				if (_tmp[k->first] < t)
-					_tmp[k->first] += VD.faxpy()._two_64;
-			}
-		}
+                        std::fill (_tmp.begin (), _tmp.begin () + w.size (), 0);  // clear just the entries used below
 
-		typename Vector1::iterator w_j;
+                        for (j = v.begin (); j != v.end (); ++j, ++i)  // one column of A per entry of v
+                                {
+                                        for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)  // l is advanced but not read in this loop
+                                                {
+                                                        t = ((uint64_t) k->second) * ((uint64_t) *j);
 
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l % VD.field ().characteristic();
+                                                        _tmp[k->first] += t;  // k->first selects the accumulator slot
 
-		return w;
-	}
+                                                        if (_tmp[k->first] < t)  // the unsigned sum wrapped around
+                                                                _tmp[k->first] += VD.faxpy()._two_64;  // presumably 2^64 mod p -- confirm in FieldAXPY
+                                                }
+                                }
 
-	template <typename Compute_t>
-	template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<int64_t,Compute_t> > ::
-	mulColDenseSpecialized (const VectorDomain<Givaro::Modular<int64_t,Compute_t> > &VD,
-				Vector1 &w,
-				const Matrix &A,
-				const Vector2 &v,
-				VectorCategories::SparseParallelVectorTag) const
-	{
+                        typename Vector1::iterator w_j;
+                        // Reduce every accumulator modulo the characteristic while copying into w.
+                        for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                                *w_j = *l % VD.field ().characteristic();
 
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
+                        return w;
+                }
 
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j;
-		typename Matrix::Column::first_type::const_iterator k_idx;
-		typename Matrix::Column::second_type::const_iterator k_elt;
-		std::vector<uint64_t>::iterator l;
+		template <class Vector1, class Matrix, class Vector2>
+		Vector1 &mulColDenseSpecialized
+		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
+		 VectorCategories::SparseParallelVectorTag) const  // w <- A v; each column is a pair (first, second) of parallel sequences
+                {
 
-		uint64_t t;
+                        linbox_check (A.coldim () == v.size ());
+                        linbox_check (A.rowdim () == w.size ());
 
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
+                        typename Matrix::ConstColIterator i = A.colBegin ();
+                        typename Vector2::const_iterator j;
+                        typename Matrix::Column::first_type::const_iterator k_idx;
+                        typename Matrix::Column::second_type::const_iterator k_elt;
+                        std::vector<uint64_t>::iterator l;
 
-		std::fill (_tmp.begin (), _tmp.begin () + w.size (), 0);
+                        uint64_t t;
 
-		for (j = v.begin (); j != v.end (); ++j, ++i)
-		{
-			for (k_idx = i->first.begin (), k_elt = i->second.begin (), l = _tmp.begin ();
-			     k_idx != i->first.end ();
-			     ++k_idx, ++k_elt, ++l)
-			{
-				t = ((uint64_t) *k_elt) * ((uint64_t) *j);
+                        if (_tmp.size () < w.size ())  // the scratch accumulator only ever grows
+                                _tmp.resize (w.size ());
 
-				_tmp[*k_idx] += t;
+                        std::fill (_tmp.begin (), _tmp.begin () + w.size (), 0);  // clear just the entries used below
 
-				if (_tmp[*k_idx] < t)
-					_tmp[*k_idx] += VD.faxpy()._two_64;
-			}
-		}
+                        for (j = v.begin (); j != v.end (); ++j, ++i)  // one column of A per entry of v
+                                {
+                                        for (k_idx = i->first.begin (), k_elt = i->second.begin (), l = _tmp.begin ();
+                                             k_idx != i->first.end ();
+                                             ++k_idx, ++k_elt, ++l)  // l is advanced but not read in this loop
+                                                {
+                                                        t = ((uint64_t) *k_elt) * ((uint64_t) *j);
+                                                        // *k_idx selects the accumulator slot for this entry.
+                                                        _tmp[*k_idx] += t;
+                                                        // A sum smaller than its addend means the unsigned addition wrapped.
+                                                        if (_tmp[*k_idx] < t)
+                                                                _tmp[*k_idx] += VD.faxpy()._two_64;  // presumably 2^64 mod p -- confirm in FieldAXPY
+                                                }
+                                }
 
-		typename Vector1::iterator w_j;
+                        typename Vector1::iterator w_j;
 
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l % VD.field ().characteristic();
+                        for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                                *w_j = *l % VD.field ().characteristic();  // final reduction into w
 
-		return w;
-	}
+                        return w;
+                }
 
 
+		mutable std::vector<uint64_t> _tmp;
+	};
 }
 
 #undef LINBOX_MAX_INT64
diff --git a/linbox/ring/modular/modular-unsigned.h b/linbox/ring/modular/modular-unsigned.h
index ec7230a..a94f058 100755
--- a/linbox/ring/modular/modular-unsigned.h
+++ b/linbox/ring/modular/modular-unsigned.h
@@ -40,12 +40,16 @@
 #ifndef __LINBOX_field_modular_unsigned_H
 #define __LINBOX_field_modular_unsigned_H
 
-namespace LinBox { /*  uint8_t */
+//Dan Roche 7-2-04
+#ifndef __LINBOX_MIN
+#define __LINBOX_MIN(a,b) ( (a) < (b) ? (a) : (b) )
+#endif
 
-	/*! Specialization of FieldAXPY for uint8_t modular field */
+namespace LinBox { /*  uint8_t */
 
-	template <>
-	template<class Compute_t>
+            /*! Specialization of FieldAXPY for uint8_t modular field */
+    
+    template<class Compute_t>
 	class FieldAXPY<Givaro::Modular<uint8_t,Compute_t > > {
 	public:
 
@@ -54,71 +58,71 @@ namespace LinBox { /*  uint8_t */
 		typedef Givaro::Modular<uint8_t, Compute_t> Field;
 
 		FieldAXPY (const Field &F) :
-			_k (((uint64_t) -1LL) / ((F.characteristic() - 1) * (F.characteristic() - 1))),
-			_field (&F),
-			_y (0),
-			i (_k)
-		{
-		}
+                _k (((uint64_t) -1LL) / ((F.characteristic() - 1) * (F.characteristic() - 1))),  // largest 64-bit value divided by (p-1)^2
+                _field (&F),  // F is kept by address, not copied
+                _y (0),
+                i (_k)  // countdown consumed by mulacc()/accumulate()
+            {
+            }
 
 		FieldAXPY (const FieldAXPY &faxpy) :
-			_k (faxpy._k),
-			_field (faxpy._field),
-			_y (0),
-			i (_k)
-		{}
+                _k (faxpy._k),
+                _field (faxpy._field),
+                _y (0),  // the copy starts with an empty accumulator rather than faxpy._y
+                i (_k)   // and with a fresh countdown
+            {}
 
 		FieldAXPY<Field> &operator = (const FieldAXPY &faxpy)
-		{
-			_field = faxpy._field;
-			_y = faxpy._y;
-			_k = faxpy._k;
-			return *this;
-		}
-
+            {
+                _field = faxpy._field;
+                _y = faxpy._y;  // unlike the copy constructor, assignment keeps the source accumulator
+                _k = faxpy._k;  // NOTE(review): the countdown i is not assigned here -- confirm intent
+                return *this;
+            }
+        
 		inline uint64_t& mulacc (const Element &a, const Element &x)
-		{
-			uint32_t t = (uint32_t) a * (uint32_t) x;
+            {
+                uint32_t t = (uint32_t) a * (uint32_t) x;  // product formed in 32 bits
 
-			if (!i--) {
-				i = int(_k);
-				return _y = _y % (uint32_t) field().characteristic() + t;
-			}
-			else
-				return _y += t;
-		}
+                if (!i--) {  // countdown exhausted: reduce the accumulator before adding
+                    i = int(_k);  // NOTE(review): narrows _k to int although the constructor uses i (_k) -- confirm intent
+                    return _y = _y % (uint32_t) field().characteristic() + t;
+                }
+                else
+                    return _y += t;  // otherwise accumulate without reducing
+            }
 
 		inline uint64_t& accumulate (const Element &t)
-		{
+            {  // same countdown scheme as mulacc(), but t is added as given
 
-			if (!i--) {
-				i = int(_k);
-				return _y = _y % (uint32_t) field().characteristic() + t;
-			}
-			else
-				return _y += t;
-		}
+                if (!i--) {  // countdown exhausted: reduce the accumulator before adding
+                    i = int(_k);  // NOTE(review): narrows _k to int although the constructor uses i (_k) -- confirm intent
+                    return _y = _y % (uint32_t) field().characteristic() + t;
+                }
+                else
+                    return _y += t;
+            }
 
 		inline Element &get (Element &y) const
-		{
-			const_cast<FieldAXPY<Field>*>(this)->_y %= (uint32_t) field().characteristic();
-			if ((int32_t) _y < 0) const_cast<FieldAXPY<Field>*>(this)->_y += field().characteristic();
-			y = (uint8_t) _y;
-			const_cast<FieldAXPY<Field>*>(this)->i = int(_k);
-			return y;
-		}
+            {  // const member that still reduces _y and resets i in place through const_cast
+                const_cast<FieldAXPY<Field>*>(this)->_y %= (uint32_t) field().characteristic();
+                if ((int32_t) _y < 0) const_cast<FieldAXPY<Field>*>(this)->_y += field().characteristic();  // sign fix-up on the low 32 bits of _y
+                y = (uint8_t) _y;  // the reduced value is handed back through y
+                const_cast<FieldAXPY<Field>*>(this)->i = int(_k);  // restart the countdown
+                return y;
+            }
 
 		inline FieldAXPY &assign (const Element y)
-		{
-			_y = y;
-			i = int(_k);
-			return *this;
-		}
+            {
+                _y = y;  // overwrite the accumulator with y
+                i = int(_k);  // and restart the countdown
+                return *this;
+            }
 
 		inline void reset()
-		{
-			_y = 0;
-		}
+            {
+                _y = 0;  // clears the accumulator only; the countdown i is left as it is
+            }
 
 		inline const Field & field() const { return *_field; }
 		
@@ -131,9 +135,8 @@ namespace LinBox { /*  uint8_t */
 		int64_t i;
 	};
 
-	//! Specialization of DotProductDomain for unsigned short modular field
+        //! Specialization of DotProductDomain for uint8_t modular field
 
-	template <>
 	template <class Compute_t>
 	class DotProductDomain<Givaro::Modular<uint8_t, Compute_t> > : public  VectorDomainBase<Givaro::Modular<uint8_t, Compute_t> > {
 	public:
@@ -143,22 +146,84 @@ namespace LinBox { /*  uint8_t */
 
 		DotProductDomain(){}
 		DotProductDomain (const Field &F) :
-			VectorDomainBase<Field> (F)
-		{}
+                VectorDomainBase<Field> (F)  // F is forwarded to the VectorDomainBase base
+            {}
 		using VectorDomainBase<Field>::field;
 		using VectorDomainBase<Field>::faxpy;
 
 	protected:
 		template <class Vector1, class Vector2>
-		inline Element &dotSpecializedDD (Element &res, const Vector1 &v1, const Vector2 &v2) const;
+		inline Element &dotSpecializedDD (Element &res, const Vector1 &v1, const Vector2 &v2) const
+            {
+                typename Vector1::const_iterator i = v1.begin ();
+                typename Vector2::const_iterator j = v2.begin ();
+
+                typename Vector1::const_iterator iterend = v1.begin () + (ptrdiff_t)(v1.size() % faxpy()._k);
+
+                uint64_t y = 0;
+
+                for (; i != iterend; ++i, ++j)
+                    y += (uint64_t) *i * (uint64_t) *j;
+
+                y %= (uint64_t) field().characteristic();
+
+                for (; iterend != v1.end (); j += (ptrdiff_t)faxpy()._k) {
+                    typename Vector1::const_iterator iter_i = iterend;
+                    typename Vector2::const_iterator iter_j;
+
+                    iterend += (ptrdiff_t)faxpy()._k;
+
+                    for (iter_j = j; iter_i != iterend; ++iter_i, ++iter_j)
+                        y += (uint64_t) *iter_i * (uint64_t) *j;
+
+                    y %= (uint64_t) field().characteristic();
+                }
+
+                return res = (uint8_t) y;
+            }
+
 		template <class Vector1, class Vector2>
-		inline Element &dotSpecializedDSP (Element &res, const Vector1 &v1, const Vector2 &v2) const;
-        
+		inline Element &dotSpecializedDSP (Element &res, const Vector1 &v1, const Vector2 &v2) const
+            {
+                typename Vector1::first_type::const_iterator i_idx = v1.first.begin ();
+                typename Vector1::second_type::const_iterator i_elt = v1.second.begin ();
+                // v1 is a pair of parallel sequences (first, second); v2 is indexed by the entries of v1.first.
+                uint64_t y = 0;
+                // Fewer than _k terms: accumulate everything and reduce once.
+                if (v1.first.size () < faxpy()._k) {
+                    for (; i_idx != v1.first.end (); ++i_idx, ++i_elt)
+                        y += (uint64_t) *i_elt * (uint64_t) v2[*i_idx];
+
+                    return res = uint8_t (y % (uint64_t) field().characteristic());
+                }
+                else {
+                    typename Vector1::first_type::const_iterator iterend = v1.first.begin () +(ptrdiff_t)( v1.first.size() % faxpy()._k);
+                    // Leading partial block of size() mod _k terms, reduced before the full blocks start.
+                    for (; i_idx != iterend; ++i_idx, ++i_elt)
+                        y += (uint64_t) *i_elt * (uint64_t) v2[*i_idx];
+
+                    y %= (uint64_t) field().characteristic();
+                    // Remaining terms: blocks of exactly _k, with a reduction after each block.
+                    while (iterend != v1.first.end ()) {
+                        typename Vector1::first_type::const_iterator iter_i_idx = iterend;
+                        typename Vector1::second_type::const_iterator iter_i_elt = i_elt;
+
+                        iterend += (ptrdiff_t)faxpy()._k;
+                        i_elt += (ptrdiff_t)faxpy()._k;  // keep the value iterator in step with the index iterator
+
+                        for (; iter_i_idx != iterend; ++iter_i_idx, ++iter_i_elt)
+                            y += (uint64_t) *iter_i_elt * (uint64_t) v2[*iter_i_idx];
+
+                        y %= (uint64_t) field().characteristic();
+                    }
+
+                    return res = (uint8_t) y;
+                }
+            }
 	};
 
-	//! Specialization of MVProductDomain for uint8_t modular field
+        //! Specialization of MVProductDomain for uint8_t modular field
 
-	template <>
 	template<class Compute_t>
 	class MVProductDomain<Givaro::Modular<uint8_t,Compute_t> > {
 	public:
@@ -170,27 +235,188 @@ namespace LinBox { /*  uint8_t */
 		template <class Vector1, class Matrix, class Vector2>
 		inline Vector1 &mulColDense
 		(const VectorDomain<Field> &VD, Vector1 &w, const Matrix &A, const Vector2 &v) const
-		{
-			return mulColDenseSpecialized (VD, w, A, v, typename VectorTraits<typename Matrix::Column>::VectorCategory ());
-		}
+            {  // tag dispatch: the VectorCategory of Matrix::Column selects the overload
+                return mulColDenseSpecialized (VD, w, A, v, typename VectorTraits<typename Matrix::Column>::VectorCategory ());
+            }
 
 	private:
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field> &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::DenseVectorTag) const;
+		 VectorCategories::DenseVectorTag) const
+            {
+                linbox_check (A.coldim () == v.size ());
+                linbox_check (A.rowdim () == w.size ());
+
+                typename Matrix::ConstColIterator i = A.colBegin ();
+                typename Vector2::const_iterator j, j_end;
+                typename Matrix::Column::const_iterator k;
+                std::vector<uint32_t>::iterator l, l_end;
+
+                if (_tmp.size () < w.size ())
+                    _tmp.resize (w.size ());
+
+                std::fill (_tmp.begin (), _tmp.begin () + (ptrdiff_t)w.size (), 0);
+
+                l_end = _tmp.begin () +(ptrdiff_t) w.size ();
+
+                do {
+                    j = v.begin ();
+                    j_end = j + __LINBOX_MIN (A->coldim (), VD.faxpy()._k);
+
+                    for (; j != j_end; ++j, ++i)
+                        for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
+                            *l += *k * *j;
+
+                    j_end += __LINBOX_MIN (A->coldim () - (j_end - v.begin ()), VD.faxpy()._k);
+
+                    for (l =_tmp.begin (); l != l_end; ++l)
+                        *l %= VD.field ().characteristic();
+
+                } while (j_end != v.end ());
+
+                typename Vector1::iterator w_j;
+
+                for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                    *w_j = *l;
+
+                return w;
+            }
+
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field> &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseSequenceVectorTag) const;
+		 VectorCategories::SparseSequenceVectorTag) const
+            {
+                linbox_check (A.coldim () == v.size ());
+                linbox_check (A.rowdim () == w.size ());
+
+                typename Matrix::ConstColIterator i = A.colBegin ();
+                typename Vector2::const_iterator j, j_end;
+                typename Matrix::Column::const_iterator k;
+                std::vector<uint32_t>::iterator l, l_end;
+
+                if (_tmp.size () < w.size ())
+                    _tmp.resize (w.size ());
+
+                std::fill (_tmp.begin (), _tmp.begin () + (ptrdiff_t)w.size (), 0);
+
+                l_end = _tmp.begin () + (ptrdiff_t)w.size ();
+
+
+                do {
+                    j = v.begin ();
+                    j_end = j + __LINBOX_MIN (A->coldim (), VD.faxpy()._k);
+
+                    for (; j != j_end; ++j, ++i)
+                        for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
+                            _tmp[k->first] += k->second * *j;
+
+                    j_end += __LINBOX_MIN (A->coldim () - (j_end - v.begin ()), VD.faxpy()._k);
+
+                    for (l =_tmp.begin (); l != l_end; ++l)
+                        *l %= VD.field ().characteristic();
+
+                } while (j_end != v.end ());
+
+                typename Vector1::iterator w_j;
+
+                for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                    *w_j = *l;
+
+                return w;
+            }
+
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseAssociativeVectorTag) const;
+		 VectorCategories::SparseAssociativeVectorTag) const
+            {
+                linbox_check (A.coldim () == v.size ());
+                linbox_check (A.rowdim () == w.size ());
+
+                typename Matrix::ConstColIterator i = A.colBegin ();
+                typename Vector2::const_iterator j, j_end;
+                typename Matrix::Column::const_iterator k;
+                std::vector<uint32_t>::iterator l, l_end;
+
+                if (_tmp.size () < w.size ())
+                    _tmp.resize (w.size ());
+
+                std::fill (_tmp.begin (), _tmp.begin () + (ptrdiff_t)w.size (), 0);
+
+                l_end = _tmp.begin () +(ptrdiff_t) w.size ();
+
+                do {
+                    j = v.begin ();
+                    j_end = j + __LINBOX_MIN (A->coldim (), VD.faxpy()._k);
+
+                    for (; j != j_end; ++j, ++i)
+                        for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
+                            _tmp[k->first] += k->second * *j;
+
+                    j_end += __LINBOX_MIN (A->coldim () - (j_end - v.begin ()), VD.faxpy()._k);
+
+                    for (l =_tmp.begin (); l != l_end; ++l)
+                        *l %= VD.field ().characteristic();
+
+                } while (j_end != v.end ());
+
+                typename Vector1::iterator w_j;
+
+                for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                    *w_j = *l;
+
+                return w;
+            }
+
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseParallelVectorTag) const;
+		 VectorCategories::SparseParallelVectorTag) const
+            {
+                linbox_check (A.coldim () == v.size ());
+                linbox_check (A.rowdim () == w.size ());
+
+                typename Matrix::ConstColIterator i = A.colBegin ();
+                typename Vector2::const_iterator j, j_end;
+                typename Matrix::Column::first_type::const_iterator k_idx;
+                typename Matrix::Column::second_type::const_iterator k_elt;
+                std::vector<uint32_t>::iterator l, l_end;
+
+                if (_tmp.size () < w.size ())
+                    _tmp.resize (w.size ());
+
+                std::fill (_tmp.begin (), _tmp.begin () + (ptrdiff_t)w.size (), 0);
+
+                l_end = _tmp.begin () + (ptrdiff_t)w.size ();
+
+                do {
+                    j = v.begin ();
+                    j_end = j + (ptrdiff_t)__LINBOX_MIN (uint64_t (A.coldim ()), VD.faxpy()._k);
+
+                    for (; j != j_end; ++j, ++i)
+                        for (k_idx = i->first.begin (), k_elt = i->second.begin (), l = _tmp.begin ();
+                             k_idx != i->first.end ();
+                             ++k_idx, ++k_elt, ++l)
+                            _tmp[*k_idx] += *k_elt * *j;
+
+                    j_end += (ptrdiff_t) __LINBOX_MIN (uint64_t (A.coldim () - (size_t)(j_end - v.begin ())), VD.faxpy()._k);
+
+                    for (l =_tmp.begin (); l != l_end; ++l)
+                        *l %= VD.field ().characteristic();
+
+                } while (j_end != v.end ());
+
+                typename Vector1::iterator w_j;
+                typedef typename Vector1::value_type val_t ;
+
+                for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                    *w_j = (val_t) *l;
+
+                return w;
+            }
+
 
 		mutable std::vector<uint32_t> _tmp;
 	};
@@ -199,8 +425,7 @@ namespace LinBox { /*  uint8_t */
 
 namespace LinBox { /*  uint16_t */
 
-	/*! Specialization of FieldAXPY for uint16_t modular field */
-	template <>
+        /*! Specialization of FieldAXPY for uint16_t modular field */
 	template<class Compute_t>
 	class FieldAXPY<Givaro::Modular<uint16_t,Compute_t> > {
 	public:
@@ -209,66 +434,66 @@ namespace LinBox { /*  uint16_t */
 		typedef Givaro::Modular<uint16_t,Compute_t> Field;
 
 		FieldAXPY (const Field &F) :
-			_k (((uint64_t) -1LL) / ((F.characteristic() - 1) * (F.characteristic() - 1))),
-			_field (&F),
-			_y (0),
-			i (_k)
-		{}
+                _k (((uint64_t) -1LL) / ((F.characteristic() - 1) * (F.characteristic() - 1))),
+                _field (&F),
+                _y (0),
+                i (_k)
+            {}
 		
 		FieldAXPY (const FieldAXPY &faxpy) :
-			_k (faxpy._k), _field (faxpy._field), _y (0), i (_k)
-		{}
+                _k (faxpy._k), _field (faxpy._field), _y (0), i (_k)
+            {}
 
 		FieldAXPY<Field > &operator = (const FieldAXPY &faxpy)
-		{
-			_field = faxpy._field;
-			_y = faxpy._y;
-			_k = faxpy._k;
-			return *this;
-		}
+            {
+                _field = faxpy._field;
+                _y = faxpy._y;
+                _k = faxpy._k;
+                return *this;
+            }
 
 		inline uint64_t& mulacc (const Element &a, const Element &x)
-		{
-			uint64_t t = (uint64_t) ((long long) a * (long long) x);
+            {
+                uint64_t t = (uint64_t) ((long long) a * (long long) x);
 
-			if (!i--) {
-				i = (int)_k;
-				return _y = _y % (uint64_t) field().characteristic() + t;
-			}
-			else
-				return _y += t;
-		}
+                if (!i--) {
+                    i = (int)_k;
+                    return _y = _y % (uint64_t) field().characteristic() + t;
+                }
+                else
+                    return _y += t;
+            }
 
 		inline uint64_t& accumulate (const Element &t)
-		{
-			if (!i--) {
-				i = (int)_k;
-				return _y = _y % (uint64_t) field().characteristic() + t;
-			}
-			else
-				return _y += t;
-		}
+            {
+                if (!i--) {
+                    i = (int)_k;
+                    return _y = _y % (uint64_t) field().characteristic() + t;
+                }
+                else
+                    return _y += t;
+            }
 
 		inline Element &get (Element &y) const
-		{
-			const_cast<FieldAXPY<Field>*>(this)->_y %= (uint64_t) field().characteristic();
-			if ((int64_t) _y < 0) const_cast<FieldAXPY<Field>*>(this)->_y += field().characteristic();
-			y = (uint16_t) _y;
-			const_cast<FieldAXPY<Field>*>(this)->i = int(_k);
-			return y;
-		}
+            {
+                const_cast<FieldAXPY<Field>*>(this)->_y %= (uint64_t) field().characteristic();
+                if ((int64_t) _y < 0) const_cast<FieldAXPY<Field>*>(this)->_y += field().characteristic();
+                y = (uint16_t) _y;
+                const_cast<FieldAXPY<Field>*>(this)->i = int(_k);
+                return y;
+            }
 
 		inline FieldAXPY &assign (const Element y)
-		{
-			_y = y;
-			i = (int)_k;
-			return *this;
-		}
+            {
+                _y = y;
+                i = (int)_k;
+                return *this;
+            }
 
 		inline void reset()
-		{
-			_y = 0;
-		}
+            {
+                _y = 0;
+            }
 
 		inline const Field & field() const {return *_field;}
 		
@@ -281,66 +506,297 @@ namespace LinBox { /*  uint16_t */
 		int64_t i;
 	};
 
-	//! Specialization of DotProductDomain for unsigned short modular field
+        //! Specialization of DotProductDomain for unsigned short modular field
 
-	template <>
 	template<class Compute_t>
 	class DotProductDomain<Givaro::Modular<uint16_t,Compute_t> > : public VectorDomainBase<Givaro::Modular<uint16_t,Compute_t> > {
 	public:
 
 		typedef uint16_t Element;
-                typedef Givaro::Modular<uint16_t,Compute_t> Field;
+        typedef Givaro::Modular<uint16_t,Compute_t> Field;
 
 		DotProductDomain () {}
 		DotProductDomain (const Field &F) :
-			VectorDomainBase<Field > (F)
-		{}
+                VectorDomainBase<Field > (F)
+            {}
 		using VectorDomainBase<Field>::field;
 		using VectorDomainBase<Field>::faxpy;
 
 	protected:
 		template <class Vector1, class Vector2>
-		inline Element &dotSpecializedDD (Element &res, const Vector1 &v1, const Vector2 &v2) const;
+		inline Element &dotSpecializedDD (Element &res, const Vector1 &v1, const Vector2 &v2) const
+            {
+                typename Vector1::const_iterator i = v1.begin ();
+                typename Vector2::const_iterator j = v2.begin ();
+                // Dense dot product of v1 and v2, written to res and returned; products are summed in 64 bits and reduced once per block of faxpy()._k terms.
+                typename Vector1::const_iterator iterend = v1.begin () + (ptrdiff_t)(v1.size() % faxpy()._k);
+
+                uint64_t y = 0;
+                // Leading partial block: the first (v1.size() mod _k) terms.
+                for (; i != iterend; ++i, ++j)
+                    y += (uint64_t) *i * (uint64_t) *j;
+
+                y %= (uint64_t) field().characteristic();
+                // Remaining terms in full blocks of _k; j marks where the current block starts in v2.
+                for (; iterend != v1.end (); j += faxpy()._k) {
+                    typename Vector1::const_iterator iter_i = iterend;
+                    typename Vector2::const_iterator iter_j;
+
+                    iterend += faxpy()._k;
+                    // iter_j walks v2 in step with iter_i; reading *j here would reuse the block's first v2 entry for every term.
+                    for (iter_j = j; iter_i != iterend; ++iter_i, ++iter_j)
+                        y += (uint64_t) *iter_i * (uint64_t) *iter_j;
+
+                    y %= (uint64_t) field().characteristic();
+                }
+
+                return res = (uint16_t) y;
+            }
+
         
 		template <class Vector1, class Vector2>
-		inline Element &dotSpecializedDSP (Element &res, const Vector1 &v1, const Vector2 &v2) const;
+		inline Element &dotSpecializedDSP (Element &res, const Vector1 &v1, const Vector2 &v2) const
+            {
+                typename Vector1::first_type::const_iterator i_idx = v1.first.begin ();
+                typename Vector1::second_type::const_iterator i_elt = v1.second.begin ();
+                // Dot product of v1 (parallel index/value sequences v1.first / v1.second) with v2 (indexed by v1's indices); result goes to res.
+                uint64_t y = 0;
+                // Fewer than _k stored entries: accumulate everything and reduce once.
+                if (v1.first.size () < faxpy()._k) {
+                    for (; i_idx != v1.first.end (); ++i_idx, ++i_elt)
+                        y += (uint64_t) *i_elt * (uint64_t) v2[*i_idx];
+
+                    return res = (uint16_t) (y % (uint64_t) field().characteristic());
+                }
+                else {
+                    typename Vector1::first_type::const_iterator iterend = v1.first.begin () +(ptrdiff_t)( v1.first.size() % faxpy()._k );
+                    // Leading partial block of (size mod _k) entries, then one reduction.
+                    for (; i_idx != iterend; ++i_idx, ++i_elt)
+                        y += (uint64_t) *i_elt * (uint64_t) v2[*i_idx];
+
+                    y %= (uint64_t) field().characteristic();
+                    // Remaining entries in full blocks of _k; iterend and i_elt are advanced together so both mark the next block.
+                    while (iterend != v1.first.end ()) {
+                        typename Vector1::first_type::const_iterator iter_i_idx = iterend;
+                        typename Vector1::second_type::const_iterator iter_i_elt = i_elt;
+
+                        iterend += faxpy()._k;
+                        i_elt += faxpy()._k;
+
+                        for (; iter_i_idx != iterend; ++iter_i_idx, ++iter_i_elt)
+                            y += (uint64_t) *iter_i_elt * (uint64_t) v2[*iter_i_idx];
+
+                        y %= (uint64_t) field().characteristic();
+                    }
+
+                    return res = (Element) y;
+                }
+            }
         
 	};
 
-	//! Specialization of MVProductDomain for uint16_t modular field
+        //! Specialization of MVProductDomain for uint16_t modular field
 
-	template <>
 	template<class Compute_t>
 	class MVProductDomain<Givaro::Modular<uint16_t,Compute_t> > {
 	public:
 
 		typedef uint16_t Element;
-                typedef Givaro::Modular<uint16_t,Compute_t> Field;
+        typedef Givaro::Modular<uint16_t,Compute_t> Field;
 	protected:
 		template <class Vector1, class Matrix, class Vector2>
 		inline Vector1 &mulColDense
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v) const
-		{
-			return mulColDenseSpecialized (VD, w, A, v, VectorTraits<typename Matrix::Column>::VectorCategory ());
-		}
+            {
+                return mulColDenseSpecialized (VD, w, A, v, VectorTraits<typename Matrix::Column>::VectorCategory ());
+            }
 
 	private:
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::DenseVectorTag) const;
+		 VectorCategories::DenseVectorTag) const
+            {
+                linbox_check (A.coldim () == v.size ());
+                linbox_check (A.rowdim () == w.size ());
+                // w <- A * v for a matrix with dense columns: column i scaled by v[i] is summed into the 64-bit scratch vector _tmp.
+                typename Matrix::ConstColIterator i = A.colBegin ();
+                typename Vector2::const_iterator j = v.begin (), j_end;
+                typename Matrix::Column::const_iterator k;
+                    // Dan Roche, 7-1-04
+                    // std::vector<uint32_t>::iterator l, l_end;
+                std::vector<uint64_t>::iterator l, l_end;
+
+                if (_tmp.size () < w.size ())
+                    _tmp.resize (w.size ());
+
+                std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
+
+                l_end = _tmp.begin () +(ptrdiff_t) w.size ();
+                // Columns are consumed in chunks of at most _k; j is set once so each chunk resumes where the previous one stopped.
+                j = v.begin ();
+                j_end = j + __LINBOX_MIN (A.coldim (), VD.faxpy()._k);
+                // A is a reference, hence A.coldim (); operands are widened so the product is not evaluated in int.
+                do {
+                    for (; j != j_end; ++j, ++i)
+                        for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
+                            *l += (uint64_t) *k * (uint64_t) *j;
+
+                    j_end += __LINBOX_MIN (A.coldim () - (j_end - v.begin ()), VD.faxpy()._k);
+
+                    for (l =_tmp.begin (); l != l_end; ++l)
+                        *l %= VD.field ().characteristic();
+
+                } while (j != v.end ());
+
+                typename Vector1::iterator w_j;
+
+                for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                    *w_j = *l;
+
+                return w;
+            }
+
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseSequenceVectorTag) const;
+		 VectorCategories::SparseSequenceVectorTag) const
+            {
+                linbox_check (A.coldim () == v.size ());
+                linbox_check (A.rowdim () == w.size ());
+                // w <- A * v for columns stored as sequences of (index, value) pairs; sums are kept in the 64-bit scratch vector _tmp.
+                typename Matrix::ConstColIterator i = A.colBegin ();
+                typename Vector2::const_iterator j, j_end;
+                typename Matrix::Column::const_iterator k;
+                    // Dan Roche, 7-1-04
+                    // std::vector<uint32_t>::iterator l, l_end;
+                std::vector<uint64_t>::iterator l, l_end;
+
+                if (_tmp.size () < w.size ())
+                    _tmp.resize (w.size ());
+
+                std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
+
+                l_end = _tmp.begin () +(ptrdiff_t) w.size ();
+                // Columns are consumed in chunks of at most _k; j is set once so each chunk resumes where the previous one stopped.
+                j = v.begin ();
+                j_end = j + __LINBOX_MIN (A.coldim (), VD.faxpy()._k);
+                // A is a reference, hence A.coldim (); operands are widened so the product is not evaluated in int.
+                do {
+                    for (; j != j_end; ++j, ++i)
+                        for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
+                            _tmp[k->first] += (uint64_t) k->second * (uint64_t) *j;
+
+                    j_end += __LINBOX_MIN (A.coldim () - (j_end - v.begin ()), VD.faxpy()._k);
+
+                    for (l =_tmp.begin (); l != l_end; ++l)
+                        *l %= VD.field ().characteristic();
+
+                } while (j != v.end ());
+
+                typename Vector1::iterator w_j;
+
+                for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                    *w_j = *l;
+
+                return w;
+            }
+
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseAssociativeVectorTag) const;
+		 VectorCategories::SparseAssociativeVectorTag) const
+            {
+                linbox_check (A.coldim () == v.size ());
+                linbox_check (A.rowdim () == w.size ());
+                // w <- A * v for columns stored as associative (index -> value) containers; sums are kept in the 64-bit scratch vector _tmp.
+                typename Matrix::ConstColIterator i = A.colBegin ();
+                typename Vector2::const_iterator j, j_end;
+                typename Matrix::Column::const_iterator k;
+                    // Dan Roche, 7-1-04
+                    // std::vector<uint32_t>::iterator l, l_end;
+                std::vector<uint64_t>::iterator l, l_end;
+
+                if (_tmp.size () < w.size ())
+                    _tmp.resize (w.size ());
+
+                std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
+
+                l_end = _tmp.begin () +(ptrdiff_t) w.size ();
+                // Columns are consumed in chunks of at most _k; j is set once so each chunk resumes where the previous one stopped.
+                j = v.begin ();
+                j_end = j + __LINBOX_MIN (A.coldim (), VD.faxpy()._k);
+                // A is a reference, hence A.coldim (); operands are widened so the product is not evaluated in int.
+                do {
+                    for (; j != j_end; ++j, ++i)
+                        for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
+                            _tmp[k->first] += (uint64_t) k->second * (uint64_t) *j;
+
+                    j_end += __LINBOX_MIN (A.coldim () - (j_end - v.begin ()), VD.faxpy()._k);
+
+                    for (l =_tmp.begin (); l != l_end; ++l)
+                        *l %= VD.field ().characteristic();
+
+                } while (j != v.end ());
+
+                typename Vector1::iterator w_j;
+
+                for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                    *w_j = *l;
+
+                return w;
+            }
+
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseParallelVectorTag) const;
+		 VectorCategories::SparseParallelVectorTag) const
+            {
+                linbox_check (A.coldim () == v.size ());
+                linbox_check (A.rowdim () == w.size ());
+                // w <- A * v for columns stored as parallel index / value sequences (first / second); sums are kept in the 64-bit scratch vector _tmp.
+                typename Matrix::ConstColIterator i = A.colBegin ();
+                typename Vector2::const_iterator j, j_end;
+                typename Matrix::Column::first_type::const_iterator k_idx;
+                typename Matrix::Column::second_type::const_iterator k_elt;
+                    // Dan Roche, 7-1-04
+                    // std::vector<uint32_t>::iterator l, l_end;
+                std::vector<uint64_t>::iterator l, l_end;
+
+                if (_tmp.size () < w.size ())
+                    _tmp.resize (w.size ());
+
+                std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
+
+                l_end = _tmp.begin () +(ptrdiff_t) w.size ();
+
+                j = v.begin ();
+                        //Dan Roche, 7-2-04
+                        //j_end = j + __LINBOX_MIN (A->coldim (), VD.faxpy()._k);
+                j_end = j + __LINBOX_MIN (A.coldim (), VD.faxpy()._k);
+                // Columns are consumed in chunks of at most _k; j is set once above so each chunk resumes where the previous one stopped.
+                do {
+                    for (; j != j_end; ++j, ++i)
+                        for (k_idx = i->first.begin (), k_elt = i->second.begin (), l = _tmp.begin ();
+                             k_idx != i->first.end ();
+                             ++k_idx, ++k_elt, ++l)
+                            _tmp[*k_idx] += (uint64_t) *k_elt * (uint64_t) *j;
+
+                        //j_end += __LINBOX_MIN (A->coldim () - (j_end - v.begin ()), VD.faxpy()._k);
+                    j_end += __LINBOX_MIN (A.coldim () - (j_end - v.begin ()), VD.faxpy()._k);
+
+                    for (l =_tmp.begin (); l != l_end; ++l)
+                        *l %= VD.field ().characteristic();
+
+                } while (j != v.end ());
+
+                typename Vector1::iterator w_j;
+
+                for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                    *w_j = *l;
+
+                return w;
+            }
 
 		mutable std::vector<uint64_t> _tmp;
 	};
@@ -359,9 +815,8 @@ namespace LinBox { /*  uint32_t */
 	class MVProductDomain;
 
 
-	/*! Specialization of FieldAXPY for unsigned short modular field */
+        /*! Specialization of FieldAXPY for unsigned short modular field */
 
-	template <>
 	template<class Compute_t>
 	class FieldAXPY<Givaro::Modular<uint32_t, Compute_t> > {
 	public:
@@ -370,62 +825,62 @@ namespace LinBox { /*  uint32_t */
 		typedef Givaro::Modular<uint32_t, Compute_t> Field;
 
 		FieldAXPY (const Field &F) :
-			_field (&F), _y(0)
-		{
-			_two_64 = (uint64_t(1) << 32) % uint64_t(F.characteristic());
-			_two_64 = (_two_64 * _two_64) % uint64_t(F.characteristic());
-		}
+                _field (&F), _y(0)
+            {
+                _two_64 = (uint64_t(1) << 32) % uint64_t(F.characteristic());
+                _two_64 = (_two_64 * _two_64) % uint64_t(F.characteristic());
+            }
 
 		FieldAXPY (const FieldAXPY &faxpy) :
-			_two_64 (faxpy._two_64), _field (faxpy._field), _y (0)
-		{}
+                _two_64 (faxpy._two_64), _field (faxpy._field), _y (0)
+            {}
 
 		FieldAXPY<Field > &operator = (const FieldAXPY &faxpy)
-		{
-			_field = faxpy._field;
-			_y = faxpy._y;
-			_two_64 = faxpy._two_64;
-			return *this;
-		}
+            {
+                _field = faxpy._field;
+                _y = faxpy._y;
+                _two_64 = faxpy._two_64;
+                return *this;
+            }
 
 		inline uint64_t& mulacc (const Element &a, const Element &x)
-		{
-			uint64_t t = (uint64_t) a * (uint64_t) x;
-			_y += t;
+            {
+                uint64_t t = (uint64_t) a * (uint64_t) x;
+                _y += t;
 
-			if (_y < t)
-				return _y += _two_64;
-			else
-				return _y;
-		}
+                if (_y < t)
+                    return _y += _two_64;
+                else
+                    return _y;
+            }
 
 		inline uint64_t& accumulate (const Element &t)
-		{
-			_y += t;
+            {
+                _y += t;
 
-			if (_y < t)
-				return _y += _two_64;
-			else
-				return _y;
-		}
+                if (_y < t)
+                    return _y += _two_64;
+                else
+                    return _y;
+            }
 
 		inline uint64_t& accumulate_special (const Element &t)
-		{
-			return _y += t;
-		}
+            {
+                return _y += t;
+            }
 
 		inline Element &get (Element &y) const
-		{
-			const_cast<FieldAXPY<Field>*>(this)->_y %= (uint64_t) field().characteristic();
-			//if ((int64_t) _y < 0) const_cast<FieldAXPY<Field>*>(this)->_y += field().characteristic();
-			return y = (uint32_t) _y;
-		}
+            {
+                const_cast<FieldAXPY<Field>*>(this)->_y %= (uint64_t) field().characteristic();
+                    //if ((int64_t) _y < 0) const_cast<FieldAXPY<Field>*>(this)->_y += field().characteristic();
+                return y = (uint32_t) _y;
+            }
 
 		inline FieldAXPY &assign (const Element y)
-		{
-			_y = y;
-			return *this;
-		}
+            {
+                _y = y;
+                return *this;
+            }
 
 		inline void reset() {
 			_y = 0;
@@ -443,9 +898,8 @@ namespace LinBox { /*  uint32_t */
 		uint64_t _y;
 	};
 
-	//! Specialization of DotProductDomain for uint32_t modular field
+        //! Specialization of DotProductDomain for uint32_t modular field
 
-	template <>
 	template<class Compute_t>
 	class DotProductDomain<Givaro::Modular<uint32_t,Compute_t> > : public VectorDomainBase<Givaro::Modular<uint32_t,Compute_t> > {
 	public:
@@ -455,24 +909,61 @@ namespace LinBox { /*  uint32_t */
 
 		DotProductDomain () {}
 		DotProductDomain (const Field &F) :
-			VectorDomainBase<Field > (F)
-		{}
+                VectorDomainBase<Field > (F)
+            {}
 		using VectorDomainBase<Field >::field;
 		using VectorDomainBase<Field >::faxpy;
 
 	protected:
 		template <class Vector1, class Vector2>
-		inline Element &dotSpecializedDD (Element &res, const Vector1 &v1, const Vector2 &v2) const;
+		inline Element &dotSpecializedDD (Element &res, const Vector1 &v1, const Vector2 &v2) const
+            {
+                typename Vector1::const_iterator i;
+                typename Vector2::const_iterator j;
+                // Dense dot product of v1 and v2; the running sum y is a uint64_t and is reduced modulo the characteristic only once, at the end.
+                uint64_t y = 0;
+                uint64_t t;
+                // (y < t) after the addition means the sum wrapped past 2^64; faxpy()._two_64 is added to compensate.
+                for (i = v1.begin (), j = v2.begin (); i < v1.end (); ++i, ++j) {
+                    t = (uint64_t) *i * (uint64_t) *j;
+                    y += t;
+
+                    if (y < t)
+                        y += faxpy()._two_64;
+                }
+
+                y %= (uint64_t) field().characteristic();
+
+                return res = (uint32_t) y;
+            }
         
 
 		template <class Vector1, class Vector2>
-		inline Element &dotSpecializedDSP (Element &res, const Vector1 &v1, const Vector2 &v2) const;
+		inline Element &dotSpecializedDSP (Element &res, const Vector1 &v1, const Vector2 &v2) const
+            {
+                typename Vector1::first_type::const_iterator i_idx;
+                typename Vector1::second_type::const_iterator i_elt;
+                // Dot product of v1 (parallel index/value sequences v1.first / v1.second) with v2, which is indexed by v1's indices.
+                uint64_t y = 0;
+                uint64_t t = 0;
+                // (y < t) after the addition means the sum wrapped past 2^64; faxpy()._two_64 is added to compensate.
+                for (i_idx = v1.first.begin (), i_elt = v1.second.begin (); i_idx != v1.first.end (); ++i_idx, ++i_elt) {
+                    t = (uint64_t) *i_elt * (uint64_t) v2[*i_idx];
+                    y += t;
+                    if (y < t)
+                        y += faxpy()._two_64;
+                }
+
+                y %= (uint64_t) field().characteristic();
+
+                return res = (uint32_t)y;
+            }
+
         
 	};
 
-	//! Specialization of MVProductDomain for uint32_t modular field
+        //! Specialization of MVProductDomain for uint32_t modular field
 
-	template <>
 	template <class Compute_t>
 	class MVProductDomain<Givaro::Modular<uint32_t,Compute_t> > {
 	public:
@@ -484,27 +975,174 @@ namespace LinBox { /*  uint32_t */
 		template <class Vector1, class Matrix, class Vector2>
 		inline Vector1 &mulColDense
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v) const
-		{
-			return mulColDenseSpecialized (VD, w, A, v, typename VectorTraits<typename Matrix::Column>::VectorCategory ());
-		}
+            {
+                return mulColDenseSpecialized (VD, w, A, v, typename VectorTraits<typename Matrix::Column>::VectorCategory ());
+            }
 
 	private:
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::DenseVectorTag) const;
+		 VectorCategories::DenseVectorTag) const
+            {
+                linbox_check (A.coldim () == v.size ());
+                linbox_check (A.rowdim () == w.size ());
+                // w <- A * v for a matrix with dense columns: column i scaled by v[i] is summed into the 64-bit scratch vector _tmp.
+                typename Matrix::ConstColIterator i = A.colBegin ();
+                typename Vector2::const_iterator j;
+                typename Matrix::Column::const_iterator k;
+                std::vector<uint64_t>::iterator l;
+
+                uint64_t t;
+                // _tmp only ever grows; the first w.size() entries are cleared before use.
+                if (_tmp.size () < w.size ())
+                    _tmp.resize (w.size ());
+
+                std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
+
+                for (j = v.begin (); j != v.end (); ++j, ++i) {
+                    for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l) {
+                        t = ((uint64_t) *k) * ((uint64_t) *j);
+
+                        *l += t;
+                        // (*l < t) means the accumulator wrapped past 2^64; _two_64 is added to compensate.
+                        if (*l < t)
+                            *l += VD.faxpy()._two_64;
+                    }
+                }
+
+                typename Vector1::iterator w_j;
+                typedef typename Vector1::value_type element;
+                // Single reduction modulo the characteristic while copying into w.
+                for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                    *w_j = (element)(*l % VD.field ().characteristic());
+
+                return w;
+            }
+
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseSequenceVectorTag) const;
+		 VectorCategories::SparseSequenceVectorTag) const
+            {
+                linbox_check (A.coldim () == v.size ());
+                linbox_check (A.rowdim () == w.size ());
+                // w <- A * v for columns stored as sequences of (index, value) pairs; sums are kept in the 64-bit scratch vector _tmp.
+                typename Matrix::ConstColIterator i = A.colBegin ();
+                typename Vector2::const_iterator j;
+                typename Matrix::Column::const_iterator k;
+                std::vector<uint64_t>::iterator l;
+
+                uint64_t t;
+                // _tmp only ever grows; the first w.size() entries are cleared before use.
+                if (_tmp.size () < w.size ())
+                    _tmp.resize (w.size ());
+
+                std::fill (_tmp.begin (), _tmp.begin () + (ptrdiff_t) w.size (), 0);
+
+                for (j = v.begin (); j != v.end (); ++j, ++i) {
+                    for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l) {
+                        t = ((uint64_t) k->second) * ((uint64_t) *j);
+
+                        _tmp[k->first] += t;
+                        // Accumulator smaller than the addend means it wrapped past 2^64; _two_64 is added to compensate.
+                        if (_tmp[k->first] < t)
+                            _tmp[k->first] += VD.faxpy()._two_64;
+                    }
+                }
+
+                typename Vector1::iterator             w_j;
+                typedef typename Vector1::value_type val_t;
+                // Single reduction modulo the characteristic while copying into w.
+                for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                    *w_j = val_t(*l % VD.field ().characteristic());
+
+                return w;
+            }
+
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseAssociativeVectorTag) const;
+		 VectorCategories::SparseAssociativeVectorTag) const
+            {
+                linbox_check (A.coldim () == v.size ());
+                linbox_check (A.rowdim () == w.size ());
+                // w <- A * v for columns stored as associative (index -> value) containers; sums are kept in the 64-bit scratch vector _tmp.
+                typename Matrix::ConstColIterator i = A.colBegin ();
+                typename Vector2::const_iterator j;
+                typename Matrix::Column::const_iterator k;
+                std::vector<uint64_t>::iterator l;
+
+                uint64_t t;
+                // _tmp only ever grows; the first w.size() entries are cleared before use.
+                if (_tmp.size () < w.size ())
+                    _tmp.resize (w.size ());
+
+                std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
+
+                for (j = v.begin (); j != v.end (); ++j, ++i) {
+                    for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l) {
+                        t = ((uint64_t) k->second) * ((uint64_t) *j);
+
+                        _tmp[k->first] += t;
+                        // Accumulator smaller than the addend means it wrapped past 2^64; _two_64 is added to compensate.
+                        if (_tmp[k->first] < t)
+                            _tmp[k->first] += VD.faxpy()._two_64;
+                    }
+                }
+
+                typename Vector1::iterator w_j;
+                // Reduce the full 64-bit accumulator first, then narrow; narrowing first would drop the high 32 bits.
+                for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                    *w_j = (uint32_t) (*l % VD.field ().characteristic());
+
+                return w;
+            }
+
 		template <class Vector1, class Matrix, class Vector2>
 		Vector1 &mulColDenseSpecialized
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-		 VectorCategories::SparseParallelVectorTag) const;
+		 VectorCategories::SparseParallelVectorTag) const
+            {
+                linbox_check (A.coldim () == v.size ());
+                linbox_check (A.rowdim () == w.size ());
+                // w <- A * v for columns stored as parallel index / value sequences (first / second); sums are kept in the 64-bit scratch vector _tmp.
+                typename Matrix::ConstColIterator i = A.colBegin ();
+                typename Vector2::const_iterator j;
+                typename Matrix::Column::first_type::const_iterator k_idx;
+                typename Matrix::Column::second_type::const_iterator k_elt;
+                std::vector<uint64_t>::iterator l;
+
+                uint64_t t;
+                // _tmp only ever grows; the first w.size() entries are cleared before use.
+                if (_tmp.size () < w.size ())
+                    _tmp.resize (w.size ());
+
+                std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
+
+                for (j = v.begin (); j != v.end (); ++j, ++i) {
+                    for (k_idx = i->first.begin (), k_elt = i->second.begin (), l = _tmp.begin ();
+                         k_idx != i->first.end ();
+                         ++k_idx, ++k_elt, ++l)
+                    {
+                        t = ((uint64_t) *k_elt) * ((uint64_t) *j);
+
+                        _tmp[*k_idx] += t;
+                        // Accumulator smaller than the addend means it wrapped past 2^64; _two_64 is added to compensate.
+                        if (_tmp[*k_idx] < t)
+                            _tmp[*k_idx] += VD.faxpy()._two_64;
+                    }
+                }
+
+                typename Vector1::iterator     w_j;
+                typedef typename Vector1::value_type val_t;
+                // Single reduction modulo the characteristic while copying into w.
+                for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
+                    *w_j = val_t(*l % VD.field ().characteristic());
+
+                return w;
+            }
+
 
 		mutable std::vector<uint64_t> _tmp;
 	};
@@ -522,9 +1160,8 @@ namespace LinBox { /*  uint64_t */
 	template<class Field>
 	class MVProductDomain;
 
-	/*! Specialization of FieldAXPY for unsigned short modular field */
+        /*! Specialization of FieldAXPY for unsigned short modular field */
 
-	template <>
 	template<typename Compute_t>
 	class FieldAXPY<Givaro::Modular<uint64_t,Compute_t> > {
 	public:
@@ -533,61 +1170,61 @@ namespace LinBox { /*  uint64_t */
 		typedef Givaro::Modular<uint64_t,Compute_t> Field;
 
 		FieldAXPY (const Field &F) :
-			_field (&F), _y(0)
-		{
-			_two_64 = (uint64_t(1) << 32) % uint64_t(F.characteristic());
-			_two_64 = (_two_64 * _two_64) % uint64_t(F.characteristic());
-		}
+                _field (&F), _y(0)
+            {
+                _two_64 = (uint64_t(1) << 32) % uint64_t(F.characteristic());
+                _two_64 = (_two_64 * _two_64) % uint64_t(F.characteristic());
+            }
 
 		FieldAXPY (const FieldAXPY &faxpy) :
-			_two_64 (faxpy._two_64), _field (faxpy._field), _y (0)
-		{}
+                _two_64 (faxpy._two_64), _field (faxpy._field), _y (0)
+            {}
 
 		FieldAXPY<Field > &operator = (const FieldAXPY &faxpy)
-		{
-			_field = faxpy._field;
-			_y = faxpy._y;
-			return *this;
-		}
+            {
+                _field = faxpy._field;
+                _y = faxpy._y;
+                return *this;
+            }
 
 		inline uint64_t& mulacc (const Element &a, const Element &x)
-		{
-			uint64_t t = (uint64_t) a * (uint64_t) x;
-			_y += t;
+            {
+                uint64_t t = (uint64_t) a * (uint64_t) x;
+                _y += t;
 
-			if (_y < t)
-				return _y += _two_64;
-			else
-				return _y;
-		}
+                if (_y < t)
+                    return _y += _two_64;
+                else
+                    return _y;
+            }
 
 		inline uint64_t& accumulate (const Element &t)
-		{
-			_y += t;
+            {
+                _y += t;
 
-			if (_y < t)
-				return _y += _two_64;
-			else
-				return _y;
-		}
+                if (_y < t)
+                    return _y += _two_64;
+                else
+                    return _y;
+            }
 
 		inline uint64_t& accumulate_special (const Element &t)
-		{
-			return _y += t;
-		}
+            {
+                return _y += t;
+            }
 
 		inline Element &get (Element &y) const
-		{
-			const_cast<FieldAXPY<Field>*>(this)->_y %= (uint64_t) field().characteristic();
-			//if ((int64_t) _y < 0) const_cast<FieldAXPY<Field>*>(this)->_y += field().characteristic();
-			return y = (uint64_t) _y;
-		}
+            {
+                const_cast<FieldAXPY<Field>*>(this)->_y %= (uint64_t) field().characteristic();
+                    //if ((int64_t) _y < 0) const_cast<FieldAXPY<Field>*>(this)->_y += field().characteristic();
+                return y = (uint64_t) _y;
+            }
 
 		inline FieldAXPY &assign (const Element y)
-		{
-			_y = y;
-			return *this;
-		}
+            {
+                _y = y;
+                return *this;
+            }
 
 		inline void reset() {
 			_y = 0;
@@ -605,12 +1242,11 @@ namespace LinBox { /*  uint64_t */
 		uint64_t _y;
 	};
 
-	//! Specialization of DotProductDomain for uint64_t modular field
+        //! Specialization of DotProductDomain for uint64_t modular field
 
-	template <>
 	template <typename Compute_t>
 	class DotProductDomain<Givaro::Modular<uint64_t,Compute_t>> : public VectorDomainBase<Givaro::Modular<uint64_t,Compute_t> > {
-	public:
+      public:
 
 		typedef uint64_t Element;
 		typedef Givaro::Modular<uint64_t,Compute_t> Field;
@@ -622,19 +1258,60 @@ namespace LinBox { /*  uint64_t */
 		using VectorDomainBase<Field >::field;
 		using VectorDomainBase<Field >::faxpy;
 
-	protected:
+      protected:
 		template <class Vector1, class Vector2>
-		inline Element &dotSpecializedDD (Element &res, const Vector1 &v1, const Vector2 &v2) const;
-        
+            inline Element &dotSpecializedDD (Element &res, const Vector1 &v1, const Vector2 &v2) const
+        { // Dot product of two iterator-traversed vectors: sums *i * *j in a 64-bit accumulator, reduces mod the characteristic, stores the result in res.
+
+			typename Vector1::const_iterator i;
+			typename Vector2::const_iterator j;
+
+			uint64_t y = 0;
+			uint64_t t;
+
+			for (i = v1.begin (), j = v2.begin (); i < v1.end (); ++i, ++j) // j is never compared with v2.end(); assumes v2 has at least as many entries as v1 -- TODO confirm
+			{
+				t = ( (uint64_t) *i ) * ( (uint64_t) *j ); // NOTE(review): product is evaluated in 64 bits; high bits are lost if it is >= 2^64 -- confirm element range.
+				y += t;
+
+				if (y < t) // true exactly when the unsigned addition above wrapped around
+					y += faxpy()._two_64; // presumably 2^64 reduced mod the characteristic -- confirm where _two_64 is initialised
+			}
+
+			y %= (uint64_t) field().characteristic();
+			return res = (Element)y;
+
+		}
 
 		template <class Vector1, class Vector2>
-		inline Element &dotSpecializedDSP (Element &res, const Vector1 &v1, const Vector2 &v2) const;
+            inline Element &dotSpecializedDSP (Element &res, const Vector1 &v1, const Vector2 &v2) const
+		{ // Dot product where v1 is a pair of parallel sequences (v1.first: positions, v1.second: values) and v2 is indexed by position; result is reduced mod the characteristic into res.
+			typename Vector1::first_type::const_iterator i_idx;
+			typename Vector1::second_type::const_iterator i_elt;
+
+			uint64_t y = 0;
+			uint64_t t;
+
+			for (i_idx = v1.first.begin (), i_elt = v1.second.begin (); i_idx != v1.first.end (); ++i_idx, ++i_elt) // both sequences advance in lockstep; only v1.first bounds the loop
+			{
+				t = ( (uint64_t) *i_elt ) * ( (uint64_t) v2[*i_idx] ); // NOTE(review): product is evaluated in 64 bits; high bits are lost if it is >= 2^64 -- confirm element range.
+				y += t;
+
+				if (y < t) // true exactly when the unsigned addition above wrapped around
+					y += faxpy()._two_64; // presumably 2^64 reduced mod the characteristic -- confirm where _two_64 is initialised
+			}
+
+
+			y %= (uint64_t) field().characteristic();
+
+			return res = (Element) y;
+		}	
+
         
 	};
 
-	//! Specialization of MVProductDomain for uint64_t modular field
+        //! Specialization of MVProductDomain for uint64_t modular field
 
-	template <>
 	template <typename Compute_t>
 	class MVProductDomain<Givaro::Modular<uint64_t,Compute_t> > {
 	public:
@@ -646,9 +1323,9 @@ namespace LinBox { /*  uint64_t */
 		template <class Vector1, class Matrix, class Vector2>
 		inline Vector1 &mulColDense
 		(const VectorDomain<Field > &VD, Vector1 &w, const Matrix &A, const Vector2 &v) const
-		{
-			return mulColDenseSpecialized (VD, w, A, v, typename VectorTraits<typename Matrix::Column>::VectorCategory ());
-		}
+            { // Forwards to mulColDenseSpecialized, passing a tag object of the vector category of Matrix::Column as the last argument.
+                return mulColDenseSpecialized (VD, w, A, v, typename VectorTraits<typename Matrix::Column>::VectorCategory ());
+            }
 
 	private:
 		template <class Vector1, class Matrix, class Vector2>
@@ -673,14 +1350,13 @@ namespace LinBox { /*  uint64_t */
 
 }
 
-#include "linbox/ring/modular/modular-unsigned.inl"
 
 #endif // __LINBOX_field_modular_unsigned_H
 
 // Local Variables:
 // mode: C++
-// tab-width: 8
+// tab-width: 4
 // indent-tabs-mode: nil
-// c-basic-offset: 8
+// c-basic-offset: 4
 // End:
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
diff --git a/linbox/ring/modular/modular-unsigned.inl b/linbox/ring/modular/modular-unsigned.inl
deleted file mode 100755
index d50ad34..0000000
--- a/linbox/ring/modular/modular-unsigned.inl
+++ /dev/null
@@ -1,828 +0,0 @@
-/* linbox/field/modular.inl
- * Copyright (C) 2002 Bradford Hovinen
- * Copyright (C) 2002 Ahmet Duran
- * Copyright (C) 2002 B. David Saunders
- *
- * Written by Bradford Hovinen <hovinen at cis.udel.edu>,
- *            Ahmet Duran <duran at cis.udel.edu>,
- *            Dave Saunders <saunders at cis.udel.edu>
- *
- * ------------------------------------
- *
- *
- * ========LICENCE========
- * This file is part of the library LinBox.
- *
- * LinBox is free software: you can redistribute it and/or modify
- * it under the terms of the  GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- * ========LICENCE========
- *.
- */
-
-#ifndef __LINBOX_field_modular_INL
-#define __LINBOX_field_modular_INL
-
-//Dan Roche 7-2-04
-#ifndef __LINBOX_MIN
-#define __LINBOX_MIN(a,b) ( (a) < (b) ? (a) : (b) )
-#endif
-
-#include <iostream>
-
-namespace LinBox {
-
-        template<typename Compute_t>
-	template <class Vector1, class Vector2>
-	inline uint8_t &DotProductDomain<Givaro::Modular<uint8_t,Compute_t> >::dotSpecializedDD
-	(uint8_t &res, const Vector1 &v1, const Vector2 &v2) const
-	{
-		typename Vector1::const_iterator i = v1.begin ();
-		typename Vector2::const_iterator j = v2.begin ();
-
-		typename Vector1::const_iterator iterend = v1.begin () + (ptrdiff_t)(v1.size() % faxpy()._k);
-
-		uint64_t y = 0;
-
-		for (; i != iterend; ++i, ++j)
-			y += (uint64_t) *i * (uint64_t) *j;
-
-		y %= (uint64_t) field().characteristic();
-
-		for (; iterend != v1.end (); j += (ptrdiff_t)faxpy()._k) {
-			typename Vector1::const_iterator iter_i = iterend;
-			typename Vector2::const_iterator iter_j;
-
-			iterend += (ptrdiff_t)faxpy()._k;
-
-			for (iter_j = j; iter_i != iterend; ++iter_i, ++iter_j)
-				y += (uint64_t) *iter_i * (uint64_t) *j;
-
-			y %= (uint64_t) field().characteristic();
-		}
-
-		return res = (uint8_t) y;
-	}
-
-        template<typename Compute_t>
-	template <class Vector1, class Vector2>
-	inline uint8_t &DotProductDomain<Givaro::Modular<uint8_t,Compute_t> >::dotSpecializedDSP
-	(uint8_t &res, const Vector1 &v1, const Vector2 &v2) const
-	{
-		typename Vector1::first_type::const_iterator i_idx = v1.first.begin ();
-		typename Vector1::second_type::const_iterator i_elt = v1.second.begin ();
-
-		uint64_t y = 0;
-
-		if (v1.first.size () < faxpy()._k) {
-			for (; i_idx != v1.first.end (); ++i_idx, ++i_elt)
-				y += (uint64_t) *i_elt * (uint64_t) v2[*i_idx];
-
-			return res = uint8_t (y % (uint64_t) field().characteristic());
-		}
-		else {
-			typename Vector1::first_type::const_iterator iterend = v1.first.begin () +(ptrdiff_t)( v1.first.size() % faxpy()._k);
-
-			for (; i_idx != iterend; ++i_idx, ++i_elt)
-				y += (uint64_t) *i_elt * (uint64_t) v2[*i_idx];
-
-			y %= (uint64_t) field().characteristic();
-
-			while (iterend != v1.first.end ()) {
-				typename Vector1::first_type::const_iterator iter_i_idx = iterend;
-				typename Vector1::second_type::const_iterator iter_i_elt = i_elt;
-
-				iterend += (ptrdiff_t)faxpy()._k;
-				i_elt += (ptrdiff_t)faxpy()._k;
-
-				for (; iter_i_idx != iterend; ++iter_i_idx, ++iter_i_elt)
-					y += (uint64_t) *iter_i_elt * (uint64_t) v2[*iter_i_idx];
-
-				y %= (uint64_t) field().characteristic();
-			}
-
-			return res = (uint8_t) y;
-		}
-	}
-
-        template<typename Compute_t>
-	template <class Vector1, class Vector2>
-	inline uint16_t &DotProductDomain<Givaro::Modular<uint16_t,Compute_t> >::dotSpecializedDD
-	(uint16_t &res, const Vector1 &v1, const Vector2 &v2) const
-	{
-		typename Vector1::const_iterator i = v1.begin ();
-		typename Vector2::const_iterator j = v2.begin ();
-
-		typename Vector1::const_iterator iterend = v1.begin () + (ptrdiff_t)(v1.size() % faxpy()._k);
-
-		uint64_t y = 0;
-
-		for (; i != iterend; ++i, ++j)
-			y += (uint64_t) *i * (uint64_t) *j;
-
-		y %= (uint64_t) field().characteristic();
-
-		for (; iterend != v1.end (); j += faxpy()._k) {
-			typename Vector1::const_iterator iter_i = iterend;
-			typename Vector2::const_iterator iter_j;
-
-			iterend += faxpy()._k;
-
-			for (iter_j = j; iter_i != iterend; ++iter_i, ++iter_j)
-				y += (uint64_t) *iter_i * (uint64_t) *j;
-
-			y %= (uint64_t) field().characteristic();
-		}
-
-		return res = (uint16_t) y;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Vector2>
-	inline uint16_t &DotProductDomain<Givaro::Modular<uint16_t,Compute_t> >::dotSpecializedDSP
-	(uint16_t &res, const Vector1 &v1, const Vector2 &v2) const
-	{
-		typename Vector1::first_type::const_iterator i_idx = v1.first.begin ();
-		typename Vector1::second_type::const_iterator i_elt = v1.second.begin ();
-
-		uint64_t y = 0;
-
-		if (v1.first.size () < faxpy()._k) {
-			for (; i_idx != v1.first.end (); ++i_idx, ++i_elt)
-				y += (uint64_t) *i_elt * (uint64_t) v2[*i_idx];
-
-			return res = (uint16_t) (y % (uint64_t) field().characteristic());
-		}
-		else {
-			typename Vector1::first_type::const_iterator iterend = v1.first.begin () +(ptrdiff_t)( v1.first.size() % faxpy()._k );
-
-			for (; i_idx != iterend; ++i_idx, ++i_elt)
-				y += (uint64_t) *i_elt * (uint64_t) v2[*i_idx];
-
-			y %= (uint64_t) field().characteristic();
-
-			while (iterend != v1.first.end ()) {
-				typename Vector1::first_type::const_iterator iter_i_idx = iterend;
-				typename Vector1::second_type::const_iterator iter_i_elt = i_elt;
-
-				iterend += faxpy()._k;
-				i_elt += faxpy()._k;
-
-				for (; iter_i_idx != iterend; ++iter_i_idx, ++iter_i_elt)
-					y += (uint64_t) *iter_i_elt * (uint64_t) v2[*iter_i_idx];
-
-				y %= (uint64_t) field().characteristic();
-			}
-
-			return res = (Element) y;
-		}
-	}
-
-	template<typename Compute_t> template <class Vector1, class Vector2>
-	inline uint32_t &DotProductDomain<Givaro::Modular<uint32_t,Compute_t> >::dotSpecializedDD
-	(uint32_t &res, const Vector1 &v1, const Vector2 &v2) const
-	{
-		typename Vector1::const_iterator i;
-		typename Vector2::const_iterator j;
-
-		uint64_t y = 0;
-		uint64_t t;
-
-		for (i = v1.begin (), j = v2.begin (); i < v1.end (); ++i, ++j) {
-			t = (uint64_t) *i * (uint64_t) *j;
-			y += t;
-
-			if (y < t)
-				y += faxpy()._two_64;
-		}
-
-		y %= (uint64_t) field().characteristic();
-
-		return res = (uint32_t) y;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Vector2>
-	inline uint32_t &DotProductDomain<Givaro::Modular<uint32_t,Compute_t> >::dotSpecializedDSP
-	(uint32_t &res, const Vector1 &v1, const Vector2 &v2) const
-	{
-		typename Vector1::first_type::const_iterator i_idx;
-		typename Vector1::second_type::const_iterator i_elt;
-
-		uint64_t y = 0;
-		uint64_t t = 0;
-
-		for (i_idx = v1.first.begin (), i_elt = v1.second.begin (); i_idx != v1.first.end (); ++i_idx, ++i_elt) {
-			t = (uint64_t) *i_elt * (uint64_t) v2[*i_idx];
-			y += t;
-			if (y < t)
-				y += faxpy()._two_64;
-		}
-
-		y %= (uint64_t) field().characteristic();
-
-		return res = (uint32_t)y;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Vector2>
-	inline uint64_t &DotProductDomain<Givaro::Modular<uint64_t,Compute_t> >::dotSpecializedDD
-	(uint64_t &res, const Vector1 &v1, const Vector2 &v2) const
-		{
-
-			typename Vector1::const_iterator i;
-			typename Vector2::const_iterator j;
-
-			uint64_t y = 0;
-			uint64_t t;
-
-			for (i = v1.begin (), j = v2.begin (); i < v1.end (); ++i, ++j)
-			{
-				t = ( (uint64_t) *i ) * ( (uint64_t) *j );
-				y += t;
-
-				if (y < t)
-					y += faxpy()._two_64;
-			}
-
-			y %= (uint64_t) field().characteristic();
-			return res = (Element)y;
-
-		}
-
-
-	template<typename Compute_t> template <class Vector1, class Vector2>
-	inline uint64_t &DotProductDomain<Givaro::Modular<uint64_t,Compute_t> >::dotSpecializedDSP
-	(uint64_t &res, const Vector1 &v1, const Vector2 &v2) const
-		{
-			typename Vector1::first_type::const_iterator i_idx;
-			typename Vector1::second_type::const_iterator i_elt;
-
-			uint64_t y = 0;
-			uint64_t t;
-
-			for (i_idx = v1.first.begin (), i_elt = v1.second.begin (); i_idx != v1.first.end (); ++i_idx, ++i_elt)
-			{
-				t = ( (uint64_t) *i_elt ) * ( (uint64_t) v2[*i_idx] );
-				y += t;
-
-				if (y < t)
-					y += faxpy()._two_64;
-			}
-
-
-			y %= (uint64_t) field().characteristic();
-
-			return res = (Element) y;
-		}	
-
-
-
-
-
-	template<typename Compute_t> template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<uint8_t,Compute_t> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<uint8_t,Compute_t> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::DenseVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j, j_end;
-		typename Matrix::Column::const_iterator k;
-		std::vector<uint32_t>::iterator l, l_end;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () + (ptrdiff_t)w.size (), 0);
-
-		l_end = _tmp.begin () +(ptrdiff_t) w.size ();
-
-		do {
-			j = v.begin ();
-			j_end = j + __LINBOX_MIN (A->coldim (), VD.faxpy()._k);
-
-			for (; j != j_end; ++j, ++i)
-				for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
-					*l += *k * *j;
-
-			j_end += __LINBOX_MIN (A->coldim () - (j_end - v.begin ()), VD.faxpy()._k);
-
-			for (l =_tmp.begin (); l != l_end; ++l)
-				*l %= VD.field ().characteristic();
-
-		} while (j_end != v.end ());
-
-		typename Vector1::iterator w_j;
-
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l;
-
-		return w;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<uint8_t,Compute_t> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<uint8_t,Compute_t> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::SparseSequenceVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j, j_end;
-		typename Matrix::Column::const_iterator k;
-		std::vector<uint32_t>::iterator l, l_end;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () + (ptrdiff_t)w.size (), 0);
-
-		l_end = _tmp.begin () + (ptrdiff_t)w.size ();
-
-
-		do {
-			j = v.begin ();
-			j_end = j + __LINBOX_MIN (A->coldim (), VD.faxpy()._k);
-
-			for (; j != j_end; ++j, ++i)
-				for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
-					_tmp[k->first] += k->second * *j;
-
-			j_end += __LINBOX_MIN (A->coldim () - (j_end - v.begin ()), VD.faxpy()._k);
-
-			for (l =_tmp.begin (); l != l_end; ++l)
-				*l %= VD.field ().characteristic();
-
-		} while (j_end != v.end ());
-
-		typename Vector1::iterator w_j;
-
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l;
-
-		return w;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<uint8_t,Compute_t> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<uint8_t,Compute_t> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::SparseAssociativeVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j, j_end;
-		typename Matrix::Column::const_iterator k;
-		std::vector<uint32_t>::iterator l, l_end;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () + (ptrdiff_t)w.size (), 0);
-
-		l_end = _tmp.begin () +(ptrdiff_t) w.size ();
-
-		do {
-			j = v.begin ();
-			j_end = j + __LINBOX_MIN (A->coldim (), VD.faxpy()._k);
-
-			for (; j != j_end; ++j, ++i)
-				for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
-					_tmp[k->first] += k->second * *j;
-
-			j_end += __LINBOX_MIN (A->coldim () - (j_end - v.begin ()), VD.faxpy()._k);
-
-			for (l =_tmp.begin (); l != l_end; ++l)
-				*l %= VD.field ().characteristic();
-
-		} while (j_end != v.end ());
-
-		typename Vector1::iterator w_j;
-
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l;
-
-		return w;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<uint8_t,Compute_t> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<uint8_t,Compute_t> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::SparseParallelVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j, j_end;
-		typename Matrix::Column::first_type::const_iterator k_idx;
-		typename Matrix::Column::second_type::const_iterator k_elt;
-		std::vector<uint32_t>::iterator l, l_end;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () + (ptrdiff_t)w.size (), 0);
-
-		l_end = _tmp.begin () + (ptrdiff_t)w.size ();
-
-		do {
-			j = v.begin ();
-			j_end = j + (ptrdiff_t)__LINBOX_MIN (uint64_t (A.coldim ()), VD.faxpy()._k);
-
-			for (; j != j_end; ++j, ++i)
-				for (k_idx = i->first.begin (), k_elt = i->second.begin (), l = _tmp.begin ();
-				     k_idx != i->first.end ();
-				     ++k_idx, ++k_elt, ++l)
-					_tmp[*k_idx] += *k_elt * *j;
-
-			j_end += (ptrdiff_t) __LINBOX_MIN (uint64_t (A.coldim () - (size_t)(j_end - v.begin ())), VD.faxpy()._k);
-
-			for (l =_tmp.begin (); l != l_end; ++l)
-				*l %= VD.field ().characteristic();
-
-		} while (j_end != v.end ());
-
-		typename Vector1::iterator w_j;
-		typedef typename Vector1::value_type val_t ;
-
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = (val_t) *l;
-
-		return w;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<uint16_t,Compute_t> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<uint16_t,Compute_t> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::DenseVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j = v.begin (), j_end;
-		typename Matrix::Column::const_iterator k;
-		// Dan Roche, 7-1-04
-		// std::vector<uint32_t>::iterator l, l_end;
-		std::vector<uint64_t>::iterator l, l_end;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
-
-		l_end = _tmp.begin () +(ptrdiff_t) w.size ();
-
-		do {
-			j = v.begin ();
-			j_end = j + __LINBOX_MIN (A->coldim (), VD.faxpy()._k);
-
-			for (; j != j_end; ++j, ++i)
-				for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
-					*l += *k * *j;
-
-			j_end += __LINBOX_MIN (A->coldim () - (j_end - v.begin ()), VD.faxpy()._k);
-
-			for (l =_tmp.begin (); l != l_end; ++l)
-				*l %= VD.field ().characteristic();
-
-		} while (j_end != v.end ());
-
-		typename Vector1::iterator w_j;
-
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l;
-
-		return w;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<uint16_t,Compute_t> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<uint16_t,Compute_t> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::SparseSequenceVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j, j_end;
-		typename Matrix::Column::const_iterator k;
-		// Dan Roche, 7-1-04
-		// std::vector<uint32_t>::iterator l, l_end;
-		std::vector<uint64_t>::iterator l, l_end;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
-
-		l_end = _tmp.begin () +(ptrdiff_t) w.size ();
-
-		do {
-			j = v.begin ();
-			j_end = j + __LINBOX_MIN (A->coldim (), VD.faxpy()._k);
-
-			for (; j != j_end; ++j, ++i)
-				for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
-					_tmp[k->first] += k->second * *j;
-
-			j_end += __LINBOX_MIN (A->coldim () - (j_end - v.begin ()), VD.faxpy()._k);
-
-			for (l =_tmp.begin (); l != l_end; ++l)
-				*l %= VD.field ().characteristic();
-
-		} while (j_end != v.end ());
-
-		typename Vector1::iterator w_j;
-
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l;
-
-		return w;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<uint16_t,Compute_t> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<uint16_t,Compute_t> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::SparseAssociativeVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j, j_end;
-		typename Matrix::Column::const_iterator k;
-		// Dan Roche, 7-1-04
-		// std::vector<uint32_t>::iterator l, l_end;
-		std::vector<uint64_t>::iterator l, l_end;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
-
-		l_end = _tmp.begin () +(ptrdiff_t) w.size ();
-
-		do {
-			j = v.begin ();
-			j_end = j + __LINBOX_MIN (A->coldim (), VD.faxpy()._k);
-
-			for (; j != j_end; ++j, ++i)
-				for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l)
-					_tmp[k->first] += k->second * *j;
-
-			j_end += __LINBOX_MIN (A->coldim () - (j_end - v.begin ()), VD.faxpy()._k);
-
-			for (l =_tmp.begin (); l != l_end; ++l)
-				*l %= VD.field ().characteristic();
-
-		} while (j_end != v.end ());
-
-		typename Vector1::iterator w_j;
-
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l;
-
-		return w;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<uint16_t,Compute_t> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<uint16_t,Compute_t> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::SparseParallelVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j, j_end;
-		typename Matrix::Column::first_type::const_iterator k_idx;
-		typename Matrix::Column::second_type::const_iterator k_elt;
-		// Dan Roche, 7-1-04
-		// std::vector<uint32_t>::iterator l, l_end;
-		std::vector<uint64_t>::iterator l, l_end;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
-
-		l_end = _tmp.begin () +(ptrdiff_t) w.size ();
-
-		do {
-			j = v.begin ();
-			//Dan Roche, 7-2-04
-			//j_end = j + __LINBOX_MIN (A->coldim (), VD.faxpy()._k);
-			j_end = j + __LINBOX_MIN (A.coldim (), VD.faxpy()._k);
-
-			for (; j != j_end; ++j, ++i)
-				for (k_idx = i->first.begin (), k_elt = i->second.begin (), l = _tmp.begin ();
-				     k_idx != i->first.end ();
-				     ++k_idx, ++k_elt, ++l)
-					_tmp[*k_idx] += *k_elt * *j;
-
-			//j_end += __LINBOX_MIN (A->coldim () - (j_end - v.begin ()), VD.faxpy()._k);
-			j_end += __LINBOX_MIN (A.coldim () - (j_end - v.begin ()), VD.faxpy()._k);
-
-			for (l =_tmp.begin (); l != l_end; ++l)
-				*l %= VD.field ().characteristic();
-
-		} while (j_end != v.end ());
-
-		typename Vector1::iterator w_j;
-
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = *l;
-
-		return w;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<uint32_t,Compute_t> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<uint32_t,Compute_t> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::DenseVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j;
-		typename Matrix::Column::const_iterator k;
-		std::vector<uint64_t>::iterator l;
-
-		uint64_t t;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
-
-		for (j = v.begin (); j != v.end (); ++j, ++i) {
-			for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l) {
-				t = ((uint64_t) *k) * ((uint64_t) *j);
-
-				*l += t;
-
-				if (*l < t)
-					*l += VD.faxpy()._two_64;
-			}
-		}
-
-		typename Vector1::iterator w_j;
-		typedef typename Vector1::value_type element;
-
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = (element)(*l % VD.field ().characteristic());
-
-		return w;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<uint32_t,Compute_t> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<uint32_t,Compute_t> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::SparseSequenceVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j;
-		typename Matrix::Column::const_iterator k;
-		std::vector<uint64_t>::iterator l;
-
-		uint64_t t;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () + (ptrdiff_t) w.size (), 0);
-
-		for (j = v.begin (); j != v.end (); ++j, ++i) {
-			for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l) {
-				t = ((uint64_t) k->second) * ((uint64_t) *j);
-
-				_tmp[k->first] += t;
-
-				if (_tmp[k->first] < t)
-					_tmp[k->first] += VD.faxpy()._two_64;
-			}
-		}
-
-		typename Vector1::iterator             w_j;
-		typedef typename Vector1::value_type val_t;
-
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = val_t(*l % VD.field ().characteristic());
-
-		return w;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<uint32_t,Compute_t> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<uint32_t,Compute_t> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::SparseAssociativeVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j;
-		typename Matrix::Column::const_iterator k;
-		std::vector<uint64_t>::iterator l;
-
-		uint64_t t;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
-
-		for (j = v.begin (); j != v.end (); ++j, ++i) {
-			for (k = i->begin (), l = _tmp.begin (); k != i->end (); ++k, ++l) {
-				t = ((uint64_t) k->second) * ((uint64_t) *j);
-
-				_tmp[k->first] += t;
-
-				if (_tmp[k->first] < t)
-					_tmp[k->first] += VD.faxpy()._two_64;
-			}
-		}
-
-		typename Vector1::iterator w_j;
-
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = (uint32_t) (uint32_t)*l % VD.field ().characteristic();
-
-		return w;
-	}
-
-	template<typename Compute_t> template <class Vector1, class Matrix, class Vector2>
-	Vector1 &MVProductDomain<Givaro::Modular<uint32_t,Compute_t> >::mulColDenseSpecialized
-	(const VectorDomain<Givaro::Modular<uint32_t,Compute_t> > &VD, Vector1 &w, const Matrix &A, const Vector2 &v,
-	 VectorCategories::SparseParallelVectorTag) const
-	{
-		linbox_check (A.coldim () == v.size ());
-		linbox_check (A.rowdim () == w.size ());
-
-		typename Matrix::ConstColIterator i = A.colBegin ();
-		typename Vector2::const_iterator j;
-		typename Matrix::Column::first_type::const_iterator k_idx;
-		typename Matrix::Column::second_type::const_iterator k_elt;
-		std::vector<uint64_t>::iterator l;
-
-		uint64_t t;
-
-		if (_tmp.size () < w.size ())
-			_tmp.resize (w.size ());
-
-		std::fill (_tmp.begin (), _tmp.begin () +(ptrdiff_t) w.size (), 0);
-
-		for (j = v.begin (); j != v.end (); ++j, ++i) {
-			for (k_idx = i->first.begin (), k_elt = i->second.begin (), l = _tmp.begin ();
-			     k_idx != i->first.end ();
-			     ++k_idx, ++k_elt, ++l)
-			{
-				t = ((uint64_t) *k_elt) * ((uint64_t) *j);
-
-				_tmp[*k_idx] += t;
-
-				if (_tmp[*k_idx] < t)
-					_tmp[*k_idx] += VD.faxpy()._two_64;
-			}
-		}
-
-		typename Vector1::iterator     w_j;
-		typedef typename Vector1::value_type val_t;
-
-		for (w_j = w.begin (), l = _tmp.begin (); w_j != w.end (); ++w_j, ++l)
-			*w_j = val_t(*l % VD.field ().characteristic());
-
-		return w;
-	}
-
-}
-
-#endif // __LINBOX_field_modular_INL
-
-
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,:0,t0,+0,=s
-// Local Variables:
-// mode: C++
-// tab-width: 8
-// indent-tabs-mode: nil
-// c-basic-offset: 8
-// End:
-
diff --git a/linbox/ring/ntl/ntl-gf2e.h b/linbox/ring/ntl/ntl-gf2e.h
index 77b0457..67e6fd3 100644
--- a/linbox/ring/ntl/ntl-gf2e.h
+++ b/linbox/ring/ntl/ntl-gf2e.h
@@ -128,7 +128,7 @@ namespace LinBox
 
 		const Element zero,one,mOne ;
 
-		NTL_GF2E (const integer &p, const integer &k) :
+		NTL_GF2E (const integer &p, const int32_t &k) :
 			NTL_GF2E_Initialiser(p,k),Father_t ()
 			,zero( NTL::to_GF2E(0)),one( NTL::to_GF2E(1)),mOne(-one)
 		{ }
diff --git a/linbox/ring/ntl/ntl-lzz_pex.h b/linbox/ring/ntl/ntl-lzz_pex.h
index a34fef5..d047f53 100644
--- a/linbox/ring/ntl/ntl-lzz_pex.h
+++ b/linbox/ring/ntl/ntl-lzz_pex.h
@@ -92,7 +92,7 @@ namespace LinBox
 		/** Standard LinBox field constructor.  The paramters here
 		 * (prime, exponent) are only used to initialize the coefficient field.
 		 */
-		NTL_zz_pEX( const integer& p, size_t e = 1 ) :
+		NTL_zz_pEX( const integer& p, int32_t e = 1 ) :
 			// Givaro::ZRing<NTL::zz_pEX>(p, e), _CField(p,e)
 			NTL_zz_pEX_Initialiser(p,e),Father_t ()
 			, zero( NTL::to_zz_pEX(0)),one( NTL::to_zz_pEX(1)),mOne(-one)
diff --git a/linbox/ring/ntl/ntl-zz_p.h b/linbox/ring/ntl/ntl-zz_p.h
index dda2dc6..b1d3067 100644
--- a/linbox/ring/ntl/ntl-zz_p.h
+++ b/linbox/ring/ntl/ntl-zz_p.h
@@ -81,7 +81,7 @@ namespace Givaro
 
 		x = 0;
 		for (ptrdiff_t i = 0; i < nb; i++) {
-			x += Integer( txt[i] )<<(8*i) ;
+			x += Integer( txt[i] )<< int32_t(8*i) ;
 		}
 		delete [] txt;
 		return x;
@@ -283,7 +283,7 @@ namespace LinBox
 
 			x = 0;
 			for (ptrdiff_t i = 0; i < nb; i++) {
-				x += LinBox::integer( txt[i] )<<(8*i) ;
+				x += LinBox::integer( txt[i] )<<int32_t(8*i) ;
 			}
 			delete [] txt;
 			return x;
diff --git a/linbox/ring/ntl/ntl-zz_pe.h b/linbox/ring/ntl/ntl-zz_pe.h
index f61d9a7..a540d3e 100644
--- a/linbox/ring/ntl/ntl-zz_pe.h
+++ b/linbox/ring/ntl/ntl-zz_pe.h
@@ -139,7 +139,7 @@ namespace LinBox
 		const Element zero,one,mOne ;
 
 
-		NTL_ZZ_pE (const integer &p, const integer &k) :
+		NTL_ZZ_pE (const integer &p, const int32_t &k) :
 			NTL_ZZ_pE_Initialiser(p,k),Father_t ()
 			,zero( NTL::to_ZZ_pE(0)),one( NTL::to_ZZ_pE(1)),mOne(-one)
 
@@ -370,8 +370,8 @@ namespace LinBox
 	public:
 		typedef NTL::ZZ_pE Element;
 		UnparametricRandIter<NTL::ZZ_pE>(const NTL_ZZ_pE & F ,
-						 const integer& size =0,
-						 const integer& seed =0
+						 const int32_t& size =0,
+						 const int32_t& seed =0
 						) :
                 _size(size), _seed(seed), _ring(F)
 		{
diff --git a/linbox/ring/pir-modular-int32.h b/linbox/ring/pir-modular-int32.h
index 4b7d4d3..002d1f2 100644
--- a/linbox/ring/pir-modular-int32.h
+++ b/linbox/ring/pir-modular-int32.h
@@ -26,6 +26,8 @@
 #define __LINBOX_pir_modular_int32_H
 
 #include <givaro/modular-int32.h>
+//#include <linbox/util/debug.h>
+#include <linbox/vector/vector-domain.h>
 
 //#include "linbox/ring/modular.h"
 #ifndef LINBOX_MAX_INT
diff --git a/linbox/ring/pir-ntl-zz_p.h b/linbox/ring/pir-ntl-zz_p.h
index c4bca76..673832e 100644
--- a/linbox/ring/pir-ntl-zz_p.h
+++ b/linbox/ring/pir-ntl-zz_p.h
@@ -643,7 +643,7 @@ namespace LinBox
 	class DotProductDomain;
 
 	template <>
-	class DotProductDomain<PIR_ntl_ZZ_p> : private  VectorDomainBase<PIR_ntl_ZZ_p> {
+	class DotProductDomain<PIR_ntl_ZZ_p> : public  VectorDomainBase<PIR_ntl_ZZ_p> {
 
 	public:
 		typedef PIR_ntl_ZZ_p::Element Element;
diff --git a/linbox/solutions/smith-form.h b/linbox/solutions/smith-form.h
index 94d5a05..9b62d20 100644
--- a/linbox/solutions/smith-form.h
+++ b/linbox/solutions/smith-form.h
@@ -39,25 +39,32 @@
 
 namespace LinBox
 {
-	//! no doc.
-	template<class I1, class Lp>
-	void distinct (I1 a, I1 b, Lp& c)
+
+	// EC: pair(e,c) denotes c repetitions of element e.
+#define EC(Elt) std::pair<typename Elt, size_t>
+	// EC_LIST: list of such pairs, compact form of invariant list.
+#define EC_LIST(Elt) std::list<EC(Elt) > 
+	// Convert from vector of invariants (with repeats) to EC_LIST form.
+	template<class Ring> // Ring: ring of v's entries; only equal ADJACENT entries are merged into one pair
+	EC_LIST(Ring::Element) & 
+	distinct(EC_LIST(Ring::Element) & c, const BlasVector<Ring>& v) // pairs are appended to c; c is not cleared first
 	{
-		typename iterator_traits<I1>::value_type e;
+		typename Ring::Element e; // value of the run currently being counted
 		size_t count = 0;
-		if (a != b) {e = *a; ++a; count = 1;}
-		else return;
-		while (a != b)
-		{	if (*a == e)
-			++count;
+		const Ring& R = v.field();
+		size_t n = v.size();
+		if (n > 0) R.assign(e, v[0]); else return c; // empty v: return c untouched
+		count = 1;
+		for (size_t i = 1; i < v.size(); ++i) 
+		{	if (R.areEqual(v[i], e))
+				++count;
 			else
-			{	c.push_back(typename Lp::value_type(e, count));
-				e = *a; count = 1;
+			{	c.push_back(EC(Ring::Element)(e, count)); // run ended: record (value, repetitions)
+				R.assign(e, v[i]); count = 1; // start a new run at v[i]
 			}
-			++a;
 		}
-		c.push_back(typename Lp::value_type(e, count));
-		return;
+		c.push_back(EC(Ring::Element)(e, count)); // record the final run
+		return c;
 	}
 
 
@@ -75,14 +82,37 @@ namespace LinBox
 	 For now see the examples/smith.C
 	 for ways to call other smith form algorithms.
 	 */
-	template <class Output, class Blackbox, class MyMethod>
-	Output &smithForm(Output & S,
-			  const Blackbox                              &A,
-			  const MyMethod                           &M)
+	/*
+	BB has to be dense matrix
+	PL means EC_list (list of value repcount pairs)
+	VL means diag of smith form as a BlasVector.
+	SNF function forms:
+	template<BB> smithForm(PL, BB) -> add Hybrid
+	template<BB> smithForm(VL, BB) -> add Hybrid
+	template<BB,Meth> smithForm(PL, BB, Meth) -> add IntegerTag
+	template<BB,Meth> smithForm(VL, BB, Meth) -> add IntegerTag
+	smithForm(PL, BB, IntegerTag, Hybrid) -> call adaptive
+	smithForm(VL, BB, IntegerTag, Hybrid) -> call adaptive
+	*/
+
+	template <class Blackbox, class Method>
+	EC_LIST(Blackbox::Field::Element) & 
+	smithForm(EC_LIST(Blackbox::Field::Element) & S,
+			  const Blackbox                     & A,
+			  const Method                     & M)
 	{
 		smithForm(S, A, typename FieldTraits<typename Blackbox::Field>::categoryTag(), M);
 		return S;
 	}
+	template <class Blackbox, class Method>
+	BlasVector<typename Blackbox::Field> & 
+	smithForm(BlasVector<typename Blackbox::Field> & V,
+			  const Blackbox                     & A,
+			  const Method                     & M)
+	{
+		smithForm(V, A, typename FieldTraits<typename Blackbox::Field>::categoryTag(), M);
+		return V;
+	}
 
 #if 0
 	// for specialization with respect to the DomainCategory
@@ -97,14 +127,22 @@ namespace LinBox
 
 #endif
 	// The smithForm with default Method
-	template<class Output, class Blackbox>
-	Output &smithForm(Output& S,
+	template<class Blackbox>
+	EC_LIST(Blackbox::Field::Element) & 
+	smithForm(EC_LIST(Blackbox::Field::Element) & S,
 			  const Blackbox& A)
 	{
-
 		smithForm(S, A, Method::Hybrid());
 		return S;
 	}
+	template<class Blackbox>
+	BlasVector<typename Blackbox::Field> & 
+	smithForm(BlasVector<typename Blackbox::Field> & V,
+			  const Blackbox& A)
+	{
+		smithForm(V, A, Method::Hybrid());
+		return V;
+	}
 
 #if 0
 	// The smithForm for ModularTag
@@ -125,7 +163,7 @@ namespace LinBox
 		}
 		else
 		{
-			integr x; size_t c;
+			integer x; size_t c;
 			for(x = p, c = 0; divides(2, x); x /= 2, ++c);
 
 			if (x == 1 && c <= 32) // (a low power of 2)
@@ -157,8 +195,8 @@ namespace LinBox
 	std::list<std::pair<integer, size_t> > &
 	smithForm(std::list<std::pair<integer, size_t> >& S,
 	*/
-	template<class Output> Output&
-	smithForm(Output & S,
+	inline EC_LIST(Givaro::ZRing<Integer>::Element) & // inline: non-template function defined in a header
+	smithForm(EC_LIST(Givaro::ZRing<Integer>::Element) & S, // S receives the (invariant, repetitions) pairs
 		  const BlasMatrix<Givaro::ZRing<Integer> > 	&A,
 		  const RingCategories::IntegerTag      &tag,
 		  const Method::Hybrid			& M)
@@ -166,9 +204,18 @@ namespace LinBox
 		Givaro::ZRing<Integer> Z;
 		BlasVector<Givaro::ZRing<Integer> > v (Z,A.rowdim() < A.coldim() ? A.rowdim() : A.coldim());
 		SmithFormAdaptive::smithForm(v, A);
-		distinct(v.begin(), v.end(), S);
-
-		return S;
+		// v has min(rowdim, coldim) entries; distinct() compresses its repeats into S.
+		return distinct(S,v);
+	}
+	inline BlasVector<typename Givaro::ZRing<Integer> > & // inline: non-template function defined in a header
+	smithForm(BlasVector<typename Givaro::ZRing<Integer> > & V,
+		  const BlasMatrix<Givaro::ZRing<Integer> > 	&A,
+		  const RingCategories::IntegerTag      &tag,
+		  const Method::Hybrid			& M)
+	{
+		// V goes straight to the adaptive algorithm; tag and M only select this overload.
+		SmithFormAdaptive::smithForm(V, A);
+		return V;
+	}
 
 //#endif
@@ -185,6 +232,8 @@ namespace LinBox
 	}
 #endif
 
+#undef EC 
+#undef EC_LIST
 
 } // end of LinBox namespace
 #endif // __LINBOX_smith_form_H
diff --git a/linbox/util/Makefile.am b/linbox/util/Makefile.am
index 7feebe7..32786c9 100644
--- a/linbox/util/Makefile.am
+++ b/linbox/util/Makefile.am
@@ -21,7 +21,7 @@
 
 #we now need to include givaro headers for timer ?
 
-AM_CPPFLAGS= -I$(top_srcdir)/linbox $(DEPS_CFLAGS)
+AM_CPPFLAGS= -I$(top_srcdir)/linbox $(DEPS_CFLAGS) $(DEFAULT_CFLAGS)
 LDADD = $(DEPS_LIBS) $(LDFLAGS)
 
 SUBDIRS=formats
diff --git a/linbox/vector/blas-vector.h b/linbox/vector/blas-vector.h
index 8dcdef5..4ee7675 100644
--- a/linbox/vector/blas-vector.h
+++ b/linbox/vector/blas-vector.h
@@ -198,7 +198,7 @@ namespace LinBox { /* BlasVector */
 #if (__GNUC__ == 4 && __GNUC_MINOR__ ==4 && __GNUC_PATCHLEVEL__==5)
 		BlasVector (const _Field &F, const long &m, const Element e=Element()) :
 			Father_t(),
-			_size((size_t)m),_1stride(1),_rep((size_t)_size, e),_ptr(&_rep[0]),_field(&F)
+			_size((uint32_t)m),_1stride(1),_rep(_size, e),_ptr(&_rep[0]),_field(&F)
 		{
 			// Father_t is garbage until then:
 			setIterators();
@@ -212,7 +212,7 @@ namespace LinBox { /* BlasVector */
 #if defined(__APPLE__) || (defined(__s390__) && !defined(__s390x__))
 		BlasVector (const _Field &F, const unsigned long &m, const Element e=Element())  :
 			Father_t(),
-			_size((size_t)m),_1stride(1),_rep((size_t)_size, e),_ptr(&_rep[0]),_field(&F)
+			_size((uint32_t)m),_1stride(1),_rep(_size, e),_ptr(&_rep[0]),_field(&F)
 		{
 			// Father_t is garbage until then:
 			setIterators();
@@ -273,7 +273,7 @@ namespace LinBox { /* BlasVector */
 
 		BlasVector (const _Field &F, const Integer & m, const Element e=Element())  :
 			Father_t(),
-			_size((size_t)m),_1stride(1),_rep((size_t)_size, e),_ptr(&_rep[0]),_field(&F)
+			_size((uint32_t)m),_1stride(1),_rep(_size, e),_ptr(&_rep[0]),_field(&F)
 		{
 	// Father_t is garbage until then:
 			setIterators();
diff --git a/macros/fflas-ffpack-check.m4 b/macros/fflas-ffpack-check.m4
index ec18e85..8ce2f6f 100644
--- a/macros/fflas-ffpack-check.m4
+++ b/macros/fflas-ffpack-check.m4
@@ -58,7 +58,7 @@ dnl FFLAS-FFPACK VERSION dnl
 dnl -------------------- dnl
 
 version_min=20200
-version_max=20202
+version_max=20300
 
 
 dnl Check for existence
@@ -75,7 +75,7 @@ for FFLAS_FFPACK_HOME in ${FFLAS_FFPACK_HOME_PATH}
 	FFLAS_FFPACK_LIBS=`$FFLAS_FFPACK_HOME/bin/fflas-ffpack-config --libs`
 	FFLAS_FFPACK_CFLAGS=`$FFLAS_FFPACK_HOME/bin/fflas-ffpack-config --cflags`
 
-       CXXFLAGS="${BACKUP_CXXFLAGS} ${FFLAS_FFPACK_CFLAGS}"
+       CXXFLAGS="${BACKUP_CXXFLAGS} ${FFLAS_FFPACK_CFLAGS} -O2"
        LIBS="${BACKUP_LIBS} ${FFLAS_FFPACK_LIBS}"
 
        AC_TRY_LINK(
diff --git a/tests/.gitignore b/tests/.gitignore
index 8158140..174a91a 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -1,4 +1,4 @@
-checker
+machecker
 test-bitonic-sort
 test-blackbox-block-container
 test-blas-domain
diff --git a/tests/jenkins-maker.sh b/tests/jenkins-maker.sh
new file mode 100755
index 0000000..ee3b3d5
--- /dev/null
+++ b/tests/jenkins-maker.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# This file is part of the LinBox library.
+# It is distributed under the terms of the LGPL licence version 2.1 or later 
+# (see COPYING)
+# Created by AB - 2014/12/03
+# Modified by AC - 2016/06/20
+# Modified by CP - 2016/06/22
+
+# Some influential environment variables:
+#	CXX			C++ compiler command
+#	CXXFLAGS	C++ compiler flags
+
+# Note: This script is intended to be launched
+# by the Jenkins web interface whenever it needs
+# to compile the project.
+# It is launched from the svn:trunk root directory.
+# But should be stored in /<slave_jenkins_path>/makers/
+
+SOURCE_DIRECTORY=$( cd "$( dirname "$0" )" && pwd )
+
+#=============================#
+# Change only these variables #
+#=============================#
+CXX=`pwd | awk -F/ '{print $(NF-2)}'`
+NTL=`pwd | awk -F/ '{print $NF}'`
+JENKINS_DIR=${SOURCE_DIRECTORY%%/workspace/*}
+LOCAL_DIR="$JENKINS_DIR"/local
+# Add path to compilers (if needed)
+export PATH=$PATH:/usr/local/bin:"$LOCAL_DIR/$CXX/bin"
+echo $PATH
+# Add specific locations (if needed)
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH":/usr/local/lib:"$LOCAL_DIR/$CXX/lib":"$LOCAL_DIR/$CXX/withSSE/lib"
+echo "LD_LIBRARY_PATH = $LD_LIBRARY_PATH"
+export PKG_CONFIG_PATH=${PKG_CONFIG_PATH}:"$LOCAL_DIR/$CXX/lib/pkgconfig":"$LOCAL_DIR/$CXX/withSSE/lib/pkgconfig"
+echo "PKG_CONFIG_PATH = $PKG_CONFIG_PATH"
+
+# Where to install linbox binaries
+# Keep default for local installation.
+PREFIX_INSTALL="$LOCAL_DIR/$CXX"
+
+# Job Linbox with Ntl option flag
+if [ "$NTL" == "withNTL" ]; then
+  LINBOX_NTLFLAG="--with-ntl=$PREFIX_INSTALL"
+fi
+
+# /!\ Warning /!\ This could be an issue if you changed
+# the local installation directory
+rm -rf "$PREFIX_INSTALL"/bin/linbox* "$PREFIX_INSTALL"/include/linbox* "$PREFIX_INSTALL"/lib/liblinbox*
+
+#================#
+# Setup Variables#
+#================#
+
+if [ "$CXX" == "icpc" ]; then
+     distribution=`uname -m`
+     if [ "$distribution" == "i686" ]; then 	
+	source /usr/local/bin/compilervars.sh ia32
+     else
+	source /usr/local/bin/compilervars.sh intel64
+     fi
+fi
+
+# Particular case for Fedora23: g++=g++-5.3
+#vm_name=`uname -n | cut -d"-" -f1`
+#if [[ "$vm_name" == "fedora" && "$CXX" == "g++-5.3" ]]; then
+#   CXX="g++"
+#fi
+
+#==================================#
+# Automated installation and tests #
+#==================================#
+
+echo "|=== JENKINS AUTOMATED SCRIPT ===| ./autogen.sh CXX=$CXX CXXFLAGS=$CXXFLAGS --prefix=$PREFIX_INSTALL $LINBOX_NTLFLAG $LINBOX_FFLASFFPACKFLAG"
+./autogen.sh CXX="$CXX" CXXFLAGS="$CXXFLAGS" --prefix="$PREFIX_INSTALL" ${LINBOX_NTLFLAG:+"$LINBOX_NTLFLAG"} ${LINBOX_FFLASFFPACKFLAG:+"$LINBOX_FFLASFFPACKFLAG"}  # optional flags are omitted entirely when empty
+V="$?"; if test "x$V" != "x0"; then exit "$V"; fi  # stop with autogen's exit status on failure
+
+echo "|=== JENKINS AUTOMATED SCRIPT ===| make install"
+make install
+V="$?"; if test "x$V" != "x0"; then exit "$V"; fi  # stop with make's exit status on failure
+
+echo "|=== JENKINS AUTOMATED SCRIPT ===| make perfpublisher"
+make perfpublisher  # exit status is not checked here, unlike the other steps
+
+echo "|=== JENKINS AUTOMATED SCRIPT ===| make examples"
+make examples
+V="$?"; if test "x$V" != "x0"; then exit "$V"; fi  # stop with make's exit status on failure
+
diff --git a/tests/perfpublisher.sh b/tests/perfpublisher.sh
index 2c3c452..8c751de 100755
--- a/tests/perfpublisher.sh
+++ b/tests/perfpublisher.sh
@@ -8,12 +8,24 @@ XMLFILE=$1
 tests=$2
 COMPILER=$3
 
+# Pick the date command: gdate when it is installed (e.g. on OS X), plain date otherwise.
+if command -v "gdate" >/dev/null; then  # true only when a gdate command is found
+    DATE=gdate  # NOTE(review): presumably needed for the +%s%3N timestamps taken via $DATE — confirm
+else
+    DATE=date
+fi
 #=================#
 # Plateform infos #
 #=================#
 
 COMPILERVERSION=$($COMPILER --version 2>&1 | head -1)
-CPUFREQ=$(lscpu | grep "MHz" | rev | cut -f1 -d' ' | rev)
+
+if command -v "lscpu" >/dev/null; then
+    CPUFREQ=$(lscpu | grep "MHz" | rev | cut -f1 -d' ' | rev)
+else
+    CPUFREQ=$((`sysctl -n hw.cpufrequency`/1000000))
+fi
+
 ARCH=$(uname -m)
 OSNAME=$(uname -s)
 OSVERSION=$(uname -r)
@@ -45,8 +57,8 @@ echo '<report name="tests-report" categ="tests">' >> $XMLFILE
 #=======#
 
 echo '<start>' >> $XMLFILE
-echo '<date format="YYYYMMDD" val="'$(date +%Y%m%d)'" />' >> $XMLFILE
-echo '<time format="HHMMSS" val="'$(date +%H%M%S)'" />' >> $XMLFILE
+echo '<date format="YYYYMMDD" val="'$($DATE +%Y%m%d)'" />' >> $XMLFILE
+echo '<time format="HHMMSS" val="'$($DATE +%H%M%S)'" />' >> $XMLFILE
 echo '</start>' >> $XMLFILE
 
 #=======#
@@ -59,9 +71,9 @@ do
 	then
 		#File does not exist: compile it
 		echo '[Compiling]' $test
-		COMPILESTART=$(date +%s%3N)
+		COMPILESTART=$($DATE +%s%3N)
 		COMPILELOG=$(make $test 2>&1; echo 'Returned state: '$?)
-		COMPILEEND=$(date +%s%3N)
+		COMPILEEND=$($DATE +%s%3N)
 		COMPILETIME=$(($COMPILEEND - $COMPILESTART))
 		COMPILECHECK=$(echo $COMPILELOG | grep -o '[^ ]*$')
 		COMPILETIMERELEVANT='true'
@@ -92,9 +104,9 @@ do
 		#Compilation success
 		echo '[Executing]' $test
 		EXECUTED='yes'
-		EXECUTIONSTART=$(date +%s%3N)
+		EXECUTIONSTART=$($DATE +%s%3N)
 		EXECUTIONLOG=$(./$test  2>&1; echo 'Returned state: '$?)
-		EXECUTIONEND=$(date +%s%3N)
+		EXECUTIONEND=$($DATE +%s%3N)
 		EXECUTIONTIME=$(($EXECUTIONEND - $EXECUTIONSTART))
 		EXECUTIONCHECK=$(echo $EXECUTIONLOG | grep -o '[^ ]*$')
 		
diff --git a/tests/test-charpoly.C b/tests/test-charpoly.C
index 348b470..381da8f 100644
--- a/tests/test-charpoly.C
+++ b/tests/test-charpoly.C
@@ -194,9 +194,9 @@ static bool testSageBug(){
 
         Givaro::ZRing<Givaro::Integer> Z;
         DenseMatrix<Givaro::ZRing<Givaro::Integer> > A(Z,4,4);
-        for (size_t i=0; i<4; ++i)
-                for (size_t j=0; j<4; ++j)
-                        A.setEntry(i,j, i*4+j+1);
+        for (uint32_t i=0; i<4; ++i)
+                for (uint32_t j=0; j<4; ++j)
+                        A.setEntry(i,j, Givaro::Integer(i*4+j+1));
         typedef BlasVector <Givaro::ZRing<Givaro::Integer> > Polynomial;
         Polynomial phi(Z);
         charpoly(phi,A);
diff --git a/tests/test-field.h b/tests/test-field.h
index b5189b7..959799e 100755
--- a/tests/test-field.h
+++ b/tests/test-field.h
@@ -1321,10 +1321,10 @@ namespace field_subtests {
 		// C++ ints. Otherwise, I don't know how to place the numbers into
 		// categories in any well-defined manner.
 		for (i = 0; i < num_trials; ++i) {
-			LinBox::integer ix, id;
+			LinBox::integer ix;
 			F.convert(ix, iter.random (x));
-                        
-                        LinBox::Integer ix2 = ix % num_categories;
+                        int32_t id;
+                        int32_t ix2 = ix % num_categories;
                         if (ix2<0) ix2+=num_categories;
 			categories1[ix2]++;
 			categories2[(unsigned int) (double (ix2) / double (card) * num_categories) %num_categories]++;
@@ -1337,7 +1337,7 @@ namespace field_subtests {
 				F.convert(id, F.sub (d, *x_queue_iter, x));
                                 id %= num_categories;
                                 if (id<0) id += num_categories;
-				(*diff_cat_iter)[(size_t) id]++;
+				(*diff_cat_iter)[id]++;
 			}
 
 			x_queue.push_front (x);
diff --git a/tests/test-order-basis.C b/tests/test-order-basis.C
index 18e0d0a..ea024e1 100644
--- a/tests/test-order-basis.C
+++ b/tests/test-order-basis.C
@@ -13,11 +13,13 @@
 using namespace LinBox;
 using namespace std;
 
-ostream& report = commentator().report();
+
+//ostream& report = commentator().report();
 //ostream& report = std::cout;
 
 template<typename Field, typename Mat>
 string check_sigma(const Field& F, const Mat& sigma,  Mat& serie, size_t ord){
+	ostream &report = commentator().report (Commentator::LEVEL_IMPORTANT, INTERNAL_DESCRIPTION);
 	Mat T(F,sigma.rowdim(),serie.coldim(),sigma.size()+serie.size()-1);
 	PolynomialMatrixMulDomain<Field> PMD(F);
 	PMD.mul(T,sigma,serie);
@@ -30,11 +32,11 @@ string check_sigma(const Field& F, const Mat& sigma,  Mat& serie, size_t ord){
 		i++;
 	}
 	if (i<ord){
-		cout<<"error at degree="<<i<<endl;
+		report<<"error at degree="<<i<<endl;
 		T[i].write(report, Tag::FileFormat::Plain);
-		cout<<"***"<<endl;
-		cout<<serie<<endl;
-		cout<<sigma<<endl;	
+		report<<"***"<<endl;
+		report<<serie<<endl;
+		report<<sigma<<endl;	
 	}
 	
 	
@@ -47,9 +49,10 @@ string check_sigma(const Field& F, const Mat& sigma,  Mat& serie, size_t ord){
 
 template<typename MatPol>
 bool operator==(const MatPol& A, const MatPol& B){
+	ostream &report = commentator().report (Commentator::LEVEL_IMPORTANT, INTERNAL_DESCRIPTION);
 	MatrixDomain<typename MatPol::Field> MD(A.field());
 	if (A.real_degree()!=B.real_degree()|| A.rowdim()!= B.rowdim() || A.coldim()!=B.coldim()){
-		cout<<A.size()<<"("<<A.rowdim()<<"x"<<A.coldim()<<") <> "
+		report<<A.size()<<"("<<A.rowdim()<<"x"<<A.coldim()<<") <> "
 		    <<B.size()<<"("<<B.rowdim()<<"x"<<B.coldim()<<") <> "<<endl;
 		return false;
 	}
@@ -58,8 +61,8 @@ bool operator==(const MatPol& A, const MatPol& B){
 		i++;
 
 	if (i<=A.real_degree() && A.rowdim()<10 && A.coldim()<10){
-		cout<<"first:"<<endl<<A<<endl;
-		cout<<"second:"<<endl<<B<<endl;
+		report<<"first:"<<endl<<A<<endl;
+		report<<"second:"<<endl<<B<<endl;
 	}
 
 	return i>A.real_degree();
@@ -68,6 +71,7 @@ bool operator==(const MatPol& A, const MatPol& B){
 
 template<typename Field, typename RandIter>
 void check_sigma(const Field& F, RandIter& Gen, size_t m, size_t n, size_t d) {
+	ostream &report = commentator().report (Commentator::LEVEL_IMPORTANT, INTERNAL_DESCRIPTION);
 	//typedef typename Field::Element Element;
 	typedef PolynomialMatrix<PMType::matfirst,PMStorage::plain,Field> MatrixP;
 	//typedef PolynomialMatrix<PMType::polfirst,PMStorage::plain,Field> MatrixP;
@@ -90,17 +94,17 @@ void check_sigma(const Field& F, RandIter& Gen, size_t m, size_t n, size_t d) {
 
 	SB.M_Basis(Sigma3, Serie, d, shift3);
 	report << "M-Basis       : " <<check_sigma(F,Sigma3,Serie,d)<<endl;
-	SB.PM_Basis2(Sigma1,Serie, d, shift);
+	SB.PM_Basis(Sigma1,Serie, d, shift);
 	report << "PM-Basis      : " <<check_sigma(F,Sigma1,Serie,d)<<endl;
 	//SB.oPM_Basis(Sigma2, Serie, d, shift2);
 	//report << "PM-Basis iter : " <<check_sigma(F,Sigma2,Serie,d)<<endl;
 
 	// if (!(Sigma1==Sigma2)){
-	// cout<<"---> different basis for PM-Basis and PM-Basis iter"<<endl;
-	// cout<<Sigma1<<endl;
-	// cout<<Sigma2<<endl;
+	// report<<"---> different basis for PM-Basis and PM-Basis iter"<<endl;
+	// report<<Sigma1<<endl;
+	// report<<Sigma2<<endl;
 	// }
-	cout<<endl;
+	report<<endl;
 }
 
 int main(int argc, char** argv){
@@ -118,14 +122,17 @@ int main(int argc, char** argv){
 		{ 's', "-s s", "Set the random seed to a specific value", TYPE_INT, &seed},
 		END_OF_ARGUMENTS
 	};
-
+	
 	parseArguments (argc, argv, args);
 
 	typedef Givaro::Modular<double>              SmallField;	
 	typedef Givaro::Modular<Givaro::Integer>      LargeField;
 
 	size_t logd=integer((uint64_t)d).bitsize();
+	commentator().start ("Testing order basis computation", "testOrderBasis", 1);
+
 	
+	ostream &report = commentator().report (Commentator::LEVEL_ALWAYS, INTERNAL_DESCRIPTION);
 	report<<"###  matrix series is of size "<<m<<" x "<<n<<" of degree "<<d<<std::endl;
 	if (b < 26){
 		if (logd>b-2){
@@ -149,8 +156,7 @@ int main(int argc, char** argv){
 		check_sigma(F,G,m,n,d);
 	}
 
-
-	
+	commentator().stop (MSG_STATUS (true), (const char *) 0, "testOrderBasis"); 
 	return 0;
 }
 
diff --git a/tests/test-smith-form-adaptive.C b/tests/test-smith-form-adaptive.C
index f7f1aee..b8838a0 100644
--- a/tests/test-smith-form-adaptive.C
+++ b/tests/test-smith-form-adaptive.C
@@ -1,7 +1,7 @@
 /* Copyright (C) LinBox
  *
  *  Author: Zhendong Wan
- *
+ *  mods: bds
  *
  * ========LICENCE========
  * This file is part of the library LinBox.
@@ -40,191 +40,56 @@
 #include "linbox/util/commentator.h"
 #include "linbox/vector/stream.h"
 #include "linbox/algorithms/smith-form-adaptive.h"
-#include "test-common.h"
-using namespace LinBox; // fragile
-
-
-template <class Ring, class SmithForm, class Vector>
-bool testRandom(const Ring& R,
-		const SmithForm& SF,
-		LinBox::VectorStream<Vector>& stream1)
-{
-
-
-	std::ostringstream str;
-
-	str << "Testing the adaptive algorithm for Smith form computation:\n";
-
-	commentator().start (str.str ().c_str (), "testRandom");//, stream1.m ());
-
-	bool ret = true;
-
-	VectorDomain<Ring> VD (R);
-
-	Vector d(R), x(R);
-
-	VectorWrapper::ensureDim (d, stream1.n ());
-
-	VectorWrapper::ensureDim (x, stream1.n ());
-
-
-	int n = (int)d. size();
-
-	while (stream1) {
-
-		commentator().startIteration ((unsigned)stream1.j ());
-
-		std::ostream &report = commentator().report (Commentator::LEVEL_IMPORTANT, INTERNAL_DESCRIPTION);
-
-		bool iter_passed = true;
-
-		stream1.next (d);
-
-		report << "Input vector:  ";
-		VD.write (report, d);
-		report << endl;
-
-		BlasMatrix<Ring> D(R, n, n), L(R, n, n), U(R, n, n), A(R,n,n);
-
-		int i, j;
-
-		for(i = 0; i < n; ++i) {
-			R. assign (D[(size_t)i][(size_t)i], d[(size_t)i]);
-			R. assign(L[(size_t)i][(size_t)i], R.one);
-			R. assign (U[(size_t)i][(size_t)i], R.one);}
-
-			for (i = 0; i < n; ++ i)
-
-				for (j = 0; j < i; ++ j) {
-
-					R.init(L[(size_t)i][(size_t)j], rand() % 10);
-
-					R.init(U[(size_t)j][(size_t)i], rand() % 10);
-				}
-
-
-			BlasVector<Ring> tmp1(R,(size_t)n), tmp2(R,(size_t)n), e(R,(size_t)n);
-
-			typename BlasMatrix<Ring>::ColIterator col_p;
-
-			i = 0;
-			for (col_p = A.colBegin();
-			     col_p != A.colEnd(); ++ col_p, ++ i) {
-
-				R.assign(e[(size_t)i],R.one);
-				U.apply(tmp1, e);
-				D.apply(tmp2, tmp1);
-				// LinBox::BlasSubvector<BlasVector<Ring> > col_p_v (R, *col_p);
-				// L.apply(col_p_v, tmp2);
-				L.apply(*col_p, tmp2);
-				R.assign(e[(size_t)i],R.zero);
-			}
-
-
-
-			Givaro::ZRing<Integer> Z; //! why switch from Ring ????
-			BlasVector<Givaro::ZRing<Integer> > xi(Z,A. rowdim());
-
-			SF.smithForm (xi, A);
-			typename Vector::iterator x_p;
-			BlasVector<Givaro::ZRing<Integer> >::iterator xi_p;
-			for (x_p = x. begin(), xi_p = xi. begin(); x_p != x. end(); ++ x_p, ++ xi_p)
-				A. field (). init (*x_p, *xi_p);
-
+using namespace LinBox; 
 
-			report << "Computed Smith form: \n";
-
-			VD. write (report, x);
-
-			report << '\n';
-
-
-			typename BlasVector<Ring>::iterator p1, p2;
-			typename Ring::Element g;
-
-
-			for (p1 = d.begin(); p1 != d.end(); ++ p1) {
-
-				for ( p2 = p1 + 1; p2 != d.end(); ++ p2) {
-
-					if (R. isUnit(*p1))  break;
-
-					else if (R. isZero (*p2)) continue;
-
-					else if (R. isZero (*p1)) {
-						std::swap (*p1, *p2);
-					}
-
-					else {
-						R. gcd (g, *p1, *p2);
-
-						R. divin (*p2, g);
-
-						R. mulin (*p2, *p1);
-
-						R. assign (*p1, g);
-					}
-				}
-			}
-
-
-			report << "Expected smith form:\n";
-
-			VD.write (report, d);
-
-			report << '\n';
-
-			if (!VD.areEqual (d, x))
-
-				ret = iter_passed = false;
-
-			if (!iter_passed)
-
-				commentator().report (Commentator::LEVEL_IMPORTANT, INTERNAL_ERROR)
-				<< "ERROR: Computed Smith form is incorrect" << endl;
-
-
-
-			commentator().stop ("done");
-
-			commentator().progress ();
-
-	}
-
-	//stream1.reset ();
-
-	commentator().stop (MSG_STATUS (ret), (const char *) 0, "testRandom");
-
-	return ret;
-
-}
+#include "test-smith-form.h"
 
 int main(int argc, char** argv)
 {
 
 	bool pass = true;
+	static size_t m = 3;
 	static size_t n = 35;
-	static unsigned int iterations = 1;
 	static Argument args[] = {
+		{ 'm', "-m M", "Set order of test matrices to M.", TYPE_INT,  &m },
 		{ 'n', "-n N", "Set order of test matrices to N.", TYPE_INT,  &n },
-		{ 'i', "-i I", "Perform each test for I iterations.", TYPE_INT, &iterations },
 		END_OF_ARGUMENTS
 	};
 
 	parseArguments (argc, argv, args);
-	SmithFormAdaptive sf;
 
 	commentator().start("Smith form adaptive algorithm test suite", "EGV++");
-	commentator().getMessageClass (INTERNAL_DESCRIPTION).setMaxDepth (5);
 
-	//typedef NTL_ZZ Ring; Ring R;
-	typedef Givaro::ZRing<Integer> Ring; Ring R; Ring::RandIter gen(R);
-	RandomDenseStream<Ring> s1 (R, gen, n, iterations);
-	pass = testRandom(R, sf, s1);
+	//!@bug should be tried on NTL_ZZ too
+	typedef Givaro::ZRing<Integer> PIR;
+	PIR R;
+
+	size_t k = std::min(m,n);
+	DenseMatrix<PIR> A(R,m,n);
+	BlasVector<PIR> d(R,k), x(R,k), bumps(R,k), lumps(R,19);
+	for (size_t i = 0; i <10; ++i) lumps[i] = i;
+	for (size_t i = 10; i <19; ++i) lumps[i] = i-19;
+
+	makeBumps(bumps, 0);
+	makeSNFExample(A,d,bumps,lumps);
+	SmithFormAdaptive::smithForm (x, A);
+	pass = pass and checkSNFExample(d,x);
+
+	makeBumps(bumps, 1);
+	makeSNFExample(A,d,bumps,lumps);
+	SmithFormAdaptive::smithForm (x, A);
+	pass = pass and checkSNFExample(d,x);
+
+	makeBumps(bumps, 2);
+	makeSNFExample(A,d,bumps,lumps);
+	SmithFormAdaptive::smithForm (x, A);
+	pass = pass and checkSNFExample(d,x);
+
+	makeBumps(bumps, 3);
+	makeSNFExample(A,d,bumps,lumps);
+	SmithFormAdaptive::smithForm (x, A);
+	pass = pass and checkSNFExample(d,x);
 
-	typedef Givaro::ZRing<Integer> Ring2; Ring2 S;Ring2::RandIter gen2(S);
-	RandomDenseStream<Ring2> s2 (S, gen2, n, iterations);
-	pass = pass && testRandom(S, sf, s2);
 
 	commentator().stop(MSG_STATUS(pass));
 	return pass ? 0 : -1;
diff --git a/tests/test-smith-form-binary.C b/tests/test-smith-form-binary.C
index b2f75d9..89b4caf 100644
--- a/tests/test-smith-form-binary.C
+++ b/tests/test-smith-form-binary.C
@@ -1,7 +1,6 @@
 /* Copyright (C) LinBox
  *
- *
- *  Author: Zhendong Wan
+ *  Author: Zhendong Wan, mods -bds
  *
  * ========LICENCE========
  * This file is part of the library LinBox.
@@ -25,18 +24,14 @@
 
 /*! @file  tests/test-smith-form-binary.C
  * @ingroup tests
- * @brief no doc.
- * @test no doc.
+ * @brief Test the EGV divide and conquer SNF alg.
  */
 
-
-
 #include "linbox/linbox-config.h"
 
 #ifdef __LINBOX_HAVE_NTL
 #include "linbox/ring/ntl.h"
 #endif
-//#include "linbox/ring/modular.h"
 #include "linbox/randiter/random-prime.h"
 #include "linbox/algorithms/matrix-rank.h"
 #include "linbox/algorithms/last-invariant-factor.h"
@@ -45,197 +40,75 @@
 #include "linbox/blackbox/scompose.h"
 #include "linbox/blackbox/random-matrix.h"
 #include "linbox/algorithms/rational-solver.h"
-#include <time.h>
+//#include <time.h>
 #include <givaro/modular.h>
 
 #include "linbox/util/commentator.h"
-#include "linbox/vector/stream.h"
-#include "test-common.h"
+//#include "linbox/vector/stream.h"
+//#include "test-common.h"
 using namespace LinBox;
 
-template <class Ring, class SmithForm, class Vector>
-bool testRandom(const Ring& R,
-		const SmithForm& SF,
-		LinBox::VectorStream<Vector>& stream1)
-{
-
-    std::ostringstream str;
-
-	str << "Testing Smith Form binary(EGV++):";
-
-        commentator().start (str.str ().c_str (), "testSmithform");//, stream1.m ());
-
-        bool ret = true;
-
-        LinBox::VectorDomain<Ring> VD (R);
-
-	int n = (int) stream1.n();
-	Vector d(R,n), x(R,n);
-
-	// VectorWrapper::ensureDim (d, stream1.n ());
-	// VectorWrapper::ensureDim (x, stream1.n ());
-
-
-
-	 while (stream1) {
-
-                commentator().startIteration ((unsigned)stream1.j ());
-
-		std::ostream &report = commentator().report (Commentator::LEVEL_IMPORTANT, INTERNAL_DESCRIPTION);
-
-                bool iter_passed = true;
-
-                stream1.next (d);
-
-		report << "Input vector:  ";
-		VD.write (report, d);
-                report << endl;
-
-		DenseMatrix<Ring> D(R, n, n), L(R, n, n), U(R, n, n), A(R,n,n);
-
-		int i, j;
-
-		for(i = 0; i < n; ++i) {
-			R. assign (D[(size_t)i][(size_t)i], d[(size_t)i]);
-			R. assign (L[(size_t)i][(size_t)i], R.one);
-			R. assign (U[(size_t)i][(size_t)i], R.one);}
-
-		for (i = 0; i < n; ++ i)
-
-			for (j = 0; j < i; ++ j) {
-
-				R.init(L[(size_t)i][(size_t)j], (uint64_t)(rand() % 10));
-
-				R.init(U[(size_t)j][(size_t)i], (uint64_t)(rand() % 10));
-			}
-
-
-		BlasVector<Ring> tmp1(R,(size_t)n), tmp2(R,(size_t)n), e(R,(size_t)n);
-
-		typename DenseMatrix<Ring>::ColIterator col_p;
-
-		i = 0;
-		for (col_p = A.colBegin();
-		     col_p != A.colEnd(); ++ col_p, ++ i) {
-
-			R.assign (e[(size_t)i],R.one);
-			U.apply(tmp1, e);
-			D.apply(tmp2, tmp1);
-			// LinBox::BlasSubvector<BlasVector<Ring> > col_p_v (R, *col_p);
-			// L.apply(col_p_v, tmp2);
-			L.apply(*col_p, tmp2);
-			R.assign(e[(size_t)i],R.zero);
-		}
-
-
-
-		SF.smithFormBinary (x, A);
-
-
-		report << "Computed Smith form: \n";
-
-		VD. write (report, x);
-
-		report << '\n';
-
-
-		typename BlasVector<Ring>::iterator p1, p2;
-		typename Ring::Element g;
-
-
-		for (p1 = d.begin(); p1 != d.end(); ++ p1) {
-
-			for ( p2 = p1 + 1; p2 != d.end(); ++ p2) {
-
-                                    // CP: changed isUnit to isOne || isMOne as it is only called with ZRing<Integer>.
-				if (R. isOne (*p1) || R. isMOne(*p1))  break;
-
-				else if (R. isZero (*p2)) continue;
-
-				else if (R. isZero (*p1)) {
-                                                std::swap (*p1, *p2);
-				}
-
-				else {
-					R. gcd (g, *p1, *p2);
-
-					R. divin (*p2, g);
-
-					R. mulin (*p2, *p1);
-
-					R. assign (*p1, g);
-				}
-			}
-		}
-		// normalize to positive
-		for (p1 = d.begin(); p1 != d.end(); ++ p1) if (*p1 < 0) R.negin(*p1);
-
-		VD.write (report << "Expected smith form:\n", d) << '\n';
-
-		if (!VD.areEqual (d, x))
-			ret = iter_passed = false;
-
-        if (!iter_passed)
-            commentator().report (Commentator::LEVEL_IMPORTANT, INTERNAL_ERROR)
-				<< "ERROR: Computed Smith form is incorrect" << endl;
-
-        commentator().stop ("done");
-
-        commentator().progress ();
-
-	 }
-
-	 //stream1.reset ();
-
-	  commentator().stop (MSG_STATUS (ret), (const char *) 0, "testSmithform");
-
-	  return ret;
-
-}
+#include "test-smith-form.h"
 
 int main(int argc, char** argv)
 {
-
 	bool pass = true;
 
+	static size_t m =2;
 	static size_t n =5;
 
-	static int iterations = 1;
-
 	static Argument args[] = {
-		{ 'n', "-n N", "Set order of test matrices to N.", TYPE_INT,     &n },
-		{ 'i', "-i I", "Perform each test for I iterations.", TYPE_INT,     &iterations },
+		{ 'm', "-m M", "Set row dimension of test matrices to M.", TYPE_INT,     &m },
+		{ 'n', "-n N", "Set col dimension  of test matrices to N.", TYPE_INT,     &n },
 		END_OF_ARGUMENTS
 	};
 
 	parseArguments (argc, argv, args);
 
 	commentator().start("SmithFormBinary test suite", "SmithFormBinary");
-	std::ostream &report = commentator().report (Commentator::LEVEL_IMPORTANT, INTERNAL_DESCRIPTION);
 
+	commentator().report() << std::endl << "EGV++ algorithm test suite with LinBox/Givaro ZRing:\n";
 	{
 //		typedef Givaro::IntegerDom Ring;
-		typedef Givaro::ZRing<Integer> Ring;
+		typedef Givaro::ZRing<Integer> PIR;
+		PIR R;
 
-		Ring R; Ring::RandIter gen(R);
+		typedef Givaro::Modular<int64_t> Field;
+		typedef RationalSolver<PIR, Field, LinBox::RandomPrimeIterator> Solver;
+		typedef LastInvariantFactor<PIR, Solver> LIF;
+		typedef OneInvariantFactor<PIR, LIF, SCompose, RandomMatrix>  OIF;
+		typedef SmithFormBinary<PIR, OIF, MatrixRank<PIR, Field > > SF;
 
-		report << std::endl << "EGV++ algorithm test suite with LinBox/Givaro PID:\n";
+		SF sf;
+		sf. setOIFThreshold (30);
+		sf. setLIFThreshold (30);
 
-		commentator().getMessageClass (INTERNAL_DESCRIPTION).setMaxDepth (5);
+	size_t k = std::min(m,n);
+	DenseMatrix<PIR> A(R,m,n);
+	BlasVector<PIR> d(R,k), x(R,k), bumps(R,k), lumps(R,19);
+	for (size_t i = 0; i <10; ++i) lumps[i] = i;
+	for (size_t i = 10; i <19; ++i) lumps[i] = i-19;
 
-		RandomDenseStream<Ring> s1 (R, gen, n, (unsigned int) iterations);
+	makeBumps(bumps, 0);
+	makeSNFExample(A,d,bumps,lumps);
+	sf.smithFormBinary (x, A);
+	pass = pass and checkSNFExample(d,x);
 
-		typedef Givaro::Modular<int32_t> Field;
-		typedef RationalSolver<Ring, Field, LinBox::RandomPrimeIterator> Solver;
-		typedef LastInvariantFactor<Ring, Solver> LIF;
-		typedef OneInvariantFactor<Ring, LIF, SCompose, RandomMatrix>  OIF;
-		typedef SmithFormBinary<Ring, OIF, MatrixRank<Ring, Field > > SF;
+	makeBumps(bumps, 1);
+	makeSNFExample(A,d,bumps,lumps);
+	sf.smithFormBinary (x, A);
+	pass = pass and checkSNFExample(d,x);
 
-		SF sf;
-		sf. setOIFThreshold (30);
-		sf. setLIFThreshold (30);
+	makeBumps(bumps, 2);
+	makeSNFExample(A,d,bumps,lumps);
+	sf.smithFormBinary (x, A);
+	pass = pass and checkSNFExample(d,x);
+
+	makeBumps(bumps, 3);
+	makeSNFExample(A,d,bumps,lumps);
+	sf.smithFormBinary (x, A);
+	pass = pass and checkSNFExample(d,x);
 
-		if (!testRandom(R, sf, s1)) pass = false;
 	}
 
 #if 0
diff --git a/tests/test-smith-form-iliopoulos.C b/tests/test-smith-form-iliopoulos.C
index 50314a8..263a9a6 100644
--- a/tests/test-smith-form-iliopoulos.C
+++ b/tests/test-smith-form-iliopoulos.C
@@ -53,6 +53,7 @@
 
 using namespace LinBox;
 
+#if 0
 template <class Ring>
 bool testRead(const Ring& R, string file) {
 	BlasMatrix<Ring> A(R);
@@ -82,6 +83,7 @@ bool testRead(const Ring& R, string file) {
 	SmithFormIliopoulos::smithFormIn (B);
 	return BMD.areEqual(A, B);
 }
+#endif
 
 template <class Ring>
 bool testRandom(const Ring& R, size_t n)
diff --git a/tests/test-smith-form.C b/tests/test-smith-form.C
index a986fe2..2ea29f9 100644
--- a/tests/test-smith-form.C
+++ b/tests/test-smith-form.C
@@ -1,7 +1,7 @@
 /* Copyright (C) LinBox
  *
  *  Author: Zhendong Wan
- *
+ *  Mods: bds
  *
  * ========LICENCE========
  * This file is part of the library LinBox.
@@ -31,194 +31,65 @@
 
 
 #include <linbox/linbox-config.h>
+#include "linbox/solutions/smith-form.h"
 
-#include <time.h>
 #include "givaro/zring.h"
-#include "givaro/givinteger.h"
 #include "linbox/util/commentator.h"
-#include "linbox/vector/stream.h"
-#include "test-common.h"
+#include "linbox/matrix/dense-matrix.h"
 #include "linbox/vector/blas-vector.h"
-#include "linbox/solutions/smith-form.h"
-using LinBox::parseArguments;
-using LinBox::commentator;
-using LinBox::Commentator;
-using Givaro::Integer;
-using Givaro::ZRing;
-using LinBox::BlasMatrix;
-using LinBox::BlasVector;
-
-template <class Ring, class Vector>
-bool testRandom(const Ring& R,
-		LinBox::VectorStream<Vector>& stream1)
-{
-
-	std::ostringstream str;
-
-	str << "Testing the smithForm function in solutions directory:\n";
-
-        commentator().start (str.str ().c_str (), "testRandom");//, stream1.m ());
-
-        bool ret = true;
-
-        LinBox::VectorDomain<Ring> VD (R);
-
-	Vector d(R), x(R);
-
-	LinBox::VectorWrapper::ensureDim (d, stream1.n ());
-
-	LinBox::VectorWrapper::ensureDim (x, stream1.n ());
-
-
-	int n = (int)d. size();
-
-	 while (stream1) {
-
-                commentator().startIteration ((unsigned)stream1.j ());
-
-		std::ostream &report = commentator().report (Commentator::LEVEL_IMPORTANT, INTERNAL_DESCRIPTION);
-
-                bool iter_passed = true;
-
-                stream1.next (d);
-
-		report << "Input vector:  ";
-		VD.write (report, d);
-                report << endl;
-
-		BlasMatrix<Ring> D(R, (size_t)n, (size_t)n), L(R, (size_t)n, (size_t)n), U(R, (size_t)n, (size_t)n), A(R,(size_t)n,(size_t)n);
-
-		int i, j;
-
-		for(i = 0; i < n; ++i) {
-			R. assign (D[(size_t)i][(size_t)i], d[(size_t)i]);
-			R. assign(L[(size_t)i][(size_t)i], R.one);
-			R. assign(U[(size_t)i][(size_t)i], R.one);}
-
-		for (i = 0; i < n; ++ i)
-
-			for (j = 0; j < i; ++ j) {
-
-				R.init(L[(size_t)i][(size_t)j], rand() % 10);
-
-				R.init(U[(size_t)j][(size_t)i], rand() % 10);
-			}
+using namespace LinBox;
 
-
-		BlasVector<Ring> tmp1(R,(size_t)n), tmp2(R,(size_t)n), e(R,(size_t)n);
-
-		typename BlasMatrix<Ring>::ColIterator col_p;
-
-		i = 0;
-		for (col_p = A.colBegin();
-		     col_p != A.colEnd(); ++ col_p, ++ i) {
-
-			R.assign(e[(size_t)i],R.one);
-			U.apply(tmp1, e);
-			D.apply(tmp2, tmp1);
-			// LinBox::BlasSubvector<BlasVector<Ring> > col_p_v (R, *col_p);
-			// L.apply(col_p_v, tmp2);
-			L.apply(*col_p, tmp2); //! @internal @bug  should use Triangular apply ? We are doing this many times, factor somewhere in test-utils.h ? why not some ftrtr routine for that ?
-			R.assign(e[(size_t)i],R.zero);
-		}
-
-		typename Vector::iterator x_p;
-		Givaro::ZRing<Integer> Z;
-		BlasVector<Givaro::ZRing<Integer> > xi(Z,A. rowdim());
-		BlasVector<Givaro::ZRing<Integer> >::iterator xi_p;
-		std::list<std::pair<Integer, size_t> > cpt;
-		smithForm (cpt, A);
-		std::list<std::pair<Integer, size_t> >::iterator cpt_p;
-
-		xi_p = xi. begin();
-		for (cpt_p = cpt.begin(); cpt_p != cpt.end(); ++ cpt_p) {
-			for (size_t ii = 0; ii < cpt_p -> second; ++ ii, ++ xi_p)
-				*xi_p = cpt_p -> first;
-		}
-
-		for (x_p = x. begin(), xi_p = xi. begin(); x_p != x. end(); ++ x_p, ++ xi_p)
-			A. field (). init (*x_p, *xi_p);
-
-		report << "Computed Smith form: \n";
-		VD. write (report, x);
-
-		report << '\n';
-
-		typename BlasVector<Ring>::iterator p1, p2;
-		typename Ring::Element g;
-
-		for (p1 = d.begin(); p1 != d.end(); ++ p1) {
-			for ( p2 = p1 + 1; p2 != d.end(); ++ p2) {
-				if (R. isUnit(*p1))  break;
-				else if (R. isZero (*p2)) continue;
-				else if (R. isZero (*p1)) std::swap (*p1, *p2);
-				else { // (*p1, *p2) <-- (g, *p1 * *p2 / g), where g = gcd(*p1, *p2)
-					R. gcd (g, *p1, *p2);
-					R. divin (*p2, g);
-					R. mulin (*p2, *p1);
-					R. assign (*p1, g);
-				}
-			}
-		}
-
-		report << "Expected smith form:\n";
-		VD.write (report, d) << endl;
-
-		if (!VD.areEqual (d, x))
-			ret = iter_passed = false;
-
-		if (!iter_passed)
-			commentator().report (Commentator::LEVEL_IMPORTANT, INTERNAL_ERROR)
-				<< "ERROR: Computed Smith form is incorrect" << endl;
-
-		commentator().stop ("done");
-		commentator().progress ();
-	 }
-
-	//stream1.reset ();
-
-	commentator().stop (MSG_STATUS (ret), (const char *) 0, "testRandom");
-
-	return ret;
-}
+#include "test-smith-form.h"
 
 int main(int argc, char** argv)
 {
 
 	bool pass = true;
-	static size_t n =3;
+	static size_t m =30; // default row dimension, see -m
+	static size_t n =20; // default column dimension, see -n
 	static int iterations = 2;
 	static Argument args[] = {
-		{ 'n', "-n N", "Set order of test matrices to N.", TYPE_INT,  &n },
+		{ 'm', "-m M", "Set row dim of test matrices to M.", TYPE_INT,  &m },
+		{ 'n', "-n N", "Set col dim of test matrices to N.", TYPE_INT,  &n },
 		{ 'i', "-i I", "Perform each test for I iterations.", TYPE_INT, &iterations },
 		END_OF_ARGUMENTS
 	};
 
 	parseArguments (argc, argv, args);
 	//!@bug should be tried on NTZ_LL too
-	typedef Givaro::ZRing<Integer>      Ring;
-
-	Ring R; Ring::RandIter gen(R);
-    
-
-	commentator().start("Smith form test suite", "Smith");
-	commentator().getMessageClass (INTERNAL_DESCRIPTION).setMaxDepth (5);
-
-	LinBox::RandomDenseStream<Ring> s1 (R, gen, n, (unsigned int)iterations);
-	if (!testRandom(R, s1)) pass = false;
-
-#if 0
-#ifdef __LINBOX_HAVE_NTL
-	typedef LinBox::NTL_ZZ      Ring2; 
-	Ring2 R2;Ring2::RandIter gen2(R2);
-
-	LinBox::RandomDenseStream<Ring2> s2 (R2, gen2, n, (unsigned int)iterations);
-	if (!testRandom(R2, s2)) pass = false;
-
-#endif
-#endif
-
-	commentator().stop("Smith form test suite");
+	typedef Givaro::ZRing<Integer> PIR;
+	PIR R;
+
+	commentator().start("Smith form test", "Smith");
+
+	size_t k = std::min(m,n); // d, x and bumps each hold min(m,n) entries
+	DenseMatrix<PIR> A(R,m,n);
+	BlasVector<PIR> d(R,k), x(R,k), bumps(R,k), lumps(R,19);
+	for (size_t i = 0; i <10; ++i) lumps[i] = i; // lumps 0..9
+	for (int i = 10; i <19; ++i) lumps[(size_t)i] = i-19; // lumps -9..-1; a size_t i would wrap i-19
+
+	makeBumps(bumps, 0); // choice 0: all-zero bumps
+	makeSNFExample(A,d,bumps,lumps);
+	smithForm (x, A);
+	pass = pass and checkSNFExample(d,x);
+
+	makeBumps(bumps, 1); // choice 1: all-one bumps
+	makeSNFExample(A,d,bumps,lumps);
+	smithForm (x, A);
+	pass = pass and checkSNFExample(d,x);
+
+	makeBumps(bumps, 2); // choice 2: every bump is 2
+	makeSNFExample(A,d,bumps,lumps);
+	smithForm (x, A);
+	//SmithFormAdaptive::compute_local_long(x, A, 2, 64);
+	pass = pass and checkSNFExample(d,x);
+
+	makeBumps(bumps, 3); // choice 3: random small bumps, then 202 and 0
+	makeSNFExample(A,d,bumps,lumps);
+	smithForm (x, A);
+	pass = pass and checkSNFExample(d,x);
+
+	commentator().stop("Smith form test");
 	return pass ? 0 : -1;
 
 }
@@ -230,4 +101,3 @@ int main(int argc, char** argv)
 // c-basic-offset: 8
 // End:
 // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
-
diff --git a/tests/test-smith-form.h b/tests/test-smith-form.h
new file mode 100644
index 0000000..73d7e5b
--- /dev/null
+++ b/tests/test-smith-form.h
@@ -0,0 +1,166 @@
+/* Copyright (C) LinBox
+ *
+ *  Author: bds
+ *
+ * ========LICENCE========
+ * This file is part of the library LinBox.
+ *
+ * LinBox is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ */
+
+/*! @file tests/test-smith-form.h
+ * @ingroup tests
+ * @brief tools for making matrix with known SNF.
+ */
+
+#ifndef __TEST_SMITH_FORM_H
+#define __TEST_SMITH_FORM_H
+#include <linbox/linbox-config.h>
+
+#include "linbox/util/commentator.h"
+#include "linbox/matrix/dense-matrix.h"
+#include "linbox/vector/blas-vector.h"
+using std::endl;
+using namespace LinBox;
+
+// Fill b with a pattern of "bumps" selected by choice and return b:
+//   0: all zero, 1: all one, 2: all two, 3: random 1/2/3/9 (mostly 1) then 202, 0;
+//   any other choice leaves the entries of b as they were.
+// Afterwards rand()%4 randomly picked entries are negated.  An empty b is returned untouched.
+template <class PIR> // This is for PIR = Z or Z_n
+BlasVector<PIR> & makeBumps(BlasVector<PIR> & b, int choice) {
+	const PIR & R = b.field();
+	typename PIR::Element two, three, nine, x;
+	R.init(two,2); R.init(three,3); R.init(nine,9); R.init(x,202);
+	const size_t n = b.size();
+	if (n == 0) return b; // nothing to fill, and rand()%n below needs n > 0
+	switch (choice) {
+		case 0: // all zero
+				for(size_t i = 0; i < n; ++i) b.setEntry(i,R.zero);
+					break;
+		case 1: // all one
+				for(size_t i = 0; i < n; ++i) b.setEntry(i,R.one);
+					break;
+		case 2: // all two, so the prefix products are powers of 2
+				for(size_t i = 0; i < n; ++i) b.setEntry(i,two);
+					break;
+		case 3: // random followed by 202,0.  Random part is largely 1's.
+				for(size_t i = 0; i + 2 < n; ++i) { // i + 2 < n: n-2 would wrap for n < 2
+					size_t r = rand()%20;
+					if (r < 17) b.setEntry(i,R.one);
+					if (r == 17) b.setEntry(i,two);
+					if (r == 18) b.setEntry(i,three);
+					if (r == 19) b.setEntry(i,nine);
+				}
+				if (n >= 2) b.setEntry(n-2,x); // no room for the 202 when n == 1
+				b.setEntry(n-1,R.zero);
+	}
+	// negate a few (x is reused as scratch; a repeated pick flips the sign back)
+	for (size_t k = rand()%4; k > 0; --k){
+		size_t i = rand()%n;
+		b.getEntry(x,i);
+		b.setEntry(i,R.negin(x));
+	}
+
+	return b;
+}
+
+// For any PIR, build a sequence of smith invariants d from "bumps" b:
+// d[0] = b[0] and d[i] = d[i-1]*b[i], so each entry divides the next.  Returns d.
+// Reads b[0..d.size()-1]; an empty d is returned untouched.
+template <class PIR>
+BlasVector<PIR> & prefixProduct (BlasVector<PIR> & d, const BlasVector<PIR> & b) {
+	const PIR& R = d.field();
+	if (d.size() == 0) return d; // there is no entry 0 to seed the product
+	typename PIR::Element x,y; R.init(x); R.init(y);
+	d.setEntry(0,b.getEntry(x,0)); // from here on x is the running product d[i-1]
+	for (size_t i = 1; i < d.size(); ++i)
+		d.setEntry(i,R.mulin(x, b.getEntry(y,i)));
+	return d;
+}
+
+// Generate A with snf = diag(d) (up to sign), based on the bumps.
+// Think of bumps[i] as s_i/s_{i-1}, quotient of smith invariants.
+// The lumps are used for off diagonal entries in L,U (triangular scramblers);
+// with an empty lumps vector L and U stay identity matrices.
+// d is overwritten: first with the prefix products of bumps,
+// finally with R.abs of each of them.
+// A.setEntry(i,i,..) is done for every i < d.size(), so d should be no longer
+// than min(A.rowdim(), A.coldim()).
+template <class PIR>
+void makeSNFExample(DenseMatrix<PIR>& A, 
+					BlasVector<PIR> & d, 
+			  const BlasVector<PIR> & bumps,
+			  const BlasVector<PIR> & lumps) {
+	prefixProduct(d, bumps);
+
+	// make A = UDL for random unimodular L,U; start from A = D = diag(d)
+	const PIR & R = A.field();
+	typename PIR::Element x; R.init(x);
+	A.zero();
+	for (size_t i = 0; i < d.size(); ++i) A.setEntry(i,i,d.getEntry(x,i));
+
+	// L: unit lower triangular, U: unit upper triangular, lumps off the diagonal
+	DenseMatrix<PIR> L(R, A.coldim(), A.coldim()), 
+					U(R, A.rowdim(), A.rowdim());
+	const size_t k = lumps.size(); // k == 0 would make rand()%k undefined
+	L.zero();
+	for (size_t i = 0; i < L.rowdim(); ++i) L.setEntry(i,i,R.one);
+	for (size_t i = 0; k > 0 && i < L.rowdim(); ++ i)
+		for (size_t j = 0; j < i; ++ j) L.setEntry(i,j,lumps[rand()%k]);
+
+	U.zero();
+	for (size_t i = 0; i < U.rowdim(); ++i) U.setEntry(i,i,R.one);
+	for (size_t i = 0; k > 0 && i < U.rowdim(); ++ i)
+		for (size_t j = i+1; j < U.coldim(); ++ j) U.setEntry(i,j,lumps[rand()%k]);
+
+	// A <- UAL
+	BlasMatrixDomain<PIR> MD(R);
+	MD.mulin_left(A,L); 
+	MD.mulin_right(U,A); 
+
+	for (size_t i = 0; i < d.size(); ++ i)
+		d.setEntry(i,R.abs(x, d.getEntry(x,i)));
+	// Now A is matrix equivalent to diag prefix product of bumps.
+	// Now d is SNF diagonal (vector of invariants) for A.
+}
+
+// Write the computed vector x and the expected vector d to the commentator
+// report stream; return whether VectorDomain::areEqual holds for them.
+template <class PIR>
+bool checkSNFExample( const BlasVector<PIR>& d, const BlasVector<PIR>& x ){
+	VectorDomain<PIR> domain(d.field());
+	std::ostream & out = commentator().report();
+
+	out << "Computed Smith form:" << endl;
+	domain.write (out, x) << endl;
+	out << "Expected smith form:" << endl;
+	domain.write (out, d) << endl;
+
+	const bool same = domain.areEqual (d, x);
+	if (not same)
+		out << "ERROR: Computed not as Expected" << endl;
+	return same;
+}
+#endif // __TEST_SMITH_FORM_H
+
+// Local Variables:
+// mode: C++
+// tab-width: 8
+// indent-tabs-mode: nil
+// c-basic-offset: 8
+// End:
+// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/linbox.git