[fflas-ffpack] 01/02: Imported Upstream version 2.2.2

Doug Torrance dtorrance-guest at moszumanska.debian.org
Wed Aug 10 03:47:07 UTC 2016


This is an automated email from the git hooks/post-receive script.

dtorrance-guest pushed a commit to branch master
in repository fflas-ffpack.

commit b9a319f5bd8619c950d81767b7aa6e8d72748de5
Author: Doug Torrance <dtorrance at piedmont.edu>
Date:   Tue Aug 9 23:46:41 2016 -0400

    Imported Upstream version 2.2.2
---
 .gitignore                                         |   71 +-
 AUTHORS                                            |    1 +
 ChangeLog                                          |    8 +
 Makefile.am                                        |    2 +-
 README                                             |   30 -
 README.md                                          |   97 ++
 autogen.sh                                         |    6 +-
 benchmarks/Makefile.am                             |    3 +-
 benchmarks/benchmark-charpoly.C                    |    4 +-
 benchmarks/benchmark-checkers.C                    |  306 +++++
 benchmarks/benchmark-dgemm.C                       |   18 +-
 benchmarks/benchmark-dgetrf.C                      |    2 +-
 benchmarks/benchmark-dgetri.C                      |    2 +-
 benchmarks/benchmark-dtrsm.C                       |    2 +-
 benchmarks/benchmark-dtrtri.C                      |    2 +-
 benchmarks/benchmark-fgemm-mp.C                    |    8 +-
 benchmarks/benchmark-fgemm.C                       |    6 +-
 benchmarks/benchmark-fgemv-mp.C                    |    7 +-
 benchmarks/benchmark-ftrsm.C                       |    6 +-
 benchmarks/benchmark-ftrtri.C                      |    2 +-
 benchmarks/benchmark-inverse.C                     |    4 +-
 benchmarks/benchmark-lqup.C                        |    2 +-
 benchmarks/benchmark-pluq.C                        |   31 +-
 benchmarks/perfpublisher.sh                        |   26 +-
 configure.ac                                       |   77 +-
 examples/{2x2-fgemm.C => 101-fgemm.C}              |    0
 examples/2x2-fgemm.C                               |   42 +-
 examples/2x2-pluq.C                                |   63 ++
 examples/Makefile.am                               |    5 +-
 examples/pluq.C                                    |   63 ++
 fflas-ffpack-config.in                             |    6 +-
 fflas-ffpack.pc.in                                 |    8 +-
 fflas-ffpack/Makefile.am                           |    2 +-
 fflas-ffpack/{field => checkers}/Makefile.am       |   30 +-
 fflas-ffpack/checkers/checker_charpoly.inl         |  165 +++
 .../simd128.inl => checkers/checker_empty.h}       |   37 +-
 fflas-ffpack/checkers/checker_fgemm.inl            |  100 ++
 fflas-ffpack/checkers/checker_ftrsm.inl            |  119 ++
 fflas-ffpack/checkers/checker_invert.inl           |   83 ++
 fflas-ffpack/checkers/checker_pluq.inl             |  147 +++
 .../{Makefile.am => checkers/checkers.doxy}        |   19 +-
 fflas-ffpack/checkers/checkers_fflas.h             |   79 ++
 .../simd128.inl => checkers/checkers_fflas.inl}    |   35 +-
 fflas-ffpack/checkers/checkers_ffpack.h            |   89 ++
 .../simd128.inl => checkers/checkers_ffpack.inl}   |   37 +-
 fflas-ffpack/fflas-ffpack-config.h                 |   22 +-
 fflas-ffpack/fflas/fflas.h                         |   12 +
 fflas-ffpack/fflas/fflas_fadd.h                    |    4 +-
 fflas-ffpack/fflas/fflas_fadd.inl                  |    4 +-
 fflas-ffpack/fflas/fflas_fgemm.inl                 |   79 +-
 fflas-ffpack/fflas/fflas_fgemm/fgemm_classical.inl |    4 +-
 .../fflas/fflas_fgemm/fgemm_classical_mp.inl       |  168 +--
 fflas-ffpack/fflas/fflas_fgemv.inl                 |    4 +-
 fflas-ffpack/fflas/fflas_freduce.h                 |    4 +-
 fflas-ffpack/fflas/fflas_freduce.inl               |   10 +-
 fflas-ffpack/fflas/fflas_fscal.inl                 |    4 +-
 fflas-ffpack/fflas/fflas_ftrmm.inl                 |    2 +-
 fflas-ffpack/fflas/fflas_ftrmm_src.inl             |    6 +-
 fflas-ffpack/fflas/fflas_ftrsm.inl                 |    2 +
 fflas-ffpack/fflas/fflas_ftrsm_mp.inl              |    3 +
 fflas-ffpack/fflas/fflas_ftrsv.inl                 |    1 -
 fflas-ffpack/fflas/fflas_helpers.inl               |   28 +-
 fflas-ffpack/fflas/fflas_igemm/igemm.h             |    4 +-
 fflas-ffpack/fflas/fflas_igemm/igemm_kernels.inl   |    6 +-
 fflas-ffpack/fflas/fflas_level1.inl                |    1 +
 fflas-ffpack/fflas/fflas_level3.inl                |    2 +-
 fflas-ffpack/fflas/fflas_pfgemm.inl                |    2 +-
 fflas-ffpack/fflas/fflas_simd.h                    |  333 +++---
 fflas-ffpack/fflas/fflas_simd/simd128.inl          |   75 +-
 fflas-ffpack/fflas/fflas_simd/simd128_double.inl   |  668 ++++++-----
 fflas-ffpack/fflas/fflas_simd/simd128_float.inl    |  704 ++++++------
 fflas-ffpack/fflas/fflas_simd/simd128_int16.inl    |  924 ++++++++-------
 fflas-ffpack/fflas/fflas_simd/simd128_int32.inl    | 1007 ++++++++++-------
 fflas-ffpack/fflas/fflas_simd/simd128_int64.inl    | 1110 ++++++++++--------
 fflas-ffpack/fflas/fflas_simd/simd256.inl          |  151 ++-
 fflas-ffpack/fflas/fflas_simd/simd256_double.inl   |  740 ++++++------
 fflas-ffpack/fflas/fflas_simd/simd256_float.inl    |  779 +++++++------
 fflas-ffpack/fflas/fflas_simd/simd256_int16.inl    | 1130 +++++++++++--------
 fflas-ffpack/fflas/fflas_simd/simd256_int32.inl    | 1110 ++++++++++--------
 fflas-ffpack/fflas/fflas_simd/simd256_int64.inl    | 1173 ++++++++++++--------
 fflas-ffpack/fflas/fflas_simd/simd_modular.inl     |  228 ++--
 fflas-ffpack/fflas/fflas_sparse.h                  |    2 +-
 fflas-ffpack/fflas/fflas_sparse.inl                |   24 +-
 fflas-ffpack/fflas/fflas_sparse/coo/coo_spmm.inl   |    6 +-
 fflas-ffpack/fflas/fflas_sparse/csr/csr_pspmm.inl  |    8 +-
 fflas-ffpack/fflas/fflas_sparse/csr/csr_spmm.inl   |    8 +-
 fflas-ffpack/fflas/fflas_sparse/csr/csr_utils.inl  |    0
 .../fflas/fflas_sparse/csr_hyb/csr_hyb_pspmm.inl   |    2 +-
 .../fflas/fflas_sparse/csr_hyb/csr_hyb_spmm.inl    |    4 +-
 fflas-ffpack/fflas/fflas_sparse/ell/ell_pspmm.inl  |    4 +-
 fflas-ffpack/fflas/fflas_sparse/ell/ell_spmm.inl   |    8 +-
 .../fflas/fflas_sparse/ell_simd/ell_simd_pspmv.inl |    6 +-
 .../fflas/fflas_sparse/ell_simd/ell_simd_spmv.inl  |    6 +-
 .../fflas/fflas_sparse/ell_simd/ell_simd_utils.inl |    4 +-
 fflas-ffpack/fflas/fflas_sparse/hyb_zo.h           |    0
 .../fflas/fflas_sparse/hyb_zo/hyb_zo_pspmm.inl     |    4 +-
 .../fflas/fflas_sparse/hyb_zo/hyb_zo_spmm.inl      |    4 +-
 .../fflas/fflas_sparse/hyb_zo/hyb_zo_utils.inl     |    0
 .../fflas/fflas_sparse/sell/sell_pspmv.inl         |    6 +-
 fflas-ffpack/fflas/fflas_sparse/sell/sell_spmv.inl |    6 +-
 .../fflas/fflas_sparse/sell/sell_utils.inl         |    2 +-
 .../fflas/fflas_sparse/sparse_matrix_traits.h      |    4 +-
 fflas-ffpack/ffpack/ffpack.h                       |   72 +-
 fflas-ffpack/ffpack/ffpack_charpoly.inl            |    5 +-
 fflas-ffpack/ffpack/ffpack_invert.inl              |   17 +-
 fflas-ffpack/ffpack/ffpack_ludivine.inl            |    4 +-
 fflas-ffpack/ffpack/ffpack_permutation.inl         |  296 ++++-
 fflas-ffpack/ffpack/ffpack_pluq.inl                |   60 +-
 fflas-ffpack/ffpack/ffpack_ppluq.inl               |    1 +
 fflas-ffpack/field/Makefile.am                     |    4 +-
 fflas-ffpack/field/field-traits.h                  |   14 +-
 fflas-ffpack/field/modular-extended.h              |  333 ------
 fflas-ffpack/field/rns-double-recint.inl           |  315 ++++++
 fflas-ffpack/field/rns-double.h                    |   60 +-
 fflas-ffpack/field/rns-double.inl                  |    4 +-
 fflas-ffpack/field/rns-integer-mod.h               |    5 +-
 fflas-ffpack/interfaces/libs/fflas_L1_inst.C       |    0
 fflas-ffpack/interfaces/libs/fflas_L2_inst.C       |    0
 fflas-ffpack/interfaces/libs/fflas_L3_inst.C       |    0
 .../interfaces/libs/fflas_L3_inst_implem.inl       |    2 +-
 fflas-ffpack/interfaces/libs/ffpack_inst.C         |    0
 fflas-ffpack/paladin/blockcuts.inl                 |    1 +
 fflas-ffpack/paladin/fflas_pfinit.h                |    0
 fflas-ffpack/paladin/parallel.h                    |    0
 fflas-ffpack/utils/Matio.h                         |   31 +-
 fflas-ffpack/utils/align-allocator.h               |    2 +-
 fflas-ffpack/utils/bit_manipulation.h              |    9 +-
 fflas-ffpack/utils/fflas_memory.h                  |    2 +-
 macros/avx-check.m4                                |   19 +-
 macros/ax_check_x86_features.m4                    |   77 ++
 macros/ax_gcc_x86_cpu_supports.m4                  |  104 ++
 macros/givaro-check.m4                             |    2 +-
 macros/simd-check.m4                               |  137 +++
 macros/sse2-check.m4                               |   23 +-
 tests/Makefile.am                                  |   39 +-
 tests/jenkins-maker.sh                             |  103 ++
 tests/perfpublisher.sh                             |   27 +-
 tests/test-charpoly-check.C                        |  106 ++
 tests/test-charpoly.C                              |    6 +-
 tests/test-fgemm-check.C                           |  102 ++
 tests/test-fgemm.C                                 |   21 +-
 tests/test-fger.C                                  |    2 +-
 tests/test-ftrsm-check.C                           |  110 ++
 tests/test-ftrsm.C                                 |    3 +
 tests/test-interfaces-c.c                          |   26 +
 tests/test-invert-check.C                          |   93 ++
 tests/test-invert.C                                |  171 +--
 tests/test-lu.C                                    |   61 +-
 tests/test-maxdelayeddim.C                         |   86 ++
 tests/test-permutations.C                          |  118 ++
 tests/test-pluq-check.C                            |  104 ++
 tests/test-pluq.C                                  |    1 +
 tests/test-simd.C                                  |  465 ++++----
 153 files changed, 10216 insertions(+), 5491 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1eff7bf..30e0e7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,9 +6,18 @@ autom4te.cache
 benchmarks/Makefile
 benchmarks/Makefile.in
 build-aux
+*.o
+*.lo
+*.la
+*~
+*.libs
+*.trs
+*log
+*status
+*cache
+*aux
 config.h
 config.h.in
-config.log
 config.status
 configure
 doc/Makefile
@@ -66,7 +75,6 @@ macros/ltoptions.m4
 macros/ltsugar.m4
 macros/ltversion.m4
 macros/lt~obsolete.m4
-optim.log
 optimiser/Makefile
 optimiser/Makefile.in
 stamp-h1
@@ -75,69 +83,26 @@ tests/Makefile.in
 tests/data/Makefile
 tests/data/Makefile.in
 benchmarks/benchmark-fgemm
-benchmarks/benchmark-fgemm.o
 benchmarks/benchmark-pluq
-benchmarks/benchmark-pluq.o
 tests/regression-check
-tests/regression-check.log
-tests/regression-check.o
-tests/regression-check.trs
 tests/test-compressQ
-tests/test-compressQ.log
-tests/test-compressQ.o
-tests/test-compressQ.trs
 tests/test-det
-tests/test-det.log
-tests/test-det.o
-tests/test-det.trs
 tests/test-echelon
-tests/test-echelon.o
 tests/test-fadd
-tests/test-fadd.log
-tests/test-fadd.o
-tests/test-fadd.trs
 tests/test-fgemm
-tests/test-fgemm.log
-tests/test-fgemm.o
-tests/test-fgemm.trs
 tests/test-fger
-tests/test-fger.log
-tests/test-fger.o
-tests/test-fger.trs
 tests/test-finit
-tests/test-finit.log
-tests/test-finit.o
-tests/test-finit.trs
 tests/test-fscal
-tests/test-fscal.log
-tests/test-fscal.o
-tests/test-fscal.trs
 tests/test-ftrsm
-tests/test-ftrsm.o
 tests/test-lu
-tests/test-lu.o
 tests/test-multifile
-tests/test-multifile.log
-tests/test-multifile.trs
-tests/test-multifile1.o
-tests/test-multifile2.o
 tests/test-rankprofiles
-tests/test-rankprofiles.log
-tests/test-rankprofiles.o
-tests/test-rankprofiles.trs
-benchmarks/benchmark-charpoly.o
-benchmarks/benchmark-dgemm.o
-benchmarks/benchmark-dgetrf.o
-benchmarks/benchmark-dgetri.o
-benchmarks/benchmark-dtrsm.o
-benchmarks/benchmark-dtrtri.o
-benchmarks/benchmark-fgemm-mp.o
+tests/test-bini-p
+tests/test-charpoly-check
+tests/test-fgemm-check
+tests/test-ftrsm-check
+tests/test-invert-check
+tests/test-permutations
+tests/test-pluq-check
+tests/test-simd
 benchmarks/benchmark-ftrsm
-benchmarks/benchmark-ftrsm-mp.o
-benchmarks/benchmark-ftrsm.o
-benchmarks/benchmark-ftrtri.o
-benchmarks/benchmark-inverse.o
-benchmarks/benchmark-lqup-mp.o
-benchmarks/benchmark-lqup.o
-benchmarks/benchmark-wino.o
-benchmarks/benchmark_sgemm-benchmark-dgemm.o
diff --git a/AUTHORS b/AUTHORS
index fd87c66..da82aad 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -4,6 +4,7 @@ Alexis Breust <alexis.breust at imag.fr>
 Jean-Guillaume Dumas <jean-guillaume.dumas at imag.fr>
 Pascal Giorgi <pascal.giorgi at lirmm.fr>
 Gavin Harisson
+Ashley Lesdalons
 Clément Pernet <clement.pernet at imag.fr>
 Ziad Sultan <ziad.sultan at imag.fr>
 Bastien Vialla <bastien.vialla at lirmm.fr>
diff --git a/ChangeLog b/ChangeLog
index 4f35a1b..19f215b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2016-07-30 v2.2.2
+	* many bug fixes ensuring consistent support of clang, gcc 4.8/5.3/6.1 and
+	icpc on i386 and x86_64, Ubuntu and Fedora, ppcle and OS X
+	* new SIMD detection
+	* use pkg-config
+	* new feature: checkers for Freivalds-based verification
+	* improved performance of permutation application
+	* 
 2016-04-08 v2.2.1
 	* many fixes to the build system
 	* more consistent use of flags and dependency to precompiled code
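
For context on the "checkers" entry above: Freivalds-style verification certifies a computed result without recomputing it, by comparing random projections, so a matrix product can be checked in O(n^2) field operations instead of the O(n^3) needed to redo it. The following is a minimal, self-contained sketch of the idea in plain C++; it is not the library's implementation (the actual checker classes, e.g. FFLAS::ForceCheck_fgemm, appear in benchmarks/benchmark-checkers.C later in this patch), and it assumes the modulus p is small enough that p*p fits in a 64-bit word.

```
// Sketch only: check whether C == A*B over Z/pZ without recomputing the product.
// A, B, C are n x n row-major matrices with entries in [0, p).
// Cost: three matrix-vector products, i.e. O(n^2) operations.
#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

bool freivalds_check(const std::vector<uint64_t>& A,
                     const std::vector<uint64_t>& B,
                     const std::vector<uint64_t>& C,
                     std::size_t n, uint64_t p, std::mt19937_64& gen)
{
    std::uniform_int_distribution<uint64_t> dist(0, p - 1);
    std::vector<uint64_t> v(n), Bv(n, 0), ABv(n, 0), Cv(n, 0);
    for (std::size_t i = 0; i < n; ++i) v[i] = dist(gen);  // random projection vector

    for (std::size_t i = 0; i < n; ++i)                    // Bv = B*v  (mod p)
        for (std::size_t j = 0; j < n; ++j)
            Bv[i] = (Bv[i] + B[i*n + j] * v[j]) % p;
    for (std::size_t i = 0; i < n; ++i)                    // ABv = A*(B*v), Cv = C*v
        for (std::size_t j = 0; j < n; ++j) {
            ABv[i] = (ABv[i] + A[i*n + j] * Bv[j]) % p;
            Cv[i]  = (Cv[i]  + C[i*n + j] * v[j]) % p;
        }
    return ABv == Cv;  // a wrong C passes with probability at most 1/p
}
```

Repeating the check with independent random vectors drives the failure probability down geometrically.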
diff --git a/Makefile.am b/Makefile.am
index 81653d2..82e11b4 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -77,7 +77,7 @@ bin_SCRIPTS=fflas-ffpack-config
 git:
 	git commit -a; git pull; git push
 
-VERSION=2.2.1
+VERSION=2.2.2
 
 #  EXTRA_DIST=incremente-versions
 
diff --git a/README b/README
deleted file mode 100644
index bd3e0a6..0000000
--- a/README
+++ /dev/null
@@ -1,30 +0,0 @@
-  ******  FFLAS-FFPACK : Finite Field Linear Algebra Subroutines/Package ******  
-
-Version 2.2.1
-
-PURPOSE:
-
-The FFLAS-FFPACK library provides a set of basic routines for dense and some sparse linear algebra over a finite field or the ring of integers.
-
-INSTALLATION:
-
-see INSTALL
-
-AVAILABILITY: from https://github.com/linbox-team/fflas-ffpack
-
-REQUIREMENTS:
- * A BLAS library: for ex. OpenBLAS or ATLAS
- * Givaro version at least 4.0.1 (https://github.com/linbox-team/givaro)
-
-This library requires the GNU C++ compiler (gcc-4.7 or newer) or any 
-compiler supporting advanced template features.
-
-
-==========================================================
-The FFLAS-FFPACK website is http://linbox-team.github.io/fflas-ffpack/
-
-Please address your bug reports, suggestions and comments to 
-the discussion group http://groups.google.com/group/ffpack-devel
- 
-Last update : March 2016
- 
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..deee8c3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,97 @@
+# FFLAS-FFPACK: Finite Field Linear Algebra Subroutines/Package
+
+[![Build Status](https://ci.inria.fr/linbox/buildStatus/icon?job=FFLAS-FFPACK)](https://ci.inria.fr/linbox/view/LinBox%20ecosystem/job/FFLAS-FFPACK/)
+
+## PURPOSE
+
+The FFLAS-FFPACK library provides a set of basic routines for linear algebra over a finite field or the ring of integers with dense and sparse matrices.
+
+It is inspired by the BLAS interface (Basic Linear Algebra Subprograms) and the LAPACK library for numerical linear algebra, and shares part of their design. Yet it differs in many aspects due to the specificities of computing over exact domains such as finite fields and the field of rationals:
+- it is generic with respect to the finite field, so as to accommodate a large variety of field sizes and implementations;
+- consequently, all routines rely on C++ template genericity, and the library is primarily meant to be used as a source code library, to be included and compiled in the user's software;
+- however, we also provide a compiled version instantiating the most common routines over the most common finite fields.
+
+## LICENSE
+
+FFLAS-FFPACK is distributed under the terms of the GNU LGPL v2.1 or later (see LICENSE).
+
+## REQUIREMENTS:
+- a C++ compiler supporting the C++11 standard. This means g++ v4.7 or greater, clang++ v3.4 or greater, or icpc v16 or greater (earlier versions of clang and icpc might also work but have not been tested)
+- a BLAS library conforming to either the C or Fortran BLAS standard: OpenBLAS (recommended) or ATLAS. Make sure to use a single-threaded version of the BLAS library.
+- [Givaro](https://github.com/linbox-team/givaro) version at least 4.0.1, providing the implementations of the coefficient fields/rings. 
+
+## INSTALLATION
+
+In brief:
+```./configure <options> && make && make install```
+
+The most commonly used options include:
+- `--with-blas-libs=<libs>` : to specify the arguments for the linker to find the BLAS
+- `--enable-optimization` : to run configure-time optimizations
+
+Type `./configure --help` to list all options available.
+Note that `givaro` is automatically detected by pkg-config, so you no longer need to pass a `--with-givaro=...` option.
+You may need to set the `PKG_CONFIG_PATH` environment variable to `<givaro-prefix>/lib/pkgconfig` if Givaro is installed in a non-standard directory.
+
+For example on a x86_64 architecture:
+- Using OpenBLAS in Fedora: 
+ - install the package `openblas-devel.x86_64`,
+ - run `./configure --enable-optimization --with-blas-libs="-lopenblas"`
+- Using OpenBLAS in Debian, Ubuntu, Mint, and all Debian-based distributions:
+ - avoid using the distribution's package, as it is threaded by default. You need to
+   compile OpenBLAS yourself on these systems,
+ - run `./configure --enable-optimization --with-blas-libs="-lopenblas"`
+- Using ATLAS in Debian, Ubuntu, Mint: 
+ - install the package `libatlas-dev`,
+ - run `./configure --enable-optimization --with-blas-libs="-latlas -lcblas"`
+- Using ATLAS in Fedora:
+ - install the package `atlas-devel.x86_64`,
+ - run `./configure --enable-optimization --with-blas-libs="-L/usr/lib64/atlas -lsatlas"`.
+- Using Accelerate Framework on OS-X:
+ - run `./configure --enable-optimization --with-blas-libs="-framework Accelerate"`.
+See INSTALL for further details.
+
+## AVAILABILITY
+
+ from [linbox-team/fflas-ffpack](https://github.com/linbox-team/fflas-ffpack)
+
+## AUTHORS
+
+The FFLAS-FFPACK group (see AUTHORS file for a list of contributors).
+
+## Citing FFLAS-FFPACK
+
+If your research depends on the FFLAS-FFPACK library, please consider citing the project as
+
+```
+@manual{fflas-ffpack,
+title = {{FFLAS-FFPACK}: {F}inite {F}ield {L}inear {A}lgebra {S}ubroutines / {P}ackage},
+author = {The FFLAS-FFPACK group},
+edition = {v2.2.1},
+year = {2016},
+note = {\url{http://github.com/linbox-team/fflas-ffpack}}
+}
+```
+
+Or you may also consider citing the related research article:
+```
+@article{DGP:2008,
+author = {Jean-Guillaume Dumas and Pascal Giorgi and Cl{\'e}ment Pernet},
+title = {Dense Linear Algebra over Word-Size Prime Fields: the FFLAS and FFPACK Packages},
+journal = {ACM Trans. on Mathematical Software (TOMS)},
+volume = {35},
+number = {3},
+year = {2008},
+issn = {0098-3500},
+pages = {1--42},
+doi = {10.1145/1391989.1391992},
+publisher = {ACM Press},
+address = {New York, NY, USA}
+}
+```
+
+## Contact and discussion
+
+For any bug report, feature or help request, please file an issue on GitHub's [issue tracker](https://github.com/linbox-team/fflas-ffpack/issues).
+
+Please address any other request, suggestion and comment to the discussion group [ffpack-devel](http://groups.google.com/group/ffpack-devel).
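
To illustrate the "source code library" usage described in the README above: the routines are function templates parameterized by the field, so the same user code can be instantiated over any supported coefficient domain. The sketch below is not part of the patch; the demo() helper is invented for illustration, while the fflas_new/fgemm/write_field calls follow the same pattern as the examples/2x2-fgemm.C and examples/2x2-pluq.C files touched by this commit.

```
// Sketch only: one field-generic routine, instantiated over two different fields.
#include <iostream>
#include <givaro/modular.h>
#include "fflas-ffpack/fflas-ffpack-config.h"
#include "fflas-ffpack/fflas-ffpack.h"
#include "fflas-ffpack/utils/Matio.h"

// Form C = A*B with FFLAS::fgemm over an arbitrary field F;
// all matrices are n x n, row major, with leading dimension n.
template <class Field>
void demo(const Field& F, size_t n)
{
    typename Field::Element_ptr A = FFLAS::fflas_new(F, n, n);
    typename Field::Element_ptr B = FFLAS::fflas_new(F, n, n);
    typename Field::Element_ptr C = FFLAS::fflas_new(F, n, n);
    for (size_t i = 0; i < n*n; ++i) {
        F.init(A[i], i + 1);  // small consecutive values, reduced in F
        F.init(B[i], i + 5);
    }
    // C <- 1*A*B + 0*C
    FFLAS::fgemm(F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
                 n, n, n, F.one, A, n, B, n, F.zero, C, n);
    write_field(F, std::cout << "C:=", C, n, n, n, true) << std::endl;
    FFLAS::fflas_delete(A);
    FFLAS::fflas_delete(B);
    FFLAS::fflas_delete(C);
}

int main()
{
    demo(Givaro::Modular<double>(101), 2);  // the same template code ...
    demo(Givaro::Modular<float>(11), 2);    // ... over two different fields
    return 0;
}
```

Compile flags and link libraries for such a translation unit can be queried from the fflas-ffpack-config script (--cflags / --libs) or from the installed pkg-config file, both of which this patch updates.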
diff --git a/autogen.sh b/autogen.sh
index 1b06ba5..052080f 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -68,8 +68,10 @@ LIBTOOLIZE=libtoolize
 (uname -a|grep -v Darwin) < /dev/null > /dev/null 2>&1 ||
 {
 echo "....Adding fix for OSX"
-LIBTOOL=glibtool
-LIBTOOLIZE=glibtoolize
+if command -v "glibtoolize" >/dev/null; then
+    LIBTOOL=glibtool
+    LIBTOOLIZE=glibtoolize
+fi
 }
 
 
diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am
old mode 100644
new mode 100755
index e9bb878..f01a39f
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@@ -31,7 +31,7 @@ AM_LDFLAGS=-static $(PARLIBS)
 
 PERFPUBLISHERFILE=benchmarks-report.xml
 
-FFLA_BENCH =    benchmark-fgemm benchmark-wino benchmark-ftrsm  benchmark-ftrtri  benchmark-inverse  benchmark-lqup benchmark-pluq benchmark-charpoly benchmark-fgemm-mp benchmark-fgemv-mp benchmark-ftrsm-mp benchmark-lqup-mp
+FFLA_BENCH =    benchmark-fgemm benchmark-wino benchmark-ftrsm  benchmark-ftrtri  benchmark-inverse  benchmark-lqup benchmark-pluq benchmark-charpoly benchmark-fgemm-mp benchmark-fgemv-mp benchmark-ftrsm-mp benchmark-lqup-mp benchmark-checkers
 BLAS_BENCH =    benchmark-sgemm$(EXEEXT) benchmark-dgemm benchmark-dtrsm
 LAPA_BENCH =    benchmark-dtrtri benchmark-dgetri benchmark-dgetrf
 
@@ -69,6 +69,7 @@ benchmark_charpoly_SOURCES = benchmark-charpoly.C
 benchmark_lqup_SOURCES = benchmark-lqup.C
 benchmark_lqup_mp_SOURCES = benchmark-lqup-mp.C
 benchmark_pluq_SOURCES = benchmark-pluq.C
+benchmark_checkers_SOURCES = benchmark-checkers.C
 
 benchmark_sgemm_CXXFLAGS = $(AM_CXXFLAGS) -D__SGEMM__
 
diff --git a/benchmarks/benchmark-charpoly.C b/benchmarks/benchmark-charpoly.C
index cf445d3..8e642d8 100644
--- a/benchmarks/benchmark-charpoly.C
+++ b/benchmarks/benchmark-charpoly.C
@@ -39,7 +39,7 @@ int main(int argc, char** argv) {
   
 	size_t iter = 1;
 	int    q    = 131071;
-	int    n    = 2000;
+	size_t    n    = 2000;
 	std::string file = "";
   	static int variant =0;
 
@@ -100,7 +100,7 @@ int main(int argc, char** argv) {
 	// -----------
 	// Standard output for benchmark - Alexis Breust 2014/11/14
 	std::cerr << "Time: " << time / double(iter)
-		  << " Gflops: " << "irrelevant";
+		  << " Gflops: " << "Irrelevant";
 	FFLAS::writeCommandString(std::cerr, as) << std::endl;
 
   return 0;
diff --git a/benchmarks/benchmark-checkers.C b/benchmarks/benchmark-checkers.C
new file mode 100644
index 0000000..aa0651d
--- /dev/null
+++ b/benchmarks/benchmark-checkers.C
@@ -0,0 +1,306 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+
+/*
+ * Copyright (C) 2015 the FFLAS-FFPACK group
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ *
+ * This file is Free Software and part of FFLAS-FFPACK.
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *
+ */
+
+#define ENABLE_ALL_CHECKINGS 1 // DO NOT CHANGE
+#define _NR_TESTS 5
+#define _MAX_SIZE_MATRICES 1000
+
+#include "fflas-ffpack/config-blas.h"
+#include <iostream>
+#include <stdlib.h>
+#include <time.h>
+#include "fflas-ffpack/fflas-ffpack.h"
+#include "fflas-ffpack/utils/args-parser.h"
+#include "fflas-ffpack/utils/fflas_randommatrix.h"
+#include "fflas-ffpack/utils/timer.h"
+#include "fflas-ffpack/fflas/fflas.h"
+#include "fflas-ffpack/checkers/checkers_fflas.h"
+#include "fflas-ffpack/checkers/checkers_ffpack.h"
+#include <fstream>
+
+using namespace std;
+
+int main(int argc, char** argv) {
+	size_t NR_TESTS = _NR_TESTS;
+	int    q    = 131071;
+	size_t    MAX_SIZE_MATRICES    = _MAX_SIZE_MATRICES;
+	size_t Range = 500;
+	size_t seed( (int) time(NULL) );
+	std::string file("checkers_report.txt");
+
+	Argument as[] = {
+		{ 'q', "-q Q", "Set the field characteristic (-1 for random).",  TYPE_INT , &q },
+		{ 'n', "-n N", "Set the dimension of the matrix.",               TYPE_INT , &MAX_SIZE_MATRICES },
+		{ 'i', "-i R", "Set number of repetitions.",                     TYPE_INT , &NR_TESTS },
+		{ 'r', "-r R", "Set the range of matrix sizes.",                     TYPE_INT , &Range },
+        { 's', "-s N", "Set the seed.", TYPE_INT , &seed },
+		{ 'f', "-f FILE", "Set the output file.",  TYPE_STR , &file },
+ 		END_OF_ARGUMENTS
+	};
+
+	std::ofstream stats_f(file.c_str());
+
+	FFLAS::parseArguments(argc,argv,as);
+
+	srand (seed);
+
+	typedef Givaro::Modular<double> Field;
+	typedef std::vector<Field::Element> Polynomial;
+
+	Field F(q);
+	Field::RandIter Rand(F,0,seed);
+	Field::NonZeroRandIter NZRand(Rand);
+
+	size_t pass;
+	FFLAS::Timer chrono,global;
+	double gffop(0.);
+	global.start();
+	double time1, time2;
+
+	Field::Element_ptr A = FFLAS::fflas_new(F,MAX_SIZE_MATRICES+Range,MAX_SIZE_MATRICES+Range);
+	Field::Element_ptr B = FFLAS::fflas_new(F,MAX_SIZE_MATRICES+Range,MAX_SIZE_MATRICES+Range);
+	Field::Element_ptr C = FFLAS::fflas_new(F,MAX_SIZE_MATRICES+Range,MAX_SIZE_MATRICES+Range);
+	typename Field::Element alpha,beta,tmp;
+	F.init(alpha, rand()%1000+1);
+	F.init(beta,  rand()%1000+1);
+	size_t m,n,k,lda,ldb,ldc;
+	FFLAS::FFLAS_TRANSPOSE ta,tb;
+
+        stats_f << "     Matrix size\tSuccess rate\t\tTime comput.\t\tTime checker\n\n";
+
+	// #####   FGEMM   #####
+	stats_f << "FGEMM:\n";
+	for (size_t i=0; i<MAX_SIZE_MATRICES; i+=Range) {
+		pass = 0; time1 = 0.0; time2 = 0.0;
+		for (size_t j=0; j<NR_TESTS; ++j) {
+			m = rand() % Range + i;
+			n = rand() % Range + i;
+			k = rand() % Range + i;
+			gffop += (2.*double(m)/1000.*double(n)/1000.*double(k)/1000.0);
+			
+			ta = FFLAS::FflasNoTrans;//rand()%2 ? FFLAS::FflasNoTrans : FFLAS::FflasTrans,
+			tb = FFLAS::FflasNoTrans;//rand()%2 ? FFLAS::FflasNoTrans : FFLAS::FflasTrans;
+			lda = ta == FFLAS::FflasNoTrans ? k : m,
+			ldb = tb == FFLAS::FflasNoTrans ? n : k,
+			ldc = n;
+
+			PAR_BLOCK { FFLAS::pfrand(F,Rand, m,k,A,m/MAX_THREADS); }
+			PAR_BLOCK { FFLAS::pfrand(F,Rand, k,n,B,k/MAX_THREADS); }
+			PAR_BLOCK { FFLAS::pfrand(F,Rand, m,n,C,n/MAX_THREADS); }
+
+			chrono.clear(); chrono.start();
+			FFLAS::ForceCheck_fgemm<Field> checker1(Rand,m,n,k,beta,C,ldc);
+			chrono.stop(); time1 += chrono.usertime();
+
+			chrono.clear(); chrono.start();
+			FFLAS::fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
+			chrono.stop(); time2 += chrono.usertime();
+
+			chrono.clear(); chrono.start();
+			pass += checker1.check(ta,tb,alpha,A,lda,B,ldb,C) ? 1 : 0;
+			chrono.stop(); time1 += chrono.usertime();
+		}
+		time1 /= NR_TESTS;
+		time2 /= NR_TESTS;
+		stats_f << "     " << i << "-" << i+Range << "\t\t" << pass << "/" << NR_TESTS << "\t\t\t" << time2 
+				<< "\t\t" << time1 << endl;
+	}
+	stats_f << endl;
+
+
+
+	// #####   FTRSM   #####
+	stats_f << "FTRSM:\n";
+	for (size_t i=0; i<MAX_SIZE_MATRICES; i+=Range) {
+		pass = 0; time1 = 0.0; time2 = 0.0;
+		for (size_t j=0; j<NR_TESTS; ++j) {
+			m = rand() % Range + i;
+			n = rand() % Range + i;
+			gffop += (double(m)/1000.*double(m)/1000.*double(n)/1000.0);
+			
+			FFLAS::FFLAS_SIDE side = rand()%2?FFLAS::FflasLeft:FFLAS::FflasRight;
+			FFLAS::FFLAS_UPLO uplo = rand()%2?FFLAS::FflasLower:FFLAS::FflasUpper;
+			FFLAS::FFLAS_TRANSPOSE trans = rand()%2?FFLAS::FflasNoTrans:FFLAS::FflasTrans;
+			FFLAS::FFLAS_DIAG diag = rand()%2?FFLAS::FflasNonUnit:FFLAS::FflasUnit;
+			k = (side==FFLAS::FflasLeft?m:n);
+
+			for( size_t i = 0; i < m*n; ++i ) Rand.random( *(B+i) );
+			for (size_t i=0;i<k;++i) {
+				for (size_t j=0;j<i;++j)
+					A[i*k+j]= (uplo == FFLAS::FflasLower)? Rand.random(tmp) : F.zero;
+				A[i*k+i]= (diag == FFLAS::FflasNonUnit)? NZRand.random(tmp) : F.one;
+				for (size_t j=i+1;j<k;++j)
+					A[i*k+j]= (uplo == FFLAS::FflasUpper)? Rand.random(tmp) : F.zero;
+			}
+
+			chrono.clear(); chrono.start();
+                        FFLAS::ForceCheck_ftrsm<Field> checker2(Rand, m, n, alpha, B, n);
+			chrono.stop(); time1 += chrono.usertime();
+
+			chrono.clear(); chrono.start();
+			FFLAS::ftrsm(F, side, uplo, trans, diag, m, n, alpha, A, k, B, n);
+			chrono.stop(); time2 += chrono.usertime();
+
+			chrono.clear(); chrono.start();
+			pass += checker2.check(side, uplo, trans, diag, m, n, A, k, B, n);
+			chrono.stop(); time1 += chrono.usertime();
+		}
+		time1 /= NR_TESTS;
+		time2 /= NR_TESTS;
+		stats_f << "     " << i << "-" << i+Range << "\t\t" << pass << "/" << NR_TESTS << "\t\t\t" << time2 
+				<< "\t\t" << time1 << endl;
+	}
+	stats_f << endl;
+
+
+
+	// #####   INVERT   #####
+	stats_f << "INVERT:\n";
+	int nullity;
+	for (size_t i=0; i<MAX_SIZE_MATRICES; i+=Range) {
+		pass = 0; time1 = 0.0; time2 = 0.0;
+		for (size_t j=0; j<NR_TESTS; ++j) {
+			m = rand() % Range + i;
+			gffop += 2*(double(m)/1000.*double(m)/1000.*double(m)/1000.0);
+			
+			FFPACK::RandomMatrixWithRankandRandomRPM(F,A,m,m,m,m);
+
+			try {
+				chrono.clear(); chrono.start();
+				FFPACK::ForceCheck_invert<Field> checker3(Rand,m,A,m);
+				chrono.stop(); time1 += chrono.usertime();
+				
+				chrono.clear(); chrono.start();
+				FFPACK::Invert(F,m,A,m,nullity);
+				chrono.stop(); time2 += chrono.usertime();
+				
+				chrono.clear(); chrono.start();
+				pass += checker3.check(A,nullity);
+				chrono.stop(); time1 += chrono.usertime();
+			} catch(FailureInvertCheck &e) {
+				stats_f << " invert verification failed! " << nullity << std::endl;
+			} catch(FailurePLUQCheck &e) {
+				stats_f << " internal PLUQ verification failed! " << std::endl;
+			}
+		}
+		time1 /= NR_TESTS;
+		time2 /= NR_TESTS;
+		stats_f << "     " << i << "-" << i+Range << "\t\t" << pass << "/" << NR_TESTS << "\t\t\t" << time2 
+				<< "\t\t" << time1 << endl;
+	}
+	stats_f << endl;
+	
+
+
+
+	// #####   PLUQ   #####
+	stats_f << "PLUQ:\n";
+	for (size_t i=0; i<MAX_SIZE_MATRICES; i+=Range) {
+		pass = 0; time1 = 0.0; time2 = 0.0;
+		for (size_t j=0; j<NR_TESTS; ++j) {
+			m = rand() % Range + i;
+			n = rand() % Range + i;
+
+			PAR_BLOCK { FFLAS::pfrand(F,Rand, m,n,A,m/MAX_THREADS); }
+
+			size_t *P = FFLAS::fflas_new<size_t>(m);
+			size_t *Q = FFLAS::fflas_new<size_t>(n);
+
+			chrono.clear(); chrono.start();
+                        FFPACK::ForceCheck_PLUQ<Field> checker4 (Rand,m,n,A,n);
+			chrono.stop(); time1 += chrono.usertime();
+
+			chrono.clear(); chrono.start();
+			k = FFPACK::PLUQ(F, FFLAS::FflasNonUnit, m, n, A, n, P, Q);
+			chrono.stop(); time2 += chrono.usertime();
+
+#define CUBE(x) ((x)*(x)*(x))
+			gffop += 2.0/3.0*CUBE(double(k)/1000.0) +2*m/1000.0*n/1000.0*double(k)/1000.0  - double(k)/1000.0*double(k)/1000.0*(m+n)/1000;
+
+			chrono.clear(); chrono.start();
+			pass += checker4.check(A,n,k,P,Q);
+			chrono.stop(); time1 += chrono.usertime();
+
+			FFLAS::fflas_delete(P,Q);
+		}
+		time1 /= NR_TESTS;
+		time2 /= NR_TESTS;
+		stats_f << "     " << i << "-" << i+Range << "\t\t" << pass << "/" << NR_TESTS << "\t\t\t" << time2 
+				<< "\t\t" << time1 << endl;
+	}
+	stats_f << endl;
+	global.stop();
+
+
+
+	// #####   CharPoly   #####
+	stats_f << "CharPoly:\n";
+	for (size_t i=0; i<MAX_SIZE_MATRICES; i+=Range) {
+		pass = 0; time1 = 0.0; time2 = 0.0;
+		for (size_t j=0; j<NR_TESTS; ++j) {
+			n = rand() % Range + i;
+
+			PAR_BLOCK { FFLAS::pfrand(F,Rand, n,n,A,n/MAX_THREADS); }
+
+			try {
+			Polynomial g(n);
+
+			chrono.clear(); chrono.start();
+            FFPACK::ForceCheck_charpoly<Field,Polynomial> checker5(Rand,n,A,n);
+			chrono.stop(); time1 += chrono.usertime();
+
+			chrono.clear(); chrono.start();
+			FFPACK::CharPoly(F,g,n,A,n,FFPACK::FfpackLUK);
+			chrono.stop(); time2 += chrono.usertime();
+
+			chrono.clear(); chrono.start();
+			pass += checker5.check(g);
+			chrono.stop(); time1 += chrono.usertime();
+			} catch(FailureCharpolyCheck &e) {
+				stats_f << " charpoly verification failed! " << std::endl;
+			} catch(FailurePLUQCheck &e) {
+				stats_f << " internal PLUQ verification failed! " << std::endl;
+			}
+		}
+		time1 /= NR_TESTS;
+		time2 /= NR_TESTS;
+		stats_f << "     " << i << "-" << i+Range << "\t\t" << pass << "/" << NR_TESTS << "\t\t\t" << time2 
+				<< "\t\t" << time1 << endl;
+	}
+
+
+	FFLAS::fflas_delete(A);
+	FFLAS::fflas_delete(B);
+	FFLAS::fflas_delete(C);
+
+	std::cout << "Time: " << global.realtime()
+			  << " Gflops: " << gffop/global.realtime() << std::endl;
+
+	return 0;
+}
diff --git a/benchmarks/benchmark-dgemm.C b/benchmarks/benchmark-dgemm.C
index be1c356..b923cd3 100644
--- a/benchmarks/benchmark-dgemm.C
+++ b/benchmarks/benchmark-dgemm.C
@@ -54,7 +54,7 @@ int main(int argc, char** argv) {
   
 	size_t iter = 1;
 	int    q    = 1009;
-	int n    = 2000;
+	size_t n    = 2000;
 	std::string file1 = "";
 	std::string file2 = "";
   
@@ -88,8 +88,8 @@ int main(int argc, char** argv) {
       Field::RandIter G(F);
       A = FFLAS::fflas_new<Element>(n*n);
 #pragma omp parallel for
-      for (int i=0; i<n; ++i)
-          for (int j=0; j<n; ++j)
+      for (size_t i=0; i<n; ++i)
+          for (size_t j=0; j<n; ++j)
               G.random(*(A+i*n+j));
     }
 
@@ -100,8 +100,8 @@ int main(int argc, char** argv) {
       Field::RandIter G(F);
       B = FFLAS::fflas_new<Element>(n*n);
 #pragma omp parallel for
-      for (int i=0; i<n; ++i)
-          for (int j=0; j<n; ++j)
+      for (size_t i=0; i<n; ++i)
+          for (size_t j=0; j<n; ++j)
               G.random(*(B+i*n+j));
     }
 
@@ -124,8 +124,8 @@ int main(int argc, char** argv) {
       Field::RandIter G(F);
       A = FFLAS::fflas_new<Element>(n*n);
 #pragma omp parallel for
-      for (int i=0; i<n; ++i)
-          for (int j=0; j<n; ++j)
+      for (size_t i=0; i<n; ++i)
+          for (size_t j=0; j<n; ++j)
               G.random(*(A+i*n+j));
     }
 
@@ -136,8 +136,8 @@ int main(int argc, char** argv) {
       Field::RandIter G(F);
       B = FFLAS::fflas_new<Element>(n*n);
 #pragma omp parallel for
-      for (int i=0; i<n; ++i)
-          for (int j=0; j<n; ++j)
+      for (size_t i=0; i<n; ++i)
+          for (size_t j=0; j<n; ++j)
               G.random(*(B+i*n+j));
     }
 
diff --git a/benchmarks/benchmark-dgetrf.C b/benchmarks/benchmark-dgetrf.C
index 30884e6..7b5b58f 100644
--- a/benchmarks/benchmark-dgetrf.C
+++ b/benchmarks/benchmark-dgetrf.C
@@ -54,7 +54,7 @@ int main(int argc, char** argv) {
   
 	size_t iter = 1;
 	int    q    = 1009;
-	int    n    = 2000;
+	size_t    n    = 2000;
 	std::string file = "";
 	
 	size_t NBK = MAX_THREADS;
diff --git a/benchmarks/benchmark-dgetri.C b/benchmarks/benchmark-dgetri.C
index 0eebd50..de387d6 100644
--- a/benchmarks/benchmark-dgetri.C
+++ b/benchmarks/benchmark-dgetri.C
@@ -60,7 +60,7 @@ int main(int argc, char** argv) {
   
 	size_t iter = 1;
 	int    q    = 1009;
-	int    n    = 2000;
+	size_t    n    = 2000;
 	std::string file = "";
   
 	Argument as[] = {
diff --git a/benchmarks/benchmark-dtrsm.C b/benchmarks/benchmark-dtrsm.C
index a8303d5..5448618 100644
--- a/benchmarks/benchmark-dtrsm.C
+++ b/benchmarks/benchmark-dtrsm.C
@@ -44,7 +44,7 @@ int main(int argc, char** argv) {
   
 	size_t iter = 1;
 	int    q    = 1009;
-	int    n    = 2000;
+	size_t    n    = 2000;
 	std::string file1 = "";
 	std::string file2 = "";
   
diff --git a/benchmarks/benchmark-dtrtri.C b/benchmarks/benchmark-dtrtri.C
index 0c3b5c5..3063e7c 100644
--- a/benchmarks/benchmark-dtrtri.C
+++ b/benchmarks/benchmark-dtrtri.C
@@ -47,7 +47,7 @@ int main(int argc, char** argv) {
   
 	size_t iter = 1;
 	int    q    = 1009;
-	int    n    = 2000;
+	size_t    n    = 2000;
 	std::string file = "";
   
 	Argument as[] = {
diff --git a/benchmarks/benchmark-fgemm-mp.C b/benchmarks/benchmark-fgemm-mp.C
old mode 100755
new mode 100644
index f7d49b3..d446cf9
--- a/benchmarks/benchmark-fgemm-mp.C
+++ b/benchmarks/benchmark-fgemm-mp.C
@@ -240,8 +240,12 @@ int tmain(){
 
 	double Gflops=(2.*double(m)/1000.*double(n)/1000.*double(k)/1000.0) / time * double(iters);
 // 	Gflops*=p.bitsize()/16.;
-	cout<<typeid(Ints).name()
-        << " | Time: "<< (time/double(iters)) << " (total:" << time <<")  Gflops: "<<Gflops<<"  | perword: "<< (Gflops*double(p.bitsize()))/64. ;
+	cout  << "Time: "<< (time/double(iters))
+	      <<" Gflops: "<<Gflops
+	      << " (total:" << time <<") "
+	      <<typeid(Ints).name()
+	      <<"  | perword: "<< (Gflops*double(p.bitsize()))/64. ;
+
 	FFLAS::writeCommandString(std::cout << '|' << p << " (" << p.bitsize()<<")|", as) << "  | Freivalds: "<< timev/double(iters) << std::endl;
 
 #ifdef BENCH_FLINT	
diff --git a/benchmarks/benchmark-fgemm.C b/benchmarks/benchmark-fgemm.C
index 11e23e6..6b85def 100644
--- a/benchmarks/benchmark-fgemm.C
+++ b/benchmarks/benchmark-fgemm.C
@@ -64,7 +64,7 @@ int main(int argc, char** argv) {
 	size_t k = 2000 ;
 	size_t n = 2000 ;
 	int nbw = -1 ;
-	int p=3;
+	int p=0;
 	int t=MAX_THREADS;
 	int NBK = -1;
 
@@ -108,7 +108,6 @@ int main(int argc, char** argv) {
   PAR_BLOCK { pfrand(F,G, k,n,B,k/NBK); }	
 
   C = fflas_new(F,m,n,Alignment::CACHE_PAGESIZE);
-  
 //#pragma omp parallel for collapse(2) schedule(runtime) 
   PAR_BLOCK { pfzero(F, m,n,C,m/NBK); }
   
@@ -134,7 +133,8 @@ int main(int argc, char** argv) {
 	      typedef StrategyParameter::ThreeDAdaptive  threeda;
 	      typedef StrategyParameter::ThreeDInPlace  threedip;
 	      PAR_BLOCK{
-	      if (i) chrono.start();
+              if (i) { chrono.start(); }
+              
 	      switch (p){
 		  case 1:{
 			  MMHelper<Field, MMHelperAlgo::Winograd, typename ModeTraits<Field>::value, ParSeqHelper::Parallel<block,threads> > WH(F,nbw, SPLITTER(t,block,threads));
diff --git a/benchmarks/benchmark-fgemv-mp.C b/benchmarks/benchmark-fgemv-mp.C
index 2a3cef6..7874578 100644
--- a/benchmarks/benchmark-fgemv-mp.C
+++ b/benchmarks/benchmark-fgemv-mp.C
@@ -167,10 +167,11 @@ int tmain(){
 
     double Mflops=((2.*double(m)-1)/1000.*double(k)/1000.0) /time * double(iters);
 // 	Mflops*=p.bitsize()/16.;
-    cout<<typeid(Ints).name()
-        << " | Time: "<< (time/double(iters))  << " (total:" << time <<") | Mflops: "<<Mflops<<"  | perword: "<< (Mflops*double(p.bitsize()))/64. ;
+    cout << "Time: "<< (time/double(iters))  <<" Mflops: "<<Mflops
+	 << " (total:" << time <<") "
+	 <<typeid(Ints).name()
+	 <<" perword: "<< (Mflops*double(p.bitsize()))/64. ;
     FFLAS::writeCommandString(std::cout << " | " << p << " (" << p.bitsize()<<")|", as)  << std::endl;
-
     return 0;
 }
  
diff --git a/benchmarks/benchmark-ftrsm.C b/benchmarks/benchmark-ftrsm.C
index 21009c8..f1c99fe 100644
--- a/benchmarks/benchmark-ftrsm.C
+++ b/benchmarks/benchmark-ftrsm.C
@@ -37,13 +37,13 @@ int main(int argc, char** argv) {
   
 	size_t iter = 3;
 	int    q    = 1009;
-	int    m    = 2000 ;
-	int    n    = 2000;
+	size_t    m    = 2000 ;
+	size_t    n    = 2000;
 	std::string file1 = "";
 	std::string file2 = "";
 	int t=MAX_THREADS;
 	int NBK = -1;
-	int p = 3; // 0 for sequential 1 for pIter-sRec ; 2 for pRec; 3 for hybrid
+	int p = 0; // 0 for sequential 1 for pIter-sRec ; 2 for pRec; 3 for hybrid
 
 	Argument as[] = {
 		{ 'q', "-q Q", "Set the field characteristic (-1 for random).",  TYPE_INT , &q },
diff --git a/benchmarks/benchmark-ftrtri.C b/benchmarks/benchmark-ftrtri.C
index 7575cc1..d70a261 100644
--- a/benchmarks/benchmark-ftrtri.C
+++ b/benchmarks/benchmark-ftrtri.C
@@ -39,7 +39,7 @@ int main(int argc, char** argv) {
   
 	size_t iter = 1;
 	int    q    = 1009;
-	int    n    = 2000;
+	size_t    n    = 2000;
 	std::string file = "";
   
 	Argument as[] = {
diff --git a/benchmarks/benchmark-inverse.C b/benchmarks/benchmark-inverse.C
index 2593144..34ce925 100644
--- a/benchmarks/benchmark-inverse.C
+++ b/benchmarks/benchmark-inverse.C
@@ -39,7 +39,7 @@ int main(int argc, char** argv) {
   
 	size_t iter = 1;
 	int    q    = 1009;
-	int    n    = 2000;
+	size_t    n    = 2000;
 	std::string file = "";
   
 	Argument as[] = {
@@ -52,7 +52,7 @@ int main(int argc, char** argv) {
 
 	FFLAS::parseArguments(argc,argv,as);
 
-  typedef Givaro::Modular<double> Field;
+  typedef Givaro::ModularBalanced<double> Field;
   typedef Field::Element Element;
 
   Field F(q);
diff --git a/benchmarks/benchmark-lqup.C b/benchmarks/benchmark-lqup.C
index a4793fe..a64f761 100644
--- a/benchmarks/benchmark-lqup.C
+++ b/benchmarks/benchmark-lqup.C
@@ -38,7 +38,7 @@ int main(int argc, char** argv) {
   
 	size_t iter = 1;
 	int    q    = 1009;
-	int    n    = 2000;
+	size_t    n    = 2000;
 	std::string file = "";
   
 	Argument as[] = {
diff --git a/benchmarks/benchmark-pluq.C b/benchmarks/benchmark-pluq.C
index 1f7e786..cb691ea 100644
--- a/benchmarks/benchmark-pluq.C
+++ b/benchmarks/benchmark-pluq.C
@@ -30,6 +30,21 @@
 //#define  __FFLASFFPACK_FORCE_SEQ
 //#define WINOPAR_KERNEL
 //#define CLASSIC_SEQ
+// #define PROFILE_PLUQ
+// #define MONOTONIC_CYCLES
+// #define MONOTONIC_MOREPIVOTS
+// #define MONOTONIC_FEWPIVOTS
+
+#ifdef MONOTONIC_CYCLES
+  #define MONOTONIC_APPLYP
+#endif
+#ifdef MONOTONIC_MOREPIVOTS
+  #define MONOTONIC_APPLYP
+#endif
+#ifdef MONOTONIC_FEWPIVOTS
+  #define MONOTONIC_APPLYP
+#endif
+
 #include "fflas-ffpack/fflas-ffpack-config.h"
 #include <givaro/modular.h>
 #include <givaro/givranditer.h>
@@ -49,9 +64,9 @@
 
 using namespace std;
 
-//typedef Givaro::ModularBalanced<double> Field;
+typedef Givaro::ModularBalanced<double> Field;
 //typedef Givaro::ModularBalanced<float> Field;
-typedef Givaro::ZRing<double> Field;
+//typedef Givaro::ZRing<double> Field;
 //typedef Givaro::UnparametricZRing<double> Field;
 
 void verification_PLUQ(const Field & F, typename Field::Element * B, typename Field::Element * A,
@@ -150,14 +165,13 @@ int main(int argc, char** argv) {
 	
 	size_t iter = 3 ;
 	int q = 131071 ;
-	Field F(q);
 	int m = 2000 ;
 	int n = 2000 ;
 	int r = 2000 ;
 	int v = 0;
 	int t=MAX_THREADS;
 	int NBK = -1;
-	bool par=true;
+	bool par=false;
 	Argument as[] = {
 		{ 'q', "-q Q", "Set the field characteristic (-1 for random).",         TYPE_INT , &q },
 		{ 'm', "-m M", "Set the row dimension of A.",      TYPE_INT , &m },
@@ -171,12 +185,12 @@ int main(int argc, char** argv) {
 		END_OF_ARGUMENTS
 	};
 	FFLAS::parseArguments(argc,argv,as);
-
+	Field F(q);
 	if (r > std::min(m,n)){
 		std::cerr<<"Warning: rank can not be greater than min (m,n). It has been forced to min (m,n)"<<std::endl;
 		r=std::min(m,n);
 	}
-	if (!par) t=1;NBK=1;
+	if (!par) { t=1;NBK=1;}
 	if (NBK==-1) NBK = t;
 
 	Field::Element_ptr A,  Acop;
@@ -229,8 +243,9 @@ int main(int argc, char** argv) {
 				BC = n/NUM_THREADS;
 			}
 		}
-		else
+		else{
 			R = FFPACK::PLUQ(F, diag, m, n, A, n, P, Q);
+		}
 		if (i) {chrono.stop(); time[i-1]=chrono.realtime();}
 		
 	}
@@ -245,7 +260,7 @@ int main(int argc, char** argv) {
 			  << " Gflops: " << gflop / meantime << " BC: "<<BC;
 	FFLAS::writeCommandString(std::cout, as) << std::endl;
 	
-		//verification
+	//verification
 	if(v)
 		verification_PLUQ(F,Acop,A,P,Q,m,n,R);
 	
diff --git a/benchmarks/perfpublisher.sh b/benchmarks/perfpublisher.sh
index 8be3168..b010b39 100755
--- a/benchmarks/perfpublisher.sh
+++ b/benchmarks/perfpublisher.sh
@@ -8,12 +8,24 @@ XMLFILE=$1
 benchmarks=$2
 COMPILER=$3
 
+# choose gdate on OS X
+if command -v "gdate" >/dev/null; then
+    DATE=gdate
+else
+    DATE=date
+fi
 #=================#
 # Plateform infos #
 #=================#
 
 COMPILERVERSION=$($COMPILER --version 2>&1 | head -1)
-CPUFREQ=$(lscpu | grep "MHz" | rev | cut -f1 -d' ' | rev)
+
+if command -v "lscpu" >/dev/null; then
+    CPUFREQ=$(lscpu | grep "MHz" | rev | cut -f1 -d' ' | rev)
+else
+    CPUFREQ=$((`sysctl -n hw.cpufrequency`/1000000))
+fi
+
 ARCH=$(uname -m)
 OSNAME=$(uname -s)
 OSVERSION=$(uname -r)
@@ -45,8 +57,8 @@ echo '<report name="benchmarks-report" categ="benchmarks">' >> $XMLFILE
 #=======#
 
 echo '<start>' >> $XMLFILE
-echo '<date format="YYYYMMDD" val="'$(date +%Y%m%d)'" />' >> $XMLFILE
-echo '<time format="HHMMSS" val="'$(date +%H%M%S)'" />' >> $XMLFILE
+echo '<date format="YYYYMMDD" val="'$($DATE +%Y%m%d)'" />' >> $XMLFILE
+echo '<time format="HHMMSS" val="'$($DATE +%H%M%S)'" />' >> $XMLFILE
 echo '</start>' >> $XMLFILE
 
 #============#
@@ -59,9 +71,9 @@ do
 	then
 		#File does not exist: compile it
 		echo '[Compiling]' $benchmark
-		COMPILESTART=$(date +%s%3N)
+		COMPILESTART=$($DATE +%s%3N)
 		COMPILELOG=$(make $benchmark 2>&1; echo 'Returned state: '$?)
-		COMPILEEND=$(date +%s%3N)
+		COMPILEEND=$($DATE +%s%3N)
 		COMPILETIME=$(($COMPILEEND - $COMPILESTART))
 		COMPILECHECK=$(echo $COMPILELOG | grep -o '[^ ]*$')
 		COMPILETIMERELEVANT='true'
@@ -96,7 +108,7 @@ do
 		EXECUTED='yes'
 		EXECUTIONLOG=$(./$benchmark 2>&1)
 
-		if [[ ${EXECUTIONLOG,,} != "time:"* ]]
+		if [[ ${EXECUTIONLOG} != "Time:"* ]]
 		then
 			#Execution failure
 			PASSED='no'
@@ -114,7 +126,7 @@ do
 			EXECUTIONTIME=$(echo $EXECUTIONLOG | cut -d' ' -f2)
 			PERFORMANCEFLOPS=$(echo $EXECUTIONLOG | cut -d' ' -f4)
 			EXECUTIONTIMERELEVANT='true'
-			if [[ ${PERFORMANCEFLOPS,,} != "irrelevant" ]]
+			if [[ ${PERFORMANCEFLOPS} != "Irrelevant" ]]
 			then
 				PERFORMANCEFLOPSRELEVANT='true'
 			else
diff --git a/configure.ac b/configure.ac
index 85dfa71..fc9c205 100644
--- a/configure.ac
+++ b/configure.ac
@@ -23,13 +23,18 @@
 
 AC_PREREQ([2.61])
 
-AC_INIT([FFLAS-FFPACK], [2.2.1],[ffpack-devel at googlegroups.com],[fflas-ffpack],
+
+AC_INIT([FFLAS-FFPACK], [2.2.2],[ffpack-devel at googlegroups.com],[fflas-ffpack],
 		[https://github.com/linbox-team/fflas-ffpack])
 
+
 AC_CONFIG_MACRO_DIR([macros])
 AC_CONFIG_AUX_DIR([build-aux])
-AM_INIT_AUTOMAKE([1.8 gnu no-dependencies -Wall -Wno-portability])
 AC_CONFIG_HEADERS([config.h])
+
+AC_CANONICAL_TARGET
+
+AM_INIT_AUTOMAKE([1.8 gnu no-dependencies -Wall -Wno-portability foreign])
 AX_PREFIX_CONFIG_H(fflas-ffpack/config.h, __FFLASFFPACK)
 AC_PATH_PROG(RM, rm, $FALSE)
 RM="$RM -f"
@@ -77,7 +82,7 @@ AC_SUBST([DEFAULT_CFLAGS])
 AC_SUBST([DEBUG_CFLAGS])
 AC_SUBST([TESTS_CFLAGS])
 
-TESTS_CFLAGS="-O0"
+TESTS_CFLAGS="-O2"
 DEBUG_CFLAGS="-g"
 DEFAULT_CFLAGS=""
 WARN_CFLAGS="-Wall"
@@ -154,7 +159,7 @@ fi
 
 
 DEFAULT_CFLAGS="${DEFAULT_CFLAGS} ${WARN_CFLAGS} ${DEBUG_CFLAGS}"
-TESTS_CFLAGS="${TESTS_CFLAGS} ${WARN_CFLAGS} #${DEBUG_CFLAGS}"
+TESTS_CFLAGS="${TESTS_CFLAGS} ${WARN_CFLAGS} ${DEBUG_CFLAGS}"
 
 
 AC_HEADER_STDC
@@ -170,28 +175,33 @@ echo "-----------------------------------------------"
 echo "     START  FFLAS-FFPACK CONFIG                "
 echo "-----------------------------------------------"
 
-
-echo "-----------------------------------------------"
 FF_CHECK_OMP
+# checks which SIMD instructions are available and defines HAVE_{SSE_4_1,AVX,AVX2}_INSTRUCTIONS and compiler flags
+CUSTOM_SIMD="no"
+FF_CHECK_SIMD
+#FF_CHECK_SSE
+#FF_CHECK_AVX
+arch=`echo $target | cut -d"-" -f1`
+if [[ "x$CUSTOM_SIMD" = "xno" ]] ; then
+   AX_CHECK_X86_FEATURES([][])
+else
+   CXXFLAGS="${CXXFLAGS} ${SSEFLAGS} ${AVXFLAGS}"
+fi
 
-# TODO do FF_CHECK_SIMD and take best, define USE_SSE2/AVX/AVX2/... and have also __FFLASFFPACK_USE_SIMD
-FF_CHECK_SSE
-FF_CHECK_AVX
+dnl echo "CCNAM = $CCNAM $CUSTOM_SIMD"
 
-AVXFLAGS="${SSEFLAGS} ${AVXFLAGS}"
+dnl With GCC's default ABI version, __m128 and __m256 are mangled as the same type and therefore we
+dnl cannot have overloads for both types without a linker error.
+AS_IF([test "x$CCNAM" = "xgcc48"],[CXXFLAGS="${CXXFLAGS} -fabi-version=6"],[])
 
-echo "-----------------------------------------------"
-AC_SUBST([PARFLAGS],['${AVXFLAGS} ${OMPFLAGS}'])
-	case x${CCNAM} in
-		xgcc|xgcc44|xgcc48)
-	# With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
-			# have overloads for both types without linking error.
-			AVXFLAGS="${AVXFLAGS} -fabi-version=6"
-			;;
-		*)
-	esac
 
-AC_SUBST([PARLIBS],['${OMPFLAGS}'])
+PARFLAGS="${OMPFLAGS}"
+PARLIBS="${OMPFLAGS}"
+
+AC_SUBST(PARFLAGS)
+AC_SUBST(PARLIBS)
+
+echo "-----------------------------------------------"
 
 # Machine characteristics
 
@@ -233,17 +243,19 @@ echo "-----------------------------------------------"
 # Getting GMP from Givaro - AB 2014-12-12
 #FF_CHECK_GMP
 
-FF_CHECK_GIVARO(,,[
-echo '*******************************************************************************'
-echo ' WARNING: GIVARO not found!'
-echo
-echo ' GIVARO library is required for some tests in this library.'
-echo ' Please make sure GIVARO is installed and specify its location with the'
-echo ' option --with-givaro=<prefix> when running configure.'
-echo ' Do not forget to set/export LD_LIBRARY_PATH if necessary.'
-echo '*******************************************************************************'
-exit 1
-])
+PKG_CHECK_MODULES([GIVARO],[givaro])
+
+dnl FF_CHECK_GIVARO(,,[
+dnl echo '*******************************************************************************'
+dnl echo ' WARNING: GIVARO not found!'
+dnl echo
+dnl echo ' GIVARO library is required for some tests in this library.'
+dnl echo ' Please make sure GIVARO is installed and specify its location with the'
+dnl echo ' option --with-givaro=<prefix> when running configure.'
+dnl echo ' Do not forget to set/export LD_LIBRARY_PATH if necessary.'
+dnl echo '*******************************************************************************'
+dnl exit 1
+dnl ])
 
 BLAS_FOUND=false
 
@@ -344,6 +356,7 @@ fflas-ffpack/utils/Makefile
 fflas-ffpack/paladin/Makefile
 fflas-ffpack/interfaces/Makefile
 fflas-ffpack/interfaces/libs/Makefile
+fflas-ffpack/checkers/Makefile
 doc/Makefile
 tests/Makefile
 tests/data/Makefile
diff --git a/examples/2x2-fgemm.C b/examples/101-fgemm.C
similarity index 100%
copy from examples/2x2-fgemm.C
copy to examples/101-fgemm.C
diff --git a/examples/2x2-fgemm.C b/examples/2x2-fgemm.C
index 0d76030..7dda181 100644
--- a/examples/2x2-fgemm.C
+++ b/examples/2x2-fgemm.C
@@ -33,44 +33,26 @@ using namespace FFLAS;
 
 int main(int argc, char** argv) {
 
-	typedef Givaro::ModularBalanced<float> Ring;
-	Ring F(101);
+	typedef Givaro::Modular<float> Ring;
+	Ring F(11);
 
-	Ring::Element * A, * B, * C;
+	Ring::Element A[4]{1,2,3,4}, B[4]{5,6,7,8}, * C;
 
-	A = fflas_new(F,2,3);
-	B = fflas_new(F,3,2);
-	C = fflas_new(F,2,2);  
+    size_t m(2),k(2),n(2);
 
-	F.assign(*(A+0),F.one);
-	F.init(*(A+1),2);
-	F.init(*(A+2),3);
-	F.init(*(A+3),5);
-	F.init(*(A+4),7);
-	F.init(*(A+5),11);
-
-        Ring::Element t,u,v; 
-        F.init(t, 2); F.init(u, 4); F.init(v);
-
-	F.assign(*(B+0),F.zero);		// B[0] <- 0
-	F.assign(*(B+1),t);			// B[1] <- 2
-	F.assign(*(B+2),u);			// B[2] <- 4 
-        F.add(v,t,u); F.assign(*(B+3),v);	// B[3] <- 2+4
-	F.mul(*(B+4),t,u);			// B[4] <- 2*4
-	F.add(*(B+5),u,v);			// B[5] <- 4+6
+	C = fflas_new(F,m,n);
 	
-	write_field(F, std::cout << "A:=", A, 2, 3, 3,true) << std::endl;
-	write_field(F, std::cout << "B:=", B, 3, 2, 2,true) << std::endl;
+        // A is mxk with leading dimension k
+	write_field(F, std::cout << "A:=", A, m, k, k, true) << std::endl;
+        // B is kxn with leading dimension n
+	write_field(F, std::cout << "B:=", B, k, n, n, true) << std::endl;
 
-	fgemm (F, FflasNoTrans, FflasNoTrans, 2,2,3, F.one, A, 3, B, 2, F.zero, C, 2 );
+	fgemm (F, FflasNoTrans, FflasNoTrans, m, n, k, F.one, A, m, B, n, F.zero, C, n);
 
-	write_field(F, std::cout << "C:=", C, 2, 2, 2,true) << std::endl;
+        // C is mxn with leading dimension n
+	write_field(F, std::cout << "C:=", C, m, n, n, true) << " modulo 11" << std::endl;
 	
-	fflas_delete( A);
-	fflas_delete( B);
 	fflas_delete( C);
-	
-	
 
   return 0;
 }
diff --git a/examples/2x2-pluq.C b/examples/2x2-pluq.C
new file mode 100644
index 0000000..b33e806
--- /dev/null
+++ b/examples/2x2-pluq.C
@@ -0,0 +1,63 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+
+
+/* Copyright (c) FFLAS-FFPACK
+* ========LICENCE========
+* This file is part of the library FFLAS-FFPACK.
+*
+* FFLAS-FFPACK is free software: you can redistribute it and/or modify
+* it under the terms of the  GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with this library; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+* ========LICENCE========
+*/
+
+#include <iostream>
+#include <vector>
+#include <givaro/modular.h>
+#include "fflas-ffpack/fflas-ffpack-config.h"
+#include "fflas-ffpack/fflas-ffpack.h"
+#include "fflas-ffpack/utils/Matio.h"
+
+using namespace std;
+
+int main(int argc, char** argv) {
+	
+	if (argc > 2){
+		std::cerr<<"Usage: 2x2-pluq <p>"<<std::endl;
+		return -1;
+	}
+
+	int64_t p = (argc>1?atoi(argv[1]):5);
+		// Creating the finite field Z/pZ
+	Givaro::Modular<double> F(p);
+
+	size_t m(2),n(2);
+	double A[4] {1,2,3,4};
+	write_field(F,std::cout<<"A = "<<std::endl,A,m,n,n);
+
+    size_t * P = FFLAS::fflas_new<size_t>(m);
+    size_t * Q = FFLAS::fflas_new<size_t>(n);
+
+    FFPACK::PLUQ (F, FFLAS::FflasNonUnit, m, n, A, n, P, Q);
+
+	write_perm(std::cout<<"P = "<<std::endl,P,m);
+	write_field(F,std::cout<<"LU = "<<std::endl,A,m,n,n)<< " modulo " << p << std::endl;
+	write_perm(std::cout<<"Q = "<<std::endl,Q,n);
+
+    FFLAS::fflas_delete( P);
+    FFLAS::fflas_delete( Q);
+		
+	return 0;
+}
+
diff --git a/examples/Makefile.am b/examples/Makefile.am
index 1d81f01..0db1b07 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -29,8 +29,11 @@ AM_CPPFLAGS +=  $(CBLAS_FLAG) $(GIVARO_CFLAGS) $(OPTFLAGS) -I$(top_srcdir)/fflas
 LDADD = $(CBLAS_LIBS) $(GIVARO_LIBS) $(CUDA_LIBS)
 AM_LDFLAGS=-static $(PARLIBS)
 
-FFLA_EXAMP =    2x2-fgemm 
+FFLA_EXAMP =    2x2-fgemm pluq 101-fgemm 2x2-pluq
 2x2_fgemm_SOURCES = 2x2-fgemm.C
+pluq_SOURCES = pluq.C
+101_fgemm_SOURCES = 101-fgemm.C
+2x2_pluq_SOURCES = 2x2-pluq.C
 
 if FFLASFFPACK_HAVE_LAPACK
 USE_LAPACK_EXAMP = $(LAPA_EXAMP)
diff --git a/examples/pluq.C b/examples/pluq.C
new file mode 100644
index 0000000..454c9ec
--- /dev/null
+++ b/examples/pluq.C
@@ -0,0 +1,63 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+
+
+/* Copyright (c) FFLAS-FFPACK
+* ========LICENCE========
+* This file is part of the library FFLAS-FFPACK.
+*
+* FFLAS-FFPACK is free software: you can redistribute it and/or modify
+* it under the terms of the  GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with this library; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+* ========LICENCE========
+*/
+
+#include <iostream>
+#include <givaro/modular.h>
+#include "fflas-ffpack/fflas-ffpack-config.h"
+#include "fflas-ffpack/fflas-ffpack.h"
+#include "fflas-ffpack/utils/Matio.h"
+
+using namespace std;
+
+int main(int argc, char** argv) {
+	
+	if (argc != 3){
+		std::cerr<<"Usage: pluq <p> <matrix>"<<std::endl;
+		return -1;
+	}
+
+	int p = atoi(argv[1]);
+	std::string file = argv[2];
+	size_t m,n;
+	
+		// Creating the finite field Z/pZ
+	Givaro::Modular<double> F(p);
+
+		// Reading the matrix from a file
+	double * A = read_field (F, file.c_str(), &m, &n);
+
+    size_t * P = FFLAS::fflas_new<size_t>(m);
+    size_t * Q = FFLAS::fflas_new<size_t>(n);
+
+    FFPACK::PLUQ (F, FFLAS::FflasNonUnit, m, n, A, n, P, Q);
+
+	write_field(F,std::cout<<"PLUQ = "<<std::endl,A,m,n,n);
+
+    FFLAS::fflas_delete( P);
+    FFLAS::fflas_delete( Q);
+    FFLAS::fflas_delete( A);
+		
+	return 0;
+}
+
diff --git a/fflas-ffpack-config.in b/fflas-ffpack-config.in
index 59e7cd8..cf3ae04 100644
--- a/fflas-ffpack-config.in
+++ b/fflas-ffpack-config.in
@@ -100,11 +100,11 @@ while test $# -gt 0; do
 			;;
 
 		--cflags)
-			echo -I${includedir} @CBLAS_FLAG@ @AVXFLAGS@ @OMPFLAGS@ @GIVARO_CFLAGS@ @PRECOMPILE_FLAGS@ # @PARFLAGS@ # @CUDA_CFLAGS@
+			echo -I${includedir} @CBLAS_FLAG@ @CXXFLAGS@ @GIVARO_CFLAGS@ @PRECOMPILE_FLAGS@  @PARFLAGS@ # @CUDA_CFLAGS@
 			;;
 
 		--cflags-full)
-			 echo -I${includedir} @CBLAS_FLAG@ @AVXFLAGS@ @CXXFLAGS@  @OMPFLAGS@ @GIVARO_CFLAGS@ @PRECOMPILE_FLAGS@ # @PARFLAGS@ # @CUDA_CFLAGS@
+			 echo -I${includedir} @CBLAS_FLAG@ @CXXFLAGS@ @GIVARO_CFLAGS@ @PRECOMPILE_FLAGS@  @PARFLAGS@ # @CUDA_CFLAGS@
 			 ;;
 
 		--blas-cflags)
@@ -112,7 +112,7 @@ while test $# -gt 0; do
 			;;
 
 		--libs)
-			echo @PRECOMPILE_LIBS@ @CBLAS_LIBS@ @GIVARO_LIBS@ # @CUDA_LIBS@
+			echo @PARLIBS@ @PRECOMPILE_LIBS@ @CBLAS_LIBS@ @GIVARO_LIBS@ # @CUDA_LIBS@
 			;;
 
 		--blas-libs)
diff --git a/fflas-ffpack.pc.in b/fflas-ffpack.pc.in
index 1f38302..e2aa221 100644
--- a/fflas-ffpack.pc.in
+++ b/fflas-ffpack.pc.in
@@ -1,14 +1,14 @@
 /------------------ fflas-ffpack.pc ------------------------
 prefix=@prefix@
-exec_prefix=@prefix@/bin
+exec_prefix=@prefix@
 libdir=@prefix@/lib
 includedir=@prefix@/include
 
 Name: fflas-ffpack
 Description: Finite Field Linear Algebra Subroutines/Package
-URL: http://linbox-team.github.io/fflas-ffpack/
+URL: http://github.com/linbox-team/fflas-ffpack
 Version: @VERSION@
 Requires: givaro >= 4.0.1
-Libs: @PRECOMPILE_LIBS@ @CBLAS_LIBS@
-Cflags: @DEFAULT_CFLAGS@ @CBLAS_FLAG@ @CXXFLAGS@ @AVXFLAGS@ @OMPFLAGS@ @PRECOMPILE_FLAGS@
+Libs: @PARLIBS@ @PRECOMPILE_LIBS@ @CBLAS_LIBS@
+Cflags: -I at includedir@ @DEFAULT_CFLAGS@ @CBLAS_FLAG@ @CXXFLAGS@ @PARFLAGS@ @PRECOMPILE_FLAGS@
 \-------------------------------------------------------
\ No newline at end of file
diff --git a/fflas-ffpack/Makefile.am b/fflas-ffpack/Makefile.am
index 4bccdb2..eb6957f 100644
--- a/fflas-ffpack/Makefile.am
+++ b/fflas-ffpack/Makefile.am
@@ -22,7 +22,7 @@
 #/
 
 
-SUBDIRS=fflas ffpack field utils paladin interfaces
+SUBDIRS=fflas ffpack field utils paladin interfaces checkers
 
 EXTRA_DIST=fflas-ffpack.doxy
 
diff --git a/fflas-ffpack/field/Makefile.am b/fflas-ffpack/checkers/Makefile.am
similarity index 67%
copy from fflas-ffpack/field/Makefile.am
copy to fflas-ffpack/checkers/Makefile.am
index 42ed9a5..2c337f6 100644
--- a/fflas-ffpack/field/Makefile.am
+++ b/fflas-ffpack/checkers/Makefile.am
@@ -1,5 +1,5 @@
-# Copyright (c) 2011 FFLAS-FFPACK
-# written by Brice Boyer (briceboyer) <boyer.brice at gmail.com>
+# Copyright (c) 2016 FFLAS-FFPACK
+# written by Ashley Lesdalons (ash09) <ashley.lesdalons at e.ujf-grenoble.fr>
 # adapted from LinBox configuration
 #
 # ========LICENCE========
@@ -22,19 +22,19 @@
 #/
 
 
-pkgincludesubdir=$(pkgincludedir)/field
+pkgincludesubdir=$(pkgincludedir)/checkers
 
-RNS=rns.h			        \
-	rns.inl				\
-	rns-double.h			\
-	rns-double-elt.h		\
-	rns-double.inl			\
-	rns-integer.h			\
-	rns-integer-mod.h     \
-	modular-extended.h
+pkgincludesub_HEADERS=        \
+		checkers_fflas.h \
+		checkers_fflas.inl \
+		checkers_ffpack.h \
+		checkers_ffpack.inl \
+		checker_empty.h \
+		checker_pluq.inl \
+		checker_ftrsm.inl \
+		checker_fgemm.inl \
+		checker_charpoly.inl \
+		checker_invert.inl
 
-pkgincludesub_HEADERS=          	\
-	  field-traits.h                \
-	  $(RNS)
 
-EXTRA_DIST=field.doxy
+EXTRA_DIST=checkers.doxy
diff --git a/fflas-ffpack/checkers/checker_charpoly.inl b/fflas-ffpack/checkers/checker_charpoly.inl
new file mode 100644
index 0000000..6abe9c2
--- /dev/null
+++ b/fflas-ffpack/checkers/checker_charpoly.inl
@@ -0,0 +1,165 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* checkers/Checker_charpoly.inl
+ * Copyright (C) 2016 Ashley Lesdalons
+ *
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ *
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *.
+ */
+
+#ifndef __FFLASFFPACK_checker_charpoly_INL
+#define __FFLASFFPACK_checker_charpoly_INL
+
+#include "fflas-ffpack/ffpack/ffpack.h"
+
+#ifdef TIME_CHECKER_CHARPOLY
+#include <givaro/givtimer.h>
+#endif
+
+namespace FFPACK {
+    template <class Field, class Polynomial> 
+    class CheckerImplem_charpoly {
+
+        const Field& F;
+        const size_t n, lda;
+        typename Field::Element lambda, det;
+        bool pass;
+#ifdef TIME_CHECKER_CHARPOLY
+        Givaro::Timer _time;
+#endif
+
+    public:
+	    CheckerImplem_charpoly(const Field& F_, const size_t n_, typename Field::ConstElement_ptr A, size_t lda_) 
+		: F(F_), n(n_), lda(lda_)
+            {
+                typename Field::RandIter G(F);
+                init(G,A);
+            }
+
+        CheckerImplem_charpoly(typename Field::RandIter &G, const size_t n_, typename Field::ConstElement_ptr A, size_t lda_)
+                : F(G.ring()), n(n_), lda(lda_)
+            {
+                init(G,A);
+            }
+
+        ~CheckerImplem_charpoly() {
+        }
+
+        inline bool check(Polynomial &g) {
+#ifdef TIME_CHECKER_CHARPOLY
+            Givaro::Timer checktime; checktime.start();
+#endif
+            typename Field::Element h = F.zero,
+                t = F.one,
+                u;
+            for (size_t i=0; i < g.size(); ++i) {
+                F.mul(u,g[i],t);
+                F.add(h,h,u);
+                F.mul(t,t,lambda);
+            }
+
+                // is h == det ?
+            pass = pass && F.areEqual(h,det);
+            if (!pass) throw FailureCharpolyCheck();
+
+#ifdef TIME_CHECKER_CHARPOLY
+            checktime.stop(); _time += checktime;
+            std::cerr << "CHARPol CHECK: " << _time << std::endl;
+#endif
+            return pass;
+        }
+
+    private:
+        inline void init(typename Field::RandIter &G, typename Field::ConstElement_ptr A) {
+#ifdef TIME_CHECKER_CHARPOLY
+            Givaro::Timer inittime; inittime.start();
+#endif
+                // random lambda
+            G.random(lambda);
+
+            typename Field::Element_ptr v = FFLAS::fflas_new(F,n,1),
+                w = FFLAS::fflas_new(F,n,1),
+                Ac = FFLAS::fflas_new(F,n,n);
+            FFLAS::frand(F,G,n,v,1);
+
+                // w <- -A.v
+            FFLAS::fgemv(F, FFLAS::FflasNoTrans, n, n, F.mOne, A, lda, v, 1, F.zero, w, 1);
+
+            if (!F.isZero(lambda)) {
+                    // w <- lambda.v + w
+                FFLAS::faxpy(F, n, lambda, v, 1, w, 1);
+            }
+
+                // Ac <- A - lambda.I
+            FFLAS::fassign(F,n,n,A,lda,Ac,n);
+            for (size_t i=0; i<n; ++i)
+		    F.subin(*(Ac+i*n+i),lambda);
+
+                // w <- Ac.v + w
+            FFLAS::fgemv(F, FFLAS::FflasNoTrans, n, n, F.one, Ac, n, v, 1, F.one, w, 1);
+
+                // is w == 0 ?
+            pass = FFLAS::fiszero(F,n,1,w,1);
+            FFLAS::fflas_delete(v,w);
+            if (!pass) throw FailureCharpolyCheck();
+
+                // P,Ac,Q <- PLUQ(Ac)
+            size_t *P = FFLAS::fflas_new<size_t>(n);
+            size_t *Q = FFLAS::fflas_new<size_t>(n);
+
+#ifdef TIME_CHECKER_CHARPOLY
+            Givaro::Timer pluqtime; pluqtime.start();
+#endif
+
+            FFPACK::PLUQ(F, FFLAS::FflasNonUnit, n, n, Ac, n, P, Q);
+
+#ifdef TIME_CHECKER_CHARPOLY
+            pluqtime.stop(); _time -= pluqtime;
+            inittime.stop(); _time += inittime;
+            std::cerr << "CHARPol server PLUQ:" << pluqtime << std::endl;
+            inittime.start();
+#endif
+
+                // compute the determinant of A
+            F.init(det,*Ac);
+            for (size_t i=1; i<n; ++i)
+                F.mul(det,det,*(Ac+i*n+i));
+            if (n%2 == 1) F.neg(det,det);
+
+                // count the number of permutations
+            int t = 0;
+            for (size_t i=0; i<n; ++i) {
+                if (P[i] != i) t++;
+                if (Q[i] != i) t++;
+            }
+            if (t%2 == 1) F.neg(det,det);
+
+            FFLAS::fflas_delete(Ac);
+#ifdef TIME_CHECKER_CHARPOLY
+            inittime.stop(); _time += inittime;
+#endif
+        }
+    };
+    
+}
+
+#endif // __FFLASFFPACK_checker_charpoly_INL
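
As far as can be read from the code above, the characteristic-polynomial checker is an evaluation test at a random point: the constructor draws a random lambda, verifies that A - lambda.I was formed consistently (the "is w == 0" test), and extracts det(lambda.I - A) from a PLUQ factorization of A - lambda.I (product of the diagonal of U, corrected by the permutation sign and by the (-1)^n factor). check(g) then accepts the candidate polynomial g exactly when

    g(lambda) = sum_i g[i].lambda^i  ==  det(lambda.I - A)

A wrong polynomial passes only when lambda happens to be a root of the nonzero difference of the two sides, which has at most max(n, deg g) roots; with lambda drawn uniformly from the field, the probability of being fooled is therefore at most max(n, deg g)/|F| (a standard bound, not stated in the patch itself).
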
diff --git a/fflas-ffpack/fflas/fflas_simd/simd128.inl b/fflas-ffpack/checkers/checker_empty.h
similarity index 56%
copy from fflas-ffpack/fflas/fflas_simd/simd128.inl
copy to fflas-ffpack/checkers/checker_empty.h
index 81bffef..22ac961 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd128.inl
+++ b/fflas-ffpack/checkers/checker_empty.h
@@ -1,10 +1,9 @@
 /* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
 // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
-/*
- * Copyright (C) 2014 the FFLAS-FFPACK group
+/* checkers/checker_empty.h
+ * Copyright (C) 2016 JG Dumas
  *
- * Written by   Bastien Vialla<bastien.vialla at lirmm.fr>
- * Brice Boyer (briceboyer) <boyer.brice at gmail.com>
+ * Written by Jean-Guillaume Dumas <Jean-Guillaume.Dumas at imag.fr>
  *
  *
  * ========LICENCE========
@@ -27,25 +26,17 @@
  *.
  */
 
-#ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_INL
-#define __FFLASFFPACK_fflas_ffpack_utils_simd128_INL
+#ifndef __FFLASFFPACK_checkers_empty_H
+#define __FFLASFFPACK_checkers_empty_H
 
-template <bool ArithType, bool Int, bool Signed, int Size> struct Simd128_impl;
+#include "fflas-ffpack/fflas-ffpack-config.h"
 
-#include "simd128_float.inl"
-#include "simd128_double.inl"
+namespace FFLAS {
+    template <class Field>
+    struct Checker_Empty {
+        template<typename... Params> Checker_Empty(Params... parameters) {}
+        template<typename... Params> bool check(Params... parameters) { return true; }
+    };
+}
 
-#ifdef SIMD_INT
-// Trop d'instructions SSE manquantes pour les int8_t
-
-#include "simd128_int16.inl"
-#include "simd128_int32.inl"
-#include "simd128_int64.inl"
-
-#endif //#ifdef SIMD_INT
-
-template <class T>
-using Simd128 =
-    Simd128_impl<std::is_arithmetic<T>::value, std::is_integral<T>::value, std::is_signed<T>::value, sizeof(T)>;
-
-#endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_INL
+#endif
diff --git a/fflas-ffpack/checkers/checker_fgemm.inl b/fflas-ffpack/checkers/checker_fgemm.inl
new file mode 100644
index 0000000..1739ba6
--- /dev/null
+++ b/fflas-ffpack/checkers/checker_fgemm.inl
@@ -0,0 +1,100 @@
+/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* checkers/checker_fgemm.inl
+ * Copyright (C) 2016 Ashley Lesdalons
+ *
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ *
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *.
+ */
+
+#ifndef __FFLASFFPACK_checker_fgemm_INL
+#define __FFLASFFPACK_checker_fgemm_INL
+
+namespace FFLAS {
+    
+    template <class Field> 
+    class CheckerImplem_fgemm {
+
+        const Field& F;
+        const size_t m,n,k,ldc;
+        typename Field::Element_ptr v,w1;
+
+    public:
+        CheckerImplem_fgemm(const Field &F_,
+                      const size_t m_, const size_t n_, const size_t k_,
+                      const typename Field::Element beta,
+                      typename Field::Element_ptr C, const size_t ldc_)
+                : F(F_), m(m_), n(n_), k(k_), ldc(ldc_), v(FFLAS::fflas_new(F_,n,1)),w1(FFLAS::fflas_new(F_,m,1))
+            {			
+                typename Field::RandIter G(F);
+                init(G,beta,C);
+            }
+
+        CheckerImplem_fgemm(typename Field::RandIter &G,
+                      const size_t m_, const size_t n_, const size_t k_,
+                      const typename Field::Element beta,
+                      typename Field::Element_ptr C, const size_t ldc_)
+                : F(G.ring()), m(m_), n(n_), k(k_), ldc(ldc_), v(FFLAS::fflas_new(F,n,1)),w1(FFLAS::fflas_new(F,m,1))
+            {
+                init(G,beta,C);
+            }
+
+        ~CheckerImplem_fgemm() {
+            FFLAS::fflas_delete(v,w1);
+        }
+
+        inline bool check(const FFLAS::FFLAS_TRANSPOSE ta,
+                          const FFLAS::FFLAS_TRANSPOSE tb,
+                          const typename Field::Element alpha,
+                          typename Field::ConstElement_ptr A, const size_t lda,
+                          typename Field::ConstElement_ptr B, const size_t ldb,
+                          typename Field::ConstElement_ptr C)
+            {	
+                    // w1 <- C.v - w1
+                FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, n, F.one, C, ldc, v, 1, F.mOne, w1, 1);
+
+                    // w2 <- B.v
+                typename Field::Element_ptr w2 = FFLAS::fflas_new(F,k,1);
+                FFLAS::fgemv(F, tb, k, n, F.one, B, ldb, v, 1, F.zero, w2, 1);
+
+                    // w1 <- alpha.A.w2 - w1
+                FFLAS::fgemv(F, ta, m, k, alpha, A, lda, w2, 1, F.mOne, w1, 1);
+
+                FFLAS::fflas_delete(w2);
+
+                    // is w1 == 0 ?
+                bool pass = FFLAS::fiszero(F, m, w1, 1);
+                if (!pass) throw FailureFgemmCheck();
+                return pass;
+            }
+
+    private:
+        inline void init(typename Field::RandIter &G, const typename Field::Element beta, typename Field::Element_ptr C) {
+            FFLAS::frand(F,G,n,v,1);
+
+                // w1 <- beta.C.v
+            FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, n, beta, C, ldc, v, 1, F.zero, w1, 1);
+        }
+
+    };
+}
+#endif // __FFLASFFPACK_checker_fgemm_INL
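
This is a Freivalds-style verification of the product: the constructor draws a random vector v and stores w1 = beta.C.v before C is overwritten, and check() then accepts exactly when

    alpha.op(A).(op(B).v)  ==  C_out.v - beta.C_in.v

which costs a few matrix-vector products instead of a second multiplication. A usage sketch, mirroring the instrumentation this patch adds inside fgemm itself further down; ForceCheck_fgemm, the always-on alias, is introduced below in checkers_fflas.inl, and the modulus and dimensions here are arbitrary:

    #include <givaro/modular.h>
    #include "fflas-ffpack/fflas-ffpack.h"

    int main() {
        typedef Givaro::Modular<double> Field;
        Field F(101);
        const size_t m = 4, k = 3, n = 5;
        Field::RandIter G(F);

        Field::Element_ptr A = FFLAS::fflas_new(F, m, k);
        Field::Element_ptr B = FFLAS::fflas_new(F, k, n);
        Field::Element_ptr C = FFLAS::fflas_new(F, m, n);
        FFLAS::frand(F, G, m*k, A, 1);       // fill with random entries
        FFLAS::frand(F, G, k*n, B, 1);
        FFLAS::frand(F, G, m*n, C, 1);

            // the checker must see C (and beta) before fgemm overwrites it
        FFLAS::ForceCheck_fgemm<Field> checker(F, m, n, k, F.one, C, n);
        FFLAS::fgemm(F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
                     m, n, k, F.one, A, k, B, n, F.one, C, n);
        checker.check(FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
                      F.one, A, k, B, n, C);   // throws FailureFgemmCheck on error

        FFLAS::fflas_delete(A);
        FFLAS::fflas_delete(B);
        FFLAS::fflas_delete(C);
        return 0;
    }
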
diff --git a/fflas-ffpack/checkers/checker_ftrsm.inl b/fflas-ffpack/checkers/checker_ftrsm.inl
new file mode 100644
index 0000000..5d3bb1f
--- /dev/null
+++ b/fflas-ffpack/checkers/checker_ftrsm.inl
@@ -0,0 +1,119 @@
+/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* checkers/Checker_ftrsm.inl
+ * Copyright (C) 2016 Ashley Lesdalons
+ *
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ *
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *.
+ */
+
+#ifndef __FFLASFFPACK_checker_ftrsm_INL
+#define __FFLASFFPACK_checker_ftrsm_INL
+
+namespace FFLAS {
+    
+    template <class Field> 
+    class CheckerImplem_ftrsm {
+
+        const Field& F;	
+        typename Field::Element_ptr v,w;
+
+    public:
+        CheckerImplem_ftrsm(const Field& F_, 
+                      const size_t m, const size_t n,
+                      const typename Field::Element alpha,
+                      const typename Field::ConstElement_ptr B, 
+                      const size_t ldb) 
+                : F(F_), 
+                  v(FFLAS::fflas_new(F_,n,1)), 
+                  w(FFLAS::fflas_new(F_,m,1))
+            {
+                typename Field::RandIter G(F);
+                init(G,m,n,B,ldb,alpha);
+            }
+
+        CheckerImplem_ftrsm(typename Field::RandIter &G, 
+                      const size_t m, const size_t n,
+                      const typename Field::Element alpha,
+                      const typename Field::ConstElement_ptr B, 
+                      const size_t ldb)
+                : F(G.ring()), 
+                  v(FFLAS::fflas_new(F,n,1)), 
+                  w(FFLAS::fflas_new(F,m,1))
+            {
+                init(G,m,n,B,ldb,alpha);
+            }
+
+        ~CheckerImplem_ftrsm() {
+            FFLAS::fflas_delete(v,w);
+        }
+
+        inline bool check(const FFLAS::FFLAS_SIDE side,
+                          const FFLAS::FFLAS_UPLO uplo,
+                          const FFLAS::FFLAS_TRANSPOSE trans,
+                          const FFLAS::FFLAS_DIAG diag,
+                          const size_t m, const size_t n,
+#ifdef __FFLAS__TRSM_READONLY
+                          typename Field::ConstElement_ptr
+#else
+                          typename Field::Element_ptr
+#endif
+                          A, size_t lda,
+                          const typename Field::ConstElement_ptr X, size_t ldx) {
+            size_t k = (side==FFLAS::FflasLeft?m:n);
+
+            typename Field::Element_ptr v1 = FFLAS::fflas_new(F,k,1);
+        
+            if (side==FFLAS::FflasLeft) {
+                    // (Left) v1 <- X.v 
+                    // (Left) v1 <- A.v1
+                    // (Left) w <- w - v1
+                FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, n, F.one, X, ldx, v, 1, F.zero, v1, 1);
+                FFLAS::ftrmm(F, FFLAS::FflasLeft, uplo, trans, diag, k, 1, F.one, A, lda, v1, 1);
+                FFLAS::fsubin(F, m, v1, 1, w, 1);
+            } else {
+                    // (Right) v <- A.v
+                    // (Right) w <- X.v - w
+                FFLAS::ftrmm(F, FFLAS::FflasLeft, uplo, trans, diag, k, 1, F.one, A, lda, v, 1);
+                FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, n, F.one, X, ldx, v, 1, F.mOne, w, 1); 
+            }
+        
+
+            FFLAS::fflas_delete(v1);
+        
+            bool pass = FFLAS::fiszero(F,m,1,w,1);
+            if (!pass) throw FailureTrsmCheck();
+            return pass;
+        }
+
+    private:	
+        inline void init(typename Field::RandIter &G, const size_t m, const size_t n, const typename Field::ConstElement_ptr B, size_t ldb, const typename Field::Element alpha) {
+            FFLAS::frand(F,G,n,v,1); 
+
+                // w <- alpha.B.v
+            FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, n, alpha, B, ldb, v, 1, F.zero, w, 1);
+        }
+    };
+    
+}
+
+#endif // __FFLASFFPACK_checker_ftrsm_INL
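
The ftrsm checker follows the same randomized pattern: at construction it draws a random v and stores w = alpha.B.v; once the solve has replaced B by the solution X, check() verifies

    op(A).(X.v)  ==  alpha.B.v      (side == FflasLeft,  where op(A).X = alpha.B)
    X.(op(A).v)  ==  alpha.B.v      (side == FflasRight, where X.op(A) = alpha.B)

using one ftrmm and one fgemv, so a wrong solution is rejected at roughly quadratic cost. The call-site aliases Checker_ftrsm and ForceCheck_ftrsm are introduced below in checkers_fflas.h and checkers_fflas.inl.
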
diff --git a/fflas-ffpack/checkers/checker_invert.inl b/fflas-ffpack/checkers/checker_invert.inl
new file mode 100644
index 0000000..a608e01
--- /dev/null
+++ b/fflas-ffpack/checkers/checker_invert.inl
@@ -0,0 +1,83 @@
+/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* checkers/Checker_invert.inl
+ * Copyright (C) 2016 Ashley Lesdalons
+ *
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ *
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *.
+ */
+
+#ifndef __FFLASFFPACK_checker_invert_INL
+#define __FFLASFFPACK_checker_invert_INL
+
+namespace FFPACK {
+
+    template <class Field> 
+    class CheckerImplem_invert {
+
+        const Field& F;
+        typename Field::Element_ptr v,w;
+        const size_t m,lda;
+
+    public:
+        CheckerImplem_invert(const Field& F_, const size_t m_, typename Field::ConstElement_ptr A, const size_t lda_) 
+                : F(F_), v(FFLAS::fflas_new(F_,m_,1)), w(FFLAS::fflas_new(F_,m_,1)), m(m_), lda(lda_)
+            {
+                typename Field::RandIter G(F);
+                init(G,m,A,lda);
+            }
+
+        CheckerImplem_invert(typename Field::RandIter &G, const size_t m_, typename Field::ConstElement_ptr A, const size_t lda_) 
+                : F(G.ring()), v(FFLAS::fflas_new(F,m_,1)), w(FFLAS::fflas_new(F,m_,1)), m(m_), lda(lda_)
+            {
+                init(G,m,A,lda);
+            }
+
+        ~CheckerImplem_invert() {
+            FFLAS::fflas_delete(v,w);
+        }
+
+        inline bool check(typename Field::ConstElement_ptr A, int nullity) {
+                // v <- A.w - v
+            FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, m, F.one, A, lda, w, 1, F.mOne, v, 1);
+
+            bool pass = FFLAS::fiszero(F,m,1,v,1) || (nullity != 0);
+            if (!pass) throw FailureInvertCheck();
+            return pass;
+        }
+
+    private:
+        void init(typename Field::RandIter &G, const size_t m_, typename Field::ConstElement_ptr A, const size_t lda_) {
+            FFLAS::frand(F,G,m,v,1);
+
+// write_field(F,std::cerr<<"init A : ",A,m,m,lda,true)<<std::endl;
+// write_field(F,std::cerr<<"init v : ",v,m,1,1,true)<<std::endl;  
+    	
+                // w <- A.v
+            FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, m, F.one, A, lda, v, 1, F.zero, w, 1);
+// write_field(F,std::cerr<<"init w : ",w,m,1,1,true)<<std::endl;
+        }
+  
+    };
+}
+
+#endif // __FFLASFFPACK_checker_invert_INL
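
The inversion checker is the same idea specialised to X = A^(-1): the constructor stores a random v together with w = A.v; check(Ainv, nullity) then requires

    Ainv.w == v      whenever nullity == 0

and accepts unconditionally when the routine reported a singular matrix (nullity != 0), throwing FailureInvertCheck otherwise. The Checker_invert / ForceCheck_invert aliases appear below in checkers_ffpack.h and checkers_ffpack.inl.
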
diff --git a/fflas-ffpack/checkers/checker_pluq.inl b/fflas-ffpack/checkers/checker_pluq.inl
new file mode 100644
index 0000000..145f0ed
--- /dev/null
+++ b/fflas-ffpack/checkers/checker_pluq.inl
@@ -0,0 +1,147 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* checkers/checker_pluq.inl
+ * Copyright (C) 2016 Jean-Guillaume Dumas
+ *
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ *            Jean-Guillaume Dumas <Jean-Guillaume.Dumas at imag.fr>
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *.
+ */
+
+#ifndef __FFLASFFPACK_checker_pluq_INL
+#define __FFLASFFPACK_checker_pluq_INL
+
+#include "fflas-ffpack/ffpack/ffpack.h"
+
+#ifdef TIME_CHECKER_PLUQ
+#include <givaro/givtimer.h>
+#endif
+
+namespace FFPACK {
+    template <class Field> 
+    class CheckerImplem_PLUQ {
+
+        const Field& F;
+        typename Field::Element_ptr v,w;
+        const size_t m,n;
+#ifdef TIME_CHECKER_PLUQ
+        Givaro::Timer _time;
+#endif
+
+    public:
+        CheckerImplem_PLUQ(const Field& F_, size_t m_, size_t n_, 
+                     typename Field::ConstElement_ptr A, size_t lda) 
+				: F(F_), 
+                  v(FFLAS::fflas_new(F_,n_,1)), 
+                  w(FFLAS::fflas_new(F_,m_,1)), 
+                  m(m_), n(n_)
+            {
+                typename Field::RandIter G(F);
+                init(G,A,lda);
+            }
+
+        CheckerImplem_PLUQ(typename Field::RandIter &G, size_t m_, size_t n_, 
+                     typename Field::ConstElement_ptr A, size_t lda)
+				: F(G.ring()), 
+                  v(FFLAS::fflas_new(F,n_,1)), 
+                  w(FFLAS::fflas_new(F,m_,1)), 
+                  m(m_), n(n_)
+            {
+                init(G,A,lda);
+            }
+
+        ~CheckerImplem_PLUQ() {
+            FFLAS::fflas_delete(v,w);
+        }
+
+            /** check if the PLUQ factorization is correct.
+             *  Returns true if w - P(L(U(Q.v))) == 0
+             * @param A
+             * @param r
+             * @param P
+             * @param Q
+             */
+        inline bool check(typename Field::ConstElement_ptr A, size_t lda, 
+                          size_t r, size_t *P, size_t *Q) {
+#ifdef TIME_CHECKER_PLUQ
+            Givaro::Timer checktime; checktime.start();
+#endif
+				// _w = [w1|w2]
+            typename Field::Element_ptr _w = FFLAS::fflas_new(F,m,1); 
+			
+                // v <-- Q.v
+            FFPACK::applyP(F, FFLAS::FflasLeft, FFLAS::FflasNoTrans, 1, 0, r, v, 1, Q);
+
+                // w1 <- V1 && w2 <- 0
+            FFLAS::fassign(F, r, 1, v, 1, _w, 1);
+            FFLAS::fzero(F, m-r, _w+r, 1);
+
+                // w1 <- U1.w1
+                // WARNING: should be ftrmv
+            FFLAS::ftrmm(F, FFLAS::FflasLeft, FFLAS::FflasUpper, FFLAS::FflasNoTrans, FFLAS::FflasNonUnit, r, 1, F.one, A, lda, _w, 1);
+		
+                // w1 <- U2.V2 + w1
+            if (r < n)
+                FFLAS::fgemm(F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, r, 1, n-r, F.one, A+r, lda, v+r, 1, F.one, _w, 1);
+
+                // w2 <- L2.w1
+            if (r < m)
+                FFLAS::fgemm(F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, m-r, 1, r, F.one, A+r*n, lda, _w, 1, F.zero, _w+r, 1); 		
+
+                // w1 <- L1.w1
+                // WARNING: should be ftrmv
+            FFLAS::ftrmm(F, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, FFLAS::FflasUnit, r, 1, F.one, A, lda, _w, 1);
+
+                // _w <- P._w
+            FFPACK::applyP(F, FFLAS::FflasRight, FFLAS::FflasNoTrans, 1, 0, r, _w, 1, P);
+
+				// is _w == w ?
+            FFLAS::fsubin(F, m, w, 1, _w, 1);
+            bool pass = FFLAS::fiszero(F,m,_w,1);
+        
+            FFLAS::fflas_delete(_w);
+
+            if (!pass) throw FailurePLUQCheck();
+
+#ifdef TIME_CHECKER_PLUQ
+            checktime.stop(); _time += checktime;
+            std::cerr << "PLUQ CHECK: " << _time << std::endl;
+#endif
+            return pass;
+        }
+
+    private:	
+        inline void init(typename Field::RandIter &G, 
+                         typename Field::ConstElement_ptr A, size_t lda) {
+#ifdef TIME_CHECKER_PLUQ
+            Givaro::Timer inittime; inittime.start();
+#endif
+            FFLAS::frand(F,G,n,v,1);
+    	
+                // w <-- A.v
+            FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, n, F.one, A, lda, v, 1, F.zero, w, 1);
+#ifdef TIME_CHECKER_PLUQ
+            inittime.stop(); _time += inittime;
+#endif
+        }
+    };
+}
+#endif // __FFLASFFPACK_checker_pluq_INL
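
A sketch of driving this checker directly, under the same assumptions as the in-memory PLUQ example given earlier (F, A with leading dimension n, and P, Q already allocated); ForceCheck_PLUQ, the always-on alias, is defined below in checkers_ffpack.inl:

        // the checker must be built from the original A, before PLUQ overwrites it
    FFPACK::ForceCheck_PLUQ<Givaro::Modular<double> > checker(F, m, n, A, n);
    size_t r = FFPACK::PLUQ(F, FFLAS::FflasNonUnit, m, n, A, n, P, Q);
        // throws FailurePLUQCheck if P, L, U, Q are inconsistent with the original A
    checker.check(A, n, r, P, Q);
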
diff --git a/fflas-ffpack/Makefile.am b/fflas-ffpack/checkers/checkers.doxy
similarity index 73%
copy from fflas-ffpack/Makefile.am
copy to fflas-ffpack/checkers/checkers.doxy
index 4bccdb2..c16bb88 100644
--- a/fflas-ffpack/Makefile.am
+++ b/fflas-ffpack/checkers/checkers.doxy
@@ -1,5 +1,5 @@
-# Copyright (c) 2011 FFLAS-FFPACK
-# written by Brice Boyer (briceboyer) <boyer.brice at gmail.com>
+# Copyright (c) 2016 FFLAS-FFPACK
+# written by Ashley Lesdalons (ash09) <ashley.lesdalons at e.ujf-grenoble.fr>
 # adapted from LinBox configuration
 #
 # ========LICENCE========
@@ -22,12 +22,11 @@
 #/
 
 
-SUBDIRS=fflas ffpack field utils paladin interfaces
+/** \ingroup fflas-ffpack
+ *   \defgroup checker CHECKER
+ *
+ *	\brief The CHECKER classes provide functions to verify computations in FFLAS and FFPACK.
+ *
+ */
 
-EXTRA_DIST=fflas-ffpack.doxy
-
-pkginclude_HEADERS = config-blas.h   \
-		     fflas-ffpack.h  \
-		     config.h \
-		     fflas-ffpack-config.h \
-		     fflas-ffpack-optimise.h
+// vim:syn=doxygen
diff --git a/fflas-ffpack/checkers/checkers_fflas.h b/fflas-ffpack/checkers/checkers_fflas.h
new file mode 100644
index 0000000..d3ce1dd
--- /dev/null
+++ b/fflas-ffpack/checkers/checkers_fflas.h
@@ -0,0 +1,79 @@
+/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* checkers/checkers.h
+ * Copyright (C) 2016 Ashley Lesdalons, JG Dumas
+ *
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ * Written by Jean-Guillaume Dumas <Jean-Guillaume.Dumas at imag.fr>
+ *
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *.
+ */
+
+#ifndef __FFLASFFPACK_checkers_fflas_H
+#define __FFLASFFPACK_checkers_fflas_H
+
+#include "fflas-ffpack/fflas-ffpack-config.h"
+#include "checker_empty.h"
+
+#ifdef DEBUG
+ 	#define CHECKING_MODE 1
+ 	#define ENABLE_ALL_CHECKINGS 1
+#endif
+
+#ifdef ENABLE_ALL_CHECKINGS
+	#define ENABLE_CHECKER_fgemm 1
+ 	#define ENABLE_CHECKER_ftrsm 1
+#endif
+
+#ifdef TIME_CHECKERS
+#include <givaro/givtimer.h>
+#define TIME_CHECKER_FGEMM
+#define TIME_CHECKER_FTRSM
+#endif
+
+// definition of the exceptions
+class FailureFgemmCheck {};
+class FailureTrsmCheck {};
+
+namespace FFLAS {
+	template <class Field> class CheckerImplem_fgemm;
+	template <class Field> class CheckerImplem_ftrsm;
+}
+
+namespace FFLAS {
+#ifdef ENABLE_CHECKER_fgemm
+	template <class Field> using Checker_fgemm = CheckerImplem_fgemm<Field>;
+#else
+	template <class Field> using Checker_fgemm = FFLAS::Checker_Empty<Field>;
+#endif
+
+#ifdef ENABLE_CHECKER_ftrsm
+	template <class Field> using Checker_ftrsm = CheckerImplem_ftrsm<Field>;
+#else
+	template <class Field> using Checker_ftrsm = FFLAS::Checker_Empty<Field>;
+#endif
+}
+
+#include "fflas-ffpack/fflas/fflas.h"
+#include "fflas-ffpack/fflas/fflas_enum.h"
+#include "fflas-ffpack/utils/fflas_memory.h"
+
+#endif
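
Whether the Checker_* names used at the instrumented call sites (see the fgemm change further down in this patch) do any work is decided here, at preprocessing time. A minimal sketch, assuming the macro is made visible before these headers are parsed, for instance on the compiler command line or at the top of the translation unit:

    // enable only the fgemm checker; DEBUG or ENABLE_ALL_CHECKINGS turns them all on
    #define ENABLE_CHECKER_fgemm 1
    #include "fflas-ffpack/fflas-ffpack.h"
    // FFLAS::Checker_fgemm<Field> now aliases CheckerImplem_fgemm<Field>;
    // without the macro it aliases FFLAS::Checker_Empty<Field>, whose variadic
    // constructor ignores its arguments and whose check(...) always returns true.
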
diff --git a/fflas-ffpack/fflas/fflas_simd/simd128.inl b/fflas-ffpack/checkers/checkers_fflas.inl
similarity index 56%
copy from fflas-ffpack/fflas/fflas_simd/simd128.inl
copy to fflas-ffpack/checkers/checkers_fflas.inl
index 81bffef..e34c3de 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd128.inl
+++ b/fflas-ffpack/checkers/checkers_fflas.inl
@@ -1,10 +1,9 @@
 /* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
 // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
-/*
- * Copyright (C) 2014 the FFLAS-FFPACK group
+/* checkers/checkers.inl
+ * Copyright (C) 2016 Ashley Lesdalons
  *
- * Written by   Bastien Vialla<bastien.vialla at lirmm.fr>
- * Brice Boyer (briceboyer) <boyer.brice at gmail.com>
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
  *
  *
  * ========LICENCE========
@@ -27,25 +26,15 @@
  *.
  */
 
-#ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_INL
-#define __FFLASFFPACK_fflas_ffpack_utils_simd128_INL
+#ifndef FFLASFFPACK_checkers_fflas_inl_H
+#define FFLASFFPACK_checkers_fflas_inl_H
 
-template <bool ArithType, bool Int, bool Signed, int Size> struct Simd128_impl;
+#include "checker_fgemm.inl"
+#include "checker_ftrsm.inl"
 
-#include "simd128_float.inl"
-#include "simd128_double.inl"
+namespace FFLAS {
+	template <class Field> using ForceCheck_fgemm = CheckerImplem_fgemm<Field>;
+	template <class Field> using ForceCheck_ftrsm = CheckerImplem_ftrsm<Field>;
+}
 
-#ifdef SIMD_INT
-// Trop d'instructions SSE manquantes pour les int8_t
-
-#include "simd128_int16.inl"
-#include "simd128_int32.inl"
-#include "simd128_int64.inl"
-
-#endif //#ifdef SIMD_INT
-
-template <class T>
-using Simd128 =
-    Simd128_impl<std::is_arithmetic<T>::value, std::is_integral<T>::value, std::is_signed<T>::value, sizeof(T)>;
-
-#endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_INL
+#endif
diff --git a/fflas-ffpack/checkers/checkers_ffpack.h b/fflas-ffpack/checkers/checkers_ffpack.h
new file mode 100644
index 0000000..ab65b4f
--- /dev/null
+++ b/fflas-ffpack/checkers/checkers_ffpack.h
@@ -0,0 +1,89 @@
+/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* checkers/checkers.h
+ * Copyright (C) 2016 Ashley Lesdalons, JG Dumas
+ *
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ * Written by Jean-Guillaume Dumas <Jean-Guillaume.Dumas at imag.fr>
+ *
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *.
+ */
+
+#ifndef __FFLASFFPACK_checkers_ffpack_H
+#define __FFLASFFPACK_checkers_ffpack_H
+
+#include "fflas-ffpack/fflas-ffpack-config.h"
+#include "checker_empty.h"
+
+#ifdef DEBUG
+ 	#define CHECKING_MODE 1
+ 	#define ENABLE_ALL_CHECKINGS 1
+#endif
+
+#ifdef ENABLE_ALL_CHECKINGS
+	#define ENABLE_CHECKER_PLUQ 1
+ 	#define ENABLE_CHECKER_invert 1
+ 	#define ENABLE_CHECKER_charpoly 1
+#endif
+
+#ifdef TIME_CHECKERS
+#include <givaro/givtimer.h>
+#define TIME_CHECKER_PLUQ
+#define TIME_CHECKER_INVERT
+#define TIME_CHECKER_CHARPOLY
+#endif
+
+
+// definition of the exceptions
+class FailurePLUQCheck {};
+class FailureInvertCheck {};
+class FailureCharpolyCheck {};
+
+namespace FFPACK {
+	template <class Field> class CheckerImplem_PLUQ;
+	template <class Field> class CheckerImplem_invert;
+	template <class Field, class Polynomial> class CheckerImplem_charpoly;
+}
+
+
+namespace FFPACK {
+#ifdef ENABLE_CHECKER_PLUQ
+	template <class Field> using Checker_PLUQ = CheckerImplem_PLUQ<Field>;
+#else
+	template <class Field> using Checker_PLUQ = FFLAS::Checker_Empty<Field>;
+#endif
+
+#ifdef ENABLE_CHECKER_invert
+	template <class Field> using Checker_invert = CheckerImplem_invert<Field>;
+#else
+	template <class Field> using Checker_invert = FFLAS::Checker_Empty<Field>;
+#endif
+
+#ifdef ENABLE_CHECKER_charpoly
+	template <class Field, class Polynomial> using Checker_charpoly = CheckerImplem_charpoly<Field,Polynomial>;
+#else
+	template <class Field, class Polynomial> using Checker_charpoly = FFLAS::Checker_Empty<Field>;
+#endif
+}
+
+#include "fflas-ffpack/ffpack/ffpack.h"
+
+#endif
diff --git a/fflas-ffpack/fflas/fflas_simd/simd128.inl b/fflas-ffpack/checkers/checkers_ffpack.inl
similarity index 56%
copy from fflas-ffpack/fflas/fflas_simd/simd128.inl
copy to fflas-ffpack/checkers/checkers_ffpack.inl
index 81bffef..06275e5 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd128.inl
+++ b/fflas-ffpack/checkers/checkers_ffpack.inl
@@ -1,10 +1,9 @@
 /* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
 // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
-/*
- * Copyright (C) 2014 the FFLAS-FFPACK group
+/* checkers/checkers.inl
+ * Copyright (C) 2016 Ashley Lesdalons
  *
- * Written by   Bastien Vialla<bastien.vialla at lirmm.fr>
- * Brice Boyer (briceboyer) <boyer.brice at gmail.com>
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
  *
  *
  * ========LICENCE========
@@ -27,25 +26,17 @@
  *.
  */
 
-#ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_INL
-#define __FFLASFFPACK_fflas_ffpack_utils_simd128_INL
+#ifndef FFLASFFPACK_checkers_ffpack_inl_H
+#define FFLASFFPACK_checkers_ffpack_inl_H
 
-template <bool ArithType, bool Int, bool Signed, int Size> struct Simd128_impl;
+#include "checker_pluq.inl"
+#include "checker_invert.inl"
+#include "checker_charpoly.inl"
 
-#include "simd128_float.inl"
-#include "simd128_double.inl"
+namespace FFPACK {
+	template <class Field> using ForceCheck_PLUQ = CheckerImplem_PLUQ<Field>;
+	template <class Field> using ForceCheck_invert = CheckerImplem_invert<Field>;
+	template <class Field, class Polynomial> using ForceCheck_charpoly = CheckerImplem_charpoly<Field,Polynomial>;
+}
 
-#ifdef SIMD_INT
-// Trop d'instructions SSE manquantes pour les int8_t
-
-#include "simd128_int16.inl"
-#include "simd128_int32.inl"
-#include "simd128_int64.inl"
-
-#endif //#ifdef SIMD_INT
-
-template <class T>
-using Simd128 =
-    Simd128_impl<std::is_arithmetic<T>::value, std::is_integral<T>::value, std::is_signed<T>::value, sizeof(T)>;
-
-#endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_INL
+#endif
diff --git a/fflas-ffpack/fflas-ffpack-config.h b/fflas-ffpack/fflas-ffpack-config.h
old mode 100755
new mode 100644
index 7819a2b..9e99e3a
--- a/fflas-ffpack/fflas-ffpack-config.h
+++ b/fflas-ffpack/fflas-ffpack-config.h
@@ -39,6 +39,7 @@
 #endif
 
 #ifdef __CYGWIN__
+# ifndef _GLIBCXX_USE_C99
 #  define _GLIBCXX_USE_C99 true
 #  ifndef _GLIBCXX_USE_C99_MATH_TR1
 #    include <cstdlib>
@@ -78,6 +79,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #    define _GLIBCXX_USE_C99 true
 #    include <cstdlib>
 #  endif
+# endif
 #endif
 
 #include "fflas-ffpack/config.h"
@@ -89,12 +91,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
 #include "fflas-ffpack/fflas-ffpack-optimise.h"
 
-#if defined(__FFLASFFPACK_USE_SSE) or defined(__FFLASFFPACK_USE_AVX) or defined(__FFLASFFPACK_USE_AVX2)
-#define __FFLASFFPACK_USE_SIMD // see configure...
-#endif
-
-
-
 // winograd algorithm threshold (for double)
 #ifndef __FFLASFFPACK_WINOTHRESHOLD
 #define __FFLASFFPACK_WINOTHRESHOLD 1000
@@ -120,14 +116,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #endif
 #endif
 
-#ifdef __x86_64__
-#if defined(__GNUC__) || defined (__clang__) /* who supports __int128_t ? */
-#define int128_t __int128_t
-#define uint128_t unsigned __int128_t
-#else /* hopefully this exists */
-#define int128_t __int128
-#define uint128_t unsigned __int128
-#endif /* __int128_t */
-#endif /* __x86_64__ */
+#include "givaro/givconfig.h"
+
+#ifdef __GIVARO_HAVE_INT128
+#define __FFLASFFPACK_HAVE_INT128
+#endif
 
 #endif // __FFLASFFPACK_fflas_ffpack_configuration_H
diff --git a/fflas-ffpack/fflas/fflas.h b/fflas-ffpack/fflas/fflas.h
index 44aa372..49c7481 100644
--- a/fflas-ffpack/fflas/fflas.h
+++ b/fflas-ffpack/fflas/fflas.h
@@ -93,6 +93,11 @@
 #endif
 
 //---------------------------------------------------------------------
+// Checkers
+#include "fflas-ffpack/checkers/checkers_fflas.h"
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
 // specialisations and implementation
 //---------------------------------------------------------------------
 
@@ -129,6 +134,8 @@
 #include "fflas_fgemv_mp.inl"
 #include "fflas-ffpack/field/rns.inl" // real implementation of the multiprecision field
 
+
+
 #include "fflas-ffpack/paladin/fflas_pfinit.h"
 
 //---------------------------------------------------------------------
@@ -137,4 +144,9 @@
 
 #include "fflas_sparse.h"
 
+//---------------------------------------------------------------------
+// Checkers
+//---------------------------------------------------------------------
+#include "fflas-ffpack/checkers/checkers_fflas.inl"
+
 #endif // __FFLASFFPACK_fflas_H
diff --git a/fflas-ffpack/fflas/fflas_fadd.h b/fflas-ffpack/fflas/fflas_fadd.h
index f6f55c3..4caa4fd 100644
--- a/fflas-ffpack/fflas/fflas_fadd.h
+++ b/fflas-ffpack/fflas/fflas_fadd.h
@@ -35,7 +35,7 @@ namespace FFLAS {
 	template<class T>
 	struct support_simd_add  : public std::false_type {} ;
 
-// #ifdef __FFLASFFPACK_USE_SIMD
+// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 	template<>
 	struct support_simd_add<float> : public std::true_type {} ;
 	template<>
@@ -48,7 +48,7 @@ namespace FFLAS {
 
  #endif  // SIMD_INT
 
-// #endif // __FFLASFFPACK_USE_SIMD
+// #endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 } // FFLAS
 
diff --git a/fflas-ffpack/fflas/fflas_fadd.inl b/fflas-ffpack/fflas/fflas_fadd.inl
index c3dad8d..5ca8ae6 100644
--- a/fflas-ffpack/fflas/fflas_fadd.inl
+++ b/fflas-ffpack/fflas/fflas_fadd.inl
@@ -34,7 +34,7 @@
 
 namespace FFLAS { namespace vectorised {
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 	template<class SimdT, class Element, bool positive>
 	inline typename std::enable_if<is_simd<SimdT>::value, void>::type
@@ -233,7 +233,7 @@ namespace FFLAS { namespace vectorised {
 	}
 
 
-#endif // __FFLASFFPACK_USE_SIMD
+#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 } // vectorised
 } //  FFLAS
diff --git a/fflas-ffpack/fflas/fflas_fgemm.inl b/fflas-ffpack/fflas/fflas_fgemm.inl
index 83e2d15..b547403 100644
--- a/fflas-ffpack/fflas/fflas_fgemm.inl
+++ b/fflas-ffpack/fflas/fflas_fgemm.inl
@@ -51,16 +51,16 @@ namespace FFLAS { namespace Protected{
 					   typename Field::Element_ptr C, const size_t ldc,
 					   MMHelper<Field, MMHelperAlgo::Winograd, FieldMode> & H)
 		{
-				// CP: lda, ldb, ldc can be zero (if m,n or k is 0) and since  this may have not 
-				// been checked by the caller at this point.
-				// FFLASFFPACK_check(lda);
-				// FFLASFFPACK_check(ldb);
-				// FFLASFFPACK_check(ldc);
+			// CP: lda, ldb, ldc can be zero (if m,n or k is 0) and since  this may have not 
+			// been checked by the caller at this point.
+			// FFLASFFPACK_check(lda);
+			// FFLASFFPACK_check(ldb);
+			// FFLASFFPACK_check(ldc);
 
 			Givaro::ModularBalanced<FloatElement> G((FloatElement) F.characteristic());
 			FloatElement tmp,alphaf, betaf;
-				// This conversion is quite tricky, but convert and init are required
-				// in sequence e.g. for when F is a ModularBalanced field and alpha == -1
+			// This conversion is quite tricky, but convert and init are required
+			// in sequence e.g. for when F is a ModularBalanced field and alpha == -1
 			F.convert (tmp, beta);
 			G.init(betaf, tmp);
 			F.convert (tmp, alpha);
@@ -112,7 +112,7 @@ namespace FFLAS{ namespace Protected{
 			Outmax = Op1max + Op2max;
 			if (WH.MaxStorableValue - Op1max < Op2max ||
 				WH.MaxStorableValue + Op1min < -Op2min){
-					// Reducing both Op1 and Op2
+				// Reducing both Op1 and Op2
 				Op1min = Op2min = WH.FieldMin;
 				Op1max = Op2max = WH.FieldMax;
 				Outmin = 2*WH.FieldMin;
@@ -142,7 +142,7 @@ namespace FFLAS{ namespace Protected{
 			Outmax = Op1max - Op2min;
 			if (WH.MaxStorableValue - Op1max < -Op2min || 
 				WH.MaxStorableValue - Op2max < -Op1min){
-					// Reducing both Op1 and Op2
+				// Reducing both Op1 and Op2
 				Op1min = Op2min = WH.FieldMin;
 				Op1max = Op2max = WH.FieldMax;
 				Outmin = WH.FieldMin-WH.FieldMax;
@@ -157,20 +157,20 @@ namespace FFLAS{ namespace Protected{
 										 Element& Op2min, Element& Op2max,
 										 MMHelper<Field, AlgoT, ModeT, ParSeqTrait >& WH)
 		{
-				// Necessary? -> CP: Yes, for generic Mode of op
+			// Necessary? -> CP: Yes, for generic Mode of op
 			Outmin = WH.FieldMin;
 			Outmax = WH.FieldMax;
 			return false;
 		}
 
-//Probable bug here due to overflow of int64_t
+		//Probable bug here due to overflow of int64_t
 		template<class Field, class Element, class AlgoT, class ParSeqTrait>
 		inline bool NeedDoublePreAddReduction (Element& Outmin, Element& Outmax,
 											   Element& Op1min, Element& Op1max,
 											   Element& Op2min, Element& Op2max, Element beta,
 											   MMHelper<Field, AlgoT, ModeCategories::LazyTag, ParSeqTrait >& WH)
 		{
-				// Testing if P5 need to be reduced
+			// Testing if P5 need to be reduced
 			Outmin =  std::min(beta*Op2min,beta*Op2max);
 			Outmax =  std::max(beta*Op2min,beta*Op2max);
 			if (Op1max > WH.MaxStorableValue-Outmax || 
@@ -260,16 +260,16 @@ namespace FFLAS {
 			return Protected::fgemm_convert<float,Field>(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H);
 		else if (16*F.cardinality() < Givaro::ModularBalanced<double>::maxCardinality())
 			return Protected::fgemm_convert<double,Field>(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H);
-			// else if (Protected::AreEqual<typename Field::Element,int64_t>::value) {
-			// 	    // Stays over int64_t
-			// 	MMHelper<Field, MMHelperAlgo::Winograd, ModeCategories::DelayedTag, ParSeqHelper::Sequential> HG(H);
-			// 	H.Outmin=HG.Outmin;
-			// 	H.Outmax=HG.Outmax;
-			// 	return fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,HG);
+		// else if (Protected::AreEqual<typename Field::Element,int64_t>::value) {
+		// 	    // Stays over int64_t
+		// 	MMHelper<Field, MMHelperAlgo::Winograd, ModeCategories::DelayedTag, ParSeqHelper::Sequential> HG(H);
+		// 	H.Outmin=HG.Outmin;
+		// 	H.Outmax=HG.Outmax;
+		// 	return fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,HG);
 			
-		    //	}
+		//	}
 		else {
-			    // Fall back case: used 
+			// Fall back case: used 
 			FFPACK::failure()(__func__,__LINE__,"Invalid ConvertTo Mode for this field");	
 		}
 		return C;
@@ -339,7 +339,10 @@ namespace FFLAS {
 			fscalin(F, m, n, beta, C, ldc);
 		 	return C;
 		}
-		return fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,FFLAS::ParSeqHelper::Sequential());
+		Checker_fgemm<Field> checker(F,m,n,k,beta,C,ldc);
+		fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,FFLAS::ParSeqHelper::Sequential());
+		checker.check(ta,tb,alpha,A,lda,B,ldb,C);
+		return C;
 	}
 
 	template<typename Field, class ModeT, class ParSeq>
@@ -383,10 +386,10 @@ namespace FFLAS {
 			return C;
 		}
 #ifndef NDEBUG
-			/*  check if alpha is invertible.
-			 *  XXX do it in F.isInvertible(Element&) ?
-			 *  XXX do it in return status of F.inv(Element&,Element&)
-			 */
+		/*  check if alpha is invertible.
+		 *  XXX do it in F.isInvertible(Element&) ?
+		 *  XXX do it in return status of F.inv(Element&,Element&)
+		 */
 		typename Field::Element e ;
 		F.assign(e,beta);
 		F.divin(e,alpha);
@@ -395,14 +398,14 @@ namespace FFLAS {
 #endif
 
 #if 0
-			// detect fgemv
+		// detect fgemv
 		if (n == 1 and ...) {}
-			// detect fger
+		// detect fger
 		if (k==1 and ...) {}
 #endif
 		if (Protected::AreEqual<Field, Givaro::Modular<double> >::value ||
 		    Protected::AreEqual<Field, Givaro::ModularBalanced<double> >::value){
-			    //Givaro::Modular<double> need to switch to float if p too small
+			//Givaro::Modular<double> need to switch to float if p too small
 			if (F.characteristic() < DOUBLE_TO_FLOAT_CROSSOVER)
 				return Protected::fgemm_convert<float,Field>(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H);
 		}
@@ -420,12 +423,12 @@ namespace FFLAS {
 			F.assign (beta_,beta);
 		}
 		MMHelper<Field, MMHelperAlgo::Winograd, ModeCategories::LazyTag>  HD(H);
-			// std::cerr<<"\n Delayed -> Lazy alpha_ = "<<alpha_<<std::endl;
-			// std::cerr<<" A = "<<*A<<"\n B = "<<*B<<"\n C = "<<*C<<"\n alpha, beta ="<<alpha<<" "<<beta<<std::endl;
+		// std::cerr<<"\n Delayed -> Lazy alpha_ = "<<alpha_<<std::endl;
+		// std::cerr<<" A = "<<*A<<"\n B = "<<*B<<"\n C = "<<*C<<"\n alpha, beta ="<<alpha<<" "<<beta<<std::endl;
 		fgemm (F, ta, tb, m, n, k, alpha_, A, lda, B, ldb, beta_, C, ldc, HD);
-			// std::cerr<<"Sortie de fgemm Lazy C = "<<*C<<std::endl;
+		// std::cerr<<"Sortie de fgemm Lazy C = "<<*C<<std::endl;
 		Protected::ScalAndReduce (F, m, n, alpha, C, ldc, HD);
-			// std::cerr<<"Sortie de ScalAndReduce C = "<<*C<<std::endl;
+		// std::cerr<<"Sortie de ScalAndReduce C = "<<*C<<std::endl;
 
 		H.initOut();
 
@@ -457,21 +460,21 @@ namespace FFLAS {
 		else
 			F.convert (betad, beta);
 
-			//! @bug why double ?
-			// Double  matrices initialisation
+		//! @bug why double ?
+		// Double  matrices initialisation
 		Givaro::DoubleDomain::Element_ptr Ad = fflas_new (Givaro::DoubleDomain(),n,n);
 		Givaro::DoubleDomain::Element_ptr Cd = fflas_new (Givaro::DoubleDomain(),n,n);
-			// Conversion finite Field = >  double
+		// Conversion finite Field = >  double
 		fconvert (F, n, n, Ad, n, A, lda);
 		if (!F.isZero(beta)) fconvert(F, n, n, Cd, n, C, ldc);
 
-			// Call to the blas Multiplication
+		// Call to the blas Multiplication
 		FFLASFFPACK_check(n);
 		cblas_dgemm (CblasRowMajor, (CBLAS_TRANSPOSE)ta,
 					 (CBLAS_TRANSPOSE)ta, (int)n, (int)n, (int)n,
 					 (Givaro::DoubleDomain::Element) alphad, Ad, (int)n, Ad, (int)n,
 					 (Givaro::DoubleDomain::Element) betad, Cd, (int)n);
-			// Conversion double = >  Finite Field
+		// Conversion double = >  Finite Field
 		fflas_delete (Ad);
 		finit (F,n,n, Cd, n, C, ldc);
 		fflas_delete (Cd);
@@ -480,7 +483,7 @@ namespace FFLAS {
 
 	namespace Protected {
 
-			// F is Modular(Balanced)<float/double>
+		// F is Modular(Balanced)<float/double>
 		template < class Field >
 		inline typename Field::Element_ptr
 		fsquareCommon (const Field& F,
diff --git a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical.inl b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical.inl
index 339861d..c3b83ba 100644
--- a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical.inl
+++ b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical.inl
@@ -39,7 +39,7 @@
 #include <cmath>
 
 #include "fflas-ffpack/field/field-traits.h"
-#if defined(__AVX2__) or defined(__AVX__) or defined(__SSE4_1__)
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 #include "fflas-ffpack/fflas/fflas_igemm/igemm.h"
 #endif
 #include "fflas-ffpack/utils/Matio.h"
@@ -293,7 +293,7 @@ namespace FFLAS {
 		FFLASFFPACK_check(ldb);
 		FFLASFFPACK_check(ldc);
 		
-#if defined(__AVX2__) or defined(__AVX__) or defined(__SSE4_1__)
+#if defined (__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
 		igemm_ (FflasRowMajor, ta, tb, (int)m, (int)n, (int)k, alpha, Ad, (int)lda, Bd, (int)ldb, beta, Cd, (int)ldc);
 #else
 		for (size_t i=0; i<m; i++){
diff --git a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl
index 26f9dbd..1a6fb23 100644
--- a/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl
+++ b/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl
@@ -50,66 +50,96 @@ namespace FFLAS {
 			 typename AlgoTrait,
 			 typename ParSeqTrait>
 	struct MMHelper<Field, AlgoTrait,ModeCategories::ConvertTo<ElementCategories::RNSElementTag>, ParSeqTrait> {
+		typedef MMHelper<Field, AlgoTrait,ModeCategories::ConvertTo<ElementCategories::RNSElementTag>, ParSeqTrait>  Self_t;
 		Givaro::Integer normA,normB;
 		int recLevel;
 		ParSeqTrait parseq;
 		MMHelper() : normA(0), normB(0), recLevel(-1) {}
 		template <class F2, class A2, class M2, class PS2>
 		MMHelper(MMHelper<F2, A2, M2, PS2> H2) : 
-				normA(H2.normA), normB(H2.normB), recLevel(H2.recLevel), parseq(H2.parseq) {}
+			normA(H2.normA), normB(H2.normB), recLevel(H2.recLevel), parseq(H2.parseq) {}
 		MMHelper(Givaro::Integer Amax, Givaro::Integer Bmax) : normA(Amax), normB(Bmax), recLevel(-1) {}
 		MMHelper(const Field& F, size_t m, size_t n, size_t k, ParSeqTrait PS=ParSeqTrait())
-				: recLevel(-1), parseq(PS)
-			{F.characteristic(normA);F.characteristic(normB);}
+			: recLevel(-1), parseq(PS)
+		{F.characteristic(normA);F.characteristic(normB); }
 		MMHelper(const Field& F, int wino, ParSeqTrait PS=ParSeqTrait()) : recLevel(wino), parseq(PS)
-			{F.characteristic(normA);F.characteristic(normB);}
+		{F.characteristic(normA);F.characteristic(normB);}
 		void setNorm(Givaro::Integer p){normA=normB=p;}
+
+		friend std::ostream& operator<<(std::ostream& out, const Self_t& M)
+		{
+			return out <<"Helper: "
+					   <<typeid(AlgoTrait).name()<<' '
+					   <<typeid(ModeCategories::ConvertTo<ElementCategories::RNSElementTag>).name()<< ' '
+					   << M.parseq <<std::endl
+					   <<"  recLevel = "<<M.recLevel<<std::endl;
+		}
 	};
 	template<typename E,
 			 typename AlgoTrait,
 			 typename ParSeqTrait>
 	struct MMHelper<FFPACK::RNSInteger<E>, AlgoTrait,ModeCategories::DefaultTag, ParSeqTrait> {
+		typedef  MMHelper<FFPACK::RNSInteger<E>, AlgoTrait,ModeCategories::DefaultTag, ParSeqTrait> Self_t;
 		Givaro::Integer normA,normB;
 		int recLevel;
 		ParSeqTrait parseq;
 		MMHelper() : normA(0), normB(0), recLevel(-1) {}
 		MMHelper(Givaro::Integer Amax, Givaro::Integer Bmax) : normA(Amax), normB(Bmax), recLevel(-1) {}
 		MMHelper(const FFPACK::RNSInteger<E>& F, size_t m, size_t n, size_t k, ParSeqTrait PS=ParSeqTrait())
-				: recLevel(-1), parseq(PS)
-			{F.characteristic(normA);F.characteristic(normB);}
+			: recLevel(-1), parseq(PS)
+		{F.characteristic(normA);F.characteristic(normB);}
 		MMHelper(const FFPACK::RNSInteger<E>& F, int wino, ParSeqTrait PS=ParSeqTrait()) : recLevel(wino), parseq(PS)
-			{F.characteristic(normA);F.characteristic(normB);}
+		{F.characteristic(normA);F.characteristic(normB);}
 		template <class F2, class A2, class M2, class PS2>
 		MMHelper(MMHelper<F2, A2, M2, PS2> H2) : 
-				normA(H2.normA), normB(H2.normB), recLevel(H2.recLevel), parseq(H2.parseq) {}
+			normA(H2.normA), normB(H2.normB), recLevel(H2.recLevel), parseq(H2.parseq) {}
 		void setNorm(Givaro::Integer p){normA=normB=p;}
+
+		friend std::ostream& operator<<(std::ostream& out, const Self_t& M)
+		{
+			return out <<"Helper: "
+					   <<typeid(AlgoTrait).name()<<' '
+					   <<typeid(ModeCategories::DefaultTag).name()<< ' '
+					   << M.parseq <<std::endl
+					   <<"  recLevel = "<<M.recLevel<<std::endl;
+		}
 	};
 	template<typename E,
 			 typename AlgoTrait,
 			 typename ParSeqTrait>
 	struct MMHelper<FFPACK::RNSIntegerMod<E>, AlgoTrait,ModeCategories::DefaultTag, ParSeqTrait> {
+		typedef MMHelper<FFPACK::RNSIntegerMod<E>, AlgoTrait,ModeCategories::DefaultTag, ParSeqTrait> Self_t;
 		Givaro::Integer normA,normB;
 		int recLevel;
 		ParSeqTrait parseq;
 		MMHelper() : normA(0), normB(0), recLevel(-1) {}
 		MMHelper(Givaro::Integer Amax, Givaro::Integer Bmax) : normA(Amax), normB(Bmax), recLevel(-1) {}
 		MMHelper(const FFPACK::RNSIntegerMod<E>& F, size_t m, size_t n, size_t k, ParSeqTrait PS=ParSeqTrait())
-				: recLevel(-1), parseq(PS)
-			{F.characteristic(normA);F.characteristic(normB);}
+			: recLevel(-1), parseq(PS)
+		{F.characteristic(normA);F.characteristic(normB);}
 		MMHelper(const FFPACK::RNSIntegerMod<E>& F, int wino, ParSeqTrait PS=ParSeqTrait()) : recLevel(wino), parseq(PS)
-			{F.characteristic(normA);F.characteristic(normB);}
+		{F.characteristic(normA);F.characteristic(normB);}
 		// copy constructor from other Field and Algo Traits
 		template<class F2, typename AlgoT2, typename FT2, typename PS2>
 		MMHelper(MMHelper<F2, AlgoT2, FT2, PS2>& WH) : recLevel(WH.recLevel), parseq(WH.parseq) {}
 
 		void setNorm(Givaro::Integer p){normA=normB=p;}
+
+		friend std::ostream& operator<<(std::ostream& out, const Self_t& M)
+		{
+			return out <<"Helper: "
+					   <<typeid(AlgoTrait).name()<<' '
+					   <<typeid(ModeCategories::DefaultTag).name()<< ' '
+					   << M.parseq <<std::endl
+					   <<"  recLevel = "<<M.recLevel<<std::endl;
+		}
 	};
 
-		/***********************************
-		 *** MULTIPRECISION FGEMM OVER Z ***
-		 ***********************************/
+	/***********************************
+	 *** MULTIPRECISION FGEMM OVER Z ***
+	 ***********************************/
 
-		// fgemm for RnsInteger sequential version
+	// fgemm for RnsInteger sequential version
 	template<typename RNS>
 	inline  typename FFPACK::RNSInteger<RNS>::Element_ptr 
 	fgemm (const FFPACK::RNSInteger<RNS> &F,
@@ -124,8 +154,8 @@ namespace FFLAS {
 	       MMHelper<FFPACK::RNSInteger<RNS>, MMHelperAlgo::Classic,ModeCategories::DefaultTag, ParSeqHelper::Sequential> & H)
 	{		
 
-			// compute each fgemm componentwise
-#ifdef FFT_PROFILER
+		// compute each fgemm componentwise
+#ifdef PROFILE_FGEMM_MP
 		Givaro::Timer t;t.start();
 #endif
 		for(size_t i=0;i<F.size();i++){
@@ -137,7 +167,7 @@ namespace FFLAS {
 						 beta._ptr[i*beta._stride],
 						 Cd._ptr+i*Cd._stride, ldc, H2);
 		}
-#ifdef FFT_PROFILER
+#ifdef PROFILE_FGEMM_MP
 		t.stop();
 
 		std::cerr<<"=========================================="<<std::endl
@@ -147,7 +177,7 @@ namespace FFLAS {
 		return Cd;
 	}
 
-		// fgemm for RnsInteger parallel version
+	// fgemm for RnsInteger parallel version
 	template<typename RNS, typename Cut, typename Param>
 	inline  typename FFPACK::RNSInteger<RNS>::Element_ptr
 	fgemm (const FFPACK::RNSInteger<RNS> &F,
@@ -161,16 +191,16 @@ namespace FFLAS {
 	       typename FFPACK::RNSInteger<RNS>::Element_ptr Cd, const size_t ldc,
 		   MMHelper<FFPACK::RNSInteger<RNS>, MMHelperAlgo::Classic, ModeCategories::DefaultTag, ParSeqHelper::Parallel<Cut,Param> > & H)
 	{
-			// compute each fgemm componentwise
+		// compute each fgemm componentwise
 		size_t s=F.size();
 		size_t nt=H.parseq.numthreads();
 		size_t loop_nt = std::min(s,nt);
 		size_t iter_nt = nt / loop_nt;
 		size_t leftover_nt = nt % loop_nt;
-			//std::cerr<<"iter_nt = "<<iter_nt<<" loop_nt = "<<loop_nt<<" leftover_nt = "<<leftover_nt<<std::endl;
+		//std::cerr<<"iter_nt = "<<iter_nt<<" loop_nt = "<<loop_nt<<" leftover_nt = "<<leftover_nt<<std::endl;
 		ParSeqHelper::Parallel<Cut,Param>  sp(loop_nt);
 		//#endif
-#ifdef FFT_PROFILER
+#ifdef PROFILE_FGEMM_MP
 		Givaro::Timer t;t.start();
 #endif
 		typedef MMHelper<typename RNS::ModField,
@@ -182,30 +212,30 @@ namespace FFLAS {
 		FORBLOCK1D(iter,s,SPLITTER(H.parseq.numthreads()),
 				   TASK(MODE(CONSTREFERENCE(F,H)),
 						{for(auto i=iter.begin(); i!=iter.end(); ++i) 
-//				  for(int i=0; i<s;++i)
-				 {
-					 size_t gemm_nt = iter_nt;
-					 if (i < leftover_nt)
-						 gemm_nt++;
-					 if (gemm_nt>1){ // Running a parallel fgemm
-						 MMH_par_t H2(F.rns()._field_rns[i], H.recLevel,
-										  ParSeqHelper::Parallel<Cut,Param>(gemm_nt));
-//									  SPLITTER(gemm_nt,Cut,Param));
-							 //std::cerr<<"calling fgemm with "<<gemm_nt<<" threads"<<std::endl;
-						 FFLAS::fgemm(F.rns()._field_rns[i],ta,tb, m, n, k, alpha._ptr[i*alpha._stride],
-									  Ad._ptr+i*Ad._stride, lda, Bd._ptr+i*Bd._stride, ldb,
-									  beta._ptr[i*beta._stride], Cd._ptr+i*Cd._stride, ldc, H2);
-					 } else { // Running a sequential fgemm
-						 MMH_seq_t WH(F.rns()._field_rns[i], H.recLevel, ParSeqHelper::Sequential());
-						 FFLAS::fgemm(F.rns()._field_rns[i],ta,tb, m, n, k, alpha._ptr[i*alpha._stride],
-									  Ad._ptr+i*Ad._stride, lda, Bd._ptr+i*Bd._stride, ldb,
-									  beta._ptr[i*beta._stride], Cd._ptr+i*Cd._stride, ldc, WH);
-					 }
-				 }
+								//				  for(int i=0; i<s;++i)
+								{
+									size_t gemm_nt = iter_nt;
+									if (i < leftover_nt)
+										gemm_nt++;
+									if (gemm_nt>1){ // Running a parallel fgemm
+										MMH_par_t H2(F.rns()._field_rns[i], H.recLevel,
+													 ParSeqHelper::Parallel<Cut,Param>(gemm_nt));
+										//									  SPLITTER(gemm_nt,Cut,Param));
+										//std::cerr<<"calling fgemm with "<<gemm_nt<<" threads"<<std::endl;
+										FFLAS::fgemm(F.rns()._field_rns[i],ta,tb, m, n, k, alpha._ptr[i*alpha._stride],
+													 Ad._ptr+i*Ad._stride, lda, Bd._ptr+i*Bd._stride, ldb,
+													 beta._ptr[i*beta._stride], Cd._ptr+i*Cd._stride, ldc, H2);
+									} else { // Running a sequential fgemm
+										MMH_seq_t WH(F.rns()._field_rns[i], H.recLevel, ParSeqHelper::Sequential());
+										FFLAS::fgemm(F.rns()._field_rns[i],ta,tb, m, n, k, alpha._ptr[i*alpha._stride],
+													 Ad._ptr+i*Ad._stride, lda, Bd._ptr+i*Bd._stride, ldb,
+													 beta._ptr[i*beta._stride], Cd._ptr+i*Cd._stride, ldc, WH);
+									}
+								}
 						}); // TASK
 				   ); // FORBLOCK1D
 		
-#ifdef FFT_PROFILER
+#ifdef PROFILE_FGEMM_MP
 		t.stop();
 
 		std::cerr<<"=========================================="<<std::endl
@@ -229,6 +259,7 @@ namespace FFLAS {
 	       Givaro::Integer* C, const size_t ldc,
 	       MMHelper<Givaro::ZRing<Givaro::Integer>, MMHelperAlgo::Classic, ModeCategories::ConvertTo<ElementCategories::RNSElementTag>, ParSeq >  & H)
 	{
+		//std::cerr<<"Entering fgemm<ZRing<Integer>> ParSeq"<<std::endl;
 #ifdef PROFILE_FGEMM_MP
 		Timer chrono;
 		chrono.start();
@@ -239,12 +270,12 @@ namespace FFLAS {
 		}
 
 		if (k==0) return C;
-			// compute bit size of feasible prime for FFLAS
+		// compute bit size of feasible prime for FFLAS
 		size_t _k=k,lk=0;
 		while ( _k ) {_k>>=1; ++lk;}
 		size_t prime_bitsize= (53-lk)>>1;
 
-			// compute bound on the output
+		// compute bound on the output
 		Givaro::Integer  mA,mB,mC;
 		size_t logA,logB;
 		mA=H.normA;
@@ -254,11 +285,11 @@ namespace FFLAS {
 		logA = H.normA.bitsize();
 		if (H.normB==0)
 			H.normB = InfNorm ((tb==FflasNoTrans)?k:n,(tb==FflasNoTrans)?n:k,B,ldb);
-		logB = H.normA.bitsize();
+		logB = H.normB.bitsize();
 
 		mC = 2*uint64_t(k)*H.normA*H.normB*abs(alpha); // need to use 2x bound to reach both positive and negative
         
-			// construct an RNS structure and its associated Domain
+		// construct an RNS structure and its associated Domain
 		FFPACK::rns_double RNS(mC, prime_bitsize);
 
 		typedef FFPACK::RNSInteger<FFPACK::rns_double> RnsDomain;
@@ -270,7 +301,7 @@ namespace FFLAS {
 		if (tb == FFLAS::FflasNoTrans){Browd=k; Bcold = n; }
 		else { Browd=n; Bcold = k;}
 		
-			// allocate data for RNS representation
+		// allocate data for RNS representation
 		typename RnsDomain::Element_ptr Ap,Bp,Cp;
 		Ap = FFLAS::fflas_new(Zrns,Arowd,Acold);
 		Bp = FFLAS::fflas_new(Zrns,Browd,Bcold);
@@ -284,7 +315,7 @@ namespace FFLAS {
 		chrono.start();
 #endif
 
-			// convert the input matrices to RNS representation
+		// convert the input matrices to RNS representation
 		finit_rns(Zrns,Arowd,Acold,(logA/16)+((logA%16)?1:0),A,lda,Ap);
 		finit_rns(Zrns,Browd,Bcold,(logB/16)+((logB%16)?1:0),B,ldb,Bp);
 
@@ -294,16 +325,16 @@ namespace FFLAS {
 		chrono.start();
 #endif
 
-			// perform the fgemm in RNS
-			// Classic as no Winograd over ZZ available for the moment
+		// perform the fgemm in RNS
+		// Classic as no Winograd over ZZ available for the moment
 		MMHelper<RnsDomain, MMHelperAlgo::Classic, ModeCategories::DefaultTag, ParSeq> H2(Zrns,H.recLevel,H.parseq);
 
-			// compute alpha and beta in RNS
+		// compute alpha and beta in RNS
 		typename RnsDomain::Element alphap, betap;
 		Zrns.init(alphap, alpha);
 		Zrns.init(betap, F.zero);
 
-			// call  fgemm
+		// call  fgemm
 		fgemm(Zrns,ta,tb,m,n,k,alphap,Ap,Acold,Bp,Bcold,betap,Cp,n,H2);
 
 #ifdef PROFILE_FGEMM_MP
@@ -313,7 +344,7 @@ namespace FFLAS {
 #endif
 
 		
-			// convert the RNS output to integer representation (C=beta.C+ RNS^(-1)(Cp) )
+		// convert the RNS output to integer representation (C=beta.C+ RNS^(-1)(Cp) )
 		fconvert_rns(Zrns,m,n,beta,C,ldc,Cp);
 
 		FFLAS::fflas_delete(Ap);
@@ -330,7 +361,7 @@ namespace FFLAS {
 
 	
 
-// Simple switch Winograd -> Classic (waiting for Winograd's algorithm to be generic wrt ModeTrait)
+	// Simple switch Winograd -> Classic (waiting for Winograd's algorithm to be generic wrt ModeTrait)
 	template<typename RNS, class ModeT>
 	inline typename RNS::Element_ptr fgemm (const FFPACK::RNSInteger<RNS> &F,
 											const FFLAS_TRANSPOSE ta,
@@ -364,11 +395,11 @@ namespace FFLAS {
 	// 	return fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H2);
 
 	// }
-		/************************************
-		 *** MULTIPRECISION FGEMM OVER Fp ***
-		 ************************************/
+	/************************************
+	 *** MULTIPRECISION FGEMM OVER Fp ***
+	 ************************************/
 
-		// fgemm for RNSIntegerMod  with Winograd Helper
+	// fgemm for RNSIntegerMod  with Winograd Helper
 	template<typename RNS>
 	inline typename RNS::Element_ptr fgemm (const FFPACK::RNSIntegerMod<RNS> &F,
 											const FFLAS_TRANSPOSE ta,
@@ -381,7 +412,7 @@ namespace FFLAS {
 											typename RNS::Element_ptr Cd, const size_t ldc,
 											MMHelper<FFPACK::RNSIntegerMod<RNS>, MMHelperAlgo::Winograd> & H)
 	{
-			// compute the product over Z
+		// compute the product over Z
 		typedef FFPACK::RNSInteger<RNS> RnsDomain;
 		RnsDomain Zrns(F.rns());
 		MMHelper<RnsDomain, MMHelperAlgo::Classic> H2(Zrns, H.recLevel,H.parseq);
@@ -389,7 +420,7 @@ namespace FFLAS {
 		FFLAS::Timer chrono;chrono.start();
 #endif
 		fgemm(Zrns,ta,tb,m,n,k,alpha,Ad,lda,Bd,ldb,beta,Cd,ldc,H2);
-			// reduce the product mod p (note that entries are larger than p, due to RNS modulo reduction)
+		// reduce the product mod p (note that entries are larger than p, due to RNS modulo reduction)
 		freduce (F, m, n, Cd, ldc);
 #ifdef BENCH_PERF_FGEMM_MP
 		chrono.stop();
@@ -400,7 +431,8 @@ namespace FFLAS {
 	}
 
 
-		// fgemm for IntegerDomain with Winograd Helper
+	// fgemm for IntegerDomain with Winograd Helper
+	
 	inline Givaro::Integer* fgemm (const Givaro::Modular<Givaro::Integer>& F,
 								   const FFLAS_TRANSPOSE ta,
 								   const FFLAS_TRANSPOSE tb,
@@ -412,8 +444,8 @@ namespace FFLAS {
 								   Givaro::Integer* C, const size_t ldc,
 								   MMHelper<Givaro::Modular<Givaro::Integer>, MMHelperAlgo::Classic, ModeCategories::ConvertTo<ElementCategories::RNSElementTag> > & H)
 	{
-			// compute the product over Z
-		// std::cerr<<"Entering fgemm<Modular<Integer>>"<<std::endl;
+		// compute the product over Z
+		//std::cerr<<"Entering fgemm<Modular<Integer>>"<<std::endl;
 		typedef Givaro::ZRing<Givaro::Integer> IntegerDomain;
 		Givaro::Integer p;
 		F.cardinality(p);
@@ -423,7 +455,7 @@ namespace FFLAS {
 
 		fgemm(Z,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H2);
 
-			// reduce the product mod p
+		// reduce the product mod p
 		freduce (F, m, n, C, ldc);
 
 		return C;
@@ -440,8 +472,8 @@ namespace FFLAS {
 								   Givaro::Integer* C, const size_t ldc,
 								   MMHelper<Givaro::Modular<Givaro::Integer>, MMHelperAlgo::Auto, ModeCategories::ConvertTo<ElementCategories::RNSElementTag>, ParSeq > & H)
 	{
-			// compute the product over Z
-		// std::cerr<<"Entering fgemm<Modular<Integer>>"<<std::endl;
+		// compute the product over Z
+		//std::cerr<<"Entering fgemm<Modular<Integer>> PArSeq"<<std::endl;
 		typedef Givaro::ZRing<Givaro::Integer> IntegerDomain;
 		Givaro::Integer p;
 		F.cardinality(p);
@@ -451,7 +483,7 @@ namespace FFLAS {
 		
 		fgemm(Z,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H2);
 		
-			// reduce the product mod p
+		// reduce the product mod p
 		freduce (F, m, n, C, ldc);
 
 		return C;
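
    The hunk above picks the RNS residue primes so that each residue fgemm can run safely in
    doubles: with k products accumulated per entry, a p-bit prime needs 2p + bitsize(k) <= 53,
    and the output bound mC = 2*k*||A||*||B||*|alpha| then determines how many such primes the
    RNS basis needs. A minimal stand-alone sketch of that bit-size bound, reusing the same
    bit-counting loop (illustrative only, not part of the patch; the value of k is arbitrary):

    #include <cstddef>
    #include <iostream>

    int main() {
        std::size_t k = 1000;             // inner dimension of the product
        std::size_t _k = k, lk = 0;
        while (_k) { _k >>= 1; ++lk; }    // lk = bit size of k (10 for k = 1000)
        std::size_t prime_bitsize = (53 - lk) >> 1;
        std::cout << "residue primes of at most " << prime_bitsize
                  << " bits keep " << k << " double accumulations exact" << std::endl;
        return 0;
    }
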
diff --git a/fflas-ffpack/fflas/fflas_fgemv.inl b/fflas-ffpack/fflas/fflas_fgemv.inl
index fb8303c..ab70fad 100644
--- a/fflas-ffpack/fflas/fflas_fgemv.inl
+++ b/fflas-ffpack/fflas/fflas_fgemv.inl
@@ -33,7 +33,7 @@
 
 #include <givaro/zring.h> // DoubleDomain
 
-#if defined(__AVX2__) or defined(__AVX__) or defined(__SSE4_1__)
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 #include "fflas-ffpack/fflas/fflas_igemm/igemm.h"
 #endif
 
@@ -373,7 +373,7 @@ namespace FFLAS{
 	{
 		FFLASFFPACK_check(lda);
 
-#if defined(__AVX2__) or defined(__AVX__) or defined(__SSE4_1__)
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
 		if (ta == FflasNoTrans)
 			igemm_ (FflasRowMajor, ta, FflasNoTrans,M,1,N,alpha,A,lda,X,incX,beta,Y,incY);
 		else
diff --git a/fflas-ffpack/fflas/fflas_freduce.h b/fflas-ffpack/fflas/fflas_freduce.h
index 55a895d..e237343 100644
--- a/fflas-ffpack/fflas/fflas_freduce.h
+++ b/fflas-ffpack/fflas/fflas_freduce.h
@@ -39,7 +39,7 @@ namespace FFLAS {
 	template<class T>
 	struct support_simd_mod  : public std::false_type {} ;
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 	template<>
 	struct support_simd_mod<float> : public std::true_type {} ;
 	template<>
@@ -49,7 +49,7 @@ namespace FFLAS {
 	struct support_simd_mod<int64_t> : public std::true_type {} ;
 #endif  // SIMD_INT
 
-#endif // __FFLASFFPACK_USE_SIMD
+#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 } // FFLAS
 
diff --git a/fflas-ffpack/fflas/fflas_freduce.inl b/fflas-ffpack/fflas/fflas_freduce.inl
index e89d40e..830f585 100644
--- a/fflas-ffpack/fflas/fflas_freduce.inl
+++ b/fflas-ffpack/fflas/fflas_freduce.inl
@@ -301,7 +301,7 @@ namespace FFLAS { namespace vectorised {
 	} ;
 
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 	template<class Field, class SimdT, class ElementTraits = typename ElementTraits<typename Field::Element>::value>
 	struct HelperModSimd  ;
 
@@ -389,7 +389,7 @@ namespace FFLAS { namespace vectorised {
 
 		}
 	} ;
-#endif // __FFLASFFPACK_USE_SIMD
+#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 
 #ifdef __x86_64__
@@ -442,7 +442,7 @@ namespace FFLAS { namespace vectorised {
 
 
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 	template<class Field, class SimdT, int ALGO>
 	inline void
@@ -473,14 +473,14 @@ namespace FFLAS { namespace vectorised {
 		}
 	}
 
-#endif // __FFLASFFPACK_USE_SIMD
+#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 } // vectorised
 } // FFLAS
 
 namespace FFLAS  { namespace vectorised { namespace unswitch  {
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 	template<class Field, bool round, int algo>
 	inline typename std::enable_if<FFLAS::support_simd_mod<typename Field::Element>::value, void>::type
 	modp(const Field &F, typename Field::ConstElement_ptr U, const size_t & n,
diff --git a/fflas-ffpack/fflas/fflas_fscal.inl b/fflas-ffpack/fflas/fflas_fscal.inl
index fa8d2d9..fce4363 100644
--- a/fflas-ffpack/fflas/fflas_fscal.inl
+++ b/fflas-ffpack/fflas/fflas_fscal.inl
@@ -32,7 +32,7 @@
 
 namespace FFLAS { namespace vectorised {
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 	template<class SimdT, class Element>
 	inline typename std::enable_if<is_simd<SimdT>::value, void>::type
@@ -132,7 +132,7 @@ namespace FFLAS { namespace vectorised {
 
 	}
 
-#endif // __FFLASFFPACK_USE_SIMD
+#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 } // vectorised
 } // FFLAS
 
diff --git a/fflas-ffpack/fflas/fflas_ftrmm.inl b/fflas-ffpack/fflas/fflas_ftrmm.inl
index cc80481..a9f96ed 100644
--- a/fflas-ffpack/fflas/fflas_ftrmm.inl
+++ b/fflas-ffpack/fflas/fflas_ftrmm.inl
@@ -44,7 +44,7 @@ ftrmm (const Field& F, const FFLAS_SIDE Side,
 	      const FFLAS_DIAG Diag,
 	      const size_t M, const size_t N,
 	      const typename Field::Element alpha,
-	      typename Field::Element_ptr A, const size_t lda,
+	      typename Field::ConstElement_ptr A, const size_t lda,
 	      typename Field::Element_ptr B, const size_t ldb)
 {
 	if (!M || !N ) return;
diff --git a/fflas-ffpack/fflas/fflas_ftrmm_src.inl b/fflas-ffpack/fflas/fflas_ftrmm_src.inl
index 58659eb..52a0140 100644
--- a/fflas-ffpack/fflas/fflas_ftrmm_src.inl
+++ b/fflas-ffpack/fflas/fflas_ftrmm_src.inl
@@ -166,7 +166,7 @@ public:
 
 template <class Field>
 void delayed (const Field& F, const size_t M, const size_t N,
-	      typename Field::Element_ptr A, const size_t lda,
+	      typename Field::ConstElement_ptr A, const size_t lda,
 	      typename Field::Element_ptr B, const size_t ldb)
 {
 	Mjoin(cblas_,Mjoin(__FFLAS__BLAS_PREFIX,trmm))
@@ -181,7 +181,7 @@ void delayed (const Field& F, const size_t M, const size_t N,
 
 template <class Field>
 void operator () (const Field& F, const size_t M, const size_t N,
-		  typename Field::Element_ptr A, const size_t lda,
+		  typename Field::ConstElement_ptr A, const size_t lda,
 		  typename Field::Element_ptr B, const size_t ldb)
 {
 
@@ -225,7 +225,7 @@ public:
 
 template<class Field>
 void operator()	(const Field& F, const size_t M, const size_t N,
-		 typename Field::Element_ptr A, const size_t lda,
+		 typename Field::ConstElement_ptr A, const size_t lda,
 		 typename Field::Element_ptr B, const size_t ldb)
 {
 
diff --git a/fflas-ffpack/fflas/fflas_ftrsm.inl b/fflas-ffpack/fflas/fflas_ftrsm.inl
index 853b645..1ca12d2 100644
--- a/fflas-ffpack/fflas/fflas_ftrsm.inl
+++ b/fflas-ffpack/fflas/fflas_ftrsm.inl
@@ -56,7 +56,9 @@ namespace FFLAS {
 	{
 		ParSeqHelper::Sequential PSH;
 		TRSMHelper<StructureHelper::Recursive, ParSeqHelper::Sequential> H(PSH);
+		FFLAS::Checker_ftrsm<Field> checker(F, M, N, alpha, B, ldb);
 		ftrsm(F, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, H);
+		checker.check(Side, Uplo, TransA, Diag, M, N, A, lda, B, ldb);
 	}
 
 	template<class Field>
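
    The Checker_ftrsm calls added above wrap the triangular solve with an a-posteriori
    verification: the right-hand side is recorded before ftrsm overwrites B, and the computed
    solution is checked against it afterwards (the library checker presumably does this with a
    cheap randomized projection). A plain-double sketch of the same record/solve/verify pattern,
    with a made-up 3x3 lower-triangular system and none of the library types:

    #include <cassert>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    int main() {
        const std::size_t n = 3;
        // Lower-triangular A (row major) and one right-hand side column, alpha = 1.
        double A[n*n] = {2,0,0,  1,3,0,  4,5,6};
        std::vector<double> B = {2, 7, 27}, B0 = B;   // B0: recorded copy, as the checker does
        // Forward substitution: B <- A^{-1} * B.
        for (std::size_t i = 0; i < n; ++i) {
            for (std::size_t j = 0; j < i; ++j) B[i] -= A[i*n+j] * B[j];
            B[i] /= A[i*n+i];
        }
        // Verification: A * X must reproduce the recorded right-hand side.
        for (std::size_t i = 0; i < n; ++i) {
            double s = 0;
            for (std::size_t j = 0; j <= i; ++j) s += A[i*n+j] * B[j];
            assert(std::fabs(s - B0[i]) < 1e-9);
        }
        return 0;
    }
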
diff --git a/fflas-ffpack/fflas/fflas_ftrsm_mp.inl b/fflas-ffpack/fflas/fflas_ftrsm_mp.inl
index 268cd03..d18113a 100644
--- a/fflas-ffpack/fflas/fflas_ftrsm_mp.inl
+++ b/fflas-ffpack/fflas/fflas_ftrsm_mp.inl
@@ -353,5 +353,8 @@ namespace FFLAS {
 #endif // #ifndef DOXYGEN_SHOULD_SKIP_THIS
 } // END OF NAMESPACE FFLAS
 
+
+
+
 #endif
 
diff --git a/fflas-ffpack/fflas/fflas_ftrsv.inl b/fflas-ffpack/fflas/fflas_ftrsv.inl
index 0857a21..a6ac178 100644
--- a/fflas-ffpack/fflas/fflas_ftrsv.inl
+++ b/fflas-ffpack/fflas/fflas_ftrsv.inl
@@ -111,5 +111,4 @@ ftrsv (const Field& F, const FFLAS_UPLO Uplo,
 }
 
 }
-
 #endif // __FFLASFFPACK_ftrsv_INL
diff --git a/fflas-ffpack/fflas/fflas_helpers.inl b/fflas-ffpack/fflas/fflas_helpers.inl
old mode 100755
new mode 100644
index cfe0ca9..2c9fbf9
--- a/fflas-ffpack/fflas/fflas_helpers.inl
+++ b/fflas-ffpack/fflas/fflas_helpers.inl
@@ -51,8 +51,20 @@ namespace FFLAS{ namespace Protected{
 }//FFLAS
 
 namespace FFLAS {
-
+	
 	namespace Protected{
+		template <class DFE> inline size_t min_types(DFE& k) {return static_cast<size_t>(k);}
+#if __FFLASFFPACK_SIZEOF_LONG == 4
+		template <> inline size_t min_types(double& k) {return static_cast<size_t>(std::min(k,double(std::numeric_limits<size_t>::max())));}
+		template <> inline size_t min_types(int64_t& k) {return static_cast<size_t>(std::min(k,int64_t(std::numeric_limits<size_t>::max())));}
+#endif
+		template <> inline size_t min_types(RecInt::rint<6>& k) {return static_cast<size_t>(uint64_t(std::min(k,RecInt::rint<6>(uint64_t(std::numeric_limits<size_t>::max())))));}
+		template <> inline size_t min_types(RecInt::rint<7>& k) {return static_cast<size_t>(uint64_t(std::min(k,RecInt::rint<7>(uint64_t(std::numeric_limits<size_t>::max())))));}
+		template <> inline size_t min_types(RecInt::rint<8>& k) {return static_cast<size_t>(uint64_t(std::min(k,RecInt::rint<8>(uint64_t(std::numeric_limits<size_t>::max())))));}
+		template <> inline size_t min_types(RecInt::rint<9>& k) {return static_cast<size_t>(uint64_t(std::min(k,RecInt::rint<9>(uint64_t(std::numeric_limits<size_t>::max())))));}
+		template <> inline size_t min_types(RecInt::rint<10>& k) {return static_cast<size_t>(uint64_t(std::min(k,RecInt::rint<10>(uint64_t(std::numeric_limits<size_t>::max())))));}
+		template <> inline size_t min_types(Givaro::Integer& k) {return static_cast<size_t>(uint64_t(std::min(k,Givaro::Integer(uint64_t(std::numeric_limits<size_t>::max())))));}
+
 		template <class T>
 		inline bool unfit(T x){return false;}
 		template <>
@@ -160,6 +172,10 @@ namespace FFLAS {
 
 		size_t MaxDelayedDim(DFElt beta)
 		{
+			if (MaxStorableValue < DFElt(0))
+				//Infinite precision delayed field
+				return std::numeric_limits<size_t>::max();
+
 			DFElt absbeta;
 			delayedField.init(absbeta,beta);
 			if (beta < 0) absbeta = -beta;
@@ -169,7 +185,15 @@ namespace FFLAS {
 				* std::max(static_cast<const DFElt&>(-Cmin), Cmax);
 			DFElt AB = std::max(static_cast<const DFElt&>(-Amin), Amax)
 				* std::max(static_cast<const DFElt&>(-Bmin), Bmax);
-			return static_cast<size_t>(((diff < DFElt(0u))||(AB<DFElt(0u)))? DFElt(0u) : diff / AB);
+			if ((diff < DFElt(0u))||(AB<DFElt(0u))) return 0;
+
+
+			DFElt kmax = diff/AB;
+			return FFLAS::Protected::min_types<DFElt>(kmax);
+			// if (kmax > std::numeric_limits<size_t>::max())
+			// 	return std::numeric_limits<size_t>::max();
+			// else
+			// 	return kmax;
 		}
 		bool Aunfit(){ return Protected::unfit(std::max(static_cast<const DFElt&>(-Amin),Amax));}
 		bool Bunfit(){ return Protected::unfit(std::max(static_cast<const DFElt&>(-Bmin),Bmax));}
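
    MaxDelayedDim above now clamps the delayed bound kmax before casting it to size_t, through
    the new min_types specializations (one per wide element type), so that an effectively
    unbounded delayed field yields SIZE_MAX instead of a wrapped-around value. A stand-alone
    sketch of the saturating cast, double case only (illustrative; the helper name is made up):

    #include <cstddef>
    #include <iostream>
    #include <limits>

    // Clamp a double-valued dimension bound into size_t without overflow.
    std::size_t saturate_to_size_t(double k) {
        // 2^64 is exactly representable as a double, so this comparison is safe.
        if (k >= static_cast<double>(std::numeric_limits<std::size_t>::max()))
            return std::numeric_limits<std::size_t>::max();
        return static_cast<std::size_t>(k);
    }

    int main() {
        std::cout << saturate_to_size_t(1e30) << std::endl;    // SIZE_MAX, not garbage
        std::cout << saturate_to_size_t(12345.0) << std::endl; // 12345
        return 0;
    }
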
diff --git a/fflas-ffpack/fflas/fflas_igemm/igemm.h b/fflas-ffpack/fflas/fflas_igemm/igemm.h
index 1057a71..058a94d 100644
--- a/fflas-ffpack/fflas/fflas_igemm/igemm.h
+++ b/fflas-ffpack/fflas/fflas_igemm/igemm.h
@@ -41,7 +41,7 @@ namespace FFLAS {
 
 } // FFLAS
 
-#if defined(__AVX2__) or defined(__AVX__) or defined(__SSE4_1__)
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 #include "igemm_kernels.h"
 #include "igemm_tools.h"
 #endif 
@@ -89,7 +89,7 @@ namespace FFLAS { /*  igemm */
 
 
 } // FFLAS
-#if defined(__AVX2__) or defined(__AVX__) or defined(__SSE4_1__)
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 #include "igemm.inl"
 #endif
 #endif // __FFLASFFPACK_fflas_igemm_igemm_H
diff --git a/fflas-ffpack/fflas/fflas_igemm/igemm_kernels.inl b/fflas-ffpack/fflas/fflas_igemm/igemm_kernels.inl
index 567d0f8..80b2bb9 100644
--- a/fflas-ffpack/fflas/fflas_igemm/igemm_kernels.inl
+++ b/fflas-ffpack/fflas/fflas_igemm/igemm_kernels.inl
@@ -31,19 +31,19 @@
 #define __FFLASFFPACK_fflas_igemm_igemm_kernels_INL
 
 
-#ifdef __AVX2__
+#ifdef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS
 #define _nr 4
 #define _mr 8
 #define StepA 4
 #define StepB 4
-#elif defined(__SSE4_1__) or defined(__AVX__)
+#elif defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) or defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS)
 #define _nr 4
 #define _mr 4
 #define StepA 2
 #define StepB 2
 #else
 #error "kernels not supported"
-#endif
+#endif // __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS
 
 #include "fflas-ffpack/utils/fflas_memory.h"
 #include "igemm_tools.h"
diff --git a/fflas-ffpack/fflas/fflas_level1.inl b/fflas-ffpack/fflas/fflas_level1.inl
index 811a9de..f00929a 100644
--- a/fflas-ffpack/fflas/fflas_level1.inl
+++ b/fflas-ffpack/fflas/fflas_level1.inl
@@ -414,6 +414,7 @@ namespace FFLAS {
 	template <class Field>
 	void
 	fsubin (const Field& F,  const size_t N,
+		typename Field::ConstElement_ptr B, const size_t incb,
 		typename Field::Element_ptr C, const size_t incc);
 
 
diff --git a/fflas-ffpack/fflas/fflas_level3.inl b/fflas-ffpack/fflas/fflas_level3.inl
index 96011ee..c6e7e6c 100644
--- a/fflas-ffpack/fflas/fflas_level3.inl
+++ b/fflas-ffpack/fflas/fflas_level3.inl
@@ -205,7 +205,7 @@ namespace FFLAS {
 	       const FFLAS_DIAG Diag,
 	       const size_t M, const size_t N,
 	       const typename Field::Element alpha,
-	       typename Field::Element_ptr A, const size_t lda,
+	       typename Field::ConstElement_ptr A, const size_t lda,
 	       typename Field::Element_ptr B, const size_t ldb);
 
 	/** @brief  fgemm: <b>F</b>ield <b>GE</b>neral <b>M</b>atrix <b>M</b>ultiply.
diff --git a/fflas-ffpack/fflas/fflas_pfgemm.inl b/fflas-ffpack/fflas/fflas_pfgemm.inl
index 4e04ec7..083493b 100644
--- a/fflas-ffpack/fflas/fflas_pfgemm.inl
+++ b/fflas-ffpack/fflas/fflas_pfgemm.inl
@@ -28,7 +28,7 @@
  */
 
 #ifndef __FFLASFFPACK_fflas_pfgemm_INL
-#define __FFLASFFPACK_fflas_pgemm_INL
+#define __FFLASFFPACK_fflas_pfgemm_INL
 
 #define __FFLASFFPACK_SEQPARTHRESHOLD 220
 #define __FFLASFFPACK_DIMKPENALTY 1
diff --git a/fflas-ffpack/fflas/fflas_simd.h b/fflas-ffpack/fflas/fflas_simd.h
index 29e6756..515611c 100644
--- a/fflas-ffpack/fflas/fflas_simd.h
+++ b/fflas-ffpack/fflas/fflas_simd.h
@@ -1,5 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
@@ -61,106 +61,106 @@
 #define PURE
 #endif
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 namespace std { // Why? - A.B. 2015-04-30
 
-inline
-std::ostream &operator<<(std::ostream &o, const __m128 &v) {
-    const float *vArray = (const float *)(&v);
-    o << '<';
-    o << vArray[0] << ',' << vArray[1];
-    o << ',';
-    o << vArray[2] << ',' << vArray[3];
-    o << '>';
-    return o;
-}
-
-inline
-std::ostream &operator<<(std::ostream &o, const __m128i &v) {
-    const int64_t *vArray = (const int64_t *)(&v);
-    o << '<';
-    o << vArray[0] << ',' << vArray[1];
-    o << '>';
-    return o;
-}
-
-inline
-std::ostream &operator<<(std::ostream &o, const __m128d &v) {
-    const double *vArray = (const double *)(&v);
-    o << '<';
-    o << vArray[0] << ',' << vArray[1];
-    o << '>';
-    return o;
-}
+	inline
+	std::ostream &operator<<(std::ostream &o, const __m128 &v) {
+		const float *vArray = (const float *)(&v);
+		o << '<';
+		o << vArray[0] << ',' << vArray[1];
+		o << ',';
+		o << vArray[2] << ',' << vArray[3];
+		o << '>';
+		return o;
+	}
+
+	inline
+	std::ostream &operator<<(std::ostream &o, const __m128i &v) {
+		const int64_t *vArray = (const int64_t *)(&v);
+		o << '<';
+		o << vArray[0] << ',' << vArray[1];
+		o << '>';
+		return o;
+	}
+
+	inline
+	std::ostream &operator<<(std::ostream &o, const __m128d &v) {
+		const double *vArray = (const double *)(&v);
+		o << '<';
+		o << vArray[0] << ',' << vArray[1];
+		o << '>';
+		return o;
+	}
 } // std
 
-#ifdef __FFLASFFPACK_USE_AVX
+#ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS
 namespace std {
 
-inline
-std::ostream &operator<<(std::ostream &o, const __m256 &v) {
-    const float *vArray = (const float *)(&v);
-    o << '<';
-    o << vArray[0] << ',' << vArray[1] << ',' << vArray[2] << ',' << vArray[3];
-    o << ',';
-    o << vArray[4] << ',' << vArray[5] << ',' << vArray[6] << ',' << vArray[7];
-    o << '>';
-    return o;
-}
-
-inline
-std::ostream &operator<<(std::ostream &o, const __m256i &v) {
-    const int64_t *vArray = (const int64_t *)(&v);
-    o << '<';
-    o << vArray[0] << ',' << vArray[1] << ',' << vArray[2] << ',' << vArray[3];
-    o << '>';
-    return o;
-}
-
-inline
-std::ostream &operator<<(std::ostream &o, const __m256d &v) {
-    const double *vArray = (const double *)(&v);
-    o << '<';
-    o << vArray[0] << ',' << vArray[1] << ',' << vArray[2] << ',' << vArray[3];
-    o << '>';
-    return o;
-}
+	inline
+	std::ostream &operator<<(std::ostream &o, const __m256 &v) {
+		const float *vArray = (const float *)(&v);
+		o << '<';
+		o << vArray[0] << ',' << vArray[1] << ',' << vArray[2] << ',' << vArray[3];
+		o << ',';
+		o << vArray[4] << ',' << vArray[5] << ',' << vArray[6] << ',' << vArray[7];
+		o << '>';
+		return o;
+	}
+
+	inline
+	std::ostream &operator<<(std::ostream &o, const __m256i &v) {
+		const int64_t *vArray = (const int64_t *)(&v);
+		o << '<';
+		o << vArray[0] << ',' << vArray[1] << ',' << vArray[2] << ',' << vArray[3];
+		o << '>';
+		return o;
+	}
+
+	inline
+	std::ostream &operator<<(std::ostream &o, const __m256d &v) {
+		const double *vArray = (const double *)(&v);
+		o << '<';
+		o << vArray[0] << ',' << vArray[1] << ',' << vArray[2] << ',' << vArray[3];
+		o << '>';
+		return o;
+	}
 } // std
-#endif // __FFLASFFPACK_USE_AVX
+#endif // __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS
 
-#endif // __FFLASFFPACK_USE_SIMD
+#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 namespace FFLAS {
-template <class T> struct support_simd : public std::false_type {};
+	template <class T> struct support_simd : public std::false_type {};
 
-#if defined(__FFLASFFPACK_USE_SIMD)
-template <> struct support_simd<float> : public std::true_type {};
-template <> struct support_simd<double> : public std::true_type {};
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
+	template <> struct support_simd<float> : public std::true_type {};
+	template <> struct support_simd<double> : public std::true_type {};
 #ifdef SIMD_INT
-template <> struct support_simd<int64_t> : public std::true_type {};
-template <> struct support_simd<int32_t> : public std::true_type {};
-template <> struct support_simd<int16_t> : public std::true_type {};
+	template <> struct support_simd<int64_t> : public std::true_type {};
+	template <> struct support_simd<int32_t> : public std::true_type {};
+	template <> struct support_simd<int16_t> : public std::true_type {};
 #endif
 #endif
 
 } // FFLAS
 
-#define NORML_MOD(C, P, NEGP, MIN, MAX, Q, T)                                                                          \
-    {                                                                                                                  \
-        Q = greater(C, MAX);                                                                                           \
-        T = lesser(C, MIN);                                                                                            \
-        Q = vand(Q, NEGP);                                                                                             \
-        T = vand(T, P);                                                                                                \
-        Q = vor(Q, T);                                                                                                 \
-        C = add(C, Q);                                                                                                 \
-    }
-
-#define FLOAT_MOD(C, P, INVP, Q)                                                                                       \
-    {                                                                                                                  \
-        Q = mul(C, INVP);                                                                                              \
-        Q = floor(Q);                                                                                                  \
-        C = fnmadd(C, Q, P);                                                                                           \
-    }
+#define NORML_MOD(C, P, NEGP, MIN, MAX, Q, T)                                                                      \
+{                                                                                                                  \
+	Q = greater(C, MAX);                                                                                           \
+	T = lesser(C, MIN);                                                                                            \
+	Q = vand(Q, NEGP);                                                                                             \
+	T = vand(T, P);                                                                                                \
+	Q = vor(Q, T);                                                                                                 \
+	C = add(C, Q);                                                                                                 \
+	}
+
+#define FLOAT_MOD(C, P, INVP, Q)                                                                                   \
+{                                                                                                                  \
+	Q = mul(C, INVP);                                                                                              \
+	Q = floor(Q);                                                                                                  \
+	C = fnmadd(C, Q, P);                                                                                           \
+	}
 
 // to activate SIMD with integers
 //#define SIMD_INT
@@ -172,12 +172,12 @@ template <class T> struct simdToType;
  */
 
 template <class T> struct is_simd {
-    static const constexpr bool value = false;
-    using type = std::integral_constant<bool, false>;
+	static const constexpr bool value = false;
+	using type = std::integral_constant<bool, false>;
 };
 
 // SSE
-#if defined(__FFLASFFPACK_USE_SIMD) // SSE or better
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) // SSE or better
 #include "fflas-ffpack/fflas/fflas_simd/simd128.inl"
 
 template <> struct simdToType<__m128d> { using type = double; };
@@ -185,26 +185,26 @@ template <> struct simdToType<__m128d> { using type = double; };
 template <> struct simdToType<__m128> { using type = float; };
 
 template <> struct is_simd<__m128d> {
-    static const constexpr bool value = true;
-    using type = std::integral_constant<bool, true>;
+	static const constexpr bool value = true;
+	using type = std::integral_constant<bool, true>;
 };
 
 template <> struct is_simd<__m128> {
-    static const constexpr bool value = true;
-    using type = std::integral_constant<bool, true>;
+	static const constexpr bool value = true;
+	using type = std::integral_constant<bool, true>;
 };
 
 #ifdef SIMD_INT
 template <> struct is_simd<__m128i> {
-    static const constexpr bool value = true;
-    using type = std::integral_constant<bool, true>;
+	static const constexpr bool value = true;
+	using type = std::integral_constant<bool, true>;
 };
 #endif
 
 #endif // SSE
 
 // AVX
-#if defined(__FFLASFFPACK_USE_AVX) or defined(__FFLASFFPACK_USE_AVX2)
+#if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) or defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
 #include "fflas-ffpack/fflas/fflas_simd/simd256.inl"
 
 template <> struct simdToType<__m256d> { using type = double; };
@@ -212,19 +212,19 @@ template <> struct simdToType<__m256d> { using type = double; };
 template <> struct simdToType<__m256> { using type = float; };
 
 template <> struct is_simd<__m256d> {
-    static const constexpr bool value = true;
-    using type = std::integral_constant<bool, true>;
+	static const constexpr bool value = true;
+	using type = std::integral_constant<bool, true>;
 };
 
 template <> struct is_simd<__m256> {
-    static const constexpr bool value = true;
-    using type = std::integral_constant<bool, true>;
+	static const constexpr bool value = true;
+	using type = std::integral_constant<bool, true>;
 };
 
 #ifdef SIMD_INT
 template <> struct is_simd<__m256i> {
-    static const constexpr bool value = true;
-    using type = std::integral_constant<bool, true>;
+	static const constexpr bool value = true;
+	using type = std::integral_constant<bool, true>;
 };
 #endif
 #endif // AVX
@@ -233,61 +233,77 @@ template <> struct is_simd<__m256i> {
  * Simd functors
  */
 
+template<typename T>
 struct NoSimd {
-    // Test if the pointer p is multiple of alignment
-    template <class T> static constexpr bool valid(T p) { return false; }
-
-    // Test if n is multiple of vect_size
-    template <class T> static constexpr bool compliant(T n) { return false; }
+	/*
+	* fallback alias used when no simd register is available
+	*/
+	using vect_t = T*;
+
+	/*
+	* define the scalar type corresponding to the specialization
+	*/
+	using scalar_t = T;
+
+	/*
+	*  number of scalar_t in a simd register
+	*/
+	static const constexpr size_t vect_size = 1;
+
+	// Test if the pointer p is multiple of alignment
+	template <class TT> static constexpr bool valid(TT p) { return false; }
+
+	// Test if n is multiple of vect_size
+	template <class TT> static constexpr bool compliant(TT n) { return false; }
 };
 
-// #if defined(__FFLASFFPACK_USE_AVX)
+// #if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS)
 
 template <class T, bool = std::is_arithmetic<T>::value, bool = std::is_integral<T>::value> struct SimdChooser {};
 
-template <class T, bool b> struct SimdChooser<T, false, b> { using value = NoSimd; };
+template <class T, bool b> struct SimdChooser<T, false, b> { using value = NoSimd<T>; };
 
 template <class T>
 struct SimdChooser<T, true, false> // floating number
-    {
-#ifdef __FFLASFFPACK_USE_AVX
-    using value = Simd256<T>;
-#elif defined(__FFLASFFPACK_USE_SSE)
-    using value = Simd128<T>;
+{
+#ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS
+	using value = Simd256<T>;
+#elif defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
+	using value = Simd128<T>;
 #else
-    using value = NoSimd;
+	using value = NoSimd<T>;
 #endif
 };
 
 template <class T>
 struct SimdChooser<T, true, true> // integral number
-    {
-#ifdef __FFLASFFPACK_USE_AVX2
-    using value = Simd256<T>;
-#elif __FFLASFFPACK_USE_SSE
-    using value = Simd128<T>;
+{
+#ifdef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS
+	using value = Simd256<T>;
+#elif __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
+	using value = Simd128<T>;
 #else
-    using value = NoSimd;
+	using value = NoSimd<T>;
 #endif
 };
 
 template <class T> using Simd = typename SimdChooser<T>::value;
 
 // template <class T> struct SimdChooser<T, true> {
-// #if defined(__FFLASFFPACK_USE_AVX2)
+// #if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
 //     typedef Simd256<T> value;
 // #else
 //     typedef Simd128<T> value;
-// #endif // __FFLASFFPACK_USE_AVX2
+// #endif // __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS
 // };
 
-// #elif defined(__FFLASFFPACK_USE_SSE) // not AVX
+// #elif defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) // not AVX
 
 // template <class T> using Simd = Simd128<T>;
 
-// #endif // __FFLASFFPACK_USE_AVX
+// #endif // __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS
 
-#if defined(__FFLASFFPACK_USE_SIMD) // SSE or better
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) // SSE or better
 
 // template <class T> struct floating_simd;
 
@@ -296,7 +312,7 @@ template <class T> using Simd = typename SimdChooser<T>::value;
 // template <> struct floating_simd<double> { typedef Simd<double> value; };
 
 // template <> struct floating_simd<int64_t> {
-// #if defined(__FFLASFFPACK_USE_AVX2)
+// #if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
 // // typedef Simd256<double> value;
 // #else
 //     typedef Simd128<double> value;
@@ -305,49 +321,52 @@ template <class T> using Simd = typename SimdChooser<T>::value;
 
 #endif
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 namespace FFLAS { /*  print helper */
 
-// need friend ?
-template <class simdT>
-inline std::ostream &print(std::ostream &os, const typename simdT::vect_t &P) {
-    typename simdT::scalar_t p[simdT::vect_size];
-    os << '<';
-    simdT::store(p, P);
-    for (size_t i = 0; i < simdT::vect_size; ++i) {
-        os << p[i];
-        if (i < simdT::vect_size - 1)
-            os << '|';
-    }
-    os << '>';
-
-    return os;
-}
+	// need friend ?
+	template <class simdT>
+	inline std::ostream &print(std::ostream &os, const typename simdT::vect_t &P) {
+		typename simdT::scalar_t p[simdT::vect_size];
+		os << '<';
+		simdT::storeu(p, P);
+		for (size_t i = 0; i < simdT::vect_size; ++i) {
+			os << p[i];
+			if (i < simdT::vect_size - 1)
+				os << '|';
+		}
+		os << '>';
+
+		return os;
+	}
 
 } // FFLAS
 
 namespace std {
-// cannot be instanciated, T is not déductible
-template <class T>
-inline std::ostream &operator<<(std::ostream &o, const typename Simd128<T>::vect_t &v) {
-    FFLAS::print<Simd128<T>>(o, v);
-    return o;
-}
+	// cannot be instantiated, T is not deducible
+	template <class T>
+	inline std::ostream &operator<<(std::ostream &o, const typename Simd128<T>::vect_t &v) {
+		FFLAS::print<Simd128<T>>(o, v);
+		return o;
+	}
 } // std
 
-#ifdef __FFLASFFPACK_USE_AVX
+#ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS
 namespace std {
-// cannot be instanciated, T is not déductible
-template <class T>
-inline std::ostream &operator<<(std::ostream &o, const typename Simd256<T>::vect_t &v) {
-    FFLAS::print(o, v);
-    return o;
+	// cannot be instantiated, T is not deducible
+	template <class T>
+	inline std::ostream &operator<<(std::ostream &o, const typename Simd256<T>::vect_t &v) {
+		FFLAS::print(o, v);
+		return o;
+	}
 }
-}
-#endif // __FFLASFFPACK_USE_AVX
+#endif // __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS
+
+#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
-#endif // __FFLASFFPACK_USE_SIMD
+// Provide simd modular support
+#include <fflas-ffpack/fflas/fflas_simd/simd_modular.inl>
 
 #undef INLINE
 #undef PURE
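
    The re-indented NORML_MOD and FLOAT_MOD macros above are the two SIMD reduction idioms:
    NORML_MOD conditionally adds P or -P to fold a value back into [MIN, MAX], while FLOAT_MOD
    multiplies by a precomputed 1/P, floors the quotient and removes Q*P with a fused negated
    multiply-add. A scalar rendition of FLOAT_MOD in plain doubles (illustrative only; the
    sample modulus is arbitrary):

    #include <cassert>
    #include <cmath>

    int main() {
        double P = 131071.0, INVP = 1.0 / P;
        double C = 9.87654321e8;
        double Q = std::floor(C * INVP);  // the mul + floor steps of the macro
        C = C - Q * P;                    // fnmadd(C, Q, P) in the SIMD version
        assert(C >= 0.0 && C < P);
        return 0;
    }
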
diff --git a/fflas-ffpack/fflas/fflas_simd/simd128.inl b/fflas-ffpack/fflas/fflas_simd/simd128.inl
index 81bffef..f98e0f6 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd128.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd128.inl
@@ -1,5 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
@@ -30,8 +30,75 @@
 #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_INL
 #define __FFLASFFPACK_fflas_ffpack_utils_simd128_INL
 
+struct Simd128i_base {
+
+	/*
+	* alias to 128 bit simd register
+	*/
+	using vect_t = __m128i;
+
+	/*
+	*  Return vector of type vect_t with all elements set to zero
+	*  Return [0, ...,0]
+	*/
+	static INLINE CONST vect_t zero() { return _mm_setzero_si128(); }
+
+	/*
+	* Shift packed 128-bit integers in a left by s bytes while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0] int128_t
+	* Return : [a0 << (s*8)] int128_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t sll128(const vect_t a) { return _mm_slli_si128(a, s); }
+
+	/*
+	* Shift packed 128-bit integers in a right by s bytes while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0] int128_t
+	* Return : [a0 >> (s*8)] int128_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t srl128(const vect_t a) { return _mm_srli_si128(a, s); }
+
+	/*
+	* Compute the bitwise AND and store the results in vect_t.
+	* Args   : [a0, ..., a127]
+	*		   [b0, ..., b127]
+	* Return : [a0 AND b0, ..., a127 AND b127]
+	*/
+	static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm_and_si128(b, a); }
+
+	/*
+	* Compute the bitwise OR and store the results in vect_t.
+	* Args   : [a0, ..., a127]
+	*		   [b0, ..., b127]
+	* Return : [a0 OR b0, ..., a127 OR b127]
+	*/
+	static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm_or_si128(b, a); }
+
+	/*
+	* Compute the bitwise XOR and store the results in vect_t.
+	* Args   : [a0, ..., a127]
+	*		   [b0, ..., b127]
+	* Return : [a0 XOR b0, ..., a127 XOR b127]
+	*/
+	static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm_xor_si128(b, a); }
+
+	/*
+	* Compute the bitwise AND NOT and store the results in vect_t.
+	* Args   : [a0, ..., a127]
+	*		   [b0, ..., b127]
+	* Return : [a0 AND (NOT b0), ..., a127 AND (NOT b127)]
+	*/
+	static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm_andnot_si128(b, a); }
+
+};
+
 template <bool ArithType, bool Int, bool Signed, int Size> struct Simd128_impl;
 
+template <class T>
+using Simd128 =
+Simd128_impl<std::is_arithmetic<T>::value, std::is_integral<T>::value, std::is_signed<T>::value, sizeof(T)>;
+
 #include "simd128_float.inl"
 #include "simd128_double.inl"
 
@@ -44,8 +111,4 @@ template <bool ArithType, bool Int, bool Signed, int Size> struct Simd128_impl;
 
 #endif //#ifdef SIMD_INT
 
-template <class T>
-using Simd128 =
-    Simd128_impl<std::is_arithmetic<T>::value, std::is_integral<T>::value, std::is_signed<T>::value, sizeof(T)>;
-
 #endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_INL
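
    The new Simd128i_base above factors out the element-type-independent 128-bit operations:
    whole-register shifts (which move by bytes, hence the a0 << (s*8) in the comments) and the
    bitwise and/or/xor/andnot. A raw-intrinsics sketch of what sll128<2> and vand expand to,
    written against SSE2 directly so it does not depend on the library's include layout
    (assumes an SSE2-capable compiler, e.g. -msse2):

    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
        __m128i a = _mm_set_epi64x(0, 0xFF);           // [low = 0xFF, high = 0]
        __m128i s = _mm_slli_si128(a, 2);              // sll128<2>: shift left by 2 bytes
        __m128i m = _mm_and_si128(s, _mm_set1_epi64x(0xFFFF00));  // vand
        alignas(16) int64_t out[2];
        _mm_store_si128(reinterpret_cast<__m128i*>(out), m);
        std::printf("%llx %llx\n", (unsigned long long)out[0], (unsigned long long)out[1]);
        return 0;
    }
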
diff --git a/fflas-ffpack/fflas/fflas_simd/simd128_double.inl b/fflas-ffpack/fflas/fflas_simd/simd128_double.inl
index ecf68a6..f56ae05 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd128_double.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd128_double.inl
@@ -1,5 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
@@ -34,319 +34,379 @@
  * Simd128 specialized for double
  */
 template <> struct Simd128_impl<true, false, true, 8> {
-#if defined(__FFLASFFPACK_USE_SIMD)
-
-    /*
-     * alias to 128 bit simd register
-     */
-    using vect_t = __m128d;
-
-    /*
-     * define the scalar type corresponding to the specialization
-     */
-    using scalar_t = double;
-
-    /*
-     *  number of scalar_t in a simd register
-     */
-    static const constexpr size_t vect_size = 2;
-
-    /*
-     *  alignement required by scalar_t pointer to be loaded in a vect_t
-     */
-    static const constexpr size_t alignment = 16;
-
-    /*
-     * Check if the pointer p is a multiple of alignemnt
-     */
-    template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
-
-    /*
-     * Check if the number n is a multiple of vect_size
-     */
-    template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
-
-    /*
-     * Return vector of type vect_t with all elements set to zero.
-     * Return [0,0]
-     */
-    static INLINE CONST vect_t zero() { return _mm_setzero_pd(); }
-
-    /*
-     * Broadcast double-precision (64-bit) floating-point value a to all elements of vect_t.
-     * Return [x,x]
-     */
-    static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_pd(x); }
-
-    /*
-     *  Set packed double-precision (64-bit) floating-point elements in vect_t with the supplied values.
-     *  Return [x1,x2]
-     */
-    static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2) { return _mm_set_pd(x2, x1); }
-
-    /*
-     *  Gather double-precision (64-bit) floating-point elements with indexes idx[0], ..., idx[3] from the address p in
-     * vect_t.
-     *  Return [p[idx[0]], p[idx[1]]]
-     */
-    template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
-        return _mm_set_pd(p[idx[1]], p[idx[0]]);
-    }
-
-    /*
-     * Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into vect_t.
-     * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
-     * Return [p[0], p[1]]
-     */
-    static INLINE PURE vect_t load(const scalar_t *const p) { return _mm_load_pd(p); }
-
-    /*
-     * Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into vect_t.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0], p[1]]
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm_loadu_pd(p); }
-
-    /*
-     * Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from p into memory.
-     * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, const vect_t v) { _mm_store_pd(const_cast<scalar_t *>(p), v); }
-
-    /*
-     * Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from p into memory.
-     * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm_storeu_pd(const_cast<scalar_t *>(p), v); }
-
-    /*
-     * Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory using
-     * a non-temporal memory hint.
-     * p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
-     */
-    static INLINE void stream(const scalar_t *p, const vect_t v) { _mm_stream_pd(const_cast<scalar_t *>(p), v); }
-
-    /*
-     * Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [a0+b0, a1+b1]
-     */
-    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_pd(a, b); }
-
-    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
-
-    /*
-     * Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit)
-     * floating-point elements in a, and store the results in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [a0-b0, a1-b1]
-     */
-    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_pd(a, b); }
-
-    static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [a0*b0, a1*b1]
-     */
-    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm_mul_pd(a, b); }
-
-    static INLINE CONST vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to
-     * packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1], [b0, b1], [c0, c1]
-     * Return : [a0*b0+c0, a1*b1+c1]
-     */
-    static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) {
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
+
+	/*
+	 * alias to 128 bit simd register
+	 */
+	using vect_t = __m128d;
+
+	/*
+	 * define the scalar type corresponding to the specialization
+	 */
+	using scalar_t = double;
+
+	/*
+	 *  number of scalar_t in a simd register
+	 */
+	static const constexpr size_t vect_size = 2;
+
+	/*
+	 *  alignment required by scalar_t pointer to be loaded in a vect_t
+	 */
+	static const constexpr size_t alignment = 16;
+
+	/*
+	 * Check if the pointer p is a multiple of alignemnt
+	 */
+	 * Check if the pointer p is a multiple of alignment
+
+	/*
+	 * Check if the number n is a multiple of vect_size
+	 */
+	template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
+
+	/*
+	 * Return vector of type vect_t with all elements set to zero.
+	 * Return [0,0]
+	 */
+	static INLINE CONST vect_t zero() { return _mm_setzero_pd(); }
+
+	/*
+	 * Broadcast double-precision (64-bit) floating-point value a to all elements of vect_t.
+	 * Return [x,x]
+	 */
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_pd(x); }
+
+	/*
+	 *  Set packed double-precision (64-bit) floating-point elements in vect_t with the supplied values.
+	 *  Return [x1,x2]
+	 */
+	static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2) { return _mm_set_pd(x2, x1); }
+
+	/*
+	 *  Gather double-precision (64-bit) floating-point elements with indexes idx[0], ..., idx[3] from the address p in
+	 *  Gather double-precision (64-bit) floating-point elements with indexes idx[0] and idx[1] from the address p in
+	 *  Return [p[idx[0]], p[idx[1]]]
+	 */
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return _mm_set_pd(p[idx[1]], p[idx[0]]);
+	}
+
+	/*
+	 * Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into vect_t.
+	 * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	 * Return [p[0], p[1]]
+	 */
+	static INLINE PURE vect_t load(const scalar_t *const p) { return _mm_load_pd(p); }
+
+	/*
+	 * Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into vect_t.
+	 * p does not need to be aligned on any particular boundary.
+	 * Return [p[0], p[1]]
+	 */
+	static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm_loadu_pd(p); }
+
+	/*
+	 * Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from p into memory.
+	 * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	 */
+	static INLINE void store(const scalar_t *p, const vect_t v) { _mm_store_pd(const_cast<scalar_t *>(p), v); }
+
+	/*
+	 * Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from p into memory.
+	 * p does not need to be aligned on any particular boundary.
+	 */
+	static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm_storeu_pd(const_cast<scalar_t *>(p), v); }
+
+	/*
+	 * Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory using
+	 * a non-temporal memory hint.
+	 * p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+	 */
+	static INLINE void stream(const scalar_t *p, const vect_t v) { _mm_stream_pd(const_cast<scalar_t *>(p), v); }
+
+	/*
+	* Shuffle double-precision (64-bit) floating-point elements using the control in s,
+	* and store the results in dst.
+	* Args   : [a0, a1] double
+	* Return : [a[s[0]], a[s[1]]] double
+	*/
+#if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS)
+	template<uint8_t s>
+	static INLINE CONST vect_t shuffle(const vect_t a) {
+		return _mm_permute_pd(a, s);
+	}
+#endif
+
+	/*
+	* Unpack and interleave double-precision (64-bit) floating-point elements from the low half of a and b, and store the results in dst.
+	* Args   : [a0, a1] double
+			   [b0, b1] double
+	* Return : [a0, b0] double
+	*/
+	static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { return _mm_unpacklo_pd(a, b); }
+
+	/*
+	* Unpack and interleave double-precision (64-bit) floating-point elements from the high half of a and b, and store the results in dst.
+	* Args   : [a0, a1] double
+			   [b0, b1] double
+	* Return : [a1, b1] double
+	*/
+	static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { return _mm_unpackhi_pd(a, b); }
+
+	/*
+	* Blend packed double-precision (64-bit) floating-point elements from a and b using control mask s,
+	* and store the results in dst.
+	* Args   : [a0, a1] double
+			   [b0, b1] double
+	* Return : [s[0]?b0:a0, s[1]?b1:a1] double
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t blend(const vect_t a, const vect_t b) {
+		return _mm_blend_pd(a, b, s);
+	}
+
+	/*
+	* Blend packed double-precision (64-bit) floating-point elements from a and b using mask,
+	* and store the results in dst.
+	* Args   : [a0, a1] double
+			   [b0, b1] double
+	* Return : [mask[63]?b0:a0, mask[127]?b1:a1] double
+	*/
+	static INLINE CONST vect_t blendv(const vect_t a, const vect_t b, const vect_t mask) {
+		return _mm_blendv_pd(a, b, mask);
+	}
+
+	/*
+	 * Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [a0+b0, a1+b1]
+	 */
+	static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_pd(a, b); }
+
+	static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
+
+	/*
+	 * Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit)
+	 * floating-point elements in a, and store the results in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [a0-b0, a1-b1]
+	 */
+	static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_pd(a, b); }
+
+	static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [a0*b0, a1*b1]
+	 */
+	static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm_mul_pd(a, b); }
+
+	static INLINE CONST vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); }
+
+	/*
+	 * Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b,
+	 * and store the results in dst.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [a0/b0, a1/b1]
+	 */
+	static INLINE CONST vect_t div(const vect_t a, const vect_t b) { return _mm_div_pd(a, b); }
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to
+	 * packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1], [b0, b1], [c0, c1]
+	 * Return : [a0*b0+c0, a1*b1+c1]
+	 */
+	static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) {
 #ifdef __FMA__
-        return _mm_fmadd_pd(a, b, c);
+		return _mm_fmadd_pd(a, b, c);
 #else
-        return add(c, mul(a, b));
+		return add(c, mul(a, b));
 #endif
-    }
+	}
 
-    static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
+	static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
 
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result
-     * to packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1], [b0, b1], [c0, c1]
-     * Return : [-(a0*b0)+c0, -(a1*b1)+c1]
-     */
-    static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) {
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result
+	 * to packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1], [b0, b1], [c0, c1]
+	 * Return : [-(a0*b0)+c0, -(a1*b1)+c1]
+	 */
+	static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) {
 #ifdef __FMA__
-        return _mm_fnmadd_pd(a, b, c);
+		return _mm_fnmadd_pd(a, b, c);
 #else
-        return sub(c, mul(a, b));
+		return sub(c, mul(a, b));
 #endif
-    }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result
-     * to packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1], [b0, b1], [c0, c1]
-     * Return : [-(a0*b0)+c0, -(a1*b1)+c1]
-     */
-    static INLINE CONST vect_t nmadd(const vect_t c, const vect_t a, const vect_t b) { return fnmadd(c, a, b); }
-
-    static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from
-     * the intermediate result, and store the results in vect_t.
-     * Args   : [a0, a1], [b0, b1], [c0, c1]
-     * Return : [a0*b0-c0, a1*b1-c1]
-     */
-    static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) {
+	}
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result
+	 * to packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1], [b0, b1], [c0, c1]
+	 * Return : [-(a0*b0)+c0, -(a1*b1)+c1]
+	 */
+	static INLINE CONST vect_t nmadd(const vect_t c, const vect_t a, const vect_t b) { return fnmadd(c, a, b); }
+
+	static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from
+	 * the intermediate result, and store the results in vect_t.
+	 * Args   : [a0, a1], [b0, b1], [c0, c1]
+	 * Return : [a0*b0-c0, a1*b1-c1]
+	 */
+	static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) {
 #ifdef __FMA__
-        return _mm_fmsub_pd(a, b, c);
+		return _mm_fmsub_pd(a, b, c);
 #else
-        return sub(mul(a, b), c);
+		return sub(mul(a, b), c);
 #endif
-    }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from
-     * the intermediate result, and store the results in vect_t.
-     * Args   : [a0, a1], [b0, b1], [c0, c1]
-     * Return : [a0*b0-c0, a1*b1-c1]
-     */
-    static INLINE CONST vect_t msub(const vect_t c, const vect_t a, const vect_t b) { return fmsub(c, a, b); }
-
-    static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
-
-    /*
-     * Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results
-     in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [(a0==b0) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a1==b1) ? 0xFFFFFFFFFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_pd(a, b); }
-
-    /*
-     * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser-than, and store the
-     results in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [(a0<b0) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a1<b1) ? 0xFFFFFFFFFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm_cmplt_pd(a, b); }
-
-    /*
-     * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser or equal than, and store
-     the results in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [(a0<=b0) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a1<=b1) ? 0xFFFFFFFFFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return _mm_cmple_pd(a, b); }
-
-    /*
-     * Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than, and store the
-     results in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [(a0>b0) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a1>b1) ? 0xFFFFFFFFFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm_cmpgt_pd(a, b); }
-
-    /*
-     * Compare packed double-precision (64-bit) floating-point elements in a and b for greater or equal than, and store
-     the results in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [(a0>=b0) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm_cmpge_pd(a, b); }
-
-    /*
-     * Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [a0 AND b0, a1 AND b1]
-     */
-    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm_and_pd(a, b); }
-
-    /*
-     * Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [a0 OR b0, a1 OR b1]
-     */
-    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm_or_pd(a, b); }
-
-    /*
-     * Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [a0 XOR b0, a1 XOR b1]
-     */
-    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm_xor_pd(a, b); }
-
-    /*
-     * Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [a0 AND NOT b0, a1 AND NOT b1]
-     */
-    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm_andnot_pd(a, b); }
-
-    /*
-     * Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the
-     * results as packed double-precision floating-point elements in vect_t.
-     * Args   : [a0, a1]
-     * Return : [floor(a0), floor(a1)]
-     */
-    static INLINE CONST vect_t floor(const vect_t a) { return _mm_floor_pd(a); }
-
-    /*
-     * Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the
-     * results as packed double-precision floating-point elements in vect_t.
-     * Args   : [a0, a1]
-     * Return : [ceil(a0), ceil(a1)]
-     */
-    static INLINE CONST vect_t ceil(const vect_t a) { return _mm_ceil_pd(a); }
-
-    /*
-     * Round the packed double-precision (64-bit) floating-point elements in a, and store the results as packed
-     * double-precision floating-point elements in vect_t.
-     * Args   : [a0, a1]
-     * Return : [round(a0), round(a1)]
-     */
-    static INLINE CONST vect_t round(const vect_t a) {
-        return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    }
-
-    /*
-     * Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the
-     * results in vect_t.
-     * Args   : [a0, a1], [b0, b1]
-     * Return : [a0+a1, b0+b1]
-     */
-    static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm_hadd_pd(a, b); }
-
-    /*
-     * Horizontally add double-precision (64-bit) floating-point elements in a.
-     * Args   : [a0, a1]
-     * Return : a0+a1
-     */
-    static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
-        return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1];
-    }
-
-    static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
-                             const vect_t &MAX, vect_t &Q, vect_t &T) {
-        FLOAT_MOD(C, P, INVP, Q);
-        NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
-
-        return C;
-    }
-
-#else // __AVX__
+	}
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from
+	 * the intermediate result, and store the results in vect_t.
+	 * Args   : [a0, a1], [b0, b1], [c0, c1]
+	 * Return : [a0*b0-c0, a1*b1-c1]
+	 */
+	static INLINE CONST vect_t msub(const vect_t c, const vect_t a, const vect_t b) { return fmsub(c, a, b); }
+
+	static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
+
+	/*
+	 * Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results
+	 in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [(a0==b0) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a1==b1) ? 0xFFFFFFFFFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_pd(a, b); }
+
+	/*
+	 * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser-than, and store the
+	 results in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [(a0<b0) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a1<b1) ? 0xFFFFFFFFFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm_cmplt_pd(a, b); }
+
+	/*
+	 * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser or equal than, and store
+	 the results in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [(a0<=b0) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a1<=b1) ? 0xFFFFFFFFFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return _mm_cmple_pd(a, b); }
+
+	/*
+	 * Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than, and store the
+	 results in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [(a0>b0) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a1>b1) ? 0xFFFFFFFFFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm_cmpgt_pd(a, b); }
+
+	/*
+	 * Compare packed double-precision (64-bit) floating-point elements in a and b for greater or equal than, and store
+	 the results in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [(a0>=b0) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm_cmpge_pd(a, b); }
+
+	/*
+	 * Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [a0 AND b0, a1 AND b1]
+	 */
+	static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm_and_pd(a, b); }
+
+	/*
+	 * Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [a0 OR b0, a1 OR b1]
+	 */
+	static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm_or_pd(a, b); }
+
+	/*
+	 * Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [a0 XOR b0, a1 XOR b1]
+	 */
+	static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm_xor_pd(a, b); }
+
+	/*
+	 * Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [a0 AND NOT b0, a1 AND NOT b1]
+	 */
+	static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm_andnot_pd(a, b); }
+
+	/*
+	 * Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the
+	 * results as packed double-precision floating-point elements in vect_t.
+	 * Args   : [a0, a1]
+	 * Return : [floor(a0), floor(a1)]
+	 */
+	static INLINE CONST vect_t floor(const vect_t a) { return _mm_floor_pd(a); }
+
+	/*
+	 * Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the
+	 * results as packed double-precision floating-point elements in vect_t.
+	 * Args   : [a0, a1]
+	 * Return : [ceil(a0), ceil(a1)]
+	 */
+	static INLINE CONST vect_t ceil(const vect_t a) { return _mm_ceil_pd(a); }
+
+	/*
+	 * Round the packed double-precision (64-bit) floating-point elements in a, and store the results as packed
+	 * double-precision floating-point elements in vect_t.
+	 * Args   : [a0, a1]
+	 * Return : [round(a0), round(a1)]
+	 */
+	static INLINE CONST vect_t round(const vect_t a) {
+		return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+	}
+
+	/*
+	 * Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the
+	 * results in vect_t.
+	 * Args   : [a0, a1], [b0, b1]
+	 * Return : [a0+a1, b0+b1]
+	 */
+	static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm_hadd_pd(a, b); }
+
+	/*
+	 * Horizontally add double-precision (64-bit) floating-point elements in a.
+	 * Args   : [a0, a1]
+	 * Return : a0+a1
+	 */
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1];
+	}
+
+	static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
+							 const vect_t &MAX, vect_t &Q, vect_t &T) {
+		FLOAT_MOD(C, P, INVP, Q);
+		NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
+
+		return C;
+	}
+
+#else // __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS
 #error "You need SSE instructions to perform 128bits operations on double"
 #endif
 };
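
For orientation, the double specialization patched above is meant to be driven in exactly the load / fmadd / hadd_to_scal sequence its comments describe. The following stand-alone sketch shows that pattern as a two-lane dot product written directly against the SSE/FMA intrinsics named in the hunk; it assumes only <immintrin.h>, 16-byte-aligned inputs and an even length, and the helper name dot2 is illustrative rather than part of FFLAS-FFPACK.

    // Sketch: dot product of two 16-byte-aligned double arrays (n even),
    // mirroring zero() / load() / fmadd() / hadd_to_scal() from the wrapper above.
    #include <immintrin.h>
    #include <cstddef>

    static double dot2(const double *x, const double *y, std::size_t n) {
        __m128d acc = _mm_setzero_pd();                 // zero(): [0, 0]
        for (std::size_t i = 0; i < n; i += 2) {
            __m128d a = _mm_load_pd(x + i);             // load(): pointers must be 16-byte aligned
            __m128d b = _mm_load_pd(y + i);
    #ifdef __FMA__
            acc = _mm_fmadd_pd(a, b, acc);              // fmadd(): acc = a*b + acc, fused
    #else
            acc = _mm_add_pd(acc, _mm_mul_pd(a, b));    // the non-FMA fallback: add(c, mul(a, b))
    #endif
        }
        // hadd_to_scal(): reduce both lanes to one scalar
        __m128d hi = _mm_unpackhi_pd(acc, acc);         // [acc1, acc1]
        return _mm_cvtsd_f64(_mm_add_sd(acc, hi));      // acc0 + acc1
    }
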
diff --git a/fflas-ffpack/fflas/fflas_simd/simd128_float.inl b/fflas-ffpack/fflas/fflas_simd/simd128_float.inl
index 04315a1..7e96a38 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd128_float.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd128_float.inl
@@ -1,5 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
@@ -34,340 +34,400 @@
  * Simd128 specialized for float
  */
 template <> struct Simd128_impl<true, false, true, 4> {
-#if defined(__FFLASFFPACK_USE_SIMD)
-
-    /*
-     * alias to 128 bit simd register
-     */
-    using vect_t = __m128;
-
-    /*
-     * define the scalar type corresponding to the specialization
-     */
-    using scalar_t = float;
-
-    /*
-     *  number of scalar_t in a simd register
-     */
-    static const constexpr size_t vect_size = 4;
-
-    /*
-     *  alignement required by scalar_t pointer to be loaded in a vect_t
-     */
-    static const constexpr size_t alignment = 16;
-
-    /*
-     * Check if the pointer p is a multiple of alignemnt
-     */
-    template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
-
-    /*
-     * Check if the number n is a multiple of vect_size
-     */
-    template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
-
-    /*
-     *  Return vector of type vect_t with all elements set to zero
-     *  Return [0,0,0,0]
-     */
-    static INLINE CONST vect_t zero() { return _mm_setzero_ps(); }
-
-    /*
-     *  Broadcast single-precision (32-bit) floating-point value x to all elements of vect_t.
-     *  Return [x,x,x,x]
-     */
-    static INLINE CONST vect_t set1(const scalar_t x) {
-#ifdef __AVX__
-        // return _mm_broadcast_ss(&x);
-        return _mm_set1_ps(x);
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
+
+	/*
+	 * alias to 128 bit simd register
+	 */
+	using vect_t = __m128;
+
+	/*
+	 * define the scalar type corresponding to the specialization
+	 */
+	using scalar_t = float;
+
+	/*
+	 *  number of scalar_t in a simd register
+	 */
+	static const constexpr size_t vect_size = 4;
+
+	/*
+	 *  alignment required for a scalar_t pointer to be loaded into a vect_t
+	 */
+	static const constexpr size_t alignment = 16;
+
+	/*
+	 * Check if the pointer p is a multiple of alignment
+	 */
+	template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
+
+	/*
+	 * Check if the number n is a multiple of vect_size
+	 */
+	template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
+
+	/*
+	 *  Return vector of type vect_t with all elements set to zero
+	 *  Return [0,0,0,0]
+	 */
+	static INLINE CONST vect_t zero() { return _mm_setzero_ps(); }
+
+	/*
+	 *  Broadcast single-precision (32-bit) floating-point value x to all elements of vect_t.
+	 *  Return [x,x,x,x]
+	 */
+	static INLINE CONST vect_t set1(const scalar_t x) {
+#ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS
+		// return _mm_broadcast_ss(&x);
+		return _mm_set1_ps(x);
 #else
-        return _mm_set1_ps(x);
+		return _mm_set1_ps(x);
 #endif
-    }
-
-    /*
-     *  Set packed single-precision (32-bit) floating-point elements in vect_t with the supplied values.
-     *  Return [x1,x2,x3,x4]
-     */
-    static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4) {
-        return _mm_set_ps(x4, x3, x2, x1);
-    }
-
-    /*
-     *  Gather single-precision (32-bit) floating-point elements with indexes idx[0], ..., idx[3] from the address p in
-     * vect_t.
-     *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]]
-     */
-    template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
-        return _mm_set_ps(p[idx[3]], p[idx[2]], p[idx[1]], p[idx[0]]);
-    }
-
-    /*
-     * Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into vect_t.
-     * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
-     * Return [p[0], p[1], p[2], p[3]]
-     */
-    static INLINE PURE vect_t load(const scalar_t *const p) { return _mm_load_ps(p); }
-
-    /*
-     * Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into vect_t.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0], p[1], p[2], p[3]]
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm_loadu_ps(p); }
-
-    /*
-     * Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory.
-     * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, const vect_t v) { _mm_store_ps(const_cast<scalar_t *>(p), v); }
-
-    /*
-     * Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory.
-     * p does not need to be aligned on any particular boundary.
-     */
-    static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm_storeu_ps(const_cast<scalar_t *>(p), v); }
-
-    /*
-     * Store 128-bits (composed of 4 packed double-precision (32-bit) floating-point elements) from a into memory using
-     * a non-temporal memory hint.
-     * p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
-     */
-    static INLINE void stream(const scalar_t *p, const vect_t v) { _mm_stream_ps(const_cast<scalar_t *>(p), v); }
-
-    /*
-     * Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0+b0, a1+b1, a2+b2, a3+b3]
-     */
-    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_ps(a, b); }
-
-    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
-
-    /*
-     * Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit)
-     * floating-point elements in a, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0-b0, a1-b1, a2-b2, a3-b3]
-     */
-    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_ps(a, b); }
-
-    static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0*b0, a1*b1, a2*b2, a3*b3]
-     */
-    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm_mul_ps(a, b); }
-
-    static INLINE CONST vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to
-     * packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
-     * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3]
-     */
-    static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) {
+	}
+
+	/*
+	 *  Set packed single-precision (32-bit) floating-point elements in vect_t with the supplied values.
+	 *  Return [x1,x2,x3,x4]
+	 */
+	static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4) {
+		return _mm_set_ps(x4, x3, x2, x1);
+	}
+
+	/*
+	 *  Gather single-precision (32-bit) floating-point elements with indexes idx[0], ..., idx[3] from the address p in
+	 * vect_t.
+	 *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]]
+	 */
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return _mm_set_ps(p[idx[3]], p[idx[2]], p[idx[1]], p[idx[0]]);
+	}
+
+	/*
+	 * Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into vect_t.
+	 * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	 * Return [p[0], p[1], p[2], p[3]]
+	 */
+	static INLINE PURE vect_t load(const scalar_t *const p) { return _mm_load_ps(p); }
+
+	/*
+	 * Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into vect_t.
+	 * p does not need to be aligned on any particular boundary.
+	 * Return [p[0], p[1], p[2], p[3]]
+	 */
+	static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm_loadu_ps(p); }
+
+	/*
+	 * Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory.
+	 * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	 */
+	static INLINE void store(const scalar_t *p, const vect_t v) { _mm_store_ps(const_cast<scalar_t *>(p), v); }
+
+	/*
+	 * Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory.
+	 * p does not need to be aligned on any particular boundary.
+	 */
+	static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm_storeu_ps(const_cast<scalar_t *>(p), v); }
+
+	/*
+	 * Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory using
+	 * a non-temporal memory hint.
+	 * p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+	 */
+	static INLINE void stream(const scalar_t *p, const vect_t v) { _mm_stream_ps(const_cast<scalar_t *>(p), v); }
+
+	/*
+	* Shuffle single-precision (32-bit) floating-point elements in a using the control in s,
+	* and store the results in dst.
+	* Args   :	[a0, a1, a2, a3] float
+	* Return :	[a[s[0..1]], ..., a[s[6..7]]] float
+	*/
+#if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS)
+	template<uint8_t s>
+	static INLINE CONST vect_t shuffle(const vect_t a) {
+		return _mm_permute_ps(a, s);
+	}
+#endif
+
+	/*
+	* Unpack and interleave single-precision (32-bit) floating-point elements from the low half of a and b, and store the results in dst.
+	* Args   : [a0, a1, a2, a3] float
+			   [b0, b1, b2, b3] float
+	* Return : [a0, b0, a1, b1] float
+	*/
+	static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { return _mm_unpacklo_ps(a, b); }
+
+	/*
+	* Unpack and interleave single-precision (32-bit) floating-point elements from the high half of a and b, and store the results in dst.
+	* Args   : [a0, a1, a2, a3] float
+			   [b0, b1, b2, b3] float
+	* Return : [a2, b2, a3, b3] float
+	*/
+	static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { return _mm_unpackhi_ps(a, b); }
+
+	/*
+	* Blend packed single-precision (32-bit) floating-point elements from a and b using control mask s,
+	* and store the results in dst.
+	* Args   : [a0, a1, a2, a3] float
+			   [b0, b1, b2, b3] float
+	* Return : [s[0]?b0:a0, ..., s[3]?b3:a3] float
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t blend(const vect_t a, const vect_t b) {
+		return _mm_blend_ps(a, b, s);
+	}
+
+	/*
+	* Blend packed single-precision (32-bit) floating-point elements from a and b using mask,
+	* and store the results in dst.
+	* Args   : [a0, a1, a2, a3] float
+			   [b0, b1, b2, b3] float
+	* Return : [mask[31]?b0:a0, ..., mask[127]?b3:a3] float
+	*/
+	static INLINE CONST vect_t blendv(const vect_t a, const vect_t b, const vect_t mask) {
+		return _mm_blendv_ps(a, b, mask);
+	}
+
+	/*
+	 * Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0+b0, a1+b1, a2+b2, a3+b3]
+	 */
+	static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_ps(a, b); }
+
+	static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
+
+	/*
+	 * Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit)
+	 * floating-point elements in a, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0-b0, a1-b1, a2-b2, a3-b3]
+	 */
+	static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_ps(a, b); }
+
+	static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0*b0, a1*b1, a2*b2, a3*b3]
+	 */
+	static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm_mul_ps(a, b); }
+
+	static INLINE CONST vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); }
+
+	/*
+	 * Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b,
+	 * and store the results in dst.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3] float
+	 * Return : [a0/b0, a1/b1, a2/b2, a3/b3] float
+	 */
+	static INLINE CONST vect_t div(const vect_t a, const vect_t b) { return _mm_div_ps(a, b); }
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to
+	 * packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
+	 * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3]
+	 */
+	static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) {
 #ifdef __FMA__
-        return _mm_fmadd_ps(a, b, c);
+		return _mm_fmadd_ps(a, b, c);
 #else
-        return add(c, mul(a, b));
+		return add(c, mul(a, b));
 #endif
-    }
+	}
 
-    static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
+	static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
 
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to
-     * packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
-     * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4]
-     */
-    static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) {
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result
+	 * to packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
+	 * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3]
+	 */
+	static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) {
 #ifdef __FMA__
-        return _mm_fnmadd_ps(a, b, c);
+		return _mm_fnmadd_ps(a, b, c);
 #else
-        return sub(c, mul(a, b));
+		return sub(c, mul(a, b));
 #endif
-    }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to
-     * packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
-     * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4]
-     */
-    static INLINE CONST vect_t nmadd(const vect_t c, const vect_t a, const vect_t b) { return fnmadd(c, a, b); }
-
-    static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from
-     * the intermediate result, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
-     * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3]
-     */
-    static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) {
+	}
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result
+	 * to packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
+	 * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3]
+	 */
+	static INLINE CONST vect_t nmadd(const vect_t c, const vect_t a, const vect_t b) { return fnmadd(c, a, b); }
+
+	static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from
+	 * the intermediate result, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
+	 * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3]
+	 */
+	static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) {
 #ifdef __FMA__
-        return _mm_fmsub_ps(a, b, c);
+		return _mm_fmsub_ps(a, b, c);
 #else
-        return sub(mul(a, b), c);
+		return sub(mul(a, b), c);
 #endif
-    }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from
-     * the intermediate result, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
-     * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3]
-     */
-    static INLINE CONST vect_t msub(const vect_t c, const vect_t a, const vect_t b) { return fmsub(c, a, b); }
-
-    static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
-
-    /*
-     * Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results
-     in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [(a0==b0) ? 0xFFFFFFFF : 0,
-     (a1==b1) ? 0xFFFFFFFF : 0,
-     (a2==b2) ? 0xFFFFFFFF : 0,
-     (a3==b3) ? 0xFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_ps(a, b); }
-
-    /*
-     * Compare packed single-precision (32-bit) floating-point elements in a and b for lesser-than, and store the
-     results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [(a0<b0) ? 0xFFFFFFFF : 0,
-     (a1<b1) ? 0xFFFFFFFF : 0,
-     (a2<b2) ? 0xFFFFFFFF : 0,
-     (a3<b3) ? 0xFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm_cmplt_ps(a, b); }
-
-    /*
-     * Compare packed single-precision (32-bit) floating-point elements in a and b for lesser or equal than, and store
-     the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [(a0<=b0) ? 0xFFFFFFFF : 0,
-     (a1<=b1) ? 0xFFFFFFFF : 0,
-     (a2<=b2) ? 0xFFFFFFFF : 0,
-     (a3<=b3) ? 0xFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return _mm_cmple_ps(a, b); }
-
-    /*
-     * Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the
-     results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [(a0>b0) ? 0xFFFFFFFF : 0,
-     (a1>b1) ? 0xFFFFFFFF : 0,
-     (a2>b2) ? 0xFFFFFFFF : 0,
-     (a3>b3) ? 0xFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm_cmpgt_ps(a, b); }
-
-    /*
-     * Compare packed single-precision (32-bit) floating-point elements in a and b for greater or equal than, and store
-     the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [(a0>=b0) ? 0xFFFFFFFF : 0,
-     (a1>=b1) ? 0xFFFFFFFF : 0,
-     (a2>=b2) ? 0xFFFFFFFF : 0,
-     (a3>=b3) ? 0xFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm_cmpge_ps(a, b); }
-
-    /*
-     * Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3]
-     */
-    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm_and_ps(a, b); }
-
-    /*
-     * Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3]
-     */
-    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm_or_ps(a, b); }
-
-    /*
-     * Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3]
-     */
-    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm_xor_ps(a, b); }
-
-    /*
-     * Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0 ANDNOT b0, a1 ANDNOT b1, a2 ANDNOT b2, a3 ANDNOT b3]
-     */
-    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm_andnot_ps(a, b); }
-
-    /*
-     * Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the
-     * results as packed double-precision floating-point elements in vect_t.
-     * Args   : [a0, a1, a2, a3]
-     * Return : [floor(a0), floor(a1), floor(a2), floor(a3)]
-     */
-    static INLINE CONST vect_t floor(const vect_t a) { return _mm_floor_ps(a); }
-
-    /*
-     * Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the
-     * results as packed single-precision floating-point elements in vect_t.
-     * Args   : [a0, a1, a2, a3]
-     * Return : [ceil(a0), ceil(a1), ceil(a2), ceil(a3)]
-     */
-    static INLINE CONST vect_t ceil(const vect_t a) { return _mm_ceil_ps(a); }
-
-    /*
-     * Round the packed single-precision (32-bit) floating-point elements in a, and store the results as packed
-     * single-precision floating-point elements in vect_t.
-     * Args   : [a0, a1, a2, a3]
-     * Return : [round(a0), round(a1), round(a2), round(a3)]
-     */
-    static INLINE CONST vect_t round(const vect_t a) {
-        return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    }
-
-    /*
-     * Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0+a1, b0+b1, a2+a3, b2+b3]
-     */
-    static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm_hadd_ps(a, b); }
-
-    /*
-     * Horizontally add single-precision (32-bit) floating-point elements in a.
-     * Args   : [a0, a1, a2, a3]
-     * Return : a0+a1+a2+a3
-     */
-    static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
-        return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1] + ((const scalar_t *)&a)[2] +
-               ((const scalar_t *)&a)[3];
-    }
-
-    static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
-                             const vect_t &MAX, vect_t &Q, vect_t &T) {
-        FLOAT_MOD(C, P, INVP, Q);
-        NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
-
-        return C;
-    }
-#else // __AVX__
+	}
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from
+	 * the intermediate result, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
+	 * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3]
+	 */
+	static INLINE CONST vect_t msub(const vect_t c, const vect_t a, const vect_t b) { return fmsub(c, a, b); }
+
+	static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
+
+	/*
+	 * Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results
+	 in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [(a0==b0) ? 0xFFFFFFFF : 0,
+	 (a1==b1) ? 0xFFFFFFFF : 0,
+	 (a2==b2) ? 0xFFFFFFFF : 0,
+	 (a3==b3) ? 0xFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_ps(a, b); }
+
+	/*
+	 * Compare packed single-precision (32-bit) floating-point elements in a and b for lesser-than, and store the
+	 results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [(a0<b0) ? 0xFFFFFFFF : 0,
+	 (a1<b1) ? 0xFFFFFFFF : 0,
+	 (a2<b2) ? 0xFFFFFFFF : 0,
+	 (a3<b3) ? 0xFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm_cmplt_ps(a, b); }
+
+	/*
+	 * Compare packed single-precision (32-bit) floating-point elements in a and b for lesser or equal than, and store
+	 the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [(a0<=b0) ? 0xFFFFFFFF : 0,
+	 (a1<=b1) ? 0xFFFFFFFF : 0,
+	 (a2<=b2) ? 0xFFFFFFFF : 0,
+	 (a3<=b3) ? 0xFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return _mm_cmple_ps(a, b); }
+
+	/*
+	 * Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the
+	 results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [(a0>b0) ? 0xFFFFFFFF : 0,
+	 (a1>b1) ? 0xFFFFFFFF : 0,
+	 (a2>b2) ? 0xFFFFFFFF : 0,
+	 (a3>b3) ? 0xFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm_cmpgt_ps(a, b); }
+
+	/*
+	 * Compare packed single-precision (32-bit) floating-point elements in a and b for greater or equal than, and store
+	 the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [(a0>=b0) ? 0xFFFFFFFF : 0,
+	 (a1>=b1) ? 0xFFFFFFFF : 0,
+	 (a2>=b2) ? 0xFFFFFFFF : 0,
+	 (a3>=b3) ? 0xFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm_cmpge_ps(a, b); }
+
+	/*
+	 * Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3]
+	 */
+	static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm_and_ps(a, b); }
+
+	/*
+	 * Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3]
+	 */
+	static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm_or_ps(a, b); }
+
+	/*
+	 * Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3]
+	 */
+	static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm_xor_ps(a, b); }
+
+	/*
+	 * Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0 ANDNOT b0, a1 ANDNOT b1, a2 ANDNOT b2, a3 ANDNOT b3]
+	 */
+	static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm_andnot_ps(a, b); }
+
+	/*
+	 * Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the
+	 * results as packed single-precision floating-point elements in vect_t.
+	 * Args   : [a0, a1, a2, a3]
+	 * Return : [floor(a0), floor(a1), floor(a2), floor(a3)]
+	 */
+	static INLINE CONST vect_t floor(const vect_t a) { return _mm_floor_ps(a); }
+
+	/*
+	 * Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the
+	 * results as packed single-precision floating-point elements in vect_t.
+	 * Args   : [a0, a1, a2, a3]
+	 * Return : [ceil(a0), ceil(a1), ceil(a2), ceil(a3)]
+	 */
+	static INLINE CONST vect_t ceil(const vect_t a) { return _mm_ceil_ps(a); }
+
+	/*
+	 * Round the packed single-precision (32-bit) floating-point elements in a, and store the results as packed
+	 * single-precision floating-point elements in vect_t.
+	 * Args   : [a0, a1, a2, a3]
+	 * Return : [round(a0), round(a1), round(a2), round(a3)]
+	 */
+	static INLINE CONST vect_t round(const vect_t a) {
+		return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+	}
+
+	/*
+	 * Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0+a1, b0+b1, a2+a3, b2+b3]
+	 */
+	static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm_hadd_ps(a, b); }
+
+	/*
+	 * Horizontally add single-precision (32-bit) floating-point elements in a.
+	 * Args   : [a0, a1, a2, a3]
+	 * Return : a0+a1+a2+a3
+	 */
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1] + ((const scalar_t *)&a)[2] +
+				((const scalar_t *)&a)[3];
+	}
+
+	static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
+							 const vect_t &MAX, vect_t &Q, vect_t &T) {
+		FLOAT_MOD(C, P, INVP, Q);
+		NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
+
+		return C;
+	}
+#else // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 #error "You need SSE instructions to perform 128bits operations on double"
-#endif // __FFLASFFPACK_USE_SIMD
+#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 };
 
 #endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_float_INL
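
The compare and blendv entries added for the float specialization compose in the usual mask-select idiom: build an all-ones/all-zeros mask with a comparison, then pick lanes with blendv. Here is a minimal sketch of that idiom, written against the same SSE4.1 intrinsics the hunk uses (_mm_cmplt_ps, _mm_blendv_ps); the helper name clamp_upper and the min-style example are illustrative, not library API.

    // Sketch: elementwise min(x, bound) for four floats, i.e. lesser() feeding blendv().
    #include <smmintrin.h>   // SSE4.1: _mm_blendv_ps (pulls in the earlier SSE headers)

    static __m128 clamp_upper(__m128 x, __m128 bound) {
        __m128 mask = _mm_cmplt_ps(x, bound);   // lesser(): 0xFFFFFFFF where x_i < bound_i
        // blendv(a, b, mask) keeps b_i where the mask sign bit is set and a_i otherwise,
        // so (bound, x, mask) yields x_i where x_i < bound_i and bound_i elsewhere.
        return _mm_blendv_ps(bound, x, mask);
    }
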
diff --git a/fflas-ffpack/fflas/fflas_simd/simd128_int16.inl b/fflas-ffpack/fflas/fflas_simd/simd128_int16.inl
index 924cc57..d06a16c 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd128_int16.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd128_int16.inl
@@ -1,10 +1,11 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
  * Written by   Bastien Vialla<bastien.vialla at lirmm.fr>
  * Brice Boyer (briceboyer) <boyer.brice at gmail.com>
+ * Romain Lebreton <romain.lebreton at lirmm.fr>
  *
  *
  * ========LICENCE========
@@ -30,396 +31,555 @@
 #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_int16_INL
 #define __FFLASFFPACK_fflas_ffpack_utils_simd128_int16_INL
 
+#ifndef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
+#error "You need SSE instructions to perform 128 bits operations on int16"
+#endif
+
 /*
  * Simd128 specialized for int16_t
  */
-template <> struct Simd128_impl<true, true, true, 2> {
-#if defined(__FFLASFFPACK_USE_SIMD)
-    /*
-     * alias to 128 bit simd register
-     */
-    using vect_t = __m128i;
-
-    /*
-     * define the scalar type corresponding to the specialization
-     */
-    using scalar_t = int16_t;
-
-    /*
-     *  number of scalar_t in a simd register
-     */
-    static const constexpr size_t vect_size = 8;
-
-    /*
-     *  alignement required by scalar_t pointer to be loaded in a vect_t
-     */
-    static const constexpr size_t alignment = 16;
-
-    /*
-     * Check if the pointer p is a multiple of alignemnt
-     */
-    template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
-
-    /*
-     * Check if the number n is a multiple of vect_size
-     */
-    template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
-
-    /*
-     * Converter from vect_t to a tab.
-     * exple:
-     *      Converter conv;
-     *      conv.v = a;
-     *      scalart_t x = conv.t[1]
-     */
-    union Converter {
-        vect_t v;
-        scalar_t t[vect_size];
-    };
-
-    /*
-     *  Return vector of type vect_t with all elements set to zero
-     *  Return [0,0,0,0,0,0,0,0] int16_t
-     */
-    static INLINE CONST vect_t zero() { return _mm_setzero_si128(); }
-
-    /*
-     *  Broadcast 16-bit integer a to all all elements of dst. This intrinsic may generate the vpbroadcastw.
-     *  Return [x,x,x,x,x,x,x,x] int16_t
-     */
-    static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi16(x); }
-
-    /*
-     *  Broadcast 16-bit integer a to all all elements of dst. This intrinsic may generate the vpbroadcastw.
-     *  Return [x0,x1,x2,x3,x4,x5,x6,x7] int16_t
-     */
-    static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3,
-                                   const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7) {
-        return _mm_set_epi16(x7, x6, x5, x4, x3, x2, x1, x0);
-    }
-
-    /*
-     *  Gather 16-bit integer elements with indexes idx[0], ..., idx[7] from the address p in vect_t.
-     *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]],
-     p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]] int16_t
-     */
-    template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
-        return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]);
-    }
-
-    /*
-     * Load 128-bits of integer data from memory into dst.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int16_t
-     */
-    static INLINE PURE vect_t load(const scalar_t *const p) {
-        return _mm_load_si128(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Load 128-bits of integer data from memory into dst.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int16_t
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) {
-        return _mm_loadu_si128(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Store 128-bits of integer data from a into memory.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, vect_t v) {
-        _mm_store_si128(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 128-bits of integer data from a into memory.
-     * p does not need to be aligned on any particular boundary.
-     */
-    static INLINE void storeu(const scalar_t *p, vect_t v) {
-        _mm_storeu_si128(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 128-bits of integer data from a into memory using a non-temporal memory hint.
-     * p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
-     */
-    // static INLINE void stream(const scalar_t *p, const vect_t v) { _mm_stream_si128(const_cast<scalar_t *>(p), v); }
-
-     /*
-     * Shift packed 32-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int16_t
-     * Return : [a0 << s, a1 << s, a2 << s, a3 << s, a4 << s, a5 << s, a6 << s, a7 << s] int16_t
-     */
-    static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm_slli_epi16(a, s); }
-
-    /*
-     * Shift packed 32-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int16_t
-     * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s] int16_t
-     */
-    static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm_srli_epi16(a, s); }
-
-
-    static INLINE CONST vect_t sra(const vect_t a, const scalar_t s) { return _mm_sra_epi16(a, set1(s)); }
-
-    /*
-     * Add packed 16-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int16_t
-     * Return : [a0+b0, a1+b1, a2+b2, a3+b3, a4+b4, a5+b5, a6+b6, a7+b7]   int16_t
-     */
-    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_epi16(a, b); }
-
-    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
-
-    /*
-     * Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int16_t
-     * Return : [a0-b0, a1-b1, a2-b2, a3-b3, a4-b4, a5-b5, a6-b6, a7-b7]  int16_t
-     */
-    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_epi16(a, b); }
-
-    static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
-
-    /*
-     * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits
-     of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]           int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7]           int16_t
-     * Return : [a0*b0 mod 2^16-1, a1*b1 mod 2^16-1, a2*b2 mod 2^16-1, a3*b3 mod 2^16-1,
-     a4*b4 mod 2^16-1, a5*b5 mod 2^16-1, a6*b6 mod 2^16-1, a7*b7 mod 2^16-1] int16_t
-     */
-    static INLINE CONST vect_t mullo(const vect_t a, const vect_t b) { return _mm_mullo_epi16(a, b); }
-
-    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); }
-
-    /*
-     * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16
-     bits of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int16_t
-     * Return :
-     */
-    static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { return _mm_mulhi_epi16(a, b); }
-
-    /*
-     * Multiply the low 8-bit integers from each packed 16-bit element in a and b, and store the signed 16-bit results
-     in dst.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]    int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7]    int16_t
-     * Return : [a0*b0, a1*b1, a2*b2, a3*b3, a4*b4, a5*b5, a6*b6, a7*b7]    int16_t
-     */
-    static INLINE CONST vect_t mulx(vect_t a, vect_t b) {
-        vect_t mask = set1(0x00FF); // ???
-        a = vand(a, mask);
-        b = vand(b, mask);
-        return _mm_mullo_epi16(a, b);
-    }
-
-    /*
-     *
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]    int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7]    int16_t
-     [c0, c1, c2, c3, c4, c5, c6, c7]    int16_t
-     * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4, a5*b5+c5, a6*b6+c6, a7*b7+c7]    int16_t
-     */
-    static INLINE CONST vect_t fmaddx(vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
-
-    static INLINE CONST vect_t fmadd(vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); }
-
-    /*
-     * Compare packed 16-bits in a and b for equality, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int16_t
-     * Return : [(a0==b0) ? 0xFFFF : 0, (a1==b1) ? 0xFFFF : 0,
-     (a2==b2) ? 0xFFFF : 0, (a3==b3) ? 0xFFFF : 0,
-     (a4==b4) ? 0xFFFF : 0, (a5==b5) ? 0xFFFF : 0,
-     (a6==b6) ? 0xFFFF : 0, (a7==b7) ? 0xFFFF : 0]                     int16_t
-     */
-    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_epi16(a, b); }
-
-    /*
-     * Compare packed 16-bits in a and b for greater-than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int16_t
-     * Return : [(a0>b0) ? 0xFFFF : 0, (a1>b1) ? 0xFFFF : 0,
-     (a2>b2) ? 0xFFFF : 0, (a3>b3) ? 0xFFFF : 0,
-     (a4>b4) ? 0xFFFF : 0, (a5>b5) ? 0xFFFF : 0,
-     (a6>b6) ? 0xFFFF : 0, (a7>b7) ? 0xFFFF : 0]                      int16_t
-     */
-    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm_cmpgt_epi16(a, b); }
-
-    /*
-     * Compare packed 16-bits in a and b for lesser-than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int16_t
-     * Return : [(a0<b0) ? 0xFFFF : 0, (a1<b1) ? 0xFFFF : 0,
-     (a2<b2) ? 0xFFFF : 0, (a3<b3) ? 0xFFFF : 0,
-     (a4<b4) ? 0xFFFF : 0, (a5<b5) ? 0xFFFF : 0,
-     (a6<b6) ? 0xFFFF : 0, (a7<b7) ? 0xFFFF : 0]                      int16_t
-     */
-    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm_cmpgt_epi16(b, a); }
-
-    /*
-     * Compare packed 16-bits in a and b for greater or equal than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int16_t
-     * Return : [(a0>=b0) ? 0xFFFF : 0, (a1>=b1) ? 0xFFFF : 0,
-     (a2>=b2) ? 0xFFFF : 0, (a3>=b3) ? 0xFFFF : 0,
-     (a4>=b4) ? 0xFFFF : 0, (a5>=b5) ? 0xFFFF : 0,
-     (a6>=b6) ? 0xFFFF : 0, (a7>=b7) ? 0xFFFF : 0]                    int16_t
-     */
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
-
-    /*
-     * Compare packed 16-bits in a and b for lesser or equal than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int16_t
-     * Return : [(a0<=b0) ? 0xFFFF : 0, (a1<=b1) ? 0xFFFF : 0,
-     (a2<=b2) ? 0xFFFF : 0, (a3<=b3) ? 0xFFFF : 0,
-     (a4<=b4) ? 0xFFFF : 0, (a5<=b5) ? 0xFFFF : 0,
-     (a6<=b6) ? 0xFFFF : 0, (a7<=b7) ? 0xFFFF : 0]                     int16_t
-     */
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
-
-    /*
-     * Compute the bitwise AND of packed 16-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3, a4 AND b4, a5 AND b5, a6 AND b6, a7 AND b7]
-     */
-    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm_and_si128(b, a); }
-
-    /*
-     * Compute the bitwise OR of packed 16-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3, a4 OR b4, a5 OR b5, a6 OR b6, a7 OR b7]
-     */
-    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm_or_si128(b, a); }
-
-    /*
-     * Compute the bitwise XOR of packed 16-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3, a4 XOR b4, a5 XOR b5, a6 XOR b6, a7 XOR b7]
-     */
-    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm_xor_si128(b, a); }
-
-    /*
-     * Compute the bitwise AND NOT of packed 16-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 ANDNOT b0, a1 ANDNOT b1, a2 ANDNOT b2, a3 ANDNOT b3, a4 ANDNOT b4, a5 ANDNOT b5, a6 ANDNOT b6, a7
-     ANDNOT b7]
-     */
-    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm_andnot_si128(b, a); }
-
-    /*
-     * Horizontally add 16-bits elements of a.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     * Return : a0+a1+a2+a3
-     */
-    static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
-        Converter conv;
-        conv.v = a;
-        return scalar_t(conv.t[0] + conv.t[1] + conv.t[2] + conv.t[3] + conv.t[4] + conv.t[5] + conv.t[6] + conv.t[7]);
-    }
-
-    static INLINE CONST vect_t round(const vect_t a) { return a; }
-
-    static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
-
-    static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
-
-    static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
-
-    static INLINE CONST vect_t signbits(const vect_t x) {
-        vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1));
-        return signBits;
-    }
-
-    static INLINE vect_t mod(vect_t &C, const vect_t &P, const __m64 &INVP, const vect_t &NEGP, const vect_t &MIN,
-                             const vect_t &MAX, vect_t &Q, vect_t &T)
-
-    {
+template <> struct Simd128_impl<true, true, true, 2> : public Simd128i_base {
+
+	/*
+	* alias to 128 bit simd register
+	*/
+	using vect_t = __m128i;
+
+	/*
+	* define the scalar type corresponding to the specialization
+	*/
+	using scalar_t = int16_t;
+
+	/*
+	*  number of scalar_t in a simd register
+	*/
+	static const constexpr size_t vect_size = 8;
+
+	/*
+	*  alignment required for a scalar_t pointer to be loaded into a vect_t
+	*/
+	static const constexpr size_t alignment = 16;
+
+	/*
+	* Check if the pointer p is a multiple of alignment
+	*/
+	template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
+
+	/*
+	* Check if the number n is a multiple of vect_size
+	*/
+	template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
+
+	/*
+	* Converter from vect_t to an array.
+	* Example:
+	*	Converter conv;
+	*	conv.v = a;
+	*	scalar_t x = conv.t[1];
+	*/
+	union Converter {
+		vect_t v;
+		scalar_t t[vect_size];
+	};
+
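A minimal standalone sketch of the Converter idiom documented above, assuming only <immintrin.h> and a C++11 compiler (illustrative, not part of the upstream sources; type punning through a union is compiler-specific but widely supported):

    // Read individual 16-bit lanes of a __m128i through a union, as Simd128_impl does.
    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
        union Converter {
            __m128i v;
            int16_t t[8];
        };
        Converter conv;
        // _mm_set_epi16 takes its arguments highest lane first, so this builds [0,1,...,7].
        conv.v = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
        std::printf("lane 1 = %d\n", conv.t[1]); // prints: lane 1 = 1
        return 0;
    }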
+	/*
+	*  Broadcast 16-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastw instruction.
+	*  Return [x,x,x,x,x,x,x,x] int16_t
+	*/
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi16(x); }
+
+	/*
+	*  Set packed 16-bit integers in dst with the supplied values.
+	*  Return [x0, ..., x7] int16_t
+	*/
+	static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3,
+								   const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7) {
+		return _mm_set_epi16(x7, x6, x5, x4, x3, x2, x1, x0);
+	}
+
+	/*
+	*  Gather 16-bit integer elements with indexes idx[0], ..., idx[7] from the address p in vect_t.
+	*  Return [p[idx[0]], ..., p[idx[7]]] int16_t
+	*/
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]);
+	}
+
+	/*
+	* Load 128-bits of integer data from memory into dst.
+	* p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	* Return [p[0], ..., p[7]] int16_t
+	*/
+	static INLINE PURE vect_t load(const scalar_t *const p) {
+		return _mm_load_si128(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Load 128-bits of integer data from memory into dst.
+	* p does not need to be aligned on any particular boundary.
+	* Return [p[0], ..., p[7]] int16_t
+	*/
+	static INLINE PURE vect_t loadu(const scalar_t *const p) {
+		return _mm_loadu_si128(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Store 128-bits of integer data from a into memory.
+	* p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	*/
+	static INLINE void store(scalar_t *p, vect_t v) {
+		_mm_store_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 128-bits of integer data from a into memory.
+	* p does not need to be aligned on any particular boundary.
+	*/
+	static INLINE void storeu(scalar_t *p, vect_t v) {
+		_mm_storeu_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 128-bits of integer data from a into memory using a non-temporal memory hint.
+	* p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+	*/
+	static INLINE void stream(scalar_t *p, const vect_t v) {
+		_mm_stream_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Shift packed 16-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
+	* Args   :	[a0, ..., a7] int16_t
+	* Return :	[a0 << s, a1 << s, a2 << s, a3 << s, a4 << s, a5 << s, a6 << s, a7 << s] int16_t
+	*/
+	static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm_slli_epi16(a, s); }
+
+	/*
+	* Shift packed 16-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
+	* Args   :	[a0, ..., a7] int16_t
+	* Return :	[a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s] int16_t
+	*/
+	static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm_srli_epi16(a, s); }
+
+	/*
+	* Shift packed 16-bit integers in a right by s while shifting in sign bits, and store the results in vect_t.
+	* Args   :	[a0, ..., a7] int16_t
+	* Return :	[a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s] int16_t
+	*/
+	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm_srai_epi16(a, s); }
+
+	/*
+	* Shuffle 16-bit integers in a using the control in imm8, and store the results in dst.
+	* Args   :	[a0, ..., a7] int16_t
+	* Return :	[a[s[0..3]], ..., a[s[28..31]]] int16_t
+	*/
+	template<uint32_t s>
+	static INLINE CONST vect_t shuffle(const vect_t a) {
+		//#pragma warning "The simd shuffle function is emulated, it may impact the performances.";
+		Converter conv;
+		conv.v = a;
+		return set (conv.t[( s      & 0x0000000F)], conv.t[((s>> 4) & 0x0000000F)],
+					conv.t[((s>> 8) & 0x0000000F)], conv.t[((s>>12) & 0x0000000F)],
+					conv.t[((s>>16) & 0x0000000F)], conv.t[((s>>20) & 0x0000000F)],
+					conv.t[((s>>24) & 0x0000000F)], conv.t[((s>>28) & 0x0000000F)]);
+	}
+
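The emulation above reads lane i of the result from a[(s >> 4*i) & 0xF]. A scalar model of that control-word decoding, assuming exactly this convention (illustrative, not part of the upstream sources):

    // Scalar model of the emulated 16-bit shuffle: result lane i picks source
    // lane (s >> 4*i) & 0xF.  Here s encodes the reversal permutation 7,6,...,0.
    #include <cstdint>
    #include <cstdio>

    int main() {
        int16_t a[8] = {10, 11, 12, 13, 14, 15, 16, 17};
        uint32_t s = 0x01234567; // nibble i (from the low end) holds index 7-i
        int16_t r[8];
        for (int i = 0; i < 8; ++i)
            r[i] = a[(s >> (4 * i)) & 0xF];
        for (int i = 0; i < 8; ++i)
            std::printf("%d ", r[i]); // prints: 17 16 15 14 13 12 11 10
        std::printf("\n");
        return 0;
    }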
+	/*
+	* Unpack and interleave 16-bit integers from the low half of a and b, and store the results in dst.
+	* Args   :	[a0, ..., a7] int16_t
+				[b0, ..., b7] int16_t
+	* Return :	[a0, b0, ..., a3, b3] int16_t
+	*/
+	static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { return _mm_unpacklo_epi16(a, b); }
+
+	/*
+	* Unpack and interleave 16-bit integers from the high half of a and b, and store the results in dst.
+	* Args   :	[a0, ..., a7] int16_t
+				[b0, ..., b7] int16_t
+	* Return :	[a4, b4, ..., a7, b7] int16_t
+	*/
+	static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { return _mm_unpackhi_epi16(a, b); }
+
+	/*
+	* Blend packed 16-bit integers from a and b using control mask imm8, and store the results in dst.
+	* Args   :	[a0, ..., a7] int16_t
+				[b0, ..., b7] int16_t
+	* Return :	[s[0]?a0:b0, ..., s[7]?a7:b7] int16_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t blend(const vect_t a, const vect_t b) {
+		return _mm_blend_epi16(a, b, s);
+	}
+
+	/*
+	* Add packed 16-bits integer in a and b, and store the results in vect_t.
+	* Args   :	[a0, ..., a7] int16_t
+				[b0, ..., b7] int16_t
+	* Return :	[a0+b0, ..., a7+b7]   int16_t
+	*/
+	static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_epi16(a, b); }
+
+	static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
+
+	/*
+	* Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in vect_t.
+	* Args   :	[a0, ..., a7] int16_t
+				[b0, ..., b7] int16_t
+	* Return :	[a0-b0, ..., a7-b7]  int16_t
+	*/
+	static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_epi16(a, b); }
+
+	static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
+
+	/*
+	* Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits
+	of the intermediate integers in vect_t.
+	* Args   :	[a0, ..., a7] int16_t
+				[b0, ..., b7] int16_t
+	* Return :	[a0*b0 smod 2^16, ..., a7*b7 smod 2^16]	int16_t
+	*	   where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2
+	*/
+	static INLINE CONST vect_t mullo(const vect_t a, const vect_t b) { return _mm_mullo_epi16(a, b); }
+
+	static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); }
+
+	/*
+	* Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16
+	bits of the intermediate integers in vect_t.
+	* Args   :	[a0, ..., a7] int16_t
+				[b0, ..., b7] int16_t
+	* Return :	[Floor(a0*b0/2^16), ..., Floor(a7*b7/2^16)] int16_t
+	*/
+	static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { return _mm_mulhi_epi16(a, b); }
+
+	/*
+	* Multiply the low 8-bit integers from each packed 16-bit element in a and b, and store the signed 16-bit results
+	in vect_t.
+	* Args   :	[a0, ..., a7] int16_t
+				[b0, ..., b7] int16_t
+	* Return :	[(a0 smod 2^8)*(b0 smod 2^8), ..., (a7 smod 2^8)*(b7 smod 2^8)]	int16_t
+	*	   where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2
+	*/
+	static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) {
+		//#pragma warning "The simd mulx function is emulated, it may impact the performances."
+		vect_t a1, b1, mask1, mask2;
+		mask1 = set1(0x00FF);
+		mask2 = set1(0x0080);
+		a1 = add(a,mask2);
+		a1 = vand(a1,mask1);
+		a1 = sub(a1,mask2);
+		b1 = add(b,mask2);
+		b1 = vand(b1,mask1);
+		b1 = sub(b1,mask2);
+		return mul(a1,b1);
+	}
+
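The masking sequence in mulx relies on the identity ((x + 0x80) & 0xFF) - 0x80 == (int16_t)(int8_t)x, i.e. it recovers the low byte of each lane as a signed value before multiplying. A scalar check of that identity (illustrative, not part of the upstream sources):

    // Verify the sign-extension trick behind mulx over the whole int16_t range.
    #include <cstdint>
    #include <cstdio>

    int main() {
        for (int x = -32768; x <= 32767; ++x) {
            int16_t lo_trick = int16_t(((x + 0x80) & 0xFF) - 0x80);
            int16_t lo_ref   = int16_t(int8_t(x & 0xFF)); // reference: signed low byte
            if (lo_trick != lo_ref) {
                std::printf("mismatch at x=%d\n", x);
                return 1;
            }
        }
        std::printf("trick verified for all int16_t inputs\n");
        return 0;
    }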
+	/*
+	* Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers,
+	* keep the low 16 bits of the intermediate and add the low 16-bits of c.
+	* Args   :	[a0, ..., a7]		int16_t
+				[b0, ..., b7]		int16_t
+				[c0, ..., c7]		int16_t
+	* Return :	[(a0*b0+c0) smod 2^16, ..., (a7*b7+c7) smod 2^16]	int16_t
+	*/
+	static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); }
+
+	static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
+
+	/*
+	* Multiply the low 8-bit integers from each packed 16-bit element in a and b,
+	* keep the signed 16-bit results and add the low 16-bits of c.
+	* Args   :	[a0, ..., a7]		int16_t
+				[b0, ..., b7]		int16_t
+				[c0, ..., c7]		int16_t
+	* Return :	[((a0 smod 2^8)*(b0 smod 2^8)+c0) smod 2^16, ...,
+	*		 ((a7 smod 2^8)*(b7 smod 2^8)+c7) smod 2^16]	int16_t
+	*/
+	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
+
+	static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
+
+	/*
+	* Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers,
+	* and subtract the low 16 bits of the intermediate from elements of c.
+	* Args   :	[a0, ..., a7]		int16_t
+				[b0, ..., b7]		int16_t
+				[c0, ..., c7]		int16_t
+	* Return :	[(-a0*b0+c0) smod 2^16, ..., (-a7*b7+c7) smod 2^16]	int16_t
+	*/
+	static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
+
+	static INLINE vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
+
+	/*
+	* Multiply the low 8-bit integers from each packed 16-bit element in a and b,
+	* keep the signed 16-bit results and subtract them from elements of c.
+	* Args   :	[a0, ..., a7]		int16_t
+				[b0, ..., b7]		int16_t
+				[c0, ..., c7]		int16_t
+	* Return :	[(-(a0 smod 2^8)*(b0 smod 2^8)+c0) smod 2^16, ...,
+	*		 (-(a7 smod 2^8)*(b7 smod 2^8)+c7) smod 2^16]		int16_t
+	*/
+	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
+
+	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
+
+	/*
+	* Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers,
+	* and subtract elements of c from the low 16 bits of the intermediate.
+	* Args   :	[a0, ..., a7]		int16_t
+				[b0, ..., b7]		int16_t
+				[c0, ..., c7]		int16_t
+	* Return :	[(a0*b0-c0) smod 2^16, ..., (a7*b7-c7) smod 2^16]	int16_t
+	*/
+	static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(mul(a, b), c); }
+
+	static INLINE vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
+
+	/*
+	* Multiply the low 8-bit integers from each packed 16-bit element in a and b,
+	* keep the signed 16-bit results and subtract elements of c from them.
+	* Args   :	[a0, ..., a7]		int16_t
+				[b0, ..., b7]		int16_t
+				[c0, ..., c7]		int16_t
+	* Return :	[((a0 smod 2^8)*(b0 smod 2^8)-c0) smod 2^16, ...,
+	*		 ((a7 smod 2^8)*(b7 smod 2^8)-c7) smod 2^16]		int16_t
+	*/
+	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }
+
+	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }
+
+	/*
+	* Compare packed 16-bits in a and b for equality, and store the results in vect_t.
+	* Args   :	[a0, ..., a7]		int16_t
+				[b0, ..., b7]		int16_t
+	* Return :	[(a0==b0) ? 0xFFFF : 0, ..., (a7==b7) ? 0xFFFF : 0]			int16_t
+	*/
+	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_epi16(a, b); }
+
+	/*
+	* Compare packed 16-bits in a and b for greater-than, and store the results in vect_t.
+	* Args   :	[a0, ..., a7]		int16_t
+				[b0, ..., b7]		int16_t
+	* Return :	[(a0>b0) ? 0xFFFF : 0, ..., (a7>b7) ? 0xFFFF : 0]			int16_t
+	*/
+	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm_cmpgt_epi16(a, b); }
+
+	/*
+	* Compare packed 16-bits in a and b for lesser-than, and store the results in vect_t.
+	* Args   :	[a0, ..., a7]		int16_t
+				[b0, ..., b7]		int16_t
+	* Return :	[(a0<b0) ? 0xFFFF : 0, ..., (a7<b7) ? 0xFFFF : 0]			int16_t
+	*/
+	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm_cmplt_epi16(a, b); }
+
+	/*
+	* Compare packed 16-bits in a and b for greater or equal than, and store the results in vect_t.
+	* Args   :	[a0, ..., a7]		int16_t
+				[b0, ..., b7]		int16_t
+	* Return :	[(a0>=b0) ? 0xFFFF : 0, ..., (a7>=b7) ? 0xFFFF : 0]			int16_t
+	*/
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+
+	/*
+	* Compare packed 16-bits in a and b for lesser or equal than, and store the results in vect_t.
+	* Args   :	[a0, ..., a7]		int16_t
+				[b0, ..., b7]		int16_t
+	* Return :	[(a0<=b0) ? 0xFFFF : 0, ..., (a7<=b7) ? 0xFFFF : 0]			int16_t
+	*/
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
+
+	/*
+	* Horizontally add 16-bits elements of a.
+	* Args   :	[a0, a1, a2, a3, a4, a5, a6, a7]
+	* Return :	a0+a1+a2+a3+a4+a5+a6+a7
+	*/
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		Converter conv;
+		conv.v = a;
+		return scalar_t(conv.t[0] + conv.t[1] + conv.t[2] + conv.t[3] + conv.t[4] + conv.t[5] + conv.t[6] + conv.t[7]);
+	}
+
+	static INLINE CONST vect_t round(const vect_t a) { return a; }
+
+	static INLINE CONST vect_t signbits(const vect_t x) {
+		vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1));
+		return signBits;
+	}
+
+	static INLINE vect_t mod(vect_t &C, const vect_t &P, const __m64 &INVP, const vect_t &NEGP, const vect_t &MIN,
+							 const vect_t &MAX, vect_t &Q, vect_t &T) {
 #ifdef __INTEL_COMPILER
-        C = _mm_rem_epi16(C, P);
+		C = _mm_rem_epi16(C, P);
 #else
-        FFLASFFPACK_abort("not implemented");
+		FFLASFFPACK_abort("not implemented");
 #endif
-        NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
-        return C;
-    }
-
-#else
-#error "You need SSE instructions to perform 128 bits operations on int16"
-#endif // defined(__FFLASFFPACK_USE_AVX2)
+		NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
+		return C;
+	}
 };
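For orientation, here is a standalone sketch of how the load / mullo / add / store building blocks above combine in practice; it uses the raw SSE2 intrinsics that Simd128_impl<true,true,true,2>::loadu / fmadd / storeu wrap (illustrative only, not part of the upstream sources):

    // c[i] += a[i] * b[i] over int16_t arrays, 8 lanes at a time.
    #include <immintrin.h>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    void fmadd_i16(int16_t *c, const int16_t *a, const int16_t *b, size_t n) {
        size_t i = 0;
        for (; i + 8 <= n; i += 8) {
            __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + i));
            __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + i));
            __m128i vc = _mm_loadu_si128(reinterpret_cast<const __m128i *>(c + i));
            vc = _mm_add_epi16(vc, _mm_mullo_epi16(va, vb)); // fmadd(c, a, b)
            _mm_storeu_si128(reinterpret_cast<__m128i *>(c + i), vc);
        }
        for (; i < n; ++i) c[i] = int16_t(c[i] + a[i] * b[i]); // scalar tail
    }

    int main() {
        int16_t a[10], b[10], c[10];
        for (int i = 0; i < 10; ++i) { a[i] = int16_t(i); b[i] = 2; c[i] = 1; }
        fmadd_i16(c, a, b, 10);
        for (int i = 0; i < 10; ++i) std::printf("%d ", c[i]); // 1 3 5 7 9 11 13 15 17 19
        std::printf("\n");
        return 0;
    }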
 
 /*
  * Simd128 specialized for uint16_t
  */
 template <> struct Simd128_impl<true, true, false, 2> : public Simd128_impl<true, true, true, 2> {
-    using scalar_t = uint16_t;
-
-    /*
-     * Load 128-bits of unsigned integer data from memory into dst.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int16_t
-     */
-    static INLINE PURE vect_t load(const scalar_t *const p) {
-        return _mm_load_si128(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Load 128-bits of unsigned integer data from memory into dst.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int16_t
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) {
-        return _mm_loadu_si128(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Store 128-bits of unsigned integer data from a into memory.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, vect_t v) {
-        _mm_store_si128(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 128-bits of integer data from a into memory.
-     * p does not need to be aligned on any particular boundary.
-     */
-    static INLINE void storeu(const scalar_t *p, vect_t v) {
-        _mm_storeu_si128(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    static INLINE CONST vect_t greater(vect_t a, vect_t b) {
-
-        vect_t x;
-        x = set1(-(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
-        a = sub(x, a);
-        b = sub(x, b);
-        return _mm_cmpgt_epi16(a, b);
-    }
-
-    static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
-        vect_t x;
-        x = set1(-(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
-        a = sub(x, a);
-        b = sub(x, b);
-        return _mm_cmpgt_epi16(a, b);
-    }
-
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
-
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
-};
 
-#endif
+	/*
+	* define the scalar type corresponding to the specialization
+	*/
+	using scalar_t = uint16_t;
+
+	/*
+	 * Converter from vect_t to an array.
+	 * Example:
+	 *	Converter conv;
+	 *	conv.v = a;
+	 *	scalar_t x = conv.t[1];
+	 */
+	union Converter {
+		vect_t v;
+		scalar_t t[vect_size];
+	};
+
+	/*
+	*  Broadcast 16-bit unsigned integer a to all elements of dst. This intrinsic may generate the vpbroadcastw instruction.
+	*  Return [x,x,x,x,x,x,x,x] uint16_t
+	*/
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi16(x); }
+
+	/*
+	*  Set packed 16-bit unsigned integers in dst with the supplied values.
+	*  Return [x0, ..., x7] uint16_t
+	*/
+	static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3,
+								   const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7) {
+		return _mm_set_epi16(x7, x6, x5, x4, x3, x2, x1, x0);
+	}
+
+	/*
+	*  Gather 16-bit unsigned integer elements with indexes idx[0], ..., idx[7] from the address p in vect_t.
+	*  Return [p[idx[0]],..., p[idx[7]]] uint16_t
+	*/
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]);
+	}
+
+	/*
+	* Load 128-bits of unsigned integer data from memory into dst.
+	* p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	* Return [p[0], ..., p[7]] uint16_t
+	*/
+	static INLINE PURE vect_t load(const scalar_t *const p) {
+		return _mm_load_si128(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Load 128-bits of unsigned integer data from memory into dst.
+	* p does not need to be aligned on any particular boundary.
+	* Return [p[0], ..., p[7]] uint16_t
+	*/
+	static INLINE PURE vect_t loadu(const scalar_t *const p) {
+		return _mm_loadu_si128(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Store 128-bits of unsigned integer data from a into memory.
+	* p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	*/
+	static INLINE void store(scalar_t *p, vect_t v) {
+		_mm_store_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 128-bits of unsigned integer data from a into memory.
+	* p does not need to be aligned on any particular boundary.
+	*/
+	static INLINE void storeu(scalar_t *p, vect_t v) {
+		_mm_storeu_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 128-bits of unsigned integer data from a into memory using a non-temporal memory hint.
+	* p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+	*/
+	static INLINE void stream(scalar_t *p, const vect_t v) {
+		_mm_stream_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Shift packed 16-bit unsigned integers in a right by s while shifting in zeros, and store the results in vect_t.
+	* Args   :	[a0, ..., a7]			uint16_t
+	* Return :	[Floor(a0/2^s), ..., Floor(a7/2^s)]	uint16_t
+	*/
+	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm_srli_epi16(a, s); }
+
+	static INLINE CONST vect_t greater(vect_t a, vect_t b) {
+		vect_t x;
+		x = set1((static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
+		a = sub(a,x);
+		b = sub(b,x);
+		return _mm_cmpgt_epi16(a, b);
+	}
+
+	static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
+		vect_t x;
+		x = set1((static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
+		a = sub(a,x);
+		b = sub(b,x);
+		return _mm_cmplt_epi16(a, b);
+	}
+
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
+
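The unsigned greater/lesser above use the classic bias trick: subtracting 2^15 from both operands maps unsigned order onto signed order, so the signed SSE2 compare instructions can be reused. A scalar check of the underlying identity (illustrative, not part of the upstream sources):

    // (a > b) over uint16_t  <=>  int16_t(a - 0x8000) > int16_t(b - 0x8000)
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint16_t samples[] = {0, 1, 0x7FFF, 0x8000, 0x8001, 0xFFFE, 0xFFFF};
        for (uint16_t a : samples)
            for (uint16_t b : samples) {
                bool unsigned_gt = a > b;
                bool biased_gt   = int16_t(a - 0x8000) > int16_t(b - 0x8000);
                if (unsigned_gt != biased_gt) { std::printf("mismatch\n"); return 1; }
            }
        std::printf("bias trick agrees with the unsigned compare on all samples\n");
        return 0;
    }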
+	/*
+	* Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers,
+	* and store the high 16 bits of the intermediate integers in vect_t.
+	* Args   :	[a0, ..., a7] uint16_t
+	*			[b0, ..., b7] uint16_t
+	* Return :	[Floor(a0*b0/2^16), ..., Floor(a7*b7/2^16)] uint16_t
+	*/
+	static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { return _mm_mulhi_epu16(a, b); }
+
+	/*
+	* Multiply the low unsigned 8-bit integers from each packed 16-bit element in a and b,
+	* and store the unsigned 16-bit results in vect_t.
+	* Args   :	[a0, ..., a7] uint16_t
+	*			[b0, ..., b7] uint16_t
+	* Return :	[(a0 mod 2^8)*(b0 mod 2^8), ..., (a7 mod 2^8)*(b7 mod 2^8)] uint16_t
+	*/
+	static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) {
+		//#pragma warning "The simd mulx function is emulated, it may impact the performances."
+		vect_t a1, b1, mask1;
+		mask1 = set1(0x00FF);
+		a1 = vand(a,mask1);
+		b1 = vand(b,mask1);
+		return mul(a1,b1);
+	}
+
+	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
+
+	static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
+
+	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
+
+	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
+
+	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }
+
+	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }
+
+	/*
+	* Horizontally add 16-bits elements of a.
+	* Args   :	[a0, a1, a2, a3, a4, a5, a6, a7]
+	* Return :	a0+a1+a2+a3+a4+a5+a6+a7
+	*/
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		Converter conv;
+		conv.v = a;
+		return scalar_t(conv.t[0] + conv.t[1] + conv.t[2] + conv.t[3] + conv.t[4] + conv.t[5] + conv.t[6] + conv.t[7]);
+	}
+}; //Simd128_impl<true,true,false,2>
+
+#endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_int16_INL
diff --git a/fflas-ffpack/fflas/fflas_simd/simd128_int32.inl b/fflas-ffpack/fflas/fflas_simd/simd128_int32.inl
index c99f450..efb8f7f 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd128_int32.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd128_int32.inl
@@ -1,10 +1,11 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
  * Written by   Bastien Vialla<bastien.vialla at lirmm.fr>
  * Brice Boyer (briceboyer) <boyer.brice at gmail.com>
+ * Romain Lebreton <romain.lebreton at lirmm.fr>
  *
  *
  * ========LICENCE========
@@ -30,426 +31,600 @@
 #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_int32_INL
 #define __FFLASFFPACK_fflas_ffpack_utils_simd128_int32_INL
 
-// int32_t
-template <> struct Simd128_impl<true, true, true, 4> {
-
-#if defined(__FFLASFFPACK_USE_SIMD)
-
-    /*
-     * alias to 256 bit simd register
-     */
-    using vect_t = __m128i;
-
-    /*
-     * define the scalar type corresponding to the specialization
-     */
-    using scalar_t = int32_t;
-
-    /*
-     *  number of scalar_t in a simd register
-     */
-    static const constexpr size_t vect_size = 4;
-
-    /*
-     *  alignement required by scalar_t pointer to be loaded in a vect_t
-     */
-    static const constexpr size_t alignment = 16;
-
-    /*
-     * Check if the pointer p is a multiple of alignemnt
-     */
-    template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
-
-    /*
-     * Check if the number n is a multiple of vect_size
-     */
-    template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
-
-    /*
-     * Converter from vect_t to a tab.
-     * exple:
-     *      Converter conv;
-     *      conv.v = a;
-     *      scalart_t x = conv.t[1]
-     */
-    union Converter {
-        vect_t v;
-        scalar_t t[vect_size];
-    };
-
-    /*
-     *  Return vector of type vect_t with all elements set to zero
-     *  Return [0,0,0,0] int32_t
-     */
-    static INLINE CONST vect_t zero() { return _mm_setzero_si128(); }
-
-    /*
-     *  Broadcast 32-bit integer a to all all elements of dst. This intrinsic may generate the vpbroadcastw.
-     *  Return [x,x,x,x] int32_t
-     */
-    static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi32(x); }
-
-    /*
-     *  Broadcast 32-bit integer a to all all elements of dst. This intrinsic may generate the vpbroadcastw.
-     *  Return [x0,x1,x2,x3] int32_t
-     */
-    static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3) {
-        return _mm_set_epi32(x3, x2, x1, x0);
-    }
-
-    /*
-     *  Gather 32-bit integer elements with indexes idx[0], ..., idx[3] from the address p in vect_t.
-     *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] int32_t
-     */
-    template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
-        return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]);
-    }
-
-    /*
-     * Load 128-bits of integer data from memory into dst.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     * Return [p[0],p[1],p[2],p[3]] int32_t
-     */
-    static INLINE PURE vect_t load(const scalar_t *const p) {
-        return _mm_load_si128(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Load 128-bits of integer data from memory into dst.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int32_t
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) {
-        return _mm_loadu_si128(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Store 128-bits of integer data from a into memory.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, vect_t v) {
-        _mm_store_si128(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 128-bits of integer data from a into memory.
-     * p does not need to be aligned on any particular boundary.
-     */
-    static INLINE void storeu(const scalar_t *p, vect_t v) {
-        _mm_storeu_si128(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 128-bits of integer data from a into memory using a non-temporal memory hint.
-     * p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
-     */
-    // static INLINE void stream(const scalar_t *p, const vect_t v) { _mm_stream_si128(const_cast<scalar_t *>(p), v); }
-
-     /*
-     * Shift packed 64-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3] int32_t
-     * Return : [a0 << s, a1 << s, a2 << s, a3 << s] int32_t
-     */
-    static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm_slli_epi32(a, s); }
-
-    /*
-     * Shift packed 64-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3] int32_t
-     * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s] int32_t
-     */
-    static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm_srli_epi32(a, s); }
-
-    static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm_sra_epi32(a, set1(s)); }
-
-    /*
-     * Add packed 32-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3] int32_t
-     [b0, b1, b2, b3] int32_t
-     * Return : [a0+b0, a1+b1, a2+b2, a3+b3]   int32_t
-     */
-    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_epi32(a, b); }
-
-    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
-
-    /*
-     * Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3] int32_t
-     [b0, b1, b2, b3] int32_t
-     * Return : [a0-b0, a1-b1, a2-b2, a3-b3]  int32_t
-     */
-    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_epi32(a, b); }
-
-    static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
-
-    /*
-     * Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits
-     of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3]           int32_t
-     [b0, b1, b2, b3]           int32_t
-     * Return : [a0*b0 mod 2^16-1, a1*b1 mod 2^16-1, a2*b2 mod 2^16-1, a3*b3 mod 2^16-1] int32_t
-     */
-    static INLINE CONST vect_t mullo(const vect_t a, const vect_t b) { return _mm_mullo_epi32(a, b); }
-
-    /*
-     * Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the high 32
-     bits of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3] int32_t
-     [b0, b1, b2, b3] int32_t
-     * Return :
-     */
-    static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) {
-        // _mm_mulhi_epi32 emul
-        //#pragma warning "The simd mulhi function is emulate, it may impact the performances."
-        vect_t a1, a2, b1, b2;
-        a1 = set(0, _mm_extract_epi32(a, 0), 0, _mm_extract_epi32(a, 1));
-        a2 = set(0, _mm_extract_epi32(a, 1), 0, _mm_extract_epi32(a, 3));
-        b1 = set(0, _mm_extract_epi32(b, 0), 0, _mm_extract_epi32(b, 1));
-        b2 = set(0, _mm_extract_epi32(b, 1), 0, _mm_extract_epi32(b, 3));
-        a1 = _mm_mul_epi32(a1, b1);
-        a2 = _mm_mul_epi32(a2, b2);
-        return set(_mm_extract_epi32(a1, 0), _mm_extract_epi32(a1, 2), _mm_extract_epi32(b1, 0),
-                   _mm_extract_epi32(b2, 0));
-    }
-
-    /*
-     * Multiply the low 16-bit integers from each packed 32-bit element in a and b, and store the signed 32-bit results
-     in vect_t.
-     * Args   : [a0, a1, a2, a3]    int32_t
-     [b0, b1, b2, b3]    int32_t
-     * Return : [a0*b0, a1*b1, a2*b2, a3*b3]    int32_t
-     */
-    static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { return _mm_mul_epi32(a, b); }
-
-    /*
-     * Multiply the packed 32-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits
-     of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3]           int32_t
-     [b0, b1, b2, b3]           int32_t
-     * Return : [a0*b0 mod 2^16-1, a1*b1 mod 2^16-1, a2*b2 mod 2^16-1, a3*b3 mod 2^16-1] int32_t
-     */
-    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); }
-
-    /*
-     *
-     * Args   : [a0, a1, a2, a3]           int32_t
-     [b0, b1, b2, b3]           int32_t
-     [c0, c1, c2, c3]    int32_t
-     * Return : [(a0*b0 mod 2^16-1)+c0, (a1*b1 mod 2^16-1)+c1, (a2*b2 mod 2^16-1)+c2, (a3*b3 mod 2^16-1)+c3] int32_t
-     */
-    static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); }
-
-    static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
-
-    /*
-     *
-     * Args   : [a0, a1, a2, a3]           int32_t
-     [b0, b1, b2, b3]           int32_t
-     [c0, c1, c2, c3]    int32_t
-     * Return : [-(a0*b0 mod 2^16-1)+c0, -(a1*b1 mod 2^16-1)+c1, -(a2*b2 mod 2^16-1)+c2, -(a3*b3 mod 2^16-1)+c3] int32_t
-     */
-    static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
-
-    static INLINE CONST vect_t fnmaddin(vect_t c, const vect_t a, const vect_t b) { return c = sub(c, mul(a, b)); }
-
-    /*
-     *
-     * Args   : [a0, a1, a2, a3]           int32_t
-     [b0, b1, b2, b3]           int32_t
-     [c0, c1, c2, c3]    int32_t
-     * Return : [(a0*b0 mod 2^16-1)-c0, (a1*b1 mod 2^16-1)-c1, (a2*b2 mod 2^16-1)-c2, (a3*b3 mod 2^16-1)-c3] int32_t
-     */
-    static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(mul(a, b), c); }
-
-    /*
-     *
-     * Args   : [a0, a1, a2, a3]           int32_t
-     [b0, b1, b2, b3]           int32_t
-     [c0, c1, c2, c3]    int32_t
-     * Return : [(a0*b0 mod 2^16-1)-c0, (a1*b1 mod 2^16-1)-c1, (a2*b2 mod 2^16-1)-c2, (a3*b3 mod 2^16-1)-c3] int32_t
-     */
-    static INLINE CONST vect_t fmsubin(vect_t c, const vect_t a, const vect_t b) { return c = sub(mul(a, b), c); }
-
-    /*
-     * Compare packed 32-bits in a and b for equality, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
-     * Return : [(a0==b0) ? 0xFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFF : 0,
-     (a2==b2) ? 0xFFFFFFFF : 0, (a3==b3) ? 0xFFFFFFFF : 0,
-     (a4==b4) ? 0xFFFFFFFF : 0, (a5==b5) ? 0xFFFFFFFF : 0,
-     (a6==b6) ? 0xFFFFFFFF : 0, (a7==b7) ? 0xFFFFFFFF : 0]                     int32_t
-     */
-    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_epi32(a, b); }
-
-    /*
-     * Compare packed 32-bits in a and b for greater-than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
-     * Return : [(a0>b0) ? 0xFFFFFFFF : 0, (a1>b1) ? 0xFFFFFFFF : 0,
-     (a2>b2) ? 0xFFFFFFFF : 0, (a3>b3) ? 0xFFFFFFFF : 0,
-     (a4>b4) ? 0xFFFFFFFF : 0, (a5>b5) ? 0xFFFFFFFF : 0,
-     (a6>b6) ? 0xFFFFFFFF : 0, (a7>b7) ? 0xFFFFFFFF : 0]                      int32_t
-     */
-    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm_cmpgt_epi32(a, b); }
-
-    /*
-     * Compare packed 32-bits in a and b for lesser-than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
-     * Return : [(a0<b0) ? 0xFFFFFFFF : 0, (a1<b1) ? 0xFFFFFFFF : 0,
-     (a2<b2) ? 0xFFFFFFFF : 0, (a3<b3) ? 0xFFFFFFFF : 0,
-     (a4<b4) ? 0xFFFFFFFF : 0, (a5<b5) ? 0xFFFFFFFF : 0,
-     (a6<b6) ? 0xFFFFFFFF : 0, (a7<b7) ? 0xFFFFFFFF : 0]                      int32_t
-     */
-    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm_cmpgt_epi32(b, a); }
-
-    /*
-     * Compare packed 32-bits in a and b for greater or equal than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
-     * Return : [(a0>=b0) ? 0xFFFFFFFF : 0, (a1>=b1) ? 0xFFFFFFFF : 0,
-     (a2>=b2) ? 0xFFFFFFFF : 0, (a3>=b3) ? 0xFFFFFFFF : 0,
-     (a4>=b4) ? 0xFFFFFFFF : 0, (a5>=b5) ? 0xFFFFFFFF : 0,
-     (a6>=b6) ? 0xFFFFFFFF : 0, (a7>=b7) ? 0xFFFFFFFF : 0]                    int32_t
-     */
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
-
-    /*
-     * Compare packed 32-bits in a and b for lesser or equal than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
-     * Return : [(a0<=b0) ? 0xFFFFFFFF : 0, (a1<=b1) ? 0xFFFFFFFF : 0,
-     (a2<=b2) ? 0xFFFFFFFF : 0, (a3<=b3) ? 0xFFFFFFFF : 0,
-     (a4<=b4) ? 0xFFFFFFFF : 0, (a5<=b5) ? 0xFFFFFFFF : 0,
-     (a6<=b6) ? 0xFFFFFFFF : 0, (a7<=b7) ? 0xFFFFFFFF : 0]                     int32_t
-     */
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
-
-    /*
-     * Compute the bitwise AND of packed 32-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3, a4 AND b4, a5 AND b5, a6 AND b6, a7 AND b7]
-     */
-    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm_and_si128(b, a); }
-
-    /*
-     * Compute the bitwise OR of packed 32-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3, a4 OR b4, a5 OR b5, a6 OR b6, a7 OR b7]
-     */
-    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm_or_si128(b, a); }
-
-    /*
-     * Compute the bitwise XOR of packed 32-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3, a4 XOR b4, a5 XOR b5, a6 XOR b6, a7 XOR b7]
-     */
-    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm_xor_si128(b, a); }
-
-    /*
-     * Compute the bitwise AND NOT of packed 32-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 ANDNOT b0, a1 ANDNOT b1, a2 ANDNOT b2, a3 ANDNOT b3, a4 ANDNOT b4, a5 ANDNOT b5, a6 ANDNOT b6, a7
-     ANDNOT b7]
-     */
-    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm_andnot_si128(b, a); }
-
-    /*
-     * Horizontally add 32-bits elements of a.
-     * Args   : [a0, a1, a2, a3]
-     * Return : a0+a1+a2+a3
-     */
-    static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
-        Converter conv;
-        conv.v = a;
-        return conv.t[0] + conv.t[1] + conv.t[2] + conv.t[3];
-    }
-
-    static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(mulx(a, b), c); }
-
-    static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
-
-    static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
-
-    static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
-
-    static INLINE CONST vect_t round(const vect_t a) { return a; }
-
-    static INLINE CONST vect_t signbits(const vect_t x) {
-        vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1));
-        return signBits;
-    }
-
-    static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
-                             const vect_t &MAX, vect_t &Q, vect_t &T) {
-#ifdef __INTEL_COMPILER
-        C = _mm_rem_epi32(C, P);
-#else
-        FFLASFFPACK_abort("pas implementé");
-// C = fnmadd(C,_mm_castps_si128(_mm_floor_ps(_mm_mul_ps(INVP,_mm_castsi128_ps(C)))),P);
+#ifndef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
+#error "You need SSE instructions to perform 128 bits operations on int32"
 #endif
-        NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
-        return C;
-    }
 
+#include "fflas-ffpack/fflas/fflas_simd/simd128_int64.inl"
+
+/*
+ * Simd128 specialized for int32_t
+ */
+template <> struct Simd128_impl<true, true, true, 4> : public Simd128i_base {
+
+	/*
+	* alias to 128 bit simd register
+	*/
+	using vect_t = __m128i;
+
+	/*
+	* define the scalar type corresponding to the specialization
+	*/
+	using scalar_t = int32_t;
+
+	/*
+	*  number of scalar_t in a simd register
+	*/
+	static const constexpr size_t vect_size = 4;
+
+	/*
+	*  alignment required for a scalar_t pointer to be loaded into a vect_t
+	*/
+	static const constexpr size_t alignment = 16;
+
+	/*
+	* Check if the pointer p is a multiple of alignment
+	*/
+	template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
+
+	/*
+	* Check if the number n is a multiple of vect_size
+	*/
+	template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
+
+	/*
+	* Converter from vect_t to an array.
+	* Example:
+	*	Converter conv;
+	*	conv.v = a;
+	*	scalar_t x = conv.t[1];
+	*/
+	union Converter {
+		vect_t v;
+		scalar_t t[vect_size];
+	};
+
+	/*
+	*  Broadcast 32-bit integer a to all elements of dst. This intrinsic may generate vpbroadcastd.
+	*  Return [x,x,x,x] int32_t
+	*/
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi32(x); }
+
+	/*
+	*  Set packed 32-bit integers in dst with the supplied values.
+	*  Return [x0,x1,x2,x3] int32_t
+	*/
+	static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3) {
+		return _mm_set_epi32(x3, x2, x1, x0);
+	}
+
+	/*
+	*  Gather 32-bit integer elements with indexes idx[0], ..., idx[3] from the address p in vect_t.
+	*  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] int32_t
+	*/
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]);
+	}
+
+	/*
+	* Load 128-bits of integer data from memory into dst.
+	* p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	* Return [p[0],p[1],p[2],p[3]] int32_t
+	*/
+	static INLINE PURE vect_t load(const scalar_t *const p) {
+		return _mm_load_si128(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Load 128-bits of integer data from memory into dst.
+	* p does not need to be aligned on any particular boundary.
+	* Return [p[0],p[1],p[2],p[3]] int32_t
+	*/
+	static INLINE PURE vect_t loadu(const scalar_t *const p) {
+		return _mm_loadu_si128(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Store 128-bits of integer data from a into memory.
+	* p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	*/
+	static INLINE void store(scalar_t *p, vect_t v) {
+		_mm_store_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 128-bits of integer data from a into memory.
+	* p does not need to be aligned on any particular boundary.
+	*/
+	static INLINE void storeu(scalar_t *p, vect_t v) {
+		_mm_storeu_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 128-bits of integer data from a into memory using a non-temporal memory hint.
+	* p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+	*/
+	static INLINE void stream(scalar_t *p, const vect_t v) {
+		_mm_stream_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Shift packed 32-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3] int32_t
+	* Return : [a0 << s, a1 << s, a2 << s, a3 << s] int32_t
+	*/
+	static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm_slli_epi32(a, s); }
+
+	/*
+	* Shift packed 32-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3] int32_t
+	* Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s] int32_t
+	*/
+	static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm_srli_epi32(a, s); }
+
+	/*
+	* Shift packed 32-bit integers in a right by s while shifting in sign bits, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3] int32_t
+	* Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s] int32_t
+	*/
+	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm_srai_epi32(a, s); }
+
+	/*
+	* Shuffle 32-bit integers in a using the control in imm8, and store the results in dst.
+	* Args   : [a0, a1, a2, a3] int32_t
+	* Return : [a[s[0..1]], ..., a[s[6..7]]] int32_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t shuffle(const vect_t a) {
+		return _mm_shuffle_epi32(a, s);
+	}
+
+	/*
+	* Unpack and interleave 32-bit integers from the low half of a and b, and store the results in dst.
+	* Args   : [a0, a1, a2, a3] int32_t
+			   [b0, b1, b2, b3] int32_t
+	* Return : [a0, b0, a1, b1] int32_t
+	*/
+	static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { return _mm_unpacklo_epi32(a, b); }
+
+	/*
+	* Unpack and interleave 32-bit integers from the high half of a and b, and store the results in dst.
+	* Args   : [a0, a1, a2, a3] int32_t
+			   [b0, b1, b2, b3] int32_t
+	* Return : [a2, b2, a3, b3] int32_t
+	*/
+	static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { return _mm_unpackhi_epi32(a, b); }
+
+	/*
+	* Blend packed 32-bit integers from a and b using control mask imm8, and store the results in dst.
+	* Args   : [a0, a1, a2, a3] int32_t
+			   [b0, b1, b2, b3] int32_t
+	* Return : [s[0]?a0:b0, ..., s[3]?a3:b3] int32_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t blend(const vect_t a, const vect_t b) {
+		// _mm_blend_epi16 is faster than _mm_blend_epi32 and requires SSE4.1 instead of AVX2
+		// We have to transform s = [d3 d2 d1 d0]_base2 to s1 = [d3 d3 d2 d2 d1 d1 d0 d0]_base2
+		constexpr uint8_t s1 = (s & 0x1) * 3 + (((s & 0x2) << 1)*3)  + (((s & 0x4) << 2)*3) + (((s & 0x8) << 3)*3);
+		return _mm_blend_epi16(a, b, s1);
+	}
+
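The constexpr expression in blend duplicates each of the four 32-bit control bits into the two 16-bit half-lanes it covers, so the operation can be expressed with the SSE4.1 _mm_blend_epi16. A quick check of that mask expansion (illustrative, not part of the upstream sources; uses C++14 binary literals):

    // 4-bit control s -> 8-bit control s1 with every bit duplicated.
    #include <cstdint>
    #include <cstdio>

    constexpr uint8_t expand(uint8_t s) {
        return uint8_t((s & 0x1) * 3 + (((s & 0x2) << 1) * 3)
                     + (((s & 0x4) << 2) * 3) + (((s & 0x8) << 3) * 3));
    }

    static_assert(expand(0b0000) == 0b00000000, "no lane selected");
    static_assert(expand(0b0101) == 0b00110011, "lanes 0 and 2");
    static_assert(expand(0b1111) == 0b11111111, "all lanes selected");

    int main() {
        for (unsigned s = 0; s < 16; ++s)
            std::printf("s=%2u -> s1=0x%02X\n", s, expand(uint8_t(s)));
        return 0;
    }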
+	/*
+	* Add packed 32-bits integer in a and b, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3] int32_t
+			   [b0, b1, b2, b3] int32_t
+	* Return : [a0+b0, a1+b1, a2+b2, a3+b3]   int32_t
+	*/
+	static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_epi32(a, b); }
+
+	static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
+
+	/*
+	* Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3] int32_t
+			   [b0, b1, b2, b3] int32_t
+	* Return : [a0-b0, a1-b1, a2-b2, a3-b3]  int32_t
+	*/
+	static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_epi32(a, b); }
+
+	static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
+
+	/*
+	* Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits
+	of the intermediate integers in vect_t.
+	* Args   : [a0, a1, a2, a3] int32_t
+			   [b0, b1, b2, b3] int32_t
+	* Return : [a0*b0 smod 2^32, ..., a3*b3 smod 2^32]	int32_t
+	*	   where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2
+	*/
+	static INLINE CONST vect_t mullo(const vect_t a, const vect_t b) { return _mm_mullo_epi32(a, b); }
+
+	static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); }
+
+	/*
+	* Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the high 32
+	bits of the intermediate integers in vect_t.
+	* Args   : [a0, a1, a2, a3] int32_t
+			   [b0, b1, b2, b3] int32_t
+	* Return : [Floor(a0*b0/2^32), ..., Floor(a3*b3/2^32)] int32_t
+	*/
+
+	static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) {
+		// _mm_mulhi_epi32 emul
+		//#pragma warning "The simd mulhi function is emulated, it may impact the performances."
+#if 0
+		vect_t a1, a2, b1, b2;
+		a1 = set(_mm_extract_epi32(a, 0), 0, _mm_extract_epi32(a, 2), 0);
+		a2 = set(_mm_extract_epi32(a, 1), 0, _mm_extract_epi32(a, 3), 0);
+		b1 = set(_mm_extract_epi32(b, 0), 0, _mm_extract_epi32(b, 2), 0);
+		b2 = set(_mm_extract_epi32(b, 1), 0, _mm_extract_epi32(b, 3), 0);
+		a1 = _mm_mul_epi32(a1, b1);
+		a2 = _mm_mul_epi32(a2, b2);
+		return set(_mm_extract_epi32(a1, 1), _mm_extract_epi32(a2, 1), _mm_extract_epi32(a1, 3),
+				   _mm_extract_epi32(a2, 3));
 #else
-#error "You need SSE instructions to perform 128 bits operations on int32"
+		typedef Simd128_impl<true, true, true, 8> Simd128_64;
+		vect_t C,A1,B1;
+		C  = Simd128_64::mulx(a,b);
+		A1 = Simd128_64::srl(a,32);
+		B1 = Simd128_64::srl(b,32);
+		A1 = Simd128_64::mulx(A1,B1);
+		C  = Simd128_64::srl(C,32);
+		A1 = Simd128_64::srl(A1,32);
+		A1 = Simd128_64::sll(A1,32);
+		return Simd128_64::vor(C,A1);
 #endif
+	}
+
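Whichever of the two emulation paths is compiled, the intended result of mulhi per 32-bit lane is simply the upper half of the full 64-bit signed product. A scalar reference of that semantics (illustrative, not part of the upstream sources):

    // Scalar reference: high 32 bits of the signed 64-bit product a*b.
    #include <cstdint>
    #include <cstdio>

    int32_t mulhi_ref(int32_t a, int32_t b) {
        return int32_t((int64_t(a) * int64_t(b)) >> 32);
    }

    int main() {
        std::printf("%d\n", mulhi_ref(1 << 30, 4));    // 2^32 >> 32 = 1
        std::printf("%d\n", mulhi_ref(-(1 << 30), 4)); // -1 (arithmetic shift of the signed product)
        return 0;
    }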
+	/*
+	* Multiply the low 16-bit integers from each packed 32-bit element in a and b, and store the signed 32-bit results
+	in vect_t.
+	* Args   : [a0, a1, a2, a3] int32_t
+			   [b0, b1, b2, b3] int32_t
+	* Return : [(a0 smod 2^16)*(b0 smod 2^16), (a1 smod 2^16)*(b1 smod 2^16),
+	*	    (a2 smod 2^16)*(b2 smod 2^16), (a3 smod 2^16)*(b3 smod 2^16)]	int32_t
+	*	   where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2
+	*/
+	static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) {
+		//#pragma warning "The simd mulx function is emulated, it may impact the performances."
+		vect_t a1, b1, mask1, mask2;
+		mask1 = set1(0x0000FFFF);
+		mask2 = set1(0x00008000);
+		a1 = add(a,mask2);
+		a1 = vand(a1,mask1);
+		a1 = sub(a1,mask2);
+		b1 = add(b,mask2);
+		b1 = vand(b1,mask1);
+		b1 = sub(b1,mask2);
+		return mul(a1,b1);
+	}
+
+	/*
+	* Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+	* keep the low 32 bits of the intermediate and add the low 32-bits of c.
+	* Args   :	[a0, a1, a2, a3]		int32_t
+				[b0, b1, b2, b3]		int32_t
+				[c0, c1, c2, c3]		int32_t
+	* Return :	[(a0*b0+c0) smod 2^32, ..., (a3*b3+c3) smod 2^32]	int32_t
+	*/
+	static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); }
+
+	static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
+
+	/*
+	* Multiply the low 16-bit integers from each packed 32-bit element in a and b,
+	* keep the signed 32-bit results and add the low 32-bits of c.
+	* Args   :	[a0, a1, a2, a3]		int32_t
+				[b0, b1, b2, b3]		int32_t
+				[c0, c1, c2, c3]		int32_t
+	* Return :	[((a0 smod 2^16)*(b0 smod 2^16)+c0) smod 2^32, ...,
+	*		 ((a3 smod 2^16)*(b3 smod 2^16)+c3) smod 2^32]	int32_t
+	*/
+	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
+
+	static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
+
+	/*
+	* Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+	* and subtract the low 32 bits of the intermediate from elements of c.
+	* Args   :	[a0, a1, a2, a3]		int32_t
+				[b0, b1, b2, b3]		int32_t
+				[c0, c1, c2, c3]		int32_t
+	* Return :	[(-a0*b0+c0) smod 2^32, ..., (-a3*b3+c3) smod 2^32]	int32_t
+	*/
+	static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
+
+	static INLINE vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
+
+	/*
+	* Multiply the low 16-bit integers from each packed 32-bit element in a and b,
+	* keep the signed 32-bit results and subtract them from elements of c.
+	* Args   :	[a0, a1, a2, a3]		int32_t
+				[b0, b1, b2, b3]		int32_t
+				[c0, c1, c2, c3]		int32_t
+	* Return :	[(-(a0 smod 2^16)*(b0 smod 2^16)+c0) smod 2^32, ...,
+	*		 (-(a3 smod 2^16)*(b3 smod 2^16)+c3) smod 2^32]	int32_t
+	*/
+	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
+
+	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
+
+	/*
+	* Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+	* and subtract elements of c from the low 32 bits of the intermediate.
+	* Args   :	[a0, a1, a2, a3]		int32_t
+				[b0, b1, b2, b3]		int32_t
+				[c0, c1, c2, c3]		int32_t
+	* Return : [(a0*b0-c0) smod 2^32, ..., (a3*b3-c3) smod 2^32]	int32_t
+	*/
+	static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(mul(a, b), c); }
+
+	static INLINE vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
+
+	/*
+	* Multiply the low 16-bit integers from each packed 32-bit element in a and b,
+	* keep the signed 32-bit results and subtract elements of c from them.
+	* Args   :	[a0, a1, a2, a3]		int32_t
+				[b0, b1, b2, b3]		int32_t
+				[c0, c1, c2, c3]		int32_t
+	* Return :	[((a0 smod 2^16)*(b0 smod 2^16)-c0) smod 2^32, ...,
+	*		 ((a3 smod 2^16)*(b3 smod 2^16)-c3) smod 2^32]	int32_t
+	*/
+	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }
+
+	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }
+
+	/*
+	* Compare packed 32-bits in a and b for equality, and store the results in vect_t.
+	* Args   :	[a0, a1, a2, a3]		int32_t
+				[b0, b1, b2, b3]		int32_t
+	* Return : [(a0==b0) ? 0xFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFF : 0,
+	(a2==b2) ? 0xFFFFFFFF : 0, (a3==b3) ? 0xFFFFFFFF : 0]			int32_t
+	*/
+	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_epi32(a, b); }
+
+	/*
+	* Compare packed 32-bits in a and b for greater-than, and store the results in vect_t.
+	* Args   :	[a0, a1, a2, a3]		int32_t
+				[b0, b1, b2, b3]		int32_t
+	* Return : [(a0>b0) ? 0xFFFFFFFF : 0, (a1>b1) ? 0xFFFFFFFF : 0,
+	(a2>b2) ? 0xFFFFFFFF : 0, (a3>b3) ? 0xFFFFFFFF : 0]			int32_t
+	*/
+	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm_cmpgt_epi32(a, b); }
+
+	/*
+	* Compare packed 32-bits in a and b for lesser-than, and store the results in vect_t.
+	* Args   :	[a0, a1, a2, a3]		int32_t
+				[b0, b1, b2, b3]		int32_t
+	* Return : [(a0<b0) ? 0xFFFFFFFF : 0, (a1<b1) ? 0xFFFFFFFF : 0,
+	(a2<b2) ? 0xFFFFFFFF : 0, (a3<b3) ? 0xFFFFFFFF : 0]			int32_t
+	*/
+	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm_cmplt_epi32(a, b); }
+
+	/*
+	* Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in vect_t.
+	* Args   :	[a0, a1, a2, a3]		int32_t
+				[b0, b1, b2, b3]		int32_t
+	* Return : [(a0>=b0) ? 0xFFFFFFFF : 0, (a1>=b1) ? 0xFFFFFFFF : 0,
+	(a2>=b2) ? 0xFFFFFFFF : 0, (a3>=b3) ? 0xFFFFFFFF : 0]			int32_t
+	*/
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+
+	/*
+	* Compare packed 32-bit integers in a and b for lesser-than-or-equal, and store the results in vect_t.
+	* Args   :	[a0, a1, a2, a3]		int32_t
+				[b0, b1, b2, b3]		int32_t
+	* Return : [(a0<=b0) ? 0xFFFFFFFF : 0, (a1<=b1) ? 0xFFFFFFFF : 0,
+	(a2<=b2) ? 0xFFFFFFFF : 0, (a3<=b3) ? 0xFFFFFFFF : 0]			int32_t
+	*/
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
+
+	/*
+	* Horizontally add 32-bits elements of a.
+	* Args   : [a0, a1, a2, a3]
+	* Return : a0+a1+a2+a3
+	*/
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		Converter conv;
+		conv.v = a;
+		return scalar_t(conv.t[0] + conv.t[1] + conv.t[2] + conv.t[3]);
+	}
+
+	static INLINE CONST vect_t round(const vect_t a) { return a; }
+
+	static INLINE CONST vect_t signbits(const vect_t x) {
+		vect_t signBits = sub(zero(), srl(x, 8*sizeof(scalar_t)-1));
+		return signBits;
+	}
+
+	static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
+							 const vect_t &MAX, vect_t &Q, vect_t &T) {
+#ifdef __INTEL_COMPILER
+		C = _mm_rem_epi32(C, P);
+#else
+		FFLASFFPACK_abort("not implemented");
+		// C = fnmadd(C,_mm_castps_si128(_mm_floor_ps(_mm_mul_ps(INVP,_mm_castsi128_ps(C)))),P);
+#endif
+		NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
+		return C;
+	}
+
 };
 
-// uint32_t
+/*
+ * Simd128 specialized for uint32_t
+ */
 template <> struct Simd128_impl<true, true, false, 4> : public Simd128_impl<true, true, true, 4> {
-    using scalar_t = uint32_t;
-
-    /*
-    * Load 128-bits of unsigned integer data from memory into dst.
-    * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-    * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int16_t
-    */
-    static INLINE PURE vect_t load(const scalar_t *const p) {
-        return _mm_load_si128(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Load 128-bits of unsigned integer data from memory into dst.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int16_t
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) {
-        return _mm_loadu_si128(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Store 128-bits of unsigned integer data from a into memory.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, vect_t v) {
-        _mm_store_si128(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    static INLINE CONST vect_t greater(vect_t a, vect_t b) {
-
-        vect_t x;
-        x = set1(-(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
-        a = sub(x, a);
-        b = sub(x, b);
-        return _mm_cmpgt_epi32(a, b);
-    }
-
-    static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
-        vect_t x;
-        x = set1(-(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
-        a = sub(x, a);
-        b = sub(x, b);
-        return _mm_cmpgt_epi32(a, b);
-    }
-
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
-
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
-};
+
+	/*
+	* define the scalar type corresponding to the specialization
+	*/
+	using scalar_t = uint32_t;
+
+	/*
+	* Converter from vect_t to an array.
+	* example:
+	*	Converter conv;
+	*	conv.v = a;
+	*	scalar_t x = conv.t[1];
+	*/
+	union Converter {
+		vect_t v;
+		scalar_t t[vect_size];
+	};
+
+	/*
+	*  Broadcast 32-bit unsigned integer a to all elements of dst. This intrinsic may generate the vpbroadcastd.
+	*  Return [x,x,x,x] uint32_t
+	*/
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi32(x); }
+
+	/*
+	*  Set packed 32-bit unsigned integers in dst with the supplied values.
+	*  Return [x0,x1,x2,x3] uint32_t
+	*/
+	static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3) {
+		return _mm_set_epi32(x3, x2, x1, x0);
+	}
+
+	/*
+	*  Gather 32-bit unsigned integer elements with indexes idx[0], ..., idx[3] from the address p in vect_t.
+	*  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] uint32_t
+	*/
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]);
+	}
+
+	/*
+	* Load 128-bits of unsigned integer data from memory into dst.
+	* p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	* Return [p[0],p[1],p[2],p[3]] uint32_t
+	*/
+	static INLINE PURE vect_t load(const scalar_t *const p) {
+		return _mm_load_si128(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Load 128-bits of unsigned integer data from memory into dst.
+	* p does not need to be aligned on any particular boundary.
+	* Return [p[0],p[1],p[2],p[3]] uint32_t
+	*/
+	static INLINE PURE vect_t loadu(const scalar_t *const p) {
+		return _mm_loadu_si128(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Store 128-bits of unsigned integer data from a into memory.
+	* p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	*/
+	static INLINE void store(scalar_t *p, vect_t v) {
+		_mm_store_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 128-bits of unsigned integer data from a into memory.
+	* p does not need to be aligned on any particular boundary.
+	*/
+	static INLINE void storeu(scalar_t *p, vect_t v) {
+		_mm_storeu_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 128-bits of unsigned integer data from a into memory using a non-temporal memory hint.
+	* p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+	*/
+	static INLINE void stream(scalar_t *p, const vect_t v) {
+		_mm_stream_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Shift packed 32-bit unsigned integers in a right by s while shifting in zeros (logical shift), and store the results in vect_t.
+	* Args   : [a0, ..., a3]			uint32_t
+	* Return : [Floor(a0/2^s), ..., Floor(a3/2^s)]	uint32_t
+	*/
+	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm_srli_epi32(a, s); }
+
+	static INLINE CONST vect_t greater(vect_t a, vect_t b) {
+		vect_t x;
+		x = set1((static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
+		a = sub(a,x);
+		b = sub(b,x);
+		return _mm_cmpgt_epi32(a, b);
+	}
+
+	static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
+		vect_t x;
+		x = set1((static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
+		a = sub(a,x);
+		b = sub(b,x);
+		return _mm_cmplt_epi32(a, b);
+	}
+
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
+
+	/*
+	* Multiply the packed unsigned 32-bit integers in a and b, producing intermediate 64-bit integers,
+	* and store the high 32	bits of the intermediate integers in vect_t.
+	* Args   : [a0, a1, a2, a3]		 uint32_t
+	*		   [b0, b1, b2, b3]		 uint32_t
+	* Return : [Floor(a0*b0/2^32), ..., Floor(a3*b3/2^32)] uint32_t
+	*/
+	static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) {
+		// _mm_mulhi_epi32 emul
+		//#pragma warning "The simd mulhi function is emulated, it may impact the performances."
+		typedef Simd128_impl<true, true, false, 8> Simd128_64;
+		vect_t C,A1,B1;
+		C  = Simd128_64::mulx(a,b);
+		A1 = Simd128_64::srl(a,32);
+		B1 = Simd128_64::srl(b,32);
+		A1 = Simd128_64::mulx(A1,B1);
+		C  = Simd128_64::srl(C,32);
+		A1 = Simd128_64::srl(A1,32);
+		A1 = Simd128_64::sll(A1,32);
+		return Simd128_64::vor(C,A1);
+	}
+
+	/*
+	* Multiply the low unsigned 16-bit integers from each packed 32-bit element in a and b,
+	* and store the unsigned 32-bit results in vect_t.
+	* Args   : [a0, a1, a2, a3]		 uint32_t
+	*		   [b0, b1, b2, b3]		 uint32_t
+	* Return : [(a0 mod 2^16)*(b0 mod 2^16), (a1 mod 2^16)*(b1 mod 2^16),
+	*	    (a2 mod 2^16)*(b2 mod 2^16), (a3 mod 2^16)*(b3 mod 2^16)]	uint32_t
+	*/
+	static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) {
+		//#pragma warning "The simd mulx function is emulated, it may impact the performances."
+		vect_t a1, b1, mask1;
+		mask1 = set1(0x0000FFFF);
+		a1 = vand(a,mask1);
+		b1 = vand(b,mask1);
+		return mul(a1,b1);
+	}
+
+	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
+
+	static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
+
+	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
+
+	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
+
+	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }
+
+	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }
+
+	/*
+	* Horizontally add 32-bits elements of a.
+	* Args   : [a0, a1, a2, a3]
+	* Return : a0+a1+a2+a3
+	*/
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		Converter conv;
+		conv.v = a;
+		return conv.t[0] + conv.t[1] + conv.t[2] + conv.t[3];
+	}
+}; //Simd128_impl<true,true,false,4>
 
 #endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_int32_INL
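As an aside for readers of this hunk: the signed mulx emulation above sign-extends the low 16 bits of each 32-bit lane with an add/and/subtract sequence before multiplying. A scalar model of one lane, with a helper name of our own and given purely for illustration, would look like this:

// Illustrative scalar model of one lane of the emulated signed mulx above.
static inline int32_t mulx_lane(int32_t a, int32_t b) {
	// ((x + 0x8000) & 0xFFFF) - 0x8000 sign-extends the low 16 bits of x.
	int32_t a1 = (int32_t)(((uint32_t)a + 0x8000u) & 0xFFFFu) - 0x8000;
	int32_t b1 = (int32_t)(((uint32_t)b + 0x8000u) & 0xFFFFu) - 0x8000;
	return a1 * b1;	// low 32 bits of the 16x16 product
}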
diff --git a/fflas-ffpack/fflas/fflas_simd/simd128_int64.inl b/fflas-ffpack/fflas/fflas_simd/simd128_int64.inl
index d154c3b..b2dbed5 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd128_int64.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd128_int64.inl
@@ -1,10 +1,11 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
  * Written by   Bastien Vialla<bastien.vialla at lirmm.fr>
  * Brice Boyer (briceboyer) <boyer.brice at gmail.com>
+ * Romain Lebreton <romain.lebreton at lirmm.fr>
  *
  *
  * ========LICENCE========
@@ -30,464 +31,683 @@
 #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_int64_INL
 #define __FFLASFFPACK_fflas_ffpack_utils_simd128_int64_INL
 
+#ifndef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
+#error "You need SSE instructions to perform 128 bits operations on int64"
+#endif
+
 /*
  * Simd128 specialized for int64_t
  */
-template <> struct Simd128_impl<true, true, true, 8> {
-
-#if defined(__FFLASFFPACK_USE_SIMD)
-    /*
-     * alias to 128 bit simd register
-     */
-    using vect_t = __m128i;
-
-    /*
-     * define the scalar type corresponding to the specialization
-     */
-    using scalar_t = int64_t;
-
-    /*
-     *  number of scalar_t in a simd register
-     */
-    static const constexpr size_t vect_size = 2;
-
-    /*
-     *  alignement required by scalar_t pointer to be loaded in a vect_t
-     */
-    static const constexpr size_t alignment = 16;
-
-    /*
-     * Check if the pointer p is a multiple of alignemnt
-     */
-    template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
-
-    /*
-     * Check if the number n is a multiple of vect_size
-     */
-    template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
-
-    /*
-     * Converter from vect_t to a tab.
-     * exple:
-     *      Converter conv;
-     *      conv.v = a;
-     *      scalart_t x = conv.t[1]
-     */
-    union Converter {
-        vect_t v;
-        scalar_t t[vect_size];
-    };
-
-    /*
-     *  Return vector of type vect_t with all elements set to zero
-     *  Return [0,0] int64_t
-     */
-    static INLINE CONST vect_t zero() { return _mm_setzero_si128(); }
-
-    /*
-     *  Broadcast 64-bit integer a to all all elements of dst. This intrinsic may generate the vpbroadcastw.
-     *  Return [x,x] int64_t
-     */
-    static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi64x(x); }
-
-    /*
-     *  Broadcast 64-bit integer a to all all elements of dst. This intrinsic may generate the vpbroadcastw.
-     *  Return [x0,x1] int64_t
-     */
-    static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1) { return _mm_set_epi64x(x1, x0); }
-
-    /*
-     *  Gather 64-bit integer elements with indexes idx[0], ..., idx[1] from the address p in vect_t.
-     *  Return [p[idx[0]], p[idx[1]]] int64_t
-     */
-    template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
-        return set(p[idx[0]], p[idx[1]]);
-    }
-
-    /*
-     * Load 128-bits of integer data from memory into dst.
-     * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
-     * Return [p[0],p[1]] int64_t
-     */
-    static INLINE PURE vect_t load(const scalar_t *const p) {
-        return _mm_load_si128(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Load 128-bits of integer data from memory into dst.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0],p[1]] int64_t
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) {
-        return _mm_loadu_si128(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Store 128-bits of integer data from a into memory.
-     * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, vect_t v) {
-        _mm_store_si128(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 128-bits of integer data from a into memory.
-     * p does not need to be aligned on any particular boundary.
-     */
-    static INLINE void storeu(const scalar_t *p, vect_t v) {
-        _mm_storeu_si128(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 128-bits of integer data from a into memory using a non-temporal memory hint.
-     * p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
-     */
-    // static INLINE void stream(scalar_t *p, const vect_t v) { _mm_stream_si128(static_cast<vect_t *>(p), v); }
-
-    /*
-     * Add packed 64-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1] int64_t
-     [b0, b1] int64_t
-     * Return : [a0+b0, a1+b1]   int64_t
-     */
-    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_epi64(a, b); }
-
-    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
-
-    /*
-     * Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in vect_t.
-     * Args   : [a0, a1] int64_t
-     [b0, b1] int64_t
-     * Return : [a0-b0, a1-b1]  int64_t
-     */
-    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_epi64(a, b); }
-
-    static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
-
-    /*
-     * Shift packed 64-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3] int64_t
-     * Return : [a0 << s, a1 << s, a2 << s, a3 << s] int64_t
-     */
-    static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm_slli_epi64(a, s); }
-
-    /*
-     * Shift packed 64-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3] int64_t
-     * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s] int64_t
-     */
-    static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm_srli_epi64(a, s); }
-
-    static INLINE CONST vect_t sra(const vect_t a, const int s) {
-#ifdef __AVX512__
-        return _mm_sra_epi64(a, set1(s));
+template <> struct Simd128_impl<true, true, true, 8> : public Simd128i_base {
+
+	/*
+	* alias to 128 bit simd register
+	*/
+	using vect_t = __m128i;
+
+	/*
+	* define the scalar type corresponding to the specialization
+	*/
+	using scalar_t = int64_t;
+
+	/*
+	*  number of scalar_t in a simd register
+	*/
+	static const constexpr size_t vect_size = 2;
+
+	/*
+	*  alignment required by a scalar_t pointer to be loaded in a vect_t
+	*/
+	static const constexpr size_t alignment = 16;
+
+	/*
+	* Check if the pointer p is a multiple of alignment
+	*/
+	template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
+
+	/*
+	* Check if the number n is a multiple of vect_size
+	*/
+	template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
+
+	/*
+	* Converter from vect_t to an array.
+	* example:
+	*	Converter conv;
+	*	conv.v = a;
+	*	scalar_t x = conv.t[1];
+	*/
+	union Converter {
+		vect_t v;
+		scalar_t t[vect_size];
+	};
+
+	/*
+	*  Broadcast 64-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastq.
+	*  Return [x,x] int64_t
+	*/
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi64x(x); }
+
+	/*
+	*  Set packed 64-bit integers in dst with the supplied values.
+	*  Return [x0,x1] int64_t
+	*/
+	static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1) { return _mm_set_epi64x(x1, x0); }
+
+	/*
+	*  Gather 64-bit integer elements with indexes idx[0], idx[1] from the address p in vect_t.
+	*  Return [p[idx[0]], p[idx[1]]] int64_t
+	*/
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return set(p[idx[0]], p[idx[1]]);
+	}
+
+	/*
+	* Load 128-bits of integer data from memory into dst.
+	* p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	* Return [p[0],p[1]] int64_t
+	*/
+	static INLINE PURE vect_t load(const scalar_t *const p) {
+		return _mm_load_si128(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Load 128-bits of integer data from memory into dst.
+	* p does not need to be aligned on any particular boundary.
+	* Return [p[0],p[1]] int64_t
+	*/
+	static INLINE PURE vect_t loadu(const scalar_t *const p) {
+		return _mm_loadu_si128(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Store 128-bits of integer data from a into memory.
+	* p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	*/
+	static INLINE void store(scalar_t *p, vect_t v) {
+		_mm_store_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 128-bits of integer data from a into memory.
+	* p does not need to be aligned on any particular boundary.
+	*/
+	static INLINE void storeu(scalar_t *p, vect_t v) {
+		_mm_storeu_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 128-bits of integer data from a into memory using a non-temporal memory hint.
+	* p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+	*/
+	static INLINE void stream(scalar_t *p, const vect_t v) {
+		_mm_stream_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Shift packed 64-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0, a1] int64_t
+	* Return : [a0 << s, a1 << s] int64_t
+	*/
+	static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm_slli_epi64(a, s); }
+
+	/*
+	* Shift packed 64-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0, a1] int64_t
+	* Return : [a0 >> s, a1 >> s] int64_t
+	*/
+	static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm_srli_epi64(a, s); }
+
+	/*
+	* Shift packed 64-bit integers in a right by s while shifting in sign bits, and store the results in vect_t.
+	* Args   : [a0, a1] int64_t
+	* Return : [a0 >> s, a1 >> s] int64_t
+	*/
+	static INLINE CONST vect_t sra(const vect_t a, const int s) {
+#ifdef __FFLASFFPACK_HAVE_AVX512F_INSTRUCTIONS
+		return _mm_srai_epi64(a, s);
 #else
-        const int b = 63 - s;
-        vect_t m = sll(set1(1), b);
-        vect_t x = srl(a, s);
-        vect_t result = sub(vxor(x, m), m); // result = x^m - m
-        return result;
-#endif // 512
-    }
-
-    /*
-     * Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64
-     bits of the intermediate integers in vect_t.
-     * Args   : [a0, a1]           int64_t
-     [b0, b1]           int64_t
-     * Return : [a0*b0 mod 2^16-1, a1*b1 mod 2^16-1] int64_t
-     */
-    static INLINE CONST vect_t mullo(const vect_t x0, const vect_t x1) {
-        // _mm_mullo_epi32 emul
-        // #pragma warning "The simd mullo function is emulate, it may impact the performances."
-
-        Converter c0, c1;
-        c0.v = x0;
-        c1.v = x1;
-        return set((scalar_t)(c0.t[0] * c1.t[0]), (scalar_t)(c0.t[1] * c1.t[1]));
-    }
-
-    static INLINE CONST vect_t mullox(const vect_t x0, const vect_t x1) { return _mm_mullo_epi32(x0, x1); }
-
-    /*
-     * Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64
-     bits of the intermediate integers in vect_t.
-     * Args   : [a0, a1]           int64_t
-     [b0, b1]           int64_t
-     * Return : [a0*b0 mod 2^16-1, a1*b1 mod 2^16-1] int64_t
-     */
-    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); }
-
-    static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) {
-// #pragma warning "The simd mulhi function is emulate, it may impact the performances."
-#ifdef __X86_64__
-        Converter c0, c1;
-        c0.v = a;
-        c1.v = b;
-        return set((scalar_t)((int128_t(c0.t[0]) * c1.t[0]) >> 64), (scalar_t)((int128_t(c0.t[1]) * c1.t[1]) >> 64));
+		const int b = 63 - s;
+		vect_t m = sll(set1(1), b);
+		vect_t x = srl(a, s);
+		vect_t result = sub(vxor(x, m), m); // result = x^m - m
+		return result;
+#endif // __FFLASFFPACK_HAVE_AVX512F_INSTRUCTIONS
+	}
+
+	/*
+	* Shuffle 64-bit integers in a using the control in imm8, and store the results in dst.
+	* Args   : [a0, a1] int64_t
+	* Return : [a[s[0]], a[s[1]]] int64_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t shuffle(const vect_t a) {
+		// Transform s = [d1 d0]_base2 to s1 = [2*d1+1 2*d1 2*d0+1 2*d0]_base4
+		constexpr uint8_t s1 = ((s & 1)?(3*4+2):(1*4+0))+16*((s & 2)?(3*4+2):(1*4+0));
+		return _mm_shuffle_epi32(a, s1);
+	}
+
+	/*
+	* Unpack and interleave 64-bit integers from the low half of a and b, and store the results in dst.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [a0, b0] int64_t
+	*/
+	static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { return _mm_unpacklo_epi64(a, b); }
+
+	/*
+	* Unpack and interleave 64-bit integers from the high half of a and b, and store the results in dst.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [a1, b1] int64_t
+	*/
+	static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { return _mm_unpackhi_epi64(a, b); }
+
+	/*
+	* Blend packed 64-bit integers from a and b using control mask imm8, and store the results in dst.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [s[0]?a0:b0, s[1]?a1:b1] int64_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t blend(const vect_t a, const vect_t b) {
+		// _mm_blend_epi16 is faster than _mm_blend_epi32 and requires SSE4.1 instead of AVX2
+		// We have to transform s = [d1 d0]_base2 to s1 = [d1 d1 d1 d1 d0 d0 d0 d0]_base2
+		constexpr uint8_t s1 = (s & 0x1) * 15 + ((s & 0x2) << 3) * 15;
+		return _mm_blend_epi16(a, b, s1);
+	}
+
+	/*
+	* Add packed 64-bits integer in a and b, and store the results in vect_t.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [a0+b0, a1+b1]   int64_t
+	*/
+	static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_epi64(a, b); }
+
+	static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
+
+	/*
+	* Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in vect_t.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [a0-b0, a1-b1]  int64_t
+	*/
+	static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_epi64(a, b); }
+
+	static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
+
+	/*
+	* Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64
+	bits of the intermediate integers in vect_t.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [a0*b0 smod 2^64, a1*b1 smod 2^64] int64_t
+	*	   where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2
+	*/
+	static INLINE CONST vect_t mullo(const vect_t x0, const vect_t x1) {
+#ifdef __FFLASFFPACK_HAVE_AVX512F_INSTRUCTIONS
+		return _mm_mullo_epi64(x0, x1);
 #else
-        return zero();
+		// _mm_mullo_epi64 emul
+		//#pragma warning "The simd mullo function is emulated, it may impact the performances."
+		Converter c0, c1;
+		c0.v = x0;
+		c1.v = x1;
+		return set((scalar_t)(c0.t[0] * c1.t[0]), (scalar_t)(c0.t[1] * c1.t[1]));
+#endif // __FFLASFFPACK_HAVE_AVX512F_INSTRUCTIONS
+	}
+
+	static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); }
+
+	/*
+	* Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the high 64
+	bits of the intermediate integers in vect_t.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [Floor(a0*b0/2^64), Floor(a1*b1/2^64)] int64_t
+	*/
+#ifdef __FFLASFFPACK_HAVE_INT128
+	static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) {
+		//#pragma warning "The simd mulhi function is emulated, it may impact the performances."
+		Converter c0, c1;
+		c0.v = a;
+		c1.v = b;
+		return set((scalar_t)((int128_t(c0.t[0]) * c1.t[0]) >> 64), (scalar_t)((int128_t(c0.t[1]) * c1.t[1]) >> 64));
+	}
 #endif
-    }
-
-    static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); }
-
-    static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
-
-    static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
-
-    static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(mul(a, b), c); }
 
-    static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { return _mm_mul_epi32(a, b); }
-
-    static INLINE CONST vect_t mulux(const vect_t a, const vect_t b) { return _mm_mul_epu32(a, b); }
-
-    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_epi64(a, b); }
-
-    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) {
-#ifdef __SSE4_2__
-        return _mm_cmpgt_epi64(a, b);
+	/*
+	* Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results
+	in vect_t.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [(a0 smod 2^32)*(b0 smod 2^32), (a1 smod 2^32)*(b1 smod 2^32)]	int64_t
+	*	   where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2
+	*/
+	static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { return _mm_mul_epi32(a, b); }
+
+	/*
+	* Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers,
+	* keep the low 64 bits of the intermediate and add the low 64-bits of c.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+			   [c0, c1] int64_t
+	* Return : [(a0*b0+c0) smod 2^64, (a1*b1+c1) smod 2^64]	int64_t
+	*/
+	static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); }
+
+	static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
+
+	/*
+	* Multiply the low 32-bit integers from each packed 64-bit element in a and b,
+	* keep the signed 64-bit results and add the low 64-bits of c.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+			   [c0, c1] int64_t
+	* Return : [((a0 smod 2^32)*(b0 smod 2^32)+c0) smod 2^64,
+	*		 ((a1 smod 2^32)*(b1 smod 2^32)+c1) smod 2^64]	int64_t
+	*/
+	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
+
+	static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
+
+	/*
+	* Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers,
+	* and subtract the low 64 bits of the intermediate from elements of c.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+			   [c0, c1] int64_t
+	* Return : [(-a0*b0+c0) smod 2^64, (-a1*b1+c1) smod 2^64]	int64_t
+	*/
+	static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
+
+	static INLINE vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
+
+	/*
+	* Multiply the low 32-bit integers from each packed 64-bit element in a and b,
+	* keep the signed 64-bit results and subtract them from elements of c.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+			   [c0, c1] int64_t
+	* Return : [(-(a0 smod 2^32)*(b0 smod 2^32)+c0) smod 2^64,
+	*		 (-(a1 smod 2^32)*(b1 smod 2^32)+c1) smod 2^64]	int64_t
+	*/
+	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
+
+	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
+
+	/*
+	* Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers,
+	* and subtract elements of c from the low 64 bits of the intermediate.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+			   [c0, c1] int64_t
+	* Return : [(a0*b0-c0) smod 2^64, (a1*b1-c1) smod 2^64]	int64_t
+	*/
+	static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(mul(a, b), c); }
+
+	static INLINE vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
+
+	/*
+	* Multiply the low 32-bit integers from each packed 64-bit element in a and b,
+	* keep the signed 64-bit results and subtract elements of c from them.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+			   [c0, c1] int64_t
+	* Return : [((a0 smod 2^32)*(b0 smod 2^32)-c0) smod 2^64,
+	*		 ((a1 smod 2^32)*(b1 smod 2^32)-c1) smod 2^64]	int64_t
+	*/
+	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }
+
+	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }
+
+	/*
+	* Compare packed 64-bits in a and b for equality, and store the results in vect_t.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [(a0==b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFFFFFFFFFF : 0]	int64_t
+	*/
+	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_epi64(a, b); }
+
+	/*
+	* Compare packed 64-bits in a and b for greater-than, and store the results in vect_t.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [(a0>b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>b1) ? 0xFFFFFFFFFFFFFFFF : 0]	int64_t
+	*/
+	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) {
+#ifdef __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS
+		return _mm_cmpgt_epi64(a, b);
 #else
-#warning "The simd greater function is emulate, it may impact the performances."
-        Converter ca, cb;
-        ca.v = a;
-        cb.v = b;
-        return set((ca.t[0] > cb.t[0]) ? 0xFFFFFFFFFFFFFFFF : 0, (ca.t[1] > cb.t[1]) ? 0xFFFFFFFFFFFFFFFF : 0);
-#endif // __SSE4_2__
-    }
-
-    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) {
-#ifdef __SSE4_2__
-        return _mm_cmpgt_epi64(b, a);
+		//#warning "The simd greater function is emulate, it may impact the performances."
+		Converter ca, cb;
+		ca.v = a;
+		cb.v = b;
+		return set((ca.t[0] > cb.t[0]) ? 0xFFFFFFFFFFFFFFFF : 0, (ca.t[1] > cb.t[1]) ? 0xFFFFFFFFFFFFFFFF : 0);
+#endif // __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS
+	}
+
+	/*
+	* Compare packed 64-bits in a and b for lesser-than, and store the results in vect_t.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [(a0<b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1<b1) ? 0xFFFFFFFFFFFFFFFF : 0]	int64_t
+	*/
+	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) {
+#ifdef __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS
+		return _mm_cmpgt_epi64(b, a);
 #else
-#warning "The simd lesser function is emulate, it may impact the performances."
-        Converter ca, cb;
-        ca.v = a;
-        cb.v = b;
-        return set((ca.t[0] < cb.t[0]) ? 0xFFFFFFFFFFFFFFFF : 0, (ca.t[1] < cb.t[1]) ? 0xFFFFFFFFFFFFFFFF : 0);
-#endif // __SSE4_2__
-    }
-
-    /*
-     * Compare packed 64-bits in a and b for greater or equal than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int64_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int64_t
-     * Return : [(a0>=b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a2>=b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3>=b3) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a4>=b4) ? 0xFFFFFFFFFFFFFFFF : 0, (a5>=b5) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a6>=b6) ? 0xFFFFFFFFFFFFFFFF : 0, (a7>=b7) ? 0xFFFFFFFFFFFFFFFF : 0]                    int64_t
-     */
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
-
-    /*
-     * Compare packed 64-bits in a and b for lesser or equal than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int64_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int64_t
-     * Return : [(a0<=b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1<=b1) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a2<=b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3<=b3) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a4<=b4) ? 0xFFFFFFFFFFFFFFFF : 0, (a5<=b5) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a6<=b6) ? 0xFFFFFFFFFFFFFFFF : 0, (a7<=b7) ? 0xFFFFFFFFFFFFFFFF : 0]                     int64_t
-     */
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
-
-    /*
-     * Compute the bitwise AND of packed 64-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3, a4 AND b4, a5 AND b5, a6 AND b6, a7 AND b7]
-     */
-    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm_and_si128(a, b); }
-
-    /*
-     * Compute the bitwise OR of packed 64-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3, a4 OR b4, a5 OR b5, a6 OR b6, a7 OR b7]
-     */
-    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm_or_si128(a, b); }
-
-    /*
-     * Compute the bitwise XOR of packed 64-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3, a4 XOR b4, a5 XOR b5, a6 XOR b6, a7 XOR b7]
-     */
-    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm_xor_si128(b, a); }
-
-    /*
-     * Compute the bitwise AND NOT of packed 64-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 ANDNOT b0, a1 ANDNOT b1, a2 ANDNOT b2, a3 ANDNOT b3, a4 ANDNOT b4, a5 ANDNOT b5, a6 ANDNOT b6, a7
-     ANDNOT b7]
-     */
-    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm_andnot_si128(b, a); }
-
-    /*
-     * Horizontally add 64-bits elements of a.
-     * Args   : [a0, a1, a2, a3]
-     * Return : a0+a1+a2+a3
-     */
-    static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
-        Converter c;
-        c.v = a;
-        return c.t[0] + c.t[1];
-    }
-
-    static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
-
-    static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
-
-    static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
-
-    static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
-
-    static INLINE CONST vect_t round(const vect_t a) { return a; }
-
-    // mask the high 32 bits of a 64 bits, that is 00000000FFFFFFFF
-    static INLINE CONST vect_t mask_high() { return srl(_mm_set1_epi8(-1), 32); }
-
-    static INLINE CONST vect_t signbits(const vect_t x) {
-        vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1));
-        return signBits;
-    }
-
-    // warning : may be off by 1 multiple, but we save a mul...
-    static INLINE CONST vect_t mulhi_fast(vect_t x, vect_t y) {
-        // unsigned mulhi starts:
-        // x1 = xy_high = mulhiu_fast(x,y)
-        const vect_t mask = mask_high();
-
-        vect_t x0 = vand(x, mask), x1 = srl(x, 32);
-        vect_t y0 = vand(y, mask), y1 = srl(y, 32);
-
-        x0 = mulux(x0, y1); // x0y1
-        y0 = mulux(x1, y0); // x1y0
-        y1 = mulux(x1, y1); // x1y1
-
-        x1 = vand(y0, mask);
-        y0 = srl(y0, 32); // x1y0_lo = x1 // y1yo_hi = y0
-        x1 = srl(add(x1, x0), 32);
-        y0 = add(y1, y0);
-
-        x1 = add(x1, y0);
-        // unsigned mulhi ends
-
-        // fixing signs
-        x0 = vand(signbits(x), y);
-        x1 = sub(x1, x0);
-        x0 = vand(signbits(y), x);
-        x1 = sub(x1, x0);
-        // end fixing
-        return x1;
-    }
-
-    template <bool overflow, bool poweroftwo>
-    static INLINE vect_t mod(vect_t &C, const vect_t &P, const int8_t &shifter, const vect_t &magic, const vect_t &NEGP,
-                             const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) {
-#ifdef __INTEL_COMPILER
-        // Works fine with ICC 15.0.1 - A.B.
-        // #warning "not tested"
-        C = _mm_rem_epi64(C, P);
-#else
-        if (poweroftwo) {
-            Q = srl(C, 63);
-            vect_t un = set1(1);
-            T = sub(sll(un, shifter), un);
-            Q = add(C, vand(Q, T));
-            Q = sll(srl(Q, shifter), shifter);
-            C = sub(C, Q);
-            Q = vand(greater(zero(), Q), P);
-            C = add(C, Q);
-        } else {
-            Q = mulhi_fast(C, magic);
-            if (overflow) {
-                Q = add(Q, C);
-            }
-            Q = sra(Q, shifter);
-            vect_t q1 = mulux(Q, P);
-            vect_t q2 = sll(mulux(srl(Q, 32), P), 32);
-            C = sub(C, add(q1, q2));
-            T = greater_eq(C, P);
-            C = sub(C, vand(T, P));
-        }
-#endif
-        NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
-        return C;
-    }
+		//#warning "The simd lesser function is emulate, it may impact the performances."
+		Converter ca, cb;
+		ca.v = a;
+		cb.v = b;
+		return set((ca.t[0] < cb.t[0]) ? 0xFFFFFFFFFFFFFFFF : 0, (ca.t[1] < cb.t[1]) ? 0xFFFFFFFFFFFFFFFF : 0);
+#endif // __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS
+	}
+
+	/*
+	* Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in vect_t.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [(a0>=b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0]	int64_t
+	*/
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+
+	/*
+	* Compare packed 64-bit integers in a and b for lesser-than-or-equal, and store the results in vect_t.
+	* Args   : [a0, a1] int64_t
+			   [b0, b1] int64_t
+	* Return : [(a0<=b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1<=b1) ? 0xFFFFFFFFFFFFFFFF : 0]	int64_t
+	*/
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
+
+	/*
+	* Horizontally add 64-bits elements of a.
+	* Args   : [a0, a1]	int64_t
+	* Return : a0+a1	int64_t
+	*/
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		Converter conv;
+		conv.v = a;
+		return scalar_t(conv.t[0] + conv.t[1]);
+	}
+
+	static INLINE CONST vect_t round(const vect_t a) { return a; }
+
+	static INLINE CONST vect_t signbits(const vect_t x) {
+		vect_t signBits = sub(zero(), srl(x, 8*sizeof(scalar_t)-1));
+		return signBits;
+	}
+
+	// mask the high 32 bits of a 64 bits, that is 00000000FFFFFFFF
+	static INLINE CONST vect_t mask_high() { return srl(_mm_set1_epi8(-1), 32); }
+
+	static INLINE CONST vect_t mulhi_fast(vect_t x, vect_t y);
+
+	template <bool overflow, bool poweroftwo>
+	static INLINE vect_t mod(vect_t &C, const vect_t &P, const int8_t &shifter, const vect_t &magic, const vect_t &NEGP,
+							 const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T);
+}; // Simd128_impl<true, true, true, 8>
 
-#else
-
-#error "You need SSE instructions to perform 128 bits operations on int64"
-
-#endif // __FFLASFFPACK_USE_SIMD
-};
-
-// uint64_t
+/*
+ * Simd128 specialized for uint64_t
+ */
 template <> struct Simd128_impl<true, true, false, 8> : public Simd128_impl<true, true, true, 8> {
-    using scalar_t = uint64_t;
-
-    /*
-    * Load 128-bits of unsigned integer data from memory into dst.
-    * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-    * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int16_t
-    */
-    static INLINE PURE vect_t load(const scalar_t *const p) {
-        return _mm_load_si128(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Load 128-bits of unsigned integer data from memory into dst.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int16_t
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) {
-        return _mm_loadu_si128(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Store 128-bits of unsigned integer data from a into memory.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, vect_t v) {
-        _mm_store_si128(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    static INLINE CONST vect_t greater(vect_t a, vect_t b) {
-#ifdef __SSE4_2__
-        vect_t x;
-        x = set1(-(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
-        a = sub(x, a);
-        b = sub(x, b);
-        return _mm_cmpgt_epi64(a, b);
+
+	/*
+	* define the scalar type corresponding to the specialization
+	*/
+	using scalar_t = uint64_t;
+
+	/*
+	 * Converter from vect_t to an array.
+	 * example:
+	 *	Converter conv;
+	 *	conv.v = a;
+	 *	scalar_t x = conv.t[1];
+	 */
+	union Converter {
+		vect_t v;
+		scalar_t t[vect_size];
+	};
+
+	/*
+	 *  Broadcast 64-bit unsigned integer a to all elements of dst. This intrinsic may generate the vpbroadcastq.
+	 *  Return [x,x] uint64_t
+	 */
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi64x(x); }
+
+	/*
+	 *  Set packed 64-bit integers in dst with the supplied values.
+	 *  Return [x0,x1] uint64_t
+	 */
+	static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1) { return _mm_set_epi64x(x1, x0); }
+
+	/*
+	 *  Gather 64-bit unsigned integer elements with indexes idx[0], idx[1] from the address p in vect_t.
+	 *  Return [p[idx[0]], p[idx[1]]] uint64_t
+	 */
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return set(p[idx[0]], p[idx[1]]);
+	}
+
+	/*
+	 * Load 128-bits of unsigned integer data from memory into dst.
+	 * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	 * Return [p[0],p[1]] uint64_t
+	 */
+	static INLINE PURE vect_t load(const scalar_t *const p) {
+		return _mm_load_si128(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	 * Load 128-bits of unsigned integer data from memory into dst.
+	 * p does not need to be aligned on any particular boundary.
+	 * Return [p[0],p[1]] uint64_t
+	 */
+	static INLINE PURE vect_t loadu(const scalar_t *const p) {
+		return _mm_loadu_si128(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	 * Store 128-bits of unsigned integer data from a into memory.
+	 * p must be aligned on a 16-byte boundary or a general-protection exception will be generated.
+	 */
+	static INLINE void store(scalar_t *p, vect_t v) {
+		_mm_store_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	 * Store 128-bits of unsigned integer data from a into memory.
+	 * p does not need to be aligned on any particular boundary.
+	 */
+	static INLINE void storeu(scalar_t *p, vect_t v) {
+		_mm_storeu_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	 * Store 128-bits of unsigned integer data from a into memory using a non-temporal memory hint.
+	 * p must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+	 */
+	static INLINE void stream(scalar_t *p, const vect_t v) {
+		_mm_stream_si128(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Shift packed 64-bit unsigned integers in a right by s while shifting in zeros (logical shift), and store the results in vect_t.
+	 * Args   : [a0, a1]				uint64_t
+	 * Return : [Floor(a0/2^s), Floor(a1/2^s)]	uint64_t
+	*/
+	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm_srli_epi64(a, s); }
+
+	static INLINE CONST vect_t greater(vect_t a, vect_t b) {
+#ifdef __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS
+		vect_t x;
+		x = set1(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1));
+		a = sub(a, x);
+		b = sub(b, x);
+		return _mm_cmpgt_epi64(a, b);
 #else
-#warning "The simd greater function is emulate, it may impact the performances."
-        Converter ca, cb;
-        ca.v = a;
-        cb.v = b;
-        return set((ca.t[0] > cb.t[0]) ? 0xFFFFFFFFFFFFFFFF : 0, (ca.t[1] > cb.t[1]) ? 0xFFFFFFFFFFFFFFFF : 0);
-#endif
-    }
-
-    static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
-#ifdef __SSE4_2__
-        vect_t x;
-        x = set1(-(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
-        a = sub(x, a);
-        b = sub(x, b);
-        return _mm_cmpgt_epi64(a, b);
+		//#pragma warning "The simd greater function is emulated, it may impact the performances."
+		Converter ca, cb;
+		ca.v = a;
+		cb.v = b;
+		return set((ca.t[0] > cb.t[0]) ? 0xFFFFFFFFFFFFFFFF : 0, (ca.t[1] > cb.t[1]) ? 0xFFFFFFFFFFFFFFFF : 0);
+#endif // __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS
+	}
+
+	static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
+#ifdef __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS
+		vect_t x;
+		x = set1(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1));
+		a = sub(a, x);
+		b = sub(b, x);
+		return _mm_cmpgt_epi64(b, a);
 #else
-#warning "The simd greater function is emulate, it may impact the performances."
-        Converter ca, cb;
-        ca.v = a;
-        cb.v = b;
-        return set((ca.t[0] < cb.t[0]) ? 0xFFFFFFFFFFFFFFFF : 0, (ca.t[1] < cb.t[1]) ? 0xFFFFFFFFFFFFFFFF : 0);
+		//#pragma warning "The simd lesser function is emulated, it may impact the performances."
+		Converter ca, cb;
+		ca.v = a;
+		cb.v = b;
+		return set((ca.t[0] < cb.t[0]) ? 0xFFFFFFFFFFFFFFFF : 0, (ca.t[1] < cb.t[1]) ? 0xFFFFFFFFFFFFFFFF : 0);
+#endif // __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS
+	}
+
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
+
+	/*
+	* Multiply the packed 64-bit unsigned integers in a and b, producing intermediate 128-bit integers, and store the low 64
+	bits of the intermediate integers in vect_t.
+	* Args   : [a0, a1] uint64_t
+			   [b0, b1] uint64_t
+	* Return : [a0*b0 mod 2^64, a1*b1 mod 2^64] uint64_t
+	*/
+	static INLINE CONST vect_t mullo(const vect_t x0, const vect_t x1) {
+		// _mm_mullo_epi64 emul
+		//#pragma warning "The simd mullo function is emulated, it may impact the performances."
+		Converter c0, c1;
+		c0.v = x0;
+		c1.v = x1;
+		return set((scalar_t)(c0.t[0] * c1.t[0]), (scalar_t)(c0.t[1] * c1.t[1]));
+	}
+
+	/*
+	* Multiply the packed unsigned 64-bit integers in a and b, producing intermediate 128-bit integers,
+	* and store the high 64 bits of the intermediate integers in vect_t.
+	* Args   : [a0, a1] uint64_t
+			   [b0, b1] uint64_t
+	* Return : [Floor(a0*b0/2^64), Floor(a1*b1/2^64)] uint64_t
+	*/
+#ifdef __FFLASFFPACK_HAVE_INT128
+	static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) {
+		//#pragma warning "The simd mulhi function is emulated, it may impact the performances."
+		Converter c0, c1;
+		c0.v = a;
+		c1.v = b;
+		return set((scalar_t)((uint128_t(c0.t[0]) * c1.t[0]) >> 64), (scalar_t)((uint128_t(c0.t[1]) * c1.t[1]) >> 64));
+	}
 #endif
-    }
 
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+	/*
+	* Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results
+	in vect_t.
+	* Args   : [a0, a1] uint64_t
+			   [b0, b1] uint64_t
+	* Return : [(a0 mod 2^32)*(b0 mod 2^32), (a1 mod 2^32)*(b1 mod 2^32)]	uint64_t
+	*/
+	static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { return _mm_mul_epu32(a, b); }
+
+	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
+
+	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
+
+	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
+
+	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }
+
+	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }
+
+	/*
+	* Horizontally add 64-bits elements of a.
+	* Args   : [a0, a1]	uint64_t
+	* Return : a0+a1	uint64_t
+	*/
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		Converter c;
+		c.v = a;
+		return c.t[0] + c.t[1];
+	}
+}; //Simd128_impl<true,true,false,8>
+
+#define vect_t Simd128_impl<true,true,true,8>::vect_t
+
+// warning : may be off by 1 multiple, but we save a mul...
+INLINE CONST vect_t Simd128_impl<true,true,true,8>::mulhi_fast(vect_t x, vect_t y) {
+	// unsigned mulhi starts:
+	// x1 = xy_high = mulhiu_fast(x,y)
+	const vect_t mask = mask_high();
+
+	vect_t x0 = vand(x, mask), x1 = srl(x, 32);
+	vect_t y0 = vand(y, mask), y1 = srl(y, 32);
+
+	x0 = Simd128_impl<true, true, false, 8>::mulx(x0, y1); // x0y1
+	y0 = Simd128_impl<true, true, false, 8>::mulx(x1, y0); // x1y0
+	y1 = Simd128_impl<true, true, false, 8>::mulx(x1, y1); // x1y1
+
+	x1 = vand(y0, mask);
+	y0 = srl(y0, 32); // x1y0_lo = x1 // y1yo_hi = y0
+	x1 = srl(add(x1, x0), 32);
+	y0 = add(y1, y0);
+
+	x1 = add(x1, y0);
+	// unsigned mulhi ends
+
+	// fixing signs
+	x0 = vand(signbits(x), y);
+	x1 = sub(x1, x0);
+	x0 = vand(signbits(y), x);
+	x1 = sub(x1, x0);
+	// end fixing
+	return x1;
+}
+
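For context on mulhi_fast above: it assembles the unsigned high word from the three cross products and then fixes the signs, using the identity that the signed high word equals the unsigned high word minus y when x is negative and minus x when y is negative (the vector version may additionally be off by one low-order carry, as its comment warns). A scalar sketch of the exact identity, with our own naming and assuming the 128-bit integer type used elsewhere in this file is available:

#ifdef __FFLASFFPACK_HAVE_INT128
// Exact scalar counterpart of the signed mulhi that mulhi_fast approximates.
static inline int64_t mulhi_scalar(int64_t x, int64_t y) {
	uint64_t hi = (uint64_t)(((uint128_t)(uint64_t)x * (uint64_t)y) >> 64); // unsigned high word
	if (x < 0) hi -= (uint64_t)y; // sign corrections, mirroring the signbits() masks above
	if (y < 0) hi -= (uint64_t)x;
	return (int64_t)hi;
}
#endif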
+// warning : may be off by 1 multiple, but we save a mul...
+template <bool overflow, bool poweroftwo>
+INLINE vect_t Simd128_impl<true,true,true,8>::mod(vect_t &C, const vect_t &P, const int8_t &shifter, const vect_t &magic, const vect_t &NEGP,
+														const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) {
+#ifdef __INTEL_COMPILER
+	// Works fine with ICC 15.0.1 - A.B.
+	// #warning "not tested"
+	C = _mm_rem_epi64(C, P);
+#else
+	if (poweroftwo) {
+		Q = srl(C, 63);
+		vect_t un = set1(1);
+		T = sub(sll(un, shifter), un);
+		Q = add(C, vand(Q, T));
+		Q = sll(srl(Q, shifter), shifter);
+		C = sub(C, Q);
+		Q = vand(greater(zero(), Q), P);
+		C = add(C, Q);
+	} else {
+		Q = mulhi_fast(C, magic);
+		if (overflow) {
+			Q = add(Q, C);
+		}
+		Q = sra(Q, shifter);
+		vect_t q1 = Simd128_impl<true, true, false, 8>::mulx(Q, P);
+		vect_t q2 = sll(Simd128_impl<true, true, false, 8>::mulx(srl(Q, 32), P), 32);
+		C = sub(C, add(q1, q2));
+		T = greater_eq(C, P);
+		C = sub(C, vand(T, P));
+	}
+#endif
+	NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
+	return C;
+}
 
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
-};
+#undef vect_t
 
 #endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_int64_INL
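The unsigned greater/lesser comparisons in this file rely on the classic bias trick: subtracting 2^63 from both operands (equivalently, flipping the sign bit) maps unsigned order onto signed order, so the signed _mm_cmpgt_epi64 intrinsic can be reused. A scalar model of the idea, illustrative only and with a name of our own:

// Scalar model of the biased unsigned comparison behind the SSE4.2 branches.
static inline bool greater_u64(uint64_t a, uint64_t b) {
	const uint64_t bias = uint64_t(1) << 63;	// flips the sign bit
	return (int64_t)(a - bias) > (int64_t)(b - bias);	// same truth value as a > b
}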
diff --git a/fflas-ffpack/fflas/fflas_simd/simd256.inl b/fflas-ffpack/fflas/fflas_simd/simd256.inl
index 028ea9d..ede276d 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd256.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd256.inl
@@ -1,5 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
@@ -30,24 +30,157 @@
 #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_INL
 #define __FFLASFFPACK_fflas_ffpack_utils_simd256_INL
 
+struct Simd256fp_base {
+#if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS)
+
+	/*
+	* Shuffle 128-bits selected by imm8 from a and b, and store the results in dst.
+	* Args   :	[a0, a1]
+	*			[b0, b1]
+	* Return : [s[0..3]?a0:a1:b0:b1, s[4..7]?a0:a1:b0:b1]
+	*/
+	template<int s>
+	static INLINE CONST __m256d permute128(const __m256d a, const __m256d b) {
+		return _mm256_permute2f128_pd(a, b, s);
+	}
+
+	template<int s>
+	static INLINE CONST __m256 permute128(const __m256 a, const __m256 b) {
+		return _mm256_permute2f128_ps(a, b, s);
+	}
+
+	/*
+	* Unpack and interleave 128-bit integers from the low half of a and b, and store the results in dst.
+	* Args   : [a0, a1] int128_t
+			   [b0, b1] int128_t
+	* Return : [a0, b0] int128_t
+	*/
+	static INLINE CONST __m256d unpacklo128(const __m256d a, const __m256d b) { return permute128<0x20>(a, b); }
+	static INLINE CONST __m256 unpacklo128(const __m256 a, const __m256 b) { return permute128<0x20>(a, b); }
+
+	/*
+	* Unpack and interleave 128-bit integers from the high half of a and b, and store the results in dst.
+	* Args   : [a0, a1] int128_t
+			   [b0, b1] int128_t
+	* Return : [a1, b1] int128_t
+	*/
+	static INLINE CONST __m256d unpackhi128(const __m256d a, const __m256d b) { return permute128<0x31>(a, b); }
+	static INLINE CONST __m256 unpackhi128(const __m256 a, const __m256 b) { return permute128<0x31>(a, b); }
+
+#endif
+};
+
+struct Simd256i_base {
+
+	/*
+	* alias to 256 bit simd register
+	*/
+	using vect_t = __m256i;
+
+	/*
+	*  Return vector of type vect_t with all elements set to zero
+	*  Return [0, ...,0]
+	*/
+	static INLINE CONST vect_t zero() { return _mm256_setzero_si256(); }
+
+#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
+
+	/*
+	* Shift packed 128-bit integers in a left by s bytes while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0, a1] int128_t
+	* Return : [a0 << (s*8), a1 << (s*8)] int128_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t sll128(const vect_t a) { return _mm256_bslli_epi128(a, s); }
+
+	/*
+	* Shift packed 128-bit integers in a right by s bytes while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0, a1] int128_t
+	* Return : [a0 >> (s*8), a1 >> (s*8)] int128_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t srl128(const vect_t a) { return _mm256_bsrli_epi128(a, s); }
+
+	/*
+	* Compute the bitwise AND and store the results in vect_t.
+	* Args   : [a0, ..., a255]
+	*		   [b0, ..., b255]
+	* Return : [a0 AND b0, ..., a255 AND b255]
+	*/
+	static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_si256(b, a); }
+
+	/*
+	* Compute the bitwise OR and store the results in vect_t.
+	* Args   : [a0, ..., a255]
+	*		   [b0, ..., b255]
+	* Return : [a0 OR b0, ..., a255 OR b255]
+	*/
+	static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_si256(b, a); }
+
+	/*
+	* Compute the bitwise XOR and store the results in vect_t.
+	* Args   : [a0, ..., a255]
+	*		   [b0, ..., b255]
+	* Return : [a0 XOR b0, ..., a255 XOR b255]
+	*/
+	static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_si256(b, a); }
+
+	/*
+	* Compute the bitwise AND NOT and store the results in vect_t.
+	* Args   : [a0, ..., a255]
+	*		   [b0, ..., b255]
+	* Return : [a0 AND (NOT b0), ..., a255 AND (NOT b255)]
+	*/
+	static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_si256(b, a); }
+
+	/*
+	* Shuffle 128-bit integers in a and b using the control in imm8, and store the results in dst.
+	* Args   :	[a0, a1] int128_t
+	*			[b0, b1] int128_t
+	* Return : [s[0..3]?a0:a1:b0:b1, s[4..7]?a0:a1:b0:b1] int128_t
+	*/
+	template<int s>
+	static INLINE CONST vect_t permute128(const vect_t a, const vect_t b) {
+		return _mm256_permute2x128_si256(a, b, s);
+	}
+
+	/*
+	* Unpack and interleave 128-bit integers from the low half of a and b, and store the results in dst.
+	* Args   : [a0, a1] int128_t
+			   [b0, b1] int128_t
+	* Return : [a0, b0] int128_t
+	*/
+	static INLINE CONST vect_t unpacklo128(const vect_t a, const vect_t b) { return permute128<0x20>(a, b); }
+
+	/*
+	* Unpack and interleave 128-bit integers from the high half of a and b, and store the results in dst.
+	* Args   : [a0, a1] int128_t
+			   [b0, b1] int128_t
+	* Return : [a1, b1] int128_t
+	*/
+	static INLINE CONST vect_t unpackhi128(const vect_t a, const vect_t b) { return permute128<0x31>(a, b); }
+#endif
+};
+
 template <bool ArithType, bool Int, bool Signed, int Size> struct Simd256_impl;
 
+template <class T>
+using Simd256 =
+	Simd256_impl<std::is_arithmetic<T>::value, std::is_integral<T>::value, std::is_signed<T>::value, sizeof(T)>;
+
 #include "simd256_float.inl"
 #include "simd256_double.inl"
 
 #ifdef SIMD_INT
-// Trop d'instructions SSE manquantes pour les int8_t
+// Too many missing instructions for int8_t
 
-#if defined(__FFLASFFPACK_USE_AVX2)
-#include "simd256_int16.inl"
-#include "simd256_int32.inl"
+#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
 #include "simd256_int64.inl"
+#include "simd256_int32.inl"
+#include "simd256_int16.inl"
 #endif
 
 #endif //#ifdef SIMD_INT
 
-template <class T>
-using Simd256 =
-    Simd256_impl<std::is_arithmetic<T>::value, std::is_integral<T>::value, std::is_signed<T>::value, sizeof(T)>;
 
 #endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_INL
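
The Simd256 alias above (now declared before the specializations are included) selects the right Simd256_impl purely from <type_traits> and sizeof. The following stand-alone sketch reproduces that dispatch pattern on a toy trait so the selection mechanism can be read in isolation; the struct and member names are invented for illustration and are not FFLAS-FFPACK API.

// Toy version of the trait-based dispatch used by Simd256<T>: a primary
// template keyed on (is_arithmetic, is_integral, is_signed, sizeof), plus one
// specialization per supported element type.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <type_traits>

template <bool Arith, bool Int, bool Signed, int Size> struct ToySimd256;   // primary, left undefined

template <> struct ToySimd256<true, false, true, 8> {                       // selected for double
	static const char *name() { return "double"; }
	static constexpr std::size_t vect_size = 256 / (8 * sizeof(double));    // 4 lanes
};
template <> struct ToySimd256<true, true, true, 8> {                        // selected for int64_t
	static const char *name() { return "int64_t"; }
	static constexpr std::size_t vect_size = 256 / (8 * sizeof(int64_t));   // 4 lanes
};

template <class T>
using ToySimd = ToySimd256<std::is_arithmetic<T>::value, std::is_integral<T>::value,
                           std::is_signed<T>::value, sizeof(T)>;

int main() {
	std::printf("%s -> %zu lanes\n", ToySimd<double>::name(), (std::size_t)ToySimd<double>::vect_size);
	std::printf("%s -> %zu lanes\n", ToySimd<int64_t>::name(), (std::size_t)ToySimd<int64_t>::vect_size);
	return 0;
}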
diff --git a/fflas-ffpack/fflas/fflas_simd/simd256_double.inl b/fflas-ffpack/fflas/fflas_simd/simd256_double.inl
index e8f176c..3cca53f 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd256_double.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd256_double.inl
@@ -1,5 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
@@ -30,355 +30,417 @@
 #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_double_INL
 #define __FFLASFFPACK_fflas_ffpack_utils_simd256_double_INL
 
+#if not (defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) or defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS))
+#error "You need AVX instructions to perform 256bits operations on double"
+#endif
+
 /*
  * Simd256 specialized for double
  */
-template <> struct Simd256_impl<true, false, true, 8> {
-#if defined(__FFLASFFPACK_USE_AVX) or defined(__FFLASFFPACK_USE_AVX2)
-
-    /*
-     * alias to 256 bit simd register
-     */
-    using vect_t = __m256d;
-
-    /*
-     * define the scalar type corresponding to the specialization
-     */
-    using scalar_t = double;
-
-    /*
-     *	number of scalar_t in a simd register
-     */
-    static const constexpr size_t vect_size = 4;
-
-    /*
-     *	alignement required by scalar_t pointer to be loaded in a vect_t
-     */
-    static const constexpr size_t alignment = 32;
-
-    /*
-     * Check if the pointer p is a multiple of alignemnt
-     */
-    template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
-
-    /*
-     * Check if the number n is a multiple of vect_size
-     */
-    template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
-
-    /*
-     *	Return vector of type vect_t with all elements set to zero
-     *  Return [0,0,0,0]
-     */
-    static INLINE CONST vect_t zero() { return _mm256_setzero_pd(); }
-
-    /*
-     *	Broadcast double-precision (64-bit) floating-point value x to all elements of vect_t.
-     *  Return [x,x,x,x]
-     */
-    static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_pd(x); }
-
-    /*
-     *	Set packed double-precision (64-bit) floating-point elements in vect_t with the supplied values.
-     *  Return [x1,x2,x3,x4]
-     */
-    static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4) {
-        return _mm256_set_pd(x4, x3, x2, x1);
-    }
-
-    /*
-     *	Gather double-precision (64-bit) floating-point elements with indexes idx[0], ..., idx[3] from the address p in
-     *vect_t.
-     *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]]
-     */
-    template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
-        // TODO AVX2 Gather
-        return _mm256_set_pd(p[idx[3]], p[idx[2]], p[idx[1]], p[idx[0]]);
-    }
-
-    /*
-     * Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into vect_t.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     * Return [p[0], p[1], p[2], p[3]]
-     */
-    static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_pd(p); }
-
-    /*
-     * Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into vect_t.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0], p[1], p[2], p[3]]
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_pd(p); }
-
-    /*
-     * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from p into memory.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, const vect_t v) { _mm256_store_pd(const_cast<scalar_t *>(p), v); }
-
-    /*
-     * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from p into memory.
-     * p does not need to be aligned on any particular boundary.
-     */
-    static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm256_storeu_pd(const_cast<scalar_t *>(p), v); }
-
-    /*
-     * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a into memory using
-     * a non-temporal memory hint.
-     * p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
-     */
-    static INLINE void stream(const scalar_t *p, const vect_t v) { _mm256_stream_pd(const_cast<scalar_t *>(p), v); }
-
-    /*
-     * Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0+b0, a1+b1, a2+b2, a3+b3]
-     */
-    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_pd(a, b); }
-
-    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
-
-    /*
-     * Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit)
-     * floating-point elements in a, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0-b0, a1-b1, a2-b2, a3-b3]
-     */
-    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_pd(a, b); }
-
-    static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0*b0, a1*b1, a2*b2, a3*b3]
-     */
-    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm256_mul_pd(a, b); }
-
-    static INLINE CONST vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to
-     * packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
-     * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3]
-     */
-    static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) {
+template <> struct Simd256_impl<true, false, true, 8> : public Simd256fp_base {
+	/*
+	 * alias to 256 bit simd register
+	 */
+	using vect_t = __m256d;
+
+	/*
+	 * define the scalar type corresponding to the specialization
+	 */
+	using scalar_t = double;
+
+	/*
+	 *	number of scalar_t in a simd register
+	 */
+	static const constexpr size_t vect_size = 4;
+
+	/*
+	 *	alignment required by a scalar_t pointer to be loaded in a vect_t
+	 */
+	static const constexpr size_t alignment = 32;
+
+	/*
+	 * Check if the pointer p is a multiple of alignment
+	 */
+	template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
+
+	/*
+	 * Check if the number n is a multiple of vect_size
+	 */
+	template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
+
+	/*
+	 *	Return vector of type vect_t with all elements set to zero
+	 *  Return [0,0,0,0]
+	 */
+	static INLINE CONST vect_t zero() { return _mm256_setzero_pd(); }
+
+	/*
+	 *	Broadcast double-precision (64-bit) floating-point value x to all elements of vect_t.
+	 *  Return [x,x,x,x]
+	 */
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_pd(x); }
+
+	/*
+	 *	Set packed double-precision (64-bit) floating-point elements in vect_t with the supplied values.
+	 *  Return [x1,x2,x3,x4]
+	 */
+	static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4) {
+		return _mm256_set_pd(x4, x3, x2, x1);
+	}
+
+	/*
+	 *	Gather double-precision (64-bit) floating-point elements with indexes idx[0], ..., idx[3] from the address p in
+	 *vect_t.
+	 *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]]
+	 */
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		// TODO AVX2 Gather
+		return _mm256_set_pd(p[idx[3]], p[idx[2]], p[idx[1]], p[idx[0]]);
+	}
+
+	/*
+	 * Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into vect_t.
+	 * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	 * Return [p[0], p[1], p[2], p[3]]
+	 */
+	static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_pd(p); }
+
+	/*
+	 * Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into vect_t.
+	 * p does not need to be aligned on any particular boundary.
+	 * Return [p[0], p[1], p[2], p[3]]
+	 */
+	static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_pd(p); }
+
+	/*
+	 * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from p into memory.
+	 * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	 */
+	static INLINE void store(const scalar_t *p, const vect_t v) { _mm256_store_pd(const_cast<scalar_t *>(p), v); }
+
+	/*
+	 * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from p into memory.
+	 * p does not need to be aligned on any particular boundary.
+	 */
+	static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm256_storeu_pd(const_cast<scalar_t *>(p), v); }
+
+	/*
+	 * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a into memory using
+	 * a non-temporal memory hint.
+	 * p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+	 */
+	static INLINE void stream(const scalar_t *p, const vect_t v) { _mm256_stream_pd(const_cast<scalar_t *>(p), v); }
+
+	/*
+	* Shuffle double-precision (64-bit) floating-point elements in a across the full 256-bit register using the control in imm8,
+	* and store the results in dst.
+	* Args   : [a0, a1, a2, a3] double
+	* Return : [a[s[0..1]], a[s[2..3]], a[s[4..5]], a[s[6..7]]] double
+	*/
+#if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
+	template<uint8_t s>
+	static INLINE CONST vect_t shuffle(const vect_t a) {
+		return _mm256_permute4x64_pd(a, s);
+	}
+#endif
+
+	/*
+	* Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b,
+	* and store the results in dst.
+	* Args   : [a0, a1, a2, a3] double
+			   [b0, b1, b2, b3] double
+	* Return : [a0, b0, a2, b2] double
+	*/
+	static INLINE CONST vect_t unpacklo_twice(const vect_t a, const vect_t b) { return _mm256_unpacklo_pd(a, b); }
+
+	/*
+	* Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b,
+	* and store the results in dst.
+	* Args   : [a0, a1, a2, a3] double
+			   [b0, b1, b2, b3] double
+	* Return : [a1, b1, a3, b3] double
+	*/
+	static INLINE CONST vect_t unpackhi_twice(const vect_t a, const vect_t b) { return _mm256_unpackhi_pd(a, b); }
+
+	/*
+	* Blend packed double-precision (64-bit) floating-point elements from a and b using control mask s,
+	* and store the results in dst.
+	* Args   : [a0, a1, a2, a3] double
+			   [b0, b1, b2, b3] double
+	* Return : [s[0]?b0:a0, ..., s[3]?b3:a3] double
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t blend(const vect_t a, const vect_t b) {
+		return _mm256_blend_pd(a, b, s);
+	}
+
+	/*
+	* Blend packed double-precision (64-bit) floating-point elements from a and b using mask,
+	* and store the results in dst.
+	* Args   : [a0, a1, a2, a3] double
+			   [b0, b1, b2, b3] double
+	* Return : [mask[63]?b0:a0, ..., mask[255]?b3:a3] double
+	*/
+	static INLINE CONST vect_t blendv(const vect_t a, const vect_t b, const vect_t mask) {
+		return _mm256_blendv_pd(a, b, mask);
+	}
+
+	/*
+	 * Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0+b0, a1+b1, a2+b2, a3+b3]
+	 */
+	static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_pd(a, b); }
+
+	static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
+
+	/*
+	 * Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit)
+	 * floating-point elements in a, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0-b0, a1-b1, a2-b2, a3-b3]
+	 */
+	static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_pd(a, b); }
+
+	static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0*b0, a1*b1, a2*b2, a3*b3]
+	 */
+	static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm256_mul_pd(a, b); }
+
+	static INLINE CONST vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); }
+
+	/*
+	 * Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b,
+	 * and store the results in dst.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0/b0, a1/b1, a2/b2, a3/b3]
+	 */
+	static INLINE CONST vect_t div(const vect_t a, const vect_t b) { return _mm256_div_pd(a, b); }
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to
+	 * packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
+	 * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3]
+	 */
+	static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) {
 #ifdef __FMA__
-        return _mm256_fmadd_pd(a, b, c);
+		return _mm256_fmadd_pd(a, b, c);
 #else
-        return add(c, mul(a, b));
+		return add(c, mul(a, b));
 #endif
-    }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to
-     * packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
-     * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3]
-     */
-    static INLINE CONST vect_t madd(const vect_t c, const vect_t a, const vect_t b) { return fmadd(c, a, b); }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to
-     * packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
-     * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3]
-     */
-    static INLINE CONST vect_t maddx(const vect_t c, const vect_t a, const vect_t b) { return fmadd(c, a, b); }
-
-    static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result
-     * to packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
-     * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3]
-     */
-    static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) {
+	}
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to
+	 * packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
+	 * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3]
+	 */
+	static INLINE CONST vect_t madd(const vect_t c, const vect_t a, const vect_t b) { return fmadd(c, a, b); }
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to
+	 * packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
+	 * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3]
+	 */
+	static INLINE CONST vect_t maddx(const vect_t c, const vect_t a, const vect_t b) { return fmadd(c, a, b); }
+
+	static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result
+	 * to packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
+	 * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3]
+	 */
+	static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) {
 #ifdef __FMA__
-        return _mm256_fnmadd_pd(a, b, c);
+		return _mm256_fnmadd_pd(a, b, c);
 #else
-        return sub(c, mul(a, b));
+		return sub(c, mul(a, b));
 #endif
-    }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result
-     * to packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
-     * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3]
-     */
-    static INLINE CONST vect_t nmadd(const vect_t c, const vect_t a, const vect_t b) { return fnmadd(c, a, b); }
-
-    static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from
-     * the intermediate result, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
-     * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3]
-     */
-    static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) {
+	}
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result
+	 * to packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
+	 * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3]
+	 */
+	static INLINE CONST vect_t nmadd(const vect_t c, const vect_t a, const vect_t b) { return fnmadd(c, a, b); }
+
+	static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from
+	 * the intermediate result, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
+	 * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3]
+	 */
+	static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) {
 #ifdef __FMA__
-        return _mm256_fmsub_pd(a, b, c);
+		return _mm256_fmsub_pd(a, b, c);
 #else
-        return sub(mul(a, b), c);
-#endif
-    }
-
-    /*
-     * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from
-     * the intermediate result, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
-     * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3]
-     */
-    static INLINE CONST vect_t msub(const vect_t c, const vect_t a, const vect_t b) { return fmsub(c, a, b); }
-
-    static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
-
-    /*
-     * Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results
-     in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [(a0==b0) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a1==b1) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a2==b2) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a3==b3) ? 0xFFFFFFFFFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
-
-    /*
-     * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser-than, and store the
-     results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [(a0<b0) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a1<b1) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a2<b2) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a3<b3) ? 0xFFFFFFFFFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
-
-    /*
-     * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser or equal than, and store
-     the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [(a0<=b0) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a1<=b1) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a2<=b2) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a3<=b3) ? 0xFFFFFFFFFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
-
-    /*
-     * Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than, and store the
-     results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [(a0>b0) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a1>b1) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a2>b2) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a3>b3) ? 0xFFFFFFFFFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_GT_OS); }
-
-    /*
-     * Compare packed double-precision (64-bit) floating-point elements in a and b for greater or equal than, and store
-     the results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [(a0>=b0) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a2>=b2) ? 0xFFFFFFFFFFFFFFFF : 0,
-     (a3>=b3) ? 0xFFFFFFFFFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_GE_OS); }
-
-    /*
-     * Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3]
-     */
-    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_pd(a, b); }
-
-    /*
-     * Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3]
-     */
-    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_pd(a, b); }
-
-    /*
-     * Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3]
-     */
-    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_pd(a, b); }
-
-    /*
-     * Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0 AND NOT b0, a1 AND NOT b1, a2 AND NOT b2, a3 AND NOT b3]
-     */
-    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_pd(a, b); }
-
-    /*
-     * Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the
-     * results as packed double-precision floating-point elements in vect_t.
-     * Args   : [a0, a1, a2, a3]
-     * Return : [floor(a0), floor(a1), floor(a2), floor(a3)]
-     */
-    static INLINE CONST vect_t floor(const vect_t a) { return _mm256_floor_pd(a); }
-
-    /*
-     * Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the
-     * results as packed double-precision floating-point elements in vect_t.
-     * Args   : [a0, a1, a2, a3]
-     * Return : [ceil(a0), ceil(a1), ceil(a2), ceil(a3)]
-     */
-    static INLINE CONST vect_t ceil(const vect_t a) { return _mm256_ceil_pd(a); }
-
-    /*
-     * Round the packed double-precision (64-bit) floating-point elements in a, and store the results as packed
-     * double-precision floating-point elements in vect_t.
-     * Args   : [a0, a1, a2, a3]
-     * Return : [round(a0), round(a1), round(a2), round(a3)]
-     */
-    static INLINE CONST vect_t round(const vect_t a) {
-        return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    }
-
-    /*
-     * Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
-     * Return : [a0+a1, b0+b1, a2+a3, b2+b3]
-     */
-    static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm256_hadd_pd(a, b); }
-
-    /*
-     * Horizontally add double-precision (64-bit) floating-point elements in a.
-     * Args   : [a0, a1, a2, a3]
-     * Return : a0+a1+a2+a3
-     */
-    static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
-        return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1] + ((const scalar_t *)&a)[2] +
-               ((const scalar_t *)&a)[3];
-    }
-
-    static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
-                             const vect_t &MAX, vect_t &Q, vect_t &T) {
-        FLOAT_MOD(C, P, INVP, Q);
-        NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
-
-        return C;
-    }
-
-#else // __AVX__
-#error "You need AVX instructions to perform 256bits operations on double"
+		return sub(mul(a, b), c);
 #endif
+	}
+
+	/*
+	 * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from
+	 * the intermediate result, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3]
+	 * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3]
+	 */
+	static INLINE CONST vect_t msub(const vect_t c, const vect_t a, const vect_t b) { return fmsub(c, a, b); }
+
+	static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
+
+	/*
+	 * Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results
+	 in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [(a0==b0) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a1==b1) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a2==b2) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a3==b3) ? 0xFFFFFFFFFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
+
+	/*
+	 * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser-than, and store the
+	 results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [(a0<b0) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a1<b1) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a2<b2) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a3<b3) ? 0xFFFFFFFFFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
+
+	/*
+	 * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser or equal than, and store
+	 the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [(a0<=b0) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a1<=b1) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a2<=b2) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a3<=b3) ? 0xFFFFFFFFFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
+
+	/*
+	 * Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than, and store the
+	 results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [(a0>b0) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a1>b1) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a2>b2) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a3>b3) ? 0xFFFFFFFFFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_GT_OS); }
+
+	/*
+	 * Compare packed double-precision (64-bit) floating-point elements in a and b for greater or equal than, and store
+	 the results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [(a0>=b0) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a2>=b2) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a3>=b3) ? 0xFFFFFFFFFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_GE_OS); }
+
+	/*
+	 * Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3]
+	 */
+	static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_pd(a, b); }
+
+	/*
+	 * Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3]
+	 */
+	static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_pd(a, b); }
+
+	/*
+	 * Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3]
+	 */
+	static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_pd(a, b); }
+
+	/*
+	 * Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [NOT(a0) AND b0, NOT(a1) AND b1, NOT(a2) AND b2, NOT(a3) AND b3]
+	 */
+	static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_pd(a, b); }
+
+	/*
+	 * Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the
+	 * results as packed double-precision floating-point elements in vect_t.
+	 * Args   : [a0, a1, a2, a3]
+	 * Return : [floor(a0), floor(a1), floor(a2), floor(a3)]
+	 */
+	static INLINE CONST vect_t floor(const vect_t a) { return _mm256_floor_pd(a); }
+
+	/*
+	 * Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the
+	 * results as packed double-precision floating-point elements in vect_t.
+	 * Args   : [a0, a1, a2, a3]
+	 * Return : [ceil(a0), ceil(a1), ceil(a2), ceil(a3)]
+	 */
+	static INLINE CONST vect_t ceil(const vect_t a) { return _mm256_ceil_pd(a); }
+
+	/*
+	 * Round the packed double-precision (64-bit) floating-point elements in a, and store the results as packed
+	 * double-precision floating-point elements in vect_t.
+	 * Args   : [a0, a1, a2, a3]
+	 * Return : [round(a0), round(a1), round(a2), round(a3)]
+	 */
+	static INLINE CONST vect_t round(const vect_t a) {
+		return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+	}
+
+	/*
+	 * Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
+	 * Return : [a0+a1, b0+b1, a2+a3, b2+b3]
+	 */
+	static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm256_hadd_pd(a, b); }
+
+	/*
+	 * Horizontally add double-precision (64-bit) floating-point elements in a.
+	 * Args   : [a0, a1, a2, a3]
+	 * Return : a0+a1+a2+a3
+	 */
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1] + ((const scalar_t *)&a)[2] +
+				((const scalar_t *)&a)[3];
+	}
+
+	static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
+							 const vect_t &MAX, vect_t &Q, vect_t &T) {
+		FLOAT_MOD(C, P, INVP, Q);
+		NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
+
+		return C;
+	}
+
 };
 
 #endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_double_INL
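
The double specialization above is a thin layer over AVX: aligned load/store plus an fmadd that uses hardware FMA when __FMA__ is defined and falls back to mul + add otherwise. The free-standing sketch below shows that same load / fmadd / store shape with the raw intrinsics, independent of the FFLAS-FFPACK headers; it assumes an x86-64 compiler invoked with -mavx (and optionally -mfma), and the local fmadd helper mirrors the wrapper's argument order.

// y[0..3] += a * x[0..3] with the same load / fmadd / store shape as the
// wrapper above.  Build with e.g. g++ -mavx (add -mfma to take the FMA branch).
#include <immintrin.h>
#include <cstdio>

static inline __m256d fmadd(__m256d c, __m256d a, __m256d b) {
#ifdef __FMA__
	return _mm256_fmadd_pd(a, b, c);                     // fused multiply-add
#else
	return _mm256_add_pd(c, _mm256_mul_pd(a, b));        // mul + add fallback
#endif
}

int main() {
	alignas(32) double x[4] = {1.0, 2.0, 3.0, 4.0};
	alignas(32) double y[4] = {10.0, 20.0, 30.0, 40.0};
	__m256d va = _mm256_set1_pd(0.5);                    // set1: broadcast the scalar a
	__m256d vx = _mm256_load_pd(x);                      // load: 32-byte aligned load
	__m256d vy = _mm256_load_pd(y);
	vy = fmadd(vy, va, vx);                              // per-lane y += a * x
	_mm256_store_pd(y, vy);                              // store: 32-byte aligned store
	std::printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]); // prints 10.5 21 31.5 42
	return 0;
}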
diff --git a/fflas-ffpack/fflas/fflas_simd/simd256_float.inl b/fflas-ffpack/fflas/fflas_simd/simd256_float.inl
index 98c3d2d..c516c7b 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd256_float.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd256_float.inl
@@ -1,5 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
@@ -33,372 +33,433 @@
 /*
  * Simd256 specialized for float
  */
-template <> struct Simd256_impl<true, false, true, 4> {
-#if defined(__FFLASFFPACK_USE_AVX) or defined(__FFLASFFPACK_USE_AVX2)
-    /*
-     * alias to 256 bit simd register
-     */
-    using vect_t = __m256;
-
-    /*
-     * define the scalar type corresponding to the specialization
-     */
-    using scalar_t = float;
-
-    /*
-     *	number of scalar_t in a simd register
-     */
-    static const constexpr size_t vect_size = 8;
-
-    /*
-     *	alignement required by scalar_t pointer to be loaded in a vect_t
-     */
-    static const constexpr size_t alignment = 32;
-
-    /*
-     * Check if the pointer p is a multiple of alignemnt
-     */
-    template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
-
-    /*
-     * Check if the number n is a multiple of vect_size
-     */
-    template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
-
-    /*
-     *	Return vector of type vect_t with all elements set to zero
-     *  Return [0,0,0,0,0,0,0,0]
-     */
-    static INLINE CONST vect_t zero() { return _mm256_setzero_ps(); }
-
-    /*
-     *	Broadcast single-precision (32-bit) floating-point value x to all elements of vect_t.
-     *  Return [x,x,x,x,x,x,x,x]
-     */
-    static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_ps(x); }
-
-    /*
-     *	Set packed single-precision (32-bit) floating-point elements in vect_t with the supplied values.
-     *  Return [x1,x2,x3,x4,x5,x6,x7,x8]
-     */
-    static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4,
-                                   const scalar_t x5, const scalar_t x6, const scalar_t x7, const scalar_t x8) {
-        return _mm256_set_ps(x8, x7, x6, x5, x4, x3, x2, x1);
-    }
-
-    /*
-     *	Gather single-precision (32-bit) floating-point elements with indexes idx[0], ..., idx[3] from the address p in
-     *vect_t.
-     *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]]
-     */
-    template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
-        // TODO AVX2 Gather
-        return _mm256_set_ps(p[idx[7]], p[idx[6]], p[idx[5]], p[idx[4]], p[idx[3]], p[idx[2]], p[idx[1]], p[idx[0]]);
-    }
-
-    /*
-     * Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into vect_t.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     * Return [p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]]
-     */
-    static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_ps(p); }
-
-    /*
-     * Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into vect_t.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]]
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_ps(p); }
-
-    /*
-     * Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, const vect_t v) { _mm256_store_ps(const_cast<scalar_t *>(p), v); }
-
-    /*
-     * Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory.
-     * p does not need to be aligned on any particular boundary.
-     */
-    static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm256_storeu_ps(const_cast<scalar_t *>(p), v); }
-
-    /*
-     * Store 256-bits (composed of 8 packed double-precision (32-bit) floating-point elements) from a into memory using
-     * a non-temporal memory hint.
-     * p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
-     */
-    static INLINE void stream(const scalar_t *p, const vect_t v) { _mm256_stream_ps(const_cast<scalar_t *>(p), v); }
-
-    /*
-     * Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0+b0, a1+b1, a2+b2, a3+b3, a4+b4, a5+b5, a6+b6, a7+b7]
-     */
-    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_ps(a, b); }
-
-    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
-
-    /*
-     * Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit)
-     * floating-point elements in a, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0-b0, a1-b1, a2-b2, a3-b3, a4-b4, a5-b5, a6-b6, a7-b7]
-     */
-    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_ps(a, b); }
-
-    static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0*b0, a1*b1, a2*b2, a3*b3, a4*b4, a5*b5, a6*b6, a7*b7]
-     */
-    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm256_mul_ps(a, b); }
-
-    static INLINE CONST vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to
-     * packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
-     * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4, a5*b5+c5, a6*b6+c6, a7*b7+c7]
-     */
-    static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) {
+template <> struct Simd256_impl<true, false, true, 4> : public Simd256fp_base {
+#if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) or defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)
+	/*
+	 * alias to 256 bit simd register
+	 */
+	using vect_t = __m256;
+
+	/*
+	 * define the scalar type corresponding to the specialization
+	 */
+	using scalar_t = float;
+
+	/*
+	 *	number of scalar_t in a simd register
+	 */
+	static const constexpr size_t vect_size = 8;
+
+	/*
+	 *	alignment required by a scalar_t pointer to be loaded in a vect_t
+	 */
+	static const constexpr size_t alignment = 32;
+
+	/*
+	 * Check if the pointer p is a multiple of alignment
+	 */
+	template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
+
+	/*
+	 * Check if the number n is a multiple of vect_size
+	 */
+	template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
+
+	/*
+	 *	Return vector of type vect_t with all elements set to zero
+	 *  Return [0,0,0,0,0,0,0,0]
+	 */
+	static INLINE CONST vect_t zero() { return _mm256_setzero_ps(); }
+
+	/*
+	 *	Broadcast single-precision (32-bit) floating-point value x to all elements of vect_t.
+	 *  Return [x,x,x,x,x,x,x,x]
+	 */
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_ps(x); }
+
+	/*
+	 *	Set packed single-precision (32-bit) floating-point elements in vect_t with the supplied values.
+	 *  Return [x1,x2,x3,x4,x5,x6,x7,x8]
+	 */
+	static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4,
+								   const scalar_t x5, const scalar_t x6, const scalar_t x7, const scalar_t x8) {
+		return _mm256_set_ps(x8, x7, x6, x5, x4, x3, x2, x1);
+	}
+
+	/*
+	 *	Gather single-precision (32-bit) floating-point elements with indexes idx[0], ..., idx[7] from the address p in
+	 *vect_t.
+	 *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]]
+	 */
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		// TODO AVX2 Gather
+		return _mm256_set_ps(p[idx[7]], p[idx[6]], p[idx[5]], p[idx[4]], p[idx[3]], p[idx[2]], p[idx[1]], p[idx[0]]);
+	}
+
+	/*
+	 * Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into vect_t.
+	 * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	 * Return [p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]]
+	 */
+	static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_ps(p); }
+
+	/*
+	 * Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into vect_t.
+	 * p does not need to be aligned on any particular boundary.
+	 * Return [p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]]
+	 */
+	static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_ps(p); }
+
+	/*
+	 * Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory.
+	 * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	 */
+	static INLINE void store(const scalar_t *p, const vect_t v) { _mm256_store_ps(const_cast<scalar_t *>(p), v); }
+
+	/*
+	 * Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory.
+	 * p does not need to be aligned on any particular boundary.
+	 */
+	static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm256_storeu_ps(const_cast<scalar_t *>(p), v); }
+
+	/*
+	 * Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory using
+	 * a non-temporal memory hint.
+	 * p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+	 */
+	static INLINE void stream(const scalar_t *p, const vect_t v) { _mm256_stream_ps(const_cast<scalar_t *>(p), v); }
+
+	/*
+	* Shuffle single-precision (32-bit) floating-point elements in a within each 128-bit lane using the control in s,
+	* and store the results in dst.
+	* Args   :	[a0, ..., a7] float
+	* Return :	[a[s[0..1]], ..., a[s[6..7]], a[4+s[0..1]], ..., a[4+s[6..7]]] float
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t shuffle_twice(const vect_t a) {
+		return _mm256_permute_ps(a, s);
+	}
+
+	/*
+	* Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b,
+	* and store the results in dst.
+	* Args   :	[a0, ..., a7] float
+				[b0, ..., b7] float
+	* Return :	[a0, b0, a1, b1, a4, b4, a5, b5] float
+	*/
+	static INLINE CONST vect_t unpacklo_twice(const vect_t a, const vect_t b) { return _mm256_unpacklo_ps(a, b); }
+
+	/*
+	* Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b,
+	* and store the results in dst.
+	* Args   :	[a0, ..., a7] float
+				[b0, ..., b7] float
+	* Return :	[a2, b2, a3, b3, a6, b6, a7, b7] float
+	*/
+	static INLINE CONST vect_t unpackhi_twice(const vect_t a, const vect_t b) { return _mm256_unpackhi_ps(a, b); }
+
+	/*
+	* Blend packed single-precision (32-bit) floating-point elements from a and b using control mask s,
+	* and store the results in dst.
+	* Args   :	[a0, ..., a7] float
+				[b0, ..., b7] float
+	* Return :	[s[0]?b0:a0, ..., s[7]?b7:a7] float
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t blend(const vect_t a, const vect_t b) {
+		return _mm256_blend_ps(a, b, s);
+	}
+
+	/*
+	* Blend packed single-precision (32-bit) floating-point elements from a and b using mask,
+	* and store the results in dst.
+	* Args   :	[a0, ..., a7] float
+				[b0, ..., b7] float
+	* Return : [mask[31]?b0:a0, ..., mask[255]?b7:a7] float
+	*/
+	static INLINE CONST vect_t blendv(const vect_t a, const vect_t b, const vect_t mask) {
+		return _mm256_blendv_ps(a, b, mask);
+	}
+
+	/*
+	 * Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [a0+b0, a1+b1, a2+b2, a3+b3, a4+b4, a5+b5, a6+b6, a7+b7]
+	 */
+	static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_ps(a, b); }
+
+	static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
+
+	/*
+	 * Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit)
+	 * floating-point elements in a, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [a0-b0, a1-b1, a2-b2, a3-b3, a4-b4, a5-b5, a6-b6, a7-b7]
+	 */
+	static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_ps(a, b); }
+
+	static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [a0*b0, a1*b1, a2*b2, a3*b3, a4*b4, a5*b5, a6*b6, a7*b7]
+	 */
+	static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm256_mul_ps(a, b); }
+
+	static INLINE CONST vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); }
+
+	/*
+	 * Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b,
+	 * and store the results in dst.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [a0/b0, a1/b1, a2/b2, a3/b3, a4/b4, a5/b5, a6/b6, a7/b7]
+	 */
+	static INLINE CONST vect_t div(const vect_t a, const vect_t b) { return _mm256_div_ps(a, b); }
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to
+	 * packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
+	 * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4, a5*b5+c5, a6*b6+c6, a7*b7+c7]
+	 */
+	static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) {
 #ifdef __FMA__
-        return _mm256_fmadd_ps(a, b, c);
+		return _mm256_fmadd_ps(a, b, c);
 #else
-        return add(c, mul(a, b));
+		return add(c, mul(a, b));
 #endif
-    }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to
-     * packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
-     * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4, a5*b5+c5, a6*b6+c6, a7*b7+c7]
-     */
-    static INLINE CONST vect_t madd(const vect_t c, const vect_t a, const vect_t b) { return fmadd(c, a, b); }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to
-     * packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
-     * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4, a5*b5+c5, a6*b6+c6, a7*b7+c7]
-     */
-    static INLINE CONST vect_t maddx(const vect_t c, const vect_t a, const vect_t b) { return fmadd(c, a, b); }
-
-    static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result
-     * to packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
-     * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3, -(a4*b4)+c4, -(a5*b5)+c5, -(a6*b6)+c6, -(a7*b7)+c7]
-     */
-    static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) {
+	}
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to
+	 * packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
+	 * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4, a5*b5+c5, a6*b6+c6, a7*b7+c7]
+	 */
+	static INLINE CONST vect_t madd(const vect_t c, const vect_t a, const vect_t b) { return fmadd(c, a, b); }
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to
+	 * packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
+	 * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4, a5*b5+c5, a6*b6+c6, a7*b7+c7]
+	 */
+	static INLINE CONST vect_t maddx(const vect_t c, const vect_t a, const vect_t b) { return fmadd(c, a, b); }
+
+	static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result
+	 * to packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
+	 * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3, -(a4*b4)+c4, -(a5*b5)+c5, -(a6*b6)+c6, -(a7*b7)+c7]
+	 */
+	static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) {
 #ifdef __FMA__
-        return _mm256_fnmadd_ps(a, b, c);
+		return _mm256_fnmadd_ps(a, b, c);
 #else
-        return sub(c, mul(a, b));
+		return sub(c, mul(a, b));
 #endif
-    }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result
-     * to packed elements in c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
-     * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3, -(a4*b4)+c4, -(a5*b5)+c5, -(a6*b6)+c6, -(a7*b7)+c7]
-     */
-    static INLINE CONST vect_t nmadd(const vect_t c, const vect_t a, const vect_t b) { return fnmadd(c, a, b); }
-
-    static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from
-     * the intermediate result, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
-     * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3, a4*b4-c4, a5*b5-c5, a6*b6-c6, a7*b7-c7]
-     */
-    static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) {
+	}
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result
+	 * to packed elements in c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
+	 * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3, -(a4*b4)+c4, -(a5*b5)+c5, -(a6*b6)+c6, -(a7*b7)+c7]
+	 */
+	static INLINE CONST vect_t nmadd(const vect_t c, const vect_t a, const vect_t b) { return fnmadd(c, a, b); }
+
+	static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from
+	 * the intermediate result, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
+	 * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3, a4*b4-c4, a5*b5-c5, a6*b6-c6, a7*b7-c7]
+	 */
+	static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) {
 #ifdef __FMA__
-        return _mm256_fmsub_ps(a, b, c);
+		return _mm256_fmsub_ps(a, b, c);
 #else
-        return sub(mul(a, b), c);
+		return sub(mul(a, b), c);
 #endif
-    }
-
-    /*
-     * Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from
-     * the intermediate result, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
-     * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3, a4*b4-c4, a5*b5-c5, a6*b6-c6, a7*b7-c7]
-     */
-    static INLINE CONST vect_t msub(const vect_t c, const vect_t a, const vect_t b) { return fmsub(c, a, b); }
-
-    static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
-
-    /*
-     * Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results
-     in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [(a0==b0) ? 0xFFFFFFFF : 0,
-     (a1==b1) ? 0xFFFFFFFF : 0,
-     (a2==b2) ? 0xFFFFFFFF : 0,
-     (a3==b3) ? 0xFFFFFFFF : 0,
-     (a4==b4) ? 0xFFFFFFFF : 0,
-     (a5==b5) ? 0xFFFFFFFF : 0,
-     (a6==b6) ? 0xFFFFFFFF : 0,
-     (a7==b7) ? 0xFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
-
-    /*
-     * Compare packed single-precision (32-bit) floating-point elements in a and b for lesser-than, and store the
-     results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [(a0<b0) ? 0xFFFFFFFF : 0,
-     (a1<b1) ? 0xFFFFFFFF : 0,
-     (a2<b2) ? 0xFFFFFFFF : 0,
-     (a3<b3) ? 0xFFFFFFFF : 0,
-     (a4<b4) ? 0xFFFFFFFF : 0,
-     (a5<b5) ? 0xFFFFFFFF : 0,
-     (a6<b6) ? 0xFFFFFFFF : 0,
-     (a7<b7) ? 0xFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
-
-    /*
-     * Compare packed single-precision (32-bit) floating-point elements in a and b for lesser or equal than, and store
-     the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [(a0<=b0) ? 0xFFFFFFFF : 0,
-     (a1<=b1) ? 0xFFFFFFFF : 0,
-     (a2<=b2) ? 0xFFFFFFFF : 0,
-     (a3<=b3) ? 0xFFFFFFFF : 0,
-     (a4<=b4) ? 0xFFFFFFFF : 0,
-     (a5<=b5) ? 0xFFFFFFFF : 0,
-     (a6<=b6) ? 0xFFFFFFFF : 0,
-     (a7<=b7) ? 0xFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
-
-    /*
-     * Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the
-     results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [(a0>b0) ? 0xFFFFFFFF : 0,
-     (a1>b1) ? 0xFFFFFFFF : 0,
-     (a2>b2) ? 0xFFFFFFFF : 0,
-     (a3>b3) ? 0xFFFFFFFF : 0,
-     (a4>b4) ? 0xFFFFFFFF : 0,
-     (a5>b5) ? 0xFFFFFFFF : 0,
-     (a6>b6) ? 0xFFFFFFFF : 0,
-     (a7>b7) ? 0xFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_GT_OS); }
-
-    /*
-     * Compare packed single-precision (32-bit) floating-point elements in a and b for greater or equal than, and store
-     the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [(a0>=b0) ? 0xFFFFFFFF : 0,
-     (a1>=b1) ? 0xFFFFFFFF : 0,
-     (a2>=b2) ? 0xFFFFFFFF : 0,
-     (a3>=b3) ? 0xFFFFFFFF : 0,
-     (a4>=b4) ? 0xFFFFFFFF : 0,
-     (a5>=b5) ? 0xFFFFFFFF : 0,
-     (a6>=b6) ? 0xFFFFFFFF : 0,
-     (a7>=b7) ? 0xFFFFFFFF : 0]
-     */
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_GE_OS); }
-
-    /*
-     * Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3, a4 AND b4, a5 AND b5, a6 AND b6, a7 AND b7]
-     */
-    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_ps(a, b); }
-
-    /*
-     * Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3, a4 OR b4, a5 OR b5, a6 OR b6, a7 OR b7]
-     */
-    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_ps(a, b); }
-
-    /*
-     * Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3, a4 XOR b4, a5 XOR b5, a6 XOR b6, a7 XOR b7]
-     */
-    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_ps(a, b); }
-
-    /*
-     * Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 ANDNOT b0, a1 ANDNOT b1, a2 ANDNOT b2, a3 ANDNOT b3, a4 ANDNOT b4, a5 ANDNOT b5, a6 ANDNOT b6, a7
-     * ANDNOT b7]
-     */
-    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_ps(a, b); }
-
-    /*
-     * Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the
-     * results as packed double-precision floating-point elements in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     * Return : [floor(a0), floor(a1), floor(a2), floor(a3), floor(a4), floor(a5), floor(a6), floor(a7)]
-     */
-    static INLINE CONST vect_t floor(const vect_t a) { return _mm256_floor_ps(a); }
-
-    /*
-     * Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the
-     * results as packed single-precision floating-point elements in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     * Return : [ceil(a0), ceil(a1), ceil(a2), ceil(a3), ceil(a4), ceil(a5), ceil(a6), ceil(a7)]
-     */
-    static INLINE CONST vect_t ceil(const vect_t a) { return _mm256_ceil_ps(a); }
-
-    /*
-     * Round the packed single-precision (32-bit) floating-point elements in a, and store the results as packed
-     * single-precision floating-point elements in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     * Return : [round(a0), round(a1), round(a2), round(a3), round(a4), round(a5), round(a6), round(a7)]
-     */
-    static INLINE CONST vect_t round(const vect_t a) {
-        return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    }
-
-    /*
-     * Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the
-     * results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0+a1, b0+b1, a2+a3, b2+b3, a4+a5, b4+b5, a6+a7, b6+b7]
-     */
-    static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm256_hadd_ps(a, b); }
-
-    /*
-     * Horizontally add single-precision (32-bit) floating-point elements in a.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     * Return : a0+a1+a2+a3+a4+a5+a6+a7
-     */
-    static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
-        return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1] + ((const scalar_t *)&a)[2] +
-               ((const scalar_t *)&a)[3] + ((const scalar_t *)&a)[4] + ((const scalar_t *)&a)[5] +
-               ((const scalar_t *)&a)[6] + ((const scalar_t *)&a)[7];
-    }
-
-    static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
-                             const vect_t &MAX, vect_t &Q, vect_t &T) {
-        FLOAT_MOD(C, P, INVP, Q);
-        NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
-
-        return C;
-    }
-
-#else // __AVX__
+	}
+
+	/*
+	 * Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from
+	 * the intermediate result, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7]
+	 * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3, a4*b4-c4, a5*b5-c5, a6*b6-c6, a7*b7-c7]
+	 */
+	static INLINE CONST vect_t msub(const vect_t c, const vect_t a, const vect_t b) { return fmsub(c, a, b); }
+
+	static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
+
+	/*
+	 * Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results
+	 in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [(a0==b0) ? 0xFFFFFFFF : 0,
+	 (a1==b1) ? 0xFFFFFFFF : 0,
+	 (a2==b2) ? 0xFFFFFFFF : 0,
+	 (a3==b3) ? 0xFFFFFFFF : 0,
+	 (a4==b4) ? 0xFFFFFFFF : 0,
+	 (a5==b5) ? 0xFFFFFFFF : 0,
+	 (a6==b6) ? 0xFFFFFFFF : 0,
+	 (a7==b7) ? 0xFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
+
+	/*
+	 * Compare packed single-precision (32-bit) floating-point elements in a and b for lesser-than, and store the
+	 results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [(a0<b0) ? 0xFFFFFFFF : 0,
+	 (a1<b1) ? 0xFFFFFFFF : 0,
+	 (a2<b2) ? 0xFFFFFFFF : 0,
+	 (a3<b3) ? 0xFFFFFFFF : 0,
+	 (a4<b4) ? 0xFFFFFFFF : 0,
+	 (a5<b5) ? 0xFFFFFFFF : 0,
+	 (a6<b6) ? 0xFFFFFFFF : 0,
+	 (a7<b7) ? 0xFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
+
+	/*
+	 * Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store
+	 the results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [(a0<=b0) ? 0xFFFFFFFF : 0,
+	 (a1<=b1) ? 0xFFFFFFFF : 0,
+	 (a2<=b2) ? 0xFFFFFFFF : 0,
+	 (a3<=b3) ? 0xFFFFFFFF : 0,
+	 (a4<=b4) ? 0xFFFFFFFF : 0,
+	 (a5<=b5) ? 0xFFFFFFFF : 0,
+	 (a6<=b6) ? 0xFFFFFFFF : 0,
+	 (a7<=b7) ? 0xFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
+
+	/*
+	 * Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the
+	 results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [(a0>b0) ? 0xFFFFFFFF : 0,
+	 (a1>b1) ? 0xFFFFFFFF : 0,
+	 (a2>b2) ? 0xFFFFFFFF : 0,
+	 (a3>b3) ? 0xFFFFFFFF : 0,
+	 (a4>b4) ? 0xFFFFFFFF : 0,
+	 (a5>b5) ? 0xFFFFFFFF : 0,
+	 (a6>b6) ? 0xFFFFFFFF : 0,
+	 (a7>b7) ? 0xFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_GT_OS); }
+
+	/*
+	 * Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than-or-equal, and store
+	 the results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [(a0>=b0) ? 0xFFFFFFFF : 0,
+	 (a1>=b1) ? 0xFFFFFFFF : 0,
+	 (a2>=b2) ? 0xFFFFFFFF : 0,
+	 (a3>=b3) ? 0xFFFFFFFF : 0,
+	 (a4>=b4) ? 0xFFFFFFFF : 0,
+	 (a5>=b5) ? 0xFFFFFFFF : 0,
+	 (a6>=b6) ? 0xFFFFFFFF : 0,
+	 (a7>=b7) ? 0xFFFFFFFF : 0]
+	 */
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_GE_OS); }
+
+	/*
+	 * Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3, a4 AND b4, a5 AND b5, a6 AND b6, a7 AND b7]
+	 */
+	static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_ps(a, b); }
+
+	/*
+	 * Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3, a4 OR b4, a5 OR b5, a6 OR b6, a7 OR b7]
+	 */
+	static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_ps(a, b); }
+
+	/*
+	 * Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3, a4 XOR b4, a5 XOR b5, a6 XOR b6, a7 XOR b7]
+	 */
+	static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_ps(a, b); }
+
+	/*
+	 * Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [a0 ANDNOT b0, a1 ANDNOT b1, a2 ANDNOT b2, a3 ANDNOT b3, a4 ANDNOT b4, a5 ANDNOT b5, a6 ANDNOT b6, a7
+	 * ANDNOT b7]
+	 */
+	static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_ps(a, b); }
+
+	/*
+	 * Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the
+	 * results as packed single-precision floating-point elements in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
+	 * Return : [floor(a0), floor(a1), floor(a2), floor(a3), floor(a4), floor(a5), floor(a6), floor(a7)]
+	 */
+	static INLINE CONST vect_t floor(const vect_t a) { return _mm256_floor_ps(a); }
+
+	/*
+	 * Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the
+	 * results as packed single-precision floating-point elements in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
+	 * Return : [ceil(a0), ceil(a1), ceil(a2), ceil(a3), ceil(a4), ceil(a5), ceil(a6), ceil(a7)]
+	 */
+	static INLINE CONST vect_t ceil(const vect_t a) { return _mm256_ceil_ps(a); }
+
+	/*
+	 * Round the packed single-precision (32-bit) floating-point elements in a, and store the results as packed
+	 * single-precision floating-point elements in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
+	 * Return : [round(a0), round(a1), round(a2), round(a3), round(a4), round(a5), round(a6), round(a7)]
+	 */
+	static INLINE CONST vect_t round(const vect_t a) {
+		return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+	}
+
+	/*
+	 * Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the
+	 * results in vect_t.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
+	 * Return : [a0+a1, b0+b1, a2+a3, b2+b3, a4+a5, b4+b5, a6+a7, b6+b7]
+	 */
+	static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm256_hadd_ps(a, b); }
+
+	/*
+	 * Horizontally add single-precision (32-bit) floating-point elements in a.
+	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
+	 * Return : a0+a1+a2+a3+a4+a5+a6+a7
+	 */
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1] + ((const scalar_t *)&a)[2] +
+				((const scalar_t *)&a)[3] + ((const scalar_t *)&a)[4] + ((const scalar_t *)&a)[5] +
+				((const scalar_t *)&a)[6] + ((const scalar_t *)&a)[7];
+	}
+
+	static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
+							 const vect_t &MAX, vect_t &Q, vect_t &T) {
+		FLOAT_MOD(C, P, INVP, Q);
+		NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
+
+		return C;
+	}
+
+#else // __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS
 #error "You need AVX instructions to perform 256bits operations on float"
 #endif
 };
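
As an illustration of how the 256-bit float wrapper above is meant to be consumed, here is a minimal sketch of an element-wise c[i] += a[i]*b[i] kernel. It is not part of the patch: it only assumes the members shown in the hunk above (load, fmadd, store, vect_size) accessed through a Simd256<float>-style alias, and every other name is illustrative.

    #include <cstddef>

    // Sketch: element-wise c[i] += a[i]*b[i] over 32-byte-aligned float arrays,
    // written against the interface shown above (e.g. Simd = Simd256<float>).
    template <class Simd>
    void axpy_pointwise(float *c, const float *a, const float *b, std::size_t n) {
        using vect_t = typename Simd::vect_t;
        std::size_t i = 0;
        for (; i + Simd::vect_size <= n; i += Simd::vect_size) {
            vect_t va = Simd::load(a + i);               // aligned load of 8 floats
            vect_t vb = Simd::load(b + i);
            vect_t vc = Simd::load(c + i);
            Simd::store(c + i, Simd::fmadd(vc, va, vb)); // per-lane a*b + c
        }
        for (; i < n; ++i)                               // scalar tail
            c[i] += a[i] * b[i];
    }
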
diff --git a/fflas-ffpack/fflas/fflas_simd/simd256_int16.inl b/fflas-ffpack/fflas/fflas_simd/simd256_int16.inl
index 44596d9..e416545 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd256_int16.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd256_int16.inl
@@ -1,10 +1,11 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
  * Written by   Bastien Vialla<bastien.vialla at lirmm.fr>
  * Brice Boyer (briceboyer) <boyer.brice at gmail.com>
+ * Romain Lebreton <romain.lebreton at lirmm.fr>
  *
  *
  * ========LICENCE========
@@ -30,487 +31,668 @@
 #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_int16_INL
 #define __FFLASFFPACK_fflas_ffpack_utils_simd256_int16_INL
 
+#ifndef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS
+#error "You need AVX2 instructions to perform 256bits operations on int16_t"
+#endif
+
 /*
  * Simd256 specialized for int16_t
  */
-template <> struct Simd256_impl<true, true, true, 2> {
-#if defined(__FFLASFFPACK_USE_AVX2)
-    /*
-     * alias to 256 bit simd register
-     */
-    using vect_t = __m256i;
-
-    /*
-     * alias to 256 bit simd register
-     */
-    using half_t = __m128i;
-
-    /*
-     * define the scalar type corresponding to the specialization
-     */
-    using scalar_t = int16_t;
-
-    /*
-     * Simd128 for scalar_t, to deal half_t
-     */
-    using simdHalf = Simd128<scalar_t>;
-
-    /*
-     *  number of scalar_t in a simd register
-     */
-    static const constexpr size_t vect_size = 16;
-
-    /*
-     *  alignement required by scalar_t pointer to be loaded in a vect_t
-     */
-    static const constexpr size_t alignment = 32;
-
-    /*
-     * Check if the pointer p is a multiple of alignemnt
-     */
-    template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
-
-    /*
-     * Check if the number n is a multiple of vect_size
-     */
-    template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
-
-    /*
-     * Converter from vect_t to a tab.
-     * exple:
-     *      Converter conv;
-     *      conv.v = a;
-     *      scalart_t x = conv.t[1]
-     */
-    union Converter {
-        vect_t v;
-        scalar_t t[vect_size];
-    };
-
-    /*
-     *  Return vector of type vect_t with all elements set to zero
-     *  Return [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] int16_t
-     */
-    static INLINE CONST vect_t zero() { return _mm256_setzero_si256(); }
-
-    /*
-     *  Broadcast 16-bit integer a to all all elements of dst. This intrinsic may generate the vpbroadcastw.
-     *  Return [x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x] int16_t
-     */
-    static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi16(x); }
-
-    /*
-     *  Broadcast 16-bit integer a to all all elements of dst. This intrinsic may generate the vpbroadcastw.
-     *  Return [x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15] int16_t
-     */
-    static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3,
-                                   const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7,
-                                   const scalar_t x8, const scalar_t x9, const scalar_t x10, const scalar_t x11,
-                                   const scalar_t x12, const scalar_t x13, const scalar_t x14, const scalar_t x15) {
-        return _mm256_set_epi16(x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0);
-    }
-
-    /*
-     *  Gather 16-bit integer elements with indexes idx[0], ..., idx[15] from the address p in vect_t.
-     *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]],
-     p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]],
-     p[idx[8]], p[idx[9]], p[idx[10]], p[idx[11]],
-     p[idx[12]], p[idx[13]], p[idx[14]], p[idx[15]]] int16_t
-     */
-    template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
-        return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]], p[idx[8]],
-                   p[idx[9]], p[idx[10]], p[idx[11]], p[idx[12]], p[idx[13]], p[idx[14]], p[idx[15]]);
-    }
-
-    /*
-     * Load 256-bits of integer data from memory into dst.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7],p[8],p[9],p[10],p[11]p[12],p[13],p[14],p[15]] int16_t
-     */
-    static INLINE PURE vect_t load(const scalar_t *const p) {
-        return _mm256_load_si256(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Load 256-bits of integer data from memory into dst.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7],p[8],p[9],p[10],p[11]p[12],p[13],p[14],p[15]] int16_t
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) {
-        return _mm256_loadu_si256(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Store 256-bits of integer data from a into memory.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, vect_t v) {
-        _mm256_store_si256(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 256-bits of integer data from a into memory.
-     * p does not need to be aligned on any particular boundary.
-     */
-    static INLINE void storeu(const scalar_t *p, vect_t v) {
-        _mm256_storeu_si256(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 256-bits of integer data from a into memory using a non-temporal memory hint.
-     * p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
-     */
-    static INLINE void stream(const scalar_t *p, const vect_t v) {
-        _mm256_stream_si256(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Shift packed 16-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
-     * Return : [a0 << s, a1 << s, a2 << s, a3 << s, a4 << s, a5 << s, a6 << s, a7 << s,
-     *           a8 << s, a9 << s, a10 << s, a11 << s, a12 << s, a13 << s, a14 << s, a15 << s] int16_t
-     */
-    static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm256_slli_epi16(a, s); }
-
-    /*
-     * Shift packed 16-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
-     * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s,
-     *           a8 >> s, a9 >> s, a10 >> s, a11 >> s, a12 >> s, a13 >> s, a14 >> s, a15 >> s] int16_t
-     */
-    static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm256_srli_epi16(a, s); }
-
-
-    static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm256_sra_epi16(a, Simd128<int>::set1(s)); }
-
-    /*
-     * Add packed 16-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15] int16_t
-     * Return : [a0+b0, a1+b1, a2+b2, a3+b3, a4+b4, a5+b5, a6+b6, a7+b7,
-     a8+b8, a9+b9, a10+b10, a11+b11, a12+b12, a13+b13, a14+b14, a15+b15]   int16_t
-     */
-    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_epi16(a, b); }
-
-    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
-
-    /*
-     * Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15] int16_t
-     * Return : [a0-b0, a1-b1, a2-b2, a3-b3, a4-b4, a5-b5, a6-b6, a7-b7,
-     a8-b8, a9-b9, a10-b10, a11-b11, a12-b12, a13-b13, a14-b14, a15-b15]  int16_t
-     */
-    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_epi16(a, b); }
-
-    static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
-
-    /*
-     * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits
-     of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]           int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15]  		 int16_t
-     * Return : [a0*b0 mod 2^16-1, a1*b1 mod 2^16-1, a2*b2 mod 2^16-1, a3*b3 mod 2^16-1,
-     a4*b4 mod 2^16-1, a5*b5 mod 2^16-1, a6*b6 mod 2^16-1, a7*b7 mod 2^16-1,
-     a8*b8 mod 2^16-1, a9*b9 mod 2^16-1, a10*b10 mod 2^16-1, a11*b11 mod 2^16-1,
-     a12*b12 mod 2^16-1, a13*b13 mod 2^16-1, a14-b14 mod 2^16-1, a15*b15 mod 2^16-1] int16_t
-     */
-    static INLINE CONST vect_t mullo(const vect_t a, const vect_t b) { return _mm256_mullo_epi16(a, b); }
-
-    /*
-     * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits
-     of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]           int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15]           int16_t
-     * Return : [a0*b0 mod 2^16-1, a1*b1 mod 2^16-1, a2*b2 mod 2^16-1, a3*b3 mod 2^16-1,
-     a4*b4 mod 2^16-1, a5*b5 mod 2^16-1, a6*b6 mod 2^16-1, a7*b7 mod 2^16-1,
-     a8*b8 mod 2^16-1, a9*b9 mod 2^16-1, a10*b10 mod 2^16-1, a11*b11 mod 2^16-1,
-     a12*b12 mod 2^16-1, a13*b13 mod 2^16-1, a14-b14 mod 2^16-1, a15*b15 mod 2^16-1] int16_t
-     */
-    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); }
-
-    /*
-     * Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers, and add the low 16-bits of
-     the intermediate with c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]           int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15]           int16_t
-     [c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15]           int16_t
-     * Return : [(a0*b0 mod 2^16-1)+c0, (a1*b1 mod 2^16-1)+c1, (a2*b2 mod 2^16-1)+c2, (a3*b3 mod 2^16-1)+c3,
-     (a4*b4 mod 2^16-1)+c4, (a5*b5 mod 2^16-1)+c5, (a6*b6 mod 2^16-1)+c6, (a7*b7 mod 2^16-1)+c7,
-     (a8*b8 mod 2^16-1)+c8, (a9*b9 mod 2^16-1)+c9, (a10*b10 mod 2^16-1)+c10, (a11*b11 mod 2^16-1)+c11,
-     (a12*b12 mod 2^16-1)+c12, (a13*b13 mod 2^16-1)+c13, (a14*b14 mod 2^16-1)+c14, (a15*b15 mod 2^16-1)+c15]
-     */
-    static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); }
-
-    static INLINE CONST vect_t fmaddin(vect_t c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
-
-    /*
-     * Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers, and substract elements of c
-     to the low 16-bit of the intermiate result, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]           int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15]           int16_t
-     [c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15]           int16_t
-     * Return : [-(a0*b0 mod 2^16-1)+c0, -(a1*b1 mod 2^16-1)+c1, -(a2*b2 mod 2^16-1)+c2, -(a3*b3 mod 2^16-1)+c3,
-     -(a4*b4 mod 2^16-1)+c4, -(a5*b5 mod 2^16-1)+c5, -(a6*b6 mod 2^16-1)+c6, -(a7*b7 mod 2^16-1)+c7,
-     -(a8*b8 mod 2^16-1)+c8, -(a9*b9 mod 2^16-1)+c9, -(a10*b10 mod 2^16-1)+c10, -(a11*b11 mod 2^16-1)+c11,
-     -(a12*b12 mod 2^16-1)+c12, -(a13*b13 mod 2^16-1)+c13, -(a14*b14 mod 2^16-1)+c14, -(a15*b15 mod 2^16-1)+c15]
-     */
-    static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
-
-    static INLINE CONST vect_t fnmaddin(vect_t c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
-
-    /*
-     * Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers, and substract the low 16-bits
-     of the intermediate with c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]           int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15]           int16_t
-     [c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15]           int16_t
-     * Return : [(a0*b0 mod 2^16-1)-c0, (a1*b1 mod 2^16-1)-c1, (a2*b2 mod 2^16-1)-c2, (a3*b3 mod 2^16-1)-c3,
-     (a4*b4 mod 2^16-1)-c4, (a5*b5 mod 2^16-1)-c5, (a6*b6 mod 2^16-1)-c6, (a7*b7 mod 2^16-1)-c7,
-     (a8*b8 mod 2^16-1)-c8, (a9*b9 mod 2^16-1)-c9, (a10*b10 mod 2^16-1)-c10, (a11*b11 mod 2^16-1)-c11,
-     (a12*b12 mod 2^16-1)-c12, (a13*b13 mod 2^16-1)-c13, (a14*b14 mod 2^16-1)-c14, (a15*b15 mod 2^16-1)-c15]
-     */
-    static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
-
-    static INLINE CONST vect_t fsubin(vect_t c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
-
-    /*
-     * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16
-     bits of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15] int16_t
-     * Return :
-     */
-    static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { return _mm256_mulhi_epi16(a, b); }
-
-    /*
-     * Multiply the low 8-bit integers from each packed 16-bit element in a and b, and store the signed 16-bit results
-     in dst.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]    int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15]    int16_t
-     * Return : [a0*b0, a1*b1, a2*b2, a3*b3, a4*b4, a5*b5, a6*b6, a7*b7, a8*b8, a9*b9, a10*b10, a11*b11, a12*b12,
-     a13*b13, a14*b14, a15*b15] int16_t
-     */
-    static INLINE CONST vect_t mulx(vect_t a, vect_t b) {
-        vect_t mask = set1(0x00FF);
-        a = vand(a, mask);
-        b = vand(b, mask);
-        return mullo(a, b);
-    }
-
-    /*
-     * Compare packed 16-bits in a and b for equality, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15] int16_t
-     * Return : [(a0==b0) ? 0xFFFF : 0, (a1==b1) ? 0xFFFF : 0,
-     (a2==b2) ? 0xFFFF : 0, (a3==b3) ? 0xFFFF : 0,
-     (a4==b4) ? 0xFFFF : 0, (a5==b5) ? 0xFFFF : 0,
-     (a6==b6) ? 0xFFFF : 0, (a7==b7) ? 0xFFFF : 0,
-     (a8==b8) ? 0xFFFF : 0, (a9==b9) ? 0xFFFF : 0,
-     (a10==b10) ? 0xFFFF : 0, (a11==b11) ? 0xFFFF : 0,
-     (a12==b12) ? 0xFFFF : 0, (a13==b13) ? 0xFFFF : 0,
-     (a14==b14) ? 0xFFFF : 0, (a15==b15) ? 0xFFFF : 0]                     int16_t
-     */
-    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmpeq_epi16(a, b); }
-
-    /*
-     * Compare packed 16-bits in a and b for greater-than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15] int16_t
-     * Return : [(a0>b0) ? 0xFFFF : 0, (a1>b1) ? 0xFFFF : 0,
-     (a2>b2) ? 0xFFFF : 0, (a3>b3) ? 0xFFFF : 0,
-     (a4>b4) ? 0xFFFF : 0, (a5>b5) ? 0xFFFF : 0,
-     (a6>b6) ? 0xFFFF : 0, (a7>b7) ? 0xFFFF : 0,
-     (a8>b8) ? 0xFFFF : 0, (a9>b9) ? 0xFFFF : 0,
-     (a10>b10) ? 0xFFFF : 0, (a11>b11) ? 0xFFFF : 0,
-     (a12>b12) ? 0xFFFF : 0, (a13>b13) ? 0xFFFF : 0,
-     (a14>b14) ? 0xFFFF : 0, (a15>b15) ? 0xFFFF : 0]					  int16_t
-     */
-    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi16(a, b); }
-
-    /*
-     * Compare packed 16-bits in a and b for lesser-than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15] int16_t
-     * Return : [(a0<b0) ? 0xFFFF : 0, (a1<b1) ? 0xFFFF : 0,
-     (a2<b2) ? 0xFFFF : 0, (a3<b3) ? 0xFFFF : 0,
-     (a4<b4) ? 0xFFFF : 0, (a5<b5) ? 0xFFFF : 0,
-     (a6<b6) ? 0xFFFF : 0, (a7<b7) ? 0xFFFF : 0,
-     (a8<b8) ? 0xFFFF : 0, (a9<b9) ? 0xFFFF : 0,
-     (a10<b10) ? 0xFFFF : 0, (a11<b11) ? 0xFFFF : 0,
-     (a12<b12) ? 0xFFFF : 0, (a13<b13) ? 0xFFFF : 0,
-     (a14<b14) ? 0xFFFF : 0, (a15>b15) ? 0xFFFF : 0] 					  int16_t
-     */
-    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi16(b, a); }
-
-    /*
-     * Compare packed 16-bits in a and b for greater or equal than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15] int16_t
-     * Return : [(a0>=b0) ? 0xFFFF : 0, (a1>=b1) ? 0xFFFF : 0,
-     (a2>=b2) ? 0xFFFF : 0, (a3>=b3) ? 0xFFFF : 0,
-     (a4>=b4) ? 0xFFFF : 0, (a5>=b5) ? 0xFFFF : 0,
-     (a6>=b6) ? 0xFFFF : 0, (a7>=b7) ? 0xFFFF : 0,
-     (a8>=b8) ? 0xFFFF : 0, (a9>=b9) ? 0xFFFF : 0,
-     (a10>=b10) ? 0xFFFF : 0, (a11>=b11) ? 0xFFFF : 0,
-     (a12>=b12) ? 0xFFFF : 0, (a13>=b13) ? 0xFFFF : 0,
-     (a14>=b14) ? 0xFFFF : 0, (a15>=b15) ? 0xFFFF : 0]					  int16_t
-     */
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
-
-    /*
-     * Compare packed 16-bits in a and b for lesser or equal than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15] int16_t
-     * Return : [(a0<=b0) ? 0xFFFF : 0, (a1<=b1) ? 0xFFFF : 0,
-     (a2<=b2) ? 0xFFFF : 0, (a3<=b3) ? 0xFFFF : 0,
-     (a4<=b4) ? 0xFFFF : 0, (a5<=b5) ? 0xFFFF : 0,
-     (a6<=b6) ? 0xFFFF : 0, (a7<=b7) ? 0xFFFF : 0,
-     (a8<=b8) ? 0xFFFF : 0, (a9<=b9) ? 0xFFFF : 0,
-     (a10<=b10) ? 0xFFFF : 0, (a11<=b11) ? 0xFFFF : 0,
-     (a12<=b12) ? 0xFFFF : 0, (a13<=b13) ? 0xFFFF : 0,
-     (a14<=b14) ? 0xFFFF : 0, (a15<=b15) ? 0xFFFF : 0] 					   int16_t
-     */
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
-
-    /*
-     * Compute the bitwise AND of packed 16-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15]
-     * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3, a4 AND b4, a5 AND b5, a6 AND b6, a7 AND b7,
-     a8 AND b8, a9 AND b9, a10 AND b10, a11 AND b11, a12 AND b12, a13 AND b13, a14 AND b14, a15 AND b15]
-     */
-    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_si256(b, a); }
-
-    /*
-     * Compute the bitwise OR of packed 16-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15]
-     * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3, a4 OR b4, a5 OR b5, a6 OR b6, a7 OR b7,
-     a8 OR b8, a9 OR b9, a10 OR b10, a11 OR b11, a12 OR b12, a13 OR b13, a14 OR b14, a15 OR b15]
-     */
-    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_si256(b, a); }
-
-    /*
-     * Compute the bitwise XOR of packed 16-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15]
-     * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3, a4 XOR b4, a5 XOR b5, a6 XOR b6, a7 XOR b7,
-     a8 XOR b8, a9 XOR b9, a10 XOR b10, a11 XOR b11, a12 XOR b12, a13 XOR b13, a14 XOR b14, a15 XOR b15]
-     */
-    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_si256(b, a); }
-
-    /*
-     * Compute the bitwise AND NOT of packed 16-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15]
-     * Return : [a0 ANDNOT b0, a1 ANDNOT b1, a2 ANDNOT b2, a3 ANDNOT b3, a4 ANDNOT b4, a5 ANDNOT b5, a6 ANDNOT b6, a7
-     ANDNOT b7,
-     a8 ANDNOT b8, a9 ANDNOT b9, a10 ANDNOT b10, a11 ANDNOT b11, a12 ANDNOT b12, a13 ANDNOT b13, a14 ANDNOT b14, a15
-     ANDNOT b15]
-     */
-    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_si256(b, a); }
-
-    /*
-     * Horizontally add 16-bits elements of a.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]
-     * Return : a0+a1+a2+a3+a4+a5+a6+a7+a8+a9+a10+a11+a12+a13+a14+a15
-     */
-    static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
-        Converter ca;
-        ca.v = a;
-        return ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3] + ca.t[4] + ca.t[5] + ca.t[6] + ca.t[7] + ca.t[8] + ca.t[9] +
-               ca.t[10] + ca.t[11] + ca.t[12] + ca.t[13] + ca.t[14] + ca.t[15];
-    }
-
-    static INLINE PURE half_t load_half(const scalar_t *const p) {
-        return _mm_load_si128(reinterpret_cast<const half_t *>(p));
-    }
-
-    static INLINE PURE half_t loadu_half(const scalar_t *const p) {
-        return _mm_loadu_si128(reinterpret_cast<const half_t *>(p));
-    }
-
-    static INLINE void store_half(const scalar_t *p, half_t v) {
-        _mm_store_si128(reinterpret_cast<half_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    static INLINE void storeu_half(const scalar_t *p, half_t v) {
-        _mm_storeu_si128(reinterpret_cast<half_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     *
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]    int16_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15]    int16_t
-     [c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15]    int16_t
-     * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4, a5*b5+c5, a6*b6+c6, a7*b7+c7, a8*b8+c8, a9*b9+c9,
-     a10*b10+c10, a11*b11+c11, a12*b12+c12, a13*b13+c13, a14*b14+c14, a15*b15+c15] int16_t
-     */
-    static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
-
-    static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
-
-    static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
-
-    static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
-
-    static INLINE CONST vect_t round(const vect_t a) { return a; }
-
-    static INLINE CONST vect_t signbits(const vect_t x) {
-        vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1));
-        return signBits;
-    }
-
-    static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
-                             const vect_t &MAX, vect_t &Q, vect_t &T) {
+template <> struct Simd256_impl<true, true, true, 2> : public Simd256i_base {
+
+	/*
+	* alias to 256 bit simd register
+	*/
+	using vect_t = __m256i;
+
+	/*
+	* alias to 128 bit simd register
+	*/
+	using half_t = __m128i;
+
+	/*
+	* define the scalar type corresponding to the specialization
+	*/
+	using scalar_t = int16_t;
+
+	/*
+	* Simd128 for scalar_t, to deal with half_t
+	*/
+	using simdHalf = Simd128<scalar_t>;
+
+	/*
+	*  number of scalar_t in a simd register
+	*/
+	static const constexpr size_t vect_size = 16;
+
+	/*
+	*  alignment required by a scalar_t pointer to be loaded in a vect_t
+	*/
+	static const constexpr size_t alignment = 32;
+
+	/*
+	* Check if the pointer p is a multiple of alignment
+	*/
+	template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
+
+	/*
+	* Check if the number n is a multiple of vect_size
+	*/
+	template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
+
+	/*
+	* Converter from vect_t to an array of scalar_t.
+	* example:
+	*	Converter conv;
+	*	conv.v = a;
+	*	scalar_t x = conv.t[1];
+	*/
+	union Converter {
+		vect_t v;
+		scalar_t t[vect_size];
+	};
+
+	/*
+	*  Broadcast 16-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastw instruction.
+	*  Return [x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x] int16_t
+	*/
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi16(x); }
+
+	/*
+	*  Set packed 16-bit integers in dst with the supplied values.
+	*  Return [x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15] int16_t
+	*/
+	static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3,
+								   const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7,
+								   const scalar_t x8, const scalar_t x9, const scalar_t x10, const scalar_t x11,
+								   const scalar_t x12, const scalar_t x13, const scalar_t x14, const scalar_t x15) {
+		return _mm256_set_epi16(x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0);
+	}
+
+	/*
+	*  Gather 16-bit integer elements with indexes idx[0], ..., idx[15] from the address p in vect_t.
+	*  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]],
+	p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]],
+	p[idx[8]], p[idx[9]], p[idx[10]], p[idx[11]],
+	p[idx[12]], p[idx[13]], p[idx[14]], p[idx[15]]] int16_t
+	*/
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]], p[idx[8]],
+				p[idx[9]], p[idx[10]], p[idx[11]], p[idx[12]], p[idx[13]], p[idx[14]], p[idx[15]]);
+	}
+
+	/*
+	* Load 256-bits of integer data from memory into dst.
+	* p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	* Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7],p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15]] int16_t
+	*/
+	static INLINE PURE vect_t load(const scalar_t *const p) {
+		return _mm256_load_si256(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Load 256-bits of integer data from memory into dst.
+	* p does not need to be aligned on any particular boundary.
+	* Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7],p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15]] int16_t
+	*/
+	static INLINE PURE vect_t loadu(const scalar_t *const p) {
+		return _mm256_loadu_si256(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Store 256-bits of integer data from a into memory.
+	* p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	*/
+	static INLINE void store(scalar_t *p, vect_t v) {
+		_mm256_store_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 256-bits of integer data from a into memory.
+	* p does not need to be aligned on any particular boundary.
+	*/
+	static INLINE void storeu(scalar_t *p, vect_t v) {
+		_mm256_storeu_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 256-bits of integer data from a into memory using a non-temporal memory hint.
+	* p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+	*/
+	static INLINE void stream(scalar_t *p, const vect_t v) {
+		_mm256_stream_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Shift packed 16-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
+	* Return : [a0 << s, a1 << s, a2 << s, a3 << s, a4 << s, a5 << s, a6 << s, a7 << s,
+	*	   a8 << s, a9 << s, a10 << s, a11 << s, a12 << s, a13 << s, a14 << s, a15 << s] int16_t
+	*/
+	static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm256_slli_epi16(a, s); }
+
+	/*
+	* Shift packed 16-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
+	* Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s,
+	*	   a8 >> s, a9 >> s, a10 >> s, a11 >> s, a12 >> s, a13 >> s, a14 >> s, a15 >> s] int16_t
+	*/
+	static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm256_srli_epi16(a, s); }
+
+	/*
+	* Shift packed 16-bit integers in a right by s while shifting in sign bits, and store the results in vect_t.
+	* Args   :	[a0, ..., a15]		int16_t
+	* Return : 	[a0 >> s, ..., a15 >> s] int16_t
+	*/
+	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm256_srai_epi16(a, s); }
+
+	/*
+	* Shuffle 16-bit integers in a using the control in s, and store the results in dst.
+	* Args   : [a0, ..., a15] int16_t
+	* Return : [a[s[0..3]], ..., a[s[60..63]]] int16_t
+	*/
+	template<uint64_t s>
+	static INLINE CONST vect_t shuffle(const vect_t a) {
+		//#pragma warning "The simd shuffle function is emulated, it may impact the performances.";
+		Converter conv;
+		conv.v = a;
+		// each of the sixteen 4-bit fields of s selects the source lane of one output element
+		return set (conv.t[( s      & 0x000000000000000F)], conv.t[((s>> 4) & 0x000000000000000F)],
+					conv.t[((s>> 8) & 0x000000000000000F)], conv.t[((s>>12) & 0x000000000000000F)],
+					conv.t[((s>>16) & 0x000000000000000F)], conv.t[((s>>20) & 0x000000000000000F)],
+					conv.t[((s>>24) & 0x000000000000000F)], conv.t[((s>>28) & 0x000000000000000F)],
+					conv.t[((s>>32) & 0x000000000000000F)], conv.t[((s>>36) & 0x000000000000000F)],
+					conv.t[((s>>40) & 0x000000000000000F)], conv.t[((s>>44) & 0x000000000000000F)],
+					conv.t[((s>>48) & 0x000000000000000F)], conv.t[((s>>52) & 0x000000000000000F)],
+					conv.t[((s>>56) & 0x000000000000000F)], conv.t[((s>>60) & 0x000000000000000F)]);
+	}
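	// --- Illustrative sketch, not part of the upstream patch ---------------
	// Scalar reference of the 4-bit index decoding performed by the emulated
	// shuffle above: the k-th nibble of the 64-bit selector s picks the source
	// lane of output element k. Standalone helper, hypothetical name, assumes
	// <cstdint> when compiled on its own.
	static inline void shuffle16_ref(int16_t out[16], const int16_t in[16], uint64_t s) {
		for (int k = 0; k < 16; ++k)
			out[k] = in[(s >> (4 * k)) & 0xF]; // nibble k selects the source lane
	}
	// -----------------------------------------------------------------------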
+
+	/*
+	* Unpack and interleave 16-bit integers from the low half of a and b within 128-bit lanes, and store the results in dst.
+	* Args   :	[a0, ..., a15] int16_t
+				[b0, ..., b15] int16_t
+	* Return :	[a0, b0, a1, b1, ..., a8, b8, a9, b9, ...] int16_t
+	*/
+	static INLINE CONST vect_t unpacklo_twice(const vect_t a, const vect_t b) { return _mm256_unpacklo_epi16(a, b); }
+
+	/*
+	* Unpack and interleave 16-bit integers from the high half of a and b within 128-bit lanes, and store the results in dst.
+	* Args   :	[a0, ..., a15] int16_t
+				[b0, ..., b15] int16_t
+	* Return :	[a4, b4, a5, b5, ..., a12, b12, a13, b13, ...] int16_t
+	*/
+	static INLINE CONST vect_t unpackhi_twice(const vect_t a, const vect_t b) { return _mm256_unpackhi_epi16(a, b); }
+
+	/*
+	* Unpack and interleave 16-bit integers from the low half of a and b, and store the results in dst.
+	* Args   :	[a0, ..., a15] int16_t
+				[b0, ..., b15] int16_t
+	* Return :	[a0, b0, ..., a7, b7] int16_t
+	*/
+	static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) {
+		using Simd256_64 = Simd256<uint64_t>;
+		vect_t a1 = Simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4 so a -> [a0,a2,a1,a3] uint64
+		vect_t b1 = Simd256_64::template shuffle<0xD8>(b); // 0xD8 = 3120 base_4
+		return unpacklo_twice(a1, b1);
+	}
+
+	/*
+	* Unpack and interleave 16-bit integers from the high half of a and b, and store the results in dst.
+	* Args   :	[a0, ..., a15] int16_t
+				[b0, ..., b15] int16_t
+	* Return :	[a8, b8, ..., a15, b15] int16_t
+	*/
+	static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) {
+		using Simd256_64 = Simd256<uint64_t>;
+		vect_t a1 = Simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+		vect_t b1 = Simd256_64::template shuffle<0xD8>(b); // 0xD8 = 3120 base_4
+		return unpackhi_twice(a1, b1);
+	}
+
+	/*
+	* Unpack and interleave 16-bit integers from the low then high half of a and b, and store the results in dst.
+	* Args   :	[a0, ..., a15] int16_t
+				[b0, ..., b15] int16_t
+	* Return :	[a0, b0, ..., a7, b7] int16_t
+	*			[a8, b8, ..., a15, b15] int16_t
+	*/
+	static INLINE CONST void unpacklohi(vect_t& s1, vect_t& s2, const vect_t a, const vect_t b) {
+		using Simd256_64 = Simd256<uint64_t>;
+		vect_t a1 = Simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+		vect_t b1 = Simd256_64::template shuffle<0xD8>(b); // 0xD8 = 3120 base_4
+		s1 = unpacklo_twice(a1, b1);
+		s2 = unpackhi_twice(a1, b1);
+	}
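	// --- Illustrative sketch, not part of the upstream patch ---------------
	// The shuffle<0xD8> calls above permute the 64-bit blocks of each operand
	// as [0,2,1,3], so that the lane-local unpacklo_twice/unpackhi_twice yield
	// a full-width interleave. Scalar reference of the intended results
	// (standalone, hypothetical names, assumes <cstdint>):
	static inline void unpacklo_ref(int16_t lo[16], const int16_t a[16], const int16_t b[16]) {
		for (int k = 0; k < 8; ++k) { lo[2*k] = a[k]; lo[2*k+1] = b[k]; }     // [a0,b0,...,a7,b7]
	}
	static inline void unpackhi_ref(int16_t hi[16], const int16_t a[16], const int16_t b[16]) {
		for (int k = 0; k < 8; ++k) { hi[2*k] = a[8+k]; hi[2*k+1] = b[8+k]; } // [a8,b8,...,a15,b15]
	}
	// -----------------------------------------------------------------------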
+
+	/*
+	* Blend packed 16-bit integers from a and b using the 8-bit control mask s (the same mask is applied within each 128-bit lane), and store the results in dst.
+	* Args   :	[a0, ..., a15] int16_t
+				[b0, ..., b15] int16_t
+	* Return :	[s[0]?b0:a0, ..., s[7]?b7:a7, s[0]?b8:a8, ..., s[7]?b15:a15] int16_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t blend_twice(const vect_t a, const vect_t b) {
+		return _mm256_blend_epi16(a, b, s);
+	}
+
+	/*
+	* Add packed 16-bits integer in a and b, and store the results in vect_t.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+	* Return :	 [a0+b0, a1+b1, a2+b2, a3+b3, a4+b4, a5+b5, a6+b6, a7+b7,
+	a8+b8, a9+b9, a10+b10, a11+b11, a12+b12, a13+b13, a14+b14, a15+b15]   int16_t
+	*/
+	static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_epi16(a, b); }
+
+	static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
+
+	/*
+	* Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in vect_t.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+	* Return : 	[a0-b0, a1-b1, a2-b2, a3-b3, a4-b4, a5-b5, a6-b6, a7-b7,
+	a8-b8, a9-b9, a10-b10, a11-b11, a12-b12, a13-b13, a14-b14, a15-b15]  int16_t
+	*/
+	static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_epi16(a, b); }
+
+	static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
+
+	/*
+	* Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits
+	of the intermediate integers in vect_t.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+	* Return : [a0*b0 smod 2^16, ..., a15*b15 smod 2^16]	int16_t
+	*	   where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2
+	*/
+	static INLINE CONST vect_t mullo(const vect_t a, const vect_t b) { return _mm256_mullo_epi16(a, b); }
+
+	static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); }
+
+	/*
+	* Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16
+	bits of the intermediate integers in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15] int16_t
+	* Return : [Floor(a0*b0/2^16), ..., Floor(a15*b15/2^16)] int16_t
+	*/
+	static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { return _mm256_mulhi_epi16(a, b); }
+
+	/*
+	* Multiply the low 8-bit integers from each packed 16-bit element in a and b, and store the signed 16-bit results
+	in dst.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15] int16_t
+	* Return : [(a0 smod 2^8)*(b0 smod 2^8), ..., (a15 smod 2^8)*(b15 smod 2^8)]	int16_t
+	*	   where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2
+	*/
+	static INLINE CONST vect_t mulx(vect_t a, vect_t b) {
+		//#pragma warning "The simd mulx function is emulated, it may impact the performances."
+		vect_t a1, b1, mask1, mask2;
+		mask1 = set1(0x00FF);
+		mask2 = set1(0x0080);
+		a1 = add(a,mask2);
+		a1 = vand(a1,mask1);
+		a1 = sub(a1,mask2);
+		b1 = add(b,mask2);
+		b1 = vand(b1,mask1);
+		b1 = sub(b1,mask2);
+		return mul(a1,b1);
+	}
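	// --- Illustrative sketch, not part of the upstream patch ---------------
	// The add/and/sub sequence in mulx above sign-extends the low byte of each
	// 16-bit lane before multiplying. Scalar equivalent of one lane
	// (standalone, hypothetical name, assumes <cstdint> and two's complement):
	static inline int16_t sext8_ref(int16_t x) {
		return (int16_t)(((x + 0x0080) & 0x00FF) - 0x0080); // keep the low 8 bits, restore the sign
	}
	// mulx(a, b) then returns sext8_ref(a[i]) * sext8_ref(b[i]) reduced mod 2^16 in each lane.
	// -----------------------------------------------------------------------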
+
+	/*
+	* Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers,
+	* keep the low 16 bits of the intermediate and add the low 16-bits of c.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+			[c0, ..., c15]		int16_t
+	* Return :	[(a0*b0+c0) smod 2^16, ..., (a15*b15+c15) smod 2^16]	int16_t
+	*/
+	static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); }
+
+	static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
+
+	/*
+	* Multiply the low 8-bit integers from each packed 16-bit element in a and b,
+	* keep the signed 16-bit results and add the low 16-bits of c.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+			[c0, ..., c15]		int16_t
+	* Return :	[((a0 smod 2^8)*(b0 smod 2^8)+c0) smod 2^16, ...,
+	*		 ((a15 smod 2^8)*(b15 smod 2^8)+c15) smod 2^16]	int16_t
+	*/
+	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
+
+	static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
+
+	/*
+	* Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers,
+	* and subtract the low 16 bits of the intermediate from elements of c.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+			[c0, ..., c15]		int16_t
+	* Return :	[(-a0*b0+c0) smod 2^16, ..., (-a15*b15+c15) smod 2^16]	int16_t
+	*/
+	static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
+
+	static INLINE vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
+
+	/*
+	* Multiply the low 8-bit integers from each packed 16-bit element in a and b,
+	* keep the signed 16-bit results and subtract them from elements of c.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+			[c0, ..., c15]		int16_t
+	* Return :	[(-(a0 smod 2^8)*(b0 smod 2^8)+c0) smod 2^16, ...,
+	*		 (-(a15 smod 2^8)*(b15 smod 2^8)+c15) smod 2^16]		int16_t
+	*/
+	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
+
+	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
+
+	/*
+	* Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers,
+	* and subtract the elements of c from the low 16 bits of the intermediate.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+			[c0, ..., c15]		int16_t
+	* Return :	[(a0*b0-c0) smod 2^16, ..., (a15*b15-c15) smod 2^16]	int16_t
+	*/
+	static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(mul(a, b), c); }
+
+	static INLINE vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
+
+	/*
+	* Multiply the low 8-bit integers from each packed 16-bit element in a and b,
+	* keep the signed 16-bit results and subtract the elements of c from them.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+			[c0, ..., c15]		int16_t
+	* Return :	[((a0 smod 2^8)*(b0 smod 2^8)-c0) smod 2^16, ...,
+	*		 ((a15 smod 2^8)*(b15 smod 2^8)-c15) smod 2^16]		int16_t
+	*/
+	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }
+
+	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }
+
+	/*
+	* Compare packed 16-bit integers in a and b for equality, and store the results in vect_t.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+	* Return : [(a0==b0) ? 0xFFFF : 0, (a1==b1) ? 0xFFFF : 0,
+	(a2==b2) ? 0xFFFF : 0, (a3==b3) ? 0xFFFF : 0,
+	(a4==b4) ? 0xFFFF : 0, (a5==b5) ? 0xFFFF : 0,
+	(a6==b6) ? 0xFFFF : 0, (a7==b7) ? 0xFFFF : 0,
+	(a8==b8) ? 0xFFFF : 0, (a9==b9) ? 0xFFFF : 0,
+	(a10==b10) ? 0xFFFF : 0, (a11==b11) ? 0xFFFF : 0,
+	(a12==b12) ? 0xFFFF : 0, (a13==b13) ? 0xFFFF : 0,
+	(a14==b14) ? 0xFFFF : 0, (a15==b15) ? 0xFFFF : 0]		     int16_t
+	*/
+	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmpeq_epi16(a, b); }
+
+	/*
+	* Compare packed 16-bit integers in a and b for greater-than, and store the results in vect_t.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+	* Return : [(a0>b0) ? 0xFFFF : 0, (a1>b1) ? 0xFFFF : 0,
+	(a2>b2) ? 0xFFFF : 0, (a3>b3) ? 0xFFFF : 0,
+	(a4>b4) ? 0xFFFF : 0, (a5>b5) ? 0xFFFF : 0,
+	(a6>b6) ? 0xFFFF : 0, (a7>b7) ? 0xFFFF : 0,
+	(a8>b8) ? 0xFFFF : 0, (a9>b9) ? 0xFFFF : 0,
+	(a10>b10) ? 0xFFFF : 0, (a11>b11) ? 0xFFFF : 0,
+	(a12>b12) ? 0xFFFF : 0, (a13>b13) ? 0xFFFF : 0,
+	(a14>b14) ? 0xFFFF : 0, (a15>b15) ? 0xFFFF : 0]					  int16_t
+	*/
+	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi16(a, b); }
+
+	/*
+	* Compare packed 16-bit integers in a and b for less-than, and store the results in vect_t.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+	* Return : [(a0<b0) ? 0xFFFF : 0, (a1<b1) ? 0xFFFF : 0,
+	(a2<b2) ? 0xFFFF : 0, (a3<b3) ? 0xFFFF : 0,
+	(a4<b4) ? 0xFFFF : 0, (a5<b5) ? 0xFFFF : 0,
+	(a6<b6) ? 0xFFFF : 0, (a7<b7) ? 0xFFFF : 0,
+	(a8<b8) ? 0xFFFF : 0, (a9<b9) ? 0xFFFF : 0,
+	(a10<b10) ? 0xFFFF : 0, (a11<b11) ? 0xFFFF : 0,
+	(a12<b12) ? 0xFFFF : 0, (a13<b13) ? 0xFFFF : 0,
+	(a14<b14) ? 0xFFFF : 0, (a15<b15) ? 0xFFFF : 0] 					  int16_t
+	*/
+	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi16(b, a); }
+
+	/*
+	* Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in vect_t.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+	* Return : [(a0>=b0) ? 0xFFFF : 0, (a1>=b1) ? 0xFFFF : 0,
+	(a2>=b2) ? 0xFFFF : 0, (a3>=b3) ? 0xFFFF : 0,
+	(a4>=b4) ? 0xFFFF : 0, (a5>=b5) ? 0xFFFF : 0,
+	(a6>=b6) ? 0xFFFF : 0, (a7>=b7) ? 0xFFFF : 0,
+	(a8>=b8) ? 0xFFFF : 0, (a9>=b9) ? 0xFFFF : 0,
+	(a10>=b10) ? 0xFFFF : 0, (a11>=b11) ? 0xFFFF : 0,
+	(a12>=b12) ? 0xFFFF : 0, (a13>=b13) ? 0xFFFF : 0,
+	(a14>=b14) ? 0xFFFF : 0, (a15>=b15) ? 0xFFFF : 0]					  int16_t
+	*/
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+
+	/*
+	* Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in vect_t.
+	* Args   :	[a0, ..., a15]		int16_t
+			[b0, ..., b15]		int16_t
+	* Return : [(a0<=b0) ? 0xFFFF : 0, (a1<=b1) ? 0xFFFF : 0,
+	(a2<=b2) ? 0xFFFF : 0, (a3<=b3) ? 0xFFFF : 0,
+	(a4<=b4) ? 0xFFFF : 0, (a5<=b5) ? 0xFFFF : 0,
+	(a6<=b6) ? 0xFFFF : 0, (a7<=b7) ? 0xFFFF : 0,
+	(a8<=b8) ? 0xFFFF : 0, (a9<=b9) ? 0xFFFF : 0,
+	(a10<=b10) ? 0xFFFF : 0, (a11<=b11) ? 0xFFFF : 0,
+	(a12<=b12) ? 0xFFFF : 0, (a13<=b13) ? 0xFFFF : 0,
+	(a14<=b14) ? 0xFFFF : 0, (a15<=b15) ? 0xFFFF : 0] 					   int16_t
+	*/
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
+
+	/*
+	* Horizontally add the 16-bit elements of a.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]
+	* Return : a0+a1+a2+a3+a4+a5+a6+a7+a8+a9+a10+a11+a12+a13+a14+a15
+	*/
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		Converter ca;
+		ca.v = a;
+		return scalar_t(ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3] + ca.t[4] + ca.t[5] + ca.t[6] + ca.t[7] + ca.t[8] + ca.t[9] +
+				ca.t[10] + ca.t[11] + ca.t[12] + ca.t[13] + ca.t[14] + ca.t[15]);
+	}
+
+	static INLINE CONST vect_t round(const vect_t a) { return a; }
+
+	static INLINE CONST vect_t signbits(const vect_t x) {
+		vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1));
+		return signBits;
+	}
+
+	static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
+							 const vect_t &MAX, vect_t &Q, vect_t &T) {
 #ifdef __INTEL_COMPILER
-        C = _mm256_rem_epi16(C, P);
+		C = _mm256_rem_epi16(C, P);
 #else
-        FFLASFFPACK_abort("pas implementé");
+		FFLASFFPACK_abort("pas implementé");
 #endif
-        NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
-        return C;
-    }
-
-#else
-
-#error "You need AVX2 instructions to perform 256bits operations on int16_t"
-
-#endif // defined(__FFLASFFPACK_USE_AVX2)
+		NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
+		return C;
+	}
 };
 
-// uint16_t
+/*
+ * Simd256 specialized for uint16_t
+ */
 template <> struct Simd256_impl<true, true, false, 2> : public Simd256_impl<true, true, true, 2> {
-    using scalar_t = uint16_t;
 
-#if defined(__FFLASFFPACK_USE_AVX2)
-
-    static INLINE CONST vect_t greater(vect_t a, vect_t b) {
-
-        vect_t x;
-        x = set1(-(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
-        a = sub(x, a);
-        b = sub(x, b);
-        return _mm256_cmpgt_epi16(a, b);
-    }
-
-    static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
-        vect_t x;
-        x = set1(-(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
-        a = sub(x, a);
-        b = sub(x, b);
-        return _mm256_cmpgt_epi16(a, b);
-    }
-
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
-
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
-#else
-
-#error "You need AVX2 instructions to perform 256bits operations on uint16_t"
-
-#endif // defined(__FFLASFFPACK_USE_AVX2)
-};
+	/*
+	* define the scalar type corresponding to the specialization
+	*/
+	using scalar_t = uint16_t;
+
+	/*
+	 * Simd128 for scalar_t, used to deal with half_t
+	 */
+	using simdHalf = Simd128<scalar_t>;
+
+	/*
+	 * Converter from vect_t to an array of scalar_t.
+	 * example:
+	 *	Converter conv;
+	 *	conv.v = a;
+	 *	scalar_t x = conv.t[1];
+	 */
+	union Converter {
+		vect_t v;
+		scalar_t t[vect_size];
+	};
+
+	/*
+	*  Broadcast 16-bit unsigned integer a to all elements of dst. This intrinsic may generate the vpbroadcastw.
+	*  Return [x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x] uint16_t
+	*/
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi16(x); }
+
+	/*
+	*  Set packed 16-bit unsigned integers in dst with the supplied values.
+	*  Return [x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15] uint16_t
+	*/
+	static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3,
+								   const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7,
+								   const scalar_t x8, const scalar_t x9, const scalar_t x10, const scalar_t x11,
+								   const scalar_t x12, const scalar_t x13, const scalar_t x14, const scalar_t x15) {
+		return _mm256_set_epi16(x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0);
+	}
+
+	/*
+	*  Gather 16-bit integer elements with indexes idx[0], ..., idx[15] from the address p in vect_t.
+	*  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]],
+	p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]],
+	p[idx[8]], p[idx[9]], p[idx[10]], p[idx[11]],
+	p[idx[12]], p[idx[13]], p[idx[14]], p[idx[15]]] uint16_t
+	*/
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]], p[idx[8]],
+				p[idx[9]], p[idx[10]], p[idx[11]], p[idx[12]], p[idx[13]], p[idx[14]], p[idx[15]]);
+	}
+
+	/*
+	* Load 256-bits of unsigned integer data from memory into dst.
+	* p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	* Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7],p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15]] uint16_t
+	*/
+	static INLINE PURE vect_t load(const scalar_t *const p) {
+		return _mm256_load_si256(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Load 256-bits of unsigned integer data from memory into dst.
+	* p does not need to be aligned on any particular boundary.
+	* Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7],p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15]] uint16_t
+	*/
+	static INLINE PURE vect_t loadu(const scalar_t *const p) {
+		return _mm256_loadu_si256(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Store 256-bits of unsigned integer data from a into memory.
+	* p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	*/
+	static INLINE void store(scalar_t *p, vect_t v) {
+		_mm256_store_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 256-bits of unsigned integer data from a into memory.
+	* p does not need to be aligned on any particular boundary.
+	*/
+	static INLINE void storeu(scalar_t *p, vect_t v) {
+		_mm256_storeu_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 256-bits of unsigned integer data from a into memory using a non-temporal memory hint.
+	* p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+	*/
+	static INLINE void stream(scalar_t *p, const vect_t v) {
+		_mm256_stream_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Shift packed 16-bit unsigned integers in a right by s while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0, ..., a15]				uint16_t
+	* Return : [Floor(a0/2^s), ..., Floor(a15/2^s)]	uint16_t
+	*/
+	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm256_srli_epi16(a, s); }
+
+	static INLINE CONST vect_t greater(vect_t a, vect_t b) {
+		vect_t x;
+		x = set1((static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
+		a = sub(a,x);
+		b = sub(b,x);
+		return _mm256_cmpgt_epi16(a, b);
+	}
+
+	static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
+		vect_t x;
+		x = set1((static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
+		a = sub(a,x);
+		b = sub(b,x);
+		return _mm256_cmpgt_epi16(b, a);
+	}
+
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
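+
+	/*
+	* Note on the comparisons above (illustrative, not upstream documentation): AVX2 only
+	* offers a signed 16-bit compare, so both operands are first shifted by 2^15 = 0x8000,
+	* which maps the unsigned order onto the signed order.  For example, a = 0xFFFF (65535)
+	* and b = 0x0001 become 0x7FFF (32767) and 0x8001 (-32767) after the subtraction, and
+	* the signed compare then correctly reports a > b.
+	*/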
+
+	/*
+	* Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers,
+	* and store the high 16 bits of the intermediate integers in vect_t.
+	* Args   :	[a0, ..., a15]		uint16_t
+			[b0, ..., b15]		uint16_t
+	* Return : [Floor(a0*b0/2^16), ..., Floor(a15*b15/2^16)] uint16_t
+	*/
+	static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { return _mm256_mulhi_epu16(a, b); }
+
+	/*
+	* Multiply the low unsigned 8-bit integers from each packed 16-bit element in a and b,
+	* and store the unsigned 16-bit results in vect_t.
+	* Args   :	[a0, ..., a15]		uint16_t
+			[b0, ..., b15]		uint16_t
+	* Return : [(a0 mod 2^8)*(b0 mod 2^8), ..., (a15 mod 2^8)*(b15 mod 2^8)] uint16_t
+	*/
+	static INLINE CONST vect_t mulx(vect_t a, vect_t b) {
+		//#pragma warning "The simd mulx function is emulated, it may impact the performances."
+		vect_t a1, b1, mask1;
+		mask1 = set1(0x00FF);
+		a1 = vand(a,mask1);
+		b1 = vand(b,mask1);
+		return mul(a1,b1);
+	}
+
+	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
+
+	static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
+
+	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
+
+	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
+
+	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }
+
+	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }
+
+	/*
+	* Horizontally add the 16-bit elements of a.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15]
+	* Return : a0+a1+a2+a3+a4+a5+a6+a7+a8+a9+a10+a11+a12+a13+a14+a15
+	*/
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		Converter ca;
+		ca.v = a;
+		return scalar_t(ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3] + ca.t[4] + ca.t[5] + ca.t[6] + ca.t[7] + ca.t[8] + ca.t[9] +
+				ca.t[10] + ca.t[11] + ca.t[12] + ca.t[13] + ca.t[14] + ca.t[15]);
+	}
+};  //Simd256_impl<true,true,false,2>
 
 #endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_int16_INL
diff --git a/fflas-ffpack/fflas/fflas_simd/simd256_int32.inl b/fflas-ffpack/fflas/fflas_simd/simd256_int32.inl
index ffe0b7e..7b870a1 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd256_int32.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd256_int32.inl
@@ -1,10 +1,11 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
  * Written by   Bastien Vialla<bastien.vialla at lirmm.fr>
  * Brice Boyer (briceboyer) <boyer.brice at gmail.com>
+ * Romain Lebreton <romain.lebreton at lirmm.fr>
  *
  *
  * ========LICENCE========
@@ -30,455 +31,680 @@
 #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_int32_INL
 #define __FFLASFFPACK_fflas_ffpack_utils_simd256_int32_INL
 
+#ifndef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS
+#error "You need AVX2 instructions to perform 256bits operations on int32_t"
+#endif
+
+#include "fflas-ffpack/fflas/fflas_simd/simd256_int64.inl"
+
 /*
  * Simd256 specialized for int32_t
  */
-template <> struct Simd256_impl<true, true, true, 4> {
-#if defined(__FFLASFFPACK_USE_AVX2)
-    /*
-     * alias to 256 bit simd register
-     */
-    using vect_t = __m256i;
-
-    /*
-     * alias to 256 bit simd register
-     */
-    using half_t = __m128i;
-
-    /*
-     * define the scalar type corresponding to the specialization
-     */
-    using scalar_t = int32_t;
-
-    /*
-     * Simd128 for scalar_t, to deal half_t
-     */
-    using simdHalf = Simd128<scalar_t>;
-
-    /*
-     *  number of scalar_t in a simd register
-     */
-    static const constexpr size_t vect_size = 8;
-
-    /*
-     *  alignement required by scalar_t pointer to be loaded in a vect_t
-     */
-    static const constexpr size_t alignment = 32;
-
-    /*
-     * Check if the pointer p is a multiple of alignemnt
-     */
-    template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
-
-    /*
-     * Check if the number n is a multiple of vect_size
-     */
-    template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
-
-    /*
-     * Converter from vect_t to a tab.
-     * exple:
-     *		Converter conv;
-     *		conv.v = a;
-     *		scalart_t x = conv.t[1]
-     */
-    union Converter {
-        vect_t v;
-        scalar_t t[vect_size];
-    };
-
-    /*
-     *  Return vector of type vect_t with all elements set to zero
-     *  Return [0,0,0,0,0,0,0,0] int32_t
-     */
-    static INLINE CONST vect_t zero() { return _mm256_setzero_si256(); }
-
-    /*
-     *  Broadcast 32-bit integer a to all all elements of dst. This intrinsic may generate the vpbroadcastw.
-     *  Return [x,x,x,x,x,x,x,x] int32_t
-     */
-    static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi32(x); }
-
-    /*
-     *  Broadcast 32-bit integer a to all all elements of dst. This intrinsic may generate the vpbroadcastw.
-     *  Return [x0,x1,x2,x3,x4,x5,x6,x7] int32_t
-     */
-    static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3,
-                                   const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7) {
-        return _mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0);
-    }
-
-    /*
-     *  Gather 32-bit integer elements with indexes idx[0], ..., idx[7] from the address p in vect_t.
-     *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]],
-     p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]] int32_t
-    */
-    template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
-        return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]);
-    }
-
-    /*
-     * Load 256-bits of integer data from memory into dst.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int32_t
-     */
-    static INLINE PURE vect_t load(const scalar_t *const p) {
-        return _mm256_load_si256(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Load 256-bits of integer data from memory into dst.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int32_t
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) {
-        return _mm256_loadu_si256(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Store 256-bits of integer data from a into memory.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, vect_t v) {
-        _mm256_store_si256(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 256-bits of integer data from a into memory.
-     * p does not need to be aligned on any particular boundary.
-     */
-    static INLINE void storeu(const scalar_t *p, vect_t v) {
-        _mm256_storeu_si256(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 256-bits of integer data from a into memory using a non-temporal memory hint.
-     * p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
-     */
-    static INLINE void stream(const scalar_t *p, const vect_t v) {
-        _mm256_stream_si256(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-	
-    /*
-     * Shift packed 32-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
-     * Return : [a0 << s, a1 << s, a2 << s, a3 << s, a4 << s, a5 << s, a6 << s, a7 << s] int32_t
-     */
-    static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm256_slli_epi32(a, s); }
-
-    /*
-     * Shift packed 32-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
-     * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s] int32_t
-     */
-    static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm256_srli_epi32(a, s); }
-
-
-	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm256_sra_epi32(a, Simd128<int>::set1(s)); }
-
-    /*
-     * Add packed 32-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] 						   int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] 						   int32_t
-     * Return : [a0+b0, a1+b1, a2+b2, a3+b3, a4+b4, a5+b5, a6+b6, a7+b7]   int32_t
-     */
-    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_epi32(a, b); }
-
-    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
-
-    /*
-     * Subtract packed 32-bits integers in b from packed 32-bits integers in a, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] 						  int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] 						  int32_t
-     * Return : [a0-b0, a1-b1, a2-b2, a3-b3, a4-b4, a5-b5, a6-b6, a7-b7]  int32_t
-     */
-    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_epi32(a, b); }
-
-    static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
-
-    /*
-     * Multiply the packed 32-bits integers in a and b, producing intermediate 64-bit integers, and store the low 32
-     bits of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7, a8]           						     int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7, b8]  		 							 int32_t
-     * Return : [a0*b0 mod 2^32-1, a1*b1 mod 2^32-1, a2*b2 mod 2^32-1, a3*b3 mod 2^32-1,
-     a4*b4 mod 2^32-1, a5*b5 mod 2^32-1, a6*b6 mod 2^32-1, a7*b7 mod 2^32-1] int32_t
-    */
-    static INLINE CONST vect_t mullo(const vect_t a, const vect_t b) { return _mm256_mullo_epi32(a, b); }
-
-    /*
-     * Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 16 bits
-     of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]           int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7]           int32_t
-     * Return : [a0*b0 mod 2^32-1, a1*b1 mod 2^32-1, a2*b2 mod 2^32-1, a3*b3 mod 2^32-1,
-     a4*b4 mod 2^32-1, a5*b5 mod 2^32-1, a6*b6 mod 2^32-1, a7*b7 mod 2^32-1] int32_t
-    */
-    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); }
-
-    /*
-     * Multiply packed 32-bit integers in a and b, producing intermediate 64-bit integers, and add the low 32-bits of
-     the intermediate with c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]           int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7]           int32_t
-     [c0, c1, c2, c3, c4, c5, c6, c7]           int32_t
-     * Return : [(a0*b0 mod 2^32-1)+c0, (a1*b1 mod 2^32-1)+c1, (a2*b2 mod 2^32-1)+c2, (a3*b3 mod 2^32-1)+c3,
-     (a4*b4 mod 2^32-1)+c4, (a5*b5 mod 2^32-1)+c5, (a6*b6 mod 2^32-1)+c6, (a7*b7 mod 2^32-1)+c7]
-    */
-    static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); }
-
-    static INLINE CONST vect_t fmaddin(vect_t c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
-
-    /*
-     * Multiply packed 32-bit integers in a and b, producing intermediate 64-bit integers, and substract elements of c
-     to the low 32-bit of the intermiate result, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]           int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7]           int32_t
-     [c0, c1, c2, c3, c4, c5, c6, c7]           int32_t
-     * Return : [-(a0*b0 mod 2^32-1)+c0, -(a1*b1 mod 2^32-1)+c1, -(a2*b2 mod 2^32-1)+c2, -(a3*b3 mod 2^32-1)+c3,
-     -(a4*b4 mod 2^32-1)+c4, -(a5*b5 mod 2^32-1)+c5, -(a6*b6 mod 2^32-1)+c6, -(a7*b7 mod 2^32-1)+c7]
-    */
-    static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
-
-    static INLINE CONST vect_t fnmaddin(vect_t c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
-
-    /*
-     * Multiply packed 32-bit integers in a and b, producing intermediate 64-bit integers, and substract the low 32-bits
-     of the intermediate with c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]           int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7]           int32_t
-     [c0, c1, c2, c3, c4, c5, c6, c7]           int32_t
-     * Return : [(a0*b0 mod 2^32-1)-c0, (a1*b1 mod 2^32-1)-c1, (a2*b2 mod 2^32-1)-c2, (a3*b3 mod 2^32-1)-c3,
-     (a4*b4 mod 2^32-1)-c4, (a5*b5 mod 2^32-1)-c5, (a6*b6 mod 2^32-1)-c6, (a7*b7 mod 2^32-1)-c7]
-    */
-    static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
-
-    static INLINE CONST vect_t fsubin(vect_t c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
-
-    /*
-     * Multiply the packed 32-bits integers in a and b, producing intermediate 64-bit integers, and store the high 32
-     bits of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
-     * Return :
-     */
-    static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) {
-        Converter ca, cb;
-        ca.v = a;
-        cb.v = b;
-        vect_t a1, a2, b1, b2, c1, c2;
-        a1 = set(0, ca.t[0], 0, ca.t[1], 0, ca.t[2], 0, ca.t[3]);
-        a2 = set(0, ca.t[4], 0, ca.t[5], 0, ca.t[6], 0, ca.t[7]);
-        b1 = set(0, cb.t[0], 0, cb.t[1], 0, cb.t[2], 0, cb.t[3]);
-        b2 = set(0, cb.t[4], 0, cb.t[5], 0, cb.t[6], 0, cb.t[7]);
-        c1 = mulx(a1, b1);
-        c2 = mulx(a2, b2);
-        ca.v = c1;
-        cb.v = c2;
-        return set(ca.t[0], ca.t[2], ca.t[4], ca.t[6], cb.t[0], cb.t[2], cb.t[4], cb.t[6]);
-    }
-
-    /*
-     * Multiply the low 16-bit integers from each packed 32-bit element in a and b, and store the signed 32-bit results
-     in dst.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]    int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7]    int32_t
-     * Return : [a0*b0, a1*b1, a2*b2, a3*b3, a4*b4, a5*b5, a6*b6, a7*b7] int32_t
-     */
-    static INLINE CONST vect_t mulx(vect_t a, vect_t b) {
-        vect_t mask = set1(0x0000FFFF);
-        a = vand(a, mask);
-        b = vand(b, mask);
-        return mullo(a, b);
-    }
-
-    /*
-     * Compare packed 32-bits in a and b for equality, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]								   int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] 								   int32_t
-     * Return : [(a0==b0) ? 0xFFFF : 0, (a1==b1) ? 0xFFFF : 0,
-     (a2==b2) ? 0xFFFF : 0, (a3==b3) ? 0xFFFF : 0,
-     (a4==b4) ? 0xFFFF : 0, (a5==b5) ? 0xFFFF : 0,
-     (a6==b6) ? 0xFFFF : 0, (a7==b7) ? 0xFFFF : 0]                     int32_t
-    */
-    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmpeq_epi32(a, b); }
-
-    /*
-     * Compare packed 32-bits in a and b for greater-than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]								   int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] 								   int32_t
-     * Return : [(a0>b0) ? 0xFFFF : 0, (a1>b1) ? 0xFFFF : 0,
-     (a2>b2) ? 0xFFFF : 0, (a3>b3) ? 0xFFFF : 0,
-     (a4>b4) ? 0xFFFF : 0, (a5>b5) ? 0xFFFF : 0,
-     (a6>b6) ? 0xFFFF : 0, (a7>b7) ? 0xFFFF : 0]                     	int32_t
-    */
-    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi32(a, b); }
-
-    /*
-     * Compare packed 32-bits in a and b for lesser-than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
-     * Return : [(a0<b0) ? 0xFFFF : 0, (a1<b1) ? 0xFFFF : 0,
-     (a2<b2) ? 0xFFFF : 0, (a3<b3) ? 0xFFFF : 0,
-     (a4<b4) ? 0xFFFF : 0, (a5<b5) ? 0xFFFF : 0,
-     (a6<b6) ? 0xFFFF : 0, (a7<b7) ? 0xFFFF : 0] 					  int32_t
-    */
-    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi32(b, a); }
-
-    /*
-     * Compare packed 32-bits in a and b for greater or equal than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]									 int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] 									 int32_t
-     * Return : [(a0>=b0) ? 0xFFFF : 0, (a1>=b1) ? 0xFFFF : 0,
-     (a2>=b2) ? 0xFFFF : 0, (a3>=b3) ? 0xFFFF : 0,
-     (a4>=b4) ? 0xFFFF : 0, (a5>=b5) ? 0xFFFF : 0,
-     (a6>=b6) ? 0xFFFF : 0, (a7>=b7) ? 0xFFFF : 0]					  int32_t
-    */
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
-
-    /*
-     * Compare packed 32-bits in a and b for lesser or equal than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] 					int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] 					int32_t
-     * Return : [(a0<=b0) ? 0xFFFF : 0, (a1<=b1) ? 0xFFFF : 0,
-     (a2<=b2) ? 0xFFFF : 0, (a3<=b3) ? 0xFFFF : 0,
-     (a4<=b4) ? 0xFFFF : 0, (a5<=b5) ? 0xFFFF : 0,
-     (a6<=b6) ? 0xFFFF : 0, (a7<=b7) ? 0xFFFF : 0] 		int32_t
-    */
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
-
-    /*
-     * Compute the bitwise AND of packed 32-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3, a4 AND b4, a5 AND b5, a6 AND b6, a7 AND b7]
-     */
-    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_si256(b, a); }
-
-    /*
-     * Compute the bitwise OR of packed 32-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3, a4 OR b4, a5 OR b5, a6 OR b6, a7 OR b7]
-     */
-    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_si256(b, a); }
-
-    /*
-     * Compute the bitwise XOR of packed 32-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3, a4 XOR b4, a5 XOR b5, a6 XOR b6, a7 XOR b7]
-     */
-    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_si256(b, a); }
-
-    /*
-     * Compute the bitwise AND NOT of packed 32-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     [b0, b1, b2, b3, b4, b5, b6, b7]
-     * Return : [a0 ANDNOT b0, a1 ANDNOT b1, a2 ANDNOT b2, a3 ANDNOT b3, a4 ANDNOT b4, a5 ANDNOT b5, a6 ANDNOT b6, a7
-     ANDNOT b7]
-    */
-    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_si256(b, a); }
-
-    /*
-     * Horizontally add 32-bits elements of a.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
-     * Return : a0+a1+a2+a3+a4+a5+a6+a7
-     */
-    static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
-        Converter ca;
-        ca.v = a;
-        return ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3] + ca.t[4] + ca.t[5] + ca.t[6] + ca.t[7];
-    }
-
-    static INLINE PURE half_t load_half(const scalar_t *const p) {
-        return _mm_load_si128(reinterpret_cast<const half_t *>(p));
-    }
-
-    static INLINE PURE half_t loadu_half(const scalar_t *const p) {
-        return _mm_loadu_si128(reinterpret_cast<const half_t *>(p));
-    }
-
-    static INLINE void store_half(const scalar_t *p, half_t v) {
-        _mm_store_si128(reinterpret_cast<half_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    static INLINE void storeu_half(const scalar_t *p, half_t v) {
-        _mm_storeu_si128(reinterpret_cast<half_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     *
-     * Args   : [0, a1, 0, a3, 0, a5, 0, a7] int32_t
-     [0, b1, 0, b3, 0, b5, 0, b7] int32_t
-     [c0, c1, c2, c3] 			 int64_t
-     * Return : [c0+a1*b1, c1+a3*b2, c2+a5*b5, c3+a7*b7] int64_t
-     */
-    static INLINE CONST vect_t fmaddx(vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
-    static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
-
-    static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
-
-    static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
-
-    static INLINE CONST vect_t round(const vect_t a) { return a; }
-
-    static INLINE CONST vect_t signbits(const vect_t x) {
-        vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1));
-        return signBits;
-    }
-
-    static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
-                             const vect_t &MAX, vect_t &Q, vect_t &T) {
-#ifdef __INTEL_COMPILER
-        C = _mm256_rem_epi32(C, P);
+template <> struct Simd256_impl<true, true, true, 4> : public Simd256i_base {
+
+	/*
+	* alias to 256 bit simd register
+	*/
+	using vect_t = __m256i;
+
+	/*
+	* alias to 256 bit simd register
+	*/
+	using half_t = __m128i;
+
+	/*
+	* define the scalar type corresponding to the specialization
+	*/
+	using scalar_t = int32_t;
+
+	/*
+	* Simd128 for scalar_t, used to deal with half_t
+	*/
+	using simdHalf = Simd128<scalar_t>;
+
+	/*
+	*  number of scalar_t in a simd register
+	*/
+	static const constexpr size_t vect_size = 8;
+
+	/*
+	*  alignment required for a scalar_t pointer to be loaded into a vect_t
+	*/
+	static const constexpr size_t alignment = 32;
+
+	/*
+	* Check if the pointer p is a multiple of alignment
+	*/
+	template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
+
+	/*
+	* Check if the number n is a multiple of vect_size
+	*/
+	template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
+
+	/*
+	* Converter from vect_t to an array of scalar_t.
+	* example:
+	*	Converter conv;
+	*	conv.v = a;
+	*	scalar_t x = conv.t[1];
+	*/
+	union Converter {
+		vect_t v;
+		scalar_t t[vect_size];
+	};
+
+	/*
+	*  Broadcast 32-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastd.
+	*  Return [x,x,x,x,x,x,x,x] int32_t
+	*/
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi32(x); }
+
+	/*
+	*  Set packed 32-bit integers in dst with the supplied values.
+	*  Return [x0,x1,x2,x3,x4,x5,x6,x7] int32_t
+	*/
+	static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3,
+								   const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7) {
+		return _mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0);
+	}
+
+	/*
+	*  Gather 32-bit integer elements with indexes idx[0], ..., idx[7] from the address p in vect_t.
+	*  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]] int32_t
+	*/
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]);
+	}
+
+	/*
+	* Load 256-bits of integer data from memory into dst.
+	* p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	* Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int32_t
+	*/
+	static INLINE PURE vect_t load(const scalar_t *const p) {
+		return _mm256_load_si256(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Load 256-bits of integer data from memory into dst.
+	* p does not need to be aligned on any particular boundary.
+	* Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int32_t
+	*/
+	static INLINE PURE vect_t loadu(const scalar_t *const p) {
+		return _mm256_loadu_si256(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Store 256-bits of integer data from a into memory.
+	* p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	*/
+	static INLINE void store(scalar_t *p, vect_t v) {
+		_mm256_store_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 256-bits of integer data from a into memory.
+	* p does not need to be aligned on any particular boundary.
+	*/
+	static INLINE void storeu(scalar_t *p, vect_t v) {
+		_mm256_storeu_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 256-bits of integer data from a into memory using a non-temporal memory hint.
+	* p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+	*/
+	static INLINE void stream(scalar_t *p, const vect_t v) {
+		_mm256_stream_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Shift packed 32-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
+	* Return : [a0 << s, a1 << s, a2 << s, a3 << s, a4 << s, a5 << s, a6 << s, a7 << s] int32_t
+	*/
+	static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm256_slli_epi32(a, s); }
+
+	/*
+	* Shift packed 32-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
+	* Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s] int32_t
+	*/
+	static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm256_srli_epi32(a, s); }
+
+	/*
+	* Shift packed 32-bit integers in a right by s while shifting in sign bits, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
+	* Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s] int32_t
+	*/
+	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm256_srai_epi32(a, s); }
+
+	/*
+	* Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst.
+	* Args   : [a0, ..., a7] int32_t
+	* Return : [a[s[0..1]], ..., a[s[6..7]],a[4+s[0..1]], ..., a[4+s[6..7]]] int32_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t shuffle_twice(const vect_t a) {
+		return _mm256_shuffle_epi32(a, s);
+	}
+
+	/*
+	* Shuffle 32-bit integers in a using the control in imm8, and store the results in dst.
+	* Args   : [a0, ..., a7] int32_t
+	* Return : [a[s[0..3]], ..., a[s[28..31]]] int32_t
+	*/
+	template<uint32_t s>
+	static INLINE CONST vect_t shuffle(const vect_t a) {
+		//#pragma warning "The simd shuffle function is emulated, it may impact the performances."
+		Converter conv;
+		conv.v = a;
+		return set (conv.t[( s      & 0x0000000F)], conv.t[((s>> 4) & 0x0000000F)],
+					conv.t[((s>> 8) & 0x0000000F)], conv.t[((s>>12) & 0x0000000F)],
+					conv.t[((s>>16) & 0x0000000F)], conv.t[((s>>20) & 0x0000000F)],
+					conv.t[((s>>24) & 0x0000000F)], conv.t[((s>>28) & 0x0000000F)]);
+	}
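+
+	/*
+	* Example of the control-word layout (illustrative values): with s = 0x76543210 every
+	* nibble selects its own position, so shuffle<0x76543210>(a) returns a unchanged, while
+	* s = 0x00000000 broadcasts a0 to all eight positions.
+	*/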
+
+	/*
+	* Unpack and interleave 32-bit integers from the low half of a and b within 128-bit lanes, and store the results in dst.
+	* Args   :	[a0, ..., a7] int32_t
+				[b0, ..., b7] int32_t
+	* Return :	[a0, b0, a1, b1, a4, b4, a5, b5] int32_t
+	*/
+	static INLINE CONST vect_t unpacklo_twice(const vect_t a, const vect_t b) { return _mm256_unpacklo_epi32(a, b); }
+
+	/*
+	* Unpack and interleave 32-bit integers from the high half of a and b within 128-bit lanes, and store the results in dst.
+	* Args   :	[a0, ..., a7] int32_t
+				[b0, ..., b7] int32_t
+	* Return :	[a2, b2, a3, b3, a6, b6, a7, b7] int32_t
+	*/
+	static INLINE CONST vect_t unpackhi_twice(const vect_t a, const vect_t b) { return _mm256_unpackhi_epi32(a, b); }
+
+	/*
+	* Unpack and interleave 32-bit integers from the low half of a and b, and store the results in dst.
+	* Args   :	[a0, ..., a7] int32_t
+				[b0, ..., b7] int32_t
+	* Return :	[a0, b0, ..., a3, b3] int32_t
+	*/
+	static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) {
+		using Simd256_64 = Simd256<uint64_t>;
+		vect_t a1 = Simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4 so a -> [a0,a2,a1,a3] uint64
+		vect_t b1 = Simd256_64::template shuffle<0xD8>(b); // 0xD8 = 3120 base_4
+		return unpacklo_twice(a1, b1);
+	}
+
+	/*
+	* Unpack and interleave 32-bit integers from the high half of a and b, and store the results in dst.
+	* Args   :	[a0, ..., a7] int32_t
+				[b0, ..., b7] int32_t
+	* Return :	[a4, b4, ..., a7, b7] int32_t
+	*/
+	static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) {
+		using Simd256_64 = Simd256<uint64_t>;
+		vect_t a1 = Simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+		vect_t b1 = Simd256_64::template shuffle<0xD8>(b); // 0xD8 = 3120 base_4
+		return unpackhi_twice(a1, b1);
+	}
+
+	/*
+	* Unpack and interleave 32-bit integers from the low then high half of a and b, and store the results in dst.
+	* Args   :	[a0, ..., a7] int32_t
+				[b0, ..., b7] int32_t
+	* Return :	[a0, b0, ..., a3, b3] int32_t
+	*			[a4, b4, ..., a7, b7] int32_t
+	*/
+	static INLINE CONST void unpacklohi(vect_t& s1, vect_t& s2, const vect_t a, const vect_t b) {
+		using Simd256_64 = Simd256<uint64_t>;
+		vect_t a1 = Simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+		vect_t b1 = Simd256_64::template shuffle<0xD8>(b); // 0xD8 = 3120 base_4
+		s1 = unpacklo_twice(a1, b1);
+		s2 = unpackhi_twice(a1, b1);
+	}
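+
+	/*
+	* Sketch of why the 64-bit pre-shuffle is used in unpacklo/unpackhi/unpacklohi
+	* (values are illustrative): unpacklo_twice works per 128-bit lane, so on the raw
+	* inputs it would yield [a0,b0,a1,b1, a4,b4,a5,b5].  Shuffling the 64-bit quadwords
+	* with control 0xD8 first reorders a into [a0,a1, a4,a5, a2,a3, a6,a7], and the
+	* per-lane unpack of the reordered vectors then gives the full-width interleavings
+	* [a0,b0,...,a3,b3] and [a4,b4,...,a7,b7].
+	*/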
+
+	/*
+	* Blend packed 32-bit integers from a and b using control mask imm8, and store the results in dst.
+	* Args   :	[a0, ..., a7] int32_t
+				[b0, ..., b7] int32_t
+	* Return :	[s[0]?b0:a0, ..., s[7]?b7:a7] int32_t (bit i of s set selects b_i)
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t blend(const vect_t a, const vect_t b) {
+		return _mm256_blend_epi32(a, b, s);
+	}
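+
+	/*
+	* Example (illustrative, assuming the _mm256_blend_epi32 convention that a set mask bit
+	* selects b): with s = 0x0F the four low mask bits are set, so
+	* blend<0x0F>(a, b) returns [b0, b1, b2, b3, a4, a5, a6, a7].
+	*/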
+
+	/*
+	* Add packed 32-bits integer in a and b, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7] 						int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7] 						int32_t
+	* Return : [a0+b0, a1+b1, a2+b2, a3+b3, a4+b4, a5+b5, a6+b6, a7+b7]   int32_t
+	*/
+	static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_epi32(a, b); }
+
+	static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
+
+	/*
+	* Subtract packed 32-bits integers in b from packed 32-bits integers in a, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7] 						int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7] 						int32_t
+	* Return : [a0-b0, a1-b1, a2-b2, a3-b3, a4-b4, a5-b5, a6-b6, a7-b7]  int32_t
+	*/
+	static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_epi32(a, b); }
+
+	static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
+
+	/*
+	* Multiply the packed 32-bits integers in a and b, producing intermediate 64-bit integers, and store the low 32
+	bits of the intermediate integers in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]						int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	 					int32_t
+	* Return : [a0*b0 smod 2^32, ..., a7*b7 smod 2^32]	int32_t
+	*	   where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2
+	*/
+	static INLINE CONST vect_t mullo(const vect_t a, const vect_t b) { return _mm256_mullo_epi32(a, b); }
+
+	static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); }
+
+	/*
+	* Multiply the packed 32-bits integers in a and b, producing intermediate 64-bit integers, and store the high 32
+	bits of the intermediate integers in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
+	* Return : [Floor(a0*b0/2^32), ..., Floor(a7*b7/2^32)] int32_t
+	*/
+	static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) {
+		//#pragma warning "The simd mulhi function is emulated, it may impact the performances."
+#if 0
+		typedef Simd256_impl<true, true, true, 8> Simd256_64;
+		Converter ca, cb;
+		ca.v = a;
+		cb.v = b;
+		vect_t a1, a2, b1, b2, c1, c2;
+		a1 = set(ca.t[0], 0, ca.t[1], 0, ca.t[2], 0, ca.t[3], 0);
+		a2 = set(ca.t[4], 0, ca.t[5], 0, ca.t[6], 0, ca.t[7], 0);
+		b1 = set(cb.t[0], 0, cb.t[1], 0, cb.t[2], 0, cb.t[3], 0);
+		b2 = set(cb.t[4], 0, cb.t[5], 0, cb.t[6], 0, cb.t[7], 0);
+		c1 = Simd256_64::mulx(a1, b1);
+		c2 = Simd256_64::mulx(a2, b2);
+		ca.v = c1;
+		cb.v = c2;
+		return set(ca.t[1], ca.t[3], ca.t[5], ca.t[7], cb.t[1], cb.t[3], cb.t[5], cb.t[7]);
 #else
-        FFLASFFPACK_abort("pas implementé");
-// C = fnmadd(C,_mm256_castps_si128(_mm256_floor_ps(_mm256_mul_ps(INVP,_mm256_castsi128_ps(C)))),P);
+		typedef Simd256_impl<true, true, true, 8> Simd256_64;
+		vect_t C,A1,B1;
+		C  = Simd256_64::mulx(a,b);
+		A1 = Simd256_64::srl(a,32);
+		B1 = Simd256_64::srl(b,32);
+		A1 = Simd256_64::mulx(A1,B1);
+		C  = Simd256_64::srl(C,32);
+		A1 = Simd256_64::srl(A1,32);
+		A1 = Simd256_64::sll(A1,32);
+		return Simd256_64::vor(C,A1);
 #endif
-        NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
-        return C;
-    }
-
+	}
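+
+	/*
+	* Sketch of the emulation above (even/odd split, values symbolic): the 64-bit mulx
+	* multiplies the even 32-bit elements of each 64-bit lane into full 64-bit products,
+	*	C  = [a0*b0, a2*b2, a4*b4, a6*b6]	(one product per 64-bit lane)
+	*	A1 = [a1*b1, a3*b3, a5*b5, a7*b7]	(after shifting a and b right by 32)
+	* keeping only the high 32 bits of each product (srl by 32, then sll by 32 for the odd
+	* ones) and or-ing the two vectors puts Floor(ai*bi/2^32) back into 32-bit lane i.
+	*/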
+
+	/*
+	* Multiply the low 16-bit integers from each packed 32-bit element in a and b, and store the signed 32-bit results
+	in dst.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	int32_t
+	* Return : [(a0 smod 2^16)*(b0 smod 2^16), ..., (a7 smod 2^16)*(b7 smod 2^16)]	int32_t
+	*	where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2
+	*/
+	static INLINE CONST vect_t mulx(vect_t a, vect_t b) {
+		//#pragma warning "The simd mulx function is emulated, it may impact the performances."
+		vect_t a1, b1, mask1, mask2;
+		mask1 = set1(0x0000FFFF);
+		mask2 = set1(0x00008000);
+		a1 = add(a,mask2);
+		a1 = vand(a1,mask1);
+		a1 = sub(a1,mask2);
+		b1 = add(b,mask2);
+		b1 = vand(b1,mask1);
+		b1 = sub(b1,mask2);
+		return mul(a1,b1);
+	}
+
+	/*
+	* Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+	* keep the low 32 bits of the intermediate and add the low 32-bits of c.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	int32_t
+	*	   [c0, c1, c2, c3, c4, c5, c6, c7]	int32_t
+	* Return :	[(a0*b0+c0) smod 2^32, ..., (a7*b7+c7) smod 2^32]	int32_t
+	*/
+	static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); }
+
+	static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
+
+	/*
+	* Multiply the low 16-bit integers from each packed 32-bit element in a and b,
+	* keep the signed 32-bit results and add the low 32-bits of c.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	int32_t
+	*	   [c0, c1, c2, c3, c4, c5, c6, c7]	int32_t
+	* Return :	[((a0 smod 2^16)*(b0 smod 2^16)+c0) smod 2^32, ...,
+	*		 ((a7 smod 2^16)*(b7 smod 2^16)+c7) smod 2^32]	int32_t
+	*/
+	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
+
+	static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
+
+	/*
+	* Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+	* and subtract the low 32 bits of the intermediate from the elements of c.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	int32_t
+	*	   [c0, c1, c2, c3, c4, c5, c6, c7]	int32_t
+	* Return :	[(-a0*b0+c0) smod 2^32, ..., (-a7*b7+c7) smod 2^32]	int32_t
+	*/
+	static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
+
+	static INLINE vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
+
+	/*
+	* Multiply the low 16-bit integers from each packed 32-bit element in a and b,
+	* keep the signed 32-bit results and subtract them from the elements of c.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	int32_t
+	*	   [c0, c1, c2, c3, c4, c5, c6, c7]	int32_t
+	* Return :	[(-(a0 smod 2^16)*(b0 smod 2^16)+c0) smod 2^32, ...,
+	*		 (-(a7 smod 2^16)*(b7 smod 2^16)+c7) smod 2^32]	int32_t
+	*/
+	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
+
+	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
+
+	/*
+	* Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+	* and subtract the elements of c from the low 32 bits of the intermediate.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	int32_t
+	*	   [c0, c1, c2, c3, c4, c5, c6, c7]	int32_t
+	* Return : [(a0*b0-c0) smod 2^32, ..., (a7*b7-c7) smod 2^32]	int32_t
+	*/
+	static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(mul(a, b), c); }
+
+	static INLINE vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
+
+	/*
+	* Multiply the low 16-bit integers from each packed 32-bit element in a and b,
+	* keep the signed 32-bit results and subtract the elements of c from them.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	int32_t
+	*	   [c0, c1, c2, c3, c4, c5, c6, c7]	int32_t
+	* Return :	[((a0 smod 2^16)*(b0 smod 2^16)-c0) smod 2^32, ...,
+	*		 ((a7 smod 2^16)*(b7 smod 2^16)-c7) smod 2^32]	int32_t
+	*/
+	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }
+
+	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }
+
+	/*
+	* Compare packed 32-bit integers in a and b for equality, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	int32_t
+	* Return : [(a0==b0) ? 0xFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFF : 0,
+	*	    (a2==b2) ? 0xFFFFFFFF : 0, (a3==b3) ? 0xFFFFFFFF : 0,
+	*	    (a4==b4) ? 0xFFFFFFFF : 0, (a5==b5) ? 0xFFFFFFFF : 0,
+	*	    (a6==b6) ? 0xFFFFFFFF : 0, (a7==b7) ? 0xFFFFFFFF : 0]	int32_t
+	*/
+	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmpeq_epi32(a, b); }
+
+	/*
+	* Compare packed 32-bit integers in a and b for greater-than, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	int32_t
+	* Return : [(a0>b0) ? 0xFFFFFFFF : 0, (a1>b1) ? 0xFFFFFFFF : 0,
+	*	    (a2>b2) ? 0xFFFFFFFF : 0, (a3>b3) ? 0xFFFFFFFF : 0,
+	*	    (a4>b4) ? 0xFFFFFFFF : 0, (a5>b5) ? 0xFFFFFFFF : 0,
+	*	    (a6>b6) ? 0xFFFFFFFF : 0, (a7>b7) ? 0xFFFFFFFF : 0]		int32_t
+	*/
+	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi32(a, b); }
+
+	/*
+	* Compare packed 32-bit integers in a and b for less-than, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	int32_t
+	* Return : [(a0<b0) ? 0xFFFFFFFF : 0, (a1<b1) ? 0xFFFFFFFF : 0,
+	*	    (a2<b2) ? 0xFFFFFFFF : 0, (a3<b3) ? 0xFFFFFFFF : 0,
+	*	    (a4<b4) ? 0xFFFFFFFF : 0, (a5<b5) ? 0xFFFFFFFF : 0,
+	*	    (a6<b6) ? 0xFFFFFFFF : 0, (a7<b7) ? 0xFFFFFFFF : 0]		int32_t
+	*/
+	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi32(b, a); }
+
+	/*
+	* Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	int32_t
+	* Return : [(a0>=b0) ? 0xFFFFFFFF : 0, (a1>=b1) ? 0xFFFFFFFF : 0,
+	*	    (a2>=b2) ? 0xFFFFFFFF : 0, (a3>=b3) ? 0xFFFFFFFF : 0,
+	*	    (a4>=b4) ? 0xFFFFFFFF : 0, (a5>=b5) ? 0xFFFFFFFF : 0,
+	*	    (a6>=b6) ? 0xFFFFFFFF : 0, (a7>=b7) ? 0xFFFFFFFF : 0]	int32_t
+	*/
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+
+	/*
+	* Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	int32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	int32_t
+	* Return : [(a0<=b0) ? 0xFFFFFFFF : 0, (a1<=b1) ? 0xFFFFFFFF : 0,
+	*	    (a2<=b2) ? 0xFFFFFFFF : 0, (a3<=b3) ? 0xFFFFFFFF : 0,
+	*	    (a4<=b4) ? 0xFFFFFFFF : 0, (a5<=b5) ? 0xFFFFFFFF : 0,
+	*	    (a6<=b6) ? 0xFFFFFFFF : 0, (a7<=b7) ? 0xFFFFFFFF : 0]	int32_t
+	*/
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
+
+	/*
+	* Horizontally add the 32-bit elements of a.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
+	* Return : a0+a1+a2+a3+a4+a5+a6+a7
+	*/
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		Converter ca;
+		ca.v = a;
+		return scalar_t(ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3] + ca.t[4] + ca.t[5] + ca.t[6] + ca.t[7]);
+	}
+
+	static INLINE CONST vect_t round(const vect_t a) { return a; }
+
+	static INLINE CONST vect_t signbits(const vect_t x) {
+		vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1));
+		return signBits;
+	}
+
+	static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN,
+							 const vect_t &MAX, vect_t &Q, vect_t &T) {
+#ifdef __INTEL_COMPILER
+		C = _mm256_rem_epi32(C, P);
 #else
-
-#error "You need AVX2 instructions to perform 256bits operations on int32_t"
-
-#endif // defined(__FFLASFFPACK_USE_AVX2)
+		FFLASFFPACK_abort("pas implementé");
+		// C = fnmadd(C,_mm256_castps_si128(_mm256_floor_ps(_mm256_mul_ps(INVP,_mm256_castsi128_ps(C)))),P);
+#endif
+		NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
+		return C;
+	}
 };
 
-// uint16_t
+/*
+ * Simd256 specialized for uint32_t
+ */
 template <> struct Simd256_impl<true, true, false, 4> : public Simd256_impl<true, true, true, 4> {
-#if defined(__FFLASFFPACK_USE_AVX2)
-
-    using scalar_t = uint32_t;
-
-    static INLINE CONST vect_t greater(vect_t a, vect_t b) {
-
-        vect_t x;
-        x = set1(-(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
-        a = sub(x, a);
-        b = sub(x, b);
-        return _mm256_cmpgt_epi32(a, b);
-    }
-
-    static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
-        vect_t x;
-        x = set1(-(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
-        a = sub(x, a);
-        b = sub(x, b);
-        return _mm256_cmpgt_epi32(a, b);
-    }
 
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
-
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
-#else
-
-#error "You need AVX2 instructions to perform 256bits operations on uint32_t"
-
-#endif // defined(__FFLASFFPACK_USE_AVX2)
-};
+	/*
+	* define the scalar type corresponding to the specialization
+	 */
+	using scalar_t = uint32_t;
+
+	/*
+	 * Simd128 for scalar_t, used to deal with half_t
+	 */
+	using simdHalf = Simd128<scalar_t>;
+
+	/*
+	* Converter from vect_t to an array of scalar_t.
+	* example:
+	*	Converter conv;
+	*	conv.v = a;
+	*	scalar_t x = conv.t[1];
+	*/
+	union Converter {
+		vect_t v;
+		scalar_t t[vect_size];
+	};
+
+	/*
+	*  Broadcast 32-bit unsigned integer a to all elements of dst. This intrinsic may generate the vpbroadcastd.
+	*  Return [x,x,x,x,x,x,x,x] uint32_t
+	*/
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi32(x); }
+
+	/*
+	*  Set packed 32-bit unsigned integers in dst with the supplied values.
+	*  Return [x0,x1,x2,x3,x4,x5,x6,x7] uint32_t
+	*/
+	static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3,
+								   const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7) {
+		return _mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0);
+	}
+
+	/*
+	*  Gather 32-bit unsigned integer elements with indexes idx[0], ..., idx[7] from the address p in vect_t.
+	*  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]] uint32_t
+	*/
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]);
+	}
+
+	/*
+	* Load 256-bits of unsigned integer data from memory into dst.
+	* p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	* Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] uint32_t
+	*/
+	static INLINE PURE vect_t load(const scalar_t *const p) {
+		return _mm256_load_si256(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Load 256-bits of unsigned integer data from memory into dst.
+	* p does not need to be aligned on any particular boundary.
+	* Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] uint32_t
+	*/
+	static INLINE PURE vect_t loadu(const scalar_t *const p) {
+		return _mm256_loadu_si256(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	* Store 256-bits of unsigned integer data from a into memory.
+	* p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	*/
+	static INLINE void store(scalar_t *p, vect_t v) {
+		_mm256_store_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 256-bits of unsigned integer data from a into memory.
+	* p does not need to be aligned on any particular boundary.
+	*/
+	static INLINE void storeu(scalar_t *p, vect_t v) {
+		_mm256_storeu_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Store 256-bits of unsigned integer data from a into memory using a non-temporal memory hint.
+	* p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+	*/
+	static INLINE void stream(scalar_t *p, const vect_t v) {
+		_mm256_stream_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Shift packed 32-bit unsigned integers in a right by s while shifting in zeros (for unsigned values this coincides with the arithmetic shift), and store the results in vect_t.
+	 * Args   : [a0, ..., a7]			uint32_t
+	 * Return : [Floor(a0/2^s), ..., Floor(a7/2^s)]	uint32_t
+	*/
+	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm256_srli_epi32(a, s); }
+
+	static INLINE CONST vect_t greater(vect_t a, vect_t b) {
+		vect_t x;
+		x = set1((static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
+		a = sub(a,x);
+		b = sub(b,x);
+		return _mm256_cmpgt_epi32(a, b);
+	}
+
+	static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
+		vect_t x;
+		x = set1((static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
+		a = sub(a,x);
+		b = sub(b,x);
+		return _mm256_cmpgt_epi32(b, a);
+	}
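+	/*
+	 * Editor's note (illustrative, not part of upstream): AVX2 only provides signed compares, so
+	 * greater/lesser above first bias both operands by 2^31 (subtracting set1(1 << 31), i.e.
+	 * flipping the top bit).  The map u -> u - 2^31 sends unsigned order to signed order
+	 * monotonically, e.g. for a = 0x00000001 and b = 0xFFFFFFFF (so a < b unsigned):
+	 *	a - 2^31 = 0x80000001 = INT32_MIN + 1,  b - 2^31 = 0x7FFFFFFF = INT32_MAX,
+	 * and the signed compare then gives the expected unsigned result.
+	 */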
+
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
+
+	/*
+	* Multiply the packed unsigned 32-bit integers in a and b, producing intermediate 64-bit integers,
+	* and store the high 32 bits of the intermediate integers in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7] uint32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7] uint32_t
+	* Return : [Floor(a0*b0/2^32), ..., Floor(a7*b7/2^32)] uint32_t
+	*/
+	static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) {
+		//#pragma warning "The simd mulhi function is emulated; it may impact performance."
+		typedef Simd256_impl<true, true, false, 8> Simd256_64;
+		vect_t C,A1,B1;
+		C  = Simd256_64::mulx(a,b);
+		A1 = Simd256_64::srl(a,32);
+		B1 = Simd256_64::srl(b,32);
+		A1 = Simd256_64::mulx(A1,B1);
+		C  = Simd256_64::srl(C,32);
+		A1 = Simd256_64::srl(A1,32);
+		A1 = Simd256_64::sll(A1,32);
+		return Simd256_64::vor(C,A1);
+	}
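+	/*
+	 * Editor's note (illustrative sketch, not part of upstream): per 32-bit lane the emulation
+	 * above computes the equivalent of
+	 *	uint32_t mulhi_u32_ref(uint32_t a, uint32_t b) { return uint32_t((uint64_t(a) * uint64_t(b)) >> 32); }
+	 * using two unsigned 32x32->64 products: mulx handles the even lanes in place, while the odd
+	 * lanes are shifted down by 32 bits, multiplied, and shifted back into the odd positions
+	 * before the final vor.
+	 */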
+
+	/*
+	* Multiply the low unsigned 16-bit integers from each packed 32-bit element in a and b,
+	* and store the unsigned 32-bit results in vect_t.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]	uint32_t
+	*	   [b0, b1, b2, b3, b4, b5, b6, b7]	uint32_t
+	* Return : [(a0 mod 2^16)*(b0 mod 2^16), ..., (a7 mod 2^16)*(b7 mod 2^16)]	uint32_t
+	*/
+	static INLINE CONST vect_t mulx(vect_t a, vect_t b) {
+		//#pragma warning "The simd mulx function is emulated; it may impact performance."
+		vect_t a1, b1, mask1;
+		mask1 = set1(0x0000FFFF);
+		a1 = vand(a,mask1);
+		b1 = vand(b,mask1);
+		return mul(a1,b1);
+	}
+
+	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
+
+	static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
+
+	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
+
+	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
+
+	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }
+
+	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }
+
+	/*
+	* Horizontally add 32-bits elements of a.
+	* Args   : [a0, a1, a2, a3, a4, a5, a6, a7]
+	* Return : a0+a1+a2+a3+a4+a5+a6+a7
+	*/
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		Converter ca;
+		ca.v = a;
+		return scalar_t(ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3] + ca.t[4] + ca.t[5] + ca.t[6] + ca.t[7]);
+	}
+}; //Simd256_impl<true,true,false,4>
 
 #endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_int32_INL
diff --git a/fflas-ffpack/fflas/fflas_simd/simd256_int64.inl b/fflas-ffpack/fflas/fflas_simd/simd256_int64.inl
index 6f5b829..7f61345 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd256_int64.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd256_int64.inl
@@ -1,10 +1,11 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
  * Written by   Bastien Vialla<bastien.vialla at lirmm.fr>
  * Brice Boyer (briceboyer) <boyer.brice at gmail.com>
+ * Romain Lebreton <romain.lebreton at lirmm.fr>
  *
  *
  * ========LICENCE========
@@ -30,491 +31,715 @@
 #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_int64_INL
 #define __FFLASFFPACK_fflas_ffpack_utils_simd256_int64_INL
 
+#ifndef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS
+#error "You need AVX2 instructions to perform 256bits operations on int64_t"
+#endif
+
 /*
  * Simd256 specialized for int64_t
  */
-template <> struct Simd256_impl<true, true, true, 8> {
-
-#if defined(__FFLASFFPACK_USE_AVX2)
-    /*
-     * alias to 256 bit simd register
-     */
-    using vect_t = __m256i;
-
-    /*
-     * alias to 256 bit simd register
-     */
-    using half_t = __m128i;
-
-    /*
-     * define the scalar type corresponding to the specialization
-     */
-    using scalar_t = int64_t;
-
-    /*
-     * Simd128 for scalar_t, to deal half_t
-     */
-    using simdHalf = Simd128<scalar_t>;
-
-    /*
-     *  number of scalar_t in a simd register
-     */
-    static const constexpr size_t vect_size = 4;
-
-    /*
-     *  alignement required by scalar_t pointer to be loaded in a vect_t
-     */
-    static const constexpr size_t alignment = 32;
-
-    /*
-     * Check if the pointer p is a multiple of alignemnt
-     */
-    template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
-
-    /*
-     * Check if the number n is a multiple of vect_size
-     */
-    template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
-
-    /*
-     * Converter from vect_t to a tab.
-     * exple:
-     *		Converter conv;
-     *		conv.v = a;
-     *		scalar_t x = conv.t[i]
-     */
-    union Converter {
-        vect_t v;
-        scalar_t t[vect_size];
-    };
-
-    /*
-     *  Return vector of type vect_t with all elements set to zero
-     *  Return [0,0,0,0] int64_t
-     */
-    static INLINE CONST vect_t zero() { return _mm256_setzero_si256(); }
-
-    /*
-     *  Broadcast 64-bit integer a to all all elements of dst. This intrinsic may generate the vpbroadcastw.
-     *  Return [x,x,x,x] int64_t
-     */
-    static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi64x(x); }
-
-    /*
-     *  Broadcast 64-bit integer a to all all elements of dst. This intrinsic may generate the vpbroadcastw.
-     *  Return [x0,x1,x2,x3] int64_t
-     */
-    static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3) {
-        return _mm256_set_epi64x(x3, x2, x1, x0);
-    }
-
-    /*
-     *  Gather 64-bit integer elements with indexes idx[0], ..., idx[3] from the address p in vect_t.
-     *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] int64_t
-     */
-    template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
-        return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]);
-    }
-
-    /*
-     * Load 256-bits of integer data from memory into dst.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     * Return [p[0],p[1],p[2],p[3]] int32_t
-     */
-    static INLINE PURE vect_t load(const scalar_t *const p) {
-        return _mm256_load_si256(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Load 256-bits of integer data from memory into dst.
-     * p does not need to be aligned on any particular boundary.
-     * Return [p[0],p[1],p[2],p[3]] int64_t
-     */
-    static INLINE PURE vect_t loadu(const scalar_t *const p) {
-        return _mm256_loadu_si256(reinterpret_cast<const vect_t *>(p));
-    }
-
-    /*
-     * Store 256-bits of integer data from a into memory.
-     * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
-     */
-    static INLINE void store(const scalar_t *p, vect_t v) {
-        _mm256_store_si256(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 256-bits of integer data from a into memory.
-     * p does not need to be aligned on any particular boundary.
-     */
-    static INLINE void storeu(const scalar_t *p, vect_t v) {
-        _mm256_storeu_si256(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Store 256-bits of integer data from a into memory using a non-temporal memory hint.
-     * p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
-     */
-    static INLINE void stream(const scalar_t *p, const vect_t v) {
-        _mm256_stream_si256(reinterpret_cast<vect_t *>(const_cast<scalar_t *>(p)), v);
-    }
-
-    /*
-     * Add packed 64-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3] 						   int64_t
-     [b0, b1, b2, b3] 						   int64_t
-     * Return : [a0+b0, a1+b1, a2+b2, a3+b3]   int64_t
-     */
-    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_epi64(a, b); }
-
-    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
-
-    /*
-     * Subtract packed 64-bits integers in b from packed 64-bits integers in a, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3] 						  int64_t
-     [b0, b1, b2, b3] 						  int64_t
-     * Return : [a0-b0, a1-b1, a2-b2, a3-b3]  int64_t
-     */
-    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_epi64(a, b); }
-
-    static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
-
-    /*
-     * Shift packed 64-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3] int64_t
-     * Return : [a0 << s, a1 << s, a2 << s, a3 << s] int64_t
-     */
-    static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm256_slli_epi64(a, s); }
-
-    /*
-     * Shift packed 64-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3] int64_t
-     * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s] int64_t
-     */
-    static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm256_srli_epi64(a, s); }
-
-    static INLINE CONST vect_t sra(const vect_t a, const int s) {
-#ifdef __AVX512__
-        return _mm256_sra_epi64(a, set1(s));
+template <> struct Simd256_impl<true, true, true, 8> : public Simd256i_base {
+
+	/*
+	 * alias to 256 bit simd register
+	 */
+	using vect_t = __m256i;
+
+	/*
+	 * alias to 128 bit simd register
+	 */
+	using half_t = __m128i;
+
+	/*
+	 * define the scalar type corresponding to the specialization
+	 */
+	using scalar_t = int64_t;
+
+	/*
+	 * Simd128 for scalar_t, to deal with half_t
+	 */
+	using simdHalf = Simd128<scalar_t>;
+
+	/*
+	 *  number of scalar_t in a simd register
+	 */
+	static const constexpr size_t vect_size = 4;
+
+	/*
+	 *  alignment required by scalar_t pointer to be loaded in a vect_t
+	 */
+	static const constexpr size_t alignment = 32;
+
+	/*
+	 * Check if the pointer p is a multiple of alignment
+	 */
+	template <class T> static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; }
+
+	/*
+	 * Check if the number n is a multiple of vect_size
+	 */
+	template <class T> static constexpr bool compliant(T n) { return n % vect_size == 0; }
+
+	/*
+	* Converter from vect_t to an array.
+	* example:
+	*	Converter conv;
+	*	conv.v = a;
+	*	scalar_t x = conv.t[1];
+	*/
+	union Converter {
+		vect_t v;
+		scalar_t t[vect_size];
+	};
+
+	/*
+	 *  Broadcast 64-bit integer x to all elements of dst. This intrinsic may generate the vpbroadcastq instruction.
+	 *  Return [x,x,x,x] int64_t
+	 */
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi64x(x); }
+
+	/*
+	 *  Set packed 64-bit integers in dst with the supplied values.
+	 *  Return [x0,x1,x2,x3] int64_t
+	 */
+	static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3) {
+		return _mm256_set_epi64x(x3, x2, x1, x0);
+	}
+
+	/*
+	 *  Gather 64-bit integer elements with indexes idx[0], ..., idx[3] from the address p in vect_t.
+	 *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] int64_t
+	 */
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]);
+	}
+
+	/*
+	 * Load 256-bits of integer data from memory into dst.
+	 * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	 * Return [p[0],p[1],p[2],p[3]] int64_t
+	 */
+	static INLINE PURE vect_t load(const scalar_t *const p) {
+		return _mm256_load_si256(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	 * Load 256-bits of integer data from memory into dst.
+	 * p does not need to be aligned on any particular boundary.
+	 * Return [p[0],p[1],p[2],p[3]] int64_t
+	 */
+	static INLINE PURE vect_t loadu(const scalar_t *const p) {
+		return _mm256_loadu_si256(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	 * Store 256-bits of integer data from a into memory.
+	 * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	 */
+	static INLINE void store(scalar_t *p, vect_t v) {
+		_mm256_store_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	 * Store 256-bits of integer data from a into memory.
+	 * p does not need to be aligned on any particular boundary.
+	 */
+	static INLINE void storeu(scalar_t *p, vect_t v) {
+		_mm256_storeu_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	 * Store 256-bits of integer data from a into memory using a non-temporal memory hint.
+	 * p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+	 */
+	static INLINE void stream(scalar_t *p, const vect_t v) {
+		_mm256_stream_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	 * Shift packed 64-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 * Return : [a0 << s, a1 << s, a2 << s, a3 << s] int64_t
+	 */
+	static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm256_slli_epi64(a, s); }
+
+	/*
+	 * Shift packed 64-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s] int64_t
+	 */
+	static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm256_srli_epi64(a, s); }
+
+	/*
+	* Shift packed 64-bit integers in a right by s while shifting in sign bits, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s] int64_t
+	*/
+	static INLINE CONST vect_t sra(const vect_t a, const int s) {
+#ifdef __FFLASFFPACK_HAVE_AVX512F_INSTRUCTIONS
+		return _mm256_srai_epi64(a, s);
 #else
-        const int b = 63 - s;
-        vect_t m = sll(set1(1), b);
-        vect_t x = srl(a, s);
-        vect_t result = sub(vxor(x, m), m); // result = x^m - m
-        return result;
+		const int b = 63 - s;
+		vect_t m = sll(set1(1), b);
+		vect_t x = srl(a, s);
+		vect_t result = sub(vxor(x, m), m); // result = x^m - m
+		return result;
 #endif
-    }
-
-    /*
-     * Multiply the packed 64-bits integers in a and b, producing intermediate 128-bit integers, and store the low 64
-     bits of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3]           						     int64_t
-     [b0, b1, b2, b3]  		 							 int64_t
-     * Return : [a0*b0 mod 2^64-1, a1*b1 mod 2^64-1, a2*b2 mod 2^64-1, a3*b3 mod 2^64-1] int64_t
-     */
-    static INLINE CONST vect_t mullo(vect_t a, vect_t b) {
-//#warning "The simd mullo function is emulate, it may impact the performances."
-        Converter ca, cb;
-        ca.v = a;
-        cb.v = b;
-        return set(ca.t[0] * cb.t[0], ca.t[1] * cb.t[1], ca.t[2] * cb.t[2], ca.t[3] * cb.t[3]);
-    }
-
-    static INLINE CONST vect_t mullox(const vect_t x0, const vect_t x1) { return _mm256_mullo_epi32(x0, x1); }
-
-    /*
-     * Multiply the packed 64-bits integers in a and b, producing intermediate 128-bit integers, and store the low 64
-     bits of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3]           						     int64_t
-     [b0, b1, b2, b3]  		 							 int64_t
-     * Return : [a0*b0 mod 2^64-1, a1*b1 mod 2^64-1, a2*b2 mod 2^64-1, a3*b3 mod 2^64-1] int64_t
-     */
-    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); }
-
-    /*
-     * Multiply the packed 64-bits integers in a and b, producing intermediate 128-bit integers, and store the high 64
-     bits of the intermediate integers in vect_t.
-     * Args   : [a0, a1, a2, a3]           						     int64_t
-     [b0, b1, b2, b3]  		 							 int64_t
-     * Return :
-     */
-    static INLINE CONST vect_t mulhi(vect_t a, vect_t b) {
-        // ugly solution, but it works.
-        // tested with gcc, clang, icc
-        Converter ca, cb;
-        ca.v = a;
-        cb.v = b;
-        return set((int128_t(ca.t[0]) * cb.t[0]) >> 64, (int128_t(ca.t[1]) * cb.t[1]) >> 64,
-                   (int128_t(ca.t[2]) * cb.t[2]) >> 64, (int128_t(ca.t[3]) * cb.t[3]) >> 64);
-    }
-
-    /*
-     * Multiply packed 64-bit integers in a and b, producing intermediate 128-bit integers, and add the low 64-bits of
-     the intermediate with c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3]           int64_t
-     [b0, b1, b2, b3]           int64_t
-     [c0, c1, c2, c3]           int64_t
-     * Return : [(a0*b0 mod 2^64-1)+c0, (a1*b1 mod 2^64-1)+c1, (a2*b2 mod 2^64-1)+c2, (a3*b3 mod 2^64-1)+c3]
-     */
-    static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); }
-
-    static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
-
-    /*
-     * Multiply packed 64-bit integers in a and b, producing intermediate 128-bit integers, and substract elements of c
-     to the low 64-bit of the intermiate result, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3]           int64_t
-     [b0, b1, b2, b3]           int64_t
-     [c0, c1, c2, c3]           int64_t
-     * Return : [-(a0*b0 mod 2^64-1)+c0, -(a1*b1 mod 2^64-1)+c1, -(a2*b2 mod 2^64-1)+c2, -(a3*b3 mod 2^64-1)+c3]
-     */
-    static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
-
-    /*
-     * Multiply packed 64-bit integers in a and b, producing intermediate 128-bit integers, and substract the low
-     64-bits of the intermediate with c, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3]           int64_t
-     [b0, b1, b2, b3]           int64_t
-     [c0, c1, c2, c3]           int64_t
-     * Return : [(a0*b0 mod 2^64-1)-c0, (a1*b1 mod 2^64-1)-c1, (a2*b2 mod 2^64-1)-c2, (a3*b3 mod 2^64-1)-c3]
-     */
-    static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(mul(a, b), c); }
-
-    /*
-     * Multiply the low 32-bits integers from each packed 64-bit element in a and b, and store the signed 64-bit results
-     in dst.
-     * Args   : [a0, a1, a2, a3]    int64_t
-     [b0, b1, b2, b3]    int64_t
-     * Return : [a0*b0, a1*b1, a2*b2, a3*b3] int64_t
-     */
-    static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { return _mm256_mul_epi32(a, b); }
-
-    /*
-     * Multiply the low 32-bits integers from each packed 64-bit element in a and b, and store the unsigned 64-bit
-     results in dst.
-     * Args   : [a0, a1, a2, a3]    int64_t
-     [b0, b1, b2, b3]    int64_t
-     * Return : [a0*b0, a1*b1, a2*b2, a3*b3] uint64_t
-     */
-    static INLINE CONST vect_t mulux(const vect_t a, const vect_t b) { return _mm256_mul_epu32(a, b); }
-
-    /*
-     * Compare packed 64-bits in a and b for equality, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3]								   int32_t
-     [b0, b1, b2, b3] 								   int32_t
-     * Return : [(a0==b0) ? 0xFFFF : 0, (a1==b1) ? 0xFFFF : 0,
-     (a2==b2) ? 0xFFFF : 0, (a3==b3) ? 0xFFFF : 0]                     int32_t
-     */
-    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmpeq_epi64(a, b); }
-
-    /*
-     * Compare packed 64-bits in a and b for greater-than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3]								   int32_t
-     [b0, b1, b2, b3] 								   int32_t
-     * Return : [(a0>b0) ? 0xFFFF : 0, (a1>b1) ? 0xFFFF : 0,
-     (a2>b2) ? 0xFFFF : 0, (a3>b3) ? 0xFFFF : 0]                     	int32_t
-     */
-    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi64(a, b); }
-
-    /*
-     * Compare packed 64-bits in a and b for lesser-than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3] int32_t
-     [b0, b1, b2, b3] int32_t
-     * Return : [(a0<b0) ? 0xFFFF : 0, (a1<b1) ? 0xFFFF : 0,
-     (a2<b2) ? 0xFFFF : 0, (a3<b3) ? 0xFFFF : 0] 					  int32_t
-     */
-    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi64(b, a); }
-
-    /*
-     * Compare packed 64-bits in a and b for greater or equal than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3]									 int32_t
-     [b0, b1, b2, b3] 									 int32_t
-     * Return : [(a0>=b0) ? 0xFFFF : 0, (a1>=b1) ? 0xFFFF : 0,
-     (a2>=b2) ? 0xFFFF : 0, (a3>=b3) ? 0xFFFF : 0,
-     (a4>=b4) ? 0xFFFF : 0, (a5>=b5) ? 0xFFFF : 0,
-     (a6>=b6) ? 0xFFFF : 0, (a7>=b7) ? 0xFFFF : 0]					  int32_t
-     */
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
-
-    /*
-     * Compare packed 64-bits in a and b for lesser or equal than, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] 					int32_t
-     [b0, b1, b2, b3, b4, b5, b6, b7] 					int32_t
-     * Return : [(a0<=b0) ? 0xFFFF : 0, (a1<=b1) ? 0xFFFF : 0,
-     (a2<=b2) ? 0xFFFF : 0, (a3<=b3) ? 0xFFFF : 0] 		int32_t
-     */
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
-
-    /*
-     * Compute the bitwise AND of packed 64-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3]
-     [b0, b1, b2, b3]
-     * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3]
-     */
-    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_si256(b, a); }
-
-    /*
-     * Compute the bitwise OR of packed 64-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3]
-     [b0, b1, b2, b3]
-     * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3]
-     */
-    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_si256(b, a); }
-
-    /*
-     * Compute the bitwise XOR of packed 64-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3]
-     [b0, b1, b2, b3]
-     * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3]
-     */
-    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_si256(b, a); }
-
-    /*
-     * Compute the bitwise AND NOT of packed 64-bits integer in a and b, and store the results in vect_t.
-     * Args   : [a0, a1, a2, a3]
-     [b0, b1, b2, b3]
-     * Return : [a0 ANDNOT b0, a1 ANDNOT b1, a2 ANDNOT b2, a3 ANDNOT b3]
-     */
-    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_si256(b, a); }
-
-    /*
-     * Horizontally add 64-bits elements of a.
-     * Args   : [a0, a1, a2, a3]
-     * Return : a0+a1+a2+a3
-     */
-    static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
-        Converter ca;
-        ca.v = a;
-        return ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3];
-    }
-
-    /*
-     *
-     * Args   : [a0, a1, a2, a3]    int64_t
-     [b0, b1, b2, b3]    int64_t
-     [c0, c1, c2, c3] 			 int64_t
-     * Return : [c0+a1*b1, c1+a3*b2, c2+a5*b5, c3+a7*b7] int64_t
-     */
-
-    static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
-
-    static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
-
-    static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
-
-    static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
-
-    static INLINE CONST vect_t round(const vect_t a) { return a; }
-
-    // mask the high 32 bits of a 64 bits, that is 00000000FFFFFFFF
-    static INLINE CONST vect_t mask_high() { return srl(_mm256_set1_epi8(-1), 32); }
-
-    static INLINE CONST vect_t signbits(const vect_t x) {
-        vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1));
-        return signBits;
-    }
-
-    // warning : may be off by 1 multiple, but we save a mul...
-    static INLINE CONST vect_t mulhi_fast(vect_t x, vect_t y) {
-        // unsigned mulhi starts:
-        // x1 = xy_high = mulhiu_fast(x,y)
-        const vect_t mask = mask_high();
-
-        vect_t x0 = vand(x, mask), x1 = srl(x, 32);
-        vect_t y0 = vand(y, mask), y1 = srl(y, 32);
-
-        x0 = mulux(x0, y1); // x0y1
-        y0 = mulux(x1, y0); // x1y0
-        y1 = mulux(x1, y1); // x1y1
-
-        x1 = vand(y0, mask);
-        y0 = srl(y0, 32); // x1y0_lo = x1 // y1yo_hi = y0
-        x1 = srl(add(x1, x0), 32);
-        y0 = add(y1, y0);
-
-        x1 = add(x1, y0);
-        // unsigned mulhi ends
-
-        // fixing signs
-        x0 = vand(signbits(x), y);
-        x1 = sub(x1, x0);
-        x0 = vand(signbits(y), x);
-        x1 = sub(x1, x0);
-        // end fixing
-        return x1;
-    }
-
-    template <bool overflow, bool poweroftwo>
-    static INLINE vect_t mod(vect_t &C, const vect_t &P, const int8_t &shifter, const vect_t &magic, const vect_t &NEGP,
-                             const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) {
-#ifdef __INTEL_COMPILER
-        // Works fine with ICC 15.0.1 - A.B.
-        C = _mm256_rem_epi64(C, P);
+	}
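+	/*
+	 * Editor's note (worked example, not part of upstream): without AVX-512 there is no packed
+	 * 64-bit arithmetic shift, so the fallback above sign-extends a logical shift: with
+	 * m = 1 << (63 - s), ((a >> s) ^ m) - m copies the shifted-in sign bit into the top s bits.
+	 * E.g. for a = -2, s = 1: a >> 1 (logical) = 0x7FFFFFFFFFFFFFFF, xor m = 0x4000000000000000
+	 * gives 0x3FFFFFFFFFFFFFFF, and subtracting m yields 0xFFFFFFFFFFFFFFFF = -1, the expected
+	 * arithmetic shift.
+	 */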
+
+	/*
+	* Shuffle 64-bit integers in a using the control in imm8, and store the results in dst.
+	* Args   : [a0, ..., a3] int64_t
+	* Return : [a[s[1..0]], a[s[3..2]], a[s[5..4]], a[s[7..6]]] int64_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t shuffle(const vect_t a) {
+		return _mm256_permute4x64_epi64(a, s);
+	}
+
+	/*
+	* Unpack and interleave 64-bit integers from the low half of a and b within 128-bit lanes, and store the results in dst.
+	* Args   : [a0, a1, a2, a3] int64_t
+			   [b0, b1, b2, b3] int64_t
+	* Return : [a0, b0, a2, b2] int64_t
+	*/
+	static INLINE CONST vect_t unpacklo_twice(const vect_t a, const vect_t b) { return _mm256_unpacklo_epi64(a, b); }
+
+	/*
+	* Unpack and interleave 64-bit integers from the high half of a and b within 128-bit lanes, and store the results in dst.
+	* Args   : [a0, a1, a2, a3] int64_t
+			   [b0, b1, b2, b3] int64_t
+	* Return : [a1, b1, a3, b3] int64_t
+	*/
+	static INLINE CONST vect_t unpackhi_twice(const vect_t a, const vect_t b) { return _mm256_unpackhi_epi64(a, b); }
+
+	/*
+	* Unpack and interleave 64-bit integers from the low half of a and b, and store the results in dst.
+	* Args   : [a0, a1, a2, a3] int64_t
+			   [b0, b1, b2, b3] int64_t
+	* Return : [a0, b0, a1, b1] int64_t
+	*/
+	static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) {
+		vect_t a1 = shuffle<0xD8>(a); // 0xD8 = 3120 base_4 so a -> [a0,a2,a1,a3]
+		vect_t b1 = shuffle<0xD8>(b); // 0xD8 = 3120 base_4
+		return unpacklo_twice(a1, b1);
+	}
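+	/*
+	 * Editor's note (worked example, not part of upstream): _mm256_unpacklo_epi64 interleaves
+	 * within each 128-bit lane only, so a full-width unpack needs the cross-lane permute first.
+	 * 0xD8 = 0b11011000 selects elements (0,2,1,3), hence a = [a0,a1,a2,a3] -> [a0,a2,a1,a3]
+	 * and likewise for b; the in-lane unpacklo of the permuted vectors then yields [a0,b0,a1,b1]
+	 * as documented.
+	 */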
+
+	/*
+	* Unpack and interleave 64-bit integers from the high half of a and b, and store the results in dst.
+	* Args   : [a0, a1, a2, a3] int64_t
+			   [b0, b1, b2, b3] int64_t
+	* Return : [a2, b2, a3, b3] int64_t
+	*/
+	static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) {
+		vect_t a1 = shuffle<0xD8>(a); // 0xD8 = 3120 base_4
+		vect_t b1 = shuffle<0xD8>(b); // 0xD8 = 3120 base_4
+		return unpackhi_twice(a1, b1);
+	}
+
+	/*
+	* Unpack and interleave 64-bit integers from the low then high half of a and b, and store the results in dst.
+	* Args   : [a0, a1, a2, a3] int64_t
+			   [b0, b1, b2, b3] int64_t
+	* Return : [a0, b0, a1, b1] int64_t
+	*		   [a2, b2, a3, b3] int64_t
+	*/
+	static INLINE CONST void unpacklohi(vect_t& l, vect_t& h, const vect_t a, const vect_t b) {
+		vect_t a1 = shuffle<0xD8>(a); // 0xD8 = 3120 base_4 so a -> [a0,a2,a1,a3]
+		vect_t b1 = shuffle<0xD8>(b); // 0xD8 = 3120 base_4
+		l = unpacklo_twice(a1, b1);
+		h = unpackhi_twice(a1, b1);
+	}
+
+	/*
+	* Blend packed 64-bit integers from a and b using control mask imm8, and store the results in dst.
+	* Args   : [a0, a1, a2, a3] int64_t
+			   [b0, b1, b2, b3] int64_t
+	* Return : [s[0]?b0:a0, ..., s[3]?b3:a3] int64_t
+	*/
+	template<uint8_t s>
+	static INLINE CONST vect_t blend(const vect_t a, const vect_t b) {
+		// _mm256_blend_epi32 takes an 8-bit mask over 32-bit lanes, so the 4-bit mask over 64-bit lanes must be expanded:
+		// s = [d3 d2 d1 d0]_base2 becomes s1 = [d3 d3 d2 d2 d1 d1 d0 d0]_base2
+		constexpr uint8_t s1 = (s & 0x1) * 3 + (((s & 0x2) << 1)*3)  + (((s & 0x4) << 2)*3) + (((s & 0x8) << 3)*3);
+		return _mm256_blend_epi32(a, b, s1);
+	}
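+	/*
+	 * Editor's note (worked example, not part of upstream): for s = 0b0101 (take b in 64-bit
+	 * lanes 0 and 2), the expansion gives s1 = 0b00110011, i.e. the same selection expressed
+	 * over the eight 32-bit lanes that _mm256_blend_epi32 operates on.
+	 */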
+
+	/*
+	 * Add packed 64-bits integer in a and b, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 * Return : [a0+b0, a1+b1, a2+b2, a3+b3]   int64_t
+	 */
+	static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_epi64(a, b); }
+
+	static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }
+
+	/*
+	 * Subtract packed 64-bits integers in b from packed 64-bits integers in a, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 * Return : [a0-b0, a1-b1, a2-b2, a3-b3]  int64_t
+	 */
+	static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_epi64(a, b); }
+
+	static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
+
+	/*
+	 * Multiply the packed 64-bits integers in a and b, producing intermediate 128-bit integers, and store the low 64
+	 bits of the intermediate integers in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	* Return : [a0*b0 smod 2^64, ..., a3*b3 smod 2^64]	int64_t
+	*	   where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2
+	 */
+	static INLINE CONST vect_t mullo(vect_t a, vect_t b) {
+#ifdef __FFLASFFPACK_HAVE_AVX512F_INSTRUCTIONS
+		return _mm256_mullo_epi64(a, b);
 #else
-        if (poweroftwo) {
-            Q = srl(C, 63);
-            vect_t un = set1(1);
-            T = sub(sll(un, shifter), un);
-            Q = add(C, vand(Q, T));
-            Q = sll(srl(Q, shifter), shifter);
-            C = sub(C, Q);
-            Q = vand(greater(zero(), Q), P);
-            C = add(C, Q);
-        } else {
-            Q = mulhi_fast(C, magic);
-            if (overflow) {
-                Q = add(Q, C);
-            }
-            Q = sra(Q, shifter);
-            vect_t q1 = mulux(Q, P);
-            vect_t q2 = sll(mulux(srl(Q, 32), P), 32);
-            C = sub(C, add(q1, q2));
-            T = greater_eq(C, P);
-            C = sub(C, vand(T, P));
-        }
+		//#pragma warning "The simd mullo function is emulated; it may impact performance."
+		Converter ca, cb;
+		ca.v = a;
+		cb.v = b;
+		return set(ca.t[0] * cb.t[0], ca.t[1] * cb.t[1], ca.t[2] * cb.t[2], ca.t[3] * cb.t[3]);
+#endif
+	}
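+	/*
+	 * Editor's note (not part of upstream): AVX2 has no packed 64-bit low multiply
+	 * (_mm256_mullo_epi64 itself belongs to the AVX-512 DQ/VL subsets), hence the scalar
+	 * fallback through Converter; each lane simply computes scalar_t(a[i] * b[i]), i.e. the
+	 * product reduced modulo 2^64.
+	 */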
+
+	static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); }
+
+	/*
+	 * Multiply the packed 64-bits integers in a and b, producing intermediate 128-bit integers, and store the high 64
+	 bits of the intermediate integers in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 * Return : [Floor(a0*b0/2^64), ..., Floor(a3*b3/2^64)] int64_t
+	 */
+#ifdef __FFLASFFPACK_HAVE_INT128
+	static INLINE CONST vect_t mulhi(vect_t a, vect_t b) {
+		//#pragma warning "The simd mulhi function is emulated; it may impact performance."
+		// ugly solution, but it works.
+		// tested with gcc, clang, icc
+		Converter ca, cb;
+		ca.v = a;
+		cb.v = b;
+		return set((scalar_t)((int128_t(ca.t[0]) * cb.t[0]) >> 64), (scalar_t)((int128_t(ca.t[1]) * cb.t[1]) >> 64),
+				(scalar_t)((int128_t(ca.t[2]) * cb.t[2]) >> 64), (scalar_t)((int128_t(ca.t[3]) * cb.t[3]) >> 64));
+	}
 #endif
-        NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
-        return C;
-    }
-
-#else
-
-#error "You need AVX2 instructions to perform 256bits operations on int64_t"
 
-#endif // defined(__FFLASFFPACK_USE_AVX2)
-};
+	/*
+	 * Multiply the low 32-bits integers from each packed 64-bit element in a and b, and store the signed 64-bit results
+	 in dst.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 * Return : [(a0 smod 2^32)*(b0 smod 2^32), ..., (a3 smod 2^32)*(b3 smod 2^32)]	int64_t
+	 *	   where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2
+	 */
+	static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { return _mm256_mul_epi32(a, b); }
+
+	/*
+	 * Multiply packed 64-bit integers in a and b, producing intermediate 128-bit integers, and add the low 64-bits of
+	 the intermediate with c, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 *	    [c0, c1, c2, c3]	int64_t
+	 * Return : [(a0*b0+c0) smod 2^64, ..., (a3*b3+c3) smod 2^64]	int64_t
+	 */
+	static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); }
+
+	static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); }
+
+	/*
+	 * Multiply the low 32-bit integers from each packed 64-bit element in a and b,
+	 * keep the signed 64-bit results and add the low 64-bits of c.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 *	    [c0, c1, c2, c3]	int64_t
+	 * Return :	[((a0 smod 2^32)*(b0 smod 2^32)+c0) smod 2^64, ...,
+	 *		 ((a3 smod 2^32)*(b3 smod 2^32)+c3) smod 2^64]	int64_t
+	 */
+	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
+
+	static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
+
+	/*
+	* Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers,
+	* and subtract the low 64 bits of the intermediate from elements of c.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 *	    [c0, c1, c2, c3]	int64_t
+	 * Return :	[(-a0*b0+c0) smod 2^64, ..., (-a3*b3+c3) smod 2^64]	int64_t
+	 */
+	static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); }
+
+	static INLINE vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); }
+
+	/*
+	 * Multiply the low 32-bit integers from each packed 64-bit element in a and b,
+	 * keep the signed 64-bit results and subtract them from elements of c.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 *	    [c0, c1, c2, c3]	int64_t
+	 * Return :	[(-(a0 smod 2^32)*(b0 smod 2^32)+c0) smod 2^64, ...,
+	 *		 (-(a3 smod 2^32)*(b3 smod 2^32)+c3) smod 2^64]	int64_t
+	 */
+	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
+
+	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
+
+	/*
+	 * Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers,
+	 * and subtract elements of c from the low 64 bits of the intermediate.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 *	    [c0, c1, c2, c3]	int64_t
+	 * Return :	[(a0*b0-c0) smod 2^64, ..., (a3*b3-c3) smod 2^64]	int64_t
+	 */
+	static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(mul(a, b), c); }
+
+	static INLINE vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); }
+
+	/*
+	 * Multiply the low 32-bit integers from each packed 64-bit element in a and b,
+	 * keep the signed 64-bit results and subtract elements of c from them.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 *	    [c0, c1, c2, c3]	int64_t
+	 * Return :	[((a0 smod 2^32)*(b0 smod 2^32)-c0) smod 2^64, ...,
+	 *		 ((a3 smod 2^32)*(b3 smod 2^32)-c3) smod 2^64]	int64_t
+	 */
+	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }
+
+	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }
+
+	/*
+	 * Compare packed 64-bits in a and b for equality, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 * Return : [(a0==b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a2==b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3==b3) ? 0xFFFFFFFFFFFFFFFF : 0]	int64_t
+	 */
+	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmpeq_epi64(a, b); }
+
+	/*
+	 * Compare packed 64-bits in a and b for greater-than, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 * Return : [(a0>b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>b1) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a2>b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3>b3) ? 0xFFFFFFFFFFFFFFFF : 0]	int64_t
+	 */
+	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) {
+		return _mm256_cmpgt_epi64(a, b);
+	}
+
+	/*
+	 * Compare packed 64-bits in a and b for lesser-than, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 * Return : [(a0<b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1<b1) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a2<b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3<b3) ? 0xFFFFFFFFFFFFFFFF : 0]	int64_t
+	 */
+	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi64(b, a); }
+
+	/*
+	 * Compare packed 64-bits in a and b for greater or equal than, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 * Return : [(a0>=b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a2>=b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3>=b3) ? 0xFFFFFFFFFFFFFFFF : 0]	int64_t
+	 */
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+
+	/*
+	 * Compare packed 64-bits in a and b for lesser or equal than, and store the results in vect_t.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 *	    [b0, b1, b2, b3]	int64_t
+	 * Return : [(a0<=b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1<=b1) ? 0xFFFFFFFFFFFFFFFF : 0,
+	 (a2<=b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3<=b3) ? 0xFFFFFFFFFFFFFFFF : 0]	int64_t
+	 */
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
+
+	/*
+	 * Horizontally add 64-bits elements of a.
+	 * Args   : [a0, a1, a2, a3]	int64_t
+	 * Return : a0+a1+a2+a3	int64_t
+	 */
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		Converter ca;
+		ca.v = a;
+		return scalar_t(ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3]);
+	}
+
+	static INLINE CONST vect_t round(const vect_t a) { return a; }
+
+	static INLINE CONST vect_t signbits(const vect_t x) {
+		vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1));
+		return signBits;
+	}
+
+	// mask that clears the high 32 bits of each 64-bit element, that is 0x00000000FFFFFFFF
+	static INLINE CONST vect_t mask_high() { return srl(_mm256_set1_epi8(-1), 32); }
+
+	static INLINE CONST vect_t mulhi_fast(vect_t x, vect_t y);
+
+	template <bool overflow, bool poweroftwo>
+	static INLINE vect_t mod(vect_t &C, const vect_t &P, const int8_t &shifter, const vect_t &magic, const vect_t &NEGP,
+							 const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T);
+}; // Simd256_impl<true, true, true, 8>
 
-// uint64_t
+/*
+ * Simd256 specialized for uint64_t
+ */
 template <> struct Simd256_impl<true, true, false, 8> : public Simd256_impl<true, true, true, 8> {
-    using scalar_t = uint64_t;
-
-#if defined(__FFLASFFPACK_USE_AVX2)
-
-    static INLINE CONST vect_t greater(vect_t a, vect_t b) {
-
-        vect_t x;
-        x = set1(-(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
-        a = sub(x, a);
-        b = sub(x, b);
-        return _mm256_cmpgt_epi64(a, b);
-    }
-
-    static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
-        vect_t x;
-        x = set1(-(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
-        a = sub(x, a);
-        b = sub(x, b);
-        return _mm256_cmpgt_epi64(a, b);
-    }
 
-    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
-
-    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
+	/*
+	* define the scalar type corresponding to the specialization
+	*/
+	using scalar_t = uint64_t;
+
+	/*
+	 * Simd128 for scalar_t, to deal with half_t
+	 */
+	using simdHalf = Simd128<scalar_t>;
+
+	/*
+	* Converter from vect_t to an array.
+	* example:
+	*	Converter conv;
+	*	conv.v = a;
+	*	scalar_t x = conv.t[1];
+	*/
+	union Converter {
+		vect_t v;
+		scalar_t t[vect_size];
+	};
+
+	/*
+	 *  Broadcast 64-bit unsigned integer x to all elements of dst. This intrinsic may generate the vpbroadcastq instruction.
+	 *  Return [x,x,x,x] uint64_t
+	 */
+	static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi64x(x); }
+
+	/*
+	 *  Set packed 64-bit unsigned integers in dst with the supplied values.
+	 *  Return [x0,x1,x2,x3] uint64_t
+	 */
+	static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3) {
+		return _mm256_set_epi64x(x3, x2, x1, x0);
+	}
+
+	/*
+	 *  Gather 64-bit unsigned integer elements with indexes idx[0], ..., idx[3] from the address p in vect_t.
+	 *  Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] uint64_t
+	 */
+	template <class T> static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) {
+		return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]);
+	}
+
+	/*
+	 * Load 256-bits of unsigned integer data from memory into dst.
+	 * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	 * Return [p[0],p[1],p[2],p[3]] uint64_t
+	 */
+	static INLINE PURE vect_t load(const scalar_t *const p) {
+		return _mm256_load_si256(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	 * Load 256-bits of unsigned integer data from memory into dst.
+	 * p does not need to be aligned on any particular boundary.
+	 * Return [p[0],p[1],p[2],p[3]] uint64_t
+	 */
+	static INLINE PURE vect_t loadu(const scalar_t *const p) {
+		return _mm256_loadu_si256(reinterpret_cast<const vect_t *>(p));
+	}
+
+	/*
+	 * Store 256-bits of unsigned integer data from a into memory.
+	 * p must be aligned on a 32-byte boundary or a general-protection exception will be generated.
+	 */
+	static INLINE void store(scalar_t *p, vect_t v) {
+		_mm256_store_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	 * Store 256-bits of unsigned integer data from a into memory.
+	 * p does not need to be aligned on any particular boundary.
+	 */
+	static INLINE void storeu(scalar_t *p, vect_t v) {
+		_mm256_storeu_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	 * Store 256-bits of unsigned integer data from a into memory using a non-temporal memory hint.
+	 * p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+	 */
+	static INLINE void stream(scalar_t *p, const vect_t v) {
+		_mm256_stream_si256(reinterpret_cast<vect_t *>(p), v);
+	}
+
+	/*
+	* Shift packed 64-bit unsigned integers in a right by s while shifting in zeros (for unsigned values this coincides with the arithmetic shift), and store the results in vect_t.
+	 * Args   : [a0, ..., a3]			uint64_t
+	 * Return : [Floor(a0/2^s), ..., Floor(a3/2^s)]	uint64_t
+	*/
+	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm256_srli_epi64(a, s); }
+
+	static INLINE CONST vect_t greater(vect_t a, vect_t b) {
+		vect_t x;
+		// bias both operands by 2^63 so that the signed compare implements the unsigned one
+		x = set1((static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
+		a = sub(a, x);
+		b = sub(b, x);
+		return _mm256_cmpgt_epi64(a, b);
+	}
+
+	static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
+		vect_t x;
+		// bias both operands by 2^63 so that the signed compare implements the unsigned one
+		x = set1((static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1)));
+		a = sub(a, x);
+		b = sub(b, x);
+		return _mm256_cmpgt_epi64(b, a);
+	}
+
+	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }
+
+	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }
+
+	/*
+	 * Multiply the packed 64-bits integers in a and b, producing intermediate 128-bit integers, and store the low 64
+	 bits of the intermediate integers in vect_t.
+	 * Args   : [a0, a1, a2, a3]	   							uint64_t
+	 [b0, b1, b2, b3]  		 							uint64_t
+	 * Return : [a0*b0 mod 2^64, a1*b1 mod 2^64, a2*b2 mod 2^64, a3*b3 mod 2^64]		uint64_t
+	 */
+	static INLINE CONST vect_t mullo(vect_t a, vect_t b) {
+		//#pragma warning "The simd mullo function is emulated; it may impact performance."
+		Converter ca, cb;
+		ca.v = a;
+		cb.v = b;
+		return set(ca.t[0] * cb.t[0], ca.t[1] * cb.t[1], ca.t[2] * cb.t[2], ca.t[3] * cb.t[3]);
+	}
+
+	/*
+	 * Multiply the packed 64-bits integers in a and b, producing intermediate 128-bit integers, and store the high 64
+	 bits of the intermediate integers in vect_t.
+	 * Args   : [a0, a1, a2, a3]	   							 uint64_t
+	 [b0, b1, b2, b3]  		 							 uint64_t
+	 * Return : [Floor(a0*b0/2^64), ..., Floor(a3*b3/2^64)]	uint64_t
+	 */
+#ifdef __FFLASFFPACK_HAVE_INT128
+	static INLINE CONST vect_t mulhi(vect_t a, vect_t b) {
+		//#pragma warning "The simd mulhi function is emulated; it may impact performance."
+		// ugly solution, but it works.
+		// tested with gcc, clang, icc
+		Converter c0, c1;
+		c0.v = a;
+		c1.v = b;
+		return set((scalar_t)(((uint128_t)(c0.t[0]) * c1.t[0]) >> 64), (scalar_t)(((uint128_t)(c0.t[1]) * c1.t[1]) >> 64),
+				(scalar_t)(((uint128_t)(c0.t[2]) * c1.t[2]) >> 64), (scalar_t)(((uint128_t)(c0.t[3]) * c1.t[3]) >> 64));
+	}
+#endif
 
+	/*
+	 * Multiply the low 32-bits integers from each packed 64-bit element in a and b, and store the unsigned 64-bit
+	 results in dst.
+	 * Args   : [a0, a1, a2, a3]	uint64_t
+	 [b0, b1, b2, b3]	uint64_t
+	 * Return : [a0*b0, a1*b1, a2*b2, a3*b3] uint64_t
+	 */
+	static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { return _mm256_mul_epu32(a, b); }
+
+	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); }
+
+	static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); }
+
+	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); }
+
+	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); }
+
+	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }
+
+	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }
+
+	/*
+	 * Horizontally add 64-bits elements of a.
+	 * Args   : [a0, a1, a2, a3]
+	 * Return : a0+a1+a2+a3
+	 */
+	static INLINE CONST scalar_t hadd_to_scal(const vect_t a) {
+		Converter ca;
+		ca.v = a;
+		return ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3];
+	}
+}; // Simd256_impl<true, true, false, 8>
+
+#define vect_t Simd256_impl<true, true, true, 8>::vect_t
+
+// warning : may be off by 1 multiple, but we save a mul...
+INLINE CONST vect_t Simd256_impl<true, true, true, 8>::mulhi_fast(vect_t x, vect_t y) {
+	// unsigned mulhi starts:
+	// x1 = xy_high = mulhiu_fast(x,y)
+	const vect_t mask = mask_high();
+
+	vect_t x0 = vand(x, mask), x1 = srl(x, 32);
+	vect_t y0 = vand(y, mask), y1 = srl(y, 32);
+
+	x0 = Simd256_impl<true, true, false, 8>::mulx(x0, y1); // x0y1
+	y0 = Simd256_impl<true, true, false, 8>::mulx(x1, y0); // x1y0
+	y1 = Simd256_impl<true, true, false, 8>::mulx(x1, y1); // x1y1
+
+	x1 = vand(y0, mask);
+	y0 = srl(y0, 32); // x1y0_lo = x1, x1y0_hi = y0
+	x1 = srl(add(x1, x0), 32);
+	y0 = add(y1, y0);
+
+	x1 = add(x1, y0);
+	// unsigned mulhi ends
+
+	// fixing signs
+	x0 = vand(signbits(x), y);
+	x1 = sub(x1, x0);
+	x0 = vand(signbits(y), x);
+	x1 = sub(x1, x0);
+	// end fixing
+	return x1;
+}
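+// Editor's note (illustrative, not part of upstream): mulhi_fast first forms the unsigned high part
+// with the schoolbook split x = 2^32*x1 + x0, y = 2^32*y1 + y0, so that
+//	floor(x*y / 2^64) = x1*y1 + floor((x1*y0 + x0*y1 + carries) / 2^32),
+// dropping the x0*y0 contribution (hence "may be off by 1 multiple").  It then converts to the
+// signed high part with the identity  mulhi_s(x, y) = mulhi_u(x, y) - (x < 0 ? y : 0) - (y < 0 ? x : 0),
+// which is what the two signbits/vand/sub corrections at the end implement.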
+
+template <bool overflow, bool poweroftwo>
+INLINE vect_t Simd256_impl<true, true, true, 8>::mod(vect_t &C, const vect_t &P, const int8_t &shifter, const vect_t &magic, const vect_t &NEGP,
+													 const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) {
+#ifdef __INTEL_COMPILER
+	// Works fine with ICC 15.0.1 - A.B.
+	C = _mm256_rem_epi64(C, P);
 #else
+	if (poweroftwo) {
+		Q = srl(C, 63);
+		vect_t un = set1(1);
+		T = sub(sll(un, shifter), un);
+		Q = add(C, vand(Q, T));
+		Q = sll(srl(Q, shifter), shifter);
+		C = sub(C, Q);
+		Q = vand(greater(zero(), Q), P);
+		C = add(C, Q);
+	} else {
+		Q = mulhi_fast(C, magic);
+		if (overflow) {
+			Q = add(Q, C);
+		}
+		Q = sra(Q, shifter);
+		vect_t q1 = Simd256_impl<true, true, false, 8>::mulx(Q, P);
+		vect_t q2 = sll(Simd256_impl<true, true, false, 8>::mulx(srl(Q, 32), P), 32);
+		C = sub(C, add(q1, q2));
+		T = greater_eq(C, P);
+		C = sub(C, vand(T, P));
+	}
+#endif
+	NORML_MOD(C, P, NEGP, MIN, MAX, Q, T);
+	return C;
+}
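+// Editor's note (illustrative, not part of upstream): the non-power-of-two branch above is a
+// Granlund-Montgomery / Barrett style reduction.  `magic` is presumably a precomputed approximation
+// of 2^(64+shifter)/P, so Q = sra(mulhi_fast(C, magic), shifter) approximates C/P (the `overflow`
+// flag adds C back when the constant does not fit in 64 bits).  C - Q*P is then assembled from two
+// unsigned 32x32->64 products -- which equals Q*P mod 2^64 whenever the high 32 bits of P are zero --
+// and one conditional subtraction plus NORML_MOD brings the remainder into the canonical range.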
 
-#error "You need AVX2 instructions to perform 256bits operations on uint64_t"
-
-#endif // defined(__FFLASFFPACK_USE_AVX2)
-};
+#undef vect_t
 
 #endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_int64_INL
diff --git a/fflas-ffpack/fflas/fflas_simd/simd_modular.inl b/fflas-ffpack/fflas/fflas_simd/simd_modular.inl
index 6daaa85..e494e78 100644
--- a/fflas-ffpack/fflas/fflas_simd/simd_modular.inl
+++ b/fflas-ffpack/fflas/fflas_simd/simd_modular.inl
@@ -1,5 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 the FFLAS-FFPACK group
  *
@@ -29,151 +29,151 @@
 // functions with _r are relaxed, meaning no modular reduction
 
 template <class _Field> class FieldSimd {
-  public:
-    using Field = _Field;
-    using Element = typename Field::Element;
-    using simd = Simd<typename _Field::Element>;
-    using vect_t = typename simd::vect_t;
-    using scalar_t = typename simd::scalar_t;
-
-    static const constexpr size_t vect_size = simd::vect_size;
-
-    static const constexpr size_t alignment = simd::alignment;
-
-  private:
-    using Self = FieldSimd<Field>;
-
-    const Field *_field;
-    vect_t _modulus;
-    vect_t _invmod;
-    vect_t _negmod;
-    vect_t _mask;
-    vect_t _min;
-    vect_t _max;
+public:
+	using Field = _Field;
+	using Element = typename Field::Element;
+	using simd = Simd<typename _Field::Element>;
+	using vect_t = typename simd::vect_t;
+	using scalar_t = typename simd::scalar_t;
+
+	static const constexpr size_t vect_size = simd::vect_size;
+
+	static const constexpr size_t alignment = simd::alignment;
+
+private:
+	using Self = FieldSimd<Field>;
+
+	const Field *_field;
+	vect_t _modulus;
+	vect_t _invmod;
+	vect_t _negmod;
+	vect_t _mask;
+	vect_t _min;
+	vect_t _max;
 
-  public:
-    FieldSimd(const Field &f) : _field(&f) { init(); }
+public:
+	FieldSimd(const Field &f) : _field(&f) { init(); }
 
-  private:
-    void init() {
-        _modulus = simd::set1((Element)_field->characteristic());
-        _min = simd::set1(_field->minElement());
-        _max = simd::set1(_field->maxElement());
-        _negmod = simd::set1(-(Element)_field->characteristic());
-        if (std::is_floating_point<Element>::value) {
-            _invmod = simd::set1(1 / ((Element)_field->characteristic()));
-        }
-    }
+private:
+	void init() {
+		_modulus = simd::set1((Element)_field->characteristic());
+		_min = simd::set1(_field->minElement());
+		_max = simd::set1(_field->maxElement());
+		_negmod = simd::set1(-(Element)_field->characteristic());
+		if (std::is_floating_point<Element>::value) {
+			_invmod = simd::set1(1 / ((Element)_field->characteristic()));
+		}
+	}
 
-  public:
-    FieldSimd(const Self &) = default;
-    FieldSimd(Self &&) = default;
+public:
+	FieldSimd(const Self &) = default;
+	FieldSimd(Self &&) = default;
 
-    Self &operator=(const Self &) = default;
-    Self &operator=(Self &&) = default;
+	Self &operator=(const Self &) = default;
+	Self &operator=(Self &&) = default;
 
-    INLINE vect_t init(vect_t &x, const vect_t a) const { return x = mod(a); }
+	INLINE vect_t init(vect_t &x, const vect_t a) const { return x = mod(a); }
 
-    INLINE vect_t init(const vect_t a) const { return mod(a); }
+	INLINE vect_t init(const vect_t a) const { return mod(a); }
 
-    INLINE vect_t add(vect_t &c, const vect_t a, const vect_t b) const {
-        c = simd::add(a, b);
-        _mask = simd::greater(c, _max);
-        _mask = simd::vand(_mask, _modulus);
-        return c = simd::sub(c, _mask);
-    }
+	INLINE vect_t add(vect_t &c, const vect_t a, const vect_t b) {
+		c = simd::add(a, b);
+		_mask = simd::greater(c, _max);
+		_mask = simd::vand(_mask, _modulus);
+		return c = simd::sub(c, _mask);
+	}
 
-    INLINE vect_t add(const vect_t a, const vect_t b) const {
-        vect_t c;
-        c = simd::add(a, b);
-        _mask = simd::greater(c, _max);
-        _mask = simd::vand(_mask, _modulus);
-        return c = simd::sub(c, _mask);
-    }
+	INLINE vect_t add(const vect_t a, const vect_t b) {
+		vect_t c;
+		c = simd::add(a, b);
+		_mask = simd::greater(c, _max);
+		_mask = simd::vand(_mask, _modulus);
+		return c = simd::sub(c, _mask);
+	}
 
-    INLINE vect_t addin(vect_t &a, const vect_t b) const { return a = add(a, b); }
+	INLINE vect_t addin(vect_t &a, const vect_t b) const { return a = add(a, b); }
 
-    INLINE vect_t add_r(vect_t &c, const vect_t a, const vect_t b) const { return c = simd::add(a, b); }
+	INLINE vect_t add_r(vect_t &c, const vect_t a, const vect_t b) const { return c = simd::add(a, b); }
 
-    INLINE vect_t add_r(const vect_t a, const vect_t b) const { return simd::add(a, b); }
+	INLINE vect_t add_r(const vect_t a, const vect_t b) const { return simd::add(a, b); }
 
-    INLINE vect_t addin_r(vect_t &a, const vect_t b) const { return a = add_r(a, b); }
+	INLINE vect_t addin_r(vect_t &a, const vect_t b) const { return a = add_r(a, b); }
 
-    INLINE vect_t sub(vect_t &c, const vect_t a, const vect_t b) const {
-        c = simd::sub(a, b);
-        _mask = simd::lesser(c, _min);
-        _mask = simd::vand(_mask, _modulus);
-        return c = simd::add(c, _mask);
-    }
+	INLINE vect_t sub(vect_t &c, const vect_t a, const vect_t b) {
+		c = simd::sub(a, b);
+		_mask = simd::lesser(c, _min);
+		_mask = simd::vand(_mask, _modulus);
+		return c = simd::add(c, _mask);
+	}
 
-    INLINE vect_t sub(const vect_t a, const vect_t b) const {
-        vect_t c;
-        c = simd::sub(a, b);
-        _mask = simd::greater(c, _max);
-        _mask = simd::vand(_mask, _modulus);
-        return c = simd::add(c, _mask);
-    }
+	INLINE vect_t sub(const vect_t a, const vect_t b) {
+		vect_t c;
+		c = simd::sub(a, b);
+		_mask = simd::lesser(c, _min);
+		_mask = simd::vand(_mask, _modulus);
+		return c = simd::add(c, _mask);
+	}
 
-    INLINE vect_t subin(vect_t &a, const vect_t b) const { return a = sub(a, b); }
+	INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }
 
-    INLINE vect_t sub_r(vect_t &c, const vect_t a, const vect_t b) const { return c = simd::sub(a, b); }
+	INLINE vect_t sub_r(vect_t &c, const vect_t a, const vect_t b) const { return c = simd::sub(a, b); }
 
-    INLINE vect_t sub_r(const vect_t a, const vect_t b) const { return simd::sub(a, b); }
+	INLINE vect_t sub_r(const vect_t a, const vect_t b) const { return simd::sub(a, b); }
 
-    INLINE vect_t subin_r(vect_t &a, const vect_t b) const { return a = sub_r(a, b); }
+	INLINE vect_t subin_r(vect_t &a, const vect_t b) const { return a = sub_r(a, b); }
 
-    INLINE vect_t zero(vect_t &x) const { return x = simd::zero(); }
+	INLINE vect_t zero(vect_t &x) const { return x = simd::zero(); }
 
-    INLINE vect_t zero() const { return simd::zero(); }
+	INLINE vect_t zero() const { return simd::zero(); }
 
-    INLINE vect_t mod(vect_t &c) const {
-        if (std::is_floating_point<Element>::value) {
-            vect_t q, t;
-            q = simd::mul(c, _invmod);
-            q = simd::floor(q);
-            c = simd::fnmadd(c, q, _modulus);
-            q = simd::greater(c, _max);
-            t = simd::lesser(c, _min);
-            q = simd::vand(q, _negmod);
-            t = simd::vand(t, _modulus);
-            q = simd::vor(q, t);
-            return c = simd::add(c, q);
-        } else {
-            FFLASFFPACK_abort("pas implementé");
-        }
-    }
+	INLINE vect_t mod(vect_t &c) const {
+		if (std::is_floating_point<Element>::value) {
+			vect_t q, t;
+			q = simd::mul(c, _invmod);
+			q = simd::floor(q);
+			c = simd::fnmadd(c, q, _modulus);
+			q = simd::greater(c, _max);
+			t = simd::lesser(c, _min);
+			q = simd::vand(q, _negmod);
+			t = simd::vand(t, _modulus);
+			q = simd::vor(q, t);
+			return c = simd::add(c, q);
+		} else {
+			FFLASFFPACK_abort("not implemented");
+		}
+	}
 
-    INLINE vect_t mul(vect_t &c, const vect_t a, const vect_t b) const { return c = mod(simd::mul(a, b)); }
+	INLINE vect_t mul(vect_t &c, const vect_t a, const vect_t b) const { return c = mod(simd::mul(a, b)); }
 
-    INLINE vect_t mul(const vect_t a, const vect_t b) const { return mod(simd::mul(a, b)); }
+	INLINE vect_t mul(const vect_t a, const vect_t b) const { return mod(simd::mul(a, b)); }
 
-    INLINE vect_t mulin(vect_t &a, const vect_t b) const { return mul(a, a, b); }
+	INLINE vect_t mulin(vect_t &a, const vect_t b) const { return mul(a, a, b); }
 
-    INLINE vect_t mul_r(vect_t &c, const vect_t a, const vect_t b) const { return c = simd::mul(a, b); }
+	INLINE vect_t mul_r(vect_t &c, const vect_t a, const vect_t b) const { return c = simd::mul(a, b); }
 
-    INLINE vect_t mul_r(const vect_t a, const vect_t b) const { return simd::mul(a, b); }
+	INLINE vect_t mul_r(const vect_t a, const vect_t b) const { return simd::mul(a, b); }
 
-    INLINE vect_t axpy(vect_t &r, const vect_t a, const vect_t b, const vect_t c) const {
-        return r = mod(simd::fmadd(c, a, b));
-    }
+	INLINE vect_t axpy(vect_t &r, const vect_t a, const vect_t b, const vect_t c) const {
+		return r = mod(simd::fmadd(c, a, b));
+	}
 
-    INLINE vect_t axpy(const vect_t c, const vect_t a, const vect_t b) const { return mod(simd::fmadd(c, a, b)); }
+	INLINE vect_t axpy(const vect_t c, const vect_t a, const vect_t b) const { return mod(simd::fmadd(c, a, b)); }
 
-    INLINE vect_t axpyin(vect_t &c, const vect_t a, const vect_t b) const { return c = axpy(c, a, b); }
+	INLINE vect_t axpyin(vect_t &c, const vect_t a, const vect_t b) const { return c = axpy(c, a, b); }
 
-    INLINE vect_t axpy_r(vect_t &r, const vect_t a, const vect_t b, const vect_t c) const {
-        return r = simd::fmadd(c, a, b);
-    }
+	INLINE vect_t axpy_r(vect_t &r, const vect_t a, const vect_t b, const vect_t c) const {
+		return r = simd::fmadd(c, a, b);
+	}
 
-    INLINE vect_t axpy_r(const vect_t c, const vect_t a, const vect_t b) const { return simd::fmadd(c, a, b); }
+	INLINE vect_t axpy_r(const vect_t c, const vect_t a, const vect_t b) const { return simd::fmadd(c, a, b); }
 
-    INLINE vect_t axpyin_r(vect_t &c, const vect_t a, const vect_t b) const { return c = axpy_r(c, a, b); }
+	INLINE vect_t axpyin_r(vect_t &c, const vect_t a, const vect_t b) const { return c = axpy_r(c, a, b); }
 
-    INLINE vect_t maxpy(vect_t &r, const vect_t a, const vect_t b, const vect_t c) const {
-        return r = mod(simd::fmsub(c, a, b));
-    }
+	INLINE vect_t maxpy(vect_t &r, const vect_t a, const vect_t b, const vect_t c) const {
+		return r = mod(simd::fmsub(c, a, b));
+	}
 
-    INLINE vect_t maxpy(const vect_t c, const vect_t a, const vect_t b) const { return mod(simd::fmsub(c, a, b)); }
+	INLINE vect_t maxpy(const vect_t c, const vect_t a, const vect_t b) const { return mod(simd::fmsub(c, a, b)); }
 
-    INLINE vect_t maxpyin(vect_t &c, const vect_t a, const vect_t b) const { return c = maxpy(c, a, b); }
+	INLINE vect_t maxpyin(vect_t &c, const vect_t a, const vect_t b) const { return c = maxpy(c, a, b); }
 };
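The reduction strategy used by this wrapper is easier to read in scalar form. The following is a minimal, self-contained sketch in plain C++ (not the FieldSimd/Simd API; all names are illustrative) of the same two ideas: addition followed by one conditional subtraction of the modulus, and the floating-point mod() that multiplies by a precomputed 1/p, floors, and then applies a single overflow/underflow correction.

    // Scalar illustration of the reduction steps performed vector-wise by FieldSimd.
    // Hypothetical standalone sketch; p, invp, min, max mirror _modulus, _invmod, _min, _max.
    #include <cmath>
    #include <cstdio>

    struct ScalarModP {
        double p, invp, min, max;              // e.g. min = 0, max = p - 1
        explicit ScalarModP(double p_) : p(p_), invp(1.0 / p_), min(0.0), max(p_ - 1.0) {}

        // add: c = a + b, then subtract p once if the result left [min, max]
        double add(double a, double b) const {
            double c = a + b;
            if (c > max) c -= p;               // same role as the greater/vand/sub mask sequence
            return c;
        }

        // mod: q = floor(c * 1/p), c -= q * p, then one correction step
        double mod(double c) const {
            double q = std::floor(c * invp);
            c -= q * p;                        // fnmadd(c, q, modulus) in the vector code
            if (c > max) c -= p;               // overflow correction  (mask with -p)
            else if (c < min) c += p;          // underflow correction (mask with +p)
            return c;
        }
    };

    int main() {
        ScalarModP F(101.0);
        std::printf("%g %g\n", F.add(100.0, 5.0), F.mod(12345.0)); // prints: 4 23
        return 0;
    }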
diff --git a/fflas-ffpack/fflas/fflas_sparse.h b/fflas-ffpack/fflas/fflas_sparse.h
index 6425b75..fba297c 100644
--- a/fflas-ffpack/fflas/fflas_sparse.h
+++ b/fflas-ffpack/fflas/fflas_sparse.h
@@ -82,7 +82,7 @@
 #include "fflas-ffpack/utils/fflas_memory.h"
 #include "fflas-ffpack/paladin/parallel.h"
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 #include "fflas-ffpack/fflas/fflas_simd.h"
 #endif
 
diff --git a/fflas-ffpack/fflas/fflas_sparse.inl b/fflas-ffpack/fflas/fflas_sparse.inl
index 1848afd..e599973 100644
--- a/fflas-ffpack/fflas/fflas_sparse.inl
+++ b/fflas-ffpack/fflas/fflas_sparse.inl
@@ -112,7 +112,7 @@ namespace FFLAS {
 		>::type
 		fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y,
 		      FieldCategories::UnparametricTag, NotZOSparseMatrix) {
-			// #ifdef __FFLASFFPACK_USE_SIMD
+			// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 			sparse_details_impl::fspmv_simd(F, A, x, y, FieldCategories::UnparametricTag());
 			// #else
 			//     sparse_details_impl::fspmv(F, A, x, y, FieldCategories::UnparametricTag());
@@ -137,7 +137,7 @@ namespace FFLAS {
 		>::type
 		fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y,
 		      FieldCategories::ModularTag, NotZOSparseMatrix) {
-			// #ifdef __FFLASFFPACK_USE_SIMD
+			// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 			if (A.delayed) {
 				sparse_details::fspmv(F, A, x, y, FieldCategories::UnparametricTag(), std::false_type());
 				freduce(F, A.m, y, 1);
@@ -193,7 +193,7 @@ namespace FFLAS {
 		>::type
 		fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y,
 		      FieldCategories::UnparametricTag, ZOSparseMatrix) {
-			// #ifdef __FFLASFFPACK_USE_SIMD
+			// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 			if (A.cst == 1) {
 				sparse_details_impl::fspmv_one_simd(F, A, x, y, FieldCategories::UnparametricTag());
 			} else if (A.cst == -1) {
@@ -417,7 +417,7 @@ namespace FFLAS {
 			sparse_details_impl::pfspmm(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag());
 		}
 
-#if defined(__FFLASFFPACK_USE_SIMD)
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
 
 		template <class Field, class SM>
 		inline typename std::enable_if<support_simd<typename Field::Element>::value>::type
@@ -468,7 +468,7 @@ namespace FFLAS {
 			}
 		}
 
-#endif // __FFLASFFPACK_USE_SIMD
+#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 		// ZO matrix
 		template <class Field, class SM>
@@ -487,7 +487,7 @@ namespace FFLAS {
 			}
 		}
 
-#if defined(__FFLASFFPACK_USE_SIMD)
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
 
 		template <class Field, class SM>
 		inline typename std::enable_if<support_simd<typename Field::Element>::value>::type
@@ -549,7 +549,7 @@ namespace FFLAS {
 			freduce(F, blockSize, A.m, y, ldy);
 		}
 
-#endif // __FFLASFFPACK_USE_SIMD
+#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 		// /***************************** pfspmv ******************************/
 
@@ -611,7 +611,7 @@ namespace FFLAS {
 		// template <class Field>
 		// inline void pfspmv(const Field &F, const Sparse<Field, SparseMatrix_t::SELL> &A, typename Field::ConstElement_ptr x,
 		//                    typename Field::Element_ptr y, FieldCategories::UnparametricTag, std::true_type) {
-		// #ifdef __FFLASFFPACK_USE_SIMD
+		// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 		//     if (A.cst == 1) {
 		//         sparse_details_impl::pfspmv_one_simd(F, A, x, y, FieldCategories::UnparametricTag());
 		//     } else if (A.cst == -1) {
@@ -640,7 +640,7 @@ namespace FFLAS {
 		// inline void pfspmv(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_simd> &A, typename Field::ConstElement_ptr
 		// x,
 		//                    typename Field::Element_ptr y, FieldCategories::UnparametricTag, std::true_type) {
-		// #ifdef __FFLASFFPACK_USE_SIMD
+		// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 		//     if (A.cst == 1) {
 		//         sparse_details_impl::pfspmv_one_simd(F, A, x, y, FieldCategories::UnparametricTag());
 		//     } else if (A.cst == -1) {
@@ -713,7 +713,7 @@ namespace FFLAS {
 		// inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx,
 		//                   typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, std::false_type) {
 		// // std::cout << "no ZO Unparametric" << std::endl;
-		// #ifdef __FFLASFFPACK_USE_SIMD
+		// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 		//     using simd = Simd<typename Field::Element>;
 		//     if (((uint64_t)y % simd::alignment == 0) && ((uint64_t)x % simd::alignment == 0) &&
 		//         (blockSize % simd::vect_size == 0)) {
@@ -738,7 +738,7 @@ namespace FFLAS {
 		//                               typename std::false_type());
 		//         freduce(F, A.m, blockSize, y, ldy);
 		//     } else {
-		// #ifdef __FFLASFFPACK_USE_SIMD
+		// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 		//         using simd = Simd<typename Field::Element>;
 		//         if (((uint64_t)y % simd::alignment == 0) && ((uint64_t)x % simd::alignment == 0) &&
 		//             (blockSize % simd::vect_size == 0)) {
@@ -773,7 +773,7 @@ namespace FFLAS {
 		// inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx,
 		//                   typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, std::true_type) {
 		// // std::cout << "ZO Unparametric" << std::endl;
-		// #ifdef __FFLASFFPACK_USE_SIMD
+		// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 		//     using simd = Simd<typename Field::Element>;
 		//     if (F.isOne(A.cst)) {
 		//         if (((uint64_t)y % simd::alignment == 0) && ((uint64_t)x % simd::alignment == 0) &&
diff --git a/fflas-ffpack/fflas/fflas_sparse/coo/coo_spmm.inl b/fflas-ffpack/fflas/fflas_sparse/coo/coo_spmm.inl
index 5da2769..43a20a4 100644
--- a/fflas-ffpack/fflas/fflas_sparse/coo/coo_spmm.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/coo/coo_spmm.inl
@@ -118,7 +118,7 @@ inline void fspmm_mkl(const Givaro::FloatDomain &F, const Sparse<Givaro::FloatDo
 #endif // __FFLASFFPACK_HAVE_MKL
 
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmm_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::COO> &A, size_t blockSize,
@@ -235,7 +235,7 @@ inline void fspmm_mone(const Field &F, const Sparse<Field, SparseMatrix_t::COO_Z
     }
 }
 
-// #ifdef __FFLASFFPACK_USE_SIMD
+// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmm_one_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::COO_ZO> &A, size_t blockSize,
@@ -329,7 +329,7 @@ inline void fspmm_mone_simd_unaligned(const Field &F, const Sparse<Field, Sparse
     }
 }
 
-// #endif /*  __FFLASFFPACK_USE_SIMD */
+// #endif /*  __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS */
 
 } // coo_details
 
diff --git a/fflas-ffpack/fflas/fflas_sparse/csr/csr_pspmm.inl b/fflas-ffpack/fflas/fflas_sparse/csr/csr_pspmm.inl
index 468b16e..80ee575 100644
--- a/fflas-ffpack/fflas/fflas_sparse/csr/csr_pspmm.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/csr/csr_pspmm.inl
@@ -111,7 +111,7 @@ inline void pfspmm(const Field &F, const Sparse<Field, SparseMatrix_t::CSR> &A,
     */
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void pfspmm_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::CSR> &A, size_t blockSize,
@@ -310,7 +310,7 @@ inline void pfspmm(const Field &F, const Sparse<Field, SparseMatrix_t::CSR> &A,
     }
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void pfspmm_simd_unaligned(const Field &F, const Sparse<Field, SparseMatrix_t::CSR> &A, size_t blockSize,
@@ -645,7 +645,7 @@ inline void pfspmm_mone(const Field &F, const Sparse<Field, SparseMatrix_t::CSR_
     */
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void pfspmm_one_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::CSR_ZO> &A, size_t blockSize,
@@ -930,7 +930,7 @@ inline void pfspmm_mone_simd_unaligned(const Field &F, const Sparse<Field, Spars
     */
 }
 
-#endif //__FFLASFFPACK_USE_SIMD
+#endif //__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 } // CSR_details
 
diff --git a/fflas-ffpack/fflas/fflas_sparse/csr/csr_spmm.inl b/fflas-ffpack/fflas/fflas_sparse/csr/csr_spmm.inl
index 467f265..9e16657 100644
--- a/fflas-ffpack/fflas/fflas_sparse/csr/csr_spmm.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/csr/csr_spmm.inl
@@ -123,7 +123,7 @@ inline void fspmm_mkl(const Givaro::FloatDomain &F, const Sparse<Givaro::FloatDo
 
 
 
-// #ifdef __FFLASFFPACK_USE_SIMD
+// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmm_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::CSR> &A, size_t blockSize,
@@ -247,7 +247,7 @@ inline void fspmm(const Field &F, const Sparse<Field, SparseMatrix_t::CSR> &A, s
     }
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmm_simd_unaligned(const Field &F, const Sparse<Field, SparseMatrix_t::CSR> &A, size_t blockSize,
@@ -460,7 +460,7 @@ inline void fspmm_mone(const Field &F, const Sparse<Field, SparseMatrix_t::CSR_Z
     }
 }
 
-// #ifdef __FFLASFFPACK_USE_SIMD
+// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmm_one_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::CSR_ZO> &A, size_t blockSize,
@@ -602,7 +602,7 @@ inline void fspmm_mone_simd_unaligned(const Field &F, const Sparse<Field, Sparse
     }
 }
 
-// #endif //__FFLASFFPACK_USE_SIMD
+// #endif //__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 } // CSR_details
 
diff --git a/fflas-ffpack/fflas/fflas_sparse/csr/csr_utils.inl b/fflas-ffpack/fflas/fflas_sparse/csr/csr_utils.inl
old mode 100755
new mode 100644
diff --git a/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_pspmm.inl b/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_pspmm.inl
index a7e9062..f9bb5c5 100644
--- a/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_pspmm.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_pspmm.inl
@@ -394,7 +394,7 @@ inline void pfspmm(const Field &F, const Sparse<Field, SparseMatrix_t::CSR_HYB>
 #endif
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 template <class Field, class LFunc, class SFunc>
 inline void pfspmm(const Field &F, const Sparse<Field, SparseMatrix_t::CSR_HYB> &A, size_t blockSize,
                    typename Field::ConstElement_ptr x, typename Field::Element_ptr y, LFunc &&lfunc, SFunc &&sfunc,
diff --git a/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_spmm.inl b/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_spmm.inl
index bcb127f..e64d2e8 100644
--- a/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_spmm.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_spmm.inl
@@ -132,7 +132,7 @@ inline void fspmm(const Field &F, const Sparse<Field, SparseMatrix_t::CSR_HYB> &
     }
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmm_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::CSR_HYB> &A, size_t blockSize,
@@ -292,7 +292,7 @@ inline void fspmm(const Field &F, const Sparse<Field, SparseMatrix_t::CSR_HYB> &
     // TODO
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmm_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::CSR_HYB> &A, size_t blockSize,
diff --git a/fflas-ffpack/fflas/fflas_sparse/ell/ell_pspmm.inl b/fflas-ffpack/fflas/fflas_sparse/ell/ell_pspmm.inl
index f7c42fd..839158a 100644
--- a/fflas-ffpack/fflas/fflas_sparse/ell/ell_pspmm.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/ell/ell_pspmm.inl
@@ -192,7 +192,7 @@ inline void pfspmm(const Field &F, const Sparse<Field, SparseMatrix_t::ELL> &A,
 #endif
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field, class LFunc, class SFunc>
 inline void pfspmm(const Field &F, const Sparse<Field, SparseMatrix_t::ELL> &A, size_t blockSize,
@@ -571,7 +571,7 @@ inline void pfspmm_zo(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_ZO
 #endif
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field, class LFunc, class SFunc, class VectFunc, class ScalFunc>
 inline void pfspmm_zo(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_ZO> &A, size_t blockSize,
diff --git a/fflas-ffpack/fflas/fflas_sparse/ell/ell_spmm.inl b/fflas-ffpack/fflas/fflas_sparse/ell/ell_spmm.inl
index 773120a..cc79486 100644
--- a/fflas-ffpack/fflas/fflas_sparse/ell/ell_spmm.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/ell/ell_spmm.inl
@@ -79,7 +79,7 @@ inline void fspmm(const Field &F, const Sparse<Field, SparseMatrix_t::ELL> &A, s
     }
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmm_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::ELL> &A, size_t blockSize,
@@ -202,7 +202,7 @@ inline void fspmm(const Field &F, const Sparse<Field, SparseMatrix_t::ELL> &A, s
     }
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmm_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::ELL> &A, size_t blockSize,
@@ -428,7 +428,7 @@ inline void fspmm_one(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_ZO
     }
 }
 
-// #ifdef __FFLASFFPACK_USE_SIMD
+// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmm_one_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_ZO> &A, size_t blockSize,
@@ -558,7 +558,7 @@ inline void fspmm_mone_simd_unaligned(const Field &F, const Sparse<Field, Sparse
     }
 }
 
-// #endif /*  __FFLASFFPACK_USE_SIMD */
+// #endif /*  __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS */
 
 } // ell_details
 
diff --git a/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_pspmv.inl b/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_pspmv.inl
index 5b7aee6..8422a57 100644
--- a/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_pspmv.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_pspmv.inl
@@ -70,7 +70,7 @@ inline void pfspmv(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_simd>
 #endif
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 template <class Field>
 inline void pfspmv_simd(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_simd> &A,
                         typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_,
@@ -185,7 +185,7 @@ inline void pfspmv(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_simd>
 #endif // TBB
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 template <class Field>
 inline void pfspmv_simd(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_simd> &A,
                         typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const uint64_t kmax) {
@@ -415,7 +415,7 @@ inline void pfspmv_mone(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_
 #endif
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 template <class Field>
 inline void pfspmv_one_simd(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_simd_ZO> &A,
                             typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_,
diff --git a/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_spmv.inl b/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_spmv.inl
index 28e06b5..b627397 100644
--- a/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_spmv.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_spmv.inl
@@ -51,7 +51,7 @@ inline void fspmv(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_simd>
     }
 }
 
-// #ifdef __FFLASFFPACK_USE_SIMD
+// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmv_simd(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_simd> &A,
@@ -117,7 +117,7 @@ inline void fspmv(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_simd>
     }
 }
 
-// #ifdef __FFLASFFPACK_USE_SIMD
+// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 template <class Field>
 inline void fspmv_simd(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_simd> &A,
                        typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const uint64_t kmax) {
@@ -299,7 +299,7 @@ inline void fspmv_mone(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_s
     }
 }
 
-// #ifdef __FFLASFFPACK_USE_SIMD
+// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 template <class Field>
 inline void fspmv_one_simd(const Field &F, const Sparse<Field, SparseMatrix_t::ELL_simd_ZO> &A,
                            typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_,
diff --git a/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_utils.inl b/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_utils.inl
index b5325c5..50fa355 100644
--- a/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_utils.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_utils.inl
@@ -56,7 +56,7 @@ template <class Field, class IndexT>
 inline void sparse_init(const Field &F, Sparse<Field, SparseMatrix_t::ELL_simd> &A, const IndexT *row,
                         const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim,
                         uint64_t nnz) {
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
     using simd = Simd<typename Field::Element>;
     A.chunk = simd::vect_size;
 #else
@@ -110,7 +110,7 @@ template <class Field, class IndexT>
 inline void sparse_init(const Field &F, Sparse<Field, SparseMatrix_t::ELL_simd_ZO> &A, const IndexT *row,
                         const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim,
                         uint64_t nnz) {
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
     using simd = Simd<typename Field::Element>;
     A.chunk = simd::vect_size;
 #else
diff --git a/fflas-ffpack/fflas/fflas_sparse/hyb_zo.h b/fflas-ffpack/fflas/fflas_sparse/hyb_zo.h
old mode 100755
new mode 100644
diff --git a/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_pspmm.inl b/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_pspmm.inl
index 42bc4ab..8bd94c8 100644
--- a/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_pspmm.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_pspmm.inl
@@ -57,7 +57,7 @@ inline void pfspmm(const Field &F, const Sparse<Field, SparseMatrix_t::HYB_ZO> &
         sparse_details_impl::pfspmm(F, *(A.dat), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag());
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void pfspmm_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::HYB_ZO> &A, size_t blockSize,
@@ -102,7 +102,7 @@ inline void pfspmm(const Field &F, const Sparse<Field, SparseMatrix_t::HYB_ZO> &
         sparse_details_impl::pfspmm(F, *(A.dat), blockSize, x, ldx, y, ldy, kmax);
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void pfspmm_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::HYB_ZO> &A, size_t blockSize,
diff --git a/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_spmm.inl b/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_spmm.inl
index 0c0585f..fc1a952 100644
--- a/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_spmm.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_spmm.inl
@@ -56,7 +56,7 @@ inline void fspmm(const Field &F, const Sparse<Field, SparseMatrix_t::HYB_ZO> &A
         sparse_details_impl::fspmm(F, *(A.dat), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag());
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmm_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::HYB_ZO> &A, size_t blockSize,
@@ -101,7 +101,7 @@ inline void fspmm(const Field &F, const Sparse<Field, SparseMatrix_t::HYB_ZO> &A
         sparse_details_impl::fspmm(F, *(A.dat), blockSize, x, ldx, y, ldy, kmax);
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmm_simd_aligned(const Field &F, const Sparse<Field, SparseMatrix_t::HYB_ZO> &A, size_t blockSize,
diff --git a/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_utils.inl b/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_utils.inl
old mode 100755
new mode 100644
diff --git a/fflas-ffpack/fflas/fflas_sparse/sell/sell_pspmv.inl b/fflas-ffpack/fflas/fflas_sparse/sell/sell_pspmv.inl
index 20b53a6..c608914 100644
--- a/fflas-ffpack/fflas/fflas_sparse/sell/sell_pspmv.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/sell/sell_pspmv.inl
@@ -74,7 +74,7 @@ inline void pfspmv(const Field &F, const Sparse<Field, SparseMatrix_t::SELL> &A,
 #endif
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void pfspmv_simd(const Field &F, const Sparse<Field, SparseMatrix_t::SELL> &A,
@@ -193,7 +193,7 @@ inline void pfspmv(const Field &F, const Sparse<Field, SparseMatrix_t::SELL> &A,
 #endif // TBB
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 template <class Field>
 inline void pfspmv_simd(const Field &F, const Sparse<Field, SparseMatrix_t::SELL> &A,
                         typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const int64_t kmax) {
@@ -454,7 +454,7 @@ inline void pfspmv_mone(const Field &F, const Sparse<Field, SparseMatrix_t::SELL
 #endif
 }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void pfspmv_one_simd(const Field &F, const Sparse<Field, SparseMatrix_t::SELL_ZO> &A,
diff --git a/fflas-ffpack/fflas/fflas_sparse/sell/sell_spmv.inl b/fflas-ffpack/fflas/fflas_sparse/sell/sell_spmv.inl
index 0f529ee..e5b106f 100644
--- a/fflas-ffpack/fflas/fflas_sparse/sell/sell_spmv.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/sell/sell_spmv.inl
@@ -54,7 +54,7 @@ inline void fspmv(const Field &F, const Sparse<Field, SparseMatrix_t::SELL> &A,
     }
 }
 
-// #ifdef __FFLASFFPACK_USE_SIMD
+// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template <class Field>
 inline void fspmv_simd(const Field &F, const Sparse<Field, SparseMatrix_t::SELL> &A,
@@ -121,7 +121,7 @@ inline void fspmv(const Field &F, const Sparse<Field, SparseMatrix_t::SELL> &A,
     }
 }
 
-// #ifdef __FFLASFFPACK_USE_SIMD
+// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 template <class Field>
 inline void fspmv_simd(const Field &F, const Sparse<Field, SparseMatrix_t::SELL> &A,
                        typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const uint64_t kmax) {
@@ -264,7 +264,7 @@ inline void fspmv_mone(const Field &F, const Sparse<Field, SparseMatrix_t::SELL_
     }
 }
 
-// #ifdef __FFLASFFPACK_USE_SIMD
+// #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 template <class Field>
 inline void fspmv_one_simd(const Field &F, const Sparse<Field, SparseMatrix_t::SELL_ZO> &A,
                            typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_,
diff --git a/fflas-ffpack/fflas/fflas_sparse/sell/sell_utils.inl b/fflas-ffpack/fflas/fflas_sparse/sell/sell_utils.inl
index aef50c4..3a1fb35 100644
--- a/fflas-ffpack/fflas/fflas_sparse/sell/sell_utils.inl
+++ b/fflas-ffpack/fflas/fflas_sparse/sell/sell_utils.inl
@@ -110,7 +110,7 @@ inline void sparse_init(const Field &F, Sparse<Field, SparseMatrix_t::SELL> &A,
     A.m = rowdim;
     A.n = coldim;
     A.nnz = nnz;
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
     using simd = Simd<typename Field::Element>;
     A.chunk = simd::vect_size;
 #else
diff --git a/fflas-ffpack/fflas/fflas_sparse/sparse_matrix_traits.h b/fflas-ffpack/fflas/fflas_sparse/sparse_matrix_traits.h
index 90b16d3..6fe41c7 100644
--- a/fflas-ffpack/fflas/fflas_sparse/sparse_matrix_traits.h
+++ b/fflas-ffpack/fflas/fflas_sparse/sparse_matrix_traits.h
@@ -87,13 +87,13 @@ using NotZOSparseMatrix = std::false_type;
 
 template<class F, class M> struct isSparseMatrixSimdFormat : public std::false_type {};
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 template<class Field> struct isSparseMatrixSimdFormat<Field, Sparse<Field, SparseMatrix_t::SELL>> : public support_simd<typename Field::Element>::type {};
 
 template<class Field> struct isSparseMatrixSimdFormat<Field, Sparse<Field, SparseMatrix_t::ELL_simd>> : public support_simd<typename Field::Element>::type {};
 
-#endif // __FFLASFFPACK_USE_SIMD
+#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 
 using SimdSparseMatrix = std::true_type;
 using NoSimdSparseMatrix = std::false_type;
diff --git a/fflas-ffpack/ffpack/ffpack.h b/fflas-ffpack/ffpack/ffpack.h
index aa71fa3..7a5eeb0 100644
--- a/fflas-ffpack/ffpack/ffpack.h
+++ b/fflas-ffpack/ffpack/ffpack.h
@@ -153,11 +153,9 @@ namespace FFPACK { /* Permutations */
 	void cyclic_shift_col(const Field& F, typename Field::Element_ptr A, size_t m, size_t n, size_t lda);
 
 
-	/** Apply a permutation submatrix of P (between ibeg and iend) to a matrix
-	 * to (iend-ibeg) vectors of size M stored in A (as column for NoTrans
-	 * and rows for Trans).
-	 * Side==FFLAS::FflasLeft for row permutation Side==FFLAS::FflasRight for a column
-	 * permutation
+	/** Apply a permutation P, stored in the LAPACK format (a sequence of transpositions),
+	 * between indices ibeg and iend of P to (iend-ibeg) vectors of size M stored in A (as columns for NoTrans and as rows for Trans).
+	 * Side==FFLAS::FflasLeft for a row permutation, Side==FFLAS::FflasRight for a column permutation.
 	 * Trans==FFLAS::FflasTrans for the inverse permutation of P
 	 * @param F
 	 * @param Side
@@ -177,11 +175,57 @@ namespace FFPACK { /* Permutations */
 		const FFLAS::FFLAS_TRANSPOSE Trans,
 		const size_t M, const size_t ibeg, const size_t iend,
 		typename Field::Element_ptr A, const size_t lda, const size_t * P );
+	
+	
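For reference, in the LAPACK pivot format assumed by applyP above (and by the monotonic variants declared below), P is not the permutation itself but a sequence of transpositions: entry i means "swap position i with position P[i]". A small self-contained illustration in plain C++ (toy data only, nothing from the library):

    // Toy illustration of the LAPACK pivot convention assumed by applyP:
    // applying P forward means swap(i, P[i]) for i = ibeg .. iend-1, in order;
    // the inverse (FflasTrans) replays the swaps in reverse order.
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<int> rows = {0, 1, 2, 3, 4};      // stand-ins for the rows of A
        std::vector<std::size_t> P = {2, 3, 2, 3, 4}; // LAPACK pivots: swap(0,2), swap(1,3), ...

        for (std::size_t i = 0; i < P.size(); ++i)    // forward permutation
            std::swap(rows[i], rows[P[i]]);
        for (int r : rows) std::printf("%d ", r);     // prints: 2 3 0 1 4
        std::printf("\n");

        for (std::size_t i = P.size(); i-- > 0; )     // inverse permutation (FflasTrans)
            std::swap(rows[i], rows[P[i]]);
        for (int r : rows) std::printf("%d ", r);     // back to 0 1 2 3 4
        std::printf("\n");
        return 0;
    }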
+	/** Apply an R-monotonically increasing permutation P to the matrix A.
+	 * The permutation represented by P is defined as follows:
+	 *  - the first R values of P are a LAPACK representation (a sequence of transpositions)
+	 *  - the remaining iend-ibeg-R values of the permutation are in a monotonically increasing progression
+	 * Side==FFLAS::FflasLeft for a row permutation, Side==FFLAS::FflasRight for a column permutation
+	 * Trans==FFLAS::FflasTrans for the inverse permutation of P
+	 * @param F
+	 * @param Side
+	 * @param Trans
+	 * @param M
+	 * @param ibeg
+	 * @param iend
+	 * @param A
+	 * @param lda
+	 * @param P
+	 * @param R
+	 */
+	template<class Field>
+	void
+	MonotonicApplyP (const Field& F,
+					 const FFLAS::FFLAS_SIDE Side,
+					 const FFLAS::FFLAS_TRANSPOSE Trans,
+					 const size_t M, const size_t ibeg, const size_t iend,
+					 typename Field::Element_ptr A, const size_t lda, const size_t * P, const size_t R);
+	template<class Field>
+	void
+	MonotonicCompress (const Field& F,
+					   const FFLAS::FFLAS_SIDE Side,
+					   const size_t M,
+					   typename Field::Element_ptr A, const size_t lda, const size_t incA, const size_t * P,
+					   const size_t R, const size_t maxpiv, const size_t rowstomove,
+					   const std::vector<bool> &ispiv);
+	template<class Field>
+	void
+	MonotonicCompressMorePivots (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M,
+								 typename Field::Element_ptr A, const size_t lda, const size_t incA,
+								 const size_t * MathP, const size_t R, const size_t rowstomove, const size_t lenP);
+	template<class Field>
+	void
+	MonotonicCompressCycles (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M,
+							 typename Field::Element_ptr A, const size_t lda, const size_t incA,
+							 const size_t * MathP, const size_t lenP);
 
-
-
-//#ifdef __FFLASFFPACK_USE_OPENMP
-
+	template<class Field>
+	void
+	MonotonicExpand (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M,
+					 typename Field::Element_ptr A, const size_t lda, const size_t incA,
+					 const size_t * MathP, const size_t R, const size_t maxpiv,
+					 const size_t rowstomove, const std::vector<bool> &ispiv);
 
 	//! Parallel applyP with OPENMP tasks
 	template<class Field>
@@ -1425,9 +1469,18 @@ namespace FFPACK { /* not used */
 #include "fflas-ffpack/interfaces/libs/ffpack_inst.h"
 #endif
 
+//---------------------------------------------------------------------
+// Checkers
+#include "fflas-ffpack/checkers/checkers_ffpack.h"
+//---------------------------------------------------------------------
+
 #include "ffpack_fgesv.inl"
 #include "ffpack_fgetrs.inl"
 #include "ffpack_ftrtr.inl"
+//---------------------------------------------------------------------
+// Checkers
+#include "fflas-ffpack/checkers/checkers_ffpack.inl"
+//---------------------------------------------------------------------
 #include "ffpack_pluq.inl"
 #include "ffpack_pluq_mp.inl"
 #include "ffpack_ppluq.inl"
@@ -1446,5 +1499,6 @@ namespace FFPACK { /* not used */
 #include "ffpack_permutation.inl"
 #include "ffpack_rankprofiles.inl"
 #include "ffpack.inl"
+
 #endif // __FFLASFFPACK_ffpack_H
 
diff --git a/fflas-ffpack/ffpack/ffpack_charpoly.inl b/fflas-ffpack/ffpack/ffpack_charpoly.inl
index c5765f3..d236487 100644
--- a/fflas-ffpack/ffpack/ffpack_charpoly.inl
+++ b/fflas-ffpack/ffpack/ffpack_charpoly.inl
@@ -151,7 +151,8 @@ namespace FFPACK {
 		  typename Field::Element_ptr A, const size_t lda,
 		  const FFPACK_CHARPOLY_TAG CharpTag/*= FfpackArithProg*/)
 	{
-
+		Checker_charpoly<Field,Polynomial> checker(F,N,A,lda);
+		
 		std::list<Polynomial> factor_list;
 		CharPoly (F, factor_list, N, A, lda, CharpTag);
 		typename std::list<Polynomial >::const_iterator it;
@@ -167,6 +168,8 @@ namespace FFPACK {
 			++it;
 		}
 
+		checker.check(charp);
+
 		return charp;
 	}
 
diff --git a/fflas-ffpack/ffpack/ffpack_invert.inl b/fflas-ffpack/ffpack/ffpack_invert.inl
index 5472267..a005531 100644
--- a/fflas-ffpack/ffpack/ffpack_invert.inl
+++ b/fflas-ffpack/ffpack/ffpack_invert.inl
@@ -1,5 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 FFLAS-FFACK group
  *
@@ -38,21 +38,24 @@ template <class Field>
 		typename Field::Element_ptr A, const size_t lda,
 		int& nullity)
 	{
-		FFLASFFPACK_check(lda >= M);
+		FFLASFFPACK_check(lda >= M);
+
+		Checker_invert<Field> checker(F,M,A,lda);
 
 		if (M == 0) {
 			nullity = 0 ;
 			return NULL ;
 		}
-
 		size_t * P = FFLAS::fflas_new<size_t>(M);
 		size_t * Q = FFLAS::fflas_new<size_t>(M);
-		size_t R =  ReducedColumnEchelonForm (F, M, M, A, lda, P, Q);
+		size_t R =  ReducedColumnEchelonForm (F, M, M, A, lda, P, Q, true);
 		nullity = (int)(M - R);
 		applyP (F, FFLAS::FflasLeft, FFLAS::FflasTrans,
 			M, 0, (int)R, A, lda, P);
 		delete [] P;
 		delete [] Q;
+
+		checker.check(A,nullity);
 		return A;
 	}
 
@@ -69,10 +72,8 @@ template <class Field>
 			nullity = 0 ;
 			return NULL ;
 		}
-
-
 		FFLAS::fassign(F,M,M,A,lda,X,ldx);
-		Invert (F,  M, X, lda, nullity);
+		Invert (F, M, X, ldx, nullity);
 		return X;
 	}
 
diff --git a/fflas-ffpack/ffpack/ffpack_ludivine.inl b/fflas-ffpack/ffpack/ffpack_ludivine.inl
index d5c6619..d10009f 100644
--- a/fflas-ffpack/ffpack/ffpack_ludivine.inl
+++ b/fflas-ffpack/ffpack/ffpack_ludivine.inl
@@ -67,9 +67,8 @@ namespace FFPACK {
 				fgemv (F, FFLAS::FflasTrans, r, N-r, F.mOne, A+r, lda, A+(k+1)*lda, 1, F.one, A+(k+1)*lda+r, 1);
 			}
 			else
-				return r;
+				break; // return r;
 		}
-
 		return r;
 	}
 
@@ -418,7 +417,6 @@ namespace FFPACK {
 			  , const size_t cutoff // =__FFPACK_LUDIVINE_CUTOFF
 		 )
 	{
-		//std::cout<<"LUDivine ("<<M<<","<<N<<")"<<std::endl;
 		if ( !(M && N) ) return 0;
 		typedef typename Field::Element elt;
 		size_t MN = std::min(M,N);
diff --git a/fflas-ffpack/ffpack/ffpack_permutation.inl b/fflas-ffpack/ffpack/ffpack_permutation.inl
index 720e048..d6fee55 100644
--- a/fflas-ffpack/ffpack/ffpack_permutation.inl
+++ b/fflas-ffpack/ffpack/ffpack_permutation.inl
@@ -1,5 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 FFLAS-FFACK group
  *
@@ -30,19 +30,276 @@
 #ifndef __FFLASFFPACK_ffpack_permutation_INL
 #define __FFLASFFPACK_ffpack_permutation_INL
 
+
 #include <givaro/zring.h>
 
 #include "fflas-ffpack/fflas/fflas_fassign.h"
 
+#define FFLASFFPACK_PERM_BKSIZE 32
+
 namespace FFPACK {
+	    /** MonotonicApplyP
+	     * Apply a permutation defined by the first R entries of the vector P (the pivots).
+	     * The non-pivot elements are located in monotonically increasing order.
+	     */
+	template<class Field>
+	void
+	MonotonicApplyP (const Field& F,
+					 const FFLAS::FFLAS_SIDE Side,
+					 const FFLAS::FFLAS_TRANSPOSE Trans,
+					 const size_t M, const size_t ibeg, const size_t iend,
+					 typename Field::Element_ptr A, const size_t lda, const size_t * P, const size_t R)
+	{
+		const size_t B = FFLASFFPACK_PERM_BKSIZE;
+		size_t lenP = iend-ibeg;
+		size_t * MathP = new size_t[lenP];
+		for (size_t i=0; i<lenP; ++i)
+			MathP[i] = i;
+		LAPACKPerm2MathPerm (MathP, P, lenP);
+
+		std::vector<bool> ispiv(lenP,false);
+		size_t pivrowstomove = 0;
+		size_t nonpivrowstomove = 0;
+		size_t maxpiv = R-1;
+		for (size_t i=0; i<R; i++) {
+			ispiv[MathP[i]] = true;
+			if (MathP[i] != i){
+				pivrowstomove++;
+				if(maxpiv < MathP[i]) maxpiv = MathP[i];
+			}
+		}
+		if (!pivrowstomove) // Permutation is the identity
+			return;
+
+		for (size_t i=R; i<lenP; i++)
+			if (MathP[i] != i)
+				nonpivrowstomove++;
+		size_t NB = M/B;
+		size_t last = M%B;
+		size_t incA, llda;
+		if (Side == FFLAS::FflasLeft)  {incA = 1; llda = lda;}
+		else {incA = lda; llda = 1;}
+		size_t inc = B*incA;
+
+		if (((Side == FFLAS::FflasLeft) && (Trans == FFLAS::FflasNoTrans)) ||
+			((Side == FFLAS::FflasRight) && (Trans == FFLAS::FflasTrans))){
+				// Compressing
+#ifdef MONOTONIC_CYCLES
+			for (size_t i = 0; i<NB; i++)
+				MonotonicCompressCycles (F, Side, B, A+i*inc, llda, incA, MathP, lenP);
+			MonotonicCompressCycles (F, Side, last, A+NB*inc, llda, incA, MathP, lenP);
+#elif defined MONOTONIC_MOREPIVOTS
+			for (size_t i = 0; i<NB; i++)
+				MonotonicCompressMorePivots (F, Side, B, A+i*inc, llda, incA, MathP, R, nonpivrowstomove, lenP);
+			MonotonicCompressMorePivots (F, Side, last, A+NB*inc, llda, incA, MathP, R, nonpivrowstomove, lenP);
+#else
+			for (size_t i = 0; i<NB; i++)
+				MonotonicCompress (F, Side, B, A+i*inc, llda, incA, MathP, R, maxpiv, pivrowstomove, ispiv);
+			MonotonicCompress (F, Side, last, A+NB*inc, llda, incA, MathP, R, maxpiv, pivrowstomove, ispiv);	
+#endif
+		} else {
+				// Expanding
+			for (size_t i = 0; i<NB; i++)
+				MonotonicExpand (F, Side, B, A+i*inc, llda, incA, MathP, R, maxpiv, pivrowstomove, ispiv);
+			MonotonicExpand (F, Side, last, A+NB*inc, llda, incA, MathP, R, maxpiv, pivrowstomove, ispiv);	
+		}
+		delete[] MathP;
+	}
 
 	template<class Field>
 	void
-	applyP( const Field& F,
-		const FFLAS::FFLAS_SIDE Side,
-		const FFLAS::FFLAS_TRANSPOSE Trans,
-		const size_t M, const size_t ibeg, const size_t iend,
-		typename Field::Element_ptr A, const size_t lda, const size_t * P )
+	MonotonicCompress (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M,
+					   typename Field::Element_ptr A, const size_t lda, const size_t incA,
+					   const size_t * MathP, const size_t R, const size_t maxpiv,
+					   const size_t rowstomove, const std::vector<bool> &ispiv)
+	{	
+			// Storing pivot rows in temp
+		typename Field::Element_ptr temp= FFLAS::fflas_new (F, rowstomove, M);
+		size_t ldtemp=M;
+		for (size_t i=0,j=0; i<R; i++){
+			if (MathP[i] != i){
+				FFLAS::fassign (F, M, A+MathP[i]*lda, incA, temp+j*ldtemp, 1);
+				j++;
+			}
+		}
+			// Moving non-pivot rows down to positions R .. maxpiv
+		int dest = maxpiv;
+		int src = dest - 1;
+		while (dest >= (int)R){
+			if ((src >= 0) && ispiv[src]){ // src points to a pivot row: skip it
+				src--;
+				continue;
+			}
+			FFLAS::fassign(F, M, A+src*lda, incA, A+dest*lda, incA);
+			src--; dest--;
+		}
+			// Moving the pivots to their position in the first R rows
+		for (size_t i=0, j=0; i<R; i++)
+			if (MathP[i] != i){
+				FFLAS::fassign (F, M, temp + j*ldtemp, 1, A + i*lda, incA);
+				j++;
+			}
+		FFLAS::fflas_delete(temp);
+	}
+
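A toy re-enactment of the compression on integer row labels may help (an illustration only, not the library routine; the names mirror the ones above). With R = 2 and MathP = [3, 0, 1, 2, 4], the pivot rows 3 and 0 end up in positions 0 and 1, and the non-pivot rows 1 and 2 slide down while keeping their original, monotonically increasing order:

    // Toy re-enactment of MonotonicCompress on row labels (ints instead of matrix rows).
    #include <cstdio>
    #include <vector>

    int main() {
        const std::size_t R = 2;
        std::vector<std::size_t> MathP = {3, 0, 1, 2, 4};   // math form of the permutation
        std::vector<int> rows = {0, 1, 2, 3, 4};            // row labels standing in for A
        std::vector<bool> ispiv(rows.size(), false);
        std::size_t maxpiv = R - 1;
        for (std::size_t i = 0; i < R; ++i) {
            ispiv[MathP[i]] = true;
            if (MathP[i] != i && MathP[i] > maxpiv) maxpiv = MathP[i];
        }
        std::vector<int> temp;                              // displaced pivot rows
        for (std::size_t i = 0; i < R; ++i)
            if (MathP[i] != i) temp.push_back(rows[MathP[i]]);
        long dest = (long)maxpiv, src = dest - 1;           // slide non-pivot rows down
        while (dest >= (long)R) {
            if (src >= 0 && ispiv[src]) { --src; continue; }
            rows[dest] = rows[src];
            --src; --dest;
        }
        for (std::size_t i = 0, j = 0; i < R; ++i)          // drop pivots into the top R slots
            if (MathP[i] != i) rows[i] = temp[j++];
        for (int r : rows) std::printf("%d ", r);           // prints: 3 0 1 2 4
        std::printf("\n");
        return 0;
    }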
+	template<class Field>
+	void
+	MonotonicCompressMorePivots (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M,
+								 typename Field::Element_ptr A, const size_t lda, const size_t incA,
+								 const size_t * MathP, const size_t R, const size_t rowstomove, const size_t lenP)
+	{
+		std::vector<bool> done(lenP,false);
+		typename Field::Element_ptr temp= FFLAS::fflas_new (F, rowstomove, M);
+		size_t ldtemp=M;
+			// Move every non pivot row to temp
+#ifdef VERBOSE
+		std::cerr<<"R = "<<R<<std::endl;
+		write_perm(std::cerr<<"MathP = ",MathP,lenP);
+#endif
+		for (size_t i=R,j=0; i<lenP; i++){
+			if (MathP[i] != i){
+#ifdef VERBOSE
+				std::cerr<<"A["<<MathP[i]<<"] -> temp["<<j<<"]"<<std::endl;
+#endif
+				FFLAS::fassign (F, M, A+MathP[i]*lda, incA, temp+j*ldtemp, 1);
+				done[MathP[i]]=true;
+				j++;
+			}
+		}
+			// Move the pivot rows of every cycle containing a non-pivot row (avoiding the use of a temp)
+		for (size_t i=R; i<lenP; i++){
+			size_t j=MathP[i];
+			while ((MathP[j] != j) && (!done[MathP[j]])){
+					// A[P[j]] -> A[j]
+#ifdef VERBOSE
+				std::cerr<<"Moving pivots 1 A["<<MathP[j]<<"] -> A["<<j<<"]"<<std::endl;
+#endif
+				FFLAS::fassign (F, M, A+MathP[j]*lda, incA, A+j*lda, incA);
+				done[MathP[j]] = true;
+				j = MathP[j];
+			}
+		}
+		
+			// Moving the remaining cycles using one vector temp
+		typename Field::Element_ptr tmprow = FFLAS::fflas_new(F,1,M);
+		for (size_t i=0; i<R; i++){
+			if ((MathP[i]!=i)&&(!done[MathP[i]])){ // entering a cycle
+				size_t j=i;
+#ifdef VERBOSE
+				std::cerr<<"Moving pivots 2 A["<<j<<"] -> tmprow"<<std::endl;
+#endif
+				FFLAS::fassign (F, M, A+j*lda, incA, tmprow, 1);
+				done[j] = true;
+				do{
+						// A[P[j]] -> A[j]
+#ifdef VERBOSE
+					std::cerr<<"Moving pivots 2 A["<<MathP[j]<<"] -> A["<<j<<"]"<<std::endl;
+#endif
+					FFLAS::fassign (F, M, A+MathP[j]*lda, incA, A+j*lda, incA);
+					done[MathP[j]] = true;
+					j = MathP[j];
+				} while (!done[MathP[j]]);
+				FFLAS::fassign (F, M, tmprow, 1, A+j*lda, incA);
+#ifdef VERBOSE
+				std::cerr<<"Moving pivots 2 tmprow -> A["<<j<<"]"<<std::endl;
+#endif
+			}
+		}
+			// Move the non pivot rows to the last lenP-R positions
+		for (size_t i=R, j=0; i<lenP; i++)
+			if (MathP[i] != i){
+#ifdef VERBOSE
+				std::cerr<<"temp["<<j<<"] -> A["<<i<<"] "<<std::endl;
+#endif
+				FFLAS::fassign (F, M, temp + j*ldtemp, 1, A + i*lda, incA);
+				j++;
+			}
+
+		FFLAS::fflas_delete(tmprow);
+		FFLAS::fflas_delete(temp);
+	}
+
+	template<class Field>
+	void
+	MonotonicCompressCycles (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M,
+							 typename Field::Element_ptr A, const size_t lda, const size_t incA,
+							 const size_t * MathP, const size_t lenP)
+	{
+		std::vector<bool> done(lenP,false);
+#ifdef VERBOSE
+		write_perm(std::cerr<<"MathP = ",MathP,lenP);
+#endif
+			// Apply the permutation cycle by cycle, using a single temporary row
+		typename Field::Element_ptr tmprow = FFLAS::fflas_new(F,1,FFLASFFPACK_PERM_BKSIZE);
+		for (size_t i=0; i<lenP; i++){
+			if ((MathP[i]!=i)&&(!done[MathP[i]])){ // entering a cycle
+				size_t j=i;
+#ifdef VERBOSE
+				std::cerr<<"Moving pivots A["<<j<<"] -> tmprow"<<std::endl;
+#endif
+				FFLAS::fassign (F, M, A+j*lda, incA, tmprow, 1);
+				done[j] = true;
+				do{
+						// A[P[j]] -> A[j]
+#ifdef VERBOSE
+					std::cerr<<"Moving pivots A["<<MathP[j]<<"] -> A["<<j<<"]"<<std::endl;
+#endif
+					FFLAS::fassign (F, M, A+MathP[j]*lda, incA, A+j*lda, incA);
+					done[MathP[j]] = true;
+					j = MathP[j];
+				} while (!done[MathP[j]]);
+				FFLAS::fassign (F, M, tmprow, 1, A+j*lda, incA);
+#ifdef VERBOSE
+				std::cerr<<"Moving pivots tmprow -> A["<<j<<"]"<<std::endl;
+#endif
+			}
+		}
+		FFLAS::fflas_delete(tmprow);
+	}
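Stripped of the matrix machinery, the cycle-following idea used by MonotonicCompressCycles (and by the second phase of MonotonicCompressMorePivots) looks as follows on a plain array: each cycle of the permutation is rotated with a single temporary, so no full copy of the data is needed. This is an illustrative sketch, not the library code:

    // In-place application of a math permutation by following its cycles,
    // using one temporary element per cycle (toy version of MonotonicCompressCycles).
    #include <cstdio>
    #include <vector>

    void apply_by_cycles(std::vector<int> &a, const std::vector<std::size_t> &P) {
        std::vector<bool> done(P.size(), false);
        for (std::size_t i = 0; i < P.size(); ++i) {
            if (P[i] == i || done[P[i]]) continue;   // fixed point or cycle already handled
            int tmp = a[i];                          // a[i] is overwritten first: save it
            done[i] = true;
            std::size_t j = i;
            do {                                     // a[P[j]] -> a[j], walk the cycle
                a[j] = a[P[j]];
                done[P[j]] = true;
                j = P[j];
            } while (!done[P[j]]);
            a[j] = tmp;                              // close the cycle
        }
    }

    int main() {
        std::vector<int> a = {10, 11, 12, 13, 14};
        std::vector<std::size_t> P = {2, 0, 1, 4, 3}; // position i receives element P[i]
        apply_by_cycles(a, P);
        for (int x : a) std::printf("%d ", x);        // prints: 12 10 11 14 13
        std::printf("\n");
        return 0;
    }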
+	template<class Field>
+	void
+	MonotonicExpand (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M,
+					 typename Field::Element_ptr A, const size_t lda, const size_t incA,
+					 const size_t * MathP, const size_t R, const size_t maxpiv,
+					 const size_t rowstomove, const std::vector<bool> &ispiv)
+	{
+			// Storing pivot rows in temp
+		typename Field::Element_ptr temp= FFLAS::fflas_new (F, rowstomove, M);
+		size_t ldtemp=M;
+		for (size_t i=0,j=0; i<R; i++){
+			if (MathP[i] != i){
+				FFLAS::fassign (F, M, A+i*lda, incA, temp+j*ldtemp, 1);
+				j++;
+			}
+		}
+			// Moving the non pivot rows
+		size_t dest = 0;
+		size_t src = R;
+		while (src <= maxpiv){
+			if (ispiv[dest]){ // dest points to a pivot row: skip it
+				dest++;
+				continue;
+			}
+			FFLAS::fassign(F, M, A+src*lda, incA, A+dest*lda, incA);
+			src++; dest++;
+		}
+			// Moving the pivots to their final position
+		for (size_t i=0, j=0; i<R; i++)
+			if (MathP[i] != i){
+				FFLAS::fassign (F, M, temp + j*ldtemp, 1, A + MathP[i]*lda, incA);
+				j++;
+			}
+		FFLAS::fflas_delete(temp);
+	}
+
+	template<class Field>
+	void
+	applyP_block (const Field& F,
+				  const FFLAS::FFLAS_SIDE Side,
+				  const FFLAS::FFLAS_TRANSPOSE Trans,
+				  const size_t M, const size_t ibeg, const size_t iend,
+				  typename Field::Element_ptr A, const size_t lda, const size_t * P)
 	{
 		if ( Side == FFLAS::FflasRight ) {
 			if ( Trans == FFLAS::FflasTrans ){
@@ -67,6 +324,25 @@ namespace FFPACK {
 		}
 	}
 
+	template<class Field>
+	void
+	applyP( const Field& F,
+		const FFLAS::FFLAS_SIDE Side,
+		const FFLAS::FFLAS_TRANSPOSE Trans,
+		const size_t M, const size_t ibeg, const size_t iend,
+		typename Field::Element_ptr A, const size_t lda, const size_t * P )
+	{
+	
+		const size_t bk = FFLASFFPACK_PERM_BKSIZE;
+		const size_t NB = M/bk;
+		const size_t last = M%bk;
+		const size_t incA = (Side == FFLAS::FflasLeft)? 1:lda;
+		const size_t inc = bk*incA;
+
+		for (size_t i = 0; i<NB; i++)
+			applyP_block (F, Side, Trans, bk, ibeg, iend, A+i*inc, lda, P);
+		applyP_block (F, Side, Trans, last, ibeg, iend, A+NB*inc, lda, P);
+	}
 
 	template<class Field>
 	inline void doApplyS (const Field& F,
@@ -151,9 +427,6 @@ namespace FFPACK {
 		for (size_t i=0; i<N; i++){
 			if (LapackP[i] != i){
 				std::swap(MathP[i],MathP[LapackP[i]]);
-				// size_t tmp = MathP[i];
-				// MathP[i] = MathP[LapackP[i]];
-				// MathP[LapackP[i]] = tmp;
 			}
 		}
 	}
@@ -448,7 +721,7 @@ namespace FFPACK {
 		 const size_t m, const size_t ibeg, const size_t iend,
 		 typename Field::Element_ptr A, const size_t lda, const size_t * P )
 	{
-		int numthreads = MAX_THREADS;//omp_get_max_threads();
+		int numthreads = MAX_THREADS;
 		size_t BLOCKSIZE=std::max(2*m/numthreads,(size_t)1); // Assume that there is at least 2 ApplyP taking place in parallel
 		size_t NBlocks = m/BLOCKSIZE;
 		size_t LastBlockSize = m % BLOCKSIZE;
@@ -463,7 +736,6 @@ namespace FFPACK {
 				size_t BlockDim = BLOCKSIZE;
 				if (t == NBlocks-1)
 					BlockDim = LastBlockSize;
-				//#pragma omp task shared (A, P, F) firstprivate(BlockDim)
 
 				TASK(MODE(CONSTREFERENCE(F, A,P) READ(A[BLOCKSIZE*t*((Side == FFLAS::FflasRight)?lda:1)])),
 				     applyP(F, Side, Trans, BlockDim, ibeg, iend, A+BLOCKSIZE*t*((Side == FFLAS::FflasRight)?lda:1), lda, P););
diff --git a/fflas-ffpack/ffpack/ffpack_pluq.inl b/fflas-ffpack/ffpack/ffpack_pluq.inl
index badd07b..8f705c5 100644
--- a/fflas-ffpack/ffpack/ffpack_pluq.inl
+++ b/fflas-ffpack/ffpack/ffpack_pluq.inl
@@ -373,8 +373,9 @@ namespace FFPACK {
 				Fi.inv (invpiv, *(CurrRow+i));
 				if (Diag == FFLAS::FflasUnit)
 					FFLAS::fscalin (Fi, N-i-1, invpiv, CurrRow+i+1,1);
-				else
+				else{
 					FFLAS::fscalin (Fi, M-row-1, invpiv, CurrRow+i+lda,lda);
+				}
 
 				if (i > rank){
 					    // Column rotation to move pivot on the diagonal
@@ -426,14 +427,13 @@ namespace FFPACK {
 		MathPerm2LAPACKPerm (P, MathP, M);
 		FFLAS::fflas_delete( MathP);
 		FFLAS::fzero (Fi, M-rank, N-rank, A+rank*(1+lda), lda);
-
 		return (size_t) rank;
 	}
 
 
 	template<class Field>
 	inline size_t
-	PLUQ (const Field& Fi, const FFLAS::FFLAS_DIAG Diag,
+	_PLUQ (const Field& Fi, const FFLAS::FFLAS_DIAG Diag,
 	      const size_t M, const size_t N,
 	      typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q)
 	{
@@ -506,7 +506,7 @@ namespace FFPACK {
 
 		    // A1 = P1 [ L1 ] [ U1 V1 ] Q1
 		    //         [ M1 ]
-		R1 = PLUQ (Fi, Diag, M2, N2, A, lda, P1, Q1);
+		R1 = _PLUQ (Fi, Diag, M2, N2, A, lda, P1, Q1);
 		typename Field::Element_ptr A2 = A + N2;
 		typename Field::Element_ptr A3 = A + M2*lda;
 		typename Field::Element_ptr A4 = A3 + N2;
@@ -514,9 +514,14 @@ namespace FFPACK {
 		typename Field::Element_ptr G = A3 + R1;
 		    // [ B1 ] <- P1^T A2
 		    // [ B2 ]
+#ifdef MONOTONIC_APPLYP
+		MonotonicApplyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N-N2, size_t(0), M2, A2, lda, P1, R1);
+		MonotonicApplyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M-M2, size_t(0), N2, A3, lda, Q1, R1);	
+#else
 		applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N-N2, size_t(0), M2, A2, lda, P1);
 		    // [ C1 C2 ] <- A3 Q1^T
 		applyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M-M2, size_t(0), N2, A3, lda, Q1);
+#endif
 		    // D <- L1^-1 B1
 		ftrsm (Fi, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, OppDiag, R1, N-N2, Fi.one, A, lda, A2, lda);
 		    // E <- C1 U1^-1
@@ -531,33 +536,48 @@ namespace FFPACK {
 		    //        [ M2 ]
 		size_t * P2 = FFLAS::fflas_new<size_t >(M2-R1);
 		size_t * Q2 = FFLAS::fflas_new<size_t >(N-N2);
-		R2 = PLUQ (Fi, Diag, M2-R1, N-N2, F, lda, P2, Q2);
+		R2 = _PLUQ (Fi, Diag, M2-R1, N-N2, F, lda, P2, Q2);
 		    // G = P3 [ L3 ] [ U3 V3 ] Q3
 		    //        [ M3 ]
 		size_t * P3 = FFLAS::fflas_new<size_t >(M-M2);
 		size_t * Q3 = FFLAS::fflas_new<size_t >(N2-R1);
-		R3 = PLUQ (Fi, Diag, M-M2, N2-R1, G, lda, P3, Q3);
+		R3 = _PLUQ (Fi, Diag, M-M2, N2-R1, G, lda, P3, Q3);
 		    // [ H1 H2 ] <- P3^T H Q2^T
 		    // [ H3 H4 ]
+#ifdef MONOTONIC_APPLYP
+		MonotonicApplyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M-M2, size_t(0), N-N2, A4, lda, Q2, R2);
+		MonotonicApplyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N-N2, size_t(0), M-M2, A4, lda, P3, R3);
+#else
 		applyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M-M2, size_t(0), N-N2, A4, lda, Q2);
 		applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N-N2, size_t(0), M-M2, A4, lda, P3);
+#endif
 		    // [ E1 ] <- P3^T E
 		    // [ E2 ]
+#ifdef MONOTONIC_APPLYP
+		MonotonicApplyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, R1, size_t(0), M-M2, A3, lda, P3, R3);
+#else
 		applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, R1, size_t(0), M-M2, A3, lda, P3);
+#endif
 		    // [ M11 ] <- P2^T M1
 		    // [ M12 ]
+#ifdef MONOTONIC_APPLYP
+		MonotonicApplyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, R1, size_t(0), M2-R1, A+R1*lda, lda, P2, R2);
+		    // [ D1 D2 ] <- D Q2^T
+		MonotonicApplyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, R1, size_t(0), N-N2, A2, lda, Q2, R2);
+		    // [ V1 V2 ] <- V1 Q3^T
+		MonotonicApplyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, R1, size_t(0), N2-R1, A+R1, lda, Q3, R3);
+#else
 		applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, R1, size_t(0), M2-R1, A+R1*lda, lda, P2);
 		    // [ D1 D2 ] <- D Q2^T
 		applyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, R1, size_t(0), N-N2, A2, lda, Q2);
 		    // [ V1 V2 ] <- V1 Q3^T
 		applyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, R1, size_t(0), N2-R1, A+R1, lda, Q3);
+#endif
 		    // I <- H U2^-1
 		    // K <- H3 U2^-1
 		ftrsm (Fi, FFLAS::FflasRight, FFLAS::FflasUpper, FFLAS::FflasNoTrans, Diag, M-M2, R2, Fi.one, F, lda, A4, lda);
 		    // J <- L3^-1 I (in a temp)
 		typename Field::Element_ptr temp = FFLAS::fflas_new (Fi, R3, R2);
-		// for (size_t i=0; i<R3; ++i)
-			// FFLAS::fassign (Fi, R2, A4 + i*lda, 1, temp + i*R2, 1);
 		FFLAS::fassign (Fi, R3, R2, A4 , lda, temp , R2);
 		ftrsm (Fi, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, OppDiag, R3, R2, Fi.one, G, lda, temp, R2);
 		    // N <- L3^-1 H2
@@ -573,16 +593,24 @@ namespace FFPACK {
 		    //         [ M4 ]
 		size_t * P4 = FFLAS::fflas_new<size_t >(M-M2-R3);
 		size_t * Q4 = FFLAS::fflas_new<size_t >(N-N2-R2);
-		R4 = PLUQ (Fi, Diag, M-M2-R3, N-N2-R2, R, lda, P4, Q4);
+		R4 = _PLUQ (Fi, Diag, M-M2-R3, N-N2-R2, R, lda, P4, Q4);
 		    // [ E21 M31 0 K1 ] <- P4^T [ E2 M3 0 K ]
 		    // [ E22 M32 0 K2 ]
+#ifdef MONOTONIC_APPLYP
+		MonotonicApplyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N2+R2, size_t(0), M-M2-R3, A3+R3*lda, lda, P4, R4);
+		    // [ D21 D22 ]     [ D2 ]
+		    // [ V21 V22 ]  <- [ V2 ] Q4^T
+		    // [  0   0  ]     [  0 ]
+		    // [ O1   O2 ]     [  O ]
+		MonotonicApplyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M2+R3, size_t(0), N-N2-R2, A2+R2, lda, Q4, R4);
+#else
 		applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N2+R2, size_t(0), M-M2-R3, A3+R3*lda, lda, P4);
 		    // [ D21 D22 ]     [ D2 ]
 		    // [ V21 V22 ]  <- [ V2 ] Q4^T
 		    // [  0   0  ]     [  0 ]
 		    // [ O1   O2 ]     [  O ]
 		applyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M2+R3, size_t(0), N-N2-R2, A2+R2, lda, Q4);
-
+#endif
 		    // P <- Diag (P1 [ I_R1    ] , P3 [ I_R3    ])
 		    //               [      P2 ]      [      P4 ]
 		size_t* MathP = FFLAS::fflas_new<size_t>(M);
@@ -627,6 +655,18 @@ namespace FFPACK {
 		return R1+R2+R3+R4;
 	}
 
+	template<class Field>
+	inline size_t
+	PLUQ (const Field& Fi, const FFLAS::FFLAS_DIAG Diag,
+	      const size_t M, const size_t N,
+	      typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q)
+	{
+		Checker_PLUQ<Field> checker (Fi,M,N,A,lda);
+		size_t R = FFPACK::_PLUQ(Fi,Diag,M,N,A,lda,P,Q);
+		checker.check(A,lda,R,P,Q);
+		return R;
+	}
+
 
 } // namespace FFPACK
 #endif // __FFLASFFPACK_ffpack_pluq_INL
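With this change PLUQ is a thin wrapper: it records the input through Checker_PLUQ, runs the recursive _PLUQ, and verifies the factorization before returning, so calling code keeps the usual interface (the same construct-then-check pattern is used for CharPoly and Invert above). A minimal usage sketch follows; the field, dimensions, fill values and include set are illustrative, and error handling is omitted:

    // Calling code is unchanged by the checker: build a field, a matrix, call PLUQ.
    #include <iostream>
    #include <givaro/modular.h>
    #include "fflas-ffpack/fflas/fflas.h"
    #include "fflas-ffpack/ffpack/ffpack.h"

    int main() {
        typedef Givaro::Modular<double> Field;
        Field F(101);
        const size_t M = 4, N = 4;
        Field::Element_ptr A = FFLAS::fflas_new(F, M, N);
        for (size_t i = 0; i < M * N; ++i)
            F.init(A[i], (int)(i * i + 1));          // arbitrary entries
        size_t *P = FFLAS::fflas_new<size_t>(M);
        size_t *Q = FFLAS::fflas_new<size_t>(N);
        // The verification (Checker_PLUQ) runs inside PLUQ when checkers are enabled.
        size_t R = FFPACK::PLUQ(F, FFLAS::FflasNonUnit, M, N, A, N, P, Q);
        std::cout << "rank = " << R << std::endl;
        FFLAS::fflas_delete(A);
        FFLAS::fflas_delete(P);
        FFLAS::fflas_delete(Q);
        return 0;
    }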
diff --git a/fflas-ffpack/ffpack/ffpack_ppluq.inl b/fflas-ffpack/ffpack/ffpack_ppluq.inl
index 434d851..2ab5a39 100644
--- a/fflas-ffpack/ffpack/ffpack_ppluq.inl
+++ b/fflas-ffpack/ffpack/ffpack_ppluq.inl
@@ -78,6 +78,7 @@ namespace FFPACK {
 	}
 
 
+	    // TODO: replace pPLUQ and "int nt" by PLUQ and a Parallel Helper ...
 	template<class Field>
 	inline size_t
 	pPLUQ(const Field& Fi, const FFLAS::FFLAS_DIAG Diag,
diff --git a/fflas-ffpack/field/Makefile.am b/fflas-ffpack/field/Makefile.am
index 42ed9a5..a1ac51f 100644
--- a/fflas-ffpack/field/Makefile.am
+++ b/fflas-ffpack/field/Makefile.am
@@ -29,9 +29,9 @@ RNS=rns.h			        \
 	rns-double.h			\
 	rns-double-elt.h		\
 	rns-double.inl			\
+	rns-double-recint.inl		\
 	rns-integer.h			\
-	rns-integer-mod.h     \
-	modular-extended.h
+	rns-integer-mod.h
 
 pkgincludesub_HEADERS=          	\
 	  field-traits.h                \
diff --git a/fflas-ffpack/field/field-traits.h b/fflas-ffpack/field/field-traits.h
index 9c68ac4..8354c46 100644
--- a/fflas-ffpack/field/field-traits.h
+++ b/fflas-ffpack/field/field-traits.h
@@ -172,15 +172,15 @@ namespace FFLAS { /*  Traits */
 	template <typename Element, typename Compute> 
 	struct ModeTraits<Givaro::Modular<Element,Compute> >{typedef typename ModeCategories::DelayedTag value;};
 
-	template <> template<typename Compute> struct ModeTraits<Givaro::Modular<int8_t,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::MachineFloatTag> value;};
-	template <> template<typename Compute> struct ModeTraits<Givaro::Modular<int16_t,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::MachineFloatTag> value;};
-	template <> template<typename Compute> struct ModeTraits<Givaro::Modular<int32_t,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::MachineFloatTag> value;};
-	template <> template<typename Compute> struct ModeTraits<Givaro::Modular<uint8_t,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::MachineFloatTag> value;};
-	template <> template<typename Compute> struct ModeTraits<Givaro::Modular<uint16_t,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::MachineFloatTag> value;};
-	template <> template<typename Compute> struct ModeTraits<Givaro::Modular<uint32_t,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::MachineFloatTag> value;};
+	template<typename Compute> struct ModeTraits<Givaro::Modular<int8_t,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::MachineFloatTag> value;};
+	template<typename Compute> struct ModeTraits<Givaro::Modular<int16_t,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::MachineFloatTag> value;};
+	template<typename Compute> struct ModeTraits<Givaro::Modular<int32_t,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::MachineFloatTag> value;};
+	template<typename Compute> struct ModeTraits<Givaro::Modular<uint8_t,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::MachineFloatTag> value;};
+	template<typename Compute> struct ModeTraits<Givaro::Modular<uint16_t,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::MachineFloatTag> value;};
+	template<typename Compute> struct ModeTraits<Givaro::Modular<uint32_t,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::MachineFloatTag> value;};
 
 #ifndef INTEGER_NO_RNS
-	template <> template<typename Compute> struct ModeTraits<Givaro::Modular<Givaro::Integer,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::RNSElementTag> value;};
+	template<typename Compute> struct ModeTraits<Givaro::Modular<Givaro::Integer,Compute> > {typedef typename ModeCategories::ConvertTo<ElementCategories::RNSElementTag> value;};
 #endif
 
 	template <typename Element>
diff --git a/fflas-ffpack/field/modular-extended.h b/fflas-ffpack/field/modular-extended.h
deleted file mode 100644
index 0209805..0000000
--- a/fflas-ffpack/field/modular-extended.h
+++ /dev/null
@@ -1,333 +0,0 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
-
-/*
- * Copyright (C) FFLAS group
- *
- * Written by Bastien Vialla <bastien.vialla at lirmm.fr>
- *
- * ========LICENCE========
- * This file is part of the library FFLAS-FFPACK.
- *
- * FFLAS-FFPACK is free software: you can redistribute it and/or modify
- * it under the terms of the  GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- * ========LICENCE========
- *
- */
-
-#ifndef __FFLASFFPACK_MODULAR_EXTENDED_H
-#define __FFLASFFPACK_MODULAR_EXTENDED_H
-
-#include "givaro/givranditer.h"
-#include "givaro/ring-interface.h"
-#include "givaro/modular-general.h"
-
-// namespace Givaro{
-//  template<class T>
-//  class ModularExtended;// : public RingInterface<double>{};
-// } // Givaro
-
-namespace Givaro{
-/*
- *
- * Modular double/float allowing big moduli
- * !!: RandIter does not works, use your own random
- *
- */
-template<class _Element>
-class ModularExtended// : public RingInterface<double>
-{
-public:
-
-	typedef double Element;
-	typedef Element* Element_ptr ;
-	typedef const Element ConstElement;
-	typedef const Element* ConstElement_ptr;
-	// ----- Exported Types and constantes
-	typedef ModularExtended<Element> Self_t;
-	typedef uint64_t Residu_t;
-	enum { size_rep = sizeof(Residu_t) };
-
-private:
-	// Verkampt Split
-	inline void split(const Element x, Element &x_h, Element &x_l) const {
-    	Element c;
-    	if(std::is_same<Element, double>::value){
-    		c = (Element)((1 << 27)+1);	
-    	}else if(std::is_same<Element, float>::value){
-    		c = (Element)((1 << 13)+1);	
-    	}
-    	 
-    	x_h = (c*x)+(x-(c*x));
-    	x_l = x - x_h;
-	}	
-
-	// Dekker mult, a * b = s + t
-	inline void mult(const Element a, const Element b, Element &s, Element &t) const{
-    	s = a*b;
-//#ifdef __FMA__
-    	t = std::fma(-a, b, s);
-//#else
-    	Element ah, al, bh, bl;
-    	split(a, ah, al);
-    	split(b, bh, bl);
-    	t = ((((-s+ah*bh)+(ah*bl))+(al*bh))+(al*bl));
-//#endif
-	}
-
-public:
-	// ----- Constantes
-	const Element zero = 0.0;
-	const Element one = 1.0;
-	const Element mOne = -1.0;
-
-	// ----- Constructors
-	ModularExtended() = default;
-
-	template<class XXX> ModularExtended(const XXX& p)
-	: zero(0.0), one(1.0), mOne((Element)p - 1.0), _p((Element)p), _invp(1/_p), _negp(-_p), _lp((Residu_t)p)
-	{
-	    assert(_p >= getMinModulus());
-	    assert(_p <= maxCardinality());
-	}
-
-	//ModularExtended(const Self_t& F) = default;
-	//ModularExtended(Self_t&& F) = default;
-	// : zero(F.zero), one(F.one), mOne(F.mOne), _p(F._p), _lp(F._lp) {}
-
-	// ----- Accessors
-	inline Element minElement() const  { return zero; }
-	inline Element maxElement() const  { return mOne; }
-
-	// ----- Access to the modulus
-	inline Residu_t residu() const { return _lp; }
-	inline Residu_t size() const { return _lp; }
-	inline Residu_t characteristic() const { return _lp; }
-	template<class T> inline T& characteristic(T& p) const { return p = _lp; }
-	inline Residu_t cardinality() const { return _lp; }
-	template<class T> inline T& cardinality(T& p) const { return p = _lp; }
-	static inline Residu_t maxCardinality() { 
-		if(std::is_same<Element, double>::value)
-			return 4503599627370496;
-		else if(std::is_same<Element, float>::value)
-			return 8388608;
-	}
-	static inline Residu_t getMinModulus() { return 2; }
-
-	// ----- Checkers
-	inline bool isZero(const Element& a) const  { return a == zero; }
-	inline bool isOne (const Element& a) const  { return a == one; }
-	inline bool isMOne(const Element& a) const  { return a == mOne; }
-	inline bool areEqual(const Element& a, const Element& b) const  { return a == b; }
-	inline size_t length(const Element a) const { return size_rep; }
-	
-	// ----- Ring-wise operators
-	inline bool operator==(const Self_t& F) const { return _p == F._p; }
-	inline bool operator!=(const Self_t& F) const { return _p != F._p; }
-	inline Self_t& operator=(const Self_t& F)
-	{
-		F.assign(const_cast<Element&>(one),  F.one);
-		F.assign(const_cast<Element&>(zero), F.zero);
-		F.assign(const_cast<Element&>(mOne), F.mOne);
-		_p = F._p;
-		_negp = F._negp;
-		_invp = F._invp;
-		_lp= F._lp;
-		return *this;
-	}
-
-	// ----- Initialisation
-	Element &init (Element &x) const{
-		return x = zero;
-	}
-
-	template<class XXX> Element& init(Element & x, const XXX & y) const{
-		x=Element(y);
-		return reduce(x);
-	}
-
-	Element &assign (Element &x, const Element &y) const{
-		return x = y;
-	}
-
-	// ----- Convert and reduce
-	Integer& convert  (Integer &x, const Element &y) const{
-		return x = (Integer)y;
-	}
-	Residu_t& convert (Residu_t &x, const Element &y) const{
-		return x = (Residu_t)y;
-	}
-	Element& convert   (Element &x, const Element &y) const{
-		return x = y;
-	}
-	float& convert    (float &x, const Element &y) const{
-		return x = (float)y;
-	}
-
-	Element& reduce (Element& x, const Element& y) const{
-		Element q = floor(y*_invp);
-		Element pqh, pql;
-		mult(_p, q, pqh, pql);
-		x = (x-pqh)-pql;
-		if(x >= _p)
-			x -= _p;
-		else if(x < 0)
-			x += _p;
-		return x;	
-	}
-	Element& reduce (Element& x) const{
-		Element q = floor(x*_invp);
-		Element pqh, pql;
-		mult(_p, q, pqh, pql);
-		x = (x-pqh)-pql;
-		if(x >= _p)
-			x -= _p;
-		else if(x < zero)
-			x += _p;
-		return x;	
-	}
-
-	// ----- Classic arithmetic
-	Element& mul(Element& r, const Element& a, const Element& b) const {
-		Element abh, abl, pqh, pql;
-		mult(a, b, abh, abl);
-		Element q = floor(abh*_invp);
-		mult(_p, q, pqh, pql);		
-		r = (abh-pqh)+(abl-pql);
-		if(r > _p)
-			r-= _p;
-		else if(r < 0)
-			r += _p;
-		return r;
-	}
-
-	
-	Element& div(Element& r, const Element& a, const Element& b) const{
-		return mulin(inv(r, a), b);
-	}
-	Element& add(Element& r, const Element& a, const Element& b) const {
-		r = a + b;
-		if(r >= _p)
-			r += _negp;
-		return r;
-	}
-	Element& sub(Element& r, const Element& a, const Element& b) const {
-		r = a - b;
-		if(r < 0)
-			r += _p;
-		return r;
-	}
-	Element& neg(Element& r, const Element& a) const {
-		r = -a;
-		if(r < 0)
-			r += _p;
-		return r;
-	}
-	Element& inv(Element& x, const Element& y) const{
-		int64_t x_int, y_int, tx, ty;
-		x_int = int64_t(_lp);
-		y_int = int64_t(y);
-		tx = 0;
-		ty = 1;
-
-		while (y_int != 0) {
-			// always: gcd (modulus,residue) = gcd (x_int,y_int)
-			//         sx*modulus + tx*residue = x_int
-			//         sy*modulus + ty*residue = y_int
-			int64_t q = x_int / y_int; // integer quotient
-			int64_t temp = y_int;  y_int  = x_int  - q * y_int;
-			x_int  = temp;
-			temp = ty; ty = tx - q * ty;
-			tx = temp;
-		}
-
-		if (tx < 0) tx += int64_t(_p);
-
-		// now x_int = gcd (modulus,residue)
-		return x = Element(tx);
-	}
-
-	Element& mulin(Element& r, const Element& a) const {
-		return mul(r, r, a);
-	}
-	Element& divin(Element& r, const Element& y) const{
-		Element iy;
-		return mulin(r, inv(iy, y));
-	}
-	Element& addin(Element& r, const Element& a) const {
-		return add(r, r, a);
-	}
-	Element& subin(Element& r, const Element& a) const {
-		return sub(r, r, a);
-	}
-	Element& negin(Element& r) const {
-		return neg(r, r);
-	}
-	Element& invin(Element& r) const {
-	  return inv(r, r);
-	}
-	
-	// -- axpy:   r <- a * x + y
-	// -- axpyin: r <- a * x + r
-	Element& axpy  (Element& r, const Element& a, const Element& x, const Element& y) const {
-		Element tmp;
-		mul(tmp, a, x);
-		return add(r, tmp, y);
-	}
-	Element& axpyin(Element& r, const Element& a, const Element& x) const {
-		Element tmp(r);
-		return axpy(r, a, x, tmp);
-	}
-
-	// -- axmy:   r <- a * x - y
-	// -- axmyin: r <- a * x - r
-	Element& axmy  (Element& r, const Element& a, const Element& x, const Element& y) const {
-		Element tmp;
-		mul(tmp, a, x);
-		return sub(r, tmp, y);
-	}
-	Element& axmyin(Element& r, const Element& a, const Element& x) const {
-		return axmy(r, a, x, r);
-	}
-
-	// -- maxpy:   r <- y - a * x
-	// -- maxpyin: r <- r - a * x
-	Element& maxpy  (Element& r, const Element& a, const Element& x, const Element& y) const {
-		Element tmp;
-		mul(tmp, a, x);
-		return sub(r, y, tmp);
-	}
-	Element& maxpyin(Element& r, const Element& a, const Element& x) const {
-		return maxpy(r, a, x, r);
-	}
-
-	// ----- Random generators
-	// typedef ModularRandIter<Self_t> RandIter;
-	// typedef GeneralRingNonZeroRandIter<Self_t> NonZeroRandIter;
- //    template< class Random > Element& random(const Random& g, Element& r) const { return init(r, g()); }
- //    template< class Random > Element& nonzerorandom(const Random& g, Element& a) const
- //    	{ while (isZero(init(a, g())));
- //    	  return a; }
-		
-protected:
-	double _p = 0;
-	double _invp = 0;
-	double _negp = 0;
-	Residu_t _lp = 0;
-
-};
-
-}// Givaro
-
-#endif //__FFLASFFPACK_MODULAR_EXTENDED_H
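The field removed above implemented big-modulus arithmetic over double through error-free transformations: a Veltkamp split cuts each double into a high and a low part, and the Dekker product then writes a*b exactly as a sum s + t of two doubles (the commented fma call computes the same error term in one instruction). The same technique is now taken from Givaro's modular-extended.h, which later hunks switch to. A self-contained sketch of the identity, using only standard C++ and illustrative values:

#include <cstdio>

// Veltkamp split: x = x_h + x_l, with x_h carrying the upper ~26 bits of the significand.
static void split(double x, double &x_h, double &x_l) {
	const double c = (double)((1 << 27) + 1);   // splitter constant for IEEE doubles
	double cx = c * x;
	x_h = cx + (x - cx);
	x_l = x - x_h;
}

// Dekker product: a * b == s + t exactly, s being the rounded product and t the error.
static void two_prod(double a, double b, double &s, double &t) {
	s = a * b;
	double ah, al, bh, bl;
	split(a, ah, al);
	split(b, bh, bl);
	t = (((ah * bh - s) + ah * bl) + al * bh) + al * bl;
}

int main() {
	double s, t;
	two_prod(1e8 + 1, 1e8 + 3, s, t);   // the exact product needs more than 53 bits
	std::printf("s = %.17g  t = %.17g\n", s, t);
	return 0;
}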
diff --git a/fflas-ffpack/field/rns-double-recint.inl b/fflas-ffpack/field/rns-double-recint.inl
new file mode 100644
index 0000000..8eb3221
--- /dev/null
+++ b/fflas-ffpack/field/rns-double-recint.inl
@@ -0,0 +1,315 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/*
+ * Copyright (C) 2016 the FFLAS-FFPACK group
+ *
+ * Written by Pascal Giorgi <pascal.giorgi at lirmm.fr>
+ *
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *
+ */
+
+
+#ifndef __FFLASFFPACK_field_rns_double_recint_INL
+#define __FFLASFFPACK_field_rns_double_recint_INL
+
+#include "fflas-ffpack/fflas/fflas_freduce.h"
+
+namespace FFPACK {
+
+    // Arns must be an array of m*n*_size
+	// abs(||A||) < 2^(16k)
+	
+	template<size_t K>
+	inline void rns_double::init(size_t m, size_t n, double* Arns, size_t rda, const RecInt::ruint<K>* A, size_t lda, size_t k, bool RNS_MAJOR) const
+	{
+		if (k>_ldm){
+			FFPACK::failure()(__func__,__FILE__,__LINE__,"rns_struct: init (too large entry)");
+			std::cerr<<"k="<<k<<" _ldm="<<_ldm<<std::endl;
+		}
+		size_t mn=m*n;
+		double *A_beta = FFLAS::fflas_new<double >(mn*k);
+		const RecInt::ruint<K>* Aiter=A;
+			// split A into A_beta according to a Kronecker transform in base 2^16
+//		auto sp=SPLITTER(MAX_THREADS,FFLAS::CuttingStrategy::Column,FFLAS::StrategyParameter::Threads);
+
+		Givaro::Timer tkr; tkr.start();
+#ifndef __FFLASFFPACK_SEQUENTIAL
+		//auto sp=SPLITTER(MAX_THREADS);
+#endif
+			// FOR2D(i,j,m,n,sp,
+			//       TASK(MODE(READ(Aiter[0]) READWRITE(A_beta[0])),
+		    for(size_t i=0;i<m;i++)
+		    //PAR_BLOCK{
+//			FOR1D(i,m,sp,
+			//PARFOR1D(i,m,sp,
+				  for(size_t j=0;j<n;j++){
+					  size_t idx=j+i*n;
+					  const uint16_t* m0_ptr = reinterpret_cast<const uint16_t*>(Aiter+j+i*lda);
+					  size_t l=0;
+					  size_t maxs=std::min(k,size_t(1UL<<(K-4)));
+					  
+					  //size_t maxs=std::min(k,(Aiter[j+i*lda].size())*sizeof(mp_limb_t)/2);// to ensure 32 bits portability
+
+					  for (;l<maxs;l++){
+						  A_beta[l+idx*k]= m0_ptr[l];						  
+					  }
+					  for (;l<k;l++)
+						  A_beta[l+idx*k]=  0.;
+
+					  // 	   );
+				  }
+					 
+
+			tkr.stop();
+			//if(m>1 && n>1) std::cerr<<"Kronecker : "<<tkr.realtime()<<std::endl;
+			if (RNS_MAJOR==false) {
+					// Arns = _crt_in x A_beta^T
+				Givaro::Timer tfgemm; tfgemm.start();
+				PAR_BLOCK{
+					FFLAS::fgemm (Givaro::ZRing<double>(), FFLAS::FflasNoTrans,FFLAS::FflasTrans,_size,mn,k,1.0,_crt_in.data(),_ldm,A_beta,k,0.,Arns,rda,
+								  //			      FFLAS::ParSeqHelper::Parallel<FFLAS::CuttingStrategy::Block,FFLAS::StrategyParameter::Threads>());
+							  FFLAS::ParSeqHelper::Parallel<FFLAS::CuttingStrategy::Recursive,FFLAS::StrategyParameter::TwoDAdaptive>());
+			
+				}
+				tfgemm.stop();
+			//if(m>1 && n>1) 	std::cerr<<"fgemm : "<<tfgemm.realtime()<<std::endl;
+//			cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasTrans,(int)_size,(int)mn,(int)k,1.0,_crt_in.data(),(int)_ldm,A_beta,(int)k,0.,Arns,(int)rda);
+					// reduce each row i of Arns modulo moduli[i]
+					//for(size_t i=0;i<_size;i++)
+					//	FFLAS::freduce (_field_rns[i],mn,Arns+i*rda,1);
+			}
+			else {
+					// Arns =  A_beta x _crt_in^T
+				cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasTrans,(int)mn,(int)_size,(int)k,1.0,A_beta,(int)k,_crt_in.data(),(int)_ldm,0.,Arns,(int)_size);
+					// reduce each column j of Arns modulo moduli[i]
+					//for(size_t i=0;i<_size;i++)
+					//	FFLAS::freduce (_field_rns[i],mn,Arns+i,_size);
+			}
+			Givaro::Timer tred; tred.start();
+
+			reduce(mn,Arns,rda,RNS_MAJOR);
+			tred.stop();
+			//if(m>1 && n>1) 			std::cerr<<"Reduce : "<<tred.realtime()<<std::endl;
+	
+		FFLAS::fflas_delete( A_beta);
+
+#ifdef CHECK_RNS
+		bool ok=true;
+		for (size_t i=0;i<m;i++)
+			for(size_t j=0;j<n;j++)
+				for(size_t k=0;k<_size;k++){
+					ok&= (((A[i*lda+j] % (int64_t) _basis[k])+(A[i*lda+j]<0?(int64_t)_basis[k]:0)) == (int64_t) Arns[i*n+j+k*rda]);
+					if (((A[i*lda+j] % (int64_t) _basis[k])+(A[i*lda+j]<0?(int64_t)_basis[k]:0))
+					    != (int64_t) Arns[i*n+j+k*rda])
+					{
+						std::cout<<((A[i*lda+j] % (int64_t) _basis[k])+(A[i*lda+j]<0?(int64_t)_basis[k]:0))
+								 <<" != "
+								 <<(int64_t) Arns[i*n+j+k*rda]
+								 <<std::endl;
+					}
+				}
+	
+					
+		std::cout<<"RNS freduce ... "<<(ok?"OK":"ERROR")<<std::endl;
+#endif
+	}
+
+	
+	template<size_t K>
+	inline void rns_double::convert(size_t m, size_t n, integer gamma, RecInt::ruint<K>* A, size_t lda,
+									const double* Arns, size_t rda, integer p,bool RNS_MAJOR) const
+	{
+		if (p==0 && _M > integer(1)<<(1<<K)){
+			std::cerr<<"RNS convert [error] : ruint<"<<K<<"> too small for the rns basis log[2](M)="<<_M.bitsize()<<std::endl;
+			std::terminate();
+		}
+
+#ifdef CHECK_RNS
+		integer* Acopy=new integer[m*n];
+		for(size_t i=0;i<m;i++)
+			for(size_t j=0;j<n;j++)
+				Acopy[i*n+j]=A[i*lda+j];
+
+#endif
+		
+		integer hM= (_M-1)>>1;
+		size_t  mn= m*n;
+		double *A_beta= FFLAS::fflas_new<double>(mn*_ldm);
+		Givaro::Timer tfgemmc;tfgemmc.start();
+		if (RNS_MAJOR==false)
+				// compute A_beta = Ap^T x M_beta
+			PAR_BLOCK{
+				FFLAS::fgemm(Givaro::ZRing<double>(),FFLAS::FflasTrans, FFLAS::FflasNoTrans,(int) mn,(int) _ldm,(int) _size, 1.0 , Arns,(int) rda, _crt_out.data(),(int) _ldm, 0., A_beta,(int)_ldm,
+							 FFLAS::ParSeqHelper::Parallel<FFLAS::CuttingStrategy::Recursive,FFLAS::StrategyParameter::TwoDAdaptive >());
+//				FFLAS::ParSeqHelper::Parallel<FFLAS::CuttingStrategy::Block,FFLAS::StrategyParameter::Threads >());
+			}
+		else // compute A_beta = Ap x M_Beta
+			cblas_dgemm(CblasRowMajor,CblasNoTrans, CblasNoTrans, (int)mn, (int)_ldm, (int)_size, 1.0 , Arns, (int)_size, _crt_out.data(), (int)_ldm, 0., A_beta,(int)_ldm);
+
+		tfgemmc.stop();
+		//if(m>1 && n>1) std::cerr<<"fgemm Convert : "<<tfgemmc.realtime()<<std::endl;
+			// compute A using inverse Kronecker transform of A_beta expressed in base 2^log_beta
+		RecInt::ruint<K>* Aiter= A;
+		size_t k=_ldm;
+		if ((_ldm+3)*16 > (1<<K) || p!=0){
+			//std::cerr<<"ERROR: RNS with recint<"<<K<<"> -> convert needs "<<(_ldm+3)*16<<"bits ...aborting"<<std::endl;
+			//std::terminate();			
+			size_t k4=((k+3)>>2)+ (((k+3)%4==0)?0:1);
+			std::vector<uint16_t> A0(k4<<2,0),A1(k4<<2,0),A2(k4<<2,0),A3(k4<<2,0);
+			integer a0,a1,a2,a3,res;
+			mpz_t *m0,*m1,*m2,*m3;
+			m0= reinterpret_cast<mpz_t*>(&a0);
+			m1= reinterpret_cast<mpz_t*>(&a1);
+			m2= reinterpret_cast<mpz_t*>(&a2);
+			m3= reinterpret_cast<mpz_t*>(&a3);
+			mp_limb_t *m0_d,*m1_d,*m2_d,*m3_d;
+			m0_d = m0[0]->_mp_d;
+			m1_d = m1[0]->_mp_d;
+			m2_d = m2[0]->_mp_d;
+			m3_d = m3[0]->_mp_d;
+			m0[0]->_mp_alloc = m1[0]->_mp_alloc = m2[0]->_mp_alloc = m3[0]->_mp_alloc = (int) (k4*8/sizeof(mp_limb_t)); // to ensure 32 bits portability
+			m0[0]->_mp_size  = m1[0]->_mp_size  = m2[0]->_mp_size  = m3[0]->_mp_size  = (int) (k4*8/sizeof(mp_limb_t)); // to ensure 32 bits portability
+			Givaro::Timer tkroc;
+			tkroc.start();
+			//		auto sp=SPLITTER();
+			//		PARFOR1D(i,m,sp,
+			for(size_t i=0;i<m;i++)
+				for (size_t j=0;j<n;j++){
+					size_t idx=i*n+j;
+					for (size_t l=0;l<k;l++){
+						uint64_t tmp=(uint64_t)A_beta[l+idx*k];
+						uint16_t* tptr= reinterpret_cast<uint16_t*>(&tmp);
+						A0[l  ]= tptr[0];
+						A1[l+1]= tptr[1];
+						A2[l+2]= tptr[2];
+						A3[l+3]= tptr[3];
+					}
+					// see A0,A1,A2,A3 as the gmp integers a0,a1,a2,a3
+					m0[0]->_mp_d= reinterpret_cast<mp_limb_t*>(&A0[0]);
+					m1[0]->_mp_d= reinterpret_cast<mp_limb_t*>(&A1[0]);
+					m2[0]->_mp_d= reinterpret_cast<mp_limb_t*>(&A2[0]);
+					m3[0]->_mp_d= reinterpret_cast<mp_limb_t*>(&A3[0]);
+					res = a0;res+= a1;res+= a2;res+= a3;
+					res%=_M;
+					if (p!=0) res%=p;
+
+					// get the correct result according to the expected sign of A
+					if (res>hM)
+						res-=_M;
+					if (gamma==0)
+						Aiter[j+i*lda]=RecInt::ruint<K>(res);
+					else
+						if (gamma==integer(1))
+							Aiter[j+i*lda]+=RecInt::ruint<K>(res);
+						else
+							if (gamma==integer(-1))
+								Aiter[j+i*lda]=RecInt::ruint<K>(res)-Aiter[j+i*lda];
+							else{
+								Aiter[j+i*lda]*=RecInt::ruint<K>(gamma);
+								Aiter[j+i*lda]+=RecInt::ruint<K>(res);
+							}
+
+				}
+			tkroc.stop();
+			//if(m>1 && n>1) std::cerr<<"Kronecker Convert : "<<tkroc.realtime()<<std::endl;
+
+			m0[0]->_mp_d = m0_d;
+			m1[0]->_mp_d = m1_d;
+			m2[0]->_mp_d = m2_d;
+			m3[0]->_mp_d = m3_d;
+			m0[0]->_mp_alloc = m1[0]->_mp_alloc = m2[0]->_mp_alloc= m3[0]->_mp_alloc = 1;
+			m0[0]->_mp_size  = m1[0]->_mp_size  = m2[0]->_mp_size = m3[0]->_mp_size  = 0;
+
+			
+		}
+		else {	
+			//size_t k4=((k+3)>>2)+ (((k+3)%4==0)?0:1);
+
+			std::vector<uint16_t> A0(1<<(K-4),0),A1(1<<(K-4),0),A2(1<<(K-4),0),A3(1<<(K-4),0);
+			RecInt::ruint<K> *a0,*a1,*a2,*a3,res;
+			Givaro::Timer tkroc;
+			tkroc.start();
+			//		auto sp=SPLITTER();
+			//		PARFOR1D(i,m,sp,
+			for(size_t i=0;i<m;i++)
+				for (size_t j=0;j<n;j++){
+					size_t idx=i*n+j;
+					for (size_t l=0;l<k;l++){
+						uint64_t tmp=(uint64_t)A_beta[l+idx*k];					
+						uint16_t* tptr= reinterpret_cast<uint16_t*>(&tmp);
+						A0[l  ]= tptr[0];
+						A1[l+1]= tptr[1];
+						A2[l+2]= tptr[2];
+						A3[l+3]= tptr[3];
+					
+					}
+					a0= reinterpret_cast<RecInt::ruint<K>*>(&A0[0]);
+					a1= reinterpret_cast<RecInt::ruint<K>*>(&A1[0]);
+					a2= reinterpret_cast<RecInt::ruint<K>*>(&A2[0]);
+					a3= reinterpret_cast<RecInt::ruint<K>*>(&A3[0]);
+
+					res = *a0;res+= *a1;res+= *a2;res+= *a3;
+					res%= RecInt::ruint<K>(_M);
+				
+					// get the correct result according to the expected sign of A
+					//if (res>hM)
+					//	res-=_M;
+					if (gamma==0)
+						Aiter[j+i*lda]=res;
+					else
+						if (gamma==1)
+							Aiter[j+i*lda]+=res;
+						else
+							if (gamma==-1)
+								Aiter[j+i*lda]=res-Aiter[j+i*lda];
+							else{
+								Aiter[j+i*lda]*=RecInt::ruint<K>(gamma);
+								Aiter[j+i*lda]+=res;
+							}
+
+				}
+			tkroc.stop();
+		}
+		//if(m>1 && n>1) std::cerr<<"Kronecker Convert : "<<tkroc.realtime()<<std::endl;
+
+		FFLAS::fflas_delete( A_beta);
+		
+#ifdef CHECK_RNS
+		std::cout<<"CHECKING RNS CONVERT : ruint<"<<K<<"> with log[2](M)="<<_M.bitsize()<<std::endl;
+		std::cout<<"RNS : _ldm*16="<<(_ldm+2)*16<<std::endl;
+		bool ok=true;
+		for (size_t i=0;i<m;i++)
+			for(size_t j=0;j<n;j++)
+				for(size_t k=0;k<_size;k++){
+					int64_t _p =(int64_t) _basis[k];
+					integer curr=integer(A[i*lda+j]) - gamma*Acopy[i*n+j];
+					if ( curr% _p +(curr%_p<0?_p:0) != (int64_t) Arns[i*n+j+k*rda])
+						std::cout<<A[i*lda+j]<<" mod "<<(int64_t) _basis[k]<<"="<<(int64_t) Arns[i*n+j+k*rda]<<";"<<std::endl;
+					ok&= ( curr% _p +(curr%_p<0?_p:0) == (int64_t) Arns[i*n+j+k*rda]);
+
+				}
+		std::cout<<"RNS convert ... "<<(ok?"OK":"ERROR")<<std::endl;
+#endif
+	}
+
+} // FFPACK
+
+#endif // __FFLASFFPACK_field_rns_double_recint_INL
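The two routines above give RNS support to RecInt::ruint<K> matrices: init performs a Kronecker substitution in base 2^16 followed by one fgemm against the precomputed CRT matrix, and convert applies the inverse transform (reducing modulo p when p is nonzero). A hedged round-trip sketch: the basis object RNS is assumed to be built elsewhere, its members _size and _ldm are assumed public as in the rest of rns-double.h, and entries must stay below half the basis product for the symmetric reduction in convert to restore them.

#include <recint/ruint.h>
#include "fflas-ffpack/field/rns-double.h"

template<size_t K>
void rns_roundtrip(const FFPACK::rns_double& RNS,
                   size_t m, size_t n,
                   RecInt::ruint<K>* A, size_t lda)
{
	size_t k = size_t(1) << (K - 4);          // 16-bit chunks per ruint<K>; must not exceed RNS._ldm
	size_t rda = m * n;                       // row stride of the residue matrix Arns
	double* Arns = FFLAS::fflas_new<double>(m * n * RNS._size);

	RNS.init(m, n, Arns, rda, A, lda, k);     // A -> residues (Kronecker split + fgemm)
	RNS.convert(m, n, 0, A, lda, Arns, rda);  // residues -> A (gamma = 0 overwrites A)

	FFLAS::fflas_delete(Arns);
}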
diff --git a/fflas-ffpack/field/rns-double.h b/fflas-ffpack/field/rns-double.h
index b99a704..49efd2a 100644
--- a/fflas-ffpack/field/rns-double.h
+++ b/fflas-ffpack/field/rns-double.h
@@ -43,11 +43,11 @@
 #include <givaro/modular-double.h>
 #include <givaro/givinteger.h>
 #include <givaro/givintprime.h>
+#include "givaro/modular-extended.h"
 #include <recint/ruint.h>
 #include "fflas-ffpack/config-blas.h"
 #include "fflas-ffpack/utils/fflas_memory.h"
 #include "fflas-ffpack/utils/align-allocator.h"
-#include "fflas-ffpack/field/modular-extended.h"
 #include "fflas-ffpack/field/rns-double-elt.h"
 
 namespace FFPACK {
@@ -133,6 +133,10 @@ namespace FFPACK {
 			precompute_cst();
 		}
 
+		rns_double(const RNSIntegerMod<rns_double>& basis, bool rnsmod=false, long seed=time(NULL)) {
+
+		}
+
 		// can force to reduce integer entries larger than M
 		void precompute_cst(size_t K=0){
 			if (K!=0)
@@ -217,7 +221,7 @@ namespace FFPACK {
 		template<size_t K>
 		void init(size_t m, size_t n, double* Arns, size_t rda, const RecInt::ruint<K>* A, size_t lda, size_t k, bool RNS_MAJOR=false) const;
 		template<size_t K>
-		void convert(size_t m, size_t n, integer gamma, RecInt::ruint<K>* A, size_t lda, const double* Arns, size_t rda, bool RNS_MAJOR=false) const;
+		void convert(size_t m, size_t n, integer gamma, RecInt::ruint<K>* A, size_t lda, const double* Arns, size_t rda, integer p=0,bool RNS_MAJOR=false) const;
 
 		
 	}; // end of struct rns_double
@@ -352,7 +356,7 @@ namespace FFPACK {
 		void init(size_t m, double* Arns, const integer* A, size_t lda) const;
 		void convert(size_t m, integer *A, const double *Arns) const;
 		
-#if defined(__FFLASFFPACK_USE_SIMD)
+#if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS)
 		
 		template<class SimdT>
 		inline void splitSimd(const SimdT x, SimdT & x_h, SimdT & x_l) const {
@@ -395,7 +399,7 @@ namespace FFPACK {
 		  return r = simd::add(r, abh);
 		}
 		
-#endif // __FFLASFFPACK_USE_SIMD
+#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 		
 		// reduce entries of Arns to be less than the rns basis elements
 		void reduce(size_t n, double* Arns, size_t rda, bool RNS_MAJOR=false) const;
@@ -404,10 +408,53 @@ namespace FFPACK {
 		
 	}; // end of struct rns_double_extended
 
+
+	template<typename RNS>
+    class rnsRandIter {
+        std::vector<typename RNS::ModField::RandIter> _RNS_rand;
+        const RNS& _domain;
+        
+    public:
+        rnsRandIter(const RNS& R, size_t size=0, uint64_t seed=0)
+                : _domain(R) {
+            for(auto iter : R._field_rns)
+                _RNS_rand.push_back( typename RNS::ModField::RandIter(iter,size,seed) );
+        }
+
+        /** RNS ring Element random assignment.
+         * Element is supposed to be initialized
+         * @return random ring Element
+         */
+        typename RNS::Element& random(typename RNS::Element& elt) const {
+            auto coefficient(elt._ptr);
+            for(auto iter : _RNS_rand) {
+                iter.random( *coefficient );
+                coefficient += elt._stride;
+            }
+            return elt;
+        }
+       
+        typename RNS::Element& operator()(typename RNS::Element& elt) const {
+            return this->random(elt);
+        }
+
+        typename RNS::Element operator()() const {
+            typename RNS::Element tmp; _domain.init(tmp);
+            return this->operator()(tmp);
+        }
+        typename RNS::Element random() const {
+            return this->operator()();
+        }
+
+        const RNS& ring() const { return _domain; }
+
+    };
+
+
 } // end of namespace FFPACK
 
 #include "rns-double.inl"
-//#include "rns-double-recint.inl"
+#include "rns-double-recint.inl"
 namespace FFLAS {
 
 	template<>
@@ -417,5 +464,4 @@ namespace FFLAS {
 
 }
 
-#endif // __FFPACK_rns_double_H
-
+#endif // __FFPACK_rns_double_H
\ No newline at end of file
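The rnsRandIter added above keeps one RandIter per residue field and fills the successive slots of an RNS element, and RNSIntegerMod now exposes it as its RandIter typedef (used by the new checker tests). A small hedged sketch relying only on the interfaces visible in this hunk; the element is assumed to be already initialised, i.e. its _ptr/_stride view points into allocated residue storage.

#include <cstdint>
#include "fflas-ffpack/field/rns-double.h"

// Draw one random residue per modulus of the basis into an existing element.
void randomize(const FFPACK::rns_double& RNS,
               FFPACK::rns_double::Element& e,
               uint64_t seed)
{
	FFPACK::rnsRandIter<FFPACK::rns_double> G(RNS, 0, seed);
	G.random(e);   // writes one residue per moduli, stepping through e by e._stride
}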
diff --git a/fflas-ffpack/field/rns-double.inl b/fflas-ffpack/field/rns-double.inl
index 71a5eed..867a381 100644
--- a/fflas-ffpack/field/rns-double.inl
+++ b/fflas-ffpack/field/rns-double.inl
@@ -392,7 +392,7 @@ namespace FFPACK {
 	inline void rns_double::reduce(size_t n, double* Arns, size_t rda, bool RNS_MAJOR) const{
 
 		if (RNS_MAJOR) {
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 			using simd = Simd<double>;
 			using vect_t = typename simd::vect_t;
 
@@ -500,7 +500,7 @@ namespace FFPACK {
 		// reduce entries of Arns to be less than the rns basis elements
 	inline void rns_double_extended::reduce(size_t n, double* Arns, size_t rda, bool RNS_MAJOR) const{
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 		using simd = Simd<double>;
 		using vect_t = typename simd::vect_t;
 
diff --git a/fflas-ffpack/field/rns-integer-mod.h b/fflas-ffpack/field/rns-integer-mod.h
index a72118d..f56d7de 100644
--- a/fflas-ffpack/field/rns-integer-mod.h
+++ b/fflas-ffpack/field/rns-integer-mod.h
@@ -41,10 +41,10 @@
 #include <givaro/modular-integer.h>
 #include <givaro/givinteger.h>
 #include <givaro/udl.h>
+#include "givaro/modular-extended.h"
 
 #include "fflas-ffpack/field/rns-double.h"
 #include "fflas-ffpack/field/rns-integer.h"
-#include "fflas-ffpack/field/modular-extended.h"
 #include "fflas-ffpack/fflas/fflas_level1.inl"
 #include "fflas-ffpack/fflas/fflas_level2.inl"
 #include "fflas-ffpack/fflas/fflas_level3.inl"
@@ -71,6 +71,7 @@ namespace FFPACK {
 		typedef typename RNS::Element                   Element;
 		typedef typename RNS::Element_ptr           Element_ptr;
 		typedef typename RNS::ConstElement_ptr ConstElement_ptr;
+		typedef rnsRandIter<RNS> RandIter;
 
 	protected:
 		typedef typename RNS::BasisElement BasisElement;
@@ -691,7 +692,7 @@ namespace FFPACK {
                         //
 			// FFLAS::fscal(_rns->_field_rns[i], n, _rns->_MMi[i], A+i, _size, Gamma+i,_size);
                         T.start();
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
                         using simd = Simd<BasisElement>;
                         using vect_t = typename simd::vect_t;
 
diff --git a/fflas-ffpack/interfaces/libs/fflas_L1_inst.C b/fflas-ffpack/interfaces/libs/fflas_L1_inst.C
old mode 100755
new mode 100644
diff --git a/fflas-ffpack/interfaces/libs/fflas_L2_inst.C b/fflas-ffpack/interfaces/libs/fflas_L2_inst.C
old mode 100755
new mode 100644
diff --git a/fflas-ffpack/interfaces/libs/fflas_L3_inst.C b/fflas-ffpack/interfaces/libs/fflas_L3_inst.C
old mode 100755
new mode 100644
diff --git a/fflas-ffpack/interfaces/libs/fflas_L3_inst_implem.inl b/fflas-ffpack/interfaces/libs/fflas_L3_inst_implem.inl
index b56a510..cd65fc1 100644
--- a/fflas-ffpack/interfaces/libs/fflas_L3_inst_implem.inl
+++ b/fflas-ffpack/interfaces/libs/fflas_L3_inst_implem.inl
@@ -91,7 +91,7 @@ namespace FFLAS {
 	       const FFLAS_DIAG Diag,
 	       const size_t M, const size_t N,
 	       const FFLAS_ELT alpha,
-	       FFLAS_ELT* A, const size_t lda,
+	       const FFLAS_ELT* A, const size_t lda,
 	       FFLAS_ELT* B, const size_t ldb);
 
 	/** @brief  fgemm: <b>F</b>ield <b>GE</b>neral <b>M</b>atrix <b>M</b>ultiply.
diff --git a/fflas-ffpack/interfaces/libs/ffpack_inst.C b/fflas-ffpack/interfaces/libs/ffpack_inst.C
old mode 100755
new mode 100644
diff --git a/fflas-ffpack/paladin/blockcuts.inl b/fflas-ffpack/paladin/blockcuts.inl
old mode 100755
new mode 100644
index 106cb7e..800d990
--- a/fflas-ffpack/paladin/blockcuts.inl
+++ b/fflas-ffpack/paladin/blockcuts.inl
@@ -30,6 +30,7 @@
 #define __FFLASFFPACK_fflas_blockcuts_INL
 
 #include <fflas-ffpack/fflas/fflas_enum.h>
+#include <math.h>
 
 #define __FFLASFFPACK_MINBLOCKCUTS ((size_t)256)
 
diff --git a/fflas-ffpack/paladin/fflas_pfinit.h b/fflas-ffpack/paladin/fflas_pfinit.h
old mode 100755
new mode 100644
diff --git a/fflas-ffpack/paladin/parallel.h b/fflas-ffpack/paladin/parallel.h
old mode 100755
new mode 100644
diff --git a/fflas-ffpack/utils/Matio.h b/fflas-ffpack/utils/Matio.h
index 6cc5b0c..957ae87 100644
--- a/fflas-ffpack/utils/Matio.h
+++ b/fflas-ffpack/utils/Matio.h
@@ -1,5 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /* Copyright (C) LinBox,FFLAS-FFPACK
  *
  * ========LICENCE========
@@ -34,7 +34,7 @@
 
 // Reading a matrix from a (possibly zipped) file
 template<class Field>
-typename Field::Element_ptr read_field(const Field& F, const char * mat_file,int* tni,int* tnj)
+typename Field::Element_ptr read_field(const Field& F, const char * mat_file,size_t * tni,size_t* tnj)
 {
 	char *UT = NULL;
 	const char* File_Name;
@@ -62,7 +62,7 @@ typename Field::Element_ptr read_field(const Field& F, const char * mat_file,int
 	FILE* FileDes = fopen(File_Name, "r");
 	if (FileDes != NULL) {
 		char  tmp [200];// unsigned long tni, tnj;
-		if (fscanf(FileDes,"%d %d %199s\n",tni, tnj, tmp)<0)
+		if (fscanf(FileDes,"%lu %lu %199s\n",tni, tnj, tmp)<0)
 			printf("Error Reading first line of file \n");
 		int n=*tni;
 		int p=*tnj;
@@ -94,10 +94,9 @@ std::ostream& write_field(const Field& F,std::ostream& c,
 			  typename Field::ConstElement_ptr E,
 			  int n, int m, int id, bool mapleFormat = false, bool column_major=false)
 {
-
-	    //typename Field::Element tmp;
-	// double tmp;
-//	Givaro::Integer tmp;
+//     typename Field::Element tmp;
+//     double tmp;
+//     Givaro::Integer tmp;
 	typename Field::Element tmp;
 	F.init(tmp);
 	if (mapleFormat) c << "Matrix(" << n <<',' << m << ",\n[" ;
@@ -105,13 +104,12 @@ std::ostream& write_field(const Field& F,std::ostream& c,
 		if (mapleFormat) c << '[';
 		for (int j=0; j<m;++j){
 			if (column_major)
-				    //F.convert(tmp,*(E+i+id*j));
-				    tmp = *(E+i+id*j);
-				
+				    F.assign(tmp, *(E+i+id*j));
+// 				    F.convert(tmp,*(E+i+id*j));				
 			else
+				F.assign(tmp, *(E+j+id*i));
 //				F.convert(tmp,*(E+j+id*i));
-				tmp =*(E+j+id*i);
-			c << tmp;
+			F.write(c, tmp);
 			if (mapleFormat && j<m-1) c << ',';
 			c << ' ';
 		}
@@ -125,8 +123,11 @@ std::ostream& write_field(const Field& F,std::ostream& c,
 
 inline std::ostream& write_perm (std::ostream& c, const size_t* P, size_t N){
 	c<<"[ ";
-	for (size_t i=0; i<N; ++i)
-		c<<P[i]<<" ";
+	for (size_t i=0; i<N; ++i){
+		if (i)
+			c<<", ";
+		c<<P[i];
+	}
 	c<<"]"<<std::endl;
 	return c;
 }
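With the changes above, read_field takes size_t dimensions, write_field routes every entry through F.assign and F.write (so multiprecision and RNS elements print correctly), and write_perm separates entries with commas. A tiny hedged illustration of the new permutation output:

#include <iostream>
#include "fflas-ffpack/utils/Matio.h"

int main() {
	size_t P[4] = {2, 0, 3, 1};
	write_perm(std::cout, P, 4);   // now prints "[ 2, 0, 3, 1]" followed by a newline
	return 0;
}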
diff --git a/fflas-ffpack/utils/align-allocator.h b/fflas-ffpack/utils/align-allocator.h
index 08c6464..6225158 100644
--- a/fflas-ffpack/utils/align-allocator.h
+++ b/fflas-ffpack/utils/align-allocator.h
@@ -55,7 +55,7 @@ enum class Alignment : size_t {
   CACHE_LINE = 64,
   CACHE_PAGESIZE = 4096,
   DEFAULT =
-#ifdef __FFLASFFPACK_USE_AVX
+#ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS
   32
 #else
   16
diff --git a/fflas-ffpack/utils/bit_manipulation.h b/fflas-ffpack/utils/bit_manipulation.h
index dcb7c24..4b7ad9e 100644
--- a/fflas-ffpack/utils/bit_manipulation.h
+++ b/fflas-ffpack/utils/bit_manipulation.h
@@ -37,6 +37,7 @@
 #endif
 
 #include <givaro/udl.h>
+#include "fflas-ffpack/fflas-ffpack-config.h"
 
 // count leading zeros
 inline int32_t clz(uint64_t val) {
@@ -97,7 +98,7 @@ inline int32_t ctz(uint64_t val) {
 
 
 
-#ifdef __x86_64__
+#ifdef __FFLASFFPACK_HAVE_INT128
 // division 128bits by 64 bits
 // int128_t(u1,u0) = u1*2^64+u0, div v, rem r
 // return quo
@@ -116,7 +117,7 @@ static uint64_t divide_128(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r)
 #endif
 
 static uint64_t getpoweroftwoden_128(uint32_t d, uint64_t q, uint64_t *r) {
-#ifdef __x86_64__
+#ifdef __FFLASFFPACK_HAVE_INT128
     return divide_128(1_ui64 << (d - 1), 0, q, r);
 #else
     lldiv_t ta;
@@ -137,7 +138,7 @@ static inline uint32_t mullhi_u32(uint32_t x, uint32_t y) {
 }
 
 static inline int64_t mulhi_64(int64_t x, int64_t y) {
-#ifdef __x86_64__
+#ifdef __FFLASFFPACK_HAVE_INT128
         int128_t xl = x, yl = y;
         int128_t rl = xl * yl;
         return (int64_t)(rl >> 64);
@@ -153,7 +154,7 @@ static inline int64_t mulhi_64(int64_t x, int64_t y) {
 }
 
 static inline int64_t mulhi_fast_64(int64_t x, int64_t y) {
-#if 0 // todo check this type
+#ifdef __FFLASFFPACK_HAVE_INT128
         int128_t xl = x, yl = y;
         int128_t rl = xl * yl;
         return (int64_t)(rl >> 64);
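These helpers now test __FFLASFFPACK_HAVE_INT128 instead of __x86_64__, so the 128-bit code paths are taken whenever the compiler provides int128_t and the portable fallbacks otherwise. mulhi_64 returns the upper 64 bits of the exact signed product, as in this small hedged check:

#include <cassert>
#include <cstdint>
#include "fflas-ffpack/utils/bit_manipulation.h"

int main() {
	int64_t x = INT64_C(1) << 40, y = INT64_C(1) << 40;   // exact product is 2^80
	assert(mulhi_64(x, y) == (INT64_C(1) << 16));         // 2^80 >> 64 == 2^16
	return 0;
}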
diff --git a/fflas-ffpack/utils/fflas_memory.h b/fflas-ffpack/utils/fflas_memory.h
index a2acc12..8589a69 100644
--- a/fflas-ffpack/utils/fflas_memory.h
+++ b/fflas-ffpack/utils/fflas_memory.h
@@ -95,7 +95,7 @@ namespace FFLAS{
 	fflas_delete(std::forward<Args>(args)...);
     }
 
-#ifdef __FFLASFFPACK_USE_SIMD
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
     inline void prefetch(const int64_t* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 #else
     inline void prefetch(const int64_t*) {} 
diff --git a/macros/avx-check.m4 b/macros/avx-check.m4
index 28c505a..edbfd42 100644
--- a/macros/avx-check.m4
+++ b/macros/avx-check.m4
@@ -26,16 +26,9 @@ dnl turn on AVX or AVX2 extensions if available
 
 AC_DEFUN([FF_CHECK_AVX],
 [
-	AC_ARG_ENABLE(avx,
-	[ AC_HELP_STRING([--enable-avx], [ Use Intel(r) AVX ]) ],
-	[ avec_avx=$enable_avx ],
-	[ avec_avx=yes ]
-	)
-	
+	AC_ARG_ENABLE(avx,[AC_HELP_STRING([--disable-avx], [ Disable Intel(r) AVX])])
 	AC_MSG_CHECKING(for AVX)
-
-	dnl Is check enabled?
-	AS_IF([ test  "x$avec_avx" != "xno" ],
+	AS_IF([ test  "x$enable_avx" != "xno" ],
 	[
 		BACKUP_CXXFLAGS=${CXXFLAGS}
 		CODE_AVX=`cat macros/CodeChunk/avx.C`
@@ -65,8 +58,7 @@ AC_DEFUN([FF_CHECK_AVX],
 		AS_IF([ test "x$avx_found" = "xyes" ],
 		[
 			AC_MSG_RESULT(yes)
-			AC_DEFINE(USE_AVX,1,[Define if AVX is available])
-			AC_SUBST(AVXFLAGS)
+			AC_DEFINE(HAVE_AVX_INSTRUCTIONS,1,[Define if AVX is available])
 			
 	        dnl Check for AVX2
 			AC_MSG_CHECKING(for AVX2)
@@ -97,9 +89,8 @@ AC_DEFUN([FF_CHECK_AVX],
 			AS_IF([ test "x$avx2_found" = "xyes" ],
 			[
 				AC_MSG_RESULT(yes)
-				AC_DEFINE(USE_AVX2,1,[Define if AVX2 is available])
+				AC_DEFINE(HAVE_AVX2_INSTRUCTIONS,1,[Define if AVX2 is available])
 				AVXFLAGS=${AVX2FLAGS}
-				AC_SUBST(AVXFLAGS)
 			],
 			[
 		        dnl No AVX2
@@ -116,7 +107,7 @@ AC_DEFUN([FF_CHECK_AVX],
 		CXXFLAGS=${BACKUP_CXXFLAGS}
 	],
 	[
-	    dnl --enable-avx=no
+	    dnl --disable-avx
 	    AC_MSG_RESULT(no [disabled])
     ]
 	)
diff --git a/macros/ax_check_x86_features.m4 b/macros/ax_check_x86_features.m4
new file mode 100644
index 0000000..22e030b
--- /dev/null
+++ b/macros/ax_check_x86_features.m4
@@ -0,0 +1,77 @@
+# ===========================================================================
+#   http://www.gnu.org/software/autoconf-archive/ax_check_x86_features.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CHECK_X86_FEATURES([ACTION-IF-FOUND],[ACTION-IF-NOT-FOUND])
+#
+# DESCRIPTION
+#
+#   Checks whether the host cpu supports various x86 instruction sets; the
+#   instructions that will be tested are "mmx, popcnt, sse, sse2, sse3,
+#   sse4.1, sse4.2, sse4a, avx, avx2, avx512f, fma, fma4, bmi, bmi2". If an
+#   instruction set is supported by the host cpu, the C preprocessor macro
+#   HAVE_XXX_INSTRUCTIONS is set to 1. XXX is the up-cased instruction name
+#   with dots replaced by underscores. For example, the test for "sse4.2"
+#   would export HAVE_SSE4_2_INSTRUCTIONS=1. The compiler flag "-msse4.2"
+#   would also be added to the X86_FEATURE_CFLAGS variable, which can be
+#   obtained in Makefile.am using @X86_FEATURE_CFLAGS@.
+#
+#   If any of the tests for the instruction sets succeed, the configure
+#   script runs ACTION-IF-FOUND if it is specified, or appends
+#   X86_FEATURE_CFLAGS to CXXFLAGS. If none of the instructions are found,
+#   the ACTION-IF-NOT-FOUND hook is triggered.
+#
+#   This macro requires the gcc extended builtin functions "__builtin_cpu_init"
+#   and "__builtin_cpu_supports" to detect the cpu features. It will error
+#   out if the compiler does not have these builtins.
+#
+#   See also AX_GCC_X86_CPU_SUPPORTS, which is the actual macro that performs
+#   the checks for the instruction sets.
+#
+# LICENSE
+#
+#   Copyright (c) 2016 Felix Chern <idryman at gmail.com>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 2 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 1
+
+AC_DEFUN([AX_CHECK_X86_FEATURES],
+ [m4_foreach_w(
+   [ax_x86_feature],
+   [mmx popcnt sse sse2 sse3 sse4.1 sse4.2 sse4a avx avx2 avx512f fma fma4 bmi bmi2],
+   [AX_GCC_X86_CPU_SUPPORTS(ax_x86_feature,
+     [X86_FEATURE_CFLAGS="$X86_FEATURE_CFLAGS -m[]ax_x86_feature"],
+     [])
+  ])
+  AC_SUBST([X86_FEATURE_CFLAGS])
+  m4_ifval([$1],[$1],
+    [CXXFLAGS="$CXXFLAGS $X86_FEATURE_CFLAGS"])
+  $2
+])
diff --git a/macros/ax_gcc_x86_cpu_supports.m4 b/macros/ax_gcc_x86_cpu_supports.m4
new file mode 100644
index 0000000..a61a14a
--- /dev/null
+++ b/macros/ax_gcc_x86_cpu_supports.m4
@@ -0,0 +1,104 @@
+# ===========================================================================
+#  http://www.gnu.org/software/autoconf-archive/ax_gcc_x86_cpu_supports.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_GCC_X86_CPU_SUPPORTS(X86-INSTRUCTION-SET,
+#     [ACTION-IF-FOUND],[ACTION-IF-NOT-FOUND])
+#
+# DESCRIPTION
+#
+#   Checks whether the host cpu supports X86-INSTRUCTION-SET. The instruction
+#   sets that can be tested are "mmx, popcnt, sse, sse2, sse3, sse4.1, sse4.2,
+#   sse4a, avx, avx2, avx512f, fma, fma4, bmi, bmi2". If the instruction set
+#   is supported by the host cpu, the C preprocessor macro
+#   HAVE_XXX_INSTRUCTIONS is set to 1. XXX is the up-cased instruction name
+#   with dots replaced by underscores. For example, the test for "sse4.2"
+#   would export HAVE_SSE4_2_INSTRUCTIONS=1. This macro requires the gcc
+#   extended builtin functions "__builtin_cpu_init" and
+#   "__builtin_cpu_supports" to detect the cpu features. It will error out
+#   if the compiler does not have these builtins.
+#
+#   If the test for the instruction set succeeds, the ACTION-IF-FOUND hook
+#   runs. Otherwise the ACTION-IF-NOT-FOUND hook runs, if specified.
+#
+#   See also AX_CHECK_X86_FEATURES, which checks all the possible
+#   instruction sets and exports the corresponding CFLAGS.
+#
+# LICENSE
+#
+#   Copyright (c) 2016 Felix Chern <idryman at gmail.com>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 2 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 1
+
+AC_DEFUN_ONCE([_AX_GCC_X86_CPU_INIT],
+ [AC_LANG_PUSH([C])
+  AC_CACHE_CHECK([for gcc __builtin_cpu_init function],
+    [ax_cv_gcc_check_x86_cpu_init],
+    [AC_RUN_IFELSE(
+      [AC_LANG_PROGRAM([#include <stdlib.h>],
+        [__builtin_cpu_init ();])
+      ],
+      [ax_cv_gcc_check_x86_cpu_init=yes],
+      [ax_cv_gcc_check_x86_cpu_init=no])])
+  AS_IF([test "X$ax_cv_gcc_check_x86_cpu_init" = "Xno"],
+    [AC_MSG_ERROR([Need GCC to support X86 CPU features tests])])
+])
+
+AC_DEFUN([AX_GCC_X86_CPU_SUPPORTS],
+  [AC_REQUIRE([AC_PROG_CC])
+   AC_REQUIRE([_AX_GCC_X86_CPU_INIT])
+   AC_LANG_PUSH([C])
+   AS_VAR_PUSHDEF([gcc_x86_feature], [AS_TR_SH([ax_cv_gcc_x86_cpu_supports_$1])])
+   AC_CACHE_CHECK([for x86 $1 instruction support], 
+     [gcc_x86_feature],
+     [AC_RUN_IFELSE(
+       [AC_LANG_PROGRAM( [#include <stdlib.h> ], 
+       [ __builtin_cpu_init ();
+         if (__builtin_cpu_supports("$1"))
+           return 0;
+         return 1;
+        ])],
+        [gcc_x86_feature=yes],
+        [gcc_x86_feature=no]
+     )]
+   )
+   AC_LANG_POP([C])
+   AS_VAR_IF([gcc_x86_feature],[yes],
+         [AC_DEFINE(
+           AS_TR_CPP([HAVE_$1_INSTRUCTIONS]),
+           [1],
+           [Define if $1 instructions are supported])
+          $2],
+          [$3]
+         )
+   AS_VAR_POPDEF([gcc_x86_feature])
+])
diff --git a/macros/givaro-check.m4 b/macros/givaro-check.m4
index 3e688ff..9eb5fa5 100644
--- a/macros/givaro-check.m4
+++ b/macros/givaro-check.m4
@@ -54,7 +54,7 @@ dnl -------------- dnl
 
 dnl As we need Integer and Modular, should be updated on each interface changes
 version_min=40001
-version_max=40002
+version_max=40003
 
 dnl Check for existence
 
diff --git a/macros/simd-check.m4 b/macros/simd-check.m4
new file mode 100644
index 0000000..4aef1cb
--- /dev/null
+++ b/macros/simd-check.m4
@@ -0,0 +1,137 @@
+dnl Check for SIMD
+dnl  Copyright (c) 2011 FFLAS-FFPACK
+dnl Created by BB, 2014-03-25
+dnl modified by CP, 2016-07-11
+dnl ========LICENCE========
+dnl This file is part of the library FFLAS-FFPACK.
+dnl
+dnl FFLAS-FFPACK is free software: you can redistribute it and/or modify
+dnl it under the terms of the  GNU Lesser General Public
+dnl License as published by the Free Software Foundation; either
+dnl version 2.1 of the License, or (at your option) any later version.
+dnl
+dnl This library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with this library; if not, write to the Free Software
+dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+dnl ========LICENCE========
+dnl
+
+dnl FF_CHECK_SIMD
+dnl
+dnl turn on SSE4.1 AVX, AVX2 extensions if available
+
+AC_DEFUN([FF_CHECK_SIMD],
+[
+	AC_ARG_ENABLE(simd,[AC_HELP_STRING([--disable-simd], [ Disable vectorized instructions: SSE4.1, AVX, AVX2])])
+	AS_IF([ test  "x$enable_simd" != "xno" ],
+	[
+		AS_ECHO("SIMD enabled")
+		arch=`echo $target | cut -d"-" -f1`
+		# if we are on an x86 (32 or 64 bit) machine with gcc>=4.8 then run the AX_CHECK_X86_FEATURES macro
+		AS_IF([test "x$arch" = "xx86_64" -o "x$arch" = "xi686"],
+			    [archx86="yes"],
+			    [archx86="no"]
+		     )
+		AS_IF([ test  "x$CCNAM" != "xgcc48" -o "x$archx86" = "xno" ],
+		[
+		   CUSTOM_SIMD="yes"
+		   echo "Compiling with $CCNAM for a $arch target: running custom checks for SSE4.1 and AVX1,2"
+		   AC_MSG_CHECKING(for SSE 4.1)
+		   BACKUP_CXXFLAGS=${CXXFLAGS}
+		   SSEFLAGS="-msse4.1"
+		   CXXFLAGS="${BACKUP_CXXFLAGS} ${SSEFLAGS}"
+		   CODE_SSE=`cat macros/CodeChunk/sse.C`
+		   AC_TRY_RUN([ ${CODE_SSE} ],
+			      [ sse_found="yes" ],
+			       [ sse_found="no" ],
+			       [ 
+			       echo "cross compiling...disabling"
+				 sse_found="no"
+			       ])
+	           AS_IF([ test "x$sse_found" = "xyes" ],
+		   [
+			AC_DEFINE(HAVE_SSE4_1_INSTRUCTIONS,1,[Define if SSE is available])
+			AC_SUBST(SSEFLAGS)
+			AC_MSG_RESULT(yes)
+                   ],
+		   [
+			SSEFLAGS=""
+			AC_MSG_RESULT(no)
+		   ])
+		   CXXFLAGS=${BACKUP_CXXFLAGS}
+		   
+		   dnl Check for AVX
+		   AC_MSG_CHECKING(for AVX)
+		   CODE_AVX=`cat macros/CodeChunk/avx.C`
+		   dnl Intel compilers usually do not require option to enable avx
+		   dnl Thus, we test with no option on
+		   for switch_avxflags in "" "-mavx"; do
+		       CXXFLAGS="${BACKUP_CXXFLAGS} -O0 ${switch_avxflags}"
+		       AC_TRY_RUN([ ${CODE_AVX} ],
+		       [
+				avx_found="yes"
+		        	AVXFLAGS=${switch_avxflags}
+				break
+		       ],
+		       [ avx_found="no" ],
+		       [
+		        echo "cross compiling...disabling"
+		        avx_found="no"
+		        break
+		       ])
+		   done
+			
+		   dnl Is AVX found?
+		   AS_IF([ test "x$avx_found" = "xyes" ],
+		   [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_AVX_INSTRUCTIONS,1,[Define if AVX is available])
+			
+	                dnl Check for AVX2
+			AC_MSG_CHECKING(for AVX2)
+			for switch_avx2flags in "" "-mfma -mavx2"; do
+			    CXXFLAGS="${BACKUP_CXXFLAGS} -O0 ${switch_avx2flags}"
+			    AC_TRY_RUN(
+			    [
+			        #define __try_avx2
+				${CODE_AVX}
+			    ],
+			    [
+			        avx2_found="yes"
+			        AVX2FLAGS="${switch_avx2flags}"
+			        break
+		            ],
+			    [ avx2_found="no" ],
+			    [
+			        echo "cross compiling...disabling"
+			        avx2_found="no"
+			        break
+			    ])
+			done
+				
+	                dnl Is AVX2 found?
+			AS_IF([ test "x$avx2_found" = "xyes" ],
+			[
+				AC_MSG_RESULT(yes)
+				AC_DEFINE(HAVE_AVX2_INSTRUCTIONS,1,[Define if AVX2 is available])
+				AVXFLAGS=${AVX2FLAGS}
+			],
+			[ AC_MSG_RESULT(no) ]
+			)
+		    ],
+		    [
+			dnl No AVX
+		    	AC_MSG_RESULT(no)
+		    ])
+		
+		    CXXFLAGS=${BACKUP_CXXFLAGS}
+		],
+		[ ])
+	],[ AS_ECHO("SIMD disabled")
+	    CUSTOM_SIMD="yes" ])
+])
diff --git a/macros/sse2-check.m4 b/macros/sse2-check.m4
index 198299e..ea39874 100644
--- a/macros/sse2-check.m4
+++ b/macros/sse2-check.m4
@@ -26,15 +26,9 @@ dnl turn on  SSE4.1 extensions if available
 
 AC_DEFUN([FF_CHECK_SSE],
 		[
-		AC_ARG_ENABLE(sse,
-			[AC_HELP_STRING([--enable-sse],
-				[ Use Intel(r) SSE 4.1])
-			],
-			[ avec_sse=$enable_sse ],
-			[ avec_sse=yes ]
-			)
+		AC_ARG_ENABLE(sse,[AC_HELP_STRING([--disable-sse], [ Disable Intel(r) SSE 4.1])])
 		AC_MSG_CHECKING(for SSE 4.1)
-		AS_IF([ test  "x$avec_sse" != "xno" ],
+		AS_IF([ test  "x$enable_sse" != "xno" ],
 			[
 			BACKUP_CXXFLAGS=${CXXFLAGS}
 			dnl  SSEFLAGS="-msse2"
@@ -51,17 +45,24 @@ AC_DEFUN([FF_CHECK_SSE],
 				sse_found="no"
 				])
 			AS_IF([ test "x$sse_found" = "xyes" ],[
-				AC_DEFINE(USE_SSE,1,[Define if SSE is available])
+				AC_DEFINE(HAVE_SSE4_1_INSTRUCTIONS,1,[Define if SSE is available])
 				AC_SUBST(SSEFLAGS)
-				AC_MSG_RESULT(yes (SSE))
+				AC_MSG_RESULT(yes)
 				],
 				[
 				SSEFLAGS=""
+				dnl Forcing to disable AVX
+				enable_avx="no"
 				AC_MSG_RESULT(no)
 				]
 				)
 			CXXFLAGS=${BACKUP_CXXFLAGS}
 			],
-			[ AC_MSG_RESULT(no) ]
+			[
+			dnl --disable-sse
+			AC_MSG_RESULT(no [disabled])
+			dnl Forcing to disable AVX
+			enable_avx="no"
+			]
 	)
 	])
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 151b4c7..8f503a3 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -24,11 +24,13 @@ SUBDIRS = data
 check:
 	$(BASE_TESTS)
 
-AM_CPPFLAGS=-I$(top_srcdir)
-AM_CXXFLAGS = @TESTS_CFLAGS@
-AM_CPPFLAGS += $(OPTFLAGS)  -I$(top_srcdir)/fflas-ffpack/ -I$(top_srcdir)/fflas-ffpack/utils/ -I$(top_srcdir)/fflas-ffpack/fflas/  -I$(top_srcdir)/fflas-ffpack/ffpack  -I$(top_srcdir)/fflas-ffpack/field $(GIVARO_CFLAGS) $(CBLAS_FLAG) $(CUDA_CFLAGS) $(PARFLAGS) $(PRECOMPILE_FLAGS)
+AM_CPPFLAGS=-I$(top_srcdir) -g
+AM_CXXFLAGS = @TESTS_CFLAGS@ $(OPTFLAGS) $(GIVARO_CFLAGS) $(CBLAS_FLAG) $(CUDA_CFLAGS) $(PARFLAGS) $(PRECOMPILE_FLAGS)
+AM_CPPFLAGS +=  -I$(top_srcdir)/fflas-ffpack/ -I$(top_srcdir)/fflas-ffpack/utils/ -I$(top_srcdir)/fflas-ffpack/fflas/  -I$(top_srcdir)/fflas-ffpack/ffpack  -I$(top_srcdir)/fflas-ffpack/field 
+
+#LDADD = $(CBLAS_LIBS) $(GIVARO_LIBS) $(CUDA_LIBS) $(PARFLAGS) $(PRECOMPILE_LIBS)
+AM_LDFLAGS=-static $(PARFLAGS) #-L$(prefix)/lib   -lfflas -lffpack -lfflas_c -lffpack_c
 
-AM_LDFLAGS=-static  #-L$(prefix)/lib   -lfflas -lffpack -lfflas_c -lffpack_c
 
 EXTRA_DIST= test-utils.h
 
@@ -40,25 +42,35 @@ BASIC_TESTS =               \
 		test-echelon        \
 		test-rankprofiles   \
 		test-compressQ      \
+		test-permutations   \
 		test-fadd           \
 		test-finit          \
 		test-fscal          \
 		test-fgemm          \
+		test-pluq-check     \
+		test-fgemm-check    \
+		test-ftrsm-check    \
+		test-invert-check   \
+		test-charpoly-check \
 		test-fger           \
 		test-ftrsm          \
 		test-multifile      \
+		test-maxdelayeddim \
 		regression-check
 
 if FFLASFFPACK_PRECOMPILED
-LDADD = $(CBLAS_LIBS) $(GIVARO_LIBS) $(CUDA_LIBS) $(PARFLAGS) \
+LDADD = $(CBLAS_LIBS) $(GIVARO_LIBS) $(CUDA_LIBS) $(PARLIBS) \
 	$(top_builddir)/fflas-ffpack/interfaces/libs/libfflas.la \
 	$(top_builddir)/fflas-ffpack/interfaces/libs/libffpack.la
+
 INTERFACE_TESTS= test-interfaces-c 
-test_interfaces_c_LDFLAGS = $(LDADD) \
+test_interfaces_c_LDADD = \
 	$(top_builddir)/fflas-ffpack/interfaces/libs/libfflas_c.la \
-	$(top_builddir)/fflas-ffpack/interfaces/libs/libffpack_c.la
+	$(top_builddir)/fflas-ffpack/interfaces/libs/libffpack_c.la \
+	-lm -lstdc++
+
 else
-LDADD = $(CBLAS_LIBS) $(GIVARO_LIBS) $(CUDA_LIBS) $(PARFLAGS)
+LDADD = $(CBLAS_LIBS) $(GIVARO_LIBS) $(CUDA_LIBS) $(PARLIBS)
 endif
 NOT_A_TEST =  \
 		test-lqup2             \
@@ -96,9 +108,16 @@ CLEANFILES =           \
 TESTS =     $(EXTRA_PROGRAMS)
 
 test_compressQ_SOURCES         = test-compressQ.C
+test_permutations_SOURCES         = test-permutations.C
+
 test_lu_SOURCES              = test-lu.C
 #test_lqup2_SOURCES              = test-lqup2.C
 test_det_SOURCES               = test-det.C
+test_pluq_check_SOURCES = test-pluq-check.C
+test_fgemm_check_SOURCES = test-fgemm-check.C
+test_ftrsm_check_SOURCES = test-ftrsm-check.C
+test_invert_check_SOURCES = test-invert-check.C
+test_charpoly_check_SOURCES = test-charpoly-check.C
 test_echelon_SOURCES           = test-echelon.C
 test_rankprofiles_SOURCES           = test-rankprofiles.C
 test_fgemm_SOURCES             = test-fgemm.C
@@ -132,13 +151,13 @@ test_fadd_SOURCES = test-fadd.C
 test_fscal_SOURCES = test-fscal.C
 test_finit_SOURCES = test-finit.C
 test_interfaces_c_SOURCES = test-interfaces-c.c
+test_maxdelayeddim_SOURCES = test-maxdelayeddim.C
 #test_interfaces_c_CFLAGS= -std=c11 -I/$(prefix)/include $(AM_CPPFLAGS) $(AM_CXXFLAGS) $(PARFLAGS)
 #test_interfaces_c_LDFLAGS= $(LDFLAGS) $(LDADD) $(AM_LDFLAGS) -L/$(prefix)/lib/ -lfflas_c -lffpack_c -lstdc++
 #  test_fspmv_SOURCES = test-fspmv.C
 
 regression_check_SOURCES = regression-check.C
 
-
 dense_generator: dense_generator.C
 	$(CXX) $(CXXFLAGS) $(AM_CXXFLAGS) dense_generator.C -o dense_generator
 
@@ -151,7 +170,7 @@ perfpublisher:
 # for compilation of new tests
 FFLASFFPACK_BIN=@bindir@
 
-new_examp_comp = $(CXX) $(CXXFLAGS) $(AM_CXXFLAGS)  ${INCLUDES} $(AM_CPPFLAGS) $*.C -o $@ $(LDFLAGS) $(LDADD) $(LOADLIBES)
+new_examp_comp = $(CXX) $(CXXFLAGS) $(AM_CXXFLAGS)  ${INCLUDES} $(AM_CPPFLAGS) $^ -o $@ $(LDFLAGS) $(LDADD) $(LOADLIBES)
 
 %:%.C
 	$(new_examp_comp)
diff --git a/tests/jenkins-maker.sh b/tests/jenkins-maker.sh
new file mode 100755
index 0000000..203ba19
--- /dev/null
+++ b/tests/jenkins-maker.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# This file is part of the FFLAS-FFPACK library.
+# It is distributed under the terms of the LGPL licence version 2.1 or later 
+# (see COPYING)
+# Created by AB - 2014/12/03
+# Modified by AC - 2016/06/20
+# Modified by CP - 2016/06/22
+# Some influential environment variables:
+#	CXX			C++ compiler command
+#	CXXFLAGS	C++ compiler flags
+
+# Note: This script is intended to be launched
+# by the Jenkins web interface whenever it needs
+# to compile the project.
+# It is launched from the svn:trunk root directory,
+# but should be stored in /<slave_jenkins_path>/makers/
+
+SOURCE_DIRECTORY=$( cd "$( dirname "$0" )" && pwd )
+
+#=============================#
+# Change only these variables #
+#=============================#
+CXX=`pwd | awk -F/ '{print $(NF-2)}'`
+SSE=`pwd | awk -F/ '{print $NF}'`
+
+# Job fflas-ffpack with SSE option flag;
+# by default, SSE is enabled.
+if [ "$SSE" == "withoutSSE" ]; then
+  FFLAS_SSEFLAG="--disable-simd"
+fi
+
+JENKINS_DIR=${SOURCE_DIRECTORY%%/workspace/*}
+LOCAL_DIR="$JENKINS_DIR"/local
+# Add path to compilers (if needed)
+export PATH=$PATH:/usr/local/bin:"$LOCAL_DIR/$CXX/bin"
+echo $PATH
+
+# Where the BLAS libraries are installed (<blas_home>/lib/<blas_name>.so)
+# and what they are called (libtotoblas)
+BLAS_HOME="$LOCAL_DIR/$CXX"
+BLAS_NAME=openblas
+
+# Change these if necessary
+
+BLAS_LIBS="-L$BLAS_HOME/lib/ -l$BLAS_NAME"
+BLAS_CFLAGS=-I"$BLAS_HOME"/include
+
+# Where to install fflas-ffpack binaries
+# Keep default for local installation.
+PREFIX_INSTALL="$LOCAL_DIR/$CXX/$SSE"
+
+# Add specific locations (if needed)
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH":/usr/local/lib:"$LOCAL_DIR/$CXX/lib":"$PREFIX_INSTALL"/lib
+echo "LD_LIBRARY_PATH = ${LD_LIBRARY_PATH}"
+export PKG_CONFIG_PATH=${PKG_CONFIG_PATH}:"$LOCAL_DIR/$CXX/lib/pkgconfig"
+echo "PKG_CONFIG_PATH = ${PKG_CONFIG_PATH}"
+# /!\ Warning /!\ This could be an issue if you changed
+# the local installation directory
+rm -rf "$PREFIX_INSTALL"/bin/fflas-ffpack* "$PREFIX_INSTALL"/include/fflas-ffpack*
+
+#================#
+# Setup Variables#
+#================#
+
+if [ "$CXX" == "icpc" ]; then
+     distribution=`uname -m`
+     CC=icc
+     if [ "$distribution" == "i686" ]; then 	
+	source /usr/local/bin/compilervars.sh ia32
+     else
+	source /usr/local/bin/compilervars.sh intel64
+     fi
+fi
+
+# Particular case for Fedora23: g++=g++-5.3
+vm_name=`uname -n | cut -d"-" -f1`
+if [[ "$vm_name" == "fedora"  &&  "$CXX" == "g++-5.3" ]]; then
+   CXX="g++"
+   CC=gcc
+fi
+if [ -z "$CC" ]; then
+    if [[ $CXX == g++* ]]; then
+        CC=`echo $CXX | sed -re 'y/++/cc/'`
+    else
+        CC="clang"
+    fi
+fi 
+#==================================#
+# Automated installation and tests #
+#==================================#
+
+echo "|=== JENKINS AUTOMATED SCRIPT ===| ./autogen.sh CXX=$CXX CC=$CC --prefix=$PREFIX_INSTALL --with-blas-libs=$BLAS_LIBS --enable-optimization --enable-precompilation $FFLAS_SSEFLAG"
+./autogen.sh CXX=$CXX CC=$CC --prefix="$PREFIX_INSTALL" --with-blas-libs="$BLAS_LIBS" --enable-optimization --enable-precompilation "$FFLAS_SSEFLAG"
+V="$?"; if test "x$V" != "x0"; then exit "$V"; fi
+
+echo "|=== JENKINS AUTOMATED SCRIPT ===| make prefix=$PREFIX_INSTALL install"
+make install
+V="$?"; if test "x$V" != "x0"; then exit "$V"; fi
+
+echo "|=== JENKINS AUTOMATED SCRIPT ===| make perfpublisher"
+make perfpublisher
+
+
diff --git a/tests/perfpublisher.sh b/tests/perfpublisher.sh
index 2c3c452..40ed46d 100755
--- a/tests/perfpublisher.sh
+++ b/tests/perfpublisher.sh
@@ -8,12 +8,25 @@ XMLFILE=$1
 tests=$2
 COMPILER=$3
 
+# choose gdate on OS X
+if command -v "gdate" >/dev/null; then
+    DATE=gdate
+else
+    DATE=date
+fi
+
 #=================#
 # Plateform infos #
 #=================#
 
 COMPILERVERSION=$($COMPILER --version 2>&1 | head -1)
-CPUFREQ=$(lscpu | grep "MHz" | rev | cut -f1 -d' ' | rev)
+
+if command -v "lscpu" >/dev/null; then
+    CPUFREQ=$(lscpu | grep "MHz" | rev | cut -f1 -d' ' | rev)
+else
+    CPUFREQ=$((`sysctl -n hw.cpufrequency`/1000000))
+fi
+
 ARCH=$(uname -m)
 OSNAME=$(uname -s)
 OSVERSION=$(uname -r)
@@ -45,8 +58,8 @@ echo '<report name="tests-report" categ="tests">' >> $XMLFILE
 #=======#
 
 echo '<start>' >> $XMLFILE
-echo '<date format="YYYYMMDD" val="'$(date +%Y%m%d)'" />' >> $XMLFILE
-echo '<time format="HHMMSS" val="'$(date +%H%M%S)'" />' >> $XMLFILE
+echo '<date format="YYYYMMDD" val="'$($DATE +%Y%m%d)'" />' >> $XMLFILE
+echo '<time format="HHMMSS" val="'$($DATE +%H%M%S)'" />' >> $XMLFILE
 echo '</start>' >> $XMLFILE
 
 #=======#
@@ -59,9 +72,9 @@ do
 	then
 		#File does not exist: compile it
 		echo '[Compiling]' $test
-		COMPILESTART=$(date +%s%3N)
+		COMPILESTART=$($DATE +%s%3N)
 		COMPILELOG=$(make $test 2>&1; echo 'Returned state: '$?)
-		COMPILEEND=$(date +%s%3N)
+		COMPILEEND=$($DATE +%s%3N)
 		COMPILETIME=$(($COMPILEEND - $COMPILESTART))
 		COMPILECHECK=$(echo $COMPILELOG | grep -o '[^ ]*$')
 		COMPILETIMERELEVANT='true'
@@ -92,9 +105,9 @@ do
 		#Compilation success
 		echo '[Executing]' $test
 		EXECUTED='yes'
-		EXECUTIONSTART=$(date +%s%3N)
+		EXECUTIONSTART=$($DATE +%s%3N)
 		EXECUTIONLOG=$(./$test  2>&1; echo 'Returned state: '$?)
-		EXECUTIONEND=$(date +%s%3N)
+		EXECUTIONEND=$($DATE +%s%3N)
 		EXECUTIONTIME=$(($EXECUTIONEND - $EXECUTIONSTART))
 		EXECUTIONCHECK=$(echo $EXECUTIONLOG | grep -o '[^ ]*$')
 		
diff --git a/tests/test-charpoly-check.C b/tests/test-charpoly-check.C
new file mode 100644
index 0000000..83bc708
--- /dev/null
+++ b/tests/test-charpoly-check.C
@@ -0,0 +1,106 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+
+/*
+ * Copyright (C) 2015 the FFLAS-FFPACK group
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ *
+ * This file is Free Software and part of FFLAS-FFPACK.
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *
+ */
+
+//--------------------------------------------------------------------------
+//          Test for Checker_charpoly
+//--------------------------------------------------------------------------
+
+#define ENABLE_ALL_CHECKINGS 1
+
+
+#include <iostream>
+#include <stdlib.h>
+#include <time.h>
+#include "fflas-ffpack/fflas-ffpack.h"
+#include "fflas-ffpack/utils/args-parser.h"
+
+
+template <class Field, class Polynomial>
+void printPolynomial (const Field &F, Polynomial &v)
+{
+	for (int i = v.size() - 1; i >= 0; i--) {
+		F.write (std::cout, v[i]);
+		if (i > 0)
+			std::cout << " x^" << i << " + ";
+	}
+	std::cout << std::endl;
+}
+
+int main(int argc, char** argv) {
+	srand (time(NULL));
+	typedef Givaro::ModularBalanced<double> Field;
+	Givaro::Integer q = 131071;
+	size_t iter = 3;
+    size_t MAXN = 100;
+    size_t n = 0;
+	
+	Argument as[] = {
+		{ 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q },
+		{ 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter },
+		{ 'n', "-n N", "Set the size of the matrix.", TYPE_INT , &n },
+		END_OF_ARGUMENTS
+	};
+	FFLAS::parseArguments(argc,argv,as);
+
+	Field F(q);
+	Field::RandIter Rand(F);
+    typedef std::vector<Field::Element> Polynomial;
+
+	size_t pass = 0;
+	for (size_t i=0; i<iter; ++i) {
+        
+		n = n?n: rand() % MAXN + 1;
+// 		std::cout << "n= " << n << "\n";
+        Field::Element_ptr A = FFLAS::fflas_new(F,n,n);
+
+		Polynomial g(n);
+
+		PAR_BLOCK { FFLAS::pfrand(F,Rand, n,n,A,n/MAX_THREADS); }
+		try {
+			//write_field(F,std::cerr<<"A=",A,n,n,n,true) <<std::endl;
+// 			FFPACK::Checker_charpoly<Field,Polynomial> checker(F,n,A);
+            Givaro::Timer charpolytime; charpolytime.start();
+			FFPACK::CharPoly(F,g,n,A,n,FFPACK::FfpackLUK);
+            charpolytime.stop();
+            std::cerr << "CHARPol time:" << charpolytime << std::endl;
+			//printPolynomial(F,g);
+// 			checker.check(g);
+			std::cout << n << 'x' << n << " charpoly verification successful\n";
+			pass++;
+		} catch(FailureCharpolyCheck &e) {
+			std::cout << n << 'x' << n << " charpoly verification failed!\n";
+		}
+		FFLAS::fflas_delete( A);
+		
+	}
+
+	std::cout << pass << "/" << iter << " tests were successful.\n";	
+
+	return 0;
+}
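For context, a property that any computed characteristic polynomial must satisfy follows from the Cayley-Hamilton theorem: g(A) = 0. The standalone sketch below checks that necessary condition for a 2x2 matrix over Z/pZ; it only illustrates the relation being certified and is not the (here commented-out) Checker_charpoly algorithm.

    // Cayley-Hamilton sanity check for a 2x2 matrix over Z/pZ: the characteristic
    // polynomial x^2 - tr(A)x + det(A) must annihilate A. Illustration only; this
    // is not the FFPACK Checker_charpoly implementation.
    #include <cstdint>
    #include <cstdlib>
    #include <iostream>

    static const int64_t p = 131071;
    static int64_t modp(int64_t x) { return ((x % p) + p) % p; }

    int main() {
        int64_t a = rand() % p, b = rand() % p, c = rand() % p, d = rand() % p;
        int64_t tr  = modp(a + d);          // trace: (negated) coefficient of x
        int64_t det = modp(a * d - b * c);  // determinant: constant coefficient

        // A^2 - tr(A)*A + det(A)*I, entry by entry; every entry must be 0 mod p.
        int64_t e00 = modp(modp(a*a + b*c) - modp(tr*a) + det);
        int64_t e01 = modp(modp(a*b + b*d) - modp(tr*b));
        int64_t e10 = modp(modp(c*a + d*c) - modp(tr*c));
        int64_t e11 = modp(modp(c*b + d*d) - modp(tr*d) + det);

        bool ok = (e00 | e01 | e10 | e11) == 0;
        std::cout << (ok ? "charpoly annihilates A" : "check failed") << std::endl;
        return ok ? 0 : 1;
    }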
diff --git a/tests/test-charpoly.C b/tests/test-charpoly.C
index 9fe71d0..51a0c48 100644
--- a/tests/test-charpoly.C
+++ b/tests/test-charpoly.C
@@ -33,6 +33,8 @@
 // Clement Pernet
 //-------------------------------------------------------------------------
 
+#define ENABLE_ALL_CHECKINGS 1
+
 #include <iomanip>
 #include <iostream>
 #include "fflas-ffpack/field/modular-balanced.h"
@@ -116,8 +118,8 @@ int main(int argc, char** argv)
 
 	static Argument as[] = {
 		{ 'p', "-p P", "Set the field characteristic.", TYPE_INT , &p },
-		{ 'n', "-n N", "Set the size of the matrix.", TYPE_INT , &p },
-		{ 'r', "-r R", "Set number of repetitions.", TYPE_INT , &nbit },
+		{ 'n', "-n N", "Set the size of the matrix.", TYPE_INT , &n },
+		{ 'i', "-i I", "Set number of repetitions.", TYPE_INT , &nbit },
 		{ 'f', "-f file", "Set input file", TYPE_STR, &file },
 		{ 'a', "-a algorithm", "Set the algorithm variant", TYPE_INT, &variant },
 		END_OF_ARGUMENTS
diff --git a/tests/test-fgemm-check.C b/tests/test-fgemm-check.C
new file mode 100644
index 0000000..5d6facb
--- /dev/null
+++ b/tests/test-fgemm-check.C
@@ -0,0 +1,102 @@
+/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+
+/*
+ * Copyright (C) 2015 the FFLAS-FFPACK group
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ *
+ * This file is Free Software and part of FFLAS-FFPACK.
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *
+ */
+
+//--------------------------------------------------------------------------
+//          Test for Checker_fgemm
+//--------------------------------------------------------------------------
+
+#define ENABLE_ALL_CHECKINGS 1
+
+#include <iostream>
+#include <stdlib.h>
+#include <time.h>
+#include "fflas-ffpack/fflas-ffpack.h"
+#include "fflas-ffpack/utils/args-parser.h"
+
+int main(int argc, char** argv) {
+	srand (time(NULL));
+	typedef Givaro::Modular<double> Field;
+	Givaro::Integer q = 131071;
+	size_t iter = 3;
+	
+	Argument as[] = {
+		{ 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q },
+		{ 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter },
+		END_OF_ARGUMENTS
+	};
+	FFLAS::parseArguments(argc,argv,as);
+
+	Field F(q);
+	Field::RandIter Rand(F);
+	FFLAS::FFLAS_TRANSPOSE ta,tb;
+
+	size_t pass = 0;
+	for (size_t i=0; i<iter; ++i) {
+
+		size_t m = rand() % 1000 + 1;
+		size_t n = rand() % 1000 + 1;
+		size_t k = rand() % 1000 + 1;
+		std::cout << "m= " << m << "    n= " << n << "    k= " << k << "\n";
+
+		typename Field::Element alpha,beta;
+		F.init(alpha); Rand.random(alpha);
+		F.init(beta);  Rand.random(beta);
+		
+		ta = /*rand()%2 ? */FFLAS::FflasNoTrans /*: FFLAS::FflasTrans*/,
+		tb = /*rand()%2 ? */FFLAS::FflasNoTrans /*: FFLAS::FflasTrans*/;
+
+		size_t lda = ta == FFLAS::FflasNoTrans ? k : m,
+			   ldb = tb == FFLAS::FflasNoTrans ? n : k,
+			   ldc = n;
+
+		Field::Element_ptr A = FFLAS::fflas_new(F,m,k);
+		Field::Element_ptr B = FFLAS::fflas_new(F,k,n);
+		Field::Element_ptr C = FFLAS::fflas_new(F,m,n);
+
+		PAR_BLOCK { FFLAS::pfrand(F,Rand, m,k,A,m/MAX_THREADS); }
+		PAR_BLOCK { FFLAS::pfrand(F,Rand, k,n,B,k/MAX_THREADS); }
+		PAR_BLOCK { FFLAS::pfrand(F,Rand, m,n,C,n/MAX_THREADS); }
+
+		FFLAS::Checker_fgemm<Field> checker(F,m,n,k,beta,C,ldc);
+		FFLAS::fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
+		try {
+			checker.check(ta,tb,alpha,A,lda,B,ldb,C);
+			std::cout << "Verification successful\n";
+			pass++;
+		} catch (FailureFgemmCheck &e) {
+			std::cout << "Verification failed!\n";
+		}
+
+		FFLAS::fflas_delete(A,B,C);
+	}
+
+	std::cout << pass << "/" << iter << " tests were successful.\n";
+
+	return 0;
+}
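Note that the checker is constructed from beta and the input C before fgemm runs and only inspects the output afterwards, so verification is much cheaper than recomputing the product. The standalone sketch below shows a Freivalds-style random-projection check over Z/pZ of the same relation C = beta*C0 + alpha*A*B; it is an assumption about the general idea only and does not reproduce the Checker_fgemm implementation or its API.

    // Freivalds-style verification over Z/pZ: capture w = C0*v for a random
    // vector v before the product, then test C*v == beta*w + alpha*A*(B*v).
    // Assumption: this mirrors the idea behind FFLAS::Checker_fgemm; it is not
    // the library code.
    #include <cstdint>
    #include <cstdlib>
    #include <iostream>
    #include <vector>

    static const int64_t p = 131071;
    static int64_t modp(int64_t x) { return ((x % p) + p) % p; }

    // y <- M*v mod p, with M an m x n row-major matrix
    static std::vector<int64_t> matvec(const std::vector<int64_t>& M, size_t m,
                                       size_t n, const std::vector<int64_t>& v) {
        std::vector<int64_t> y(m, 0);
        for (size_t i = 0; i < m; ++i)
            for (size_t j = 0; j < n; ++j)
                y[i] = modp(y[i] + M[i*n + j] * v[j]);
        return y;
    }

    int main() {
        const size_t m = 4, k = 3, n = 5;
        const int64_t alpha = 7, beta = 3;
        std::vector<int64_t> A(m*k), B(k*n), C0(m*n), v(n);
        for (auto& x : A)  x = rand() % p;
        for (auto& x : B)  x = rand() % p;
        for (auto& x : C0) x = rand() % p;
        for (auto& x : v)  x = rand() % p;

        // "Checker" state captured before the product
        std::vector<int64_t> w = matvec(C0, m, n, v);

        // The operation being checked: C = beta*C0 + alpha*A*B
        std::vector<int64_t> C(m*n);
        for (size_t i = 0; i < m; ++i)
            for (size_t j = 0; j < n; ++j) {
                int64_t s = 0;
                for (size_t l = 0; l < k; ++l) s = modp(s + A[i*k + l] * B[l*n + j]);
                C[i*n + j] = modp(beta * C0[i*n + j] + alpha * s);
            }

        // Verification with three matrix-vector products instead of a full product
        std::vector<int64_t> lhs = matvec(C, m, n, v);
        std::vector<int64_t> Bv  = matvec(B, k, n, v);
        std::vector<int64_t> ABv = matvec(A, m, k, Bv);
        bool ok = true;
        for (size_t i = 0; i < m; ++i)
            ok &= (lhs[i] == modp(beta * w[i] + alpha * ABv[i]));
        std::cout << (ok ? "check passed" : "check failed") << std::endl;
        return ok ? 0 : 1;
    }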
diff --git a/tests/test-fgemm.C b/tests/test-fgemm.C
index e509e5e..63569e1 100644
--- a/tests/test-fgemm.C
+++ b/tests/test-fgemm.C
@@ -28,7 +28,6 @@
  *.
  */
 
-
 // #ifndef NEWINO
 // #define NEWWINO
 // #endif
@@ -37,10 +36,15 @@
 // #define OLD_DYNAMIC_PEELING
 //#define DEBUG 1
 
+#define ENABLE_CHECKER_fgemm 1
+
 #include "fflas-ffpack/fflas-ffpack-config.h"
+
 #include <iomanip>
 #include <iostream>
+
 #include <givaro/modular.h>
+ 
 #include <givaro/udl.h>
 #include <recint/rint.h>
 
@@ -57,6 +61,7 @@
 
 using namespace std;
 using namespace FFPACK;
+
 using Givaro::Modular;
 using Givaro::ModularBalanced;
 
@@ -163,6 +168,7 @@ bool launch_MM(const Field & F,
 			   bool par, 
 			   size_t b)
 {
+
 	bool ok = true;
 
 	typedef typename Field::Element_ptr Element_ptr;
@@ -181,7 +187,7 @@ bool launch_MM(const Field & F,
 		}
 		else {
 			FFLASFFPACK_check(lda >= m);
-			A = FFLAS::fflas_new (F, k, lda);
+			A = FFLAS::fflas_new (F, k, lda); 
 			FFLAS::fzero(F,k,lda,A,lda);
 			RandomMatrix(F,A,k,m,lda,b);
 		}
@@ -205,7 +211,7 @@ bool launch_MM(const Field & F,
 				FFLAS::fgemm (F, ta, tb,m,n,k,alpha, A,lda, B,ldb, beta,C,ldc,WH);
 			}
 		}else{
-			FFLAS::MMHelper<Field,FFLAS::MMHelperAlgo::Auto> WH(F,nbw,FFLAS::ParSeqHelper::Sequential());
+			FFLAS::MMHelper<Field,FFLAS::MMHelperAlgo::Auto,typename FFLAS::ModeTraits<Field>::value> WH(F,nbw,FFLAS::ParSeqHelper::Sequential());
 			FFLAS::fgemm (F, ta, tb,m,n,k,alpha, A,lda, B,ldb, beta,C,ldc,WH);
 		}
 		ok &= check_MM(F, D, ta, tb,m,n,k,alpha, A,lda, B,ldb, beta,C,ldc);
@@ -291,7 +297,7 @@ bool run_with_field (Givaro::Integer q, uint64_t b, int m, int n, int k, int nbw
 	bool ok = true ;
 
 	int nbit=(int)iters;
-
+	
 	while (ok &&  nbit){
 		typedef typename Field::Element Element ;
 		// choose Field
@@ -410,14 +416,17 @@ int main(int argc, char** argv)
 		ok &= run_with_field<ModularBalanced<float> >(q,b,m,n,k,nbw,iters,p);
 		ok &= run_with_field<Modular<int32_t> >(q,b,m,n,k,nbw,iters,p);
 		ok &= run_with_field<ModularBalanced<int32_t> >(q,b,m,n,k,nbw,iters,p);
-		ok &= run_with_field<Modular<RecInt::rint<7> > >(q,b?b:63_ui64,m,n,k,nbw,iters, p);
-		ok &= run_with_field<Modular<RecInt::rint<8> > >(q,b?b:127_ui64,m,n,k,nbw,iters, p);
 		ok &= run_with_field<Modular<int64_t> >(q,b,m,n,k,nbw,iters, p);
 		ok &= run_with_field<ModularBalanced<int64_t> >(q,b,m,n,k,nbw,iters, p);
+		ok &= run_with_field<Modular<RecInt::rint<7> > >(q,b?b:63_ui64,m,n,k,nbw,iters, p);
+		ok &= run_with_field<Modular<RecInt::rint<8> > >(q,b?b:127_ui64,m,n,k,nbw,iters, p);		
 		ok &= run_with_field<Modular<Givaro::Integer> >(q,(b?b:512_ui64),m,n,k,nbw,iters,p);
 		ok &= run_with_field<Givaro::ZRing<Givaro::Integer> >(0,(b?b:512_ui64),m,n,k,nbw,iters,p);
 
 	} while (loop && ok);
 
+
+	
+	
 	return !ok ;
 }
diff --git a/tests/test-fger.C b/tests/test-fger.C
index 1da79ea..3f5ffd1 100644
--- a/tests/test-fger.C
+++ b/tests/test-fger.C
@@ -34,7 +34,7 @@
 // Clement Pernet
 //-------------------------------------------------------------------------
 
-#define DEBUG 
+// #define DEBUG 
 #define TIME 1
 
 #include "fflas-ffpack/fflas-ffpack-config.h"
diff --git a/tests/test-ftrsm-check.C b/tests/test-ftrsm-check.C
new file mode 100644
index 0000000..53f00f0
--- /dev/null
+++ b/tests/test-ftrsm-check.C
@@ -0,0 +1,110 @@
+/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+
+/*
+ * Copyright (C) 2015 the FFLAS-FFPACK group
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ *
+ * This file is Free Software and part of FFLAS-FFPACK.
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *
+ */
+
+//--------------------------------------------------------------------------
+//          Test for Checker_ftrsm
+//--------------------------------------------------------------------------
+#define ENABLE_ALL_CHECKINGS 1
+
+#include <iostream>
+#include <stdlib.h>
+#include <time.h>
+#include "fflas-ffpack/fflas-ffpack.h"
+#include "fflas-ffpack/utils/args-parser.h"
+
+int main(int argc, char** argv) {
+	srand (time(NULL));
+	typedef Givaro::Modular<double> Field;
+	Givaro::Integer q = 131071;
+	size_t iter = 3;
+	size_t MAXN = 100;
+    size_t seed(0);
+
+	Argument as[] = {
+		{ 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q },
+		{ 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter },
+		{ 'n', "-n N", "Set the size of the matrix.", TYPE_INT , &MAXN },
+        { 's', "-s N", "Set the seed.", TYPE_INT , &seed },
+		END_OF_ARGUMENTS
+	};
+	FFLAS::parseArguments(argc,argv,as);	
+
+	Field F(q); Field::RandIter G(F,0,seed);
+    srandom(seed);
+
+	typename Field::Element alpha,tmp;
+	Field::RandIter Rand(F);
+	Field::NonZeroRandIter NZRand(Rand);
+
+	size_t pass = 0;
+	for (size_t i=0; i<iter; ++i) {
+
+		size_t m = random() % MAXN + 1;
+		size_t n = random() % MAXN + 1;
+		std::cout << "m= " << m << "    n= " << n << "\n";
+		Rand.random(alpha);
+		FFLAS::FFLAS_SIDE side = rand()%2?FFLAS::FflasLeft:FFLAS::FflasRight;
+		FFLAS::FFLAS_UPLO uplo = rand()%2?FFLAS::FflasLower:FFLAS::FflasUpper;
+		FFLAS::FFLAS_TRANSPOSE trans = rand()%2?FFLAS::FflasNoTrans:FFLAS::FflasTrans;
+		FFLAS::FFLAS_DIAG diag = rand()%2?FFLAS::FflasNonUnit:FFLAS::FflasUnit;
+		size_t k = (side==FFLAS::FflasLeft?m:n);
+		//std::cout << alpha << "  " << side << "  " << uplo << "  " << trans << "  " << diag << "  \n";
+
+		Field::Element_ptr X = FFLAS::fflas_new(F,m,n);
+		Field::Element_ptr A = FFLAS::fflas_new(F,k,k);
+
+		PAR_BLOCK { FFLAS::pfrand(F,Rand, m,n,X,m/MAX_THREADS); }
+		//write_field(F,std::cerr<<"X:=",X,m,n,n,true) <<std::endl;
+
+		for (size_t i=0;i<k;++i){
+			for (size_t j=0;j<i;++j)
+				A[i*k+j]= (uplo == FFLAS::FflasLower)? Rand.random(tmp) : F.zero;
+			A[i*k+i]= (diag == FFLAS::FflasNonUnit)? NZRand.random(tmp) : F.one;
+			for (size_t j=i+1;j<k;++j)
+				A[i*k+j]= (uplo == FFLAS::FflasUpper)? Rand.random(tmp) : F.zero;
+		}
+		//write_field(F,std::cerr<<"A:=",A,k,k,k,true) <<std::endl;
+
+		FFLAS::Checker_ftrsm<Field> checker(G, m, n, alpha, X, n);
+		FFLAS::ftrsm(F, side, uplo, trans, diag, m, n, alpha, A, k, X, n);
+		try {
+			checker.check(side, uplo, trans, diag, m, n, A, k, X, n);
+			std::cout << "Verification successful\n";
+			pass++;
+		} catch(FailureTrsmCheck &e) {
+			std::cout << "Verification failed!\n";
+		}
+
+		FFLAS::fflas_delete(X,A);
+	}
+
+	std::cout << pass << "/" << iter << " tests were successful.\n";
+
+	return 0;
+}
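Here the checker captures the right-hand side before ftrsm overwrites it in place with the solution, and check() then confirms that the returned X actually solves the triangular system. The sketch below spells out the certified relation op(A)*X = alpha*B for the simplest case (left side, lower triangular, non-unit, no transpose) with a single right-hand side over Z/pZ; it illustrates the property being checked, not the library's verification code.

    // Solve A*x = alpha*b by forward substitution for a lower, non-unit triangular
    // A over Z/pZ (p = 131071 is prime), then verify by multiplying back.
    // Illustration only; not the Checker_ftrsm algorithm.
    #include <cstdint>
    #include <cstdlib>
    #include <iostream>
    #include <vector>

    static const int64_t p = 131071;
    static int64_t modp(int64_t x) { return ((x % p) + p) % p; }
    static int64_t powmod(int64_t b, int64_t e) {              // b^e mod p
        int64_t r = 1; b = modp(b);
        for (; e; e >>= 1, b = modp(b * b)) if (e & 1) r = modp(r * b);
        return r;
    }
    static int64_t inv(int64_t a) { return powmod(a, p - 2); } // p prime (Fermat)

    int main() {
        const size_t k = 5;
        const int64_t alpha = 11;
        std::vector<int64_t> A(k*k, 0), b(k), x(k);
        for (size_t i = 0; i < k; ++i) {
            for (size_t j = 0; j < i; ++j) A[i*k + j] = rand() % p; // strict lower part
            A[i*k + i] = rand() % (p - 1) + 1;                      // non-zero diagonal
            b[i] = rand() % p;
        }
        // Forward substitution: x[i] = (alpha*b[i] - sum_{j<i} A[i][j]*x[j]) / A[i][i]
        for (size_t i = 0; i < k; ++i) {
            int64_t s = modp(alpha * b[i]);
            for (size_t j = 0; j < i; ++j) s = modp(s - A[i*k + j] * x[j]);
            x[i] = modp(s * inv(A[i*k + i]));
        }
        // Verification: A*x == alpha*b (mod p)
        bool ok = true;
        for (size_t i = 0; i < k; ++i) {
            int64_t s = 0;
            for (size_t j = 0; j <= i; ++j) s = modp(s + A[i*k + j] * x[j]);
            ok &= (s == modp(alpha * b[i]));
        }
        std::cout << (ok ? "check passed" : "check failed") << std::endl;
        return ok ? 0 : 1;
    }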
diff --git a/tests/test-ftrsm.C b/tests/test-ftrsm.C
index ec4d311..971eeea 100644
--- a/tests/test-ftrsm.C
+++ b/tests/test-ftrsm.C
@@ -27,6 +27,8 @@
  */
 #define  __FFLASFFPACK_SEQUENTIAL
 
+#define ENABLE_ALL_CHECKINGS 1
+
 #include "fflas-ffpack/fflas-ffpack-config.h"
 #include <givaro/modular-integer.h>
 
@@ -224,6 +226,7 @@ int main(int argc, char** argv)
 		ok &= run_with_field<ModularBalanced<int32_t> >(q,b,m,n,s,iters);
 		ok &= run_with_field<Modular<int64_t> >(q,b,m,n,s,iters);
 		ok &= run_with_field<ModularBalanced<int64_t> >(q,b,m,n,s,iters);
+		ok &= run_with_field<Modular<Givaro::Integer> >(q,5,m/4+1,n/4+1,s,iters); 
 		ok &= run_with_field<Modular<Givaro::Integer> >(q,(b?b:512),m/4+1,n/4+1,s,iters); 
 	} while (loop && ok);
 
diff --git a/tests/test-interfaces-c.c b/tests/test-interfaces-c.c
index aba09b4..200f383 100644
--- a/tests/test-interfaces-c.c
+++ b/tests/test-interfaces-c.c
@@ -1,3 +1,29 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+
+/*
+ * Copyright (C) FFLAS-FFPACK
+ * This file is Free Software and part of FFLAS-FFPACK.
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *.
+ */
 #include <interfaces/libs/fflas_c.h>
 #include <interfaces/libs/ffpack_c.h>
 
diff --git a/tests/test-invert-check.C b/tests/test-invert-check.C
new file mode 100644
index 0000000..55384be
--- /dev/null
+++ b/tests/test-invert-check.C
@@ -0,0 +1,93 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+
+/*
+ * Copyright (C) 2015 the FFLAS-FFPACK group
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ *
+ * This file is Free Software and part of FFLAS-FFPACK.
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *
+ */
+
+//--------------------------------------------------------------------------
+//          Test for Checker_invert
+//--------------------------------------------------------------------------
+#define ENABLE_ALL_CHECKINGS 1
+
+#include <iostream>
+#include <stdlib.h>
+#include <time.h>
+#include "fflas-ffpack/fflas-ffpack.h"
+#include "fflas-ffpack/utils/args-parser.h"
+#include "fflas-ffpack/utils/fflas_randommatrix.h"
+
+int main(int argc, char** argv) {
+	srand (time(NULL));
+	typedef Givaro::Modular<double> Field;
+	Givaro::Integer q = 131071;
+	size_t iter = 3;
+	size_t MAXM = 1000;
+	size_t seed( (int) time(NULL) );
+    
+	Argument as[] = {
+		{ 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q },
+		{ 'n', "-n N", "Set the maximal size of the matrix.", TYPE_INT , &MAXM },
+		{ 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter },
+        { 's', "-s N", "Set the seed.", TYPE_INT , &seed },
+		END_OF_ARGUMENTS
+	};
+	FFLAS::parseArguments(argc,argv,as);
+    FFLAS::writeCommandString(std::cout, as) << std::endl;
+
+	Field F(q);
+
+	Field::RandIter Rand(F,0,seed);
+	Field::NonZeroRandIter NZRand(Rand);
+    srandom(seed);
+
+	int nullity;
+	size_t m = MAXM, pass = 0;
+	for (size_t i=0; i<iter; ++i) {
+		m = random() % MAXM + 1;
+		std::cout << "m= " << m << "\n";
+
+		Field::Element_ptr A = FFLAS::fflas_new(F,m<<1,m<<1);
+
+		FFPACK::RandomMatrixWithRankandRandomRPM(F,A,m<<1,m,m,m);
+
+		FFPACK::Checker_invert<Field> checker(Rand,m,A,m<<1);
+		FFPACK::Invert(F,m,A,m<<1,nullity);
+		try {
+			checker.check(A,nullity);
+			std::cout << "Verification successful\n";
+			pass++;
+		} catch (FailureInvertCheck &e) {
+			std::cout << "Verification failed!\n";
+		}
+
+		FFLAS::fflas_delete(A);
+	}
+
+	std::cout << pass << "/" << iter << " tests were successful.\n";
+
+
+	return 0;
+}
diff --git a/tests/test-invert.C b/tests/test-invert.C
index 79211ea..815480d 100644
--- a/tests/test-invert.C
+++ b/tests/test-invert.C
@@ -1,9 +1,9 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 
 /*
- * Copyright (C) FFLAS-FFPACK
- * Written by Clément Pernet
+ * Copyright (C) the FFLAS-FFPACK group
+ * Written by Clément Pernet <clement.pernet at imag.fr>
  * This file is Free Software and part of FFLAS-FFPACK.
  *
  * ========LICENCE========
@@ -26,95 +26,110 @@
  *.
  */
 
+#define ENABLE_ALL_CHECKINGS 1
 
-//--------------------------------------------------------------------------
-//                        Test for invert : 1 computation
-//
-//--------------------------------------------------------------------------
-// Clement Pernet
-//-------------------------------------------------------------------------
+#define  __FFLASFFPACK_SEQUENTIAL
 
-//#define DEBUG 1
-#define TIME 1
-using namespace std;
+#include "fflas-ffpack/fflas-ffpack-config.h"
 
 #include <iomanip>
 #include <iostream>
-#include "fflas-ffpack/field/modular-balanced.h"
-#include "fflas-ffpack/utils/timer.h"
-#include "Matio.h"
+
 #include "fflas-ffpack/ffpack/ffpack.h"
+#include "fflas-ffpack/utils/args-parser.h"
+#include "test-utils.h"
+#include <givaro/modular.h>
+#include <givaro/modular-balanced.h>
 
 
+using namespace std;
+using namespace FFLAS;
 using namespace FFPACK;
-typedef ModularBalanced<float> Field;
+using Givaro::Modular;
+using Givaro::ModularBalanced;
 
-int main(int argc, char** argv){
+template <class Field>
+bool run_with_field (Givaro::Integer q, size_t b, size_t n, size_t iters){
+	bool ok = true ;
+	int nbit=(int)iters;
+	while (ok && nbit){
+		Field* F= chooseField<Field>(q,b);
+		if (F==nullptr)
+			return true;
 
-	int n;
-	int nbit=atoi(argv[3]); // number of times the product is performed
-	cerr<<setprecision(10);
+		cout<<"Checking with ";F->write(cout)<<endl;
 
-	if (argc != 4)	{
-		cerr<<"Usage : test-invert <p> <A> <<i>"
-		    <<endl
-		    <<"         to invert A mod p (i computations)"
-		    <<endl;
-		exit(-1);
-	}
-	Field F(atof(argv[1]));
-	Field::Element * A;
-	A = read_field(F,argv[2],&n,&n);
-
- FFLAS::Timer tim,t; t.clear();tim.clear();
-	int nullity=0;
-
-	for(int i = 0;i<nbit;++i){
-		t.clear();
-		t.start();
-		FFPACK::Invert (F, n, A, n, nullity);
-		t.stop();
-		tim+=t;
-	}
+		size_t lda = n + (rand() % 4);
+		size_t ldx = n + (rand() % 4);
+
+		typename Field::Element_ptr A = fflas_new(*F, n, lda);
+		typename Field::Element_ptr X = fflas_new(*F, n, ldx);
+
+		RandomMatrixWithRank (*F, A, lda, n, n, n);
 
-#if DEBUG
-	Field::Element *Ab = read_field(F,argv[2],&n,&n);
-	Field::Element *I = FFLAS::fflas_new<Field::Element>(n*n);
-	FFLAS::fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, n, n, n,
-		      1.0, Ab, n, A, n, 0.0, I, n);
-	bool wrong = false;
-
-	for (int i=0;i<n;++i)
-		for (int j=0;j<n;++j)
-			if ( ((i!=j) && !F.isZero(*(I+i*n+j)))
-			     ||((i==j) &&!F.isOne(*(I+i*n+j))))
-				wrong = true;
-
-	if ( wrong ){
-		if (nullity > 0)
-			cerr<<"Matrix is singular over Z/"<<argv[1]<<"Z"<<endl;
-		else{
-			cerr<<"FAIL"<<endl;
-			write_field (F,cerr<<"A="<<endl,Ab,n,n,n);
-			write_field (F,cerr<<"A^-1="<<endl,A,n,n,n);
-			write_field (F,cerr<<"I="<<endl,I,n,n,n);
+		int nullity;
+		FFPACK::Invert(*F, n, A, lda, X, ldx, nullity);
+
+		if (nullity != 0){
+			std::cerr<<"Error: Singular matrix detected"<<std::endl;
+			fflas_delete(A);
+			fflas_delete(X);
+			return ok = false;
 		}
-	} else {
-		cerr<<"PASS"<<endl;
-	}
-	FFLAS::fflas_delete( I);
-	FFLAS::fflas_delete( Ab);
 
-#endif
-	FFLAS::fflas_delete( A);
+		typename Field::Element_ptr Y = fflas_new(*F, n, n);
+		fidentity(*F, n, n, Y, n);
+
+		fgemm(*F, FflasNoTrans, FflasNoTrans, n,n,n, F->one, A, lda, X, ldx, F->mOne, Y, n);
+
+		if (! fiszero(*F,n,n,Y,n)){
+			write_field(*F, std::cerr<<"Y = "<<std::endl,Y,n,n,n);
+			std::cerr<<"Error: A * A^{-1} != Id"<<std::endl;
+			fflas_delete(A);
+			fflas_delete(X);
+			fflas_delete(Y);
+			return ok = false;
+		}
 
-#if TIME
-	double mflops = 2*(n*n/1000000.0)*nbit*n/tim.usertime();
-	cerr<<"n = "<<n<<" Inversion over Z/"<<atoi(argv[1])<<"Z : t= "
-	     << tim.usertime()/nbit
-	     << " s, Mffops = "<<mflops
-	     << endl;
+		nbit--;
+		fflas_delete(A);
+		fflas_delete(X);
+		fflas_delete(Y);
+	}
+	return ok;
+}
 
-	cout<<n<<" "<<mflops<<" "<<tim.usertime()/nbit<<endl;
-#endif
+int main(int argc, char** argv)
+{
+	cerr<<setprecision(10);
+	static Givaro::Integer q=-1;
+	static size_t b=0;
+	static size_t n=300;
+	static size_t iters=3;
+	static bool loop=false;
+	static Argument as[] = {
+		{ 'q', "-q Q", "Set the field characteristic (-1 for random).",         TYPE_INTEGER , &q },
+		{ 'b', "-b B", "Set the bitsize of the field characteristic.",  TYPE_INT , &b },
+		{ 'n', "-n N", "Set the dimension of the square matrix.", TYPE_INT , &n },
+		{ 'i', "-i R", "Set number of repetitions.",            TYPE_INT , &iters },
+		{ 'l', "-loop Y/N", "run the test in an infinite loop.", TYPE_BOOL , &loop },
+		END_OF_ARGUMENTS
+        };
+
+	FFLAS::parseArguments(argc,argv,as);
+
+	bool ok = true;
+	do{
+		ok &= run_with_field<Modular<double> >(q,b,n,iters);
+		ok &= run_with_field<ModularBalanced<double> >(q,b,n,iters);
+		ok &= run_with_field<Modular<float> >(q,b,n,iters);
+		ok &= run_with_field<ModularBalanced<float> >(q,b,n,iters);
+		ok &= run_with_field<Modular<int32_t> >(q,b,n,iters);
+		ok &= run_with_field<ModularBalanced<int32_t> >(q,b,n,iters);
+		ok &= run_with_field<Modular<int64_t> >(q,b,n,iters);
+		ok &= run_with_field<ModularBalanced<int64_t> >(q,b,n,iters);
+		ok &= run_with_field<Modular<Givaro::Integer> >(q,(b?b:512),n/4+1,iters); 
+	} while (loop && ok);
+
+	return !ok ;
 }
diff --git a/tests/test-lu.C b/tests/test-lu.C
index f848831..5c2f528 100644
--- a/tests/test-lu.C
+++ b/tests/test-lu.C
@@ -31,15 +31,33 @@
 //      Test suite for the Gaussian elimination routines: LUdivine and PLUQ
 //-------------------------------------------------------------------------
 
+// #define MONOTONIC_CYLCES
+// #define MONOTONIC_MOREPIVOTS
+// #define MONOTONIC_FEWPIVOTS
+#ifdef MONOTONIC_CYLCES
+  #define MONOTONIC_APPLYP
+#endif
+#ifdef MONOTONIC_MOREPIVOTS
+  #define MONOTONIC_APPLYP
+#endif
+#ifdef MONOTONIC_FEWPIVOTS
+  #define MONOTONIC_APPLYP
+#endif
+
+#define BASECASE_K 37 // Forcing a lower base case to be able to test a few recursive steps with smallish dimensions
+
+
 #define  __FFLASFFPACK_SEQUENTIAL
 #define __LUDIVINE_CUTOFF 1
 #include "fflas-ffpack/fflas-ffpack-config.h"
 #include <givaro/modular-balanced.h>
 #include <iostream>
 #include <iomanip>
-
+Givaro::Timer tperm, tgemm, tBC, ttrsm,trest,timtot;
+size_t mvcnt = 0;
 #include "fflas-ffpack/utils/Matio.h"
 #include "fflas-ffpack/utils/timer.h"
+#include "fflas-ffpack/fflas/fflas.h"
 #include "fflas-ffpack/ffpack/ffpack.h"
 #include "test-utils.h"
 
@@ -48,7 +66,6 @@
 using namespace std;
 using namespace FFPACK;
 
-
 /*! Tests the LUdivine routine.
  * @tparam Field Field
  * @tparam Diag  Unit diagonal in U 
@@ -256,6 +273,7 @@ bool verifPLUQ (const Field & F, typename Field::ConstElement_ptr A, size_t lda,
 	typename Field::Element zero,one;
 	F.init(zero,0.0);
 	F.init(one,1.0);
+	// write_field(F,std::cerr<<"PLUQ = "<<std::endl,PLUQ,m,n,ldpluq);
 	FFPACK::getTriangular(F, FFLAS::FflasUpper, diag, m,n,R, PLUQ, ldpluq, U, n, true);
 	FFPACK::getTriangular(F, FFLAS::FflasLower, (diag==FFLAS::FflasNonUnit)?FFLAS::FflasUnit:FFLAS::FflasNonUnit, 
 						  m,n,R, PLUQ, ldpluq, L, R, true);
@@ -263,16 +281,24 @@ bool verifPLUQ (const Field & F, typename Field::ConstElement_ptr A, size_t lda,
 	FFPACK::applyP (F, FFLAS::FflasRight, FFLAS::FflasNoTrans, R,0,n, U, n, Q);
 	FFLAS::fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, m,n,R, F.one, L,R, U,n, F.zero, X,n);
 
+	// write_perm(std::cerr<<"P = ",P,m);
+	// write_perm(std::cerr<<"Q = ",Q,n);
+	// write_field(F,std::cerr<<"L = "<<std::endl,L,m,R,R);
+	// write_field(F,std::cerr<<"U = "<<std::endl,U,R,n,n);
+	
+
 	bool fail = false;
 	for(size_t i=0; i<m; ++i)
 		for (size_t j=0; j<n; ++j)
 			if (!F.areEqual (*(A+i*lda+j), *(X+i*n+j))){
 				std::cerr << std::endl<<" A ["<<i<<","<<j<<"] = " << (*(A+i*lda+j))
-						  << " PLUQ ["<<i<<","<<j<<"] = " << (*(X+i*n+j))
-						  << std::endl;
+						  << " PLUQ ["<<i<<","<<j<<"] = " << (*(X+i*n+j));
 				fail=true;
 			}
 		//write_field(F, std::cerr<<"X = "<<std::endl,X, m,n,n);
+	if (fail)
+		std::cerr << std::endl;
+
 	FFLAS::fflas_delete( U);
 	FFLAS::fflas_delete( L);
 	FFLAS::fflas_delete( X);
@@ -303,9 +329,17 @@ bool test_pluq (const Field & F,
 	size_t * P = FFLAS::fflas_new<size_t> (m);
 	size_t * Q = FFLAS::fflas_new<size_t> (n);
 	
-		//write_field(F,std::cerr<<"\n B = \n",B,m,n,lda);
+	// write_field(F,std::cerr<<"\n B = \n",B,m,n,lda);
+    typename Field::RandIter G(F);
+    FFPACK::ForceCheck_PLUQ<Field> checker (G,m,n,A,n);
+
 	size_t R = FFPACK::PLUQ (F, diag, m, n, B, lda, P, Q);
-		//write_field(F,std::cerr<<"\n PLUQ = \n",B,m,n,lda);
+	// write_field(F,std::cerr<<"\n PLUQ = \n",B,m,n,lda);
+    try {
+        checker.check(A,n,R,P,Q);
+    } catch(FailurePLUQCheck &e) {
+        std::cout << m << 'x' << n << " pluq verification failed!\n";
+    }
 
 	if (R != r) {
 		std::cout << "rank is wrong (expected " << r << " but got " << R << ")" << std::endl;
@@ -802,6 +836,7 @@ bool launch_test(const Field & F,
 		Element_ptr A = FFLAS::fflas_new (F, m, lda);
 		RandomMatrixWithRankandRandomRPM(F,A,lda,r,m,n);
 		fail |= test_LUdivine<Field,diag,trans>(F,A,lda,r,m,n);
+		RandomMatrixWithRankandRandomRPM(F,A,lda,r,m,n);
 		fail |= test_pluq<Field,diag>(F,A,r,m,n,lda);
 		if (fail) std::cout << "failed at big lda" << std::endl;
 		FFLAS::fflas_delete( A );
@@ -812,6 +847,7 @@ bool launch_test(const Field & F,
 		Element_ptr A = FFLAS::fflas_new (F, m, lda);
 		RandomMatrixWithRankandRandomRPM(F,A,lda,R,m,n);
 		fail |= test_LUdivine<Field,diag,trans>(F,A,lda,R,m,n);
+		RandomMatrixWithRankandRandomRPM(F,A,lda,R,m,n);
 		fail |= test_pluq<Field,diag>(F,A,R,m,n,lda);
 		if (fail) std::cout << "failed at big lda max rank" << std::endl;
 		FFLAS::fflas_delete( A );
@@ -822,6 +858,7 @@ bool launch_test(const Field & F,
 		Element_ptr A = FFLAS::fflas_new (F, m, lda);
 		RandomMatrixWithRankandRandomRPM(F,A,lda,R,m,n);
 		fail |= test_LUdivine<Field,diag,trans>(F,A,lda,R,m,n);
+		RandomMatrixWithRankandRandomRPM(F,A,lda,R,m,n);
 		fail |= test_pluq<Field,diag>(F,A,R,m,n,lda);
 		if (fail) std::cout << "failed at big lda, rank 0" << std::endl;
 		FFLAS::fflas_delete( A );
@@ -834,6 +871,7 @@ bool launch_test(const Field & F,
 		Element_ptr A = FFLAS::fflas_new (F, M, lda);
 		RandomMatrixWithRankandRandomRPM(F,A,lda,R,M,N);
 		fail |= test_LUdivine<Field,diag,trans>(F,A,lda,R,M,N);
+		RandomMatrixWithRankandRandomRPM(F,A,lda,R,M,N);
 		fail |= test_pluq<Field,diag>(F,A,R,M,N,lda);
 		if (fail) std::cout << "failed at square" << std::endl;
 		FFLAS::fflas_delete( A );
@@ -846,6 +884,7 @@ bool launch_test(const Field & F,
 		Element_ptr A = FFLAS::fflas_new (F, M, lda);
 		RandomMatrixWithRankandRandomRPM(F,A,lda,R,M,N);
 		fail |= test_LUdivine<Field,diag,trans>(F,A,lda,R,M,N);
+		RandomMatrixWithRankandRandomRPM(F,A,lda,R,M,N);
 		fail |= test_pluq<Field,diag>(F,A,R,M,N,lda);
 		if (fail) std::cout << "failed at wide" << std::endl;
 		FFLAS::fflas_delete( A );
@@ -858,6 +897,7 @@ bool launch_test(const Field & F,
 		Element_ptr A = FFLAS::fflas_new (F, M, lda);
 		RandomMatrixWithRankandRandomRPM(F,A,lda,R,M,N);
 		fail |= test_LUdivine<Field,diag,trans>(F,A,lda,R,M,N);
+		RandomMatrixWithRankandRandomRPM(F,A,lda,R,M,N);
 		fail |= test_pluq<Field,diag>(F,A,R,M,N,lda);
 		if (fail) std::cout << "failed at narrow" << std::endl;
 		FFLAS::fflas_delete( A );
@@ -999,7 +1039,7 @@ bool run_with_field(Givaro::Integer q, uint64_t b, size_t m, size_t n, size_t r,
 		ok&= launch_test<Field,FFLAS::FflasUnit,FFLAS::FflasNoTrans>    (*F,r,m,n);
 		ok&= launch_test<Field,FFLAS::FflasUnit,FFLAS::FflasTrans>      (*F,r,m,n);
 		ok&= launch_test<Field,FFLAS::FflasNonUnit,FFLAS::FflasNoTrans> (*F,r,m,n);
-		ok&= launch_test<Field,FFLAS::FflasNonUnit,FFLAS::FflasTrans>   (*F,r,m,n);		
+		ok&= launch_test<Field,FFLAS::FflasNonUnit,FFLAS::FflasTrans>   (*F,r,m,n);
 
 #if 0 /*  may be bogus */
 		ok&= launch_test_append<Field,FFLAS::FflasUnit,FFLAS::FflasNoTrans>   (*F,r,m,n);
@@ -1026,8 +1066,8 @@ int main(int argc, char** argv)
 	static size_t b=0;
 	static size_t m=120;
 	static size_t n=120;
-	static size_t r=80;
-	static size_t iters=2;
+	static size_t r=70;
+	static size_t iters=3;
 	static bool loop=false;
 	static Argument as[] = {
 		{ 'q', "-q Q", "Set the field characteristic (-1 for random).",         TYPE_INTEGER , &q },
@@ -1055,7 +1095,8 @@ int main(int argc, char** argv)
 		ok&=run_with_field<Givaro::ModularBalanced<int32_t> > (q,b,m,n,r,iters);
 		ok&=run_with_field<Givaro::Modular<int64_t> >         (q,b,m,n,r,iters);
 		ok&=run_with_field<Givaro::ModularBalanced<int64_t> > (q,b,m,n,r,iters);
-		ok&=run_with_field<Givaro::Modular<Givaro::Integer> > (q,(b?b:512),m/6,n/6,r/6,iters);		
+		ok&=run_with_field<Givaro::Modular<Givaro::Integer> > (q,5,m/6,n/6,r/6,iters);
+		ok&=run_with_field<Givaro::Modular<Givaro::Integer> > (q,(b?b:512),m/6,n/6,r/6,iters);
 	} while (loop && ok);
 
 	return !ok;
diff --git a/tests/test-maxdelayeddim.C b/tests/test-maxdelayeddim.C
new file mode 100644
index 0000000..1415b0b
--- /dev/null
+++ b/tests/test-maxdelayeddim.C
@@ -0,0 +1,86 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+
+/*
+ * Copyright (C) FFLAS-FFPACK
+ * This file is Free Software and part of FFLAS-FFPACK.
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *.
+ */
+#include <givaro/modular.h>
+#include <recint/rint.h>
+#include "fflas-ffpack/fflas-ffpack.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+template <class Field>
+bool test (Givaro::Integer p, size_t kmax){
+    Field F(p);
+    FFLAS::MMHelper<Field, FFLAS::MMHelperAlgo::Auto, FFLAS::ModeCategories::DelayedTag> MMH(F, 0);
+    size_t k = MMH.MaxDelayedDim(0);
+    if (kmax!=k)
+        F.write(std::cerr)<<": expected: "<<kmax<<" got: "<<k<<std::endl;
+    return kmax == k;
+}
+int main() {
+
+    bool ok=true;
+    
+        // kmax = floor(2^53 / (p-1)^2)
+    // ok &= test<Givaro::Modular<double>  >(17,35184372088831);
+    // ok &= test<Givaro::Modular<double>  >(65521,2098176);
+    // ok &= test<Givaro::Modular<double>  >(67108859,2);
+    //     // kmax = floor(2^53 / ((p-1)/2)^2)
+    // ok &= test<Givaro::ModularBalanced<double>  >(17,140737488355327);
+    // ok &= test<Givaro::ModularBalanced<double>  >(65521,8392705);
+    // ok &= test<Givaro::ModularBalanced<double>  >(67108859,8);
+    //     // kmax = floor(2^24 / (p-1)^2)
+    // ok &= test<Givaro::Modular<float> > (17,65535);
+    // ok &= test<Givaro::Modular<float> > (2039,4);
+    //     // kmax = floor(2^24 / ((p-1)/2)^2)
+    // ok &= test<Givaro::ModularBalanced<float> >(17,262143);
+    // ok &= test<Givaro::ModularBalanced<float> > (2039,16);
+
+    //    // kmax = floor(2^53 / (p-1)^2)
+    // ok &= test<Givaro::Modular<int64_t>  >(17,36028797018963967);
+    // ok &= test<Givaro::Modular<int64_t>  >(65521,2148532608);
+    // ok &= test<Givaro::Modular<int64_t>  >(1147482977,7);
+    //     // kmax = floor(2^53 / ((p-1)/2)^2)
+    // ok &= test<Givaro::ModularBalanced<int64_t>  >(17,144115188075855871);
+    // ok &= test<Givaro::ModularBalanced<int64_t>  >(65521,8594130432);
+    // ok &= test<Givaro::ModularBalanced<int64_t>  >(1147482977,28);
+ 
+    //    // kmax = floor(2^31 / (p-1)^2)
+    // ok &= test<Givaro::Modular<int32_t>  >(17,8388607);
+    // ok &= test<Givaro::Modular<int32_t>  >(24571,3);
+    //     // kmax = floor(2^31 / ((p-1)/2)^2)
+    // ok &= test<Givaro::ModularBalanced<int32_t>  >(17,33554431);
+    // ok &= test<Givaro::ModularBalanced<int32_t>  >(24571,14);
+
+    //    // kmax = maxsize_t
+    // ok &= test<Givaro::Modular<Givaro::Integer>  >(17, std::numeric_limits<size_t>::max());
+    // ok &= test<Givaro::Modular<Givaro::Integer>  >(Givaro::Integer("46768052394588893382517914646921056628989841375373"),std::numeric_limits<size_t>::max());
+    //     // kmax = maxsize_t
+    ok &= test<Givaro::Modular<RecInt::rint<8> > >(17, std::numeric_limits<size_t>::max());
+    ok &= test<Givaro::Modular<RecInt::rint<8> > >(Givaro::Integer("166153499473114484112975882535042793"),2097152);
+    return !ok;
+}
+        
+
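The commented-out expectations above can be recomputed by hand. For Givaro::Modular<double> with p = 17, (p-1)^2 = 256 and the listed value 35184372088831 equals floor((2^53 - 1)/256), one less than 2^53/256, so the effective bound appears to be the strict k*(p-1)^2 <= 2^53 - 1 rather than the floor(2^53/(p-1)^2) written in the comment. The short program below recomputes a few of these constants under that assumed bound.

    // Recompute some expected MaxDelayedDim values under the assumed bound
    // k * maxval^2 <= 2^mantissa_bits - 1, with maxval = p-1 for Modular and
    // (p-1)/2 for ModularBalanced. The printed values match the comments above.
    #include <cstdint>
    #include <iostream>

    static uint64_t kmax(unsigned mantissa_bits, uint64_t maxval) {
        return (((uint64_t)1 << mantissa_bits) - 1) / (maxval * maxval);
    }

    int main() {
        std::cout << kmax(53, 17 - 1)       << std::endl; // 35184372088831  Modular<double>, p=17
        std::cout << kmax(53, 65521 - 1)    << std::endl; // 2098176         Modular<double>, p=65521
        std::cout << kmax(53, (17 - 1) / 2) << std::endl; // 140737488355327 ModularBalanced<double>, p=17
        std::cout << kmax(24, 17 - 1)       << std::endl; // 65535           Modular<float>, p=17
        return 0;
    }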
diff --git a/tests/test-permutations.C b/tests/test-permutations.C
new file mode 100644
index 0000000..e5b0897
--- /dev/null
+++ b/tests/test-permutations.C
@@ -0,0 +1,118 @@
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+
+/*
+ * Copyright (C) the FFLAS-FFPACK group
+ * Written by Clément Pernet
+ * This file is Free Software and part of FFLAS-FFPACK.
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *.
+ */
+
+
+#include "fflas-ffpack/fflas-ffpack-config.h"
+#include <iostream>
+#include <givaro/modular.h>
+
+Givaro::Timer tperm, tgemm, tBC, ttrsm,trest,timtot;
+
+#include "fflas-ffpack/ffpack/ffpack.h"
+
+#include "fflas-ffpack/utils/Matio.h"
+
+
+
+using namespace std;
+using namespace FFLAS;
+using namespace FFPACK;
+using Givaro::Modular;
+
+bool checkMonotonicApplyP(FFLAS_SIDE Side, FFLAS_TRANSPOSE trans, size_t * P, size_t N, size_t R){
+	bool ok = true;
+	
+	typedef Modular<double> Field;
+    Field F(101);
+	size_t M = 2;
+	size_t lda = (Side == FflasLeft)? M : N;
+	size_t ldb = lda;
+	Field::Element_ptr A = fflas_new(F, M, N);
+	Field::Element_ptr B = fflas_new(F, M, N);
+	if (Side == FflasLeft)
+		for (size_t i = 0; i<N; ++i)
+			for (size_t j = 0; j<M; ++j)
+				F.init(A[i*lda+j],i*10+j);
+	else
+		for (size_t i = 0; i<N; ++i)
+			for (size_t j = 0; j<M; ++j)
+				F.init(A[i+j*lda],i*10+j);
+
+	fassign(F, N,M, A, lda, B, ldb);
+
+    write_field(F, std::cerr<<"Before MonotonicApplyP, A = "<<std::endl, A, N,M, lda);
+
+	MonotonicApplyP (F, FflasLeft, FflasNoTrans, M, 0, N, A, lda, P, R);
+
+    write_field(F, std::cerr<<"After MonotonicApplyP, A = "<<std::endl, A, N,M, lda);
+
+		// checking that cols have not been permuted
+	typename Field::Element x;
+	F.init(x);
+	for (size_t i=0; i<N; i++){
+		F.sub(x,A[lda*i+1],A[lda*i]);
+		if (!F.isOne(x)){
+			std::cerr<<"ERROR: A["<<i<<", 1] = "<<A[i*lda+1]<<" != "<<A[i*lda]+1<<" = A["<<i<<", 0]+1"<<std::endl;
+			ok = false;
+		}
+	}
+	
+		// Checking that the non-pivot rows are monotonically increasing
+	for (size_t i=R; i<N-1; i++){
+		if (A[i*lda] >= A[(i+1)*lda]){
+			std::cerr<<"ERROR: A["<<i<<", 0] = "<<A[i*lda]<<" >= "<<A[(i+1)*lda]<<" = A["<<i+1<<", 0]"<<std::endl;
+			ok = false;
+		}
+	}
+
+		// Checking that the first R rows have been permuted correctly
+	applyP(F, Side, trans, M, 0, R, B, ldb, P);
+	if (!fequal(F, R, M,A, lda, B, ldb)){
+		std::cerr<<"ERROR: first R rows are not permuted correctly"<<std::endl;
+		ok =false;
+	}
+	fflas_delete(A);
+	fflas_delete(B);
+	
+	return ok;
+}
+
+int main(){
+    
+    
+	bool ok = true;
+    
+    size_t  P1[10] = {0,5,6,6,7,9,6,7,8,9};
+	ok &= checkMonotonicApplyP(FflasLeft, FflasNoTrans, P1, 10, 6);
+    size_t  P2[10] = {0,3,3,6,6,5,6,7,8,9};
+	ok &= checkMonotonicApplyP(FflasLeft, FflasNoTrans, P2, 10, 5);
+    size_t  P3[10] = {0,4,2,4,5,5,6,7,8,9};
+	ok &= checkMonotonicApplyP(FflasLeft, FflasNoTrans, P3, 10, 6);
+	
+	return !ok;
+}
diff --git a/tests/test-pluq-check.C b/tests/test-pluq-check.C
new file mode 100644
index 0000000..79790d7
--- /dev/null
+++ b/tests/test-pluq-check.C
@@ -0,0 +1,104 @@
+/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
+
+/*
+ * Copyright (C) 2015 the FFLAS-FFPACK group
+ * Written by Ashley Lesdalons <Ashley.Lesdalons at e.ujf-grenoble.fr>
+ *
+ * This file is Free Software and part of FFLAS-FFPACK.
+ *
+ * ========LICENCE========
+ * This file is part of the library FFLAS-FFPACK.
+ *
+ * FFLAS-FFPACK is free software: you can redistribute it and/or modify
+ * it under the terms of the  GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * ========LICENCE========
+ *
+ */
+
+//--------------------------------------------------------------------------
+//          Test for Checker_PLUQ
+//--------------------------------------------------------------------------
+
+#define ENABLE_ALL_CHECKINGS 1
+
+
+#include <iostream>
+#include <stdlib.h>
+#include <time.h>
+#include "fflas-ffpack/fflas-ffpack.h"
+#include "fflas-ffpack/utils/args-parser.h"
+
+int main(int argc, char** argv) {
+	size_t iter = 3 ;
+	Givaro::Integer q = 131071;
+	size_t MAXM = 1000;
+	size_t MAXN = 1000;
+    size_t m=0,n=0;
+    size_t seed(0);
+    bool random_dim = false;
+
+	Argument as[] = {
+		{ 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q },
+		{ 'm', "-m M", "Set the row dimension of A.", TYPE_INT , &m },
+		{ 'n', "-n N", "Set the col dimension of A.", TYPE_INT , &n },
+		{ 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter },
+        { 's', "-s N", "Set the seed.", TYPE_INT , &seed },
+		END_OF_ARGUMENTS
+	};
+
+	FFLAS::parseArguments(argc,argv,as);
+	if (m == 0 || n == 0) random_dim = true;
+
+	srandom ( seed?seed:time(NULL) );
+
+	typedef Givaro::Modular<double> Field;
+	Field F(q);
+
+	Field::RandIter Rand(F,0,seed);
+    srandom(seed);
+    
+	size_t pass = 0;	// number of tests that have successfully passed
+
+	for(size_t it=0; it<iter; ++it) {
+		if (random_dim) {
+			m = random() % MAXM + 1;
+			n = random() % MAXN + 1;
+		}
+			
+		Field::Element_ptr A = FFLAS::fflas_new(F,m,n);
+		size_t *P = FFLAS::fflas_new<size_t>(m);
+		size_t *Q = FFLAS::fflas_new<size_t>(n);
+
+		// generate a random matrix A
+		PAR_BLOCK { FFLAS::pfrand(F,Rand, m,n,A,m/MAX_THREADS); }
+  
+//   		FFPACK::Checker_PLUQ<Field> checker (RValue,m,n,A,n);
+//   		size_t R = FFPACK::PLUQ(F, FFLAS::FflasNonUnit, m, n, A, n, P, Q);
+  		FFPACK::PLUQ(F, FFLAS::FflasNonUnit, m, n, A, n, P, Q);
+		try {
+// 			checker.check(A,n,R,P,Q);
+			std::cout << m << 'x' << n << " pluq verification successful\n";
+			pass++;
+		} catch(FailurePLUQCheck &e) {
+			std::cout << m << 'x' << n << " pluq verification failed!\n";
+		}
+
+		FFLAS::fflas_delete(A,P,Q);
+	}
+
+	std::cout << pass << "/" << iter << " tests were successful.\n";
+
+	return 0;
+}
diff --git a/tests/test-pluq.C b/tests/test-pluq.C
index 80af6bf..2d2ef8c 100644
--- a/tests/test-pluq.C
+++ b/tests/test-pluq.C
@@ -41,6 +41,7 @@
 //               1: check A = LQUP
 //-------------------------------------------------------------------------
 
+#define ENABLE_ALL_CHECKINGS 1
 
 #define __FFPACK_LUDIVINE_CUTOFF 60
 #include <iostream>
diff --git a/tests/test-simd.C b/tests/test-simd.C
index 66a8b19..cb08a5e 100644
--- a/tests/test-simd.C
+++ b/tests/test-simd.C
@@ -1,6 +1,5 @@
-/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
-
+/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
 /*
  * Copyright (C) 2014 FFLAS-FFPACK
  * Written by :
@@ -27,19 +26,43 @@
  *.
  */
 
- #include "fflas-ffpack/fflas/fflas_simd.h"
- #include "fflas-ffpack/utils/args-parser.h"
- #include "fflas-ffpack/utils/align-allocator.h"
- #include <vector>
- #include <algorithm>
- #include <random>
- #include <tuple>
- #include <type_traits>
- #include <string>
- #include <iterator>
- #include <limits>
- #include <cmath>
- #include <iomanip>
+#include "givaro/givinteger.h"
+#include "fflas-ffpack/fflas-ffpack-config.h"
+#include "fflas-ffpack/fflas/fflas_simd.h"
+#include "fflas-ffpack/utils/args-parser.h"
+#include "fflas-ffpack/utils/align-allocator.h"
+#include <vector>
+#include <algorithm>
+#include <random>
+#include <tuple>
+#include <type_traits>
+#include <string>
+#include <iterator>
+#include <limits>
+#include <cmath>
+#include <iomanip>
+
+typedef Givaro::Integer integer;
+
+/**********************************************************************************
+ *
+ * Random generators
+ *
+ ***********************************************************************************/
+
+template <class Element, class Alloc>
+typename std::enable_if<std::is_integral<Element>::value>::type
+generate_random (std::vector<Element,Alloc> &a, std::mt19937 &generator) {
+	std::uniform_int_distribution<Element> dist(std::numeric_limits<Element>::min(), std::numeric_limits<Element>::max());
+	std::generate(a.begin(), a.end(), [&](){return dist(generator);});
+}
+
+template <class Element, class Alloc>
+typename std::enable_if<std::is_floating_point<Element>::value>::type
+generate_random (std::vector<Element,Alloc> &a, std::mt19937 &generator) {
+	std::uniform_real_distribution<Element> dist(std::numeric_limits<Element>::min(), std::numeric_limits<Element>::max());
+	std::generate(a.begin(), a.end(), [&](){return dist(generator);});
+}
 
 /**********************************************************************************
  *
@@ -54,192 +77,218 @@ template <class R, class... Args>
 struct function_traits<R (*)(Args...)> : public function_traits<R(Args...)> {};
 
 template <class R, class... Args> struct function_traits<R(Args...)> {
-  using return_type = R;
+	using return_type = R;
 
-  static constexpr std::size_t arity = sizeof...(Args);
+	static constexpr std::size_t arity = sizeof...(Args);
 
-  template <std::size_t N> struct argument {
-    static_assert(N < arity, "error: invalid parameter index.");
-    using type = typename std::tuple_element<N, std::tuple<Args...> >::type;
-  };
+	template <std::size_t N> struct argument {
+		static_assert(N < arity, "error: invalid parameter index.");
+		using type = typename std::tuple_element<N, std::tuple<Args...> >::type;
+	};
 };
 
 // member function pointer
 template <class C, class R, class... Args>
-struct function_traits<R (C::*)(Args...)> : public function_traits<
-                                                R(C&, Args...)> {};
+struct function_traits<R (C::*)(Args...)>       : public function_traits<R(C&, Args...)> {};
 
 // const member function pointer
 template <class C, class R, class... Args>
-struct function_traits<R (C::*)(Args...)
-                       const> : public function_traits<R(C&, Args...)> {};
+struct function_traits<R (C::*)(Args...) const> : public function_traits<R(C&, Args...)> {};
 
 // member object pointer
 template <class C, class R>
-struct function_traits<R(C::*)> : public function_traits<R(C&)> {};
+struct function_traits<R(C::*)>                 : public function_traits<R(C&)> {};
+
+template<class SimdFunc>
+void print_arity (SimdFunc f) {
+	std::cout << "Arity of function is " << (function_traits<SimdFunc>::arity) << std::endl;
+}
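
function_traits recovers the return type, the arity and the individual argument types of a callable, and the specializations above extend it from plain function types to function pointers, member-function pointers and member-object pointers; the test_op overloads below are then selected on function_traits<SimdFunc>::arity. A compile-time illustration, assuming the traits defined above and <type_traits> are in scope (add2 and scale1 are made-up sample functions):

    static double add2(double a, double b) { return a + b; }
    static double scale1(double a)         { return 2.0 * a; }

    // Both checks are evaluated at compile time; nothing runs.
    static_assert(function_traits<decltype(&add2)>::arity == 2,
                  "add2 takes two arguments");
    static_assert(std::is_same<function_traits<decltype(&scale1)>::argument<0>::type,
                               double>::value,
                  "the first argument of scale1 is a double");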
 
 /**************************************************************************************/
 
 template<class simd, class Element, class SimdFunc, class ScalFunc>
 inline
 typename std::enable_if<
-						(function_traits<SimdFunc>::arity == 1) &&
-						!(std::is_same<typename function_traits<SimdFunc>::return_type, void>::value)
-					   , bool>::type
-test_op(SimdFunc fsimd, ScalFunc fscal, size_t seed, size_t vectorSize, Element max, std::string name){
-	
+(function_traits<SimdFunc>::arity == 0) &&
+!(std::is_same<typename function_traits<SimdFunc>::return_type, void>::value)
+, bool>::type
+test_op(SimdFunc && fsimd, ScalFunc && fscal, size_t seed, size_t vectorSize, Element max, std::string name){
+
 	using vect_t = typename simd::vect_t;
 
-	std::mt19937 generator(seed);
- 	std::uniform_real_distribution<> dist(1, (int)max);
-
- 	std::vector<Element, AlignedAllocator<Element, Alignment::AVX>> a1(vectorSize), c1(vectorSize), a2(vectorSize), c2(vectorSize);
- 	std::generate(a1.begin(), a1.end(), [&](){return dist(generator);});
- 	a2 = a1;
-
- 	std::transform(a1.begin(), a1.end(), c1.begin(), fscal);
-
- 	vect_t va2, vc2;
- 	for(size_t i = 0 ; i < vectorSize ; i+=simd::vect_size){
- 		va2 = simd::load(a2.data()+i);
- 		vc2 = fsimd(va2);
- 		simd::store(c2.data()+i, vc2);
- 	}
-
- 	bool res = std::equal(c1.begin(), c1.end(), c2.begin(), [](Element x1, Element x2){return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;});
- 	if(!res)
- 	{
- 		std::cout << "Error Simd" << sizeof(typename simd::scalar_t)*simd::vect_size*8 << "::" << name << std::endl;
-  		std::copy(c1.begin(), c1.end(), std::ostream_iterator<Element>(std::cout, " "));
-  		std::cout << std::endl;
-  		std::copy(c2.begin(), c2.end(), std::ostream_iterator<Element>(std::cout, " "));
-  		std::cout << std::endl;
- 	}
- 	return res;
+	std::vector<Element, AlignedAllocator<Element, Alignment::AVX>> c1(vectorSize), c2(vectorSize);
+
+	std::transform(c1.begin(), c1.end(), c1.begin(), fscal);
+
+	vect_t vc2;
+	for(size_t i = 0 ; i < vectorSize ; i+=simd::vect_size){
+			vc2 = fsimd();
+			simd::store(c2.data()+i, vc2);
+		}
+
+	bool res = std::equal(c1.begin(), c1.end(), c2.begin(), [](Element x1, Element x2){return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;});
+	if(!res)
+		{
+			std::cout << "Error Simd" << sizeof(typename simd::scalar_t)*simd::vect_size*8 << "::" << name
+					  << " on " << (sizeof(Element) * 8) << "bits." << std::endl;
+
+			std::copy(c1.begin(), c1.end(), std::ostream_iterator<Element>(std::cout, " "));
+			std::cout << std::endl;
+			std::copy(c2.begin(), c2.end(), std::ostream_iterator<Element>(std::cout, " "));
+			std::cout << std::endl ;
+		}
+	return res;
 }
 
 template<class simd, class Element, class SimdFunc, class ScalFunc>
 inline
 typename std::enable_if<
-						(function_traits<SimdFunc>::arity == 0) &&
-						!(std::is_same<typename function_traits<SimdFunc>::return_type, void>::value)
-					   , bool>::type
-test_op(SimdFunc && fsimd, ScalFunc && fscal, size_t seed, size_t vectorSize, Element max, std::string name){
+(function_traits<SimdFunc>::arity == 1) &&
+!(std::is_same<typename function_traits<SimdFunc>::return_type, void>::value)
+, bool>::type
+test_op(SimdFunc fsimd, ScalFunc fscal, size_t seed, size_t vectorSize, Element max, std::string name){
 	
 	using vect_t = typename simd::vect_t;
 
 	std::mt19937 generator(seed);
- 	std::uniform_real_distribution<Element> dist(1, (int)max);
-
- 	std::vector<Element, AlignedAllocator<Element, Alignment::AVX>> c1(vectorSize), c2(vectorSize);
-
- 	std::transform(c1.begin(), c1.end(), c1.begin(), fscal);
-
- 	vect_t vc2;
- 	for(size_t i = 0 ; i < vectorSize ; i+=simd::vect_size){
- 		c2 = fsimd();
- 		simd::store(c2.data()+i, c2);
- 	}
-
- 	bool res = std::equal(c1.begin(), c1.end(), c2.begin(), [](Element x1, Element x2){return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;});
- 	if(!res)
- 	{
- 		std::cout << "Error Simd" << sizeof(typename simd::scalar_t)*simd::vect_size*8 << "::" << name << std::endl;
-  		std::copy(c1.begin(), c1.end(), std::ostream_iterator<Element>(std::cout, " "));
-  		std::cout << std::endl;
-  		std::copy(c2.begin(), c2.end(), std::ostream_iterator<Element>(std::cout, " "));
-  		std::cout << std::endl;
- 	}
- 	return res;
+	std::vector<Element, AlignedAllocator<Element, Alignment::AVX>> a1(vectorSize), c1(vectorSize), a2(vectorSize), c2(vectorSize), c3(vectorSize);
+	generate_random(a1, generator);
+	a2 = a1;
+
+	std::transform(a1.begin(), a1.end(), c1.begin(), fscal);
+
+	vect_t va2, vc2, vc3;
+	for(size_t i = 0 ; i < vectorSize ; i+=simd::vect_size){
+			va2 = simd::load(a2.data()+i);
+			vc3 = simd::load(c1.data()+i);
+			vc2 = fsimd(va2);
+			vc3 = simd::sub(vc3,vc2);
+			simd::store(c2.data()+i, vc2);
+			simd::store(c3.data()+i, vc3);
+		}
+
+	bool res = std::equal(c1.begin(), c1.end(), c2.begin(), [](Element x1, Element x2){return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;});
+	if(!res)
+		{
+			std::cout << "Error Simd" << sizeof(typename simd::scalar_t)*simd::vect_size*8 << "::" << name
+					  << " on " << (sizeof(Element) * 8) << "bits." << std::endl;
+
+			std::cout << "a2: ";
+			std::copy(a2.begin(), a2.end(), std::ostream_iterator<Element>(std::cout, " "));
+			std::cout << std::endl;
+			std::cout << "c1: ";
+			std::copy(c1.begin(), c1.end(), std::ostream_iterator<Element>(std::cout, " "));
+			std::cout << std::endl;
+			std::cout << "c2: ";
+			std::copy(c2.begin(), c2.end(), std::ostream_iterator<Element>(std::cout, " "));
+			std::cout << std::endl << std::endl;
+			std::cout << "c1-c2: ";
+			std::copy(c3.begin(), c3.end(), std::ostream_iterator<Element>(std::cout, " "));
+			std::cout << std::endl << std::endl;
+		}
+	return res;
 }
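
The std::equal predicate above deliberately treats a pair of NaNs as equal: on random floating-point inputs the scalar reference and the SIMD path can both legitimately produce NaN in the same lane, and since NaN == NaN is false a plain equality test would report a spurious mismatch. The same predicate in isolation (same_lane is a made-up name):

    #include <cmath>

    // Lane-wise acceptance used by test_op: equal values, or NaN on both sides.
    template <class Element>
    bool same_lane(Element x1, Element x2) {
        return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;
    }
    // same_lane(1.0, 2.0) is false; with NaN on both sides it is true,
    // whereas operator== alone would be false.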
 
 template<class simd, class Element, class SimdFunc, class ScalFunc>
 inline
 typename std::enable_if<
-						(function_traits<SimdFunc>::arity == 2) &&
-						!(std::is_same<typename function_traits<SimdFunc>::return_type, void>::value)
-					   , bool>::type
+(function_traits<SimdFunc>::arity == 2) &&
+!(std::is_same<typename function_traits<SimdFunc>::return_type, void>::value)
+, bool>::type
 test_op(SimdFunc fsimd, ScalFunc fscal, size_t seed, size_t vectorSize, Element max, std::string name){
 	
 	using vect_t = typename simd::vect_t;
 
 	std::mt19937 generator(seed);
- 	std::uniform_real_distribution<> dist(1, (int)max);
-
- 	std::vector<Element, AlignedAllocator<Element, Alignment::AVX>> a1(vectorSize), b1(vectorSize), c1(vectorSize), a2(vectorSize), b2(vectorSize), c2(vectorSize);
- 	std::generate(a1.begin(), a1.end(), [&](){return dist(generator);});
- 	std::generate(b1.begin(), b1.end(), [&](){return dist(generator);});
- 	a2 = a1;
- 	b2 = b1;
-
- 	std::transform(a1.begin(), a1.end(), b1.begin(), c1.begin(), fscal);
-
- 	vect_t va2, vb2, vc2;
- 	for(size_t i = 0 ; i < vectorSize ; i+=simd::vect_size){
- 		va2 = simd::load(a2.data()+i);
- 		vb2 = simd::load(b2.data()+i);
- 		vc2 = fsimd(va2, vb2);
- 		simd::store(c2.data()+i, vc2);
- 	}
-
- 	bool res = std::equal(c1.begin(), c1.end(), c2.begin(), [](Element x1, Element x2){return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;});
- 	if(!res)
- 	{
- 		std::cout << "Error Simd" << sizeof(typename simd::scalar_t)*simd::vect_size*8 << "::" << name << std::endl;
-  		std::copy(c1.begin(), c1.end(), std::ostream_iterator<Element>(std::cout, " "));
-  		std::cout << std::endl;
-  		std::copy(c2.begin(), c2.end(), std::ostream_iterator<Element>(std::cout, " "));
-  		std::cout << std::endl;
- 	}
- 	return res;
+	std::vector<Element, AlignedAllocator<Element, Alignment::AVX>> a1(vectorSize), b1(vectorSize), c1(vectorSize), a2(vectorSize), b2(vectorSize), c2(vectorSize), c3(vectorSize);
+	generate_random(a1, generator);
+	generate_random(b1, generator);
+	a2 = a1;
+	b2 = b1;
+
+	std::transform(a1.begin(), a1.end(), b1.begin(), c1.begin(), fscal);
+
+	vect_t va2, vb2, vc2, vc3;
+	for(size_t i = 0 ; i < vectorSize ; i+=simd::vect_size){
+			va2 = simd::load(a2.data()+i);
+			vb2 = simd::load(b2.data()+i);
+			vc3 = simd::load(c1.data()+i);
+			vc2 = fsimd(va2, vb2);
+			vc3 = simd::sub(vc3,vc2);
+			simd::store(c2.data()+i, vc2);
+			simd::store(c3.data()+i, vc3);
+		}
+
+	bool res = std::equal(c1.begin(), c1.end(), c2.begin(), [](Element x1, Element x2){return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;});
+	if(!res)
+		{
+			std::cout << "Error Simd" << sizeof(typename simd::scalar_t)*simd::vect_size*8 << "::" << name
+					  << " on " << (sizeof(Element) * 8) << "bits." << std::endl;
+
+			std::cout << "a2: ";
+			std::copy(a2.begin(), a2.end(), std::ostream_iterator<Element>(std::cout, " "));
+			std::cout << std::endl;
+			std::cout << "b2: ";
+			std::copy(b2.begin(), b2.end(), std::ostream_iterator<Element>(std::cout, " "));
+			std::cout << std::endl;
+			std::cout << "c1: ";
+			std::copy(c1.begin(), c1.end(), std::ostream_iterator<Element>(std::cout, " "));
+			std::cout << std::endl;
+			std::cout << "c2: ";
+			std::copy(c2.begin(), c2.end(), std::ostream_iterator<Element>(std::cout, " "));
+			std::cout << std::endl << std::endl;
+			std::cout << "c1-c2: ";
+			std::copy(c3.begin(), c3.end(), std::ostream_iterator<Element>(std::cout, " "));
+			std::cout << std::endl << std::endl;
+		}
+	return res;
 }
 
 template<class simd, class Element, class SimdFunc, class ScalFunc>
 inline
 typename std::enable_if<
-						(function_traits<SimdFunc>::arity == 3) &&
-						!(std::is_same<typename function_traits<SimdFunc>::return_type, void>::value)
-					   , bool>::type
+(function_traits<SimdFunc>::arity == 3) &&
+!(std::is_same<typename function_traits<SimdFunc>::return_type, void>::value)
+, bool>::type
 test_op(SimdFunc fsimd, ScalFunc fscal, size_t seed, size_t vectorSize, Element max, std::string name){
 	
 	using vect_t = typename simd::vect_t;
 
 	std::mt19937 generator(seed);
- 	std::uniform_real_distribution<> dist(1, (int)max);
-
- 	std::vector<Element, AlignedAllocator<Element, Alignment::AVX>> a1(vectorSize), b1(vectorSize), c1(vectorSize), d1(vectorSize), a2(vectorSize), b2(vectorSize), c2(vectorSize), d2(vectorSize);
- 	std::generate(a1.begin(), a1.end(), [&](){return dist(generator);});
- 	std::generate(b1.begin(), b1.end(), [&](){return dist(generator);});
- 	std::generate(c1.begin(), c1.end(), [&](){return dist(generator);});
- 	a2 = a1;
- 	b2 = b1;
- 	c2 = c1;
-
- 	for(size_t i = 0 ; i < vectorSize ; ++i){
- 		d1[i] = fscal(c1[i], a1[i], b1[i]);
- 	}
-
- 	vect_t va2, vb2, vc2;
- 	for(size_t i = 0 ; i < vectorSize ; i+=simd::vect_size){
- 		va2 = simd::load(a2.data()+i);
- 		vb2 = simd::load(b2.data()+i);
- 		vc2 = simd::load(c2.data()+i);
- 		simd::store(d2.data()+i, fsimd(vc2, va2, vb2));
- 	}
-
- 	bool res = std::equal(d1.begin(), d1.end(), d2.begin(), [](Element x1, Element x2){return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;});
- 	if(!res)
- 	{
- 		std::cout << "Error Simd" << sizeof(typename simd::scalar_t)*simd::vect_size*8 << "::" << name << std::endl;
-
-		std::transform(d1.begin(), d1.end(), d2.begin(), d2.begin(), [](Element x1, Element x2){return x1-x2;});		
-
-  		//std::copy(d1.begin(), d1.end(), std::ostream_iterator<Element>(std::cout, " "));
-  		//std::cout << std::endl;
-  		std::copy(d2.begin(), d2.end(), std::ostream_iterator<Element>(std::cout, " "));
-  		std::cout << std::endl;
- 	}
- 	return res;
+	std::vector<Element, AlignedAllocator<Element, Alignment::AVX>> a1(vectorSize), b1(vectorSize), c1(vectorSize), d1(vectorSize), a2(vectorSize), b2(vectorSize), c2(vectorSize), d2(vectorSize);
+	generate_random(a1, generator);
+	generate_random(b1, generator);
+	generate_random(c1, generator);
+	a2 = a1;
+	b2 = b1;
+	c2 = c1;
+
+	for(size_t i = 0 ; i < vectorSize ; ++i){
+			d1[i] = fscal(c1[i], a1[i], b1[i]);
+		}
+
+	vect_t va2, vb2, vc2;
+	for(size_t i = 0 ; i < vectorSize ; i+=simd::vect_size){
+			va2 = simd::load(a2.data()+i);
+			vb2 = simd::load(b2.data()+i);
+			vc2 = simd::load(c2.data()+i);
+			simd::store(d2.data()+i, fsimd(vc2, va2, vb2));
+		}
+
+	bool res = std::equal(d1.begin(), d1.end(), d2.begin(), [](Element x1, Element x2){return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;});
+	if(!res)
+		{
+			std::cout << "Error Simd" << sizeof(typename simd::scalar_t)*simd::vect_size*8 << "::" << name
+					  << " on " << (sizeof(Element) * 8) << "bits." << std::endl;
+
+			std::transform(d1.begin(), d1.end(), d2.begin(), d2.begin(), [](Element x1, Element x2){return x1-x2;});
+
+			//std::copy(d1.begin(), d1.end(), std::ostream_iterator<Element>(std::cout, " "));
+			//std::cout << std::endl;
+			std::copy(d2.begin(), d2.end(), std::ostream_iterator<Element>(std::cout, " "));
+			std::cout << std::endl;
+		}
+	return res;
 }
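
In this arity-3 overload the scalar reference is evaluated as fscal(c1[i], a1[i], b1[i]) and the vector result as fsimd(vc2, va2, vb2), so the accumulator comes first in both calls; with the lambdas used further down this corresponds to fmadd(c, a, b) = c + a*b, fmsub(c, a, b) = -c + a*b and fnmadd(c, a, b) = c - a*b. A scalar sketch of the three fused forms as they are checked here (the ref_* helpers are made-up names):

    // Scalar models of the three fused operations, accumulator first.
    template <class T> T ref_fmadd (T c, T a, T b) { return  c + a * b; }
    template <class T> T ref_fmsub (T c, T a, T b) { return -c + a * b; }
    template <class T> T ref_fnmadd(T c, T a, T b) { return  c - a * b; }

    // e.g. ref_fmadd(1, 2, 3) == 7, ref_fmsub(1, 2, 3) == 5, ref_fnmadd(1, 2, 3) == -5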
 
 
@@ -265,6 +314,9 @@ bool test_float_impl(size_t seed, size_t vectorSize, Element max){
 	return btest;
 }
 
+template<typename simd>
+typename simd::vect_t mysra (typename simd::vect_t x1){return simd::sra(x1, int(2));}
+
 template<class simd, class Element>
 bool test_integer_impl(size_t seed, size_t vectorSize, Element max){
 	bool btest = true;
@@ -272,14 +324,46 @@ bool test_integer_impl(size_t seed, size_t vectorSize, Element max){
 	btest &= test_op<simd>(simd::add, [](Element x1, Element x2){return x1+x2;}, seed, vectorSize, max, "add");
 	btest &= test_op<simd>(simd::sub, [](Element x1, Element x2){return x1-x2;}, seed, vectorSize, max, "sub");
 	btest &= test_op<simd>(simd::mullo, [](Element x1, Element x2){return x1*x2;}, seed, vectorSize, max, "mullo");
+	btest &= test_op<simd>(simd::mul, [](Element x1, Element x2){return x1*x2;}, seed, vectorSize, max, "mul");
 	btest &= test_op<simd>(simd::fmadd, [](Element x1, Element x2, Element x3){return x1+x3*x2;}, seed, vectorSize, max, "fmadd");
-	// btest &= test_op<simd>(simd::fmsub, [](Element x1, Element x2, Element x3){return -x1+x3*x2;}, seed, vectorSize, max, "fmsub");
-	// btest &= test_op<simd>(simd::fnmadd, [](Element x1, Element x2, Element x3){return x1-x3*x2;}, seed, vectorSize, max, "fnmadd");
+	btest &= test_op<simd>(simd::fmsub, [](Element x1, Element x2, Element x3){return -x1+x3*x2;}, seed, vectorSize, max, "fmsub");
+	btest &= test_op<simd>(simd::fnmadd, [](Element x1, Element x2, Element x3){return x1-x3*x2;}, seed, vectorSize, max, "fnmadd");
 	btest &= test_op<simd>(simd::lesser, [](Element x1, Element x2){return (x1<x2)?-1:0;}, seed, vectorSize, max, "lesser");
 	btest &= test_op<simd>(simd::lesser_eq, [](Element x1, Element x2){return (x1<=x2)?-1:0;}, seed, vectorSize, max, "lesser_eq");
 	btest &= test_op<simd>(simd::greater, [](Element x1, Element x2){return (x1>x2)?-1:0;}, seed, vectorSize, max, "greater");
 	btest &= test_op<simd>(simd::greater_eq, [](Element x1, Element x2){return (x1>=x2)?-1:0;}, seed, vectorSize, max, "greater_eq");
 	btest &= test_op<simd>(simd::eq, [](Element x1, Element x2){return (x1==x2)?-1:0;}, seed, vectorSize, max, "eq");
+	// print_arity(mysra<simd>);
+	btest &= test_op<simd>(mysra<simd>, //std::bind(simd::sra,std::placeholders::_1,int(sizeof(Element)*4)),
+		[](Element x1){
+			integer h = integer(1) << 2;
+			integer r = integer(x1) / h;
+			r -= ((integer(x1)-h*r) < 0)?1:0;
+			return Element(r);
+			// return Element(std::floor(double(x1)/double(h)));
+		}, seed, vectorSize, max, "sra");
+	btest &= test_op<simd>(simd::mulhi, [](Element x1, Element x2){
+			integer q, r;
+			integer a = (integer(x1)*integer(x2));
+			integer b = integer(1) << uint64_t(sizeof(Element)*8);
+			Givaro::IntegerDom Z;
+			Z.divmod(q, r, a, b);
+			return Element(q);
+		}, seed, vectorSize, max, "mulhi");
+	btest &= test_op<simd>(simd::mulx, [](Element x1, Element x2){
+			Element h = Element(1) << (sizeof(Element)*4);
+			/* Representative r of x1 modulo h with -h/2 <= r < h/2 */
+			if (std::is_signed<Element>::value) {
+				x1 = (x1+h/2)%h;
+				x1 += (x1 < 0)?h/2:-h/2;
+				x2 = (x2+h/2)%h;
+				x2 += (x2 < 0)?h/2:-h/2;
+			} else {
+				x1 = x1 % h;
+				x2 = x2 % h;
+			}
+			return x1*x2;
+		}, seed, vectorSize, max, "mulx");
 
 	return btest;
 }
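
The three reference lambdas added above describe integer operations with no direct C operator: sra is an arithmetic right shift by 2, computed as a floor division by 2^2 over Givaro integers so that negative inputs round toward minus infinity; mulhi keeps the quotient of the double-width product x1*x2 by 2^n for an n-bit Element, i.e. its high half; and mulx multiplies the low halves after reducing each operand modulo h = 2^(n/2) to a representative in [-h/2, h/2) (or [0, h) for unsigned types). A small worked sketch of the first two with plain 64-bit arithmetic on 16-bit elements (ref_sra2 and ref_mulhi are made-up names):

    #include <cstdint>
    #include <iostream>

    // Floor division by 4 models an arithmetic right shift by 2 (sra).
    static int64_t ref_sra2(int16_t x) { return (int64_t(x) - ((x % 4 + 4) % 4)) / 4; }

    // High 16 bits of the signed 32-bit product (mulhi on 16-bit lanes);
    // relies on >> being an arithmetic shift for signed operands.
    static int64_t ref_mulhi(int16_t x, int16_t y) { return (int64_t(x) * y) >> 16; }

    int main() {
        std::cout << ref_sra2(int16_t(-7))         << "\n";  // -2: rounds toward -infinity
        std::cout << (int16_t(-7) / 4)             << "\n";  // -1: plain C division truncates
        std::cout << ref_mulhi(int16_t(-300), 200) << "\n";  // -1: high half of -60000
    }

The contrast between the first two outputs is why the reference goes through floor division rather than the / operator: C integer division truncates toward zero, which differs from an arithmetic shift exactly on negative inputs.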
@@ -287,28 +371,35 @@ bool test_integer_impl(size_t seed, size_t vectorSize, Element max){
 template<class Element>
 bool test_float(size_t seed, size_t vectorSize, size_t max_){
 	bool sse = true, avx = true;
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 	sse = test_float_impl<Simd128<Element>>(seed, vectorSize, (Element)max_);
 	if(!sse)
 		std::cout << "bug sse" << std::endl;
 	else
 		std::cout << "SSE OK" << std::endl;
+#endif
+
+#ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS
 	avx = test_float_impl<Simd256<Element>>(seed, vectorSize, (Element)max_);
 	if(!avx)
 		std::cout << "bug avx" << std::endl;
 	else
 		std::cout << "AVX OK" << std::endl;
+#endif
 	return sse && avx;
 }
 
- template<class Element>
- bool test_integer(size_t seed, size_t vectorSize, size_t max_){
- 	bool sse = true, avx = true;
+template<class Element>
+bool test_integer(size_t seed, size_t vectorSize, size_t max_){
+	bool sse = true, avx = true;
+#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
 	sse = test_integer_impl<Simd128<Element>>(seed, vectorSize, (Element)max_);
 	if(!sse)
 		std::cout << "bug sse" << std::endl;
 	else
 		std::cout << "SSE OK" << std::endl;
-#ifdef __AVX2__	
+#endif
+#ifdef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS
 	avx = test_integer_impl<Simd256<Element>>(seed, vectorSize, (Element)max_);
 	if(!avx)
 		std::cout << "bug avx" << std::endl;
@@ -316,10 +407,10 @@ bool test_float(size_t seed, size_t vectorSize, size_t max_){
 		std::cout << "AVX OK" << std::endl;
 #endif
 	return sse && avx;
- }
+}
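
test_float and test_integer only exercise a SIMD width when the matching configure-time macro is defined, and each width's flag starts out true, so a width that was not compiled in counts as a pass rather than a failure or a skip. A minimal sketch of that guard pattern under the same macros (check_all_widths, run_sse_tests and run_avx2_tests are made-up stand-ins for the calls to test_integer_impl):

    // Hypothetical stand-ins for test_integer_impl<Simd128<T>> / test_integer_impl<Simd256<T>>.
    static bool run_sse_tests()  { return true; }
    static bool run_avx2_tests() { return true; }

    static bool check_all_widths() {
        // Each flag defaults to "pass" and is only overwritten when the
        // corresponding instruction set was detected at configure time,
        // so an absent ISA can never turn the overall result to false.
        bool sse = true, avx = true;
    #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS
        sse = run_sse_tests();
    #endif
    #ifdef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS
        avx = run_avx2_tests();
    #endif
        return sse && avx;
    }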
 
 
- int main(int ac, char **av) {
+int main(int ac, char **av) {
 	int seed = (int) time(NULL);
 	int vectorSize = 32;
 	int max = 100;
@@ -337,33 +428,33 @@ bool test_float(size_t seed, size_t vectorSize, size_t max_){
 	srand48(seed);
 
 	bool pass  = true ;
-	{ 
+	{
 		do{
-		{
-			pass &= test_float<float>(seed, vectorSize, max);
-		}
-		{
-			pass &= test_float<double>(seed, vectorSize, max);
-		}
-		{
-			pass &= test_integer<int16_t>(seed, vectorSize, max);
-		}
-		{
-			pass &= test_integer<int32_t>(seed, vectorSize, max);
-		}
-		{
-			pass &= test_integer<int64_t>(seed, vectorSize, max);
-		}
-		// {
-		// 	pass &= test_integer<uint16_t>(seed, vectorSize, max);
-		// }
-		// {
-		// 	pass &= test_integer<uint32_t>(seed, vectorSize, max);
-		// }
-		// {
-		// 	pass &= test_integer<uint64_t>(seed, vectorSize, max);
-		// }
-	}while(loop);
+				{
+					pass &= test_float<float>(seed, vectorSize, max);
+				}
+				{
+					pass &= test_float<double>(seed, vectorSize, max);
+				}
+				{
+					pass &= test_integer<int16_t>(seed, vectorSize, max);
+				}
+				{
+					pass &= test_integer<int32_t>(seed, vectorSize, max);
+				}
+				{
+					pass &= test_integer<int64_t>(seed, vectorSize, max);
+				}
+				{
+					pass &= test_integer<uint16_t>(seed, vectorSize, max);
+				}
+				{
+					pass &= test_integer<uint32_t>(seed, vectorSize, max);
+				}
+				{
+					pass &= test_integer<uint64_t>(seed, vectorSize, max);
+				}
+			}while(loop);
 	}
 	std::cout << std::boolalpha << pass << std::endl;
 	return (pass?0:1) ;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/fflas-ffpack.git


