[SCM] Fast arithmetic with dense matrices over F_{2^e} branch, upstream, updated. 9faf6ece9a183a703670566609063ab274b1c544

Martin Albrecht martinralbrecht at googlemail.com
Mon Sep 10 12:24:28 UTC 2012


The following commit has been merged in the upstream branch:
commit 9faf6ece9a183a703670566609063ab274b1c544
Author: Martin Albrecht <martinralbrecht at googlemail.com>
Date:   Fri Aug 17 16:38:51 2012 -0400

    conversion mzed_t <=> mzd_slice_t for up to degree 16

diff --git a/src/conversion.c b/src/conversion.c
index 19354ba..e5d10cf 100644
--- a/src/conversion.c
+++ b/src/conversion.c
@@ -19,14 +19,17 @@
 
 #include "conversion.h"
 
+static const word x80008000 = 0x8000800080008000ULL;
 static const word x80808080 = 0x8080808080808080ULL;
 static const word x88888888 = 0x8888888888888888ULL;
 static const word xaaaaaaaa = 0xaaaaaaaaaaaaaaaaULL;
 static const word xcccccccc = 0xccccccccccccccccULL;
+static const word xc0c0c0c0 = 0xc0c0c0c0c0c0c0c0ULL;
 static const word xf0f0f0f0 = 0xf0f0f0f0f0f0f0f0ULL;
 static const word xff00ff00 = 0xff00ff00ff00ff00ULL;
 static const word xffff0000 = 0xffff0000ffff0000ULL;
 static const word xffffffff = 0xffffffff00000000ULL;
+static const word x__left04 = 0xf000000000000000ULL;
 static const word x__left08 = 0xff00000000000000ULL;
 static const word x__left16 = 0xffff000000000000ULL;
 static const word x__left32 = 0xffffffff00000000ULL;
@@ -55,6 +58,12 @@ static inline word word_slice_64_08_l(word a) {
   return a;
 }
 
+static inline word word_slice_64_16_l(word a) {
+  a = (a & xffff0000) | (a & xffff0000>>16)<<15;
+  a = (a & xffffffff) | (a & xffffffff>>32)<<30;
+  return a;
+}
+
 static inline word word_cling_64_02_l(word a) {
   a = (a & xffff0000 & x__left32) | (a & (xffff0000>>16) & x__left32)>>16;
   a = (a & xff00ff00) | (a & xff00ff00>> 8)>> 8;
@@ -79,6 +88,12 @@ static inline word word_cling_64_08_l(word a) {
   return a;
 }
 
+static inline word word_cling_64_16_l(word a) {
+  a = (a & xcccccccc & x__left04) | (a & xcccccccc>> 2  & x__left04)>>30;
+  a = (a & xaaaaaaaa) | (a & xaaaaaaaa>> 1)>> 15;
+  return a;
+}
+
 mzd_slice_t *mzed_slice(mzd_slice_t *A, const mzed_t *Z) {
   if (A == NULL) {
     assert(Z->x->offset == 0);
@@ -90,14 +105,23 @@ mzd_slice_t *mzed_slice(mzd_slice_t *A, const mzed_t *Z) {
 
   switch(Z->finite_field->degree) {
   case  2: return _mzed_slice2(A,Z);
+
   case  3: return _mzed_slice4(A,Z);
   case  4: return _mzed_slice4(A,Z);
+
   case  5: return _mzed_slice8(A,Z);
   case  6: return _mzed_slice8(A,Z);
   case  7: return _mzed_slice8(A,Z);
   case  8: return _mzed_slice8(A,Z);
-  case  9:
-  case 10:
+
+  case  9: return _mzed_slice16(A,Z);
+  case 10: return _mzed_slice16(A,Z);
+  case 11: return _mzed_slice16(A,Z);
+  case 12: return _mzed_slice16(A,Z);
+  case 13: return _mzed_slice16(A,Z);
+  case 14: return _mzed_slice16(A,Z);
+  case 15: return _mzed_slice16(A,Z);
+  case 16: return _mzed_slice16(A,Z);
   default:
     m4ri_die("slicing not implemented for this degree");
   }
@@ -116,14 +140,23 @@ mzed_t *mzed_cling(mzed_t *A, const mzd_slice_t *Z) {
 
   switch(Z->finite_field->degree) {
   case  2: return _mzed_cling2(A,Z);
+
   case  3: return _mzed_cling4(A,Z);
   case  4: return _mzed_cling4(A,Z);
+
   case  5: return _mzed_cling8(A,Z);
   case  6: return _mzed_cling8(A,Z);
   case  7: return _mzed_cling8(A,Z);
   case  8: return _mzed_cling8(A,Z);
-  case  9:
-  case 10:
+
+  case  9: return _mzed_cling16(A,Z);
+  case 10: return _mzed_cling16(A,Z);
+  case 11: return _mzed_cling16(A,Z);
+  case 12: return _mzed_cling16(A,Z);
+  case 13: return _mzed_cling16(A,Z);
+  case 14: return _mzed_cling16(A,Z);
+  case 15: return _mzed_cling16(A,Z);
+  case 16: return _mzed_cling16(A,Z);
   default:
     m4ri_die("clinging not implemented for this degree");
   }
@@ -143,7 +176,7 @@ mzd_slice_t *_mzed_slice2(mzd_slice_t *T, const mzed_t *F) {
   for(size_t i=0; i<T->nrows; i++) {
     word *t0 = T->x[0]->rows[i];
     word *t1 = T->x[1]->rows[i];
-    const word *f  = F->x->rows[i];    
+    const word *f  = F->x->rows[i];
 
     /* bulk of work */
     for(j=0, j2=0; j+2 < F->x->width; j+=2,j2++) {
@@ -186,7 +219,7 @@ mzd_slice_t *_mzed_slice2(mzd_slice_t *T, const mzed_t *F) {
       m4ri_die("impossible");
     }
   }
-  
+
   return T;
 }
 
@@ -243,26 +276,26 @@ mzd_slice_t *_mzed_slice4(mzd_slice_t *T, const mzed_t *F) {
       /* bulk of work */
       for(j=0, j2=0; j+4 < F->x->width; j+=4,j2++) {
         t0[j2] = word_slice_64_04_l(f[j+0]<<3 & x88888888)>>48 | word_slice_64_04_l(f[j+1]<<3 & x88888888)>>32 \
-          |      word_slice_64_04_l(f[j+2]<<3 & x88888888)>>16 | word_slice_64_04_l(f[j+3]<<3 & x88888888)>> 0; 
+          |      word_slice_64_04_l(f[j+2]<<3 & x88888888)>>16 | word_slice_64_04_l(f[j+3]<<3 & x88888888)>> 0;
         t1[j2] = word_slice_64_04_l(f[j+0]<<2 & x88888888)>>48 | word_slice_64_04_l(f[j+1]<<2 & x88888888)>>32 \
-          |      word_slice_64_04_l(f[j+2]<<2 & x88888888)>>16 | word_slice_64_04_l(f[j+3]<<2 & x88888888)>> 0; 
+          |      word_slice_64_04_l(f[j+2]<<2 & x88888888)>>16 | word_slice_64_04_l(f[j+3]<<2 & x88888888)>> 0;
         t2[j2] = word_slice_64_04_l(f[j+0]<<1 & x88888888)>>48 | word_slice_64_04_l(f[j+1]<<1 & x88888888)>>32 \
           |      word_slice_64_04_l(f[j+2]<<1 & x88888888)>>16 | word_slice_64_04_l(f[j+3]<<1 & x88888888)>> 0;
       }
       r0 = r1 = r2 = 0;
       switch(F->x->width - j) {
       case 4:
-        r0 |= word_slice_64_04_l(f[j+3]<<3 & x88888888)>> 0; 
-        r1 |= word_slice_64_04_l(f[j+3]<<2 & x88888888)>> 0; 
-        r2 |= word_slice_64_04_l(f[j+3]<<1 & x88888888)>> 0; 
+        r0 |= word_slice_64_04_l(f[j+3]<<3 & x88888888)>> 0;
+        r1 |= word_slice_64_04_l(f[j+3]<<2 & x88888888)>> 0;
+        r2 |= word_slice_64_04_l(f[j+3]<<1 & x88888888)>> 0;
       case 3:
-        r0 |= word_slice_64_04_l(f[j+2]<<3 & x88888888)>>16; 
-        r1 |= word_slice_64_04_l(f[j+2]<<2 & x88888888)>>16; 
-        r2 |= word_slice_64_04_l(f[j+2]<<1 & x88888888)>>16; 
+        r0 |= word_slice_64_04_l(f[j+2]<<3 & x88888888)>>16;
+        r1 |= word_slice_64_04_l(f[j+2]<<2 & x88888888)>>16;
+        r2 |= word_slice_64_04_l(f[j+2]<<1 & x88888888)>>16;
       case 2:
-        r0 |= word_slice_64_04_l(f[j+1]<<3 & x88888888)>>32; 
-        r1 |= word_slice_64_04_l(f[j+1]<<2 & x88888888)>>32; 
-        r2 |= word_slice_64_04_l(f[j+1]<<1 & x88888888)>>32; 
+        r0 |= word_slice_64_04_l(f[j+1]<<3 & x88888888)>>32;
+        r1 |= word_slice_64_04_l(f[j+1]<<2 & x88888888)>>32;
+        r2 |= word_slice_64_04_l(f[j+1]<<1 & x88888888)>>32;
       case 1:
         r0 |= word_slice_64_04_l(f[j+0]<<3 & x88888888)>>48;
         r1 |= word_slice_64_04_l(f[j+0]<<2 & x88888888)>>48;
@@ -286,31 +319,31 @@ mzd_slice_t *_mzed_slice4(mzd_slice_t *T, const mzed_t *F) {
       /* bulk of work */
       for(j=0, j2=0; j+4 < F->x->width; j+=4,j2++) {
         t0[j2] = word_slice_64_04_l(f[j+0]<<3 & x88888888)>>48 | word_slice_64_04_l(f[j+1]<<3 & x88888888)>>32 \
-          |      word_slice_64_04_l(f[j+2]<<3 & x88888888)>>16 | word_slice_64_04_l(f[j+3]<<3 & x88888888)>> 0; 
+          |      word_slice_64_04_l(f[j+2]<<3 & x88888888)>>16 | word_slice_64_04_l(f[j+3]<<3 & x88888888)>> 0;
         t1[j2] = word_slice_64_04_l(f[j+0]<<2 & x88888888)>>48 | word_slice_64_04_l(f[j+1]<<2 & x88888888)>>32 \
-          |      word_slice_64_04_l(f[j+2]<<2 & x88888888)>>16 | word_slice_64_04_l(f[j+3]<<2 & x88888888)>> 0; 
+          |      word_slice_64_04_l(f[j+2]<<2 & x88888888)>>16 | word_slice_64_04_l(f[j+3]<<2 & x88888888)>> 0;
         t2[j2] = word_slice_64_04_l(f[j+0]<<1 & x88888888)>>48 | word_slice_64_04_l(f[j+1]<<1 & x88888888)>>32 \
           |      word_slice_64_04_l(f[j+2]<<1 & x88888888)>>16 | word_slice_64_04_l(f[j+3]<<1 & x88888888)>> 0;
         t3[j2] = word_slice_64_04_l(f[j+0]<<0 & x88888888)>>48 | word_slice_64_04_l(f[j+1]<<0 & x88888888)>>32 \
-          |      word_slice_64_04_l(f[j+2]<<0 & x88888888)>>16 | word_slice_64_04_l(f[j+3]<<0 & x88888888)>> 0; 
+          |      word_slice_64_04_l(f[j+2]<<0 & x88888888)>>16 | word_slice_64_04_l(f[j+3]<<0 & x88888888)>> 0;
       }
       r0 = r1 = r2 = r3 = 0;
       switch(F->x->width - j) {
       case 4:
-        r0 |= word_slice_64_04_l(f[j+3]<<3 & x88888888)>> 0; 
-        r1 |= word_slice_64_04_l(f[j+3]<<2 & x88888888)>> 0; 
-        r2 |= word_slice_64_04_l(f[j+3]<<1 & x88888888)>> 0; 
-        r3 |= word_slice_64_04_l(f[j+3]<<0 & x88888888)>> 0; 
+        r0 |= word_slice_64_04_l(f[j+3]<<3 & x88888888)>> 0;
+        r1 |= word_slice_64_04_l(f[j+3]<<2 & x88888888)>> 0;
+        r2 |= word_slice_64_04_l(f[j+3]<<1 & x88888888)>> 0;
+        r3 |= word_slice_64_04_l(f[j+3]<<0 & x88888888)>> 0;
       case 3:
-        r0 |= word_slice_64_04_l(f[j+2]<<3 & x88888888)>>16; 
-        r1 |= word_slice_64_04_l(f[j+2]<<2 & x88888888)>>16; 
-        r2 |= word_slice_64_04_l(f[j+2]<<1 & x88888888)>>16; 
-        r3 |= word_slice_64_04_l(f[j+2]<<0 & x88888888)>>16; 
+        r0 |= word_slice_64_04_l(f[j+2]<<3 & x88888888)>>16;
+        r1 |= word_slice_64_04_l(f[j+2]<<2 & x88888888)>>16;
+        r2 |= word_slice_64_04_l(f[j+2]<<1 & x88888888)>>16;
+        r3 |= word_slice_64_04_l(f[j+2]<<0 & x88888888)>>16;
       case 2:
-        r0 |= word_slice_64_04_l(f[j+1]<<3 & x88888888)>>32; 
-        r1 |= word_slice_64_04_l(f[j+1]<<2 & x88888888)>>32; 
-        r2 |= word_slice_64_04_l(f[j+1]<<1 & x88888888)>>32; 
-        r3 |= word_slice_64_04_l(f[j+1]<<0 & x88888888)>>32; 
+        r0 |= word_slice_64_04_l(f[j+1]<<3 & x88888888)>>32;
+        r1 |= word_slice_64_04_l(f[j+1]<<2 & x88888888)>>32;
+        r2 |= word_slice_64_04_l(f[j+1]<<1 & x88888888)>>32;
+        r3 |= word_slice_64_04_l(f[j+1]<<0 & x88888888)>>32;
       case 1:
         r0 |= word_slice_64_04_l(f[j+0]<<3 & x88888888)>>48;
         r1 |= word_slice_64_04_l(f[j+0]<<2 & x88888888)>>48;
@@ -323,7 +356,7 @@ mzd_slice_t *_mzed_slice4(mzd_slice_t *T, const mzed_t *F) {
       t0[j2] |= r0 & bitmask_end;
       t1[j2] |= r1 & bitmask_end;
       t2[j2] |= r2 & bitmask_end;
-      t3[j2] |= r3 & bitmask_end;    
+      t3[j2] |= r3 & bitmask_end;
     }
   }
   return T;
@@ -469,7 +502,7 @@ mzd_slice_t *_mzed_slice8(mzd_slice_t *T, const mzed_t *F) {
           t3[j2] |= word_slice_64_08_l(f[j+0]<<4 & x80808080)>>56 | word_slice_64_08_l(f[j+1]<<4 & x80808080)>>48 \
             |       word_slice_64_08_l(f[j+2]<<4 & x80808080)>>40 | word_slice_64_08_l(f[j+3]<<4 & x80808080)>>32 \
             |       word_slice_64_08_l(f[j+4]<<4 & x80808080)>>24 | word_slice_64_08_l(f[j+5]<<4 & x80808080)>>16 \
-            |       word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8 | word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0;          
+            |       word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8 | word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0;
 
           t4[j2] |= word_slice_64_08_l(f[j+0]<<3 & x80808080)>>56 | word_slice_64_08_l(f[j+1]<<3 & x80808080)>>48 \
             |       word_slice_64_08_l(f[j+2]<<3 & x80808080)>>40 | word_slice_64_08_l(f[j+3]<<3 & x80808080)>>32 \
@@ -489,82 +522,82 @@ mzd_slice_t *_mzed_slice8(mzd_slice_t *T, const mzed_t *F) {
           t7[j2] |= word_slice_64_08_l(f[j+0]<<0 & x80808080)>>56 | word_slice_64_08_l(f[j+1]<<0 & x80808080)>>48 \
             |       word_slice_64_08_l(f[j+2]<<0 & x80808080)>>40 | word_slice_64_08_l(f[j+3]<<0 & x80808080)>>32 \
             |       word_slice_64_08_l(f[j+4]<<0 & x80808080)>>24 | word_slice_64_08_l(f[j+5]<<0 & x80808080)>>16 \
-            |       word_slice_64_08_l(f[j+6]<<0 & x80808080)>> 8 | word_slice_64_08_l(f[j+7]<<0 & x80808080)>> 0;          
+            |       word_slice_64_08_l(f[j+6]<<0 & x80808080)>> 8 | word_slice_64_08_l(f[j+7]<<0 & x80808080)>> 0;
         }
         r0 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0;
         switch(F->x->width - j) {
         case 8:
-          r0 |= word_slice_64_08_l(f[j+7]<<7 & x80808080)>> 0; 
-          r1 |= word_slice_64_08_l(f[j+7]<<6 & x80808080)>> 0; 
-          r2 |= word_slice_64_08_l(f[j+7]<<5 & x80808080)>> 0; 
-          r3 |= word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0; 
-          r4 |= word_slice_64_08_l(f[j+7]<<3 & x80808080)>> 0; 
-          r5 |= word_slice_64_08_l(f[j+7]<<2 & x80808080)>> 0; 
-          r6 |= word_slice_64_08_l(f[j+7]<<1 & x80808080)>> 0; 
-          r7 |= word_slice_64_08_l(f[j+7]<<0 & x80808080)>> 0; 
+          r0 |= word_slice_64_08_l(f[j+7]<<7 & x80808080)>> 0;
+          r1 |= word_slice_64_08_l(f[j+7]<<6 & x80808080)>> 0;
+          r2 |= word_slice_64_08_l(f[j+7]<<5 & x80808080)>> 0;
+          r3 |= word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0;
+          r4 |= word_slice_64_08_l(f[j+7]<<3 & x80808080)>> 0;
+          r5 |= word_slice_64_08_l(f[j+7]<<2 & x80808080)>> 0;
+          r6 |= word_slice_64_08_l(f[j+7]<<1 & x80808080)>> 0;
+          r7 |= word_slice_64_08_l(f[j+7]<<0 & x80808080)>> 0;
         case 7:
-          r0 |= word_slice_64_08_l(f[j+6]<<7 & x80808080)>> 8; 
-          r1 |= word_slice_64_08_l(f[j+6]<<6 & x80808080)>> 8; 
-          r2 |= word_slice_64_08_l(f[j+6]<<5 & x80808080)>> 8; 
-          r3 |= word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8; 
-          r4 |= word_slice_64_08_l(f[j+6]<<3 & x80808080)>> 8; 
-          r5 |= word_slice_64_08_l(f[j+6]<<2 & x80808080)>> 8; 
-          r6 |= word_slice_64_08_l(f[j+6]<<1 & x80808080)>> 8; 
-          r7 |= word_slice_64_08_l(f[j+6]<<0 & x80808080)>> 8; 
+          r0 |= word_slice_64_08_l(f[j+6]<<7 & x80808080)>> 8;
+          r1 |= word_slice_64_08_l(f[j+6]<<6 & x80808080)>> 8;
+          r2 |= word_slice_64_08_l(f[j+6]<<5 & x80808080)>> 8;
+          r3 |= word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8;
+          r4 |= word_slice_64_08_l(f[j+6]<<3 & x80808080)>> 8;
+          r5 |= word_slice_64_08_l(f[j+6]<<2 & x80808080)>> 8;
+          r6 |= word_slice_64_08_l(f[j+6]<<1 & x80808080)>> 8;
+          r7 |= word_slice_64_08_l(f[j+6]<<0 & x80808080)>> 8;
         case 6:
-          r0 |= word_slice_64_08_l(f[j+5]<<7 & x80808080)>>16; 
-          r1 |= word_slice_64_08_l(f[j+5]<<6 & x80808080)>>16; 
-          r2 |= word_slice_64_08_l(f[j+5]<<5 & x80808080)>>16; 
-          r3 |= word_slice_64_08_l(f[j+5]<<4 & x80808080)>>16; 
-          r4 |= word_slice_64_08_l(f[j+5]<<3 & x80808080)>>16; 
-          r5 |= word_slice_64_08_l(f[j+5]<<2 & x80808080)>>16; 
-          r6 |= word_slice_64_08_l(f[j+5]<<1 & x80808080)>>16; 
-          r7 |= word_slice_64_08_l(f[j+5]<<0 & x80808080)>>16; 
+          r0 |= word_slice_64_08_l(f[j+5]<<7 & x80808080)>>16;
+          r1 |= word_slice_64_08_l(f[j+5]<<6 & x80808080)>>16;
+          r2 |= word_slice_64_08_l(f[j+5]<<5 & x80808080)>>16;
+          r3 |= word_slice_64_08_l(f[j+5]<<4 & x80808080)>>16;
+          r4 |= word_slice_64_08_l(f[j+5]<<3 & x80808080)>>16;
+          r5 |= word_slice_64_08_l(f[j+5]<<2 & x80808080)>>16;
+          r6 |= word_slice_64_08_l(f[j+5]<<1 & x80808080)>>16;
+          r7 |= word_slice_64_08_l(f[j+5]<<0 & x80808080)>>16;
         case 5:
-          r0 |= word_slice_64_08_l(f[j+4]<<7 & x80808080)>>24; 
-          r1 |= word_slice_64_08_l(f[j+4]<<6 & x80808080)>>24; 
-          r2 |= word_slice_64_08_l(f[j+4]<<5 & x80808080)>>24; 
-          r3 |= word_slice_64_08_l(f[j+4]<<4 & x80808080)>>24; 
-          r4 |= word_slice_64_08_l(f[j+4]<<3 & x80808080)>>24; 
-          r5 |= word_slice_64_08_l(f[j+4]<<2 & x80808080)>>24; 
-          r6 |= word_slice_64_08_l(f[j+4]<<1 & x80808080)>>24; 
-          r7 |= word_slice_64_08_l(f[j+4]<<0 & x80808080)>>24; 
+          r0 |= word_slice_64_08_l(f[j+4]<<7 & x80808080)>>24;
+          r1 |= word_slice_64_08_l(f[j+4]<<6 & x80808080)>>24;
+          r2 |= word_slice_64_08_l(f[j+4]<<5 & x80808080)>>24;
+          r3 |= word_slice_64_08_l(f[j+4]<<4 & x80808080)>>24;
+          r4 |= word_slice_64_08_l(f[j+4]<<3 & x80808080)>>24;
+          r5 |= word_slice_64_08_l(f[j+4]<<2 & x80808080)>>24;
+          r6 |= word_slice_64_08_l(f[j+4]<<1 & x80808080)>>24;
+          r7 |= word_slice_64_08_l(f[j+4]<<0 & x80808080)>>24;
         case 4:
-          r0 |= word_slice_64_08_l(f[j+3]<<7 & x80808080)>>32; 
-          r1 |= word_slice_64_08_l(f[j+3]<<6 & x80808080)>>32; 
-          r2 |= word_slice_64_08_l(f[j+3]<<5 & x80808080)>>32; 
-          r3 |= word_slice_64_08_l(f[j+3]<<4 & x80808080)>>32; 
-          r4 |= word_slice_64_08_l(f[j+3]<<3 & x80808080)>>32; 
-          r5 |= word_slice_64_08_l(f[j+3]<<2 & x80808080)>>32; 
-          r6 |= word_slice_64_08_l(f[j+3]<<1 & x80808080)>>32; 
-          r7 |= word_slice_64_08_l(f[j+3]<<0 & x80808080)>>32; 
+          r0 |= word_slice_64_08_l(f[j+3]<<7 & x80808080)>>32;
+          r1 |= word_slice_64_08_l(f[j+3]<<6 & x80808080)>>32;
+          r2 |= word_slice_64_08_l(f[j+3]<<5 & x80808080)>>32;
+          r3 |= word_slice_64_08_l(f[j+3]<<4 & x80808080)>>32;
+          r4 |= word_slice_64_08_l(f[j+3]<<3 & x80808080)>>32;
+          r5 |= word_slice_64_08_l(f[j+3]<<2 & x80808080)>>32;
+          r6 |= word_slice_64_08_l(f[j+3]<<1 & x80808080)>>32;
+          r7 |= word_slice_64_08_l(f[j+3]<<0 & x80808080)>>32;
         case 3:
-          r0 |= word_slice_64_08_l(f[j+2]<<7 & x80808080)>>40; 
-          r1 |= word_slice_64_08_l(f[j+2]<<6 & x80808080)>>40; 
-          r2 |= word_slice_64_08_l(f[j+2]<<5 & x80808080)>>40; 
-          r3 |= word_slice_64_08_l(f[j+2]<<4 & x80808080)>>40; 
-          r4 |= word_slice_64_08_l(f[j+2]<<3 & x80808080)>>40; 
-          r5 |= word_slice_64_08_l(f[j+2]<<2 & x80808080)>>40; 
-          r6 |= word_slice_64_08_l(f[j+2]<<1 & x80808080)>>40; 
-          r7 |= word_slice_64_08_l(f[j+2]<<0 & x80808080)>>40; 
+          r0 |= word_slice_64_08_l(f[j+2]<<7 & x80808080)>>40;
+          r1 |= word_slice_64_08_l(f[j+2]<<6 & x80808080)>>40;
+          r2 |= word_slice_64_08_l(f[j+2]<<5 & x80808080)>>40;
+          r3 |= word_slice_64_08_l(f[j+2]<<4 & x80808080)>>40;
+          r4 |= word_slice_64_08_l(f[j+2]<<3 & x80808080)>>40;
+          r5 |= word_slice_64_08_l(f[j+2]<<2 & x80808080)>>40;
+          r6 |= word_slice_64_08_l(f[j+2]<<1 & x80808080)>>40;
+          r7 |= word_slice_64_08_l(f[j+2]<<0 & x80808080)>>40;
         case 2:
-          r0 |= word_slice_64_08_l(f[j+1]<<7 & x80808080)>>48; 
-          r1 |= word_slice_64_08_l(f[j+1]<<6 & x80808080)>>48; 
-          r2 |= word_slice_64_08_l(f[j+1]<<5 & x80808080)>>48; 
-          r3 |= word_slice_64_08_l(f[j+1]<<4 & x80808080)>>48; 
-          r4 |= word_slice_64_08_l(f[j+1]<<3 & x80808080)>>48; 
-          r5 |= word_slice_64_08_l(f[j+1]<<2 & x80808080)>>48; 
-          r6 |= word_slice_64_08_l(f[j+1]<<1 & x80808080)>>48; 
-          r7 |= word_slice_64_08_l(f[j+1]<<0 & x80808080)>>48; 
+          r0 |= word_slice_64_08_l(f[j+1]<<7 & x80808080)>>48;
+          r1 |= word_slice_64_08_l(f[j+1]<<6 & x80808080)>>48;
+          r2 |= word_slice_64_08_l(f[j+1]<<5 & x80808080)>>48;
+          r3 |= word_slice_64_08_l(f[j+1]<<4 & x80808080)>>48;
+          r4 |= word_slice_64_08_l(f[j+1]<<3 & x80808080)>>48;
+          r5 |= word_slice_64_08_l(f[j+1]<<2 & x80808080)>>48;
+          r6 |= word_slice_64_08_l(f[j+1]<<1 & x80808080)>>48;
+          r7 |= word_slice_64_08_l(f[j+1]<<0 & x80808080)>>48;
         case 1:
-          r0 |= word_slice_64_08_l(f[j+0]<<7 & x80808080)>>56; 
-          r1 |= word_slice_64_08_l(f[j+0]<<6 & x80808080)>>56; 
-          r2 |= word_slice_64_08_l(f[j+0]<<5 & x80808080)>>56; 
-          r3 |= word_slice_64_08_l(f[j+0]<<4 & x80808080)>>56; 
-          r4 |= word_slice_64_08_l(f[j+0]<<3 & x80808080)>>56; 
-          r5 |= word_slice_64_08_l(f[j+0]<<2 & x80808080)>>56; 
-          r6 |= word_slice_64_08_l(f[j+0]<<1 & x80808080)>>56; 
-          r7 |= word_slice_64_08_l(f[j+0]<<0 & x80808080)>>56; 
+          r0 |= word_slice_64_08_l(f[j+0]<<7 & x80808080)>>56;
+          r1 |= word_slice_64_08_l(f[j+0]<<6 & x80808080)>>56;
+          r2 |= word_slice_64_08_l(f[j+0]<<5 & x80808080)>>56;
+          r3 |= word_slice_64_08_l(f[j+0]<<4 & x80808080)>>56;
+          r4 |= word_slice_64_08_l(f[j+0]<<3 & x80808080)>>56;
+          r5 |= word_slice_64_08_l(f[j+0]<<2 & x80808080)>>56;
+          r6 |= word_slice_64_08_l(f[j+0]<<1 & x80808080)>>56;
+          r7 |= word_slice_64_08_l(f[j+0]<<0 & x80808080)>>56;
           break;
         default:
           m4ri_die("impossible");
@@ -612,7 +645,7 @@ mzd_slice_t *_mzed_slice8(mzd_slice_t *T, const mzed_t *F) {
           t3[j2] |= word_slice_64_08_l(f[j+0]<<4 & x80808080)>>56 | word_slice_64_08_l(f[j+1]<<4 & x80808080)>>48 \
             |       word_slice_64_08_l(f[j+2]<<4 & x80808080)>>40 | word_slice_64_08_l(f[j+3]<<4 & x80808080)>>32 \
             |       word_slice_64_08_l(f[j+4]<<4 & x80808080)>>24 | word_slice_64_08_l(f[j+5]<<4 & x80808080)>>16 \
-            |       word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8 | word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0;          
+            |       word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8 | word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0;
 
           t4[j2] |= word_slice_64_08_l(f[j+0]<<3 & x80808080)>>56 | word_slice_64_08_l(f[j+1]<<3 & x80808080)>>48 \
             |       word_slice_64_08_l(f[j+2]<<3 & x80808080)>>40 | word_slice_64_08_l(f[j+3]<<3 & x80808080)>>32 \
@@ -632,69 +665,69 @@ mzd_slice_t *_mzed_slice8(mzd_slice_t *T, const mzed_t *F) {
         r0 = r1 = r2 = r3 = r4 = r5 = r6 = 0;
         switch(F->x->width - j) {
         case 8:
-          r0 |= word_slice_64_08_l(f[j+7]<<7 & x80808080)>> 0; 
-          r1 |= word_slice_64_08_l(f[j+7]<<6 & x80808080)>> 0; 
-          r2 |= word_slice_64_08_l(f[j+7]<<5 & x80808080)>> 0; 
-          r3 |= word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0; 
-          r4 |= word_slice_64_08_l(f[j+7]<<3 & x80808080)>> 0; 
-          r5 |= word_slice_64_08_l(f[j+7]<<2 & x80808080)>> 0; 
-          r6 |= word_slice_64_08_l(f[j+7]<<1 & x80808080)>> 0; 
+          r0 |= word_slice_64_08_l(f[j+7]<<7 & x80808080)>> 0;
+          r1 |= word_slice_64_08_l(f[j+7]<<6 & x80808080)>> 0;
+          r2 |= word_slice_64_08_l(f[j+7]<<5 & x80808080)>> 0;
+          r3 |= word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0;
+          r4 |= word_slice_64_08_l(f[j+7]<<3 & x80808080)>> 0;
+          r5 |= word_slice_64_08_l(f[j+7]<<2 & x80808080)>> 0;
+          r6 |= word_slice_64_08_l(f[j+7]<<1 & x80808080)>> 0;
         case 7:
-          r0 |= word_slice_64_08_l(f[j+6]<<7 & x80808080)>> 8; 
-          r1 |= word_slice_64_08_l(f[j+6]<<6 & x80808080)>> 8; 
-          r2 |= word_slice_64_08_l(f[j+6]<<5 & x80808080)>> 8; 
-          r3 |= word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8; 
-          r4 |= word_slice_64_08_l(f[j+6]<<3 & x80808080)>> 8; 
-          r5 |= word_slice_64_08_l(f[j+6]<<2 & x80808080)>> 8; 
-          r6 |= word_slice_64_08_l(f[j+6]<<1 & x80808080)>> 8; 
+          r0 |= word_slice_64_08_l(f[j+6]<<7 & x80808080)>> 8;
+          r1 |= word_slice_64_08_l(f[j+6]<<6 & x80808080)>> 8;
+          r2 |= word_slice_64_08_l(f[j+6]<<5 & x80808080)>> 8;
+          r3 |= word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8;
+          r4 |= word_slice_64_08_l(f[j+6]<<3 & x80808080)>> 8;
+          r5 |= word_slice_64_08_l(f[j+6]<<2 & x80808080)>> 8;
+          r6 |= word_slice_64_08_l(f[j+6]<<1 & x80808080)>> 8;
         case 6:
-          r0 |= word_slice_64_08_l(f[j+5]<<7 & x80808080)>>16; 
-          r1 |= word_slice_64_08_l(f[j+5]<<6 & x80808080)>>16; 
-          r2 |= word_slice_64_08_l(f[j+5]<<5 & x80808080)>>16; 
-          r3 |= word_slice_64_08_l(f[j+5]<<4 & x80808080)>>16; 
-          r4 |= word_slice_64_08_l(f[j+5]<<3 & x80808080)>>16; 
-          r5 |= word_slice_64_08_l(f[j+5]<<2 & x80808080)>>16; 
-          r6 |= word_slice_64_08_l(f[j+5]<<1 & x80808080)>>16; 
+          r0 |= word_slice_64_08_l(f[j+5]<<7 & x80808080)>>16;
+          r1 |= word_slice_64_08_l(f[j+5]<<6 & x80808080)>>16;
+          r2 |= word_slice_64_08_l(f[j+5]<<5 & x80808080)>>16;
+          r3 |= word_slice_64_08_l(f[j+5]<<4 & x80808080)>>16;
+          r4 |= word_slice_64_08_l(f[j+5]<<3 & x80808080)>>16;
+          r5 |= word_slice_64_08_l(f[j+5]<<2 & x80808080)>>16;
+          r6 |= word_slice_64_08_l(f[j+5]<<1 & x80808080)>>16;
         case 5:
-          r0 |= word_slice_64_08_l(f[j+4]<<7 & x80808080)>>24; 
-          r1 |= word_slice_64_08_l(f[j+4]<<6 & x80808080)>>24; 
-          r2 |= word_slice_64_08_l(f[j+4]<<5 & x80808080)>>24; 
-          r3 |= word_slice_64_08_l(f[j+4]<<4 & x80808080)>>24; 
-          r4 |= word_slice_64_08_l(f[j+4]<<3 & x80808080)>>24; 
-          r5 |= word_slice_64_08_l(f[j+4]<<2 & x80808080)>>24; 
-          r6 |= word_slice_64_08_l(f[j+4]<<1 & x80808080)>>24; 
+          r0 |= word_slice_64_08_l(f[j+4]<<7 & x80808080)>>24;
+          r1 |= word_slice_64_08_l(f[j+4]<<6 & x80808080)>>24;
+          r2 |= word_slice_64_08_l(f[j+4]<<5 & x80808080)>>24;
+          r3 |= word_slice_64_08_l(f[j+4]<<4 & x80808080)>>24;
+          r4 |= word_slice_64_08_l(f[j+4]<<3 & x80808080)>>24;
+          r5 |= word_slice_64_08_l(f[j+4]<<2 & x80808080)>>24;
+          r6 |= word_slice_64_08_l(f[j+4]<<1 & x80808080)>>24;
         case 4:
-          r0 |= word_slice_64_08_l(f[j+3]<<7 & x80808080)>>32; 
-          r1 |= word_slice_64_08_l(f[j+3]<<6 & x80808080)>>32; 
-          r2 |= word_slice_64_08_l(f[j+3]<<5 & x80808080)>>32; 
-          r3 |= word_slice_64_08_l(f[j+3]<<4 & x80808080)>>32; 
-          r4 |= word_slice_64_08_l(f[j+3]<<3 & x80808080)>>32; 
-          r5 |= word_slice_64_08_l(f[j+3]<<2 & x80808080)>>32; 
-          r6 |= word_slice_64_08_l(f[j+3]<<1 & x80808080)>>32; 
+          r0 |= word_slice_64_08_l(f[j+3]<<7 & x80808080)>>32;
+          r1 |= word_slice_64_08_l(f[j+3]<<6 & x80808080)>>32;
+          r2 |= word_slice_64_08_l(f[j+3]<<5 & x80808080)>>32;
+          r3 |= word_slice_64_08_l(f[j+3]<<4 & x80808080)>>32;
+          r4 |= word_slice_64_08_l(f[j+3]<<3 & x80808080)>>32;
+          r5 |= word_slice_64_08_l(f[j+3]<<2 & x80808080)>>32;
+          r6 |= word_slice_64_08_l(f[j+3]<<1 & x80808080)>>32;
         case 3:
-          r0 |= word_slice_64_08_l(f[j+2]<<7 & x80808080)>>40; 
-          r1 |= word_slice_64_08_l(f[j+2]<<6 & x80808080)>>40; 
-          r2 |= word_slice_64_08_l(f[j+2]<<5 & x80808080)>>40; 
-          r3 |= word_slice_64_08_l(f[j+2]<<4 & x80808080)>>40; 
-          r4 |= word_slice_64_08_l(f[j+2]<<3 & x80808080)>>40; 
-          r5 |= word_slice_64_08_l(f[j+2]<<2 & x80808080)>>40; 
-          r6 |= word_slice_64_08_l(f[j+2]<<1 & x80808080)>>40; 
+          r0 |= word_slice_64_08_l(f[j+2]<<7 & x80808080)>>40;
+          r1 |= word_slice_64_08_l(f[j+2]<<6 & x80808080)>>40;
+          r2 |= word_slice_64_08_l(f[j+2]<<5 & x80808080)>>40;
+          r3 |= word_slice_64_08_l(f[j+2]<<4 & x80808080)>>40;
+          r4 |= word_slice_64_08_l(f[j+2]<<3 & x80808080)>>40;
+          r5 |= word_slice_64_08_l(f[j+2]<<2 & x80808080)>>40;
+          r6 |= word_slice_64_08_l(f[j+2]<<1 & x80808080)>>40;
         case 2:
-          r0 |= word_slice_64_08_l(f[j+1]<<7 & x80808080)>>48; 
-          r1 |= word_slice_64_08_l(f[j+1]<<6 & x80808080)>>48; 
-          r2 |= word_slice_64_08_l(f[j+1]<<5 & x80808080)>>48; 
-          r3 |= word_slice_64_08_l(f[j+1]<<4 & x80808080)>>48; 
-          r4 |= word_slice_64_08_l(f[j+1]<<3 & x80808080)>>48; 
-          r5 |= word_slice_64_08_l(f[j+1]<<2 & x80808080)>>48; 
-          r6 |= word_slice_64_08_l(f[j+1]<<1 & x80808080)>>48; 
+          r0 |= word_slice_64_08_l(f[j+1]<<7 & x80808080)>>48;
+          r1 |= word_slice_64_08_l(f[j+1]<<6 & x80808080)>>48;
+          r2 |= word_slice_64_08_l(f[j+1]<<5 & x80808080)>>48;
+          r3 |= word_slice_64_08_l(f[j+1]<<4 & x80808080)>>48;
+          r4 |= word_slice_64_08_l(f[j+1]<<3 & x80808080)>>48;
+          r5 |= word_slice_64_08_l(f[j+1]<<2 & x80808080)>>48;
+          r6 |= word_slice_64_08_l(f[j+1]<<1 & x80808080)>>48;
         case 1:
-          r0 |= word_slice_64_08_l(f[j+0]<<7 & x80808080)>>56; 
-          r1 |= word_slice_64_08_l(f[j+0]<<6 & x80808080)>>56; 
-          r2 |= word_slice_64_08_l(f[j+0]<<5 & x80808080)>>56; 
-          r3 |= word_slice_64_08_l(f[j+0]<<4 & x80808080)>>56; 
-          r4 |= word_slice_64_08_l(f[j+0]<<3 & x80808080)>>56; 
-          r5 |= word_slice_64_08_l(f[j+0]<<2 & x80808080)>>56; 
-          r6 |= word_slice_64_08_l(f[j+0]<<1 & x80808080)>>56; 
+          r0 |= word_slice_64_08_l(f[j+0]<<7 & x80808080)>>56;
+          r1 |= word_slice_64_08_l(f[j+0]<<6 & x80808080)>>56;
+          r2 |= word_slice_64_08_l(f[j+0]<<5 & x80808080)>>56;
+          r3 |= word_slice_64_08_l(f[j+0]<<4 & x80808080)>>56;
+          r4 |= word_slice_64_08_l(f[j+0]<<3 & x80808080)>>56;
+          r5 |= word_slice_64_08_l(f[j+0]<<2 & x80808080)>>56;
+          r6 |= word_slice_64_08_l(f[j+0]<<1 & x80808080)>>56;
           break;
         default:
           m4ri_die("impossible");
@@ -740,7 +773,7 @@ mzd_slice_t *_mzed_slice8(mzd_slice_t *T, const mzed_t *F) {
           t3[j2] |= word_slice_64_08_l(f[j+0]<<4 & x80808080)>>56 | word_slice_64_08_l(f[j+1]<<4 & x80808080)>>48 \
             |       word_slice_64_08_l(f[j+2]<<4 & x80808080)>>40 | word_slice_64_08_l(f[j+3]<<4 & x80808080)>>32 \
             |       word_slice_64_08_l(f[j+4]<<4 & x80808080)>>24 | word_slice_64_08_l(f[j+5]<<4 & x80808080)>>16 \
-            |       word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8 | word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0;          
+            |       word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8 | word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0;
 
           t4[j2] |= word_slice_64_08_l(f[j+0]<<3 & x80808080)>>56 | word_slice_64_08_l(f[j+1]<<3 & x80808080)>>48 \
             |       word_slice_64_08_l(f[j+2]<<3 & x80808080)>>40 | word_slice_64_08_l(f[j+3]<<3 & x80808080)>>32 \
@@ -755,61 +788,61 @@ mzd_slice_t *_mzed_slice8(mzd_slice_t *T, const mzed_t *F) {
         r0 = r1 = r2 = r3 = r4 = r5 = 0;
         switch(F->x->width - j) {
         case 8:
-          r0 |= word_slice_64_08_l(f[j+7]<<7 & x80808080)>> 0; 
-          r1 |= word_slice_64_08_l(f[j+7]<<6 & x80808080)>> 0; 
-          r2 |= word_slice_64_08_l(f[j+7]<<5 & x80808080)>> 0; 
-          r3 |= word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0; 
-          r4 |= word_slice_64_08_l(f[j+7]<<3 & x80808080)>> 0; 
-          r5 |= word_slice_64_08_l(f[j+7]<<2 & x80808080)>> 0; 
+          r0 |= word_slice_64_08_l(f[j+7]<<7 & x80808080)>> 0;
+          r1 |= word_slice_64_08_l(f[j+7]<<6 & x80808080)>> 0;
+          r2 |= word_slice_64_08_l(f[j+7]<<5 & x80808080)>> 0;
+          r3 |= word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0;
+          r4 |= word_slice_64_08_l(f[j+7]<<3 & x80808080)>> 0;
+          r5 |= word_slice_64_08_l(f[j+7]<<2 & x80808080)>> 0;
         case 7:
-          r0 |= word_slice_64_08_l(f[j+6]<<7 & x80808080)>> 8; 
-          r1 |= word_slice_64_08_l(f[j+6]<<6 & x80808080)>> 8; 
-          r2 |= word_slice_64_08_l(f[j+6]<<5 & x80808080)>> 8; 
-          r3 |= word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8; 
-          r4 |= word_slice_64_08_l(f[j+6]<<3 & x80808080)>> 8; 
-          r5 |= word_slice_64_08_l(f[j+6]<<2 & x80808080)>> 8; 
+          r0 |= word_slice_64_08_l(f[j+6]<<7 & x80808080)>> 8;
+          r1 |= word_slice_64_08_l(f[j+6]<<6 & x80808080)>> 8;
+          r2 |= word_slice_64_08_l(f[j+6]<<5 & x80808080)>> 8;
+          r3 |= word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8;
+          r4 |= word_slice_64_08_l(f[j+6]<<3 & x80808080)>> 8;
+          r5 |= word_slice_64_08_l(f[j+6]<<2 & x80808080)>> 8;
         case 6:
-          r0 |= word_slice_64_08_l(f[j+5]<<7 & x80808080)>>16; 
-          r1 |= word_slice_64_08_l(f[j+5]<<6 & x80808080)>>16; 
-          r2 |= word_slice_64_08_l(f[j+5]<<5 & x80808080)>>16; 
-          r3 |= word_slice_64_08_l(f[j+5]<<4 & x80808080)>>16; 
-          r4 |= word_slice_64_08_l(f[j+5]<<3 & x80808080)>>16; 
-          r5 |= word_slice_64_08_l(f[j+5]<<2 & x80808080)>>16; 
+          r0 |= word_slice_64_08_l(f[j+5]<<7 & x80808080)>>16;
+          r1 |= word_slice_64_08_l(f[j+5]<<6 & x80808080)>>16;
+          r2 |= word_slice_64_08_l(f[j+5]<<5 & x80808080)>>16;
+          r3 |= word_slice_64_08_l(f[j+5]<<4 & x80808080)>>16;
+          r4 |= word_slice_64_08_l(f[j+5]<<3 & x80808080)>>16;
+          r5 |= word_slice_64_08_l(f[j+5]<<2 & x80808080)>>16;
         case 5:
-          r0 |= word_slice_64_08_l(f[j+4]<<7 & x80808080)>>24; 
-          r1 |= word_slice_64_08_l(f[j+4]<<6 & x80808080)>>24; 
-          r2 |= word_slice_64_08_l(f[j+4]<<5 & x80808080)>>24; 
-          r3 |= word_slice_64_08_l(f[j+4]<<4 & x80808080)>>24; 
-          r4 |= word_slice_64_08_l(f[j+4]<<3 & x80808080)>>24; 
-          r5 |= word_slice_64_08_l(f[j+4]<<2 & x80808080)>>24; 
+          r0 |= word_slice_64_08_l(f[j+4]<<7 & x80808080)>>24;
+          r1 |= word_slice_64_08_l(f[j+4]<<6 & x80808080)>>24;
+          r2 |= word_slice_64_08_l(f[j+4]<<5 & x80808080)>>24;
+          r3 |= word_slice_64_08_l(f[j+4]<<4 & x80808080)>>24;
+          r4 |= word_slice_64_08_l(f[j+4]<<3 & x80808080)>>24;
+          r5 |= word_slice_64_08_l(f[j+4]<<2 & x80808080)>>24;
         case 4:
-          r0 |= word_slice_64_08_l(f[j+3]<<7 & x80808080)>>32; 
-          r1 |= word_slice_64_08_l(f[j+3]<<6 & x80808080)>>32; 
-          r2 |= word_slice_64_08_l(f[j+3]<<5 & x80808080)>>32; 
-          r3 |= word_slice_64_08_l(f[j+3]<<4 & x80808080)>>32; 
-          r4 |= word_slice_64_08_l(f[j+3]<<3 & x80808080)>>32; 
-          r5 |= word_slice_64_08_l(f[j+3]<<2 & x80808080)>>32; 
+          r0 |= word_slice_64_08_l(f[j+3]<<7 & x80808080)>>32;
+          r1 |= word_slice_64_08_l(f[j+3]<<6 & x80808080)>>32;
+          r2 |= word_slice_64_08_l(f[j+3]<<5 & x80808080)>>32;
+          r3 |= word_slice_64_08_l(f[j+3]<<4 & x80808080)>>32;
+          r4 |= word_slice_64_08_l(f[j+3]<<3 & x80808080)>>32;
+          r5 |= word_slice_64_08_l(f[j+3]<<2 & x80808080)>>32;
         case 3:
-          r0 |= word_slice_64_08_l(f[j+2]<<7 & x80808080)>>40; 
-          r1 |= word_slice_64_08_l(f[j+2]<<6 & x80808080)>>40; 
-          r2 |= word_slice_64_08_l(f[j+2]<<5 & x80808080)>>40; 
-          r3 |= word_slice_64_08_l(f[j+2]<<4 & x80808080)>>40; 
-          r4 |= word_slice_64_08_l(f[j+2]<<3 & x80808080)>>40; 
-          r5 |= word_slice_64_08_l(f[j+2]<<2 & x80808080)>>40; 
+          r0 |= word_slice_64_08_l(f[j+2]<<7 & x80808080)>>40;
+          r1 |= word_slice_64_08_l(f[j+2]<<6 & x80808080)>>40;
+          r2 |= word_slice_64_08_l(f[j+2]<<5 & x80808080)>>40;
+          r3 |= word_slice_64_08_l(f[j+2]<<4 & x80808080)>>40;
+          r4 |= word_slice_64_08_l(f[j+2]<<3 & x80808080)>>40;
+          r5 |= word_slice_64_08_l(f[j+2]<<2 & x80808080)>>40;
         case 2:
-          r0 |= word_slice_64_08_l(f[j+1]<<7 & x80808080)>>48; 
-          r1 |= word_slice_64_08_l(f[j+1]<<6 & x80808080)>>48; 
-          r2 |= word_slice_64_08_l(f[j+1]<<5 & x80808080)>>48; 
-          r3 |= word_slice_64_08_l(f[j+1]<<4 & x80808080)>>48; 
-          r4 |= word_slice_64_08_l(f[j+1]<<3 & x80808080)>>48; 
-          r5 |= word_slice_64_08_l(f[j+1]<<2 & x80808080)>>48; 
+          r0 |= word_slice_64_08_l(f[j+1]<<7 & x80808080)>>48;
+          r1 |= word_slice_64_08_l(f[j+1]<<6 & x80808080)>>48;
+          r2 |= word_slice_64_08_l(f[j+1]<<5 & x80808080)>>48;
+          r3 |= word_slice_64_08_l(f[j+1]<<4 & x80808080)>>48;
+          r4 |= word_slice_64_08_l(f[j+1]<<3 & x80808080)>>48;
+          r5 |= word_slice_64_08_l(f[j+1]<<2 & x80808080)>>48;
         case 1:
-          r0 |= word_slice_64_08_l(f[j+0]<<7 & x80808080)>>56; 
-          r1 |= word_slice_64_08_l(f[j+0]<<6 & x80808080)>>56; 
-          r2 |= word_slice_64_08_l(f[j+0]<<5 & x80808080)>>56; 
-          r3 |= word_slice_64_08_l(f[j+0]<<4 & x80808080)>>56; 
-          r4 |= word_slice_64_08_l(f[j+0]<<3 & x80808080)>>56; 
-          r5 |= word_slice_64_08_l(f[j+0]<<2 & x80808080)>>56; 
+          r0 |= word_slice_64_08_l(f[j+0]<<7 & x80808080)>>56;
+          r1 |= word_slice_64_08_l(f[j+0]<<6 & x80808080)>>56;
+          r2 |= word_slice_64_08_l(f[j+0]<<5 & x80808080)>>56;
+          r3 |= word_slice_64_08_l(f[j+0]<<4 & x80808080)>>56;
+          r4 |= word_slice_64_08_l(f[j+0]<<3 & x80808080)>>56;
+          r5 |= word_slice_64_08_l(f[j+0]<<2 & x80808080)>>56;
           break;
         default:
           m4ri_die("impossible");
@@ -853,7 +886,7 @@ mzd_slice_t *_mzed_slice8(mzd_slice_t *T, const mzed_t *F) {
           t3[j2] |= word_slice_64_08_l(f[j+0]<<4 & x80808080)>>56 | word_slice_64_08_l(f[j+1]<<4 & x80808080)>>48 \
             |       word_slice_64_08_l(f[j+2]<<4 & x80808080)>>40 | word_slice_64_08_l(f[j+3]<<4 & x80808080)>>32 \
             |       word_slice_64_08_l(f[j+4]<<4 & x80808080)>>24 | word_slice_64_08_l(f[j+5]<<4 & x80808080)>>16 \
-            |       word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8 | word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0;          
+            |       word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8 | word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0;
 
           t4[j2] |= word_slice_64_08_l(f[j+0]<<3 & x80808080)>>56 | word_slice_64_08_l(f[j+1]<<3 & x80808080)>>48 \
             |       word_slice_64_08_l(f[j+2]<<3 & x80808080)>>40 | word_slice_64_08_l(f[j+3]<<3 & x80808080)>>32 \
@@ -863,53 +896,53 @@ mzd_slice_t *_mzed_slice8(mzd_slice_t *T, const mzed_t *F) {
         r0 = r1 = r2 = r3 = r4 = 0;
         switch(F->x->width - j) {
         case 8:
-          r0 |= word_slice_64_08_l(f[j+7]<<7 & x80808080)>> 0; 
-          r1 |= word_slice_64_08_l(f[j+7]<<6 & x80808080)>> 0; 
-          r2 |= word_slice_64_08_l(f[j+7]<<5 & x80808080)>> 0; 
-          r3 |= word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0; 
-          r4 |= word_slice_64_08_l(f[j+7]<<3 & x80808080)>> 0; 
+          r0 |= word_slice_64_08_l(f[j+7]<<7 & x80808080)>> 0;
+          r1 |= word_slice_64_08_l(f[j+7]<<6 & x80808080)>> 0;
+          r2 |= word_slice_64_08_l(f[j+7]<<5 & x80808080)>> 0;
+          r3 |= word_slice_64_08_l(f[j+7]<<4 & x80808080)>> 0;
+          r4 |= word_slice_64_08_l(f[j+7]<<3 & x80808080)>> 0;
         case 7:
-          r0 |= word_slice_64_08_l(f[j+6]<<7 & x80808080)>> 8; 
-          r1 |= word_slice_64_08_l(f[j+6]<<6 & x80808080)>> 8; 
-          r2 |= word_slice_64_08_l(f[j+6]<<5 & x80808080)>> 8; 
-          r3 |= word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8; 
-          r4 |= word_slice_64_08_l(f[j+6]<<3 & x80808080)>> 8; 
+          r0 |= word_slice_64_08_l(f[j+6]<<7 & x80808080)>> 8;
+          r1 |= word_slice_64_08_l(f[j+6]<<6 & x80808080)>> 8;
+          r2 |= word_slice_64_08_l(f[j+6]<<5 & x80808080)>> 8;
+          r3 |= word_slice_64_08_l(f[j+6]<<4 & x80808080)>> 8;
+          r4 |= word_slice_64_08_l(f[j+6]<<3 & x80808080)>> 8;
         case 6:
-          r0 |= word_slice_64_08_l(f[j+5]<<7 & x80808080)>>16; 
-          r1 |= word_slice_64_08_l(f[j+5]<<6 & x80808080)>>16; 
-          r2 |= word_slice_64_08_l(f[j+5]<<5 & x80808080)>>16; 
-          r3 |= word_slice_64_08_l(f[j+5]<<4 & x80808080)>>16; 
-          r4 |= word_slice_64_08_l(f[j+5]<<3 & x80808080)>>16; 
+          r0 |= word_slice_64_08_l(f[j+5]<<7 & x80808080)>>16;
+          r1 |= word_slice_64_08_l(f[j+5]<<6 & x80808080)>>16;
+          r2 |= word_slice_64_08_l(f[j+5]<<5 & x80808080)>>16;
+          r3 |= word_slice_64_08_l(f[j+5]<<4 & x80808080)>>16;
+          r4 |= word_slice_64_08_l(f[j+5]<<3 & x80808080)>>16;
         case 5:
-          r0 |= word_slice_64_08_l(f[j+4]<<7 & x80808080)>>24; 
-          r1 |= word_slice_64_08_l(f[j+4]<<6 & x80808080)>>24; 
-          r2 |= word_slice_64_08_l(f[j+4]<<5 & x80808080)>>24; 
-          r3 |= word_slice_64_08_l(f[j+4]<<4 & x80808080)>>24; 
-          r4 |= word_slice_64_08_l(f[j+4]<<3 & x80808080)>>24; 
+          r0 |= word_slice_64_08_l(f[j+4]<<7 & x80808080)>>24;
+          r1 |= word_slice_64_08_l(f[j+4]<<6 & x80808080)>>24;
+          r2 |= word_slice_64_08_l(f[j+4]<<5 & x80808080)>>24;
+          r3 |= word_slice_64_08_l(f[j+4]<<4 & x80808080)>>24;
+          r4 |= word_slice_64_08_l(f[j+4]<<3 & x80808080)>>24;
         case 4:
           r0 |= word_slice_64_08_l(f[j+3]<<7 & x80808080)>>32;
           r1 |= word_slice_64_08_l(f[j+3]<<6 & x80808080)>>32;
           r2 |= word_slice_64_08_l(f[j+3]<<5 & x80808080)>>32;
-          r3 |= word_slice_64_08_l(f[j+3]<<4 & x80808080)>>32; 
-          r4 |= word_slice_64_08_l(f[j+3]<<3 & x80808080)>>32; 
+          r3 |= word_slice_64_08_l(f[j+3]<<4 & x80808080)>>32;
+          r4 |= word_slice_64_08_l(f[j+3]<<3 & x80808080)>>32;
         case 3:
-          r0 |= word_slice_64_08_l(f[j+2]<<7 & x80808080)>>40; 
-          r1 |= word_slice_64_08_l(f[j+2]<<6 & x80808080)>>40; 
-          r2 |= word_slice_64_08_l(f[j+2]<<5 & x80808080)>>40; 
-          r3 |= word_slice_64_08_l(f[j+2]<<4 & x80808080)>>40; 
-          r4 |= word_slice_64_08_l(f[j+2]<<3 & x80808080)>>40; 
+          r0 |= word_slice_64_08_l(f[j+2]<<7 & x80808080)>>40;
+          r1 |= word_slice_64_08_l(f[j+2]<<6 & x80808080)>>40;
+          r2 |= word_slice_64_08_l(f[j+2]<<5 & x80808080)>>40;
+          r3 |= word_slice_64_08_l(f[j+2]<<4 & x80808080)>>40;
+          r4 |= word_slice_64_08_l(f[j+2]<<3 & x80808080)>>40;
         case 2:
-          r0 |= word_slice_64_08_l(f[j+1]<<7 & x80808080)>>48; 
-          r1 |= word_slice_64_08_l(f[j+1]<<6 & x80808080)>>48; 
-          r2 |= word_slice_64_08_l(f[j+1]<<5 & x80808080)>>48; 
-          r3 |= word_slice_64_08_l(f[j+1]<<4 & x80808080)>>48; 
-          r4 |= word_slice_64_08_l(f[j+1]<<3 & x80808080)>>48; 
+          r0 |= word_slice_64_08_l(f[j+1]<<7 & x80808080)>>48;
+          r1 |= word_slice_64_08_l(f[j+1]<<6 & x80808080)>>48;
+          r2 |= word_slice_64_08_l(f[j+1]<<5 & x80808080)>>48;
+          r3 |= word_slice_64_08_l(f[j+1]<<4 & x80808080)>>48;
+          r4 |= word_slice_64_08_l(f[j+1]<<3 & x80808080)>>48;
         case 1:
-          r0 |= word_slice_64_08_l(f[j+0]<<7 & x80808080)>>56; 
-          r1 |= word_slice_64_08_l(f[j+0]<<6 & x80808080)>>56; 
-          r2 |= word_slice_64_08_l(f[j+0]<<5 & x80808080)>>56; 
-          r3 |= word_slice_64_08_l(f[j+0]<<4 & x80808080)>>56; 
-          r4 |= word_slice_64_08_l(f[j+0]<<3 & x80808080)>>56; 
+          r0 |= word_slice_64_08_l(f[j+0]<<7 & x80808080)>>56;
+          r1 |= word_slice_64_08_l(f[j+0]<<6 & x80808080)>>56;
+          r2 |= word_slice_64_08_l(f[j+0]<<5 & x80808080)>>56;
+          r3 |= word_slice_64_08_l(f[j+0]<<4 & x80808080)>>56;
+          r4 |= word_slice_64_08_l(f[j+0]<<3 & x80808080)>>56;
           break;
         default:
           m4ri_die("impossible");
@@ -922,7 +955,7 @@ mzd_slice_t *_mzed_slice8(mzd_slice_t *T, const mzed_t *F) {
       }
     }
     break;
-    
+
   default:
     m4ri_die("impossible\n");
   }
@@ -969,107 +1002,37 @@ mzed_t *_mzed_cling8(mzed_t *T, const mzd_slice_t *F) {
           |      (word_cling_64_08_l(f4[j2]<< 0)>>3) | (word_cling_64_08_l(f5[j2]<< 0)>>2) | (word_cling_64_08_l(f6[j2]<< 0)>>1) | (word_cling_64_08_l(f7[j2]<< 0)>>0);
       }
 
-      register word tmp=0;
+      register word tmp = t[T->x->width-1];
       switch(T->x->width - j) {
       case 8:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1) | (word_cling_64_08_l(f7[j2]<<56)>>0);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1) | (word_cling_64_08_l(f7[j2]<<48)>>0);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2) | (word_cling_64_08_l(f6[j2]<<40)>>1) | (word_cling_64_08_l(f7[j2]<<40)>>0);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2) | (word_cling_64_08_l(f6[j2]<<32)>>1) | (word_cling_64_08_l(f7[j2]<<32)>>0);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3) | (word_cling_64_08_l(f5[j2]<<24)>>2) | (word_cling_64_08_l(f6[j2]<<24)>>1) | (word_cling_64_08_l(f7[j2]<<24)>>0);
-        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<16)>>3) | (word_cling_64_08_l(f5[j2]<<16)>>2) | (word_cling_64_08_l(f6[j2]<<16)>>1) | (word_cling_64_08_l(f7[j2]<<16)>>0);
-        t[j+6] = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<< 8)>>3) | (word_cling_64_08_l(f5[j2]<< 8)>>2) | (word_cling_64_08_l(f6[j2]<< 8)>>1) | (word_cling_64_08_l(f7[j2]<< 8)>>0);
-        tmp    = (word_cling_64_08_l(f0[j2]<< 0)>>7) | (word_cling_64_08_l(f1[j2]<< 0)>>6) | (word_cling_64_08_l(f2[j2]<< 0)>>5) | (word_cling_64_08_l(f3[j2]<< 0)>>4) \
+        t[j+7] = (word_cling_64_08_l(f0[j2]<< 0)>>7) | (word_cling_64_08_l(f1[j2]<< 0)>>6) | (word_cling_64_08_l(f2[j2]<< 0)>>5) | (word_cling_64_08_l(f3[j2]<< 0)>>4) \
           |      (word_cling_64_08_l(f4[j2]<< 0)>>3) | (word_cling_64_08_l(f5[j2]<< 0)>>2) | (word_cling_64_08_l(f6[j2]<< 0)>>1) | (word_cling_64_08_l(f7[j2]<< 0)>>0);
-        t[j+7] = (t[j+7] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 7:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1) | (word_cling_64_08_l(f7[j2]<<56)>>0);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1) | (word_cling_64_08_l(f7[j2]<<48)>>0);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2) | (word_cling_64_08_l(f6[j2]<<40)>>1) | (word_cling_64_08_l(f7[j2]<<40)>>0);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2) | (word_cling_64_08_l(f6[j2]<<32)>>1) | (word_cling_64_08_l(f7[j2]<<32)>>0);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3) | (word_cling_64_08_l(f5[j2]<<24)>>2) | (word_cling_64_08_l(f6[j2]<<24)>>1) | (word_cling_64_08_l(f7[j2]<<24)>>0);
-        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<16)>>3) | (word_cling_64_08_l(f5[j2]<<16)>>2) | (word_cling_64_08_l(f6[j2]<<16)>>1) | (word_cling_64_08_l(f7[j2]<<16)>>0);
-        tmp    = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) \
+        t[j+6] = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) \
           |      (word_cling_64_08_l(f4[j2]<< 8)>>3) | (word_cling_64_08_l(f5[j2]<< 8)>>2) | (word_cling_64_08_l(f6[j2]<< 8)>>1) | (word_cling_64_08_l(f7[j2]<< 8)>>0);
-        t[j+6] = (t[j+6] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 6:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1) | (word_cling_64_08_l(f7[j2]<<56)>>0);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1) | (word_cling_64_08_l(f7[j2]<<48)>>0);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2) | (word_cling_64_08_l(f6[j2]<<40)>>1) | (word_cling_64_08_l(f7[j2]<<40)>>0);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2) | (word_cling_64_08_l(f6[j2]<<32)>>1) | (word_cling_64_08_l(f7[j2]<<32)>>0);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3) | (word_cling_64_08_l(f5[j2]<<24)>>2) | (word_cling_64_08_l(f6[j2]<<24)>>1) | (word_cling_64_08_l(f7[j2]<<24)>>0);
-        tmp    = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
+        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<16)>>3) | (word_cling_64_08_l(f5[j2]<<16)>>2) | (word_cling_64_08_l(f6[j2]<<16)>>1) | (word_cling_64_08_l(f7[j2]<<16)>>0);
-        t[j+5] = (t[j+5] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 5:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1) | (word_cling_64_08_l(f7[j2]<<56)>>0);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1) | (word_cling_64_08_l(f7[j2]<<48)>>0);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2) | (word_cling_64_08_l(f6[j2]<<40)>>1) | (word_cling_64_08_l(f7[j2]<<40)>>0);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2) | (word_cling_64_08_l(f6[j2]<<32)>>1) | (word_cling_64_08_l(f7[j2]<<32)>>0);
-        tmp    = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
+        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<24)>>3) | (word_cling_64_08_l(f5[j2]<<24)>>2) | (word_cling_64_08_l(f6[j2]<<24)>>1) | (word_cling_64_08_l(f7[j2]<<24)>>0);
-        t[j+4] = (t[j+4] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 4:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1) | (word_cling_64_08_l(f7[j2]<<56)>>0);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1) | (word_cling_64_08_l(f7[j2]<<48)>>0);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2) | (word_cling_64_08_l(f6[j2]<<40)>>1) | (word_cling_64_08_l(f7[j2]<<40)>>0);
-        tmp    = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
+        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2) | (word_cling_64_08_l(f6[j2]<<32)>>1) | (word_cling_64_08_l(f7[j2]<<32)>>0);
-        t[j+3] = (t[j+3] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 3:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1) | (word_cling_64_08_l(f7[j2]<<56)>>0);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1) | (word_cling_64_08_l(f7[j2]<<48)>>0);
-        tmp    = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
+        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2) | (word_cling_64_08_l(f6[j2]<<40)>>1) | (word_cling_64_08_l(f7[j2]<<40)>>0);
-        t[j+2] = (t[j+2] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 2:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1) | (word_cling_64_08_l(f7[j2]<<56)>>0);
-        tmp    = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
+        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1) | (word_cling_64_08_l(f7[j2]<<48)>>0);
-        t[j+1] = (t[j+1] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 1:
-        tmp    = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
+        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1) | (word_cling_64_08_l(f7[j2]<<56)>>0);
-        t[j+0] = (t[j+0] & ~bitmask_end) | (tmp & bitmask_end);
         break;
       default:
         m4ri_die("impossible");
-      } //switch
+      }
+      t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
     } // for loop
   }
     break;
@@ -1104,107 +1067,37 @@ mzed_t *_mzed_cling8(mzed_t *T, const mzd_slice_t *F) {
           |      (word_cling_64_08_l(f4[j2]<< 0)>>3) | (word_cling_64_08_l(f5[j2]<< 0)>>2) | (word_cling_64_08_l(f6[j2]<< 0)>>1);
       }
 
-      register word tmp=0;
+      register word tmp= t[T->x->width-1];
       switch(T->x->width - j) {
       case 8:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2) | (word_cling_64_08_l(f6[j2]<<40)>>1);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2) | (word_cling_64_08_l(f6[j2]<<32)>>1);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3) | (word_cling_64_08_l(f5[j2]<<24)>>2) | (word_cling_64_08_l(f6[j2]<<24)>>1);
-        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<16)>>3) | (word_cling_64_08_l(f5[j2]<<16)>>2) | (word_cling_64_08_l(f6[j2]<<16)>>1);
-        t[j+6] = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<< 8)>>3) | (word_cling_64_08_l(f5[j2]<< 8)>>2) | (word_cling_64_08_l(f6[j2]<< 8)>>1);
-        tmp    = (word_cling_64_08_l(f0[j2]<< 0)>>7) | (word_cling_64_08_l(f1[j2]<< 0)>>6) | (word_cling_64_08_l(f2[j2]<< 0)>>5) | (word_cling_64_08_l(f3[j2]<< 0)>>4) \
+        t[j+7] = (word_cling_64_08_l(f0[j2]<< 0)>>7) | (word_cling_64_08_l(f1[j2]<< 0)>>6) | (word_cling_64_08_l(f2[j2]<< 0)>>5) | (word_cling_64_08_l(f3[j2]<< 0)>>4) \
           |      (word_cling_64_08_l(f4[j2]<< 0)>>3) | (word_cling_64_08_l(f5[j2]<< 0)>>2) | (word_cling_64_08_l(f6[j2]<< 0)>>1);
-        t[j+7] = (t[j+7] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 7:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2) | (word_cling_64_08_l(f6[j2]<<40)>>1);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2) | (word_cling_64_08_l(f6[j2]<<32)>>1);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3) | (word_cling_64_08_l(f5[j2]<<24)>>2) | (word_cling_64_08_l(f6[j2]<<24)>>1);
-        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<16)>>3) | (word_cling_64_08_l(f5[j2]<<16)>>2) | (word_cling_64_08_l(f6[j2]<<16)>>1);
-        tmp    = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) \
+        t[j+6] = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) \
           |      (word_cling_64_08_l(f4[j2]<< 8)>>3) | (word_cling_64_08_l(f5[j2]<< 8)>>2) | (word_cling_64_08_l(f6[j2]<< 8)>>1);
-        t[j+6] = (t[j+6] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 6:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2) | (word_cling_64_08_l(f6[j2]<<40)>>1);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2) | (word_cling_64_08_l(f6[j2]<<32)>>1);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3) | (word_cling_64_08_l(f5[j2]<<24)>>2) | (word_cling_64_08_l(f6[j2]<<24)>>1);
-        tmp    = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
+        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<16)>>3) | (word_cling_64_08_l(f5[j2]<<16)>>2) | (word_cling_64_08_l(f6[j2]<<16)>>1);
-        t[j+5] = (t[j+5] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 5:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2) | (word_cling_64_08_l(f6[j2]<<40)>>1);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2) | (word_cling_64_08_l(f6[j2]<<32)>>1);
-        tmp    = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
+        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<24)>>3) | (word_cling_64_08_l(f5[j2]<<24)>>2) | (word_cling_64_08_l(f6[j2]<<24)>>1);
-        t[j+4] = (t[j+4] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 4:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2) | (word_cling_64_08_l(f6[j2]<<40)>>1);
-        tmp    = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
+        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2) | (word_cling_64_08_l(f6[j2]<<32)>>1);
-        t[j+3] = (t[j+3] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 3:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1);
-        tmp    = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
+        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2) | (word_cling_64_08_l(f6[j2]<<40)>>1);
-        t[j+2] = (t[j+2] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 2:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1);
-        tmp    = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
+        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2) | (word_cling_64_08_l(f6[j2]<<48)>>1);
-        t[j+1] = (t[j+1] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 1:
-        tmp    = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
+        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2) | (word_cling_64_08_l(f6[j2]<<56)>>1);
-        t[j+0] = (t[j+0] & ~bitmask_end) | (tmp & bitmask_end);
         break;
       default:
         m4ri_die("impossible");
       }
+      t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
     }
   }
     break;
@@ -1238,107 +1131,37 @@ mzed_t *_mzed_cling8(mzed_t *T, const mzd_slice_t *F) {
           |      (word_cling_64_08_l(f4[j2]<< 0)>>3) | (word_cling_64_08_l(f5[j2]<< 0)>>2);
       }
 
-      register word tmp=0;
+      register word tmp = t[T->x->width-1];
       switch(T->x->width - j) {
       case 8:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3) | (word_cling_64_08_l(f5[j2]<<24)>>2);
-        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<16)>>3) | (word_cling_64_08_l(f5[j2]<<16)>>2);
-        t[j+6] = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<< 8)>>3) | (word_cling_64_08_l(f5[j2]<< 8)>>2);
-        tmp    = (word_cling_64_08_l(f0[j2]<< 0)>>7) | (word_cling_64_08_l(f1[j2]<< 0)>>6) | (word_cling_64_08_l(f2[j2]<< 0)>>5) | (word_cling_64_08_l(f3[j2]<< 0)>>4) \
+        t[j+7] = (word_cling_64_08_l(f0[j2]<< 0)>>7) | (word_cling_64_08_l(f1[j2]<< 0)>>6) | (word_cling_64_08_l(f2[j2]<< 0)>>5) | (word_cling_64_08_l(f3[j2]<< 0)>>4) \
           |      (word_cling_64_08_l(f4[j2]<< 0)>>3) | (word_cling_64_08_l(f5[j2]<< 0)>>2);
-        t[j+7] = (t[j+7] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 7:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3) | (word_cling_64_08_l(f5[j2]<<24)>>2);
-        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<16)>>3) | (word_cling_64_08_l(f5[j2]<<16)>>2);
-        tmp    = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) \
+        t[j+6] = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) \
           |      (word_cling_64_08_l(f4[j2]<< 8)>>3) | (word_cling_64_08_l(f5[j2]<< 8)>>2);
-        t[j+6] = (t[j+6] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 6:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3) | (word_cling_64_08_l(f5[j2]<<24)>>2);
-        tmp    = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
+        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<16)>>3) | (word_cling_64_08_l(f5[j2]<<16)>>2);
-        t[j+5] = (t[j+5] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 5:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2);
-        tmp    = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
+        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<24)>>3) | (word_cling_64_08_l(f5[j2]<<24)>>2);
-        t[j+4] = (t[j+4] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 4:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2);
-        tmp    = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
+        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<32)>>3) | (word_cling_64_08_l(f5[j2]<<32)>>2);
-        t[j+3] = (t[j+3] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 3:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2);
-        tmp    = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
+        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<40)>>3) | (word_cling_64_08_l(f5[j2]<<40)>>2);
-        t[j+2] = (t[j+2] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 2:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2);
-        tmp    = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
+        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<48)>>3) | (word_cling_64_08_l(f5[j2]<<48)>>2);
-        t[j+1] = (t[j+1] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
       case 1:
-        tmp    = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
+        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
           |      (word_cling_64_08_l(f4[j2]<<56)>>3) | (word_cling_64_08_l(f5[j2]<<56)>>2);
-        t[j+0] = (t[j+0] & ~bitmask_end) | (tmp & bitmask_end);
         break;
       default:
         m4ri_die("impossible");
       }
+      t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
     }
   }
     break;
@@ -1353,130 +1176,958 @@ mzed_t *_mzed_cling8(mzed_t *T, const mzd_slice_t *F) {
       word *t  = T->x->rows[i];
 
       for(j=0, j2=0; j+8 < T->x->width; j+=8, j2++) {
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3);
-        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<16)>>3);
-        t[j+6] = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<< 8)>>3);
-        t[j+7] = (word_cling_64_08_l(f0[j2]<< 0)>>7) | (word_cling_64_08_l(f1[j2]<< 0)>>6) | (word_cling_64_08_l(f2[j2]<< 0)>>5) | (word_cling_64_08_l(f3[j2]<< 0)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<< 0)>>3);
+        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) | (word_cling_64_08_l(f4[j2]<<56)>>3);
+        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) | (word_cling_64_08_l(f4[j2]<<48)>>3);
+        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) | (word_cling_64_08_l(f4[j2]<<40)>>3);
+        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) | (word_cling_64_08_l(f4[j2]<<32)>>3);
+        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) | (word_cling_64_08_l(f4[j2]<<24)>>3);
+        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) | (word_cling_64_08_l(f4[j2]<<16)>>3);
+        t[j+6] = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) | (word_cling_64_08_l(f4[j2]<< 8)>>3);
+        t[j+7] = (word_cling_64_08_l(f0[j2]<< 0)>>7) | (word_cling_64_08_l(f1[j2]<< 0)>>6) | (word_cling_64_08_l(f2[j2]<< 0)>>5) | (word_cling_64_08_l(f3[j2]<< 0)>>4) | (word_cling_64_08_l(f4[j2]<< 0)>>3);
       }
 
-      register word tmp=0;
+      register word tmp = t[T->x->width - 1];
       switch(T->x->width - j) {
-      case 8:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3);
-        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<16)>>3);
-        t[j+6] = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<< 8)>>3);
-        tmp    = (word_cling_64_08_l(f0[j2]<< 0)>>7) | (word_cling_64_08_l(f1[j2]<< 0)>>6) | (word_cling_64_08_l(f2[j2]<< 0)>>5) | (word_cling_64_08_l(f3[j2]<< 0)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<< 0)>>3);
-        t[j+7] = (t[j+7] & ~bitmask_end) | (tmp & bitmask_end);
+      case 8: t[j+7] = (word_cling_64_08_l(f0[j2]<< 0)>>7) | (word_cling_64_08_l(f1[j2]<< 0)>>6) | (word_cling_64_08_l(f2[j2]<< 0)>>5) | (word_cling_64_08_l(f3[j2]<< 0)>>4) | (word_cling_64_08_l(f4[j2]<< 0)>>3);
+      case 7: t[j+6] = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) | (word_cling_64_08_l(f4[j2]<< 8)>>3);
+      case 6: t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) | (word_cling_64_08_l(f4[j2]<<16)>>3);
+      case 5: t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) | (word_cling_64_08_l(f4[j2]<<24)>>3);
+      case 4: t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) | (word_cling_64_08_l(f4[j2]<<32)>>3);
+      case 3: t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) | (word_cling_64_08_l(f4[j2]<<40)>>3);
+      case 2: t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) | (word_cling_64_08_l(f4[j2]<<48)>>3);
+      case 1: t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) | (word_cling_64_08_l(f4[j2]<<56)>>3);
         break;
-      case 7:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3);
-        t[j+5] = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<16)>>3);
-        tmp    = (word_cling_64_08_l(f0[j2]<< 8)>>7) | (word_cling_64_08_l(f1[j2]<< 8)>>6) | (word_cling_64_08_l(f2[j2]<< 8)>>5) | (word_cling_64_08_l(f3[j2]<< 8)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<< 8)>>3);
-        t[j+6] = (t[j+6] & ~bitmask_end) | (tmp & bitmask_end);
-        break;
-      case 6:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3);
-        t[j+4] = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3);
-        tmp    = (word_cling_64_08_l(f0[j2]<<16)>>7) | (word_cling_64_08_l(f1[j2]<<16)>>6) | (word_cling_64_08_l(f2[j2]<<16)>>5) | (word_cling_64_08_l(f3[j2]<<16)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<16)>>3);
-        t[j+5] = (t[j+5] & ~bitmask_end) | (tmp & bitmask_end);
+      default:
+        m4ri_die("impossible");
+      }
+      t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
+    }
+  }
+    break;
+  default:
+    m4ri_die("impossible");
+  }
+  return T;
+}
+
+/* Helper macros that keep the code below compact. combine_bulk: ORs into T[Ti] the word_slice_64_16_l() results of the 16 words F[Fi+0] .. F[Fi+15]; each word is first shifted left by `shift` and masked with x80008000, and its result is shifted right by 60, 56, ..., 0 respectively. The expansion already ends in ';'. */
+
+#define word_slice_64_16_l_combine_bulk(T, Ti, F, Fi, shift)  \
+   T[Ti] |= word_slice_64_16_l(F[Fi+ 0]<<shift & x80008000)>>60 | word_slice_64_16_l(F[Fi+ 1]<<shift & x80008000)>>56 \
+    |       word_slice_64_16_l(F[Fi+ 2]<<shift & x80008000)>>52 | word_slice_64_16_l(F[Fi+ 3]<<shift & x80008000)>>48 \
+    |       word_slice_64_16_l(F[Fi+ 4]<<shift & x80008000)>>44 | word_slice_64_16_l(F[Fi+ 5]<<shift & x80008000)>>40 \
+    |       word_slice_64_16_l(F[Fi+ 6]<<shift & x80008000)>>36 | word_slice_64_16_l(F[Fi+ 7]<<shift & x80008000)>>32 \
+    |       word_slice_64_16_l(F[Fi+ 8]<<shift & x80008000)>>28 | word_slice_64_16_l(F[Fi+ 9]<<shift & x80008000)>>24 \
+    |       word_slice_64_16_l(F[Fi+10]<<shift & x80008000)>>20 | word_slice_64_16_l(F[Fi+11]<<shift & x80008000)>>16 \
+    |       word_slice_64_16_l(F[Fi+12]<<shift & x80008000)>>12 | word_slice_64_16_l(F[Fi+13]<<shift & x80008000)>> 8 \
+    |       word_slice_64_16_l(F[Fi+14]<<shift & x80008000)>> 4 | word_slice_64_16_l(F[Fi+15]<<shift & x80008000)>> 0;
+/* slice_rest: for the single word F[Fi], ORs word_slice_64_16_l((F[Fi] << k) & x80008000) >> shift into r0 .. r7 for k = 15, 14, ..., 8 respectively. The accumulators r0 .. r7 must be in scope where the macro is used. It expands to eight separate statements, so it must not be used as the unbraced body of an if/else or loop. */
+#define word_slice_64_16_l_slice_rest(F, Fi, shift)         \
+  r0 |= word_slice_64_16_l(F[Fi]<<15 & x80008000)>> shift;         \
+  r1 |= word_slice_64_16_l(F[Fi]<<14 & x80008000)>> shift;         \
+  r2 |= word_slice_64_16_l(F[Fi]<<13 & x80008000)>> shift;         \
+  r3 |= word_slice_64_16_l(F[Fi]<<12 & x80008000)>> shift;         \
+  r4 |= word_slice_64_16_l(F[Fi]<<11 & x80008000)>> shift;         \
+  r5 |= word_slice_64_16_l(F[Fi]<<10 & x80008000)>> shift;         \
+  r6 |= word_slice_64_16_l(F[Fi]<< 9 & x80008000)>> shift;         \
+  r7 |= word_slice_64_16_l(F[Fi]<< 8 & x80008000)>> shift;
+
+mzd_slice_t *_mzed_slice16(mzd_slice_t *T, const mzed_t *F) {
+  assert(T && (8 < T->depth && T->depth <= 16) && T->x[0]->offset == 0);
+  size_t j, j2 = 0;
+  register word r0,r1,r2,r3,r4,r5,r6,r7 = 0;
+
+  const word bitmask_end = __M4RI_LEFT_BITMASK((T->x[0]->offset + T->ncols) % m4ri_radix);
+
+  if (mzed_is_zero(F))
+    return T;
+
+  /* We make several passes over T to keep the code compact; the first pass fills the first
+     eight slices, T->x[0] to T->x[7]. */
+
+  for(size_t i=0; i<T->nrows; i++) {
+    word *t0 = T->x[0]->rows[i];
+    word *t1 = T->x[1]->rows[i];
+    word *t2 = T->x[2]->rows[i];
+    word *t3 = T->x[3]->rows[i];
+    word *t4 = T->x[4]->rows[i];
+    word *t5 = T->x[5]->rows[i];
+    word *t6 = T->x[6]->rows[i];
+    word *t7 = T->x[7]->rows[i];
+    const word const *f  = F->x->rows[i];
+
+    /* bulk of work */
+    for(j=0, j2=0; j+16 < F->x->width; j+=16,j2++) {
+      word_slice_64_16_l_combine_bulk(t0, j2, f, j, 15);
+      word_slice_64_16_l_combine_bulk(t1, j2, f, j, 14);
+      word_slice_64_16_l_combine_bulk(t2, j2, f, j, 13);
+      word_slice_64_16_l_combine_bulk(t3, j2, f, j, 12);
+      word_slice_64_16_l_combine_bulk(t4, j2, f, j, 11);
+      word_slice_64_16_l_combine_bulk(t5, j2, f, j, 10);
+      word_slice_64_16_l_combine_bulk(t6, j2, f, j,  9);
+      word_slice_64_16_l_combine_bulk(t7, j2, f, j,  8);
+    }
+    r0 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0;
+    switch(F->x->width - j) {
+    case 16: word_slice_64_16_l_slice_rest(f, j+15,  0);
+    case 15: word_slice_64_16_l_slice_rest(f, j+14,  4);
+    case 14: word_slice_64_16_l_slice_rest(f, j+13,  8);
+    case 13: word_slice_64_16_l_slice_rest(f, j+12, 12);
+    case 12: word_slice_64_16_l_slice_rest(f, j+11, 16);
+    case 11: word_slice_64_16_l_slice_rest(f, j+10, 20);
+    case 10: word_slice_64_16_l_slice_rest(f, j+ 9, 24);
+    case  9: word_slice_64_16_l_slice_rest(f, j+ 8, 28);
+    case  8: word_slice_64_16_l_slice_rest(f, j+ 7, 32);
+    case  7: word_slice_64_16_l_slice_rest(f, j+ 6, 36);
+    case  6: word_slice_64_16_l_slice_rest(f, j+ 5, 40);
+    case  5: word_slice_64_16_l_slice_rest(f, j+ 4, 44);
+    case  4: word_slice_64_16_l_slice_rest(f, j+ 3, 48);
+    case  3: word_slice_64_16_l_slice_rest(f, j+ 2, 52);
+    case  2: word_slice_64_16_l_slice_rest(f, j+ 1, 56);
+    case  1: word_slice_64_16_l_slice_rest(f, j+ 0, 60);
+      break;
+    default:
+      m4ri_die("impossible");
+    }
+    t0[j2] |= r0 & bitmask_end;
+    t1[j2] |= r1 & bitmask_end;
+    t2[j2] |= r2 & bitmask_end;
+    t3[j2] |= r3 & bitmask_end;
+    t4[j2] |= r4 & bitmask_end;
+    t5[j2] |= r5 & bitmask_end;
+    t6[j2] |= r6 & bitmask_end;
+    t7[j2] |= r7 & bitmask_end;
+  }
+  if(T->depth >= 12) {
+    for(size_t i=0; i<T->nrows; i++) {
+      word *t0 = T->x[ 8]->rows[i];
+      word *t1 = T->x[ 9]->rows[i];
+      word *t2 = T->x[10]->rows[i];
+      word *t3 = T->x[11]->rows[i];
+      const word const *f  = F->x->rows[i];
+
+      /* bulk of work */
+      for(j=0, j2=0; j+16 < F->x->width; j+=16,j2++) {
+        word_slice_64_16_l_combine_bulk(t0, j2, f, j,  7);
+        word_slice_64_16_l_combine_bulk(t1, j2, f, j,  6);
+        word_slice_64_16_l_combine_bulk(t2, j2, f, j,  5);
+        word_slice_64_16_l_combine_bulk(t3, j2, f, j,  4);
+      }
+      r0 = r1 = r2 = r3 = 0;
+      switch(F->x->width - j) {
+      case 16: r0 |= word_slice_64_16_l(f[j+15]<< 7 & x80008000)>>  0; r1 |= word_slice_64_16_l(f[j+15]<< 6 & x80008000)>>  0; r2 |= word_slice_64_16_l(f[j+15]<< 5 & x80008000)>>  0; r3 |= word_slice_64_16_l(f[j+15]<< 4 & x80008000)>>  0;
+      case 15: r0 |= word_slice_64_16_l(f[j+14]<< 7 & x80008000)>>  4; r1 |= word_slice_64_16_l(f[j+14]<< 6 & x80008000)>>  4; r2 |= word_slice_64_16_l(f[j+14]<< 5 & x80008000)>>  4; r3 |= word_slice_64_16_l(f[j+14]<< 4 & x80008000)>>  4;
+      case 14: r0 |= word_slice_64_16_l(f[j+13]<< 7 & x80008000)>>  8; r1 |= word_slice_64_16_l(f[j+13]<< 6 & x80008000)>>  8; r2 |= word_slice_64_16_l(f[j+13]<< 5 & x80008000)>>  8; r3 |= word_slice_64_16_l(f[j+13]<< 4 & x80008000)>>  8;
+      case 13: r0 |= word_slice_64_16_l(f[j+12]<< 7 & x80008000)>> 12; r1 |= word_slice_64_16_l(f[j+12]<< 6 & x80008000)>> 12; r2 |= word_slice_64_16_l(f[j+12]<< 5 & x80008000)>> 12; r3 |= word_slice_64_16_l(f[j+12]<< 4 & x80008000)>> 12;
+      case 12: r0 |= word_slice_64_16_l(f[j+11]<< 7 & x80008000)>> 16; r1 |= word_slice_64_16_l(f[j+11]<< 6 & x80008000)>> 16; r2 |= word_slice_64_16_l(f[j+11]<< 5 & x80008000)>> 16; r3 |= word_slice_64_16_l(f[j+11]<< 4 & x80008000)>> 16;
+      case 11: r0 |= word_slice_64_16_l(f[j+10]<< 7 & x80008000)>> 20; r1 |= word_slice_64_16_l(f[j+10]<< 6 & x80008000)>> 20; r2 |= word_slice_64_16_l(f[j+10]<< 5 & x80008000)>> 20; r3 |= word_slice_64_16_l(f[j+10]<< 4 & x80008000)>> 20;
+      case 10: r0 |= word_slice_64_16_l(f[j+ 9]<< 7 & x80008000)>> 24; r1 |= word_slice_64_16_l(f[j+ 9]<< 6 & x80008000)>> 24; r2 |= word_slice_64_16_l(f[j+ 9]<< 5 & x80008000)>> 24; r3 |= word_slice_64_16_l(f[j+ 9]<< 4 & x80008000)>> 24;
+      case  9: r0 |= word_slice_64_16_l(f[j+ 8]<< 7 & x80008000)>> 28; r1 |= word_slice_64_16_l(f[j+ 8]<< 6 & x80008000)>> 28; r2 |= word_slice_64_16_l(f[j+ 8]<< 5 & x80008000)>> 28; r3 |= word_slice_64_16_l(f[j+ 8]<< 4 & x80008000)>> 28;
+      case  8: r0 |= word_slice_64_16_l(f[j+ 7]<< 7 & x80008000)>> 32; r1 |= word_slice_64_16_l(f[j+ 7]<< 6 & x80008000)>> 32; r2 |= word_slice_64_16_l(f[j+ 7]<< 5 & x80008000)>> 32; r3 |= word_slice_64_16_l(f[j+ 7]<< 4 & x80008000)>> 32;
+      case  7: r0 |= word_slice_64_16_l(f[j+ 6]<< 7 & x80008000)>> 36; r1 |= word_slice_64_16_l(f[j+ 6]<< 6 & x80008000)>> 36; r2 |= word_slice_64_16_l(f[j+ 6]<< 5 & x80008000)>> 36; r3 |= word_slice_64_16_l(f[j+ 6]<< 4 & x80008000)>> 36;
+      case  6: r0 |= word_slice_64_16_l(f[j+ 5]<< 7 & x80008000)>> 40; r1 |= word_slice_64_16_l(f[j+ 5]<< 6 & x80008000)>> 40; r2 |= word_slice_64_16_l(f[j+ 5]<< 5 & x80008000)>> 40; r3 |= word_slice_64_16_l(f[j+ 5]<< 4 & x80008000)>> 40;
+      case  5: r0 |= word_slice_64_16_l(f[j+ 4]<< 7 & x80008000)>> 44; r1 |= word_slice_64_16_l(f[j+ 4]<< 6 & x80008000)>> 44; r2 |= word_slice_64_16_l(f[j+ 4]<< 5 & x80008000)>> 44; r3 |= word_slice_64_16_l(f[j+ 4]<< 4 & x80008000)>> 44;
+      case  4: r0 |= word_slice_64_16_l(f[j+ 3]<< 7 & x80008000)>> 48; r1 |= word_slice_64_16_l(f[j+ 3]<< 6 & x80008000)>> 48; r2 |= word_slice_64_16_l(f[j+ 3]<< 5 & x80008000)>> 48; r3 |= word_slice_64_16_l(f[j+ 3]<< 4 & x80008000)>> 48;
+      case  3: r0 |= word_slice_64_16_l(f[j+ 2]<< 7 & x80008000)>> 52; r1 |= word_slice_64_16_l(f[j+ 2]<< 6 & x80008000)>> 52; r2 |= word_slice_64_16_l(f[j+ 2]<< 5 & x80008000)>> 52; r3 |= word_slice_64_16_l(f[j+ 2]<< 4 & x80008000)>> 52;
+      case  2: r0 |= word_slice_64_16_l(f[j+ 1]<< 7 & x80008000)>> 56; r1 |= word_slice_64_16_l(f[j+ 1]<< 6 & x80008000)>> 56; r2 |= word_slice_64_16_l(f[j+ 1]<< 5 & x80008000)>> 56; r3 |= word_slice_64_16_l(f[j+ 1]<< 4 & x80008000)>> 56;
+      case  1: r0 |= word_slice_64_16_l(f[j+ 0]<< 7 & x80008000)>> 60; r1 |= word_slice_64_16_l(f[j+ 0]<< 6 & x80008000)>> 60; r2 |= word_slice_64_16_l(f[j+ 0]<< 5 & x80008000)>> 60; r3 |= word_slice_64_16_l(f[j+ 0]<< 4 & x80008000)>> 60;
         break;
-      case 5:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3);
-        t[j+3] = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3);
-        tmp    = (word_cling_64_08_l(f0[j2]<<24)>>7) | (word_cling_64_08_l(f1[j2]<<24)>>6) | (word_cling_64_08_l(f2[j2]<<24)>>5) | (word_cling_64_08_l(f3[j2]<<24)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<24)>>3);
-        t[j+4] = (t[j+4] & ~bitmask_end) | (tmp & bitmask_end);
+      default:
+        m4ri_die("impossible");
+      }
+    t0[j2] |= r0 & bitmask_end;
+    t1[j2] |= r1 & bitmask_end;
+    t2[j2] |= r2 & bitmask_end;
+    t3[j2] |= r3 & bitmask_end;
+    }
+
+    switch(T->depth) {
+    case 16: {
+      for(size_t i=0; i<T->nrows; i++) {
+        word *t0 = T->x[12]->rows[i];
+        word *t1 = T->x[13]->rows[i];
+        word *t2 = T->x[14]->rows[i];
+        word *t3 = T->x[15]->rows[i];
+        const word const *f  = F->x->rows[i];
+
+        /* bulk of work */
+        for(j=0, j2=0; j+16 < F->x->width; j+=16,j2++) {
+          word_slice_64_16_l_combine_bulk(t0, j2, f, j,  3);
+          word_slice_64_16_l_combine_bulk(t1, j2, f, j,  2);
+          word_slice_64_16_l_combine_bulk(t2, j2, f, j,  1);
+          word_slice_64_16_l_combine_bulk(t3, j2, f, j,  0);
+        }
+        r0 = r1 = r2 = r3 = 0;
+        switch(F->x->width - j) {
+        case 16: r0 |= word_slice_64_16_l(f[j+15]<< 3 & x80008000)>>  0; r1 |= word_slice_64_16_l(f[j+15]<< 2 & x80008000)>>  0; r2 |= word_slice_64_16_l(f[j+15]<< 1 & x80008000)>>  0; r3 |= word_slice_64_16_l(f[j+15]<< 0 & x80008000)>>  0;
+        case 15: r0 |= word_slice_64_16_l(f[j+14]<< 3 & x80008000)>>  4; r1 |= word_slice_64_16_l(f[j+14]<< 2 & x80008000)>>  4; r2 |= word_slice_64_16_l(f[j+14]<< 1 & x80008000)>>  4; r3 |= word_slice_64_16_l(f[j+14]<< 0 & x80008000)>>  4;
+        case 14: r0 |= word_slice_64_16_l(f[j+13]<< 3 & x80008000)>>  8; r1 |= word_slice_64_16_l(f[j+13]<< 2 & x80008000)>>  8; r2 |= word_slice_64_16_l(f[j+13]<< 1 & x80008000)>>  8; r3 |= word_slice_64_16_l(f[j+13]<< 0 & x80008000)>>  8;
+        case 13: r0 |= word_slice_64_16_l(f[j+12]<< 3 & x80008000)>> 12; r1 |= word_slice_64_16_l(f[j+12]<< 2 & x80008000)>> 12; r2 |= word_slice_64_16_l(f[j+12]<< 1 & x80008000)>> 12; r3 |= word_slice_64_16_l(f[j+12]<< 0 & x80008000)>> 12;
+        case 12: r0 |= word_slice_64_16_l(f[j+11]<< 3 & x80008000)>> 16; r1 |= word_slice_64_16_l(f[j+11]<< 2 & x80008000)>> 16; r2 |= word_slice_64_16_l(f[j+11]<< 1 & x80008000)>> 16; r3 |= word_slice_64_16_l(f[j+11]<< 0 & x80008000)>> 16;
+        case 11: r0 |= word_slice_64_16_l(f[j+10]<< 3 & x80008000)>> 20; r1 |= word_slice_64_16_l(f[j+10]<< 2 & x80008000)>> 20; r2 |= word_slice_64_16_l(f[j+10]<< 1 & x80008000)>> 20; r3 |= word_slice_64_16_l(f[j+10]<< 0 & x80008000)>> 20;
+        case 10: r0 |= word_slice_64_16_l(f[j+ 9]<< 3 & x80008000)>> 24; r1 |= word_slice_64_16_l(f[j+ 9]<< 2 & x80008000)>> 24; r2 |= word_slice_64_16_l(f[j+ 9]<< 1 & x80008000)>> 24; r3 |= word_slice_64_16_l(f[j+ 9]<< 0 & x80008000)>> 24;
+        case  9: r0 |= word_slice_64_16_l(f[j+ 8]<< 3 & x80008000)>> 28; r1 |= word_slice_64_16_l(f[j+ 8]<< 2 & x80008000)>> 28; r2 |= word_slice_64_16_l(f[j+ 8]<< 1 & x80008000)>> 28; r3 |= word_slice_64_16_l(f[j+ 8]<< 0 & x80008000)>> 28;
+        case  8: r0 |= word_slice_64_16_l(f[j+ 7]<< 3 & x80008000)>> 32; r1 |= word_slice_64_16_l(f[j+ 7]<< 2 & x80008000)>> 32; r2 |= word_slice_64_16_l(f[j+ 7]<< 1 & x80008000)>> 32; r3 |= word_slice_64_16_l(f[j+ 7]<< 0 & x80008000)>> 32;
+        case  7: r0 |= word_slice_64_16_l(f[j+ 6]<< 3 & x80008000)>> 36; r1 |= word_slice_64_16_l(f[j+ 6]<< 2 & x80008000)>> 36; r2 |= word_slice_64_16_l(f[j+ 6]<< 1 & x80008000)>> 36; r3 |= word_slice_64_16_l(f[j+ 6]<< 0 & x80008000)>> 36;
+        case  6: r0 |= word_slice_64_16_l(f[j+ 5]<< 3 & x80008000)>> 40; r1 |= word_slice_64_16_l(f[j+ 5]<< 2 & x80008000)>> 40; r2 |= word_slice_64_16_l(f[j+ 5]<< 1 & x80008000)>> 40; r3 |= word_slice_64_16_l(f[j+ 5]<< 0 & x80008000)>> 40;
+        case  5: r0 |= word_slice_64_16_l(f[j+ 4]<< 3 & x80008000)>> 44; r1 |= word_slice_64_16_l(f[j+ 4]<< 2 & x80008000)>> 44; r2 |= word_slice_64_16_l(f[j+ 4]<< 1 & x80008000)>> 44; r3 |= word_slice_64_16_l(f[j+ 4]<< 0 & x80008000)>> 44;
+        case  4: r0 |= word_slice_64_16_l(f[j+ 3]<< 3 & x80008000)>> 48; r1 |= word_slice_64_16_l(f[j+ 3]<< 2 & x80008000)>> 48; r2 |= word_slice_64_16_l(f[j+ 3]<< 1 & x80008000)>> 48; r3 |= word_slice_64_16_l(f[j+ 3]<< 0 & x80008000)>> 48;
+        case  3: r0 |= word_slice_64_16_l(f[j+ 2]<< 3 & x80008000)>> 52; r1 |= word_slice_64_16_l(f[j+ 2]<< 2 & x80008000)>> 52; r2 |= word_slice_64_16_l(f[j+ 2]<< 1 & x80008000)>> 52; r3 |= word_slice_64_16_l(f[j+ 2]<< 0 & x80008000)>> 52;
+        case  2: r0 |= word_slice_64_16_l(f[j+ 1]<< 3 & x80008000)>> 56; r1 |= word_slice_64_16_l(f[j+ 1]<< 2 & x80008000)>> 56; r2 |= word_slice_64_16_l(f[j+ 1]<< 1 & x80008000)>> 56; r3 |= word_slice_64_16_l(f[j+ 1]<< 0 & x80008000)>> 56;
+        case  1: r0 |= word_slice_64_16_l(f[j+ 0]<< 3 & x80008000)>> 60; r1 |= word_slice_64_16_l(f[j+ 0]<< 2 & x80008000)>> 60; r2 |= word_slice_64_16_l(f[j+ 0]<< 1 & x80008000)>> 60; r3 |= word_slice_64_16_l(f[j+ 0]<< 0 & x80008000)>> 60;
+          break;
+        default:
+          m4ri_die("impossible");
+        }
+        t0[j2] |= r0 & bitmask_end;
+        t1[j2] |= r1 & bitmask_end;
+        t2[j2] |= r2 & bitmask_end;
+        t3[j2] |= r3 & bitmask_end;
+      }
+    } break;
+    case 15: {
+      for(size_t i=0; i<T->nrows; i++) {
+        word *t0 = T->x[12]->rows[i];
+        word *t1 = T->x[13]->rows[i];
+        word *t2 = T->x[14]->rows[i];
+        const word const *f  = F->x->rows[i];
+
+        /* bulk of work */
+        for(j=0, j2=0; j+16 < F->x->width; j+=16,j2++) {
+          word_slice_64_16_l_combine_bulk(t0, j2, f, j,  3);
+          word_slice_64_16_l_combine_bulk(t1, j2, f, j,  2);
+          word_slice_64_16_l_combine_bulk(t2, j2, f, j,  1);
+        }
+        r0 = r1 = r2 = 0;
+        switch(F->x->width - j) {
+        case 16: r0 |= word_slice_64_16_l(f[j+15]<< 3 & x80008000)>>  0; r1 |= word_slice_64_16_l(f[j+15]<< 2 & x80008000)>>  0; r2 |= word_slice_64_16_l(f[j+15]<< 1 & x80008000)>>  0;
+        case 15: r0 |= word_slice_64_16_l(f[j+14]<< 3 & x80008000)>>  4; r1 |= word_slice_64_16_l(f[j+14]<< 2 & x80008000)>>  4; r2 |= word_slice_64_16_l(f[j+14]<< 1 & x80008000)>>  4;
+        case 14: r0 |= word_slice_64_16_l(f[j+13]<< 3 & x80008000)>>  8; r1 |= word_slice_64_16_l(f[j+13]<< 2 & x80008000)>>  8; r2 |= word_slice_64_16_l(f[j+13]<< 1 & x80008000)>>  8;
+        case 13: r0 |= word_slice_64_16_l(f[j+12]<< 3 & x80008000)>> 12; r1 |= word_slice_64_16_l(f[j+12]<< 2 & x80008000)>> 12; r2 |= word_slice_64_16_l(f[j+12]<< 1 & x80008000)>> 12;
+        case 12: r0 |= word_slice_64_16_l(f[j+11]<< 3 & x80008000)>> 16; r1 |= word_slice_64_16_l(f[j+11]<< 2 & x80008000)>> 16; r2 |= word_slice_64_16_l(f[j+11]<< 1 & x80008000)>> 16;
+        case 11: r0 |= word_slice_64_16_l(f[j+10]<< 3 & x80008000)>> 20; r1 |= word_slice_64_16_l(f[j+10]<< 2 & x80008000)>> 20; r2 |= word_slice_64_16_l(f[j+10]<< 1 & x80008000)>> 20;
+        case 10: r0 |= word_slice_64_16_l(f[j+ 9]<< 3 & x80008000)>> 24; r1 |= word_slice_64_16_l(f[j+ 9]<< 2 & x80008000)>> 24; r2 |= word_slice_64_16_l(f[j+ 9]<< 1 & x80008000)>> 24;
+        case  9: r0 |= word_slice_64_16_l(f[j+ 8]<< 3 & x80008000)>> 28; r1 |= word_slice_64_16_l(f[j+ 8]<< 2 & x80008000)>> 28; r2 |= word_slice_64_16_l(f[j+ 8]<< 1 & x80008000)>> 28;
+        case  8: r0 |= word_slice_64_16_l(f[j+ 7]<< 3 & x80008000)>> 32; r1 |= word_slice_64_16_l(f[j+ 7]<< 2 & x80008000)>> 32; r2 |= word_slice_64_16_l(f[j+ 7]<< 1 & x80008000)>> 32;
+        case  7: r0 |= word_slice_64_16_l(f[j+ 6]<< 3 & x80008000)>> 36; r1 |= word_slice_64_16_l(f[j+ 6]<< 2 & x80008000)>> 36; r2 |= word_slice_64_16_l(f[j+ 6]<< 1 & x80008000)>> 36;
+        case  6: r0 |= word_slice_64_16_l(f[j+ 5]<< 3 & x80008000)>> 40; r1 |= word_slice_64_16_l(f[j+ 5]<< 2 & x80008000)>> 40; r2 |= word_slice_64_16_l(f[j+ 5]<< 1 & x80008000)>> 40;
+        case  5: r0 |= word_slice_64_16_l(f[j+ 4]<< 3 & x80008000)>> 44; r1 |= word_slice_64_16_l(f[j+ 4]<< 2 & x80008000)>> 44; r2 |= word_slice_64_16_l(f[j+ 4]<< 1 & x80008000)>> 44;
+        case  4: r0 |= word_slice_64_16_l(f[j+ 3]<< 3 & x80008000)>> 48; r1 |= word_slice_64_16_l(f[j+ 3]<< 2 & x80008000)>> 48; r2 |= word_slice_64_16_l(f[j+ 3]<< 1 & x80008000)>> 48;
+        case  3: r0 |= word_slice_64_16_l(f[j+ 2]<< 3 & x80008000)>> 52; r1 |= word_slice_64_16_l(f[j+ 2]<< 2 & x80008000)>> 52; r2 |= word_slice_64_16_l(f[j+ 2]<< 1 & x80008000)>> 52;
+        case  2: r0 |= word_slice_64_16_l(f[j+ 1]<< 3 & x80008000)>> 56; r1 |= word_slice_64_16_l(f[j+ 1]<< 2 & x80008000)>> 56; r2 |= word_slice_64_16_l(f[j+ 1]<< 1 & x80008000)>> 56;
+        case  1: r0 |= word_slice_64_16_l(f[j+ 0]<< 3 & x80008000)>> 60; r1 |= word_slice_64_16_l(f[j+ 0]<< 2 & x80008000)>> 60; r2 |= word_slice_64_16_l(f[j+ 0]<< 1 & x80008000)>> 60;
+          break;
+        default:
+          m4ri_die("impossible");
+        }
+        t0[j2] |= r0 & bitmask_end;
+        t1[j2] |= r1 & bitmask_end;
+        t2[j2] |= r2 & bitmask_end;
+      }
+    } break;
+    case 14: {
+      for(size_t i=0; i<T->nrows; i++) {
+        word *t0 = T->x[12]->rows[i];
+        word *t1 = T->x[13]->rows[i];
+        const word const *f  = F->x->rows[i];
+
+        /* bulk of work */
+        for(j=0, j2=0; j+16 < F->x->width; j+=16,j2++) {
+          word_slice_64_16_l_combine_bulk(t0, j2, f, j,  3);
+          word_slice_64_16_l_combine_bulk(t1, j2, f, j,  2);
+        }
+        r0 = r1 = 0;
+        switch(F->x->width - j) {
+        case 16: r0 |= word_slice_64_16_l(f[j+15]<< 3 & x80008000)>>  0; r1 |= word_slice_64_16_l(f[j+15]<< 2 & x80008000)>>  0;
+        case 15: r0 |= word_slice_64_16_l(f[j+14]<< 3 & x80008000)>>  4; r1 |= word_slice_64_16_l(f[j+14]<< 2 & x80008000)>>  4;
+        case 14: r0 |= word_slice_64_16_l(f[j+13]<< 3 & x80008000)>>  8; r1 |= word_slice_64_16_l(f[j+13]<< 2 & x80008000)>>  8;
+        case 13: r0 |= word_slice_64_16_l(f[j+12]<< 3 & x80008000)>> 12; r1 |= word_slice_64_16_l(f[j+12]<< 2 & x80008000)>> 12;
+        case 12: r0 |= word_slice_64_16_l(f[j+11]<< 3 & x80008000)>> 16; r1 |= word_slice_64_16_l(f[j+11]<< 2 & x80008000)>> 16;
+        case 11: r0 |= word_slice_64_16_l(f[j+10]<< 3 & x80008000)>> 20; r1 |= word_slice_64_16_l(f[j+10]<< 2 & x80008000)>> 20;
+        case 10: r0 |= word_slice_64_16_l(f[j+ 9]<< 3 & x80008000)>> 24; r1 |= word_slice_64_16_l(f[j+ 9]<< 2 & x80008000)>> 24;
+        case  9: r0 |= word_slice_64_16_l(f[j+ 8]<< 3 & x80008000)>> 28; r1 |= word_slice_64_16_l(f[j+ 8]<< 2 & x80008000)>> 28;
+        case  8: r0 |= word_slice_64_16_l(f[j+ 7]<< 3 & x80008000)>> 32; r1 |= word_slice_64_16_l(f[j+ 7]<< 2 & x80008000)>> 32;
+        case  7: r0 |= word_slice_64_16_l(f[j+ 6]<< 3 & x80008000)>> 36; r1 |= word_slice_64_16_l(f[j+ 6]<< 2 & x80008000)>> 36;
+        case  6: r0 |= word_slice_64_16_l(f[j+ 5]<< 3 & x80008000)>> 40; r1 |= word_slice_64_16_l(f[j+ 5]<< 2 & x80008000)>> 40;
+        case  5: r0 |= word_slice_64_16_l(f[j+ 4]<< 3 & x80008000)>> 44; r1 |= word_slice_64_16_l(f[j+ 4]<< 2 & x80008000)>> 44;
+        case  4: r0 |= word_slice_64_16_l(f[j+ 3]<< 3 & x80008000)>> 48; r1 |= word_slice_64_16_l(f[j+ 3]<< 2 & x80008000)>> 48;
+        case  3: r0 |= word_slice_64_16_l(f[j+ 2]<< 3 & x80008000)>> 52; r1 |= word_slice_64_16_l(f[j+ 2]<< 2 & x80008000)>> 52;
+        case  2: r0 |= word_slice_64_16_l(f[j+ 1]<< 3 & x80008000)>> 56; r1 |= word_slice_64_16_l(f[j+ 1]<< 2 & x80008000)>> 56;
+        case  1: r0 |= word_slice_64_16_l(f[j+ 0]<< 3 & x80008000)>> 60; r1 |= word_slice_64_16_l(f[j+ 0]<< 2 & x80008000)>> 60;
+          break;
+        default:
+          m4ri_die("impossible");
+        }
+        t0[j2] |= r0 & bitmask_end;
+        t1[j2] |= r1 & bitmask_end;
+      }
+    } break;
+    case 13: {
+      for(size_t i=0; i<T->nrows; i++) {
+        word *t0 = T->x[12]->rows[i];
+        const word const *f  = F->x->rows[i];
+
+        /* bulk of work */
+        for(j=0, j2=0; j+16 < F->x->width; j+=16,j2++) {
+          word_slice_64_16_l_combine_bulk(t0, j2, f, j,  3);
+        }
+        r0 = 0;
+        switch(F->x->width - j) {
+        case 16: r0 |= word_slice_64_16_l(f[j+15]<< 3 & x80008000)>>  0;
+        case 15: r0 |= word_slice_64_16_l(f[j+14]<< 3 & x80008000)>>  4;
+        case 14: r0 |= word_slice_64_16_l(f[j+13]<< 3 & x80008000)>>  8;
+        case 13: r0 |= word_slice_64_16_l(f[j+12]<< 3 & x80008000)>> 12;
+        case 12: r0 |= word_slice_64_16_l(f[j+11]<< 3 & x80008000)>> 16;
+        case 11: r0 |= word_slice_64_16_l(f[j+10]<< 3 & x80008000)>> 20;
+        case 10: r0 |= word_slice_64_16_l(f[j+ 9]<< 3 & x80008000)>> 24;
+        case  9: r0 |= word_slice_64_16_l(f[j+ 8]<< 3 & x80008000)>> 28;
+        case  8: r0 |= word_slice_64_16_l(f[j+ 7]<< 3 & x80008000)>> 32;
+        case  7: r0 |= word_slice_64_16_l(f[j+ 6]<< 3 & x80008000)>> 36;
+        case  6: r0 |= word_slice_64_16_l(f[j+ 5]<< 3 & x80008000)>> 40;
+        case  5: r0 |= word_slice_64_16_l(f[j+ 4]<< 3 & x80008000)>> 44;
+        case  4: r0 |= word_slice_64_16_l(f[j+ 3]<< 3 & x80008000)>> 48;
+        case  3: r0 |= word_slice_64_16_l(f[j+ 2]<< 3 & x80008000)>> 52;
+        case  2: r0 |= word_slice_64_16_l(f[j+ 1]<< 3 & x80008000)>> 56;
+        case  1: r0 |= word_slice_64_16_l(f[j+ 0]<< 3 & x80008000)>> 60;
+          break;
+        default:
+          m4ri_die("impossible");
+        }
+        t0[j2] |= r0 & bitmask_end;
+      }
+    } break;
+    }
+  } else {
+    switch(T->depth) {
+    case 11: {
+      for(size_t i=0; i<T->nrows; i++) {
+        word *t0 = T->x[ 8]->rows[i];
+        word *t1 = T->x[ 9]->rows[i];
+        word *t2 = T->x[10]->rows[i];
+        const word const *f  = F->x->rows[i];
+
+        /* bulk of work */
+        for(j=0, j2=0; j+16 < F->x->width; j+=16,j2++) {
+          word_slice_64_16_l_combine_bulk(t0, j2, f, j,  7);
+          word_slice_64_16_l_combine_bulk(t1, j2, f, j,  6);
+          word_slice_64_16_l_combine_bulk(t2, j2, f, j,  5);
+        }
+        r0 = r1 = r2 = 0;
+        switch(F->x->width - j) {
+        case 16: r0 |= word_slice_64_16_l(f[j+15]<< 7 & x80008000)>>  0; r1 |= word_slice_64_16_l(f[j+15]<< 6 & x80008000)>>  0; r2 |= word_slice_64_16_l(f[j+15]<< 5 & x80008000)>>  0;
+        case 15: r0 |= word_slice_64_16_l(f[j+14]<< 7 & x80008000)>>  4; r1 |= word_slice_64_16_l(f[j+14]<< 6 & x80008000)>>  4; r2 |= word_slice_64_16_l(f[j+14]<< 5 & x80008000)>>  4;
+        case 14: r0 |= word_slice_64_16_l(f[j+13]<< 7 & x80008000)>>  8; r1 |= word_slice_64_16_l(f[j+13]<< 6 & x80008000)>>  8; r2 |= word_slice_64_16_l(f[j+13]<< 5 & x80008000)>>  8;
+        case 13: r0 |= word_slice_64_16_l(f[j+12]<< 7 & x80008000)>> 12; r1 |= word_slice_64_16_l(f[j+12]<< 6 & x80008000)>> 12; r2 |= word_slice_64_16_l(f[j+12]<< 5 & x80008000)>> 12;
+        case 12: r0 |= word_slice_64_16_l(f[j+11]<< 7 & x80008000)>> 16; r1 |= word_slice_64_16_l(f[j+11]<< 6 & x80008000)>> 16; r2 |= word_slice_64_16_l(f[j+11]<< 5 & x80008000)>> 16;
+        case 11: r0 |= word_slice_64_16_l(f[j+10]<< 7 & x80008000)>> 20; r1 |= word_slice_64_16_l(f[j+10]<< 6 & x80008000)>> 20; r2 |= word_slice_64_16_l(f[j+10]<< 5 & x80008000)>> 20;
+        case 10: r0 |= word_slice_64_16_l(f[j+ 9]<< 7 & x80008000)>> 24; r1 |= word_slice_64_16_l(f[j+ 9]<< 6 & x80008000)>> 24; r2 |= word_slice_64_16_l(f[j+ 9]<< 5 & x80008000)>> 24;
+        case  9: r0 |= word_slice_64_16_l(f[j+ 8]<< 7 & x80008000)>> 28; r1 |= word_slice_64_16_l(f[j+ 8]<< 6 & x80008000)>> 28; r2 |= word_slice_64_16_l(f[j+ 8]<< 5 & x80008000)>> 28;
+        case  8: r0 |= word_slice_64_16_l(f[j+ 7]<< 7 & x80008000)>> 32; r1 |= word_slice_64_16_l(f[j+ 7]<< 6 & x80008000)>> 32; r2 |= word_slice_64_16_l(f[j+ 7]<< 5 & x80008000)>> 32;
+        case  7: r0 |= word_slice_64_16_l(f[j+ 6]<< 7 & x80008000)>> 36; r1 |= word_slice_64_16_l(f[j+ 6]<< 6 & x80008000)>> 36; r2 |= word_slice_64_16_l(f[j+ 6]<< 5 & x80008000)>> 36;
+        case  6: r0 |= word_slice_64_16_l(f[j+ 5]<< 7 & x80008000)>> 40; r1 |= word_slice_64_16_l(f[j+ 5]<< 6 & x80008000)>> 40; r2 |= word_slice_64_16_l(f[j+ 5]<< 5 & x80008000)>> 40;
+        case  5: r0 |= word_slice_64_16_l(f[j+ 4]<< 7 & x80008000)>> 44; r1 |= word_slice_64_16_l(f[j+ 4]<< 6 & x80008000)>> 44; r2 |= word_slice_64_16_l(f[j+ 4]<< 5 & x80008000)>> 44;
+        case  4: r0 |= word_slice_64_16_l(f[j+ 3]<< 7 & x80008000)>> 48; r1 |= word_slice_64_16_l(f[j+ 3]<< 6 & x80008000)>> 48; r2 |= word_slice_64_16_l(f[j+ 3]<< 5 & x80008000)>> 48;
+        case  3: r0 |= word_slice_64_16_l(f[j+ 2]<< 7 & x80008000)>> 52; r1 |= word_slice_64_16_l(f[j+ 2]<< 6 & x80008000)>> 52; r2 |= word_slice_64_16_l(f[j+ 2]<< 5 & x80008000)>> 52;
+        case  2: r0 |= word_slice_64_16_l(f[j+ 1]<< 7 & x80008000)>> 56; r1 |= word_slice_64_16_l(f[j+ 1]<< 6 & x80008000)>> 56; r2 |= word_slice_64_16_l(f[j+ 1]<< 5 & x80008000)>> 56;
+        case  1: r0 |= word_slice_64_16_l(f[j+ 0]<< 7 & x80008000)>> 60; r1 |= word_slice_64_16_l(f[j+ 0]<< 6 & x80008000)>> 60; r2 |= word_slice_64_16_l(f[j+ 0]<< 5 & x80008000)>> 60;
+          break;
+        default:
+          m4ri_die("impossible");
+        }
+        t0[j2] |= r0 & bitmask_end;
+        t1[j2] |= r1 & bitmask_end;
+        t2[j2] |= r2 & bitmask_end;
+      }
+    } break;
+    case 10: {
+      for(size_t i=0; i<T->nrows; i++) {
+        word *t0 = T->x[ 8]->rows[i];
+        word *t1 = T->x[ 9]->rows[i];
+        const word const *f  = F->x->rows[i];
+
+        /* bulk of work */
+        for(j=0, j2=0; j+16 < F->x->width; j+=16,j2++) {
+          word_slice_64_16_l_combine_bulk(t0, j2, f, j,  7);
+          word_slice_64_16_l_combine_bulk(t1, j2, f, j,  6);
+        }
+        r0 = r1 = 0;
+        switch(F->x->width - j) {
+        case 16: r0 |= word_slice_64_16_l(f[j+15]<< 7 & x80008000)>>  0; r1 |= word_slice_64_16_l(f[j+15]<< 6 & x80008000)>>  0;
+        case 15: r0 |= word_slice_64_16_l(f[j+14]<< 7 & x80008000)>>  4; r1 |= word_slice_64_16_l(f[j+14]<< 6 & x80008000)>>  4;
+        case 14: r0 |= word_slice_64_16_l(f[j+13]<< 7 & x80008000)>>  8; r1 |= word_slice_64_16_l(f[j+13]<< 6 & x80008000)>>  8;
+        case 13: r0 |= word_slice_64_16_l(f[j+12]<< 7 & x80008000)>> 12; r1 |= word_slice_64_16_l(f[j+12]<< 6 & x80008000)>> 12;
+        case 12: r0 |= word_slice_64_16_l(f[j+11]<< 7 & x80008000)>> 16; r1 |= word_slice_64_16_l(f[j+11]<< 6 & x80008000)>> 16;
+        case 11: r0 |= word_slice_64_16_l(f[j+10]<< 7 & x80008000)>> 20; r1 |= word_slice_64_16_l(f[j+10]<< 6 & x80008000)>> 20;
+        case 10: r0 |= word_slice_64_16_l(f[j+ 9]<< 7 & x80008000)>> 24; r1 |= word_slice_64_16_l(f[j+ 9]<< 6 & x80008000)>> 24;
+        case  9: r0 |= word_slice_64_16_l(f[j+ 8]<< 7 & x80008000)>> 28; r1 |= word_slice_64_16_l(f[j+ 8]<< 6 & x80008000)>> 28;
+        case  8: r0 |= word_slice_64_16_l(f[j+ 7]<< 7 & x80008000)>> 32; r1 |= word_slice_64_16_l(f[j+ 7]<< 6 & x80008000)>> 32;
+        case  7: r0 |= word_slice_64_16_l(f[j+ 6]<< 7 & x80008000)>> 36; r1 |= word_slice_64_16_l(f[j+ 6]<< 6 & x80008000)>> 36;
+        case  6: r0 |= word_slice_64_16_l(f[j+ 5]<< 7 & x80008000)>> 40; r1 |= word_slice_64_16_l(f[j+ 5]<< 6 & x80008000)>> 40;
+        case  5: r0 |= word_slice_64_16_l(f[j+ 4]<< 7 & x80008000)>> 44; r1 |= word_slice_64_16_l(f[j+ 4]<< 6 & x80008000)>> 44;
+        case  4: r0 |= word_slice_64_16_l(f[j+ 3]<< 7 & x80008000)>> 48; r1 |= word_slice_64_16_l(f[j+ 3]<< 6 & x80008000)>> 48;
+        case  3: r0 |= word_slice_64_16_l(f[j+ 2]<< 7 & x80008000)>> 52; r1 |= word_slice_64_16_l(f[j+ 2]<< 6 & x80008000)>> 52;
+        case  2: r0 |= word_slice_64_16_l(f[j+ 1]<< 7 & x80008000)>> 56; r1 |= word_slice_64_16_l(f[j+ 1]<< 6 & x80008000)>> 56;
+        case  1: r0 |= word_slice_64_16_l(f[j+ 0]<< 7 & x80008000)>> 60; r1 |= word_slice_64_16_l(f[j+ 0]<< 6 & x80008000)>> 60;
+          break;
+        default:
+          m4ri_die("impossible");
+        }
+        t0[j2] |= r0 & bitmask_end;
+        t1[j2] |= r1 & bitmask_end;
+      }
+    } break;
+    case  9: {
+      for(size_t i=0; i<T->nrows; i++) {
+        word *t0 = T->x[ 8]->rows[i];
+        const word const *f  = F->x->rows[i];
+
+        /* bulk of work */
+        for(j=0, j2=0; j+16 < F->x->width; j+=16,j2++) {
+          word_slice_64_16_l_combine_bulk(t0, j2, f, j,  7);
+        }
+        r0 = 0;
+        switch(F->x->width - j) {
+        case 16: r0 |= word_slice_64_16_l(f[j+15]<< 7 & x80008000)>>  0;
+        case 15: r0 |= word_slice_64_16_l(f[j+14]<< 7 & x80008000)>>  4;
+        case 14: r0 |= word_slice_64_16_l(f[j+13]<< 7 & x80008000)>>  8;
+        case 13: r0 |= word_slice_64_16_l(f[j+12]<< 7 & x80008000)>> 12;
+        case 12: r0 |= word_slice_64_16_l(f[j+11]<< 7 & x80008000)>> 16;
+        case 11: r0 |= word_slice_64_16_l(f[j+10]<< 7 & x80008000)>> 20;
+        case 10: r0 |= word_slice_64_16_l(f[j+ 9]<< 7 & x80008000)>> 24;
+        case  9: r0 |= word_slice_64_16_l(f[j+ 8]<< 7 & x80008000)>> 28;
+        case  8: r0 |= word_slice_64_16_l(f[j+ 7]<< 7 & x80008000)>> 32;
+        case  7: r0 |= word_slice_64_16_l(f[j+ 6]<< 7 & x80008000)>> 36;
+        case  6: r0 |= word_slice_64_16_l(f[j+ 5]<< 7 & x80008000)>> 40;
+        case  5: r0 |= word_slice_64_16_l(f[j+ 4]<< 7 & x80008000)>> 44;
+        case  4: r0 |= word_slice_64_16_l(f[j+ 3]<< 7 & x80008000)>> 48;
+        case  3: r0 |= word_slice_64_16_l(f[j+ 2]<< 7 & x80008000)>> 52;
+        case  2: r0 |= word_slice_64_16_l(f[j+ 1]<< 7 & x80008000)>> 56;
+        case  1: r0 |= word_slice_64_16_l(f[j+ 0]<< 7 & x80008000)>> 60;
+          break;
+        default:
+          m4ri_die("impossible");
+        }
+        t0[j2] |= r0 & bitmask_end;
+      }
+    } break;
+    default:
+      m4ri_die("impossible");
+    }
+  }
+  return T;
+}
+
+mzed_t *_mzed_cling16(mzed_t *T, const mzd_slice_t *F) {
+  wi_t j,j2 = 0;
+
+  const word bitmask_end = __M4RI_LEFT_BITMASK((T->x->offset + T->x->ncols) % m4ri_radix);
+
+  if (mzd_slice_is_zero(F))
+    return T;
+
+  for(rci_t i=0; i<T->nrows; i++) {
+    const word *f00 = F->x[ 0]->rows[i];
+    const word *f01 = F->x[ 1]->rows[i];
+    const word *f02 = F->x[ 2]->rows[i];
+    const word *f03 = F->x[ 3]->rows[i];
+    const word *f04 = F->x[ 4]->rows[i];
+    const word *f05 = F->x[ 5]->rows[i];
+    const word *f06 = F->x[ 6]->rows[i];
+    const word *f07 = F->x[ 7]->rows[i];
+    word *t  = T->x->rows[i];
+
+    for(j=0, j2=0; j+16 < T->x->width; j+=16, j2++) {
+      t[j+ 0] = (word_cling_64_16_l(f00[j2]<<60)>>15) | (word_cling_64_16_l(f01[j2]<<60)>>14) | (word_cling_64_16_l(f02[j2]<<60)>>13) | (word_cling_64_16_l(f03[j2]<<60)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<60)>>11) | (word_cling_64_16_l(f05[j2]<<60)>>10) | (word_cling_64_16_l(f06[j2]<<60)>> 9) | (word_cling_64_16_l(f07[j2]<<60)>> 8);
+      t[j+ 1] = (word_cling_64_16_l(f00[j2]<<56)>>15) | (word_cling_64_16_l(f01[j2]<<56)>>14) | (word_cling_64_16_l(f02[j2]<<56)>>13) | (word_cling_64_16_l(f03[j2]<<56)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<56)>>11) | (word_cling_64_16_l(f05[j2]<<56)>>10) | (word_cling_64_16_l(f06[j2]<<56)>> 9) | (word_cling_64_16_l(f07[j2]<<56)>> 8);
+      t[j+ 2] = (word_cling_64_16_l(f00[j2]<<52)>>15) | (word_cling_64_16_l(f01[j2]<<52)>>14) | (word_cling_64_16_l(f02[j2]<<52)>>13) | (word_cling_64_16_l(f03[j2]<<52)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<52)>>11) | (word_cling_64_16_l(f05[j2]<<52)>>10) | (word_cling_64_16_l(f06[j2]<<52)>> 9) | (word_cling_64_16_l(f07[j2]<<52)>> 8);
+      t[j+ 3] = (word_cling_64_16_l(f00[j2]<<48)>>15) | (word_cling_64_16_l(f01[j2]<<48)>>14) | (word_cling_64_16_l(f02[j2]<<48)>>13) | (word_cling_64_16_l(f03[j2]<<48)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<48)>>11) | (word_cling_64_16_l(f05[j2]<<48)>>10) | (word_cling_64_16_l(f06[j2]<<48)>> 9) | (word_cling_64_16_l(f07[j2]<<48)>> 8);
+      t[j+ 4] = (word_cling_64_16_l(f00[j2]<<44)>>15) | (word_cling_64_16_l(f01[j2]<<44)>>14) | (word_cling_64_16_l(f02[j2]<<44)>>13) | (word_cling_64_16_l(f03[j2]<<44)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<44)>>11) | (word_cling_64_16_l(f05[j2]<<44)>>10) | (word_cling_64_16_l(f06[j2]<<44)>> 9) | (word_cling_64_16_l(f07[j2]<<44)>> 8);
+      t[j+ 5] = (word_cling_64_16_l(f00[j2]<<40)>>15) | (word_cling_64_16_l(f01[j2]<<40)>>14) | (word_cling_64_16_l(f02[j2]<<40)>>13) | (word_cling_64_16_l(f03[j2]<<40)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<40)>>11) | (word_cling_64_16_l(f05[j2]<<40)>>10) | (word_cling_64_16_l(f06[j2]<<40)>> 9) | (word_cling_64_16_l(f07[j2]<<40)>> 8);
+      t[j+ 6] = (word_cling_64_16_l(f00[j2]<<36)>>15) | (word_cling_64_16_l(f01[j2]<<36)>>14) | (word_cling_64_16_l(f02[j2]<<36)>>13) | (word_cling_64_16_l(f03[j2]<<36)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<36)>>11) | (word_cling_64_16_l(f05[j2]<<36)>>10) | (word_cling_64_16_l(f06[j2]<<36)>> 9) | (word_cling_64_16_l(f07[j2]<<36)>> 8);
+      t[j+ 7] = (word_cling_64_16_l(f00[j2]<<32)>>15) | (word_cling_64_16_l(f01[j2]<<32)>>14) | (word_cling_64_16_l(f02[j2]<<32)>>13) | (word_cling_64_16_l(f03[j2]<<32)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<32)>>11) | (word_cling_64_16_l(f05[j2]<<32)>>10) | (word_cling_64_16_l(f06[j2]<<32)>> 9) | (word_cling_64_16_l(f07[j2]<<32)>> 8);
+      t[j+ 8] = (word_cling_64_16_l(f00[j2]<<28)>>15) | (word_cling_64_16_l(f01[j2]<<28)>>14) | (word_cling_64_16_l(f02[j2]<<28)>>13) | (word_cling_64_16_l(f03[j2]<<28)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<28)>>11) | (word_cling_64_16_l(f05[j2]<<28)>>10) | (word_cling_64_16_l(f06[j2]<<28)>> 9) | (word_cling_64_16_l(f07[j2]<<28)>> 8);
+      t[j+ 9] = (word_cling_64_16_l(f00[j2]<<24)>>15) | (word_cling_64_16_l(f01[j2]<<24)>>14) | (word_cling_64_16_l(f02[j2]<<24)>>13) | (word_cling_64_16_l(f03[j2]<<24)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<24)>>11) | (word_cling_64_16_l(f05[j2]<<24)>>10) | (word_cling_64_16_l(f06[j2]<<24)>> 9) | (word_cling_64_16_l(f07[j2]<<24)>> 8);
+      t[j+10] = (word_cling_64_16_l(f00[j2]<<20)>>15) | (word_cling_64_16_l(f01[j2]<<20)>>14) | (word_cling_64_16_l(f02[j2]<<20)>>13) | (word_cling_64_16_l(f03[j2]<<20)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<20)>>11) | (word_cling_64_16_l(f05[j2]<<20)>>10) | (word_cling_64_16_l(f06[j2]<<20)>> 9) | (word_cling_64_16_l(f07[j2]<<20)>> 8);
+      t[j+11] = (word_cling_64_16_l(f00[j2]<<16)>>15) | (word_cling_64_16_l(f01[j2]<<16)>>14) | (word_cling_64_16_l(f02[j2]<<16)>>13) | (word_cling_64_16_l(f03[j2]<<16)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<16)>>11) | (word_cling_64_16_l(f05[j2]<<16)>>10) | (word_cling_64_16_l(f06[j2]<<16)>> 9) | (word_cling_64_16_l(f07[j2]<<16)>> 8);
+      t[j+12] = (word_cling_64_16_l(f00[j2]<<12)>>15) | (word_cling_64_16_l(f01[j2]<<12)>>14) | (word_cling_64_16_l(f02[j2]<<12)>>13) | (word_cling_64_16_l(f03[j2]<<12)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<<12)>>11) | (word_cling_64_16_l(f05[j2]<<12)>>10) | (word_cling_64_16_l(f06[j2]<<12)>> 9) | (word_cling_64_16_l(f07[j2]<<12)>> 8);
+      t[j+13] = (word_cling_64_16_l(f00[j2]<< 8)>>15) | (word_cling_64_16_l(f01[j2]<< 8)>>14) | (word_cling_64_16_l(f02[j2]<< 8)>>13) | (word_cling_64_16_l(f03[j2]<< 8)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<< 8)>>11) | (word_cling_64_16_l(f05[j2]<< 8)>>10) | (word_cling_64_16_l(f06[j2]<< 8)>> 9) | (word_cling_64_16_l(f07[j2]<< 8)>> 8);
+      t[j+14] = (word_cling_64_16_l(f00[j2]<< 4)>>15) | (word_cling_64_16_l(f01[j2]<< 4)>>14) | (word_cling_64_16_l(f02[j2]<< 4)>>13) | (word_cling_64_16_l(f03[j2]<< 4)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<< 4)>>11) | (word_cling_64_16_l(f05[j2]<< 4)>>10) | (word_cling_64_16_l(f06[j2]<< 4)>> 9) | (word_cling_64_16_l(f07[j2]<< 4)>> 8);
+      t[j+15] = (word_cling_64_16_l(f00[j2]<< 0)>>15) | (word_cling_64_16_l(f01[j2]<< 0)>>14) | (word_cling_64_16_l(f02[j2]<< 0)>>13) | (word_cling_64_16_l(f03[j2]<< 0)>>12) \
+        |       (word_cling_64_16_l(f04[j2]<< 0)>>11) | (word_cling_64_16_l(f05[j2]<< 0)>>10) | (word_cling_64_16_l(f06[j2]<< 0)>> 9) | (word_cling_64_16_l(f07[j2]<< 0)>> 8);
+    }
+
+    register word tmp = t[T->x->width-1];
+    switch(T->x->width - j) {
+    case 16: t[j+15] = (word_cling_64_16_l(f00[j2]<< 0)>>15) | (word_cling_64_16_l(f01[j2]<< 0)>>14) | (word_cling_64_16_l(f02[j2]<< 0)>>13) | (word_cling_64_16_l(f03[j2]<< 0)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<< 0)>>11) | (word_cling_64_16_l(f05[j2]<< 0)>>10) | (word_cling_64_16_l(f06[j2]<< 0)>> 9) | (word_cling_64_16_l(f07[j2]<< 0)>> 8);
+    case 15: t[j+14] = (word_cling_64_16_l(f00[j2]<< 4)>>15) | (word_cling_64_16_l(f01[j2]<< 4)>>14) | (word_cling_64_16_l(f02[j2]<< 4)>>13) | (word_cling_64_16_l(f03[j2]<< 4)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<< 4)>>11) | (word_cling_64_16_l(f05[j2]<< 4)>>10) | (word_cling_64_16_l(f06[j2]<< 4)>> 9) | (word_cling_64_16_l(f07[j2]<< 4)>> 8);
+    case 14: t[j+13] = (word_cling_64_16_l(f00[j2]<< 8)>>15) | (word_cling_64_16_l(f01[j2]<< 8)>>14) | (word_cling_64_16_l(f02[j2]<< 8)>>13) | (word_cling_64_16_l(f03[j2]<< 8)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<< 8)>>11) | (word_cling_64_16_l(f05[j2]<< 8)>>10) | (word_cling_64_16_l(f06[j2]<< 8)>> 9) | (word_cling_64_16_l(f07[j2]<< 8)>> 8);
+    case 13: t[j+12] = (word_cling_64_16_l(f00[j2]<<12)>>15) | (word_cling_64_16_l(f01[j2]<<12)>>14) | (word_cling_64_16_l(f02[j2]<<12)>>13) | (word_cling_64_16_l(f03[j2]<<12)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<12)>>11) | (word_cling_64_16_l(f05[j2]<<12)>>10) | (word_cling_64_16_l(f06[j2]<<12)>> 9) | (word_cling_64_16_l(f07[j2]<<12)>> 8);
+    case 12: t[j+11] = (word_cling_64_16_l(f00[j2]<<16)>>15) | (word_cling_64_16_l(f01[j2]<<16)>>14) | (word_cling_64_16_l(f02[j2]<<16)>>13) | (word_cling_64_16_l(f03[j2]<<16)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<16)>>11) | (word_cling_64_16_l(f05[j2]<<16)>>10) | (word_cling_64_16_l(f06[j2]<<16)>> 9) | (word_cling_64_16_l(f07[j2]<<16)>> 8);
+    case 11: t[j+10] = (word_cling_64_16_l(f00[j2]<<20)>>15) | (word_cling_64_16_l(f01[j2]<<20)>>14) | (word_cling_64_16_l(f02[j2]<<20)>>13) | (word_cling_64_16_l(f03[j2]<<20)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<20)>>11) | (word_cling_64_16_l(f05[j2]<<20)>>10) | (word_cling_64_16_l(f06[j2]<<20)>> 9) | (word_cling_64_16_l(f07[j2]<<20)>> 8);
+    case 10: t[j+ 9] = (word_cling_64_16_l(f00[j2]<<24)>>15) | (word_cling_64_16_l(f01[j2]<<24)>>14) | (word_cling_64_16_l(f02[j2]<<24)>>13) | (word_cling_64_16_l(f03[j2]<<24)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<24)>>11) | (word_cling_64_16_l(f05[j2]<<24)>>10) | (word_cling_64_16_l(f06[j2]<<24)>> 9) | (word_cling_64_16_l(f07[j2]<<24)>> 8);
+    case  9: t[j+ 8] = (word_cling_64_16_l(f00[j2]<<28)>>15) | (word_cling_64_16_l(f01[j2]<<28)>>14) | (word_cling_64_16_l(f02[j2]<<28)>>13) | (word_cling_64_16_l(f03[j2]<<28)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<28)>>11) | (word_cling_64_16_l(f05[j2]<<28)>>10) | (word_cling_64_16_l(f06[j2]<<28)>> 9) | (word_cling_64_16_l(f07[j2]<<28)>> 8);
+    case  8: t[j+ 7] = (word_cling_64_16_l(f00[j2]<<32)>>15) | (word_cling_64_16_l(f01[j2]<<32)>>14) | (word_cling_64_16_l(f02[j2]<<32)>>13) | (word_cling_64_16_l(f03[j2]<<32)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<32)>>11) | (word_cling_64_16_l(f05[j2]<<32)>>10) | (word_cling_64_16_l(f06[j2]<<32)>> 9) | (word_cling_64_16_l(f07[j2]<<32)>> 8);
+    case  7: t[j+ 6] = (word_cling_64_16_l(f00[j2]<<36)>>15) | (word_cling_64_16_l(f01[j2]<<36)>>14) | (word_cling_64_16_l(f02[j2]<<36)>>13) | (word_cling_64_16_l(f03[j2]<<36)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<36)>>11) | (word_cling_64_16_l(f05[j2]<<36)>>10) | (word_cling_64_16_l(f06[j2]<<36)>> 9) | (word_cling_64_16_l(f07[j2]<<36)>> 8);
+    case  6: t[j+ 5] = (word_cling_64_16_l(f00[j2]<<40)>>15) | (word_cling_64_16_l(f01[j2]<<40)>>14) | (word_cling_64_16_l(f02[j2]<<40)>>13) | (word_cling_64_16_l(f03[j2]<<40)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<40)>>11) | (word_cling_64_16_l(f05[j2]<<40)>>10) | (word_cling_64_16_l(f06[j2]<<40)>> 9) | (word_cling_64_16_l(f07[j2]<<40)>> 8);
+    case  5: t[j+ 4] = (word_cling_64_16_l(f00[j2]<<44)>>15) | (word_cling_64_16_l(f01[j2]<<44)>>14) | (word_cling_64_16_l(f02[j2]<<44)>>13) | (word_cling_64_16_l(f03[j2]<<44)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<44)>>11) | (word_cling_64_16_l(f05[j2]<<44)>>10) | (word_cling_64_16_l(f06[j2]<<44)>> 9) | (word_cling_64_16_l(f07[j2]<<44)>> 8);
+    case  4: t[j+ 3] = (word_cling_64_16_l(f00[j2]<<48)>>15) | (word_cling_64_16_l(f01[j2]<<48)>>14) | (word_cling_64_16_l(f02[j2]<<48)>>13) | (word_cling_64_16_l(f03[j2]<<48)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<48)>>11) | (word_cling_64_16_l(f05[j2]<<48)>>10) | (word_cling_64_16_l(f06[j2]<<48)>> 9) | (word_cling_64_16_l(f07[j2]<<48)>> 8);
+    case  3: t[j+ 2] = (word_cling_64_16_l(f00[j2]<<52)>>15) | (word_cling_64_16_l(f01[j2]<<52)>>14) | (word_cling_64_16_l(f02[j2]<<52)>>13) | (word_cling_64_16_l(f03[j2]<<52)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<52)>>11) | (word_cling_64_16_l(f05[j2]<<52)>>10) | (word_cling_64_16_l(f06[j2]<<52)>> 9) | (word_cling_64_16_l(f07[j2]<<52)>> 8);
+    case  2: t[j+ 1] = (word_cling_64_16_l(f00[j2]<<56)>>15) | (word_cling_64_16_l(f01[j2]<<56)>>14) | (word_cling_64_16_l(f02[j2]<<56)>>13) | (word_cling_64_16_l(f03[j2]<<56)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<56)>>11) | (word_cling_64_16_l(f05[j2]<<56)>>10) | (word_cling_64_16_l(f06[j2]<<56)>> 9) | (word_cling_64_16_l(f07[j2]<<56)>> 8);
+    case  1: t[j+ 0] = (word_cling_64_16_l(f00[j2]<<60)>>15) | (word_cling_64_16_l(f01[j2]<<60)>>14) | (word_cling_64_16_l(f02[j2]<<60)>>13) | (word_cling_64_16_l(f03[j2]<<60)>>12) | \
+                       (word_cling_64_16_l(f04[j2]<<60)>>11) | (word_cling_64_16_l(f05[j2]<<60)>>10) | (word_cling_64_16_l(f06[j2]<<60)>> 9) | (word_cling_64_16_l(f07[j2]<<60)>> 8);
+      break;
+    default:
+      m4ri_die("impossible");
+    }
+    t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
+  }
+
+  if(T->finite_field->degree < 12) {
+    switch(T->finite_field->degree) {
+    case 9: {
+      for(rci_t i=0; i<T->nrows; i++) {
+        const word *f00 = F->x[ 8]->rows[i];
+        word *t  = T->x->rows[i];
+
+        for(j=0, j2=0; j+16 < T->x->width; j+=16, j2++) {
+          t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>7);
+          t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>7);
+          t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>7);
+          t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>7);
+          t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>7);
+          t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>7);
+          t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>7);
+          t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>7);
+          t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>7);
+          t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>7);
+          t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>7);
+          t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>7);
+          t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>7);
+          t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>7);
+          t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>7);
+          t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>7);
+        }
+
+        register word tmp = t[T->x->width-1];
+        switch(T->x->width - j) {
+        case 16: t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>7);
+        case 15: t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>7);
+        case 14: t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>7);
+        case 13: t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>7);
+        case 12: t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>7);
+        case 11: t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>7);
+        case 10: t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>7);
+        case  9: t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>7);
+        case  8: t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>7);
+        case  7: t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>7);
+        case  6: t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>7);
+        case  5: t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>7);
+        case  4: t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>7);
+        case  3: t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>7);
+        case  2: t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>7);
+        case  1: t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>7);
+          break;
+        default:
+          m4ri_die("impossible");
+        }
+        t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
+      }
+    }
+      break;
+    case 10: {
+      for(rci_t i=0; i<T->nrows; i++) {
+        const word *f00 = F->x[ 8]->rows[i];
+        const word *f01 = F->x[ 9]->rows[i];
+        word *t  = T->x->rows[i];
+
+        for(j=0, j2=0; j+16 < T->x->width; j+=16, j2++) {
+          t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>7) | (word_cling_64_16_l(f01[j2]<<60)>>6);
+          t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>7) | (word_cling_64_16_l(f01[j2]<<56)>>6);
+          t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>7) | (word_cling_64_16_l(f01[j2]<<52)>>6);
+          t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>7) | (word_cling_64_16_l(f01[j2]<<48)>>6);
+          t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>7) | (word_cling_64_16_l(f01[j2]<<44)>>6);
+          t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>7) | (word_cling_64_16_l(f01[j2]<<40)>>6);
+          t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>7) | (word_cling_64_16_l(f01[j2]<<36)>>6);
+          t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>7) | (word_cling_64_16_l(f01[j2]<<32)>>6);
+          t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>7) | (word_cling_64_16_l(f01[j2]<<28)>>6);
+          t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>7) | (word_cling_64_16_l(f01[j2]<<24)>>6);
+          t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>7) | (word_cling_64_16_l(f01[j2]<<20)>>6);
+          t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>7) | (word_cling_64_16_l(f01[j2]<<16)>>6);
+          t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>7) | (word_cling_64_16_l(f01[j2]<<12)>>6);
+          t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>7) | (word_cling_64_16_l(f01[j2]<< 8)>>6);
+          t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>7) | (word_cling_64_16_l(f01[j2]<< 4)>>6);
+          t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>7) | (word_cling_64_16_l(f01[j2]<< 0)>>6);
+        }
+
+        register word tmp = t[T->x->width-1];
+        switch(T->x->width - j) {
+        case 16: t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>7) | (word_cling_64_16_l(f01[j2]<< 0)>>6);
+        case 15: t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>7) | (word_cling_64_16_l(f01[j2]<< 4)>>6);
+        case 14: t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>7) | (word_cling_64_16_l(f01[j2]<< 8)>>6);
+        case 13: t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>7) | (word_cling_64_16_l(f01[j2]<<12)>>6);
+        case 12: t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>7) | (word_cling_64_16_l(f01[j2]<<16)>>6);
+        case 11: t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>7) | (word_cling_64_16_l(f01[j2]<<20)>>6);
+        case 10: t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>7) | (word_cling_64_16_l(f01[j2]<<24)>>6);
+        case  9: t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>7) | (word_cling_64_16_l(f01[j2]<<28)>>6);
+        case  8: t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>7) | (word_cling_64_16_l(f01[j2]<<32)>>6);
+        case  7: t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>7) | (word_cling_64_16_l(f01[j2]<<36)>>6);
+        case  6: t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>7) | (word_cling_64_16_l(f01[j2]<<40)>>6);
+        case  5: t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>7) | (word_cling_64_16_l(f01[j2]<<44)>>6);
+        case  4: t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>7) | (word_cling_64_16_l(f01[j2]<<48)>>6);
+        case  3: t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>7) | (word_cling_64_16_l(f01[j2]<<52)>>6);
+        case  2: t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>7) | (word_cling_64_16_l(f01[j2]<<56)>>6);
+        case  1: t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>7) | (word_cling_64_16_l(f01[j2]<<60)>>6);
+          break;
+        default:
+          m4ri_die("impossible");
+        }
+        t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
+      }
+    }
+      break;
+    case 11: {
+      for(rci_t i=0; i<T->nrows; i++) {
+        const word *f00 = F->x[ 8]->rows[i];
+        const word *f01 = F->x[ 9]->rows[i];
+        const word *f02 = F->x[10]->rows[i];
+        word *t  = T->x->rows[i];
+
+        for(j=0, j2=0; j+16 < T->x->width; j+=16, j2++) {
+          t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>7) | (word_cling_64_16_l(f01[j2]<<60)>>6) | (word_cling_64_16_l(f02[j2]<<60)>>5);
+          t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>7) | (word_cling_64_16_l(f01[j2]<<56)>>6) | (word_cling_64_16_l(f02[j2]<<56)>>5);
+          t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>7) | (word_cling_64_16_l(f01[j2]<<52)>>6) | (word_cling_64_16_l(f02[j2]<<52)>>5);
+          t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>7) | (word_cling_64_16_l(f01[j2]<<48)>>6) | (word_cling_64_16_l(f02[j2]<<48)>>5);
+          t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>7) | (word_cling_64_16_l(f01[j2]<<44)>>6) | (word_cling_64_16_l(f02[j2]<<44)>>5);
+          t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>7) | (word_cling_64_16_l(f01[j2]<<40)>>6) | (word_cling_64_16_l(f02[j2]<<40)>>5);
+          t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>7) | (word_cling_64_16_l(f01[j2]<<36)>>6) | (word_cling_64_16_l(f02[j2]<<36)>>5);
+          t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>7) | (word_cling_64_16_l(f01[j2]<<32)>>6) | (word_cling_64_16_l(f02[j2]<<32)>>5);
+          t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>7) | (word_cling_64_16_l(f01[j2]<<28)>>6) | (word_cling_64_16_l(f02[j2]<<28)>>5);
+          t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>7) | (word_cling_64_16_l(f01[j2]<<24)>>6) | (word_cling_64_16_l(f02[j2]<<24)>>5);
+          t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>7) | (word_cling_64_16_l(f01[j2]<<20)>>6) | (word_cling_64_16_l(f02[j2]<<20)>>5);
+          t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>7) | (word_cling_64_16_l(f01[j2]<<16)>>6) | (word_cling_64_16_l(f02[j2]<<16)>>5);
+          t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>7) | (word_cling_64_16_l(f01[j2]<<12)>>6) | (word_cling_64_16_l(f02[j2]<<12)>>5);
+          t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>7) | (word_cling_64_16_l(f01[j2]<< 8)>>6) | (word_cling_64_16_l(f02[j2]<< 8)>>5);
+          t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>7) | (word_cling_64_16_l(f01[j2]<< 4)>>6) | (word_cling_64_16_l(f02[j2]<< 4)>>5);
+          t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>7) | (word_cling_64_16_l(f01[j2]<< 0)>>6) | (word_cling_64_16_l(f02[j2]<< 0)>>5);
+        }
+
+        register word tmp = t[T->x->width-1];
+        switch(T->x->width - j) {
+        case 16: t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>7) | (word_cling_64_16_l(f01[j2]<< 0)>>6) | (word_cling_64_16_l(f02[j2]<< 0)>>5);
+        case 15: t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>7) | (word_cling_64_16_l(f01[j2]<< 4)>>6) | (word_cling_64_16_l(f02[j2]<< 4)>>5);
+        case 14: t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>7) | (word_cling_64_16_l(f01[j2]<< 8)>>6) | (word_cling_64_16_l(f02[j2]<< 8)>>5);
+        case 13: t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>7) | (word_cling_64_16_l(f01[j2]<<12)>>6) | (word_cling_64_16_l(f02[j2]<<12)>>5);
+        case 12: t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>7) | (word_cling_64_16_l(f01[j2]<<16)>>6) | (word_cling_64_16_l(f02[j2]<<16)>>5);
+        case 11: t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>7) | (word_cling_64_16_l(f01[j2]<<20)>>6) | (word_cling_64_16_l(f02[j2]<<20)>>5);
+        case 10: t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>7) | (word_cling_64_16_l(f01[j2]<<24)>>6) | (word_cling_64_16_l(f02[j2]<<24)>>5);
+        case  9: t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>7) | (word_cling_64_16_l(f01[j2]<<28)>>6) | (word_cling_64_16_l(f02[j2]<<28)>>5);
+        case  8: t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>7) | (word_cling_64_16_l(f01[j2]<<32)>>6) | (word_cling_64_16_l(f02[j2]<<32)>>5);
+        case  7: t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>7) | (word_cling_64_16_l(f01[j2]<<36)>>6) | (word_cling_64_16_l(f02[j2]<<36)>>5);
+        case  6: t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>7) | (word_cling_64_16_l(f01[j2]<<40)>>6) | (word_cling_64_16_l(f02[j2]<<40)>>5);
+        case  5: t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>7) | (word_cling_64_16_l(f01[j2]<<44)>>6) | (word_cling_64_16_l(f02[j2]<<44)>>5);
+        case  4: t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>7) | (word_cling_64_16_l(f01[j2]<<48)>>6) | (word_cling_64_16_l(f02[j2]<<48)>>5);
+        case  3: t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>7) | (word_cling_64_16_l(f01[j2]<<52)>>6) | (word_cling_64_16_l(f02[j2]<<52)>>5);
+        case  2: t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>7) | (word_cling_64_16_l(f01[j2]<<56)>>6) | (word_cling_64_16_l(f02[j2]<<56)>>5);
+        case  1: t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>7) | (word_cling_64_16_l(f01[j2]<<60)>>6) | (word_cling_64_16_l(f02[j2]<<60)>>5);
+          break;
+        default:
+          m4ri_die("impossible");
+        }
+        t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
+      }
+    }
+      break;
+    }
+  } else {
+    for(rci_t i=0; i<T->nrows; i++) {
+      const word *f00 = F->x[ 8]->rows[i];
+      const word *f01 = F->x[ 9]->rows[i];
+      const word *f02 = F->x[10]->rows[i];
+      const word *f03 = F->x[11]->rows[i];
+      word *t  = T->x->rows[i];
+
+      for(j=0, j2=0; j+16 < T->x->width; j+=16, j2++) {
+        t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>7) | (word_cling_64_16_l(f01[j2]<<60)>>6) | (word_cling_64_16_l(f02[j2]<<60)>>5) | (word_cling_64_16_l(f03[j2]<<60)>>4);
+        t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>7) | (word_cling_64_16_l(f01[j2]<<56)>>6) | (word_cling_64_16_l(f02[j2]<<56)>>5) | (word_cling_64_16_l(f03[j2]<<56)>>4);
+        t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>7) | (word_cling_64_16_l(f01[j2]<<52)>>6) | (word_cling_64_16_l(f02[j2]<<52)>>5) | (word_cling_64_16_l(f03[j2]<<52)>>4);
+        t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>7) | (word_cling_64_16_l(f01[j2]<<48)>>6) | (word_cling_64_16_l(f02[j2]<<48)>>5) | (word_cling_64_16_l(f03[j2]<<48)>>4);
+        t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>7) | (word_cling_64_16_l(f01[j2]<<44)>>6) | (word_cling_64_16_l(f02[j2]<<44)>>5) | (word_cling_64_16_l(f03[j2]<<44)>>4);
+        t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>7) | (word_cling_64_16_l(f01[j2]<<40)>>6) | (word_cling_64_16_l(f02[j2]<<40)>>5) | (word_cling_64_16_l(f03[j2]<<40)>>4);
+        t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>7) | (word_cling_64_16_l(f01[j2]<<36)>>6) | (word_cling_64_16_l(f02[j2]<<36)>>5) | (word_cling_64_16_l(f03[j2]<<36)>>4);
+        t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>7) | (word_cling_64_16_l(f01[j2]<<32)>>6) | (word_cling_64_16_l(f02[j2]<<32)>>5) | (word_cling_64_16_l(f03[j2]<<32)>>4);
+        t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>7) | (word_cling_64_16_l(f01[j2]<<28)>>6) | (word_cling_64_16_l(f02[j2]<<28)>>5) | (word_cling_64_16_l(f03[j2]<<28)>>4);
+        t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>7) | (word_cling_64_16_l(f01[j2]<<24)>>6) | (word_cling_64_16_l(f02[j2]<<24)>>5) | (word_cling_64_16_l(f03[j2]<<24)>>4);
+        t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>7) | (word_cling_64_16_l(f01[j2]<<20)>>6) | (word_cling_64_16_l(f02[j2]<<20)>>5) | (word_cling_64_16_l(f03[j2]<<20)>>4);
+        t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>7) | (word_cling_64_16_l(f01[j2]<<16)>>6) | (word_cling_64_16_l(f02[j2]<<16)>>5) | (word_cling_64_16_l(f03[j2]<<16)>>4);
+        t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>7) | (word_cling_64_16_l(f01[j2]<<12)>>6) | (word_cling_64_16_l(f02[j2]<<12)>>5) | (word_cling_64_16_l(f03[j2]<<12)>>4);
+        t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>7) | (word_cling_64_16_l(f01[j2]<< 8)>>6) | (word_cling_64_16_l(f02[j2]<< 8)>>5) | (word_cling_64_16_l(f03[j2]<< 8)>>4);
+        t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>7) | (word_cling_64_16_l(f01[j2]<< 4)>>6) | (word_cling_64_16_l(f02[j2]<< 4)>>5) | (word_cling_64_16_l(f03[j2]<< 4)>>4);
+        t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>7) | (word_cling_64_16_l(f01[j2]<< 0)>>6) | (word_cling_64_16_l(f02[j2]<< 0)>>5) | (word_cling_64_16_l(f03[j2]<< 0)>>4);
+      }
+
+      register word tmp = t[T->x->width-1];
+      switch(T->x->width - j) {
+      case 16: t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>7) | (word_cling_64_16_l(f01[j2]<< 0)>>6) | (word_cling_64_16_l(f02[j2]<< 0)>>5) | (word_cling_64_16_l(f03[j2]<< 0)>>4);
+      case 15: t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>7) | (word_cling_64_16_l(f01[j2]<< 4)>>6) | (word_cling_64_16_l(f02[j2]<< 4)>>5) | (word_cling_64_16_l(f03[j2]<< 4)>>4);
+      case 14: t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>7) | (word_cling_64_16_l(f01[j2]<< 8)>>6) | (word_cling_64_16_l(f02[j2]<< 8)>>5) | (word_cling_64_16_l(f03[j2]<< 8)>>4);
+      case 13: t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>7) | (word_cling_64_16_l(f01[j2]<<12)>>6) | (word_cling_64_16_l(f02[j2]<<12)>>5) | (word_cling_64_16_l(f03[j2]<<12)>>4);
+      case 12: t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>7) | (word_cling_64_16_l(f01[j2]<<16)>>6) | (word_cling_64_16_l(f02[j2]<<16)>>5) | (word_cling_64_16_l(f03[j2]<<16)>>4);
+      case 11: t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>7) | (word_cling_64_16_l(f01[j2]<<20)>>6) | (word_cling_64_16_l(f02[j2]<<20)>>5) | (word_cling_64_16_l(f03[j2]<<20)>>4);
+      case 10: t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>7) | (word_cling_64_16_l(f01[j2]<<24)>>6) | (word_cling_64_16_l(f02[j2]<<24)>>5) | (word_cling_64_16_l(f03[j2]<<24)>>4);
+      case  9: t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>7) | (word_cling_64_16_l(f01[j2]<<28)>>6) | (word_cling_64_16_l(f02[j2]<<28)>>5) | (word_cling_64_16_l(f03[j2]<<28)>>4);
+      case  8: t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>7) | (word_cling_64_16_l(f01[j2]<<32)>>6) | (word_cling_64_16_l(f02[j2]<<32)>>5) | (word_cling_64_16_l(f03[j2]<<32)>>4);
+      case  7: t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>7) | (word_cling_64_16_l(f01[j2]<<36)>>6) | (word_cling_64_16_l(f02[j2]<<36)>>5) | (word_cling_64_16_l(f03[j2]<<36)>>4);
+      case  6: t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>7) | (word_cling_64_16_l(f01[j2]<<40)>>6) | (word_cling_64_16_l(f02[j2]<<40)>>5) | (word_cling_64_16_l(f03[j2]<<40)>>4);
+      case  5: t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>7) | (word_cling_64_16_l(f01[j2]<<44)>>6) | (word_cling_64_16_l(f02[j2]<<44)>>5) | (word_cling_64_16_l(f03[j2]<<44)>>4);
+      case  4: t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>7) | (word_cling_64_16_l(f01[j2]<<48)>>6) | (word_cling_64_16_l(f02[j2]<<48)>>5) | (word_cling_64_16_l(f03[j2]<<48)>>4);
+      case  3: t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>7) | (word_cling_64_16_l(f01[j2]<<52)>>6) | (word_cling_64_16_l(f02[j2]<<52)>>5) | (word_cling_64_16_l(f03[j2]<<52)>>4);
+      case  2: t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>7) | (word_cling_64_16_l(f01[j2]<<56)>>6) | (word_cling_64_16_l(f02[j2]<<56)>>5) | (word_cling_64_16_l(f03[j2]<<56)>>4);
+      case  1: t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>7) | (word_cling_64_16_l(f01[j2]<<60)>>6) | (word_cling_64_16_l(f02[j2]<<60)>>5) | (word_cling_64_16_l(f03[j2]<<60)>>4);
         break;
-      case 4:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3);
-        t[j+2] = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3);
-        tmp    = (word_cling_64_08_l(f0[j2]<<32)>>7) | (word_cling_64_08_l(f1[j2]<<32)>>6) | (word_cling_64_08_l(f2[j2]<<32)>>5) | (word_cling_64_08_l(f3[j2]<<32)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<32)>>3);
-        t[j+3] = (t[j+3] & ~bitmask_end) | (tmp & bitmask_end);
+      default:
+        m4ri_die("impossible");
+      }
+      t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
+
+      switch(T->finite_field->degree) {
+      case 13: {
+        for(rci_t i=0; i<T->nrows; i++) {
+          const word *f00 = F->x[12]->rows[i];
+          word *t  = T->x->rows[i];
+
+          for(j=0, j2=0; j+16 < T->x->width; j+=16, j2++) {
+            t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>3);
+            t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>3);
+            t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>3);
+            t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>3);
+            t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>3);
+            t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>3);
+            t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>3);
+            t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>3);
+            t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>3);
+            t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>3);
+            t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>3);
+            t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>3);
+            t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>3);
+            t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>3);
+            t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>3);
+            t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>3);
+          }
+
+          register word tmp = t[T->x->width-1];
+          switch(T->x->width - j) {
+          case 16: t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>3);
+          case 15: t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>3);
+          case 14: t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>3);
+          case 13: t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>3);
+          case 12: t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>3);
+          case 11: t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>3);
+          case 10: t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>3);
+          case  9: t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>3);
+          case  8: t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>3);
+          case  7: t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>3);
+          case  6: t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>3);
+          case  5: t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>3);
+          case  4: t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>3);
+          case  3: t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>3);
+          case  2: t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>3);
+          case  1: t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>3);
+            break;
+          default:
+            m4ri_die("impossible");
+          }
+          t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
+        }
+      }
         break;
-      case 3:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3);
-        t[j+1] = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3);
-        tmp    = (word_cling_64_08_l(f0[j2]<<40)>>7) | (word_cling_64_08_l(f1[j2]<<40)>>6) | (word_cling_64_08_l(f2[j2]<<40)>>5) | (word_cling_64_08_l(f3[j2]<<40)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<40)>>3);
-        t[j+2] = (t[j+2] & ~bitmask_end) | (tmp & bitmask_end);
+      case 14: {
+        for(rci_t i=0; i<T->nrows; i++) {
+          const word *f00 = F->x[12]->rows[i];
+          const word *f01 = F->x[13]->rows[i];
+          word *t  = T->x->rows[i];
+
+          for(j=0, j2=0; j+16 < T->x->width; j+=16, j2++) {
+            t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>3) | (word_cling_64_16_l(f01[j2]<<60)>>2);
+            t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>3) | (word_cling_64_16_l(f01[j2]<<56)>>2);
+            t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>3) | (word_cling_64_16_l(f01[j2]<<52)>>2);
+            t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>3) | (word_cling_64_16_l(f01[j2]<<48)>>2);
+            t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>3) | (word_cling_64_16_l(f01[j2]<<44)>>2);
+            t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>3) | (word_cling_64_16_l(f01[j2]<<40)>>2);
+            t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>3) | (word_cling_64_16_l(f01[j2]<<36)>>2);
+            t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>3) | (word_cling_64_16_l(f01[j2]<<32)>>2);
+            t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>3) | (word_cling_64_16_l(f01[j2]<<28)>>2);
+            t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>3) | (word_cling_64_16_l(f01[j2]<<24)>>2);
+            t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>3) | (word_cling_64_16_l(f01[j2]<<20)>>2);
+            t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>3) | (word_cling_64_16_l(f01[j2]<<16)>>2);
+            t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>3) | (word_cling_64_16_l(f01[j2]<<12)>>2);
+            t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>3) | (word_cling_64_16_l(f01[j2]<< 8)>>2);
+            t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>3) | (word_cling_64_16_l(f01[j2]<< 4)>>2);
+            t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>3) | (word_cling_64_16_l(f01[j2]<< 0)>>2);
+          }
+
+          register word tmp = t[T->x->width-1];
+          switch(T->x->width - j) {
+          case 16: t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>3) | (word_cling_64_16_l(f01[j2]<< 0)>>2);
+          case 15: t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>3) | (word_cling_64_16_l(f01[j2]<< 4)>>2);
+          case 14: t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>3) | (word_cling_64_16_l(f01[j2]<< 8)>>2);
+          case 13: t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>3) | (word_cling_64_16_l(f01[j2]<<12)>>2);
+          case 12: t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>3) | (word_cling_64_16_l(f01[j2]<<16)>>2);
+          case 11: t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>3) | (word_cling_64_16_l(f01[j2]<<20)>>2);
+          case 10: t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>3) | (word_cling_64_16_l(f01[j2]<<24)>>2);
+          case  9: t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>3) | (word_cling_64_16_l(f01[j2]<<28)>>2);
+          case  8: t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>3) | (word_cling_64_16_l(f01[j2]<<32)>>2);
+          case  7: t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>3) | (word_cling_64_16_l(f01[j2]<<36)>>2);
+          case  6: t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>3) | (word_cling_64_16_l(f01[j2]<<40)>>2);
+          case  5: t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>3) | (word_cling_64_16_l(f01[j2]<<44)>>2);
+          case  4: t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>3) | (word_cling_64_16_l(f01[j2]<<48)>>2);
+          case  3: t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>3) | (word_cling_64_16_l(f01[j2]<<52)>>2);
+          case  2: t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>3) | (word_cling_64_16_l(f01[j2]<<56)>>2);
+          case  1: t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>3) | (word_cling_64_16_l(f01[j2]<<60)>>2);
+            break;
+          default:
+            m4ri_die("impossible");
+          }
+          t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
+        }
+      }
         break;
-      case 2:
-        t[j+0] = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3);
-        tmp    = (word_cling_64_08_l(f0[j2]<<48)>>7) | (word_cling_64_08_l(f1[j2]<<48)>>6) | (word_cling_64_08_l(f2[j2]<<48)>>5) | (word_cling_64_08_l(f3[j2]<<48)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<48)>>3);
-        t[j+1] = (t[j+1] & ~bitmask_end) | (tmp & bitmask_end);
+      case 15: {
+        for(rci_t i=0; i<T->nrows; i++) {
+          const word *f00 = F->x[12]->rows[i];
+          const word *f01 = F->x[13]->rows[i];
+          const word *f02 = F->x[14]->rows[i];
+          word *t  = T->x->rows[i];
+
+          for(j=0, j2=0; j+16 < T->x->width; j+=16, j2++) {
+            t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>3) | (word_cling_64_16_l(f01[j2]<<60)>>2) | (word_cling_64_16_l(f02[j2]<<60)>>1);
+            t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>3) | (word_cling_64_16_l(f01[j2]<<56)>>2) | (word_cling_64_16_l(f02[j2]<<56)>>1);
+            t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>3) | (word_cling_64_16_l(f01[j2]<<52)>>2) | (word_cling_64_16_l(f02[j2]<<52)>>1);
+            t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>3) | (word_cling_64_16_l(f01[j2]<<48)>>2) | (word_cling_64_16_l(f02[j2]<<48)>>1);
+            t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>3) | (word_cling_64_16_l(f01[j2]<<44)>>2) | (word_cling_64_16_l(f02[j2]<<44)>>1);
+            t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>3) | (word_cling_64_16_l(f01[j2]<<40)>>2) | (word_cling_64_16_l(f02[j2]<<40)>>1);
+            t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>3) | (word_cling_64_16_l(f01[j2]<<36)>>2) | (word_cling_64_16_l(f02[j2]<<36)>>1);
+            t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>3) | (word_cling_64_16_l(f01[j2]<<32)>>2) | (word_cling_64_16_l(f02[j2]<<32)>>1);
+            t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>3) | (word_cling_64_16_l(f01[j2]<<28)>>2) | (word_cling_64_16_l(f02[j2]<<28)>>1);
+            t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>3) | (word_cling_64_16_l(f01[j2]<<24)>>2) | (word_cling_64_16_l(f02[j2]<<24)>>1);
+            t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>3) | (word_cling_64_16_l(f01[j2]<<20)>>2) | (word_cling_64_16_l(f02[j2]<<20)>>1);
+            t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>3) | (word_cling_64_16_l(f01[j2]<<16)>>2) | (word_cling_64_16_l(f02[j2]<<16)>>1);
+            t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>3) | (word_cling_64_16_l(f01[j2]<<12)>>2) | (word_cling_64_16_l(f02[j2]<<12)>>1);
+            t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>3) | (word_cling_64_16_l(f01[j2]<< 8)>>2) | (word_cling_64_16_l(f02[j2]<< 8)>>1);
+            t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>3) | (word_cling_64_16_l(f01[j2]<< 4)>>2) | (word_cling_64_16_l(f02[j2]<< 4)>>1);
+            t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>3) | (word_cling_64_16_l(f01[j2]<< 0)>>2) | (word_cling_64_16_l(f02[j2]<< 0)>>1);
+          }
+
+          register word tmp = t[T->x->width-1];
+          switch(T->x->width - j) {
+          case 16: t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>3) | (word_cling_64_16_l(f01[j2]<< 0)>>2) | (word_cling_64_16_l(f02[j2]<< 0)>>1);
+          case 15: t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>3) | (word_cling_64_16_l(f01[j2]<< 4)>>2) | (word_cling_64_16_l(f02[j2]<< 4)>>1);
+          case 14: t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>3) | (word_cling_64_16_l(f01[j2]<< 8)>>2) | (word_cling_64_16_l(f02[j2]<< 8)>>1);
+          case 13: t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>3) | (word_cling_64_16_l(f01[j2]<<12)>>2) | (word_cling_64_16_l(f02[j2]<<12)>>1);
+          case 12: t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>3) | (word_cling_64_16_l(f01[j2]<<16)>>2) | (word_cling_64_16_l(f02[j2]<<16)>>1);
+          case 11: t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>3) | (word_cling_64_16_l(f01[j2]<<20)>>2) | (word_cling_64_16_l(f02[j2]<<20)>>1);
+          case 10: t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>3) | (word_cling_64_16_l(f01[j2]<<24)>>2) | (word_cling_64_16_l(f02[j2]<<24)>>1);
+          case  9: t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>3) | (word_cling_64_16_l(f01[j2]<<28)>>2) | (word_cling_64_16_l(f02[j2]<<28)>>1);
+          case  8: t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>3) | (word_cling_64_16_l(f01[j2]<<32)>>2) | (word_cling_64_16_l(f02[j2]<<32)>>1);
+          case  7: t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>3) | (word_cling_64_16_l(f01[j2]<<36)>>2) | (word_cling_64_16_l(f02[j2]<<36)>>1);
+          case  6: t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>3) | (word_cling_64_16_l(f01[j2]<<40)>>2) | (word_cling_64_16_l(f02[j2]<<40)>>1);
+          case  5: t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>3) | (word_cling_64_16_l(f01[j2]<<44)>>2) | (word_cling_64_16_l(f02[j2]<<44)>>1);
+          case  4: t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>3) | (word_cling_64_16_l(f01[j2]<<48)>>2) | (word_cling_64_16_l(f02[j2]<<48)>>1);
+          case  3: t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>3) | (word_cling_64_16_l(f01[j2]<<52)>>2) | (word_cling_64_16_l(f02[j2]<<52)>>1);
+          case  2: t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>3) | (word_cling_64_16_l(f01[j2]<<56)>>2) | (word_cling_64_16_l(f02[j2]<<56)>>1);
+          case  1: t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>3) | (word_cling_64_16_l(f01[j2]<<60)>>2) | (word_cling_64_16_l(f02[j2]<<60)>>1);
+            break;
+          default:
+            m4ri_die("impossible");
+          }
+          t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
+        }
+      }
         break;
-      case 1:
-        tmp    = (word_cling_64_08_l(f0[j2]<<56)>>7) | (word_cling_64_08_l(f1[j2]<<56)>>6) | (word_cling_64_08_l(f2[j2]<<56)>>5) | (word_cling_64_08_l(f3[j2]<<56)>>4) \
-          |      (word_cling_64_08_l(f4[j2]<<56)>>3);
-        t[j+0] = (t[j+0] & ~bitmask_end) | (tmp & bitmask_end);
+      case 16: {
+        for(rci_t i=0; i<T->nrows; i++) {
+          const word *f00 = F->x[12]->rows[i];
+          const word *f01 = F->x[13]->rows[i];
+          const word *f02 = F->x[14]->rows[i];
+          const word *f03 = F->x[15]->rows[i];
+          word *t  = T->x->rows[i];
+
+          for(j=0, j2=0; j+16 < T->x->width; j+=16, j2++) {
+            t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>3) | (word_cling_64_16_l(f01[j2]<<60)>>2) | (word_cling_64_16_l(f02[j2]<<60)>>1) | (word_cling_64_16_l(f03[j2]<<60)>>0);
+            t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>3) | (word_cling_64_16_l(f01[j2]<<56)>>2) | (word_cling_64_16_l(f02[j2]<<56)>>1) | (word_cling_64_16_l(f03[j2]<<56)>>0);
+            t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>3) | (word_cling_64_16_l(f01[j2]<<52)>>2) | (word_cling_64_16_l(f02[j2]<<52)>>1) | (word_cling_64_16_l(f03[j2]<<52)>>0);
+            t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>3) | (word_cling_64_16_l(f01[j2]<<48)>>2) | (word_cling_64_16_l(f02[j2]<<48)>>1) | (word_cling_64_16_l(f03[j2]<<48)>>0);
+            t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>3) | (word_cling_64_16_l(f01[j2]<<44)>>2) | (word_cling_64_16_l(f02[j2]<<44)>>1) | (word_cling_64_16_l(f03[j2]<<44)>>0);
+            t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>3) | (word_cling_64_16_l(f01[j2]<<40)>>2) | (word_cling_64_16_l(f02[j2]<<40)>>1) | (word_cling_64_16_l(f03[j2]<<40)>>0);
+            t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>3) | (word_cling_64_16_l(f01[j2]<<36)>>2) | (word_cling_64_16_l(f02[j2]<<36)>>1) | (word_cling_64_16_l(f03[j2]<<36)>>0);
+            t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>3) | (word_cling_64_16_l(f01[j2]<<32)>>2) | (word_cling_64_16_l(f02[j2]<<32)>>1) | (word_cling_64_16_l(f03[j2]<<32)>>0);
+            t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>3) | (word_cling_64_16_l(f01[j2]<<28)>>2) | (word_cling_64_16_l(f02[j2]<<28)>>1) | (word_cling_64_16_l(f03[j2]<<28)>>0);
+            t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>3) | (word_cling_64_16_l(f01[j2]<<24)>>2) | (word_cling_64_16_l(f02[j2]<<24)>>1) | (word_cling_64_16_l(f03[j2]<<24)>>0);
+            t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>3) | (word_cling_64_16_l(f01[j2]<<20)>>2) | (word_cling_64_16_l(f02[j2]<<20)>>1) | (word_cling_64_16_l(f03[j2]<<20)>>0);
+            t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>3) | (word_cling_64_16_l(f01[j2]<<16)>>2) | (word_cling_64_16_l(f02[j2]<<16)>>1) | (word_cling_64_16_l(f03[j2]<<16)>>0);
+            t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>3) | (word_cling_64_16_l(f01[j2]<<12)>>2) | (word_cling_64_16_l(f02[j2]<<12)>>1) | (word_cling_64_16_l(f03[j2]<<12)>>0);
+            t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>3) | (word_cling_64_16_l(f01[j2]<< 8)>>2) | (word_cling_64_16_l(f02[j2]<< 8)>>1) | (word_cling_64_16_l(f03[j2]<< 8)>>0);
+            t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>3) | (word_cling_64_16_l(f01[j2]<< 4)>>2) | (word_cling_64_16_l(f02[j2]<< 4)>>1) | (word_cling_64_16_l(f03[j2]<< 4)>>0);
+            t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>3) | (word_cling_64_16_l(f01[j2]<< 0)>>2) | (word_cling_64_16_l(f02[j2]<< 0)>>1) | (word_cling_64_16_l(f03[j2]<< 0)>>0);
+          }
+
+          register word tmp = t[T->x->width-1];
+          switch(T->x->width - j) {
+          case 16: t[j+15] |= (word_cling_64_16_l(f00[j2]<< 0)>>3) | (word_cling_64_16_l(f01[j2]<< 0)>>2) | (word_cling_64_16_l(f02[j2]<< 0)>>1) | (word_cling_64_16_l(f03[j2]<< 0)>>0);
+          case 15: t[j+14] |= (word_cling_64_16_l(f00[j2]<< 4)>>3) | (word_cling_64_16_l(f01[j2]<< 4)>>2) | (word_cling_64_16_l(f02[j2]<< 4)>>1) | (word_cling_64_16_l(f03[j2]<< 4)>>0);
+          case 14: t[j+13] |= (word_cling_64_16_l(f00[j2]<< 8)>>3) | (word_cling_64_16_l(f01[j2]<< 8)>>2) | (word_cling_64_16_l(f02[j2]<< 8)>>1) | (word_cling_64_16_l(f03[j2]<< 8)>>0);
+          case 13: t[j+12] |= (word_cling_64_16_l(f00[j2]<<12)>>3) | (word_cling_64_16_l(f01[j2]<<12)>>2) | (word_cling_64_16_l(f02[j2]<<12)>>1) | (word_cling_64_16_l(f03[j2]<<12)>>0);
+          case 12: t[j+11] |= (word_cling_64_16_l(f00[j2]<<16)>>3) | (word_cling_64_16_l(f01[j2]<<16)>>2) | (word_cling_64_16_l(f02[j2]<<16)>>1) | (word_cling_64_16_l(f03[j2]<<16)>>0);
+          case 11: t[j+10] |= (word_cling_64_16_l(f00[j2]<<20)>>3) | (word_cling_64_16_l(f01[j2]<<20)>>2) | (word_cling_64_16_l(f02[j2]<<20)>>1) | (word_cling_64_16_l(f03[j2]<<20)>>0);
+          case 10: t[j+ 9] |= (word_cling_64_16_l(f00[j2]<<24)>>3) | (word_cling_64_16_l(f01[j2]<<24)>>2) | (word_cling_64_16_l(f02[j2]<<24)>>1) | (word_cling_64_16_l(f03[j2]<<24)>>0);
+          case  9: t[j+ 8] |= (word_cling_64_16_l(f00[j2]<<28)>>3) | (word_cling_64_16_l(f01[j2]<<28)>>2) | (word_cling_64_16_l(f02[j2]<<28)>>1) | (word_cling_64_16_l(f03[j2]<<28)>>0);
+          case  8: t[j+ 7] |= (word_cling_64_16_l(f00[j2]<<32)>>3) | (word_cling_64_16_l(f01[j2]<<32)>>2) | (word_cling_64_16_l(f02[j2]<<32)>>1) | (word_cling_64_16_l(f03[j2]<<32)>>0);
+          case  7: t[j+ 6] |= (word_cling_64_16_l(f00[j2]<<36)>>3) | (word_cling_64_16_l(f01[j2]<<36)>>2) | (word_cling_64_16_l(f02[j2]<<36)>>1) | (word_cling_64_16_l(f03[j2]<<36)>>0);
+          case  6: t[j+ 5] |= (word_cling_64_16_l(f00[j2]<<40)>>3) | (word_cling_64_16_l(f01[j2]<<40)>>2) | (word_cling_64_16_l(f02[j2]<<40)>>1) | (word_cling_64_16_l(f03[j2]<<40)>>0);
+          case  5: t[j+ 4] |= (word_cling_64_16_l(f00[j2]<<44)>>3) | (word_cling_64_16_l(f01[j2]<<44)>>2) | (word_cling_64_16_l(f02[j2]<<44)>>1) | (word_cling_64_16_l(f03[j2]<<44)>>0);
+          case  4: t[j+ 3] |= (word_cling_64_16_l(f00[j2]<<48)>>3) | (word_cling_64_16_l(f01[j2]<<48)>>2) | (word_cling_64_16_l(f02[j2]<<48)>>1) | (word_cling_64_16_l(f03[j2]<<48)>>0);
+          case  3: t[j+ 2] |= (word_cling_64_16_l(f00[j2]<<52)>>3) | (word_cling_64_16_l(f01[j2]<<52)>>2) | (word_cling_64_16_l(f02[j2]<<52)>>1) | (word_cling_64_16_l(f03[j2]<<52)>>0);
+          case  2: t[j+ 1] |= (word_cling_64_16_l(f00[j2]<<56)>>3) | (word_cling_64_16_l(f01[j2]<<56)>>2) | (word_cling_64_16_l(f02[j2]<<56)>>1) | (word_cling_64_16_l(f03[j2]<<56)>>0);
+          case  1: t[j+ 0] |= (word_cling_64_16_l(f00[j2]<<60)>>3) | (word_cling_64_16_l(f01[j2]<<60)>>2) | (word_cling_64_16_l(f02[j2]<<60)>>1) | (word_cling_64_16_l(f03[j2]<<60)>>0);
+            break;
+          default:
+            m4ri_die("impossible");
+          }
+          t[T->x->width-1] = (t[T->x->width-1] & bitmask_end) | (tmp & ~bitmask_end);
+        }
+      }
         break;
-      default:
-        m4ri_die("impossible");
       }
     }
   }
-    break;
-  default:
-    m4ri_die("impossible");
-  } 
   return T;
 }
diff --git a/src/conversion.h b/src/conversion.h
index 517fd12..95be5a3 100644
--- a/src/conversion.h
+++ b/src/conversion.h
@@ -35,8 +35,8 @@
 /**
  * \brief Pack a bitslice matrix into a packed represenation.
  *
- * \param A Matrix over GF(2^k) or NULL
- * \param Z Bitslice matrix over GF(2^k)
+ * \param A Matrix over \GF2E or NULL
+ * \param Z Bitslice matrix over \GF2E
  *
  * \ingroup Constructions
  */
@@ -68,24 +68,33 @@ mzd_slice_t *mzed_slice(mzd_slice_t *A, const mzed_t *Z);
 mzd_slice_t *_mzed_slice2(mzd_slice_t *A, const mzed_t *Z);
 
 /**
- * \brief Unpack the matrix Z over GF(2^e) into bitslice representation.
+ * \brief Unpack the matrix Z over \GF2E into bitslice representation.
  *
- * \param A Zero bitslice matrix over GF(2^e)
- * \param Z Matrix over GF(2^e)
+ * \param A Zero bitslice matrix over \GF2E
+ * \param Z Matrix over \GF2E
  */
 
 mzd_slice_t *_mzed_slice4(mzd_slice_t *A, const mzed_t *Z);
 
 /**
- * \brief Unpack the matrix Z over GF(2^e) into bitslice representation.
+ * \brief Unpack the matrix Z over \GF2E into bitslice representation.
  *
- * \param A Zero bitslice matrix over GF(2^e)
- * \param Z Matrix over GF(2^e)
+ * \param A Zero bitslice matrix over \GF2E
+ * \param Z Matrix over \GF2E
  */
 
 mzd_slice_t *_mzed_slice8(mzd_slice_t *A, const mzed_t *Z);
 
 /**
+ * \brief Unpack the matrix Z over \GF2E into bitslice representation.
+ *
+ * \param A Zero bitslice matrix over \GF2E
+ * \param Z Matrix over \GF2E
+ */
+
+mzd_slice_t *_mzed_slice16(mzd_slice_t *A, const mzed_t *Z);
+
+/**
  * \brief Pack a bitslice matrix into a classical represenation over GF(2^2).
  *
  * Elements in GF(2^2) can be represented as c_1*a + c_0 where a is a
@@ -100,24 +109,33 @@ mzed_t *_mzed_cling2(mzed_t *A, const mzd_slice_t *Z);
 
 
 /**
- * \brief Pack a bitslice matrix into a classical represenation over GF(2^e) for e in {3,4}.
+ * \brief Pack a bitslice matrix into a classical representation over \GF2E for 2 < e <= 4.
  *
- * \param A Matrix over GF(2^e), must be zero
- * \param Z Bitslice matrix over GF(2^e)
+ * \param A Matrix over \GF2E, must be zero
+ * \param Z Bitslice matrix over \GF2E
  */
 
 mzed_t *_mzed_cling4(mzed_t *A, const mzd_slice_t *Z);
 
 /**
- * \brief Pack a bitslice matrix into a classical represenation over GF(2^e) for e in {5,6,7,8}.
+ * \brief Pack a bitslice matrix into a classical representation over \GF2E for 4 < e <= 8.
  *
- * \param A Matrix over GF(2^e), must be zero
- * \param Z Bitslice matrix over GF(2^e)
+ * \param A Matrix over \GF2E, must be zero
+ * \param Z Bitslice matrix over \GF2E
  */
 
 mzed_t *_mzed_cling8(mzed_t *A, const mzd_slice_t *Z);
 
 /**
+ * \brief Pack a bitslice matrix into a classical representation over \GF2E for 8 < e <= 16.
+ *
+ * \param A Matrix over \GF2E, must be zero
+ * \param Z Bitslice matrix over \GF2E
+ */
+
+mzed_t *_mzed_cling16(mzed_t *A, const mzd_slice_t *Z);
+
+/**
  * \brief Compute C += A*B using Karatsuba multiplication of polynomials over GF(2).
  *
  * \param C Preallocated return matrix, may be NULL for automatic creation.
diff --git a/src/mzd_slice.h b/src/mzd_slice.h
index 0501879..1fc6716 100644
--- a/src/mzd_slice.h
+++ b/src/mzd_slice.h
@@ -645,20 +645,8 @@ static inline mzd_slice_t *mzd_slice_addmul(mzd_slice_t *C, const mzd_slice_t *A
  */
 
 static inline void mzd_slice_randomize(mzd_slice_t *A) {
-  switch(A->depth) {
-  case 10: mzd_randomize(A->x[9]);
-  case  9: mzd_randomize(A->x[8]);
-  case  8: mzd_randomize(A->x[7]);
-  case  7: mzd_randomize(A->x[6]);
-  case  6: mzd_randomize(A->x[5]);
-  case  5: mzd_randomize(A->x[4]);
-  case  4: mzd_randomize(A->x[3]);
-  case  3: mzd_randomize(A->x[2]);
-  case  2: mzd_randomize(A->x[1]);
-  case  1: mzd_randomize(A->x[0]); break;
-  default:
-    m4ri_die("impossible");
-  }
+  for(int i=0; i<A->depth; i++)
+    mzd_randomize(A->x[i]);
 }
 
 /**
@@ -897,20 +885,8 @@ void mzd_slice_print(const mzd_slice_t *A);
  */
 
 static inline void _mzd_slice_compress_l(mzd_slice_t *A, const rci_t r1, const rci_t n1, const rci_t r2) {
-  switch(A->finite_field->degree) {
-  case 10: _mzd_compress_l(A->x[9], r1, n1, r2);
-  case  9: _mzd_compress_l(A->x[8], r1, n1, r2);
-  case  8: _mzd_compress_l(A->x[7], r1, n1, r2);
-  case  7: _mzd_compress_l(A->x[6], r1, n1, r2);
-  case  6: _mzd_compress_l(A->x[5], r1, n1, r2);
-  case  5: _mzd_compress_l(A->x[4], r1, n1, r2);
-  case  4: _mzd_compress_l(A->x[3], r1, n1, r2);
-  case  3: _mzd_compress_l(A->x[2], r1, n1, r2);
-  case  2: _mzd_compress_l(A->x[1], r1, n1, r2);
-  case  1: _mzd_compress_l(A->x[0], r1, n1, r2); break;
-  default:
-    m4ri_die("impossible");
-  };
+  for(int i=0; i<A->depth; i++)
+    _mzd_compress_l(A->x[i], r1, n1, r2);
 }
 
 #endif //M4RIE_MZD_SLICE
diff --git a/tests/test_smallops.cc b/tests/test_smallops.cc
index 01bc0c6..a4532f8 100644
--- a/tests/test_smallops.cc
+++ b/tests/test_smallops.cc
@@ -189,33 +189,29 @@ int test_batch(gf2e *ff, int m, int n) {
 
 int main(int argc, char **argv) {
 
-  gf2e *ff[17];
+  gf2e *ff;
   int fail_ret = 0;
 
   for(int k=2; k<=16; k++) {
-    ff[k] = gf2e_init(irreducible_polynomials[k][1]);
-  }
-
-  for(int k=2; k<=8; k++) {
-    fail_ret += test_batch(ff[k],   2, m4ri_radix/gf2e_degree_to_w(ff[k]));
-    fail_ret += test_batch(ff[k],   2, 2*m4ri_radix/gf2e_degree_to_w(ff[k]));
-    fail_ret += test_batch(ff[k],   2, 3*m4ri_radix/gf2e_degree_to_w(ff[k]));
-    fail_ret += test_batch(ff[k],   2, 4*m4ri_radix/gf2e_degree_to_w(ff[k]));
-    fail_ret += test_batch(ff[k],   4,   3);
-    fail_ret += test_batch(ff[k],   1,   2);
-    fail_ret += test_batch(ff[k],  10,  11);
-    fail_ret += test_batch(ff[k],  20,  19);
-    fail_ret += test_batch(ff[k],  32,  64);
-    fail_ret += test_batch(ff[k],  63,  65);
-    fail_ret += test_batch(ff[k],  64,  65);
-    fail_ret += test_batch(ff[k],  64, 128);
-    fail_ret += test_batch(ff[k],  65, 129);
-    fail_ret += test_batch(ff[k], 201, 200);
-    fail_ret += test_batch(ff[k], 217,   2);
-  }
-
-  for(int k=2; k<=16; k++) {
-    gf2e_free(ff[k]);
+    ff = gf2e_init(irreducible_polynomials[k][1]);
+
+    fail_ret += test_batch(ff,   2,   m4ri_radix/gf2e_degree_to_w(ff));
+    fail_ret += test_batch(ff,   2, 2*m4ri_radix/gf2e_degree_to_w(ff));
+    fail_ret += test_batch(ff,   2, 3*m4ri_radix/gf2e_degree_to_w(ff));
+    fail_ret += test_batch(ff,   2, 4*m4ri_radix/gf2e_degree_to_w(ff));
+    fail_ret += test_batch(ff,   4,   3);
+    fail_ret += test_batch(ff,   1,   2);
+    fail_ret += test_batch(ff,  10,  11);
+    fail_ret += test_batch(ff,  20,  19);
+    fail_ret += test_batch(ff,  32,  64);
+    fail_ret += test_batch(ff,  63,  65);
+    fail_ret += test_batch(ff,  64,  65);
+    fail_ret += test_batch(ff,  64, 128);
+    fail_ret += test_batch(ff,  65, 129);
+    fail_ret += test_batch(ff, 201, 200);
+    fail_ret += test_batch(ff, 217,   2);
+
+    gf2e_free(ff);
   }
 
   return fail_ret;

-- 
Fast arithmetic with dense matrices over F_{2^e}



More information about the debian-science-commits mailing list