[lua-torch-torch7] 01/11: New upstream version 0~20160908-ge5ebac6

Zhou Mo cdluminate-guest at moszumanska.debian.org
Sat Sep 10 04:47:33 UTC 2016


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository lua-torch-torch7.

commit d3f95c66472eb3fa63f2ffd166d41f87b50a81b4
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Sat Sep 10 03:25:40 2016 +0000

    New upstream version 0~20160908-ge5ebac6
---
 File.lua                                         |   4 +-
 ROADMAP.md                                       |   2 +-
 Tensor.lua                                       |   2 +-
 Tester.lua                                       |   2 +-
 doc/maths.md                                     |  43 +-
 doc/random.md                                    |  10 +-
 generic/Tensor.c                                 |   1 +
 lib/TH/CMakeLists.txt                            |  56 ++-
 lib/TH/THAllocator.c                             | 305 ++++++++++--
 lib/TH/THAllocator.h                             |  18 +-
 lib/TH/THAtomic.h                                |   7 +
 lib/TH/THDiskFile.c                              |   2 +-
 lib/TH/THTensor.c                                |   1 +
 lib/TH/THVector.c                                |  17 +
 lib/TH/THVector.h                                | 571 +----------------------
 lib/TH/generic/THStorage.c                       |  24 +-
 lib/TH/generic/THStorage.h                       |   3 +-
 lib/TH/generic/THTensorMath.c                    |  16 +-
 lib/TH/generic/THTensorRandom.c                  |   2 +-
 lib/TH/generic/THVector.h                        |  14 +
 lib/TH/generic/{THVector.c => THVectorDefault.c} |  12 +-
 lib/TH/generic/THVectorDispatch.c                | 140 ++++++
 lib/TH/generic/simd/simd.h                       |  91 ++++
 lib/TH/vector/NEON.c                             | 252 ++++++++++
 lib/TH/vector/SSE.c                              | 213 +++++++++
 lib/luaT/CMakeLists.txt                          |   4 +
 lib/luaT/README.md                               |   2 +-
 test/test.lua                                    |   9 +-
 28 files changed, 1132 insertions(+), 691 deletions(-)

diff --git a/File.lua b/File.lua
index 1cc4dfe..62249a3 100644
--- a/File.lua
+++ b/File.lua
@@ -275,7 +275,7 @@ function File:readObject()
        local dumped = self:readChar(size):string()
        local func, err = loadstring(dumped)
        if not func then
-          error(string.format('Failed to load function from bytecode: %s', err))
+          io.stderr:write(string.format('Warning: Failed to load function from bytecode: %s', err))
        end
        local upvalues = self:readObject()
        for index,upvalue in ipairs(upvalues) do
@@ -298,7 +298,7 @@ function File:readObject()
          local dumped = self:readChar(size):string()
          local func, err = loadstring(dumped)
          if not func then
-            error(string.format('Failed to load function from bytecode: %s', err))
+	    io.stderr:write(string.format('Warning: Failed to load function from bytecode: %s', err))
          end
          if not force then
              objects[index] = func
diff --git a/ROADMAP.md b/ROADMAP.md
index cb9c5ad..d906126 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -84,7 +84,7 @@ The roadmap focuses on five separate things
 		   Also, I agree, I actually could not install iTorch on my laptop 
                    before cvpr tutorial somehow, it did not want to work :).
   - **soumith**: I think we should propose a common display API that any interface can implement, 
-                 that way the users dont need to change scripts across different UI backends.
+                 that way the users don't need to change scripts across different UI backends.
 	         Also, szym/display is a good candidate for the Web UI, ITorch is indeed a bit of a pain to install.
 
   - Should we endorse iTorch for everyone to use? 
diff --git a/Tensor.lua b/Tensor.lua
index 0d573aa..b4b3e95 100644
--- a/Tensor.lua
+++ b/Tensor.lua
@@ -389,7 +389,7 @@ torch.repeatTensor = Tensor.repeatTensor
 --- One of the size elements can be -1,
  --- a new LongStorage is then returned.
  --- The length of the unspecified dimension
- --- is infered from the number of remaining elements.
+ --- is inferred from the number of remaining elements.
 local function specifyFully(size, nElements)
     local nCoveredElements = 1
     local remainingDim = nil
diff --git a/Tester.lua b/Tester.lua
index a3b3ff3..f512edb 100644
--- a/Tester.lua
+++ b/Tester.lua
@@ -236,7 +236,7 @@ function Tester:_assertTensorEqOrNeq(ta, tb, negate, ...)
    if self._assertTensorEqIgnoresDims and (not negate) and success
          and not ta:isSameSizeAs(tb) then
      self:_warning("Tensors have the same content but different dimensions. "
-                   .. "For backwards compatability, they are considered equal, "
+                   .. "For backwards compatibility, they are considered equal, "
                    .. "but this may change in the future. Consider using :eq "
                    .. "to check for equality instead.")
    end
diff --git a/doc/maths.md b/doc/maths.md
index fa322e0..dd427ea 100755
--- a/doc/maths.md
+++ b/doc/maths.md
@@ -978,20 +978,17 @@ The number of elements must match: both `Tensor`s are seen as a 1D vector.
 
 
 <a name="torch.addmv"></a>
-### [res] torch.addmv([res,] [beta,] [v1,] vec1, [v2,] mat, vec2) ###
+### [res] torch.addmv([res,] [v1,] vec1, [v2,] mat, vec2) ###
 <a name="torch.addmv"></a>
 
 Performs a matrix-vector multiplication between `mat` (2D `Tensor`) and `vec2` (1D `Tensor`) and add it to `vec1`.
 
 Optional values `v1` and `v2` are scalars that multiply `vec1` and `vec2` respectively.
 
-Optional value `beta` is  a scalar that scales the result `Tensor`, before accumulating the result into the `Tensor`.
-Defaults to `1.0`.
-
 In other words,
 
 ```
-res = (beta * res) + (v1 * vec1) + (v2 * (mat * vec2))
+res = (v1 * vec1) + (v2 * (mat * vec2))
 ```
 
 Sizes must respect the matrix-multiplication operation: if `mat` is a `n × m` matrix, `vec2` must be vector of size `m` and `vec1` must be a vector of size `n`.
@@ -1012,12 +1009,21 @@ Sizes must respect the matrix-multiplication operation: if `mat` is a `n × m` m
 
 `torch.addmv(r, x, y, z)` puts the result in `r`.
 
-`x:addmv(y, z)` accumulates `y * z` into `x`.
+**Differences when used as a method**
+
+`x:addmv(y, z)` does `x = x + y * z`
+
+`r:addmv(x, y, z)`  does `r = x + y * z` if x is a vector
 
-`r:addmv(x, y, z)` puts the result of `x + y * z` into `r` if `x` is a vector.
+`r:addmv(s, y, z)`   does `r = r + s * y * z` if `s` is a scalar.
 
-`r:addmv(s, y, z)` puts the result of `s * r + y * z` into `r` if `s` is a scalar.
+`r:addmv(x, s, y, z)`   does `r = x + s * y * z` if `s` is a scalar and `x` is a vector.
 
+`r:addmv(s1, s2, y, z)`   does `r = s1 * r + s2 * y * z` if `s1` and `s2` are scalars.
+
+The last example does not accurately fit into the function signature, and needs a special mention. It changes the function signature to:
+
+`[vec1] = vec1:addmv([v1,] [v2,] mat, vec2)`
 
 <a name="torch.addr"></a>
 ### [res] torch.addr([res,] [v1,] mat, [v2,] vec1, vec2) ###
@@ -1075,20 +1081,17 @@ If `vec1` is a vector of size `n` and `vec2` is a vector of size `m`, then `mat`
 
 
 <a name="torch.addmm"></a>
-### [res] torch.addmm([res,] [beta,] [v1,] M, [v2,] mat1, mat2) ###
+### [res] torch.addmm([res,] [v1,] M, [v2,] mat1, mat2) ###
 <a name="torch.addmm"></a>
 
 Performs a matrix-matrix multiplication between `mat1` (2D `Tensor`) and `mat2` (2D `Tensor`).
 
 Optional values `v1` and `v2` are scalars that multiply `M` and `mat1 * mat2` respectively.
 
-Optional value `beta` is  a scalar that scales the result `Tensor`, before accumulating the result into the `Tensor`.
-Defaults to `1.0`.
-
 In other words,
 
 ```
-res = (res * beta) + (v1 * M) + (v2 * mat1 * mat2)
+res = (v1 * M) + (v2 * mat1 * mat2)
 ```
 
 If `mat1` is a `n × m` matrix, `mat2` a `m × p` matrix, `M` must be a `n × p` matrix.
@@ -1097,9 +1100,19 @@ If `mat1` is a `n × m` matrix, `mat2` a `m × p` matrix, `M` must be a `n × p`
 
 `torch.addmm(r, M, mat1, mat2)` puts the result in `r`.
 
-`M:addmm(mat1, mat2)` puts the result in `M`.
+**Differences when used as a method**
+
+`M:addmm(mat1, mat2)` does `M = M + mat1 * mat2`.
+
+`r:addmm(M, mat1, mat2)`  does `r = M + mat1 * mat2`.
+
+`r:addmm(v1, M, v2, mat1, mat2)` does `r = (v1 * M) + (v2 * mat1 * mat2)`.
+
+`M:addmm(v1, v2, mat1, mat2)` does `M = (v1 * M) + (v2 * mat1 * mat2)`.
+
+The last example does not accurately fit into the function signature, and needs a special mention. It changes the function signature to:
 
-`r:addmm(M, mat1, mat2)` puts the result in `r`.
+`[M] = M:addmm([v1,] [v2,] mat1, mat2)`
 
 
 <a name="torch.addbmm"></a>
diff --git a/doc/random.md b/doc/random.md
index 7097edb..e6fa6ab 100644
--- a/doc/random.md
+++ b/doc/random.md
@@ -128,12 +128,12 @@ returns its argument, `state`.
 <a name="torch.random"></a>
 ### [number] random([gen,] [a], [b]) ###
 
-Returns an unsigned 32 bit integer random number from [a,b]. By default `a` is 1 and `b` is 2^32.
+Returns an unsigned 32 bit integer random number from `[a,b]`. By default `a` is `1` and `b` is `2^32`.
 
 <a name="torch.uniform"></a>
 ### [number] uniform([gen,] [a],[b]) ###
 
-Returns a random real number according to uniform distribution on [a,b). By default `a` is 0 and `b` is 1.
+Returns a random real number according to uniform distribution on `[a,b)`. By default `a` is `0` and `b` is `1`.
 
 <a name="torch.normal"></a>
 ### [number] normal([gen,] [mean],[stdv]) ###
@@ -145,13 +145,13 @@ Returns a random real number according to a normal distribution with the given `
 ### [number] exponential([gen,] lambda) ###
 
 Returns a random real number according to the exponential distribution
-''p(x) = lambda * exp(-lambda * x)''
+`p(x) = lambda * exp(-lambda * x)`
 
 <a name="torch.cauchy"></a>
 ### [number] cauchy([gen,] median, sigma) ###
 
 Returns a random real number according to the Cauchy distribution
-''p(x) = sigma/(pi*(sigma^2 + (x-median)^2))''
+`p(x) = sigma/(pi*(sigma^2 + (x-median)^2))`
 
 <a name="torch.logNormal"></a>
 ### [number] logNormal([gen,] mean, stdv) ###
@@ -164,7 +164,7 @@ the given `mean` and standard deviation `stdv`.
 ### [number] geometric([gen,] p) ###
 
 Returns a random integer number according to a geometric distribution
-''p(i) = (1-p) * p^(i-1)`. `p` must satisfy `0 < p < 1''.
+`p(i) = (1-p) * p^(i-1)`. `p` must satisfy `0 < p < 1`.
 
 <a name="torch.bernoulli"></a>
 ### [number] bernoulli([gen,] [p]) ###
diff --git a/generic/Tensor.c b/generic/Tensor.c
index 0bf74e1..3067213 100644
--- a/generic/Tensor.c
+++ b/generic/Tensor.c
@@ -1318,6 +1318,7 @@ void torch_Tensor_(init)(lua_State *L)
                     torch_Tensor_(new), torch_Tensor_(free), torch_Tensor_(factory));
   luaT_setfuncs(L, torch_Tensor_(_), 0);
   lua_pop(L, 1);
+  THVector_(vectorDispatchInit)();
 }
 
 #endif
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index 551ea50..e1610af 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -70,28 +70,6 @@ IF (CORTEXA9_FOUND)
   SET(CMAKE_C_FLAGS "-mcpu=cortex-a9 ${CMAKE_C_FLAGS}")
 ENDIF (CORTEXA9_FOUND)
 
-IF(UNIX)
-  INCLUDE(CheckFunctionExists)
-  SET(CMAKE_EXTRA_INCLUDE_FILES "sys/mman.h")
-  CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP)
-  IF(HAVE_MMAP)
-    ADD_DEFINITIONS(-DHAVE_MMAP=1)
-  ENDIF(HAVE_MMAP)
-  ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
-  CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN)
-  IF(HAVE_SHM_OPEN)
-    ADD_DEFINITIONS(-DHAVE_SHM_OPEN=1)
-  ENDIF(HAVE_SHM_OPEN)
-  CHECK_FUNCTION_EXISTS(shm_unlink HAVE_SHM_UNLINK)
-  IF(HAVE_SHM_UNLINK)
-    ADD_DEFINITIONS(-DHAVE_SHM_UNLINK=1)
-  ENDIF(HAVE_SHM_UNLINK)
-  CHECK_FUNCTION_EXISTS(malloc_usable_size HAVE_MALLOC_USABLE_SIZE)
-  IF(HAVE_MALLOC_USABLE_SIZE)
-    ADD_DEFINITIONS(-DHAVE_MALLOC_USABLE_SIZE=1)
-  ENDIF(HAVE_MALLOC_USABLE_SIZE)
-ENDIF(UNIX)
-
 FIND_PACKAGE(SSE)
 IF(C_SSE2_FOUND)
   SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} -DUSE_SSE2 ${CMAKE_C_FLAGS}")
@@ -129,7 +107,7 @@ SET(hdr
 
 SET(src
   THGeneral.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
-  THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c)
+  THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c)
 
 SET(src ${src} ${hdr} ${simd})
 ADD_LIBRARY(TH SHARED ${src})
@@ -137,6 +115,10 @@ if(BUILD_STATIC)
   ADD_LIBRARY(TH_static STATIC ${src})
 endif()
 
+SET_TARGET_PROPERTIES(TH PROPERTIES
+  VERSION   0
+  SOVERSION 0)
+
 CHECK_C_SOURCE_RUNS("
 #include <stdatomic.h>
 int main()
@@ -220,9 +202,34 @@ IF (UNIX AND NOT APPLE)
    CHECK_LIBRARY_EXISTS(rt clock_gettime "time.h" NEED_LIBRT)
    IF(NEED_LIBRT)
      TARGET_LINK_LIBRARIES(TH rt)
+     SET(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} rt)
    ENDIF(NEED_LIBRT)
 ENDIF(UNIX AND NOT APPLE)
 
+IF(UNIX)
+  INCLUDE(CheckFunctionExists)
+  SET(CMAKE_EXTRA_INCLUDE_FILES "sys/mman.h")
+  CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP)
+  IF(HAVE_MMAP)
+    ADD_DEFINITIONS(-DHAVE_MMAP=1)
+  ENDIF(HAVE_MMAP)
+  ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
+  CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN)
+  IF(HAVE_SHM_OPEN)
+    ADD_DEFINITIONS(-DHAVE_SHM_OPEN=1)
+  ENDIF(HAVE_SHM_OPEN)
+  CHECK_FUNCTION_EXISTS(shm_unlink HAVE_SHM_UNLINK)
+  IF(HAVE_SHM_UNLINK)
+    ADD_DEFINITIONS(-DHAVE_SHM_UNLINK=1)
+  ENDIF(HAVE_SHM_UNLINK)
+  CHECK_FUNCTION_EXISTS(malloc_usable_size HAVE_MALLOC_USABLE_SIZE)
+  IF(HAVE_MALLOC_USABLE_SIZE)
+    ADD_DEFINITIONS(-DHAVE_MALLOC_USABLE_SIZE=1)
+  ENDIF(HAVE_MALLOC_USABLE_SIZE)
+ENDIF(UNIX)
+
+
+
 IF(NOT MSVC)
   TARGET_LINK_LIBRARIES(TH m)
 ENDIF(NOT MSVC)
@@ -327,7 +334,8 @@ INSTALL(FILES
   generic/THTensorMath.h
   generic/THTensorRandom.c
   generic/THTensorRandom.h
-  generic/THVector.c
+  generic/THVectorDispatch.c
+  generic/THVector.h
   DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH/generic")
 
 
diff --git a/lib/TH/THAllocator.c b/lib/TH/THAllocator.c
index 6992544..d64b752 100644
--- a/lib/TH/THAllocator.c
+++ b/lib/TH/THAllocator.c
@@ -1,4 +1,5 @@
 #include "THAllocator.h"
+#include "THAtomic.h"
 
 /* stuff for mapped files */
 #ifdef _WIN32
@@ -36,22 +37,60 @@ THAllocator THDefaultAllocator = {
 
 struct THMapAllocatorContext_ {
   char *filename; /* file name */
-  int shared; /* is shared or not */
+  int flags;
   long size; /* mapped size */
+  int fd;
 };
 
-THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int shared)
+#define TH_ALLOC_ALIGNMENT 64
+
+typedef struct {
+  int refcount;
+} THMapInfo;
+
+char * unknown_filename = "filename not specified";
+
+THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int flags)
 {
   THMapAllocatorContext *ctx = THAlloc(sizeof(THMapAllocatorContext));
 
-  ctx->filename = THAlloc(strlen(filename)+1);
-  strcpy(ctx->filename, filename);
-  ctx->shared = shared;
+  if (!(flags & TH_ALLOCATOR_MAPPED_SHARED) && !(flags & TH_ALLOCATOR_MAPPED_SHAREDMEM))
+    flags &= ~TH_ALLOCATOR_MAPPED_NOCREATE;
+  if ((flags ^ TH_ALLOCATOR_MAPPED_EXCLUSIVE) == 0)
+    THError("TH_ALLOCATOR_MAPPED_EXCLUSIVE flag requires opening the file "
+        "in shared mode");
+
+  if (filename) {
+    ctx->filename = THAlloc(strlen(filename)+1);
+    strcpy(ctx->filename, filename);
+  } else {
+    ctx->filename = unknown_filename;
+  }
+  ctx->flags = flags;
   ctx->size = 0;
+  ctx->fd = -1;
+
+  return ctx;
+}
+
+THMapAllocatorContext *THMapAllocatorContext_newWithFd(const char *filename, int fd, int flags)
+{
+  THMapAllocatorContext *ctx = THMapAllocatorContext_new(filename, flags);
+  ctx->fd = fd;
 
   return ctx;
 }
 
+char * THMapAllocatorContext_filename(THMapAllocatorContext *ctx)
+{
+  return ctx->filename;
+}
+
+int THMapAllocatorContext_fd(THMapAllocatorContext *ctx)
+{
+  return ctx->fd;
+}
+
 long THMapAllocatorContext_size(THMapAllocatorContext *ctx)
 {
   return ctx->size;
@@ -59,11 +98,12 @@ long THMapAllocatorContext_size(THMapAllocatorContext *ctx)
 
 void THMapAllocatorContext_free(THMapAllocatorContext *ctx)
 {
-  THFree(ctx->filename);
+  if (ctx->filename != unknown_filename)
+    THFree(ctx->filename);
   THFree(ctx);
 }
 
-static void *THMapAllocator_alloc(void* ctx_, long size)
+static void *_map_alloc(void* ctx_, long size)
 {
   THMapAllocatorContext *ctx = ctx_;
   void *data = NULL;
@@ -75,9 +115,18 @@ static void *THMapAllocator_alloc(void* ctx_, long size)
     DWORD size_hi, size_lo;
     size_t hfilesz;
 
+    if (ctx->flags & TH_ALLOCATOR_MAPPED_EXCLUSIVE)
+      THError("exclusive file mapping is not supported on Windows");
+    if (ctx->flags & TH_ALLOCATOR_MAPPED_NOCREATE)
+      THError("file mapping without creation is not supported on Windows");
+    if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD)
+      THError("TH_ALLOCATOR_MAPPED_KEEPFD not supported on Windows");
+    if (ctx->flags & TH_ALLOCATOR_MAPPED_FROMFD)
+      THError("TH_ALLOCATOR_MAPPED_FROMFD not supported on Windows");
+
     /* open file */
     /* FILE_FLAG_RANDOM_ACCESS ? */
-    if(ctx->shared)
+    if(ctx->flags)
     {
       hfile = CreateFileA(ctx->filename, GENERIC_READ|GENERIC_WRITE, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0);
       if (hfile == INVALID_HANDLE_VALUE)
@@ -103,7 +152,7 @@ static void *THMapAllocator_alloc(void* ctx_, long size)
     {
       if(size > hfilesz)
       {
-        if(ctx->shared)
+        if(ctx->flags)
         {
 #if SIZEOF_SIZE_T > 4
           size_hi = (DWORD)((size) >> 32);
@@ -144,7 +193,7 @@ static void *THMapAllocator_alloc(void* ctx_, long size)
 #endif
 
     /* get map handle */
-    if(ctx->shared)
+    if(ctx->flags)
     {
       if( (hmfile = CreateFileMapping(hfile, NULL, PAGE_READWRITE, size_hi, size_lo, NULL)) == NULL )
         THError("could not create a map on file <%s>", ctx->filename);
@@ -156,66 +205,89 @@ static void *THMapAllocator_alloc(void* ctx_, long size)
     }
 
     /* map the stuff */
-    if(ctx->shared)
+    if(ctx->flags)
       data = MapViewOfFile(hmfile, FILE_MAP_ALL_ACCESS, 0, 0, 0);
     else
       data = MapViewOfFile(hmfile, FILE_MAP_COPY, 0, 0, 0);
 
-    CloseHandle(hfile); 
-    CloseHandle(hmfile); 
+    CloseHandle(hfile);
+    CloseHandle(hmfile);
   }
 #else /* _WIN32 */
   {
     /* open file */
     int fd;
-    long fdsz;
+    int flags;
+    struct stat file_stat;
 
-    if(ctx->shared == TH_ALLOCATOR_MAPPED_SHARED)
-    {
-      if((fd = open(ctx->filename, O_RDWR | O_CREAT, (mode_t)0600)) == -1)
-        THError("unable to open file <%s> in read-write mode", ctx->filename);
-    }
-    else if (ctx->shared == TH_ALLOCATOR_MAPPED_SHAREDMEM)
-    {
+    if (ctx->flags & (TH_ALLOCATOR_MAPPED_SHARED | TH_ALLOCATOR_MAPPED_SHAREDMEM))
+      flags = O_RDWR | O_CREAT;
+    else
+      flags = O_RDONLY;
+
+    if (ctx->flags & TH_ALLOCATOR_MAPPED_EXCLUSIVE)
+      flags |= O_EXCL;
+    if (ctx->flags & TH_ALLOCATOR_MAPPED_NOCREATE)
+      flags &= ~O_CREAT;
+
+    if (!(ctx->flags & TH_ALLOCATOR_MAPPED_FROMFD)) {
+      if(ctx->flags & TH_ALLOCATOR_MAPPED_SHARED)
+      {
+        if((fd = open(ctx->filename, flags, (mode_t)0600)) == -1)
+          THError("unable to open file <%s> in read-write mode", ctx->filename);
+      }
+      else if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)
+      {
 #ifdef HAVE_SHM_OPEN
-      if((fd = shm_open(ctx->filename, O_RDWR | O_CREAT, (mode_t)0600)) == -1)
-        THError("unable to open file <%s> in read-write mode", ctx->filename);
+        if((fd = shm_open(ctx->filename, flags, (mode_t)0600)) == -1)
+          THError("unable to open shared memory object <%s> in read-write mode", ctx->filename);
 #else
-      THError("unable to open file <%s> in sharedmem mode, shm_open unavailable on this platform");
+        THError("unable to open file <%s> in sharedmem mode, shm_open unavailable on this platform", ctx->filename);
 #endif
+      }
+      else
+      {
+        if((fd = open(ctx->filename, O_RDONLY)) == -1)
+          THError("unable to open file <%s> in read-only mode", ctx->filename);
+      }
+    } else {
+      fd = ctx->fd;
     }
-    else
-    {
-      if((fd = open(ctx->filename, O_RDONLY)) == -1)
-        THError("unable to open file <%s> in read-only mode", ctx->filename);
-    }
-    if((fdsz = lseek(fd, 0, SEEK_END)) == -1)
+
+    if(fstat(fd, &file_stat) == -1)
     {
-      close(fd);
-      THError("unable to seek at end of file <%s>", ctx->filename);
+      if (!(ctx->flags & TH_ALLOCATOR_MAPPED_FROMFD))
+        close(fd);
+      THError("unable to stat the file <%s>", ctx->filename);
     }
+
     if(size > 0)
     {
-      if(size > fdsz)
+      if(size > file_stat.st_size)
       {
-        if(ctx->shared)
+        if(ctx->flags)
         {
           /* if it is shared mem, let's put it in correct size */
-          if (ctx->shared == TH_ALLOCATOR_MAPPED_SHAREDMEM)
+          if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)
           {
             if(ftruncate(fd, size) == -1)
               THError("unable to resize shared memory file <%s> to the right size", ctx->filename);
           }
-          if((fdsz = lseek(fd, size-1, SEEK_SET)) == -1)
+          if(fstat(fd, &file_stat) == -1 || file_stat.st_size < size)
           {
             close(fd);
             THError("unable to stretch file <%s> to the right size", ctx->filename);
           }
+/* on OS X write returns with errno 45 (Opperation not supported) when used
+ * with a file descriptor obtained via shm_open
+ */
+#ifndef __APPLE__
           if((write(fd, "", 1)) != 1) /* note that the string "" contains the '\0' byte ... */
           {
             close(fd);
             THError("unable to write to file <%s>", ctx->filename);
           }
+#endif
         }
         else
         {
@@ -225,18 +297,40 @@ static void *THMapAllocator_alloc(void* ctx_, long size)
       }
     }
     else
-      size = fdsz;
+      size = file_stat.st_size;
 
     ctx->size = size; /* if we are here, it must be the right size */
-    
+
     /* map it */
-    if(ctx->shared)
+    if (ctx->flags & (TH_ALLOCATOR_MAPPED_SHARED | TH_ALLOCATOR_MAPPED_SHAREDMEM))
       data = mmap(NULL, ctx->size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
     else
       data = mmap(NULL, ctx->size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
 
-    if(close(fd) == -1)
-      THError("Error closing file <%s>", ctx->filename);
+    if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD) {
+      ctx->fd = fd;
+    } else {
+      if(close(fd) == -1)
+        THError("Error closing file <%s>", ctx->filename);
+      ctx->fd = -1;
+    }
+
+    if (ctx->flags & TH_ALLOCATOR_MAPPED_UNLINK) {
+      if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)
+      {
+#ifdef HAVE_SHM_UNLINK
+        if (shm_unlink(ctx->filename) == -1)
+          THError("could not unlink the shared memory file %s", ctx->filename);
+#else
+        THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename);
+#endif
+      }
+      else
+      {
+        if (unlink(ctx->filename) == -1)
+          THError("could not unlink file %s", ctx->filename);
+      }
+    }
 
     if(data == MAP_FAILED)
     {
@@ -249,6 +343,10 @@ static void *THMapAllocator_alloc(void* ctx_, long size)
   return data;
 }
 
+static void * THMapAllocator_alloc(void *ctx, long size) {
+  return _map_alloc(ctx, size);
+}
+
 static void *THMapAllocator_realloc(void* ctx, void* ptr, long size) {
   THError("cannot realloc mapped data");
   return NULL;
@@ -260,26 +358,35 @@ static void THMapAllocator_free(void* ctx_, void* data) {
 #ifdef _WIN32
   if(!UnmapViewOfFile((LPINT)data))
     THError("could not unmap the shared memory file");
-#else
+#else /* _WIN32 */
+  if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD) {
+    if (close(ctx->fd) == -1)
+      THError("could not close file descriptor %d", ctx->fd);
+  }
+
   if (munmap(data, ctx->size))
     THError("could not unmap the shared memory file");
-  if (ctx->shared == TH_ALLOCATOR_MAPPED_SHAREDMEM)
+
+  if (!(ctx->flags & (TH_ALLOCATOR_MAPPED_FROMFD | TH_ALLOCATOR_MAPPED_UNLINK)))
   {
+    if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)
+    {
 #ifdef HAVE_SHM_UNLINK
-    if (shm_unlink(ctx->filename) == -1)
-      THError("could not unlink the shared memory file %s", ctx->filename);
+      if (shm_unlink(ctx->filename) == -1)
+        THError("could not unlink the shared memory file %s", ctx->filename);
 #else
-    THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename);
+      THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename);
 #endif
+    }
   }
-#endif
+#endif /* _WIN32 */
 
   THMapAllocatorContext_free(ctx);
 }
 
 #else
 
-THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int shared) {
+THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int flags) {
   THError("file mapping not supported on your system");
   return NULL;
 }
@@ -304,8 +411,110 @@ static void THMapAllocator_free(void* ctx, void* data) {
 
 #endif
 
+#if (defined(_WIN32) || defined(HAVE_MMAP)) && defined(TH_ATOMIC_IPC_REFCOUNT)
+
+static void * THRefcountedMapAllocator_alloc(void *_ctx, long size) {
+  THMapAllocatorContext *ctx = _ctx;
+
+  if (ctx->flags & TH_ALLOCATOR_MAPPED_FROMFD)
+    THError("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_FROMFD flag");
+  if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD)
+    THError("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_KEEPFD flag");
+  if (ctx->flags & TH_ALLOCATOR_MAPPED_UNLINK)
+    THError("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_UNLINK flag");
+  if (!(ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM))
+    THError("THRefcountedMapAllocator requires TH_ALLOCATOR_MAPPED_SHAREDMEM flag");
+
+  size = size + TH_ALLOC_ALIGNMENT;
+  void *ptr = _map_alloc(ctx, size);
+  char *data = ((char*)ptr) + TH_ALLOC_ALIGNMENT;
+  THMapInfo *map_info = (THMapInfo*)ptr;
+
+  if (ctx->flags & TH_ALLOCATOR_MAPPED_EXCLUSIVE)
+    map_info->refcount = 1;
+  else
+    THAtomicIncrementRef(&map_info->refcount);
+
+  return (void*)data;
+}
+
+static void *THRefcountedMapAllocator_realloc(void* ctx, void* ptr, long size) {
+  THError("cannot realloc mapped data");
+  return NULL;
+}
+
+static void THRefcountedMapAllocator_free(void* ctx_, void* data) {
+  THMapAllocatorContext *ctx = ctx_;
+
+#ifdef _WIN32
+  if(!UnmapViewOfFile((LPINT)data))
+    THError("could not unmap the shared memory file");
+#else /* _WIN32 */
+
+  THMapInfo *info = (THMapInfo*)(((char*)data) - TH_ALLOC_ALIGNMENT);
+  if (THAtomicDecrementRef(&info->refcount)) {
+#ifdef HAVE_SHM_UNLINK
+    if (shm_unlink(ctx->filename) == -1)
+      THError("could not unlink the shared memory file %s", ctx->filename);
+#else
+    THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename);
+#endif /* HAVE_SHM_UNLINK */
+  }
+  if (munmap(info, ctx->size))
+    THError("could not unmap the shared memory file %s", ctx->filename);
+#endif /* _WIN32 */
+
+  THMapAllocatorContext_free(ctx);
+}
+
+void THRefcountedMapAllocator_incref(THMapAllocatorContext *ctx, void *data)
+{
+  THMapInfo *map_info = (THMapInfo*)(((char*)data) - TH_ALLOC_ALIGNMENT);
+  THAtomicIncrementRef(&map_info->refcount);
+}
+
+int THRefcountedMapAllocator_decref(THMapAllocatorContext *ctx, void *data)
+{
+  THMapInfo *map_info = (THMapInfo*)(((char*)data) - TH_ALLOC_ALIGNMENT);
+  return THAtomicDecrementRef(&map_info->refcount);
+}
+
+#else
+
+static void * THRefcountedMapAllocator_alloc(void *ctx, long size) {
+  THError("refcounted file mapping not supported on your system");
+  return NULL;
+}
+
+static void *THRefcountedMapAllocator_realloc(void* ctx, void* ptr, long size) {
+  THError("refcounted file mapping not supported on your system");
+  return NULL;
+}
+
+static void THRefcountedMapAllocator_free(void* ctx_, void* data) {
+  THError("refcounted file mapping not supported on your system");
+}
+
+void THRefcountedMapAllocator_incref(THMapAllocatorContext *ctx, void *data)
+{
+  THError("refcounted file mapping not supported on your system");
+}
+
+int THRefcountedMapAllocator_decref(THMapAllocatorContext *ctx, void *data)
+{
+  THError("refcounted file mapping not supported on your system");
+}
+
+#endif
+
 THAllocator THMapAllocator = {
   &THMapAllocator_alloc,
   &THMapAllocator_realloc,
   &THMapAllocator_free
 };
+
+THAllocator THRefcountedMapAllocator = {
+  &THRefcountedMapAllocator_alloc,
+  &THRefcountedMapAllocator_realloc,
+  &THRefcountedMapAllocator_free
+};
diff --git a/lib/TH/THAllocator.h b/lib/TH/THAllocator.h
index dbc75a8..14c433a 100644
--- a/lib/TH/THAllocator.h
+++ b/lib/TH/THAllocator.h
@@ -5,6 +5,11 @@
 
 #define TH_ALLOCATOR_MAPPED_SHARED 1
 #define TH_ALLOCATOR_MAPPED_SHAREDMEM 2
+#define TH_ALLOCATOR_MAPPED_EXCLUSIVE 4
+#define TH_ALLOCATOR_MAPPED_NOCREATE 8
+#define TH_ALLOCATOR_MAPPED_KEEPFD 16
+#define TH_ALLOCATOR_MAPPED_FROMFD 32
+#define TH_ALLOCATOR_MAPPED_UNLINK 64
 
 /* Custom allocator
  */
@@ -22,10 +27,17 @@ extern THAllocator THDefaultAllocator;
 /* file map allocator
  */
 typedef struct THMapAllocatorContext_  THMapAllocatorContext;
-THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int shared);
-long THMapAllocatorContext_size(THMapAllocatorContext *ctx);
-void THMapAllocatorContext_free(THMapAllocatorContext *ctx);
+TH_API THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int flags);
+TH_API THMapAllocatorContext *THMapAllocatorContext_newWithFd(const char *filename,
+    int fd, int flags);
+TH_API char * THMapAllocatorContext_filename(THMapAllocatorContext *ctx);
+TH_API int THMapAllocatorContext_fd(THMapAllocatorContext *ctx);
+TH_API long THMapAllocatorContext_size(THMapAllocatorContext *ctx);
+TH_API void THMapAllocatorContext_free(THMapAllocatorContext *ctx);
+TH_API void THRefcountedMapAllocator_incref(THMapAllocatorContext *ctx, void *data);
+TH_API int THRefcountedMapAllocator_decref(THMapAllocatorContext *ctx, void *data);
 
 extern THAllocator THMapAllocator;
+extern THAllocator THRefcountedMapAllocator;
 
 #endif
diff --git a/lib/TH/THAtomic.h b/lib/TH/THAtomic.h
index 3a37c31..3a0b6fa 100644
--- a/lib/TH/THAtomic.h
+++ b/lib/TH/THAtomic.h
@@ -86,4 +86,11 @@ TH_API long THAtomicAddLong(long volatile *a, long value);
 */
 TH_API long THAtomicCompareAndSwapLong(long volatile *a, long oldvalue, long newvalue);
 
+#if defined(USE_C11_ATOMICS) && defined(ATOMIC_INT_LOCK_FREE) && \
+  ATOMIC_INT_LOCK_FREE == 2
+#define TH_ATOMIC_IPC_REFCOUNT 1
+#elif defined(USE_MSC_ATOMICS) || defined(USE_GCC_ATOMICS)
+#define TH_ATOMIC_IPC_REFCOUNT 1
+#endif
+
 #endif
diff --git a/lib/TH/THDiskFile.c b/lib/TH/THDiskFile.c
index dff9710..7064b7f 100644
--- a/lib/TH/THDiskFile.c
+++ b/lib/TH/THDiskFile.c
@@ -207,7 +207,7 @@ static size_t THDiskFile_position(THFile *self)
   if (offset > -1)
       return (size_t)offset;
   else if(!dfself->file.isQuiet)
-      THError("unable to obtain disk file offset (maybe a long overflow occured)");
+      THError("unable to obtain disk file offset (maybe a long overflow occurred)");
 
   return 0;
 }
diff --git a/lib/TH/THTensor.c b/lib/TH/THTensor.c
index b0ab0a5..2878fc9 100644
--- a/lib/TH/THTensor.c
+++ b/lib/TH/THTensor.c
@@ -1,6 +1,7 @@
 #include "THAtomic.h"
 #include "THTensor.h"
 #include "THVector.h"
+
 #include "THBlas.h"
 #include "THLapack.h"
 #include "THRandom.h"
diff --git a/lib/TH/THVector.c b/lib/TH/THVector.c
new file mode 100644
index 0000000..6179d89
--- /dev/null
+++ b/lib/TH/THVector.c
@@ -0,0 +1,17 @@
+#include "THVector.h"
+#include "generic/simd/simd.h"
+
+#ifdef __NEON__
+#include "vector/NEON.c"
+#endif
+
+#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+        || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+#include "vector/SSE.c"
+#endif
+
+#include "generic/THVectorDefault.c"
+#include "THGenerateAllTypes.h"
+
+#include "generic/THVectorDispatch.c"
+#include "THGenerateAllTypes.h"
diff --git a/lib/TH/THVector.h b/lib/TH/THVector.h
index 1344e75..e29917b 100644
--- a/lib/TH/THVector.h
+++ b/lib/TH/THVector.h
@@ -5,570 +5,9 @@
 
 #define THVector_(NAME) TH_CONCAT_4(TH,Real,Vector_,NAME)
 
-#if defined USE_SSE2 || defined USE_SSE3 || defined USE_SSSE3 \
-  || defined USE_SSE4_1 || defined USE_SSE4_2
+/* We are going to use dynamic dispatch, and want only to generate declarations
+ * of the vector functions */
+#include "generic/THVector.h"
+#include "THGenerateAllTypes.h"
 
-#ifdef USE_SSE2
-#include <emmintrin.h>
-#endif
-
-#ifdef USE_SSE3
-#include <pmmintrin.h>
-#endif
-
-#ifdef USE_SSSE3
-#include <tmmintrin.h>
-#endif
-
-#if defined (USE_SSE4_2) || defined (USE_SSE4_1)
-#include <smmintrin.h>
-#endif
-
-#define THDoubleVector_fill(x, c, n) {          \
-    long i;                                     \
-    long off;                                   \
-    __m128d XMM0 = _mm_set1_pd(c);              \
-    for (i=0; i<=((n)-8); i+=8) {               \
-      _mm_storeu_pd((x)+i  , XMM0);             \
-      _mm_storeu_pd((x)+i+2, XMM0);             \
-      _mm_storeu_pd((x)+i+4, XMM0);             \
-      _mm_storeu_pd((x)+i+6, XMM0);             \
-    }                                           \
-    off = (n) - ((n)%8);                        \
-    for (i=0; i<((n)%8); i++) {                 \
-      x[off+i] = c;                             \
-    }                                           \
-  }
-
-
-#define THDoubleVector_add(y, x, c, n) {        \
-    long i = 0;                                 \
-    __m128d XMM7 = _mm_set1_pd(c);              \
-    __m128d XMM0,XMM2;                          \
-    for (; i<=((n)-2); i+=2) {                  \
-      XMM0 = _mm_loadu_pd((x)+i);               \
-      XMM2 = _mm_loadu_pd((y)+i);               \
-      XMM0 = _mm_mul_pd(XMM0, XMM7);            \
-      XMM2 = _mm_add_pd(XMM2, XMM0);            \
-      _mm_storeu_pd((y)+i  , XMM2);             \
-    }                                           \
-    for (; i<(n); i++) {                        \
-      y[i] += c * x[i];                         \
-    }                                           \
-  }
-
-#define THDoubleVector_diff(z, x, y, n) {       \
-    long i;                                     \
-    for (i=0; i<=((n)-8); i+=8) {               \
-      __m128d XMM0 = _mm_loadu_pd((x)+i  );     \
-      __m128d XMM1 = _mm_loadu_pd((x)+i+2);     \
-      __m128d XMM2 = _mm_loadu_pd((x)+i+4);     \
-      __m128d XMM3 = _mm_loadu_pd((x)+i+6);     \
-      __m128d XMM4 = _mm_loadu_pd((y)+i  );     \
-      __m128d XMM5 = _mm_loadu_pd((y)+i+2);     \
-      __m128d XMM6 = _mm_loadu_pd((y)+i+4);     \
-      __m128d XMM7 = _mm_loadu_pd((y)+i+6);     \
-      XMM0 = _mm_sub_pd(XMM0, XMM4);            \
-      XMM1 = _mm_sub_pd(XMM1, XMM5);            \
-      XMM2 = _mm_sub_pd(XMM2, XMM6);            \
-      XMM3 = _mm_sub_pd(XMM3, XMM7);            \
-      _mm_storeu_pd((z)+i  , XMM0);             \
-      _mm_storeu_pd((z)+i+2, XMM1);             \
-      _mm_storeu_pd((z)+i+4, XMM2);             \
-      _mm_storeu_pd((z)+i+6, XMM3);             \
-    }                                           \
-    long off = (n) - ((n)%8);                   \
-    for (i=0; i<((n)%8); i++) {                 \
-      z[off+i] = x[off+i] - y[off+i];           \
-    }                                           \
-  }
-
-#define THDoubleVector_scale(y, c, n) {         \
-    long i;                                     \
-    __m128d XMM7 = _mm_set1_pd(c);              \
-    for (i=0; i<=((n)-4); i+=4) {               \
-      __m128d XMM0 = _mm_loadu_pd((y)+i  );     \
-      __m128d XMM1 = _mm_loadu_pd((y)+i+2);     \
-      XMM0 = _mm_mul_pd(XMM0, XMM7);            \
-      XMM1 = _mm_mul_pd(XMM1, XMM7);            \
-      _mm_storeu_pd((y)+i  , XMM0);             \
-      _mm_storeu_pd((y)+i+2, XMM1);             \
-    }                                           \
-    long off = (n) - ((n)%4);                   \
-    for (i=0; i<((n)%4); i++) {                 \
-      y[off+i] *= c;                            \
-    }                                           \
-  }
-
-#define THDoubleVector_mul(y, x, n) {           \
-    long i;                                     \
-    for (i=0; i<=((n)-8); i+=8) {               \
-      __m128d XMM0 = _mm_loadu_pd((x)+i  );     \
-      __m128d XMM1 = _mm_loadu_pd((x)+i+2);     \
-      __m128d XMM2 = _mm_loadu_pd((x)+i+4);     \
-      __m128d XMM3 = _mm_loadu_pd((x)+i+6);     \
-      __m128d XMM4 = _mm_loadu_pd((y)+i  );     \
-      __m128d XMM5 = _mm_loadu_pd((y)+i+2);     \
-      __m128d XMM6 = _mm_loadu_pd((y)+i+4);     \
-      __m128d XMM7 = _mm_loadu_pd((y)+i+6);     \
-      XMM4 = _mm_mul_pd(XMM4, XMM0);            \
-      XMM5 = _mm_mul_pd(XMM5, XMM1);            \
-      XMM6 = _mm_mul_pd(XMM6, XMM2);            \
-      XMM7 = _mm_mul_pd(XMM7, XMM3);            \
-      _mm_storeu_pd((y)+i  , XMM4);             \
-      _mm_storeu_pd((y)+i+2, XMM5);             \
-      _mm_storeu_pd((y)+i+4, XMM6);             \
-      _mm_storeu_pd((y)+i+6, XMM7);             \
-    }                                           \
-    long off = (n) - ((n)%8);                   \
-    for (i=0; i<((n)%8); i++) {                 \
-      y[off+i] *= x[off+i];                     \
-    }                                           \
-  }
-
-#define THFloatVector_fill(x, c, n) {           \
-    long i;                                     \
-    __m128 XMM0 = _mm_set_ps1(c);               \
-    long off;                                   \
-    for (i=0; i<=((n)-16); i+=16) {             \
-      _mm_storeu_ps((x)+i  ,  XMM0);            \
-      _mm_storeu_ps((x)+i+4,  XMM0);            \
-      _mm_storeu_ps((x)+i+8,  XMM0);            \
-      _mm_storeu_ps((x)+i+12, XMM0);            \
-    }                                           \
-    off = (n) - ((n)%16);                       \
-    for (i=0; i<((n)%16); i++) {                \
-      x[off+i] = c;                             \
-    }                                           \
-  }
-
-#define THFloatVector_add(y, x, c, n) {         \
-    long i = 0;                                 \
-    __m128 XMM7 = _mm_set_ps1(c);               \
-    __m128 XMM0,XMM2;                           \
-    for (; i<=((n)-4); i+=4) {                  \
-      XMM0 = _mm_loadu_ps((x)+i);               \
-      XMM2 = _mm_loadu_ps((y)+i);               \
-      XMM0 = _mm_mul_ps(XMM0, XMM7);            \
-      XMM2 = _mm_add_ps(XMM2, XMM0);            \
-      _mm_storeu_ps((y)+i  , XMM2);             \
-    }                                           \
-    for (; i<(n); i++) {                        \
-      y[i] += c * x[i];                         \
-    }                                           \
-  }
-
-#define THFloatVector_diff(z, x, y, n) {        \
-    long i;                                     \
-    for (i=0; i<=((n)-16); i+=16) {             \
-      __m128 XMM0 = _mm_loadu_ps((x)+i   );     \
-      __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);     \
-      __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);     \
-      __m128 XMM3 = _mm_loadu_ps((x)+i+12);     \
-      __m128 XMM4 = _mm_loadu_ps((y)+i   );     \
-      __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);     \
-      __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);     \
-      __m128 XMM7 = _mm_loadu_ps((y)+i+12);     \
-      XMM0 = _mm_sub_ps(XMM0, XMM4);            \
-      XMM1 = _mm_sub_ps(XMM1, XMM5);            \
-      XMM2 = _mm_sub_ps(XMM2, XMM6);            \
-      XMM3 = _mm_sub_ps(XMM3, XMM7);            \
-      _mm_storeu_ps((z)+i   , XMM0);            \
-      _mm_storeu_ps((z)+i+ 4, XMM1);            \
-      _mm_storeu_ps((z)+i+ 8, XMM2);            \
-      _mm_storeu_ps((z)+i+12, XMM3);            \
-    }                                           \
-    long off = (n) - ((n)%16);                  \
-    for (i=0; i<((n)%16); i++) {                \
-      z[off+i] = x[off+i] - y[off+i];           \
-    }                                           \
-  }
-
-#define THFloatVector_scale(y, c, n) {          \
-    long i;                                     \
-    __m128 XMM7 = _mm_set_ps1(c);               \
-    for (i=0; i<=((n)-8); i+=8) {               \
-      __m128 XMM0 = _mm_loadu_ps((y)+i  );      \
-      __m128 XMM1 = _mm_loadu_ps((y)+i+4);      \
-      XMM0 = _mm_mul_ps(XMM0, XMM7);            \
-      XMM1 = _mm_mul_ps(XMM1, XMM7);            \
-      _mm_storeu_ps((y)+i  , XMM0);             \
-      _mm_storeu_ps((y)+i+4, XMM1);             \
-    }                                           \
-    long off = (n) - ((n)%8);                   \
-    for (i=0; i<((n)%8); i++) {                 \
-      y[off+i] *= c;                            \
-    }                                           \
-  }
-
-#define THFloatVector_mul(y, x, n) {            \
-    long i;                                     \
-    for (i=0; i<=((n)-16); i+=16) {             \
-      __m128 XMM0 = _mm_loadu_ps((x)+i   );     \
-      __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);     \
-      __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);     \
-      __m128 XMM3 = _mm_loadu_ps((x)+i+12);     \
-      __m128 XMM4 = _mm_loadu_ps((y)+i   );     \
-      __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);     \
-      __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);     \
-      __m128 XMM7 = _mm_loadu_ps((y)+i+12);     \
-      XMM4 = _mm_mul_ps(XMM4, XMM0);            \
-      XMM5 = _mm_mul_ps(XMM5, XMM1);            \
-      XMM6 = _mm_mul_ps(XMM6, XMM2);            \
-      XMM7 = _mm_mul_ps(XMM7, XMM3);            \
-      _mm_storeu_ps((y)+i   , XMM4);            \
-      _mm_storeu_ps((y)+i+ 4, XMM5);            \
-      _mm_storeu_ps((y)+i+ 8, XMM6);            \
-      _mm_storeu_ps((y)+i+12, XMM7);            \
-    }                                           \
-    long off = (n) - ((n)%16);                  \
-    for (i=0; i<((n)%16); i++) {                \
-      y[off+i] *= x[off+i];                     \
-    }                                           \
-  }
-
-#elif defined __NEON__
-/* ARM NEON Assembly routine for operating on floats */
-
-#define THFloatVector_fill(x, c, n) {                   \
-        float ctemp = c;                                \
-        float * caddr = &ctemp;                         \
-        __asm__ __volatile__ (                          \
-            "mov         r0, %0           @ \n\t"       \
-            "ldr         r4, [%1]         @ \n\t"       \
-            "vdup.32     q12, r4          @ \n\t"       \
-            "vdup.32     q13, r4          @ \n\t"       \
-            "lsrs        r4, %2, #3       @ \n\t"       \
-            "beq         3f               @ \n\t"       \
-            "1:                           @ \n\t"       \
-            "vst1.32     {d24-d27}, [r0]! @ \n\t"       \
-            "subs        r4, r4, #1       @ \n\t"       \
-            "bne         1b               @ \n\t"       \
-            "3:                           @ \n\t"       \
-            "ands        r4, %2, #7       @ \n\t"       \
-            "beq         5f               @ \n\t"       \
-            "4:                           @ \n\t"       \
-            "subs        r4, r4, #1       @ \n\t"       \
-            "vst1.32     {d24[0]}, [r0]!  @ \n\t"       \
-            "bne         4b               @ \n\t"       \
-            "5:                           @ "           \
-            :                                           \
-            :"r" (x), "r"(caddr),"r"(n)                 \
-            : "cc", "r0", "r4",  "memory",              \
-              "q12",                                    \
-              "d24", "d25", "d26", "d27"                \
-            );                                          \
-    }
-
-#define THFloatVector_diff(z, x, y, n) {                                \
-        __asm__ __volatile__ (                                          \
-            "mov         r0, %2           @ \n\t"                       \
-            "mov         r1, %1           @ \n\t"                       \
-            "mov         r2, %0           @ \n\t"                       \
-            "lsrs        r4, %3, #3       @ \n\t"                       \
-            "beq         3f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "1:                           @ \n\t"                       \
-            "vsub.f32    q12, q8, q0      @ \n\t"                       \
-            "vsub.f32    q13, q9, q1      @ \n\t"                       \
-            "subs        r4, r4, #1       @ \n\t"                       \
-            "beq         2f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vst1.32     {d24-d27}, [r2]! @ \n\t"                       \
-            "b           1b               @ \n\t"                       \
-            "2:                           @ \n\t"                       \
-            "vst1.32     {d24-d27}, [r2]! @ \n\t"                       \
-            "3:                           @ \n\t"                       \
-            "ands        r4, %3, #7       @ \n\t"                       \
-            "beq         5f               @ \n\t"                       \
-            "4:                           @ \n\t"                       \
-            "subs        r4, r4, #1       @ \n\t"                       \
-            "vld1.32     {d16[0]}, [r1]!  @ \n\t"                       \
-            "vld1.32     {d0[0]}, [r0]!   @ \n\t"                       \
-            "vsub.f32    d24, d16, d0     @ \n\t"                       \
-            "vst1.32     {d24[0]}, [r2]!  @ \n\t"                       \
-            "bne         4b               @ \n\t"                       \
-            "5:                           @ "                           \
-            :                                                           \
-            :"r" (z), "r" (x),"r" (y), "r"(n)                           \
-            : "cc", "r0", "r1", "r2", "r4", "memory",                   \
-              "q0", "q1", "q8", "q9", "q12", "q13",                     \
-              "d0", "d1", "d2", "d3",                                   \
-              "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"    \
-            );                                                          \
-    }
-
-#define THFloatVector_scale(y, c, n) {                                  \
-        float ctemp = c;                                                \
-        float * caddr = &ctemp;                                         \
-        __asm__ __volatile__ (                                          \
-            "mov         r0, %0           @ \n\t"                       \
-            "mov         r2, r0           @ \n\t"                       \
-            "ldr         r5, [%1]         @ \n\t"                       \
-            "vdup.32     q14, r5          @ \n\t"                       \
-            "lsrs        r5, %2, #5       @ \n\t"                       \
-            "beq         3f               @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
-            "vld1.32     {d8-d11}, [r0]!  @ \n\t"                       \
-            "vld1.32     {d12-d15}, [r0]! @ \n\t"                       \
-            "1:                           @ \n\t"                       \
-            "vmul.f32    q0, q0, q14      @ \n\t"                       \
-            "vmul.f32    q1, q1, q14      @ \n\t"                       \
-            "vmul.f32    q2, q2, q14      @ \n\t"                       \
-            "vmul.f32    q3, q3, q14      @ \n\t"                       \
-            "vmul.f32    q4, q4, q14      @ \n\t"                       \
-            "vmul.f32    q5, q5, q14      @ \n\t"                       \
-            "vmul.f32    q6, q6, q14      @ \n\t"                       \
-            "vmul.f32    q7, q7, q14      @ \n\t"                       \
-            "subs        r5, r5, #1       @ \n\t"                       \
-            "beq         2f               @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
-            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
-            "vst1.32     {d8-d11}, [r2]!  @ \n\t"                       \
-            "vld1.32     {d8-d11}, [r0]!  @ \n\t"                       \
-            "vst1.32     {d12-d15}, [r2]! @ \n\t"                       \
-            "vld1.32     {d12-d15}, [r0]! @ \n\t"                       \
-            "b           1b               @ \n\t"                       \
-            "2:                           @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
-            "vst1.32     {d8-d11}, [r2]!  @ \n\t"                       \
-            "vst1.32     {d12-d15}, [r2]! @ \n\t"                       \
-            "3:                           @ \n\t"                       \
-            "lsrs        r5, %2, #4       @ \n\t"                       \
-            "ands        r5, r5, #1       @ \n\t"                       \
-            "beq         4f               @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
-            "vmul.f32    q0, q0, q14      @ \n\t"                       \
-            "vmul.f32    q1, q1, q14      @ \n\t"                       \
-            "vmul.f32    q2, q2, q14      @ \n\t"                       \
-            "vmul.f32    q3, q3, q14      @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
-            "4:                           @ \n\t"                       \
-            "lsrs        r5, %2, #3       @ \n\t"                       \
-            "ands        r5, r5, #1       @ \n\t"                       \
-            "beq         5f               @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vmul.f32    q0, q0, q14      @ \n\t"                       \
-            "vmul.f32    q1, q1, q14      @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "5:                           @ \n\t"                       \
-            "ands        r5, %2, #7       @ \n\t"                       \
-            "beq         7f               @ \n\t"                       \
-            "6:                           @ \n\t"                       \
-            "subs        r5, r5, #1       @ \n\t"                       \
-            "vld1.32     d0[0], [r0]!     @ \n\t"                       \
-            "vmul.f32    d0, d0, d28      @ \n\t"                       \
-            "vst1.32     d0[0], [r2]!     @ \n\t"                       \
-            "bne         6b               @ \n\t"                       \
-            "7:                           @ "                           \
-            :                                                           \
-            :"r" (y), "r"(caddr),"r"(n)                                 \
-            : "cc", "r0", "r2", "r5", "memory",                         \
-              "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14",    \
-              "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",           \
-              "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",     \
-              "d28", "d29"                                              \
-            );                                                          \
-    }
-
-#define THFloatVector_mul(y, x, n) {                                    \
-        __asm__ __volatile__ (                                          \
-            "mov         r0, %0           @ \n\t"                       \
-            "mov         r1, %1           @ \n\t"                       \
-            "mov         r2, r0           @ \n\t"                       \
-            "lsrs        r4, %2, #3       @ \n\t"                       \
-            "beq         3f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "1:                           @ \n\t"                       \
-            "vmul.f32    q12, q8, q0      @ \n\t"                       \
-            "vmul.f32    q13, q9, q1      @ \n\t"                       \
-            "subs        r4, r4, #1       @ \n\t"                       \
-            "beq         2f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vst1.32     {d24-d27}, [r2]! @ \n\t"                       \
-            "b           1b               @ \n\t"                       \
-            "2:                           @ \n\t"                       \
-            "vst1.32     {d24-d27}, [r2]! @ \n\t"                       \
-            "3:                           @ \n\t"                       \
-            "ands        r4, %2, #7       @ \n\t"                       \
-            "beq         5f               @ \n\t"                       \
-            "4:                           @ \n\t"                       \
-            "subs        r4, r4, #1       @ \n\t"                       \
-            "vld1.32     {d16[0]}, [r1]!  @ \n\t"                       \
-            "vld1.32     {d0[0]}, [r0]!   @ \n\t"                       \
-            "vmul.f32    q12, q8, q0      @ \n\t"                       \
-            "vst1.32     {d24[0]}, [r2]!  @ \n\t"                       \
-            "bne         4b               @ \n\t"                       \
-            "5:                           @ "                           \
-            :                                                           \
-            :"r" (y),"r" (x),"r"(n)                                     \
-            : "cc", "r0", "r1", "r2", "r4", "memory",                   \
-              "q0", "q1", "q8", "q9", "q12", "q13",                     \
-              "d0", "d1", "d2", "d3",                                   \
-              "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"    \
-            );                                                          \
-    }
-#define THFloatVector_add(y, x, c, n) {                                 \
-        float ctemp = c;                                                \
-        float * caddr = &ctemp;                                         \
-        __asm__ __volatile__ (                                          \
-            "mov         r0, %0           @ \n\t"                       \
-            "mov         r1, %1           @ \n\t"                       \
-            "mov         r2, r0           @ \n\t"                       \
-            "ldr         r5, [%2]         @ \n\t"                       \
-            "vdup.32     q14, r5          @ \n\t"                       \
-            "lsrs        r5, %3, #4       @ \n\t"                       \
-            "beq         3f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vld1.32     {d20-d23}, [r1]! @ \n\t"                       \
-            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
-            "1:                           @ \n\t"                       \
-            "vmla.f32    q0, q8, q14      @ \n\t"                       \
-            "vmla.f32    q1, q9, q14      @ \n\t"                       \
-            "vmla.f32    q2, q10, q14     @ \n\t"                       \
-            "vmla.f32    q3, q11, q14     @ \n\t"                       \
-            "subs        r5, r5, #1       @ \n\t"                       \
-            "beq         2f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d20-d23}, [r1]! @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
-            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
-            "b           1b               @ \n\t"                       \
-            "2:                           @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
-            "3:                           @ \n\t"                       \
-            "lsrs        r5, %3, #3       @ \n\t"                       \
-            "ands        r5, #1           @ \n\t"                       \
-            "beq         4f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vmla.f32    q0, q8, q14      @ \n\t"                       \
-            "vmla.f32    q1, q9, q14      @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "4:                           @ \n\t"                       \
-            "ands        r5, %3, #7       @ \n\t"                       \
-            "beq         6f               @ \n\t"                       \
-            "5:                           @ \n\t"                       \
-            "subs        r5, r5, #1       @ \n\t"                       \
-            "vld1.32     {d16[0]}, [r1]!  @ \n\t"                       \
-            "vld1.32     {d0[0]}, [r0]!   @ \n\t"                       \
-            "vmla.f32    d0, d16, d28     @ \n\t"                       \
-            "vst1.32     d0[0], [r2]!     @ \n\t"                       \
-            "bne         5b               @ \n\t"                       \
-            "6:                           @ "                           \
-            :                                                           \
-            :"r" (y),"r" (x), "r"(caddr),"r"(n)                         \
-            : "cc", "r0", "r1", "r2", "r5", "memory",                   \
-              "q0", "q1", "q2", "q3", "q14",                            \
-              "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",           \
-              "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29" \
-            );                                                          \
-    }
-
-static inline void THDoubleVector_fill(double *x, const double c, const long n) {
-  long i = 0;
-
-  for(; i < n-4; i += 4)
-  {
-    x[i] = c;
-    x[i+1] = c;
-    x[i+2] = c;
-    x[i+3] = c;
-  }
-
-  for(; i < n; i++)
-    x[i] = c;
-}
-
-static inline void THDoubleVector_add(double *y, const double *x, const double c, const long n)
-{
-  long i = 0;
-
-  for(;i < n-4; i += 4)
-  {
-    y[i] += c * x[i];
-    y[i+1] += c * x[i+1];
-    y[i+2] += c * x[i+2];
-    y[i+3] += c * x[i+3];
-  }
-
-  for(; i < n; i++)
-    y[i] += c * x[i];
-}
-
-static inline void THDoubleVector_diff(double *z, const double *x, const double *y, const long n)
-{
-  long i = 0;
-
-  for(; i < n-4; i += 4)
-  {
-    z[i] = x[i] - y[i];
-    z[i+1] = x[i+1] - y[i+1];
-    z[i+2] = x[i+2] - y[i+2];
-    z[i+3] = x[i+3] - y[i+3];
-  }
-
-  for(; i < n; i++)
-    z[i] = x[i] - y[i];
-}
-
-static inline void THDoubleVector_scale(double *y, const double c, const long n)
-{
-  long i = 0;
-
-  for(; i < n-4; i +=4)
-  {
-    y[i] *= c;
-    y[i+1] *= c;
-    y[i+2] *= c;
-    y[i+3] *= c;
-  }
-
-  for(; i < n; i++)
-    y[i] *= c;
-}
-
-static inline void THDoubleVector_mul(double *y, const double *x, const long n)
-{
-  long i = 0;
-
-  for(; i < n-4; i += 4)
-  {
-    y[i] *= x[i];
-    y[i+1] *= x[i+1];
-    y[i+2] *= x[i+2];
-    y[i+3] *= x[i+3];
-  }
-
-  for(; i < n; i++)
-    y[i] *= x[i];
-}
-
-
-#else
-
-/* If SSE2 not defined, then generate plain C operators */
-#include "generic/THVector.c"
-#include "THGenerateFloatTypes.h"
-
-#endif
-
-/* For non-float types, generate plain C operators */
-#include "generic/THVector.c"
-#include "THGenerateIntTypes.h"
-
-#endif
+#endif // TH_VECTOR_INC
diff --git a/lib/TH/generic/THStorage.c b/lib/TH/generic/THStorage.c
index cac043e..788f6c7 100644
--- a/lib/TH/generic/THStorage.c
+++ b/lib/TH/generic/THStorage.c
@@ -41,9 +41,9 @@ THStorage* THStorage_(newWithAllocator)(long size,
   return storage;
 }
 
-THStorage* THStorage_(newWithMapping)(const char *filename, long size, int shared)
+THStorage* THStorage_(newWithMapping)(const char *filename, long size, int flags)
 {
-  THMapAllocatorContext *ctx = THMapAllocatorContext_new(filename, shared);
+  THMapAllocatorContext *ctx = THMapAllocatorContext_new(filename, flags);
 
   THStorage *storage = THStorage_(newWithAllocator)(size,
                                                     &THMapAllocator,
@@ -203,4 +203,24 @@ real THStorage_(get)(const THStorage *self, long idx)
   return self->data[idx];
 }
 
+void THStorage_(swap)(THStorage *storage1, THStorage *storage2)
+{
+#define SWAP(val) { val = storage1->val; storage1->val = storage2->val; storage2->val = val; }
+    real *data;
+    long size;
+    char flag;
+    THAllocator *allocator;
+    void *allocatorContext;
+    struct THStorage *view;
+
+    SWAP(data);
+    SWAP(size);
+    SWAP(flag);
+    // don't swap refcount!
+    SWAP(allocator);
+    SWAP(allocatorContext);
+    SWAP(view);
+#undef SWAP
+}
+
 #endif
diff --git a/lib/TH/generic/THStorage.h b/lib/TH/generic/THStorage.h
index 79013d8..0f6dcca 100644
--- a/lib/TH/generic/THStorage.h
+++ b/lib/TH/generic/THStorage.h
@@ -46,7 +46,7 @@ TH_API THStorage* THStorage_(newWithSize1)(real);
 TH_API THStorage* THStorage_(newWithSize2)(real, real);
 TH_API THStorage* THStorage_(newWithSize3)(real, real, real);
 TH_API THStorage* THStorage_(newWithSize4)(real, real, real, real);
-TH_API THStorage* THStorage_(newWithMapping)(const char *filename, long size, int shared);
+TH_API THStorage* THStorage_(newWithMapping)(const char *filename, long size, int flags);
 
 /* takes ownership of data */
 TH_API THStorage* THStorage_(newWithData)(real *data, long size);
@@ -61,6 +61,7 @@ TH_API THStorage* THStorage_(newWithDataAndAllocator)(
 TH_API void THStorage_(setFlag)(THStorage *storage, const char flag);
 TH_API void THStorage_(clearFlag)(THStorage *storage, const char flag);
 TH_API void THStorage_(retain)(THStorage *storage);
+TH_API void THStorage_(swap)(THStorage *storage1, THStorage *storage2);
 
 /* might differ with other API (like CUDA) */
 TH_API void THStorage_(free)(THStorage *storage);
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index c3da469..cae5959 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -823,8 +823,6 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor
     THTensor_(copy)(r_, t);
   }
 
-/*  printf("%ldx%ld = %ldx%ld X %ldx%ld\n", r_->size[0], r_->size[1], m1->size[0], m1->size[1], m2->size[0], m2->size[1]); */
-
   /* r_ */
   if(r_->stride[0] == 1 &&
      r_->stride[1] != 0)
@@ -1937,7 +1935,7 @@ void THTensor_(tril)(THTensor *r_, THTensor *t, long k)
   for(r = 0; r < t_size_0; r++)
   {
     long sz = THMin(r+k+1, t_size_1);
-    for(c = THMax(0, r+k); c < t_size_1; c++)
+    for(c = THMax(0, r+k+1); c < t_size_1; c++)
       r__data[r*r__stride_0+c*r__stride_1] = 0;
     for(c = 0; c < sz; c++)
       r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1];
@@ -2066,30 +2064,26 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb)
   void THTensor_(NAME##Value)(THByteTensor *r_, THTensor* t, real value)	\
   {									\
     THByteTensor_rawResize(r_, t->nDimension, t->size, NULL);		\
-    THByteTensor_zero(r_);						\
     TH_TENSOR_APPLY2(unsigned char, r_, real, t,			\
-		     if (*t_data OP value) *r__data = 1;);		\
+		     *r__data = (*t_data OP value) ? 1 : 0;); \
   }									\
   void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, real value)	\
   {									\
     THTensor_(rawResize)(r_, t->nDimension, t->size, NULL);		\
-    THTensor_(zero)(r_);						\
     TH_TENSOR_APPLY2(real, r_, real, t,					\
-		     if (*t_data OP value) *r__data = 1;);		\
+		     *r__data = (*t_data OP value) ? 1 : 0;); \
   }									\
   void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \
   {									\
     THByteTensor_rawResize(r_, ta->nDimension, ta->size, NULL);		\
-    THByteTensor_zero(r_);						\
     TH_TENSOR_APPLY3(unsigned char, r_, real, ta, real, tb,		\
-		     if(*ta_data OP *tb_data) *r__data = 1;);		\
+		     *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
   }									\
   void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \
   {									\
     THTensor_(rawResize)(r_, ta->nDimension, ta->size, NULL);		\
-    THTensor_(zero)(r_);						\
     TH_TENSOR_APPLY3(real, r_, real, ta, real, tb,			\
-		     if(*ta_data OP *tb_data) *r__data = 1;);		\
+		     *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
   }									\
 
 
diff --git a/lib/TH/generic/THTensorRandom.c b/lib/TH/generic/THTensorRandom.c
index f8097c8..514d3dd 100644
--- a/lib/TH/generic/THTensorRandom.c
+++ b/lib/TH/generic/THTensorRandom.c
@@ -119,7 +119,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso
     THArgCheckWithCleanup((sum > 0), THCleanup(THDoubleTensor_free(cum_dist);), 2,
                           "invalid multinomial distribution (sum of probabilities <= 0)");
     /* normalize cumulative probability distribution so that last val is 1
-    i.e. dosen't assume original prob_dist row sums to one */
+    i.e. doesn't assume original prob_dist row sums to one */
     if ( (sum > 0) || ( ( sum < 1.00001) && (sum > 0.99999) ) )
     {
       for (j=0; j<n_categories; j++)
diff --git a/lib/TH/generic/THVector.h b/lib/TH/generic/THVector.h
new file mode 100644
index 0000000..09067e5
--- /dev/null
+++ b/lib/TH/generic/THVector.h
@@ -0,0 +1,14 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THVector.h"
+#else
+
+TH_API void THVector_(fill)(real *x, const real c, const long n);
+TH_API void THVector_(add)(real *y, const real *x, const real c, const long n);
+TH_API void THVector_(diff)(real *z, const real *x, const real *y, const long n);
+TH_API void THVector_(scale)(real *y, const real c, const long n);
+TH_API void THVector_(mul)(real *y, const real *x, const long n);
+
+/* Initialize the dispatch pointers */
+TH_API void THVector_(vectorDispatchInit)();
+
+#endif
diff --git a/lib/TH/generic/THVector.c b/lib/TH/generic/THVectorDefault.c
similarity index 67%
rename from lib/TH/generic/THVector.c
rename to lib/TH/generic/THVectorDefault.c
index 6c8a96b..d51be03 100644
--- a/lib/TH/generic/THVector.c
+++ b/lib/TH/generic/THVectorDefault.c
@@ -1,8 +1,8 @@
 #ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/THVector.c"
+#define TH_GENERIC_FILE "generic/THVectorDefault.c"
 #else
 
-static TH_INLINE void THVector_(fill)(real *x, const real c, const long n) {
+void THVector_(fill_DEFAULT)(real *x, const real c, const long n) {
   long i = 0;
 
   for(; i < n-4; i += 4)
@@ -17,7 +17,7 @@ static TH_INLINE void THVector_(fill)(real *x, const real c, const long n) {
     x[i] = c;
 }
 
-static TH_INLINE void THVector_(add)(real *y, const real *x, const real c, const long n)
+void THVector_(add_DEFAULT)(real *y, const real *x, const real c, const long n)
 {
   long i = 0;
 
@@ -33,7 +33,7 @@ static TH_INLINE void THVector_(add)(real *y, const real *x, const real c, const
     y[i] += c * x[i];
 }
 
-static TH_INLINE void THVector_(diff)(real *z, const real *x, const real *y, const long n)
+void THVector_(diff_DEFAULT)(real *z, const real *x, const real *y, const long n)
 {
   long i = 0;
 
@@ -49,7 +49,7 @@ static TH_INLINE void THVector_(diff)(real *z, const real *x, const real *y, con
     z[i] = x[i] - y[i];
 }
 
-static TH_INLINE void THVector_(scale)(real *y, const real c, const long n)
+void THVector_(scale_DEFAULT)(real *y, const real c, const long n)
 {
   long i = 0;
 
@@ -65,7 +65,7 @@ static TH_INLINE void THVector_(scale)(real *y, const real c, const long n)
     y[i] *= c;
 }
 
-static TH_INLINE void THVector_(mul)(real *y, const real *x, const long n)
+void THVector_(mul_DEFAULT)(real *y, const real *x, const long n)
 {
   long i = 0;
 
diff --git a/lib/TH/generic/THVectorDispatch.c b/lib/TH/generic/THVectorDispatch.c
new file mode 100644
index 0000000..f16bcda
--- /dev/null
+++ b/lib/TH/generic/THVectorDispatch.c
@@ -0,0 +1,140 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THVectorDispatch.c"
+#else
+
+/* For now there are only SIMD implementations for FLOAT and DOUBLE.
+ * Hopefully in the future this can be made totally generic (e.g, there are SIMD implementations
+ * for a lot of functions */
+/* Each function with multiple implementations has:
+ * 1. A DISPATCHPTR which will be initialized to point to the best available implementation for the host
+ * 2. A DISPATCHTABLE which holds pointers to each implementation of a function, and a value indicating
+ *    which SIMD extension a given implementation uses
+ * 3. A dispatch stub, which is what is actually called by clients, that simply wraps the dispatch pointer.
+ */
+
+static void (*THVector_(fill_DISPATCHPTR))(real *, const real, const long) = &THVector_(fill_DEFAULT);
+static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+    #if defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(fill_NEON), SIMDExtension_NEON), /* ',' not ';' inside initializer list */
+    #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+          || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(fill_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+  FUNCTION_IMPL(THVector_(fill_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(fill)(real *x, const real c, const long n) {
+  THVector_(fill_DISPATCHPTR)(x, c, n);
+}
+
+
+static void (*THVector_(add_DISPATCHPTR))(real *, const real *, const real, const long) = &THVector_(add_DEFAULT);
+static FunctionDescription THVector_(add_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+    #if defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(add_NEON), SIMDExtension_NEON), /* ',' not ';' inside initializer list */
+    #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+          || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(add_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(add_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(add)(real *y, const real *x, const real c, const long n) {
+  THVector_(add_DISPATCHPTR)(y, x, c, n);
+}
+
+
+static void (*THVector_(diff_DISPATCHPTR))(real *, const real *, const real *, const long) = &THVector_(diff_DEFAULT);
+static FunctionDescription THVector_(diff_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+    #if defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(diff_NEON), SIMDExtension_NEON), /* ',' not ';' inside initializer list */
+    #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+          || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(diff_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(diff_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(diff)(real *z, const real *x, const real *y, const long n) {
+  THVector_(diff_DISPATCHPTR)(z, x, y, n);
+}
+
+
+static void (*THVector_(scale_DISPATCHPTR))(real *, const real, const long) = &THVector_(scale_DEFAULT);
+static FunctionDescription THVector_(scale_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+    #if defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(scale_NEON), SIMDExtension_NEON), /* ',' not ';' inside initializer list */
+    #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+          || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(scale_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(scale_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(scale)(real *y, const real c, const long n) { /* TH_API belongs on the declaration only */
+  THVector_(scale_DISPATCHPTR)(y, c, n);
+}
+
+
+static void (*THVector_(mul_DISPATCHPTR))(real *, const real *, const long) = &THVector_(mul_DEFAULT);
+static FunctionDescription THVector_(mul_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+    #if defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(mul_NEON), SIMDExtension_NEON), /* ',' not ';' inside initializer list */
+    #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+          || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(mul_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(mul_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(mul)(real *y, const real *x, const long n) {
+  THVector_(mul_DISPATCHPTR)(y, x, n); /* was a bare pointer expression: mul() did nothing */
+}
+
+/* This needs to be called in order to initialize the dispatch pointers at runtime.
+ * This function simply checks what SIMD extensions are available, and then walks the dispatch table
+ * to choose the best function.
+ * NOTE: As implemented, it will initialize the dispatch pointer to the first supported function.
+ *       This means that in the dispatch tables, implementations supporting more recent extensions
+ *       need to come first
+ */
+void THVector_(vectorDispatchInit)()
+{
+  uint32_t hostSimdExts = detectHostSIMDExtensions();
+  INIT_DISPATCH_PTR(fill);
+  INIT_DISPATCH_PTR(add);
+  INIT_DISPATCH_PTR(diff);
+  INIT_DISPATCH_PTR(scale);
+  INIT_DISPATCH_PTR(mul);
+}
+
+#endif
diff --git a/lib/TH/generic/simd/simd.h b/lib/TH/generic/simd/simd.h
new file mode 100644
index 0000000..e4660b1
--- /dev/null
+++ b/lib/TH/generic/simd/simd.h
@@ -0,0 +1,91 @@
+#ifndef TH_SIMD_INC
+#define TH_SIMD_INC
+
+#include <stdint.h>
+
+// Can be found on Intel ISA Reference for CPUID
+#define CPUID_AVX2_BIT 0x20       // Bit 5 of EBX for EAX=0x7 (0x10 was bit 4)
+#define CPUID_AVX_BIT  0x10000000 // Bit 28 of ECX for EAX=0x1
+#define CPUID_SSE_BIT  0x2000000  // bit 25 of EDX for EAX=0x1
+
+// Helper macros for initialization
+#define FUNCTION_IMPL(NAME, EXT) \
+    { .function=(void *)NAME,    \
+      .supportedSimdExt=EXT      \
+    }
+
+#define INIT_DISPATCH_PTR(OP)    \
+  do {                           \
+    int i;                       \
+    for (i = 0; i < sizeof(THVector_(OP ## _DISPATCHTABLE)) / sizeof(FunctionDescription); ++i) { \
+      THVector_(OP ## _DISPATCHPTR) = THVector_(OP ## _DISPATCHTABLE)[i].function;                     \
+      if (THVector_(OP ## _DISPATCHTABLE)[i].supportedSimdExt & hostSimdExts) {                       \
+        break;                                                                                     \
+      }                                                                                            \
+    }                                                                                              \
+  } while(0)
+
+
+typedef struct FunctionDescription
+{
+  void *function;
+  uint32_t supportedSimdExt;
+} FunctionDescription;
+
+
+enum SIMDExtensions
+{
+#if defined(__NEON__)
+  SIMDExtension_NEON    = 0x1,
+#else
+  SIMDExtension_AVX2    = 0x1,
+  SIMDExtension_AVX     = 0x2,
+  SIMDExtension_SSE     = 0x4,
+#endif
+  SIMDExtension_DEFAULT = 0x0
+};
+
+#if defined(__NEON__)
+
+static inline uint32_t detectHostSIMDExtensions()
+{
+  return SIMDExtension_NEON;
+}
+
+#else // x86
+
+static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+  uint32_t a = *eax, b, c = 0, d; /* ECX must be defined: leaf 7 takes sub-leaf 0 in ECX */
+  asm volatile ( "cpuid\n\t"
+                 : "+a"(a), "=b"(b), "+c"(c), "+d"(d) );
+  *eax = a;
+  *ebx = b;
+  *ecx = c;
+  *edx = d;
+}
+
+static inline uint32_t detectHostSIMDExtensions()
+{
+  uint32_t eax, ebx, ecx, edx;
+  uint32_t hostSimdExts = 0x0;
+
+  // Check for AVX2. Requires separate CPUID
+  eax = 0x7;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  if (ebx & CPUID_AVX2_BIT)
+    hostSimdExts |= SIMDExtension_AVX2;
+
+  eax = 0x1;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  if (ecx & CPUID_AVX_BIT)
+    hostSimdExts |= SIMDExtension_AVX;
+  if (edx & CPUID_SSE_BIT)
+    hostSimdExts |= SIMDExtension_SSE;
+
+  return hostSimdExts;
+}
+
+#endif // end x86 SIMD extension detection code
+
+#endif
diff --git a/lib/TH/vector/NEON.c b/lib/TH/vector/NEON.c
new file mode 100644
index 0000000..9d65550
--- /dev/null
+++ b/lib/TH/vector/NEON.c
@@ -0,0 +1,252 @@
+static void THFloatVector_fill_NEON(float *x, const float c, const long n) {
+  float ctemp = c;
+  float * caddr = &ctemp;
+  __asm__ __volatile__ (
+      "mov         r0, %0           @ \n\t"
+      "ldr         r4, [%1]         @ \n\t"
+      "vdup.32     q12, r4          @ \n\t"
+      "vdup.32     q13, r4          @ \n\t"
+      "lsrs        r4, %2, #3       @ \n\t"
+      "beq         3f               @ \n\t"
+      "1:                           @ \n\t"
+      "vst1.32     {d24-d27}, [r0]! @ \n\t"
+      "subs        r4, r4, #1       @ \n\t"
+      "bne         1b               @ \n\t"
+      "3:                           @ \n\t"
+      "ands        r4, %2, #7       @ \n\t"
+      "beq         5f               @ \n\t"
+      "4:                           @ \n\t"
+      "subs        r4, r4, #1       @ \n\t"
+      "vst1.32     {d24[0]}, [r0]!  @ \n\t"
+      "bne         4b               @ \n\t"
+      "5:                           @ "
+      :
+      :"r" (x), "r"(caddr),"r"(n)
+      : "cc", "r0", "r4",  "memory",
+        "q12",
+        "d24", "d25", "d26", "d27"
+      );
+}
+
+
+static void THFloatVector_diff_NEON(float *z, const float *x, const float *y, const long n) {
+  __asm__ __volatile__ (
+      "mov         r0, %2           @ \n\t"
+      "mov         r1, %1           @ \n\t"
+      "mov         r2, %0           @ \n\t"
+      "lsrs        r4, %3, #3       @ \n\t"
+      "beq         3f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "1:                           @ \n\t"
+      "vsub.f32    q12, q8, q0      @ \n\t"
+      "vsub.f32    q13, q9, q1      @ \n\t"
+      "subs        r4, r4, #1       @ \n\t"
+      "beq         2f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vst1.32     {d24-d27}, [r2]! @ \n\t"
+      "b           1b               @ \n\t"
+      "2:                           @ \n\t"
+      "vst1.32     {d24-d27}, [r2]! @ \n\t"
+      "3:                           @ \n\t"
+      "ands        r4, %3, #7       @ \n\t"
+      "beq         5f               @ \n\t"
+      "4:                           @ \n\t"
+      "subs        r4, r4, #1       @ \n\t"
+      "vld1.32     {d16[0]}, [r1]!  @ \n\t"
+      "vld1.32     {d0[0]}, [r0]!   @ \n\t"
+      "vsub.f32    d24, d16, d0     @ \n\t"
+      "vst1.32     {d24[0]}, [r2]!  @ \n\t"
+      "bne         4b               @ \n\t"
+      "5:                           @ "
+      :
+      :"r" (z), "r" (x),"r" (y), "r"(n)
+      : "cc", "r0", "r1", "r2", "r4", "memory",
+        "q0", "q1", "q8", "q9", "q12", "q13",
+        "d0", "d1", "d2", "d3",
+        "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
+      );
+}
+
+
+static void THFloatVector_scale_NEON(float *y, const float c, const long n) {
+  float ctemp = c;
+  float * caddr = &ctemp;
+  __asm__ __volatile__ (
+      "mov         r0, %0           @ \n\t"
+      "mov         r2, r0           @ \n\t"
+      "ldr         r5, [%1]         @ \n\t"
+      "vdup.32     q14, r5          @ \n\t"
+      "lsrs        r5, %2, #5       @ \n\t"
+      "beq         3f               @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vld1.32     {d4-d7}, [r0]!   @ \n\t"
+      "vld1.32     {d8-d11}, [r0]!  @ \n\t"
+      "vld1.32     {d12-d15}, [r0]! @ \n\t"
+      "1:                           @ \n\t"
+      "vmul.f32    q0, q0, q14      @ \n\t"
+      "vmul.f32    q1, q1, q14      @ \n\t"
+      "vmul.f32    q2, q2, q14      @ \n\t"
+      "vmul.f32    q3, q3, q14      @ \n\t"
+      "vmul.f32    q4, q4, q14      @ \n\t"
+      "vmul.f32    q5, q5, q14      @ \n\t"
+      "vmul.f32    q6, q6, q14      @ \n\t"
+      "vmul.f32    q7, q7, q14      @ \n\t"
+      "subs        r5, r5, #1       @ \n\t"
+      "beq         2f               @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vst1.32     {d4-d7}, [r2]!   @ \n\t"
+      "vld1.32     {d4-d7}, [r0]!   @ \n\t"
+      "vst1.32     {d8-d11}, [r2]!  @ \n\t"
+      "vld1.32     {d8-d11}, [r0]!  @ \n\t"
+      "vst1.32     {d12-d15}, [r2]! @ \n\t"
+      "vld1.32     {d12-d15}, [r0]! @ \n\t"
+      "b           1b               @ \n\t"
+      "2:                           @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "vst1.32     {d4-d7}, [r2]!   @ \n\t"
+      "vst1.32     {d8-d11}, [r2]!  @ \n\t"
+      "vst1.32     {d12-d15}, [r2]! @ \n\t"
+      "3:                           @ \n\t"
+      "lsrs        r5, %2, #4       @ \n\t"
+      "ands        r5, r5, #1       @ \n\t"
+      "beq         4f               @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vld1.32     {d4-d7}, [r0]!   @ \n\t"
+      "vmul.f32    q0, q0, q14      @ \n\t"
+      "vmul.f32    q1, q1, q14      @ \n\t"
+      "vmul.f32    q2, q2, q14      @ \n\t"
+      "vmul.f32    q3, q3, q14      @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "vst1.32     {d4-d7}, [r2]!   @ \n\t"
+      "4:                           @ \n\t"
+      "lsrs        r5, %2, #3       @ \n\t"
+      "ands        r5, r5, #1       @ \n\t"
+      "beq         5f               @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vmul.f32    q0, q0, q14      @ \n\t"
+      "vmul.f32    q1, q1, q14      @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "5:                           @ \n\t"
+      "ands        r5, %2, #7       @ \n\t"
+      "beq         7f               @ \n\t"
+      "6:                           @ \n\t"
+      "subs        r5, r5, #1       @ \n\t"
+      "vld1.32     d0[0], [r0]!     @ \n\t"
+      "vmul.f32    d0, d0, d28      @ \n\t"
+      "vst1.32     d0[0], [r2]!     @ \n\t"
+      "bne         6b               @ \n\t"
+      "7:                           @ "
+      :
+      :"r" (y), "r"(caddr),"r"(n)
+      : "cc", "r0", "r2", "r5", "memory",
+        "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14",
+        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+        "d28", "d29"
+      );
+
+}
+
+static void THFloatVector_mul_NEON(float *y, const float *x, const long n) {
+  __asm__ __volatile__ (
+      "mov         r0, %0           @ \n\t"
+      "mov         r1, %1           @ \n\t"
+      "mov         r2, r0           @ \n\t"
+      "lsrs        r4, %2, #3       @ \n\t"
+      "beq         3f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "1:                           @ \n\t"
+      "vmul.f32    q12, q8, q0      @ \n\t"
+      "vmul.f32    q13, q9, q1      @ \n\t"
+      "subs        r4, r4, #1       @ \n\t"
+      "beq         2f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vst1.32     {d24-d27}, [r2]! @ \n\t"
+      "b           1b               @ \n\t"
+      "2:                           @ \n\t"
+      "vst1.32     {d24-d27}, [r2]! @ \n\t"
+      "3:                           @ \n\t"
+      "ands        r4, %2, #7       @ \n\t"
+      "beq         5f               @ \n\t"
+      "4:                           @ \n\t"
+      "subs        r4, r4, #1       @ \n\t"
+      "vld1.32     {d16[0]}, [r1]!  @ \n\t"
+      "vld1.32     {d0[0]}, [r0]!   @ \n\t"
+      "vmul.f32    q12, q8, q0      @ \n\t"
+      "vst1.32     {d24[0]}, [r2]!  @ \n\t"
+      "bne         4b               @ \n\t"
+      "5:                           @ "
+      :
+      :"r" (y),"r" (x),"r"(n)
+      : "cc", "r0", "r1", "r2", "r4", "memory",
+        "q0", "q1", "q8", "q9", "q12", "q13",
+        "d0", "d1", "d2", "d3",
+        "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
+      );
+}
+
+static void THFloatVector_add_NEON(float *y, const float *x, const float c, const long n) {
+  float ctemp = c;
+  float * caddr = &ctemp;
+  __asm__ __volatile__ (
+      "mov         r0, %0           @ \n\t"
+      "mov         r1, %1           @ \n\t"
+      "mov         r2, r0           @ \n\t"
+      "ldr         r5, [%2]         @ \n\t"
+      "vdup.32     q14, r5          @ \n\t"
+      "lsrs        r5, %3, #4       @ \n\t"
+      "beq         3f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vld1.32     {d20-d23}, [r1]! @ \n\t"
+      "vld1.32     {d4-d7}, [r0]!   @ \n\t"
+      "1:                           @ \n\t"
+      "vmla.f32    q0, q8, q14      @ \n\t"
+      "vmla.f32    q1, q9, q14      @ \n\t"
+      "vmla.f32    q2, q10, q14     @ \n\t"
+      "vmla.f32    q3, q11, q14     @ \n\t"
+      "subs        r5, r5, #1       @ \n\t"
+      "beq         2f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d20-d23}, [r1]! @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vst1.32     {d4-d7}, [r2]!   @ \n\t"
+      "vld1.32     {d4-d7}, [r0]!   @ \n\t"
+      "b           1b               @ \n\t"
+      "2:                           @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "vst1.32     {d4-d7}, [r2]!   @ \n\t"
+      "3:                           @ \n\t"
+      "lsrs        r5, %3, #3       @ \n\t"
+      "ands        r5, #1           @ \n\t"
+      "beq         4f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vmla.f32    q0, q8, q14      @ \n\t"
+      "vmla.f32    q1, q9, q14      @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "4:                           @ \n\t"
+      "ands        r5, %3, #7       @ \n\t"
+      "beq         6f               @ \n\t"
+      "5:                           @ \n\t"
+      "subs        r5, r5, #1       @ \n\t"
+      "vld1.32     {d16[0]}, [r1]!  @ \n\t"
+      "vld1.32     {d0[0]}, [r0]!   @ \n\t"
+      "vmla.f32    d0, d16, d28     @ \n\t"
+      "vst1.32     d0[0], [r2]!     @ \n\t"
+      "bne         5b               @ \n\t"
+      "6:                           @ "
+      :
+      :"r" (y),"r" (x), "r"(caddr),"r"(n)
+      : "cc", "r0", "r1", "r2", "r5", "memory",
+        "q0", "q1", "q2", "q3", "q14",
+        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+        "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29"
+      );
+}
diff --git a/lib/TH/vector/SSE.c b/lib/TH/vector/SSE.c
new file mode 100644
index 0000000..f909907
--- /dev/null
+++ b/lib/TH/vector/SSE.c
@@ -0,0 +1,213 @@
+#include <x86intrin.h>
+
+
+static void THDoubleVector_fill_SSE(double *x, const double c, const long n) {
+  long i;
+  long off;
+  __m128d XMM0 = _mm_set1_pd(c);
+  for (i=0; i<=((n)-8); i+=8) {
+    _mm_storeu_pd((x)+i  , XMM0);
+    _mm_storeu_pd((x)+i+2, XMM0);
+    _mm_storeu_pd((x)+i+4, XMM0);
+    _mm_storeu_pd((x)+i+6, XMM0);
+  }
+  off = (n) - ((n)%8);
+  for (i=0; i<((n)%8); i++) {
+    x[off+i] = c;
+  }
+}
+
+
+static void THDoubleVector_add_SSE(double *y, const double *x, const double c, const long n) {
+  long i = 0;
+  __m128d XMM7 = _mm_set1_pd(c);
+  __m128d XMM0,XMM2;
+  for (; i<=((n)-2); i+=2) {
+    XMM0 = _mm_loadu_pd((x)+i);
+    XMM2 = _mm_loadu_pd((y)+i);
+    XMM0 = _mm_mul_pd(XMM0, XMM7);
+    XMM2 = _mm_add_pd(XMM2, XMM0);
+    _mm_storeu_pd((y)+i  , XMM2);
+  }
+  for (; i<(n); i++) {
+    y[i] += c * x[i];
+  }
+}
+
+
+static void THDoubleVector_diff_SSE(double *z, const double *x, const double *y, const long n) {
+  long i;
+  for (i=0; i<=((n)-8); i+=8) {
+    __m128d XMM0 = _mm_loadu_pd((x)+i  );
+    __m128d XMM1 = _mm_loadu_pd((x)+i+2);
+    __m128d XMM2 = _mm_loadu_pd((x)+i+4);
+    __m128d XMM3 = _mm_loadu_pd((x)+i+6);
+    __m128d XMM4 = _mm_loadu_pd((y)+i  );
+    __m128d XMM5 = _mm_loadu_pd((y)+i+2);
+    __m128d XMM6 = _mm_loadu_pd((y)+i+4);
+    __m128d XMM7 = _mm_loadu_pd((y)+i+6);
+    XMM0 = _mm_sub_pd(XMM0, XMM4);
+    XMM1 = _mm_sub_pd(XMM1, XMM5);
+    XMM2 = _mm_sub_pd(XMM2, XMM6);
+    XMM3 = _mm_sub_pd(XMM3, XMM7);
+    _mm_storeu_pd((z)+i  , XMM0);
+    _mm_storeu_pd((z)+i+2, XMM1);
+    _mm_storeu_pd((z)+i+4, XMM2);
+    _mm_storeu_pd((z)+i+6, XMM3);
+  }
+  long off = (n) - ((n)%8);
+  for (i=0; i<((n)%8); i++) {
+    z[off+i] = x[off+i] - y[off+i];
+  }
+}
+
+
+static void THDoubleVector_scale_SSE(double *y, const double c, const long n) {
+  long i;
+  __m128d XMM7 = _mm_set1_pd(c);
+  for (i=0; i<=((n)-4); i+=4) {
+    __m128d XMM0 = _mm_loadu_pd((y)+i  );
+    __m128d XMM1 = _mm_loadu_pd((y)+i+2);
+    XMM0 = _mm_mul_pd(XMM0, XMM7);
+    XMM1 = _mm_mul_pd(XMM1, XMM7);
+    _mm_storeu_pd((y)+i  , XMM0);
+    _mm_storeu_pd((y)+i+2, XMM1);
+  }
+  long off = (n) - ((n)%4);
+  for (i=0; i<((n)%4); i++) {
+    y[off+i] *= c;
+  }
+}
+
+
+static void THDoubleVector_mul_SSE(double *y, const double *x, const long n) {
+  long i;
+  for (i=0; i<=((n)-8); i+=8) {
+    __m128d XMM0 = _mm_loadu_pd((x)+i  );
+    __m128d XMM1 = _mm_loadu_pd((x)+i+2);
+    __m128d XMM2 = _mm_loadu_pd((x)+i+4);
+    __m128d XMM3 = _mm_loadu_pd((x)+i+6);
+    __m128d XMM4 = _mm_loadu_pd((y)+i  );
+    __m128d XMM5 = _mm_loadu_pd((y)+i+2);
+    __m128d XMM6 = _mm_loadu_pd((y)+i+4);
+    __m128d XMM7 = _mm_loadu_pd((y)+i+6);
+    XMM4 = _mm_mul_pd(XMM4, XMM0);
+    XMM5 = _mm_mul_pd(XMM5, XMM1);
+    XMM6 = _mm_mul_pd(XMM6, XMM2);
+    XMM7 = _mm_mul_pd(XMM7, XMM3);
+    _mm_storeu_pd((y)+i  , XMM4);
+    _mm_storeu_pd((y)+i+2, XMM5);
+    _mm_storeu_pd((y)+i+4, XMM6);
+    _mm_storeu_pd((y)+i+6, XMM7);
+  }
+  long off = (n) - ((n)%8);
+  for (i=0; i<((n)%8); i++) {
+    y[off+i] *= x[off+i];
+  }
+}
+
+
+static void THFloatVector_fill_SSE(float *x, const float c, const long n) {
+  long i;
+  __m128 XMM0 = _mm_set_ps1(c);
+  long off;
+  for (i=0; i<=((n)-16); i+=16) {
+    _mm_storeu_ps((x)+i  ,  XMM0);
+    _mm_storeu_ps((x)+i+4,  XMM0);
+    _mm_storeu_ps((x)+i+8,  XMM0);
+    _mm_storeu_ps((x)+i+12, XMM0);
+  }
+  off = (n) - ((n)%16);
+  for (i=0; i<((n)%16); i++) {
+    x[off+i] = c;
+  }
+}
+
+
+static void THFloatVector_add_SSE(float *y, const float *x, const float c, const long n) {
+  long i = 0;
+  __m128 XMM7 = _mm_set_ps1(c);
+  __m128 XMM0,XMM2;
+  for (; i<=((n)-4); i+=4) {
+    XMM0 = _mm_loadu_ps((x)+i);
+    XMM2 = _mm_loadu_ps((y)+i);
+    XMM0 = _mm_mul_ps(XMM0, XMM7);
+    XMM2 = _mm_add_ps(XMM2, XMM0);
+    _mm_storeu_ps((y)+i  , XMM2);
+  }
+  for (; i<(n); i++) {
+    y[i] += c * x[i];
+  }
+}
+
+
+static void THFloatVector_diff_SSE(float *z, const float *x, const float *y, const long n) {
+  long i;
+  for (i=0; i<=((n)-16); i+=16) {
+    __m128 XMM0 = _mm_loadu_ps((x)+i   );
+    __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);
+    __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);
+    __m128 XMM3 = _mm_loadu_ps((x)+i+12);
+    __m128 XMM4 = _mm_loadu_ps((y)+i   );
+    __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);
+    __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);
+    __m128 XMM7 = _mm_loadu_ps((y)+i+12);
+    XMM0 = _mm_sub_ps(XMM0, XMM4);
+    XMM1 = _mm_sub_ps(XMM1, XMM5);
+    XMM2 = _mm_sub_ps(XMM2, XMM6);
+    XMM3 = _mm_sub_ps(XMM3, XMM7);
+    _mm_storeu_ps((z)+i   , XMM0);
+    _mm_storeu_ps((z)+i+ 4, XMM1);
+    _mm_storeu_ps((z)+i+ 8, XMM2);
+    _mm_storeu_ps((z)+i+12, XMM3);
+  }
+  long off = (n) - ((n)%16);
+  for (i=0; i<((n)%16); i++) {
+    z[off+i] = x[off+i] - y[off+i];
+  }
+}
+
+
+static void THFloatVector_scale_SSE(float *y, const float c, const long n) {
+  long i;
+  __m128 XMM7 = _mm_set_ps1(c);
+  for (i=0; i<=((n)-8); i+=8) {
+    __m128 XMM0 = _mm_loadu_ps((y)+i  );
+    __m128 XMM1 = _mm_loadu_ps((y)+i+4);
+    XMM0 = _mm_mul_ps(XMM0, XMM7);
+    XMM1 = _mm_mul_ps(XMM1, XMM7);
+    _mm_storeu_ps((y)+i  , XMM0);
+    _mm_storeu_ps((y)+i+4, XMM1);
+  }
+  long off = (n) - ((n)%8);
+  for (i=0; i<((n)%8); i++) {
+    y[off+i] *= c;
+  }
+}
+
+
+static void THFloatVector_mul_SSE(float *y, const float *x, const long n) {
+  long i;
+  for (i=0; i<=((n)-16); i+=16) {
+    __m128 XMM0 = _mm_loadu_ps((x)+i   );
+    __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);
+    __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);
+    __m128 XMM3 = _mm_loadu_ps((x)+i+12);
+    __m128 XMM4 = _mm_loadu_ps((y)+i   );
+    __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);
+    __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);
+    __m128 XMM7 = _mm_loadu_ps((y)+i+12);
+    XMM4 = _mm_mul_ps(XMM4, XMM0);
+    XMM5 = _mm_mul_ps(XMM5, XMM1);
+    XMM6 = _mm_mul_ps(XMM6, XMM2);
+    XMM7 = _mm_mul_ps(XMM7, XMM3);
+    _mm_storeu_ps((y)+i   , XMM4);
+    _mm_storeu_ps((y)+i+ 4, XMM5);
+    _mm_storeu_ps((y)+i+ 8, XMM6);
+    _mm_storeu_ps((y)+i+12, XMM7);
+  }
+  long off = (n) - ((n)%16);
+  for (i=0; i<((n)%16); i++) {
+    y[off+i] *= x[off+i];
+  }
+}
diff --git a/lib/luaT/CMakeLists.txt b/lib/luaT/CMakeLists.txt
index b221a17..f33768c 100644
--- a/lib/luaT/CMakeLists.txt
+++ b/lib/luaT/CMakeLists.txt
@@ -13,6 +13,10 @@ if(BUILD_STATIC)
   ADD_LIBRARY(luaT_static STATIC luaT.h luaT.c)
 endif()
 
+SET_TARGET_PROPERTIES(luaT PROPERTIES
+  VERSION   0
+  SOVERSION 0)
+
 IF(APPLE)
   SET_TARGET_PROPERTIES(luaT PROPERTIES
     LINK_FLAGS "-undefined dynamic_lookup")
diff --git a/lib/luaT/README.md b/lib/luaT/README.md
index 431e37f..f28d143 100644
--- a/lib/luaT/README.md
+++ b/lib/luaT/README.md
@@ -237,7 +237,7 @@ shall not be freed. It is a pointer inside `tname` string.
 
 <a name="luat_classmodulename"/>
 ### int luaT_classmodulename(const char *tname, char *parent_name) ###
-Alias to `luaT_fullparentname ` for ensuring backwards compatibilty; 
+Alias to `luaT_fullparentname ` for ensuring backwards compatibility; 
 use of `luaT_fullparentname` is preferred.
 
 <a name="luat_fullparentname"/>
diff --git a/test/test.lua b/test/test.lua
index 20ca035..21df3b6 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -183,7 +183,7 @@ function torchtest.rsqrt()
 end
 
 function torchtest.sigmoid()
-   -- cant use genericSingleOpTest, since `math.sigmoid` doesnt exist, have to use
+   -- can't use genericSingleOpTest, since `math.sigmoid` doesn't exist, have to use
    -- `torch.sigmoid` instead
    local inputValues = {-1000,-1,0,0.5,1,2,1000}
    local expectedOutput = {0.0000, 0.2689, 0.5, 0.6225, 0.7311, 0.8808, 1.000}
@@ -2921,7 +2921,12 @@ function torchtest.abs()
    end
 
    -- Checking that the right abs function is called for LongTensor
-   local bignumber = 2^31 + 1
+   local bignumber
+   if torch.LongTensor():elementSize() > 4 then
+      bignumber = 2^31 + 1
+   else
+      bignumber = 2^15 + 1
+   end
    local input = torch.LongTensor{-bignumber}
    mytester:assertgt(input:abs()[1], 0, 'torch.abs(3)')
 end

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-torch7.git



More information about the debian-science-commits mailing list