[lua-torch-cutorch] 01/08: New upstream version 0~20170202-g64536bc

Zhou Mo cdluminate-guest at moszumanska.debian.org
Tue Feb 7 04:09:15 UTC 2017


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository lua-torch-cutorch.

commit d4dca69e2318e1b27972dc787d3618031b6a4d22
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Mon Feb 6 05:33:21 2017 +0000

    New upstream version 0~20170202-g64536bc
---
 .gitignore                                        |    1 +
 .travis.yml                                       |   54 +
 CMakeLists.txt                                    |   36 +
 CONTRIBUTING.md                                   |  130 +
 FFI.lua                                           |  118 +
 LICENSE                                           |   35 +
 README.md                                         |  114 +
 Storage.c                                         |   19 +
 Tensor.c                                          |   20 +
 Tensor.lua                                        |   91 +
 TensorMath.lua                                    | 2117 +++++++++++
 TensorOperator.c                                  |   13 +
 generic/CStorage.c                                |  116 +
 generic/CStorageCopy.c                            |   64 +
 generic/CTensor.c                                 |  223 ++
 generic/CTensorCopy.c                             |   60 +
 generic/TensorOperator.c                          |  262 ++
 init.c                                            | 1125 ++++++
 init.lua                                          |  153 +
 lib/CMakeLists.txt                                |    1 +
 lib/THC/CMakeLists.txt                            |  327 ++
 lib/THC/THC.h                                     |   20 +
 lib/THC/THCAllocator.c                            |   67 +
 lib/THC/THCAllocator.h                            |   10 +
 lib/THC/THCApply.cuh                              |  644 ++++
 lib/THC/THCAsmUtils.cuh                           |   52 +
 lib/THC/THCAtomics.cuh                            |  134 +
 lib/THC/THCBlas.cu                                |  414 ++
 lib/THC/THCBlas.h                                 |   41 +
 lib/THC/THCCachingAllocator.cpp                   |  376 ++
 lib/THC/THCCachingAllocator.h                     |    9 +
 lib/THC/THCCachingHostAllocator.cpp               |  249 ++
 lib/THC/THCCachingHostAllocator.h                 |   30 +
 lib/THC/THCDeviceTensor-inl.cuh                   |  420 ++
 lib/THC/THCDeviceTensor.cuh                       |  513 +++
 lib/THC/THCDeviceTensorUtils-inl.cuh              |  118 +
 lib/THC/THCDeviceTensorUtils.cuh                  |   33 +
 lib/THC/THCDeviceUtils.cuh                        |   36 +
 lib/THC/THCGeneral.c                              |  770 ++++
 lib/THC/THCGeneral.h.in                           |  191 +
 lib/THC/THCGenerateAllTypes.h                     |   37 +
 lib/THC/THCGenerateByteType.h                     |   20 +
 lib/THC/THCGenerateCharType.h                     |   20 +
 lib/THC/THCGenerateDoubleType.h                   |   22 +
 lib/THC/THCGenerateFloatType.h                    |   24 +
 lib/THC/THCGenerateFloatTypes.h                   |   32 +
 lib/THC/THCGenerateHalfType.h                     |   38 +
 lib/THC/THCGenerateIntType.h                      |   20 +
 lib/THC/THCGenerateLongType.h                     |   20 +
 lib/THC/THCGenerateShortType.h                    |   20 +
 lib/THC/THCHalf.cu                                |  141 +
 lib/THC/THCHalf.h                                 |   29 +
 lib/THC/THCNumerics.cuh                           |  631 +++
 lib/THC/THCReduce.cuh                             |  323 ++
 lib/THC/THCReduceAll.cuh                          |  347 ++
 lib/THC/THCReduceApplyUtils.cu                    |   35 +
 lib/THC/THCReduceApplyUtils.cuh                   |   81 +
 lib/THC/THCScanUtils.cuh                          |  116 +
 lib/THC/THCSleep.cu                               |   21 +
 lib/THC/THCSleep.h                                |   10 +
 lib/THC/THCSortUtils.cuh                          |  171 +
 lib/THC/THCStorage.c                              |    8 +
 lib/THC/THCStorage.cu                             |   13 +
 lib/THC/THCStorage.h                              |   17 +
 lib/THC/THCStorageCopy.c                          |    6 +
 lib/THC/THCStorageCopy.cu                         |    8 +
 lib/THC/THCStorageCopy.h                          |   11 +
 lib/THC/THCStream.c                               |   30 +
 lib/THC/THCStream.h                               |   19 +
 lib/THC/THCTensor.c                               |    7 +
 lib/THC/THCTensor.cu                              |    4 +
 lib/THC/THCTensor.h                               |   21 +
 lib/THC/THCTensorConv.cu                          |  953 +++++
 lib/THC/THCTensorConv.h                           |   26 +
 lib/THC/THCTensorCopy.c                           |    5 +
 lib/THC/THCTensorCopy.cu                          |  208 +
 lib/THC/THCTensorCopy.h                           |   11 +
 lib/THC/THCTensorIndex.cu                         |  336 ++
 lib/THC/THCTensorInfo.cuh                         |  280 ++
 lib/THC/THCTensorMasked.cuh                       |   58 +
 lib/THC/THCTensorMath.cu                          |  112 +
 lib/THC/THCTensorMath.cuh                         |  102 +
 lib/THC/THCTensorMath.h                           |   49 +
 lib/THC/THCTensorMath2.cu                         |   30 +
 lib/THC/THCTensorMathBlas.cu                      |    8 +
 lib/THC/THCTensorMathCompare.cuh                  |   87 +
 lib/THC/THCTensorMathCompareT.cuh                 |   74 +
 lib/THC/THCTensorMathMagma.cu                     |   27 +
 lib/THC/THCTensorMathMagma.cuh                    |   22 +
 lib/THC/THCTensorMathPairwise.cu                  |  403 ++
 lib/THC/THCTensorMathPointwise.cuh                |  663 ++++
 lib/THC/THCTensorMathReduce.cu                    |   31 +
 lib/THC/THCTensorMathReduce.cuh                   |  690 ++++
 lib/THC/THCTensorMathScan.cu                      |  127 +
 lib/THC/THCTensorRandom.cpp                       |  133 +
 lib/THC/THCTensorRandom.cu                        |  156 +
 lib/THC/THCTensorRandom.cuh                       |  282 ++
 lib/THC/THCTensorRandom.h                         |   37 +
 lib/THC/THCTensorScatterGather.cu                 |  153 +
 lib/THC/THCTensorSort.cu                          |   78 +
 lib/THC/THCTensorSort.cuh                         |   87 +
 lib/THC/THCTensorTopK.cu                          |  535 +++
 lib/THC/THCTensorTopK.h                           |   14 +
 lib/THC/THCTensorTypeUtils.cu                     |  263 ++
 lib/THC/THCTensorTypeUtils.cuh                    |  180 +
 lib/THC/THCThreadLocal.c                          |   46 +
 lib/THC/THCThreadLocal.h                          |   17 +
 lib/THC/THCThrustAllocator.cuh                    |   33 +
 lib/THC/cmake/FindMAGMA.cmake                     |   27 +
 lib/THC/cmake/select_compute_arch.cmake           |  200 +
 lib/THC/generated/THCTensorMaskedByte.cu          |    3 +
 lib/THC/generated/THCTensorMaskedChar.cu          |    3 +
 lib/THC/generated/THCTensorMaskedDouble.cu        |    3 +
 lib/THC/generated/THCTensorMaskedFloat.cu         |    3 +
 lib/THC/generated/THCTensorMaskedHalf.cu          |    3 +
 lib/THC/generated/THCTensorMaskedInt.cu           |    3 +
 lib/THC/generated/THCTensorMaskedLong.cu          |    3 +
 lib/THC/generated/THCTensorMaskedShort.cu         |    3 +
 lib/THC/generated/THCTensorMathCompareByte.cu     |    3 +
 lib/THC/generated/THCTensorMathCompareChar.cu     |    3 +
 lib/THC/generated/THCTensorMathCompareDouble.cu   |    3 +
 lib/THC/generated/THCTensorMathCompareFloat.cu    |    3 +
 lib/THC/generated/THCTensorMathCompareHalf.cu     |    3 +
 lib/THC/generated/THCTensorMathCompareInt.cu      |    3 +
 lib/THC/generated/THCTensorMathCompareLong.cu     |    3 +
 lib/THC/generated/THCTensorMathCompareShort.cu    |    3 +
 lib/THC/generated/THCTensorMathCompareTByte.cu    |    3 +
 lib/THC/generated/THCTensorMathCompareTChar.cu    |    3 +
 lib/THC/generated/THCTensorMathCompareTDouble.cu  |    3 +
 lib/THC/generated/THCTensorMathCompareTFloat.cu   |    3 +
 lib/THC/generated/THCTensorMathCompareTHalf.cu    |    3 +
 lib/THC/generated/THCTensorMathCompareTInt.cu     |    3 +
 lib/THC/generated/THCTensorMathCompareTLong.cu    |    3 +
 lib/THC/generated/THCTensorMathCompareTShort.cu   |    3 +
 lib/THC/generated/THCTensorMathPointwiseByte.cu   |    3 +
 lib/THC/generated/THCTensorMathPointwiseChar.cu   |    3 +
 lib/THC/generated/THCTensorMathPointwiseDouble.cu |    3 +
 lib/THC/generated/THCTensorMathPointwiseFloat.cu  |    3 +
 lib/THC/generated/THCTensorMathPointwiseHalf.cu   |    3 +
 lib/THC/generated/THCTensorMathPointwiseInt.cu    |    3 +
 lib/THC/generated/THCTensorMathPointwiseLong.cu   |    3 +
 lib/THC/generated/THCTensorMathPointwiseShort.cu  |    3 +
 lib/THC/generated/THCTensorMathReduceByte.cu      |    3 +
 lib/THC/generated/THCTensorMathReduceChar.cu      |    3 +
 lib/THC/generated/THCTensorMathReduceDouble.cu    |    3 +
 lib/THC/generated/THCTensorMathReduceFloat.cu     |    3 +
 lib/THC/generated/THCTensorMathReduceHalf.cu      |    3 +
 lib/THC/generated/THCTensorMathReduceInt.cu       |    3 +
 lib/THC/generated/THCTensorMathReduceLong.cu      |    3 +
 lib/THC/generated/THCTensorMathReduceShort.cu     |    3 +
 lib/THC/generated/THCTensorSortByte.cu            |    3 +
 lib/THC/generated/THCTensorSortChar.cu            |    3 +
 lib/THC/generated/THCTensorSortDouble.cu          |    3 +
 lib/THC/generated/THCTensorSortFloat.cu           |    3 +
 lib/THC/generated/THCTensorSortHalf.cu            |    3 +
 lib/THC/generated/THCTensorSortInt.cu             |    3 +
 lib/THC/generated/THCTensorSortLong.cu            |    3 +
 lib/THC/generated/THCTensorSortShort.cu           |    3 +
 lib/THC/generic/THCDeviceTensorUtils.cu           |   55 +
 lib/THC/generic/THCStorage.c                      |  190 +
 lib/THC/generic/THCStorage.cu                     |   94 +
 lib/THC/generic/THCStorage.h                      |   60 +
 lib/THC/generic/THCStorageCopy.c                  |   60 +
 lib/THC/generic/THCStorageCopy.cu                 |   46 +
 lib/THC/generic/THCStorageCopy.h                  |   42 +
 lib/THC/generic/THCTensor.c                       |  858 +++++
 lib/THC/generic/THCTensor.cu                      |   36 +
 lib/THC/generic/THCTensor.h                       |  133 +
 lib/THC/generic/THCTensorCopy.c                   |  169 +
 lib/THC/generic/THCTensorCopy.cu                  |   47 +
 lib/THC/generic/THCTensorCopy.h                   |   43 +
 lib/THC/generic/THCTensorIndex.cu                 |  507 +++
 lib/THC/generic/THCTensorIndex.h                  |   15 +
 lib/THC/generic/THCTensorMasked.cu                |  193 +
 lib/THC/generic/THCTensorMasked.h                 |   38 +
 lib/THC/generic/THCTensorMath.cu                  |  394 ++
 lib/THC/generic/THCTensorMath.h                   |   22 +
 lib/THC/generic/THCTensorMathBlas.cu              |  600 +++
 lib/THC/generic/THCTensorMathBlas.h               |   13 +
 lib/THC/generic/THCTensorMathCompare.cu           |  101 +
 lib/THC/generic/THCTensorMathCompare.h            |   20 +
 lib/THC/generic/THCTensorMathCompareT.cu          |  113 +
 lib/THC/generic/THCTensorMathCompareT.h           |   19 +
 lib/THC/generic/THCTensorMathMagma.cu             |  650 ++++
 lib/THC/generic/THCTensorMathMagma.h              |   23 +
 lib/THC/generic/THCTensorMathPairwise.cu          |  213 ++
 lib/THC/generic/THCTensorMathPairwise.h           |   14 +
 lib/THC/generic/THCTensorMathPointwise.cu         |  522 +++
 lib/THC/generic/THCTensorMathPointwise.h          |   57 +
 lib/THC/generic/THCTensorMathReduce.cu            |  364 ++
 lib/THC/generic/THCTensorMathReduce.h             |   41 +
 lib/THC/generic/THCTensorMathScan.cu              |   89 +
 lib/THC/generic/THCTensorMathScan.h               |    8 +
 lib/THC/generic/THCTensorRandom.cu                |  351 ++
 lib/THC/generic/THCTensorRandom.h                 |   23 +
 lib/THC/generic/THCTensorScatterGather.cu         |  266 ++
 lib/THC/generic/THCTensorScatterGather.h          |    9 +
 lib/THC/generic/THCTensorSort.cu                  |  336 ++
 lib/THC/generic/THCTensorSort.h                   |   20 +
 rocks/cutorch-1.0-0.rockspec                      |   38 +
 rocks/cutorch-scm-1.rockspec                      |   37 +
 rocks/version.sh                                  |   28 +
 test/test.lua                                     | 4225 +++++++++++++++++++++
 test/test_shutdown.lua                            |   64 +
 torch/generic/Storage.c                           |  280 ++
 torch/generic/Tensor.c                            | 1440 +++++++
 torch/utils.c                                     |   60 +
 torch/utils.h                                     |   64 +
 208 files changed, 31285 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..567609b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..9f9205c
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,54 @@
+language: c
+compiler:
+  - gcc
+  - clang
+cache:
+  directories:
+  - $HOME/OpenBlasInstall
+sudo: true
+env:
+  - TORCH_LUA_VERSION=LUAJIT21
+  - TORCH_LUA_VERSION=LUA51
+  - TORCH_LUA_VERSION=LUA52
+addons:
+  apt:
+    packages:
+    - cmake
+    - gfortran
+    - gcc-multilib
+    - gfortran-multilib
+    - liblapack-dev
+    - build-essential
+    - gcc
+    - g++
+    - curl
+    - cmake
+    - libreadline-dev
+    - git-core
+    - libqt4-core
+    - libqt4-gui
+    - libqt4-dev
+    - libjpeg-dev
+    - libpng-dev
+    - ncurses-dev
+    - imagemagick
+    - libzmq3-dev
+    - gfortran
+    - unzip
+    - gnuplot
+    - gnuplot-x11
+before_script:
+- export ROOT_TRAVIS_DIR=$(pwd)
+- export INSTALL_PREFIX=~/torch/install
+-  ls $HOME/OpenBlasInstall/lib || (cd /tmp/ && git clone https://github.com/xianyi/OpenBLAS.git -b master && cd OpenBLAS && (make NO_AFFINITY=1 -j$(getconf _NPROCESSORS_ONLN) 2>/dev/null >/dev/null) && make PREFIX=$HOME/OpenBlasInstall install)
+- git clone https://github.com/torch/distro.git ~/torch --recursive
+- cd ~/torch && git submodule update --init --recursive
+- mkdir build && cd build
+- export CMAKE_LIBRARY_PATH=$HOME/OpenBlasInstall/include:$HOME/OpenBlasInstall/lib:$CMAKE_LIBRARY_PATH
+- cmake .. -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" -DCMAKE_BUILD_TYPE=Release -DWITH_${TORCH_LUA_VERSION}=ON
+- make && make install
+- cd $ROOT_TRAVIS_DIR
+- export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH
+- sudo -E $ROOT_TRAVIS_DIR/travis_cuda_install.sh
+script:
+- ${INSTALL_PREFIX}/bin/luarocks make rocks/cutorch-scm-1.rockspec
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..9d1d0a0
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,36 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
+CMAKE_POLICY(VERSION 2.8)
+
+SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/lib/THC/cmake ${CMAKE_MODULE_PATH})
+
+FIND_PACKAGE(Torch REQUIRED)
+FIND_PACKAGE(CUDA 6.5 REQUIRED)
+FIND_PACKAGE(MAGMA)
+
+IF (NOT WIN32)
+SET(CMAKE_C_FLAGS "-std=c99 -Werror=implicit-function-declaration ${CMAKE_C_FLAGS}")
+ENDIF (NOT WIN32)
+IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
+  ADD_DEFINITIONS(-DTH_GENERIC_USE_HALF=1)
+  ADD_DEFINITIONS(-DCUDA_HAS_FP16=1)
+ENDIF()
+
+INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
+
+ADD_SUBDIRECTORY(lib)
+
+INCLUDE_DIRECTORIES(BEFORE "${CMAKE_CURRENT_BINARY_DIR}/lib/THC")
+INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/lib/THC")
+INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/torch")
+
+SET(src Storage.c init.c Tensor.c TensorMath.c TensorOperator.c torch/utils.c)
+SET(luasrc init.lua Tensor.lua FFI.lua test/test.lua)
+
+ADD_TORCH_WRAP(cudatensormathwrap TensorMath.lua)
+
+ADD_TORCH_PACKAGE(cutorch "${src}" "${luasrc}")
+TARGET_LINK_LIBRARIES(cutorch luaT THC)
+
+IF(LUALIB)
+  TARGET_LINK_LIBRARIES(cutorch ${LUALIB})
+ENDIF()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..9100042
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,130 @@
+# Contributing to Torch7 Core (torch7, nn, cutorch, cunn)
+
+Thanks a lot! There are plenty of ways you can help!
+
+Please take a moment to review this document in order to make the contribution
+process easy and effective for everyone involved.
+
+Following these guidelines helps to communicate that you respect the time of
+the developers managing and developing this open source project. In return,
+they should reciprocate that respect in addressing your issue or assessing
+patches and features.
+
+
+## Using the issue tracker
+
+The [issue tracker](https://github.com/torch/cutorch/issues) is
+the preferred channel for [bug reports](#bugs), [feature requests](#features)
+and [submitting pull requests](#pull-requests), but please respect the following
+restrictions:
+
+* Please **do not** use the issue tracker for personal support requests (use
+  [mailing-list](https://groups.google.com/forum/#!forum/torch7)).
+
+* Please **do not** open issues regarding the code in a torch package
+  outside the core. For example, don't open issues about the
+  REPL in the cutorch issue tracker; use the trepl issue tracker for that.
+
+<a name="bugs"></a>
+## Bug reports
+
+A bug is a _demonstrable problem_ that is caused by the code in the repository.
+Good bug reports are extremely helpful - thank you!
+
+Guidelines for bug reports:
+
+1. **Use the GitHub issue search** — check if the issue has already been
+   reported.
+
+2. **Check if the issue has been fixed** — try to reproduce it using the
+   latest `master` or development branch in the repository.
+
+3. **Isolate the problem** — ideally create a reduced test case of reasonable size,
+   preferably within 100 lines of code.
+
+A good bug report shouldn't leave others needing to chase you up for more
+information. Please try to be as detailed as possible in your report. What is
+your environment? What steps will reproduce the issue? On what OS do you
+experience the problem? What would you expect the outcome to be? All these
+details will help people fix any potential bugs.
+
+<a name="features"></a>
+## Feature requests
+
+Feature requests are welcome. Keep in mind that Torch is community-developed
+and its maintainers are not exclusively torch developers.
+The purpose of a feature request is to make others who are looking to implement
+a feature aware of the interest in it.
+
+
+<a name="pull-requests"></a>
+## Pull requests
+
+Good pull requests - patches, improvements, new features - are a fantastic
+help. They should remain focused in scope **and avoid containing unrelated
+commits.**
+
+**Please ask first** before embarking on any significant pull request (e.g.
+implementing features, refactoring code, porting to a different language),
+otherwise you risk spending a lot of time working on something that the
+project's developers might not want to merge into the project.
+
+Please adhere to the coding conventions used throughout a project (indentation,
+accurate comments, etc.) and any other requirements (such as test coverage).
+
+Adhering to the following process is the best way to get your work
+included in the project:
+
+1. [Fork](https://help.github.com/articles/fork-a-repo) the project, clone your
+   fork, and configure the remotes:
+
+   ```bash
+   # Clone your fork of the repo into the current directory
+   git clone https://github.com/<your-username>/cutorch.git
+   # Navigate to the newly cloned directory
+   cd cutorch
+   # Assign the original repo to a remote called "upstream"
+   git remote add upstream https://github.com/torch/cutorch.git
+   ```
+
+2. If you cloned a while ago, get the latest changes from upstream:
+
+   ```bash
+   git checkout master
+   git pull upstream master
+   ```
+
+3. Create a new topic branch (off the main project development branch) to
+   contain your feature, change, or fix:
+
+   ```bash
+   git checkout -b <topic-branch-name>
+   ```
+
+4. Commit your changes in logical chunks. Please try to adhere to these [git commit
+   message guidelines](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html).
+   Use Git's [interactive rebase](https://help.github.com/articles/about-git-rebase)
+   feature to tidy up your commits before making them public. This helps us keep the
+   commit history clean and in logical blocks as torch grows.
+   For example: 
+     - If you are adding a new function or a module, keep the module + tests + doc 
+       to a single commit unless logically warranted. 
+     - If you are fixing a bug, keep the bugfix to a single commit unless logically warranted.
+
+5. Locally merge (or rebase) the upstream development branch into your topic branch:
+
+   ```bash
+   git pull [--rebase] upstream master
+   ```
+
+6. Push your topic branch up to your fork:
+
+   ```bash
+   git push origin <topic-branch-name>
+   ```
+
+7. [Open a Pull Request](https://help.github.com/articles/using-pull-requests/)
+    with a clear title and description.
+
+**IMPORTANT**: By submitting a patch, you agree to allow the project owners to
+license your work under the terms of the BSD License.
diff --git a/FFI.lua b/FFI.lua
new file mode 100644
index 0000000..b2777a2
--- /dev/null
+++ b/FFI.lua
@@ -0,0 +1,118 @@
+local ok, ffi = pcall(require, 'ffi')
+if ok then
+   local unpack = unpack or table.unpack
+   local cdefs = [[
+typedef struct CUstream_st *cudaStream_t;
+
+struct cublasContext;
+typedef struct cublasContext *cublasHandle_t;
+
+typedef struct _THCStream {
+   cudaStream_t stream;
+   int device;
+   int refcount;
+} THCStream;
+
+
+typedef struct _THCCudaResourcesPerDevice {
+  THCStream** streams;
+  cublasHandle_t* blasHandles;
+  size_t scratchSpacePerStream;
+  void** devScratchSpacePerStream;
+} THCCudaResourcesPerDevice;
+
+
+typedef struct THCState
+{
+  struct THCRNGState* rngState;
+  struct cudaDeviceProp* deviceProperties;
+  THCCudaResourcesPerDevice* resourcesPerDevice;
+  int numDevices;
+  int numUserStreams;
+  int numUserBlasHandles;
+  struct THAllocator* cudaHostAllocator;
+} THCState;
+
+cudaStream_t THCState_getCurrentStream(THCState *state);
+
+]]
+
+   local CudaTypes = {
+      {'float', ''},
+      {'unsigned char', 'Byte'},
+      {'char', 'Char'},
+      {'short', 'Short'},
+      {'int', 'Int'},
+      {'long','Long'},
+      {'double','Double'},
+  }
+  if cutorch.hasHalf then
+      table.insert(CudaTypes, {'half','Half'})
+  end
+
+   for _, typedata in ipairs(CudaTypes) do
+      local real, Real = unpack(typedata)
+      local ctype_def = [[
+typedef struct THCStorage
+{
+    real *data;
+    ptrdiff_t size;
+    int refcount;
+    char flag;
+    THAllocator *allocator;
+    void *allocatorContext;
+    struct THCStorage *view;
+    int device;
+} THCStorage;
+
+typedef struct THCTensor
+{
+    long *size;
+    long *stride;
+    int nDimension;
+
+    THCStorage *storage;
+    ptrdiff_t storageOffset;
+    int refcount;
+
+    char flag;
+
+} THCTensor;
+]]
+
+      ctype_def = ctype_def:gsub('real',real):gsub('THCStorage','THCuda'..Real..'Storage'):gsub('THCTensor','THCuda'..Real..'Tensor')
+      cdefs = cdefs .. ctype_def
+   end
+   if cutorch.hasHalf then
+      ffi.cdef([[
+typedef struct {
+    unsigned short x;
+} __half;
+typedef __half half;
+      ]])
+   end
+   ffi.cdef(cdefs)
+
+   for _, typedata in ipairs(CudaTypes) do
+      local real, Real = unpack(typedata)
+      local Storage = torch.getmetatable('torch.Cuda' .. Real .. 'Storage')
+      local Storage_tt = ffi.typeof('THCuda' .. Real .. 'Storage**')
+
+      rawset(Storage, "cdata", function(self) return Storage_tt(self)[0] end)
+      rawset(Storage, "data", function(self) return Storage_tt(self)[0].data end)
+      -- Tensor
+      local Tensor = torch.getmetatable('torch.Cuda' .. Real .. 'Tensor')
+      local Tensor_tt = ffi.typeof('THCuda' .. Real .. 'Tensor**')
+
+      rawset(Tensor, "cdata", function(self) return Tensor_tt(self)[0] end)
+
+      rawset(Tensor, "data",
+             function(self)
+                self = Tensor_tt(self)[0]
+                return self.storage ~= nil and self.storage.data + self.storageOffset or nil
+             end
+      )
+   end
+
+end
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..2e4118c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,35 @@
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
+Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
+Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+3. Neither the names of NEC Laboratories American and IDIAP Research
+   Institute nor the names of its contributors may be used to endorse or
+   promote products derived from this software without specific prior
+   written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3b4a174
--- /dev/null
+++ b/README.md
@@ -0,0 +1,114 @@
+cutorch
+=======
+**[NOTE on API changes and versioning](#api-changes-and-versioning)**
+
+Cutorch provides a CUDA backend for torch7.
+
+Cutorch provides the following:
+
+- a new tensor type: `torch.CudaTensor` that acts like `torch.FloatTensor`, but all its operations are on the GPU. Most of the tensor operations are supported by cutorch. There are a few missing ones, which are being implemented. The missing list can be found here: https://github.com/torch/cutorch/issues/70
+- several other GPU tensor types, with limited functionality. Currently limited to copying/conversion, and several indexing and shaping operations.
+- `cutorch.*` - Functions to set/get GPU, get device properties, memory usage, set/get low-level streams, set/get random number generator's seed, synchronization etc. They are described in more detail below.
+
+### torch.CudaTensor
+This new tensor type behaves exactly like a `torch.FloatTensor`, but has a couple of extra functions of note:
+- `t:getDevice()` - Given a CudaTensor `t`, you can call :getDevice on it to find out the GPU ID on which the tensor memory is allocated.
+
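+For illustration, a minimal sketch of this call (assuming cutorch is installed and at least one GPU is visible):
+```lua
+require 'cutorch'
+
+local t = torch.CudaTensor(4, 4)   -- allocated on the current default GPU
+print(t:getDevice())               -- prints the 1-indexed GPU ID, e.g. 1
+```
+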
+### Other CUDA tensor types
+Most other (besides float) CPU torch tensor types now have a cutorch equivalent, with similar names:
+
+- `torch.CudaDoubleTensor`
+- `torch.CudaByteTensor`
+- `torch.CudaCharTensor`
+- `torch.CudaIntTensor`
+- `torch.CudaShortTensor`
+- `torch.CudaLongTensor`
+- and `torch.CudaHalfTensor` when supported as indicated by `cutorch.hasHalf`; these are half-precision (16-bit) floats.
+
+**Note:** these are currently limited to copying/conversion, and several indexing and shaping operations (e.g. `narrow`, `select`, `unfold`, `transpose`).
+
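+A short illustrative sketch of the copy/conversion and shaping operations mentioned above (assuming a working CUDA setup):
+```lua
+local f = torch.FloatTensor(2, 6):fill(1)
+local l = f:cudaLong()           -- convert to a torch.CudaLongTensor on the GPU
+local part = l:narrow(2, 1, 3)   -- shaping operations such as narrow/select work
+local back = l:long()            -- copy back to a CPU torch.LongTensor
+```
+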
+### CUDA memory allocation
+Set the environment variable `THC_CACHING_ALLOCATOR=1` to enable the caching CUDA memory allocator.
+
+By default, cutorch calls `cudaMalloc` and `cudaFree` when CUDA tensors are allocated and freed. This is expensive because `cudaFree` synchronizes the CPU with the GPU. Setting `THC_CACHING_ALLOCATOR=1` will cause cutorch to cache and re-use CUDA device and pinned memory allocations to avoid synchronizations.
+
+With the caching memory allocator, device allocations and frees should logically be considered "usages" of the memory segment associated with streams, just like kernel launches. The programmer must insert the proper synchronization if memory segments are used from multiple streams.
+
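+A usage sketch, assuming the variable is set before the torch process starts (the script name below is only a placeholder):
+```lua
+-- launch with the caching allocator enabled, e.g.:
+--   THC_CACHING_ALLOCATOR=1 th my_script.lua
+require 'cutorch'
+
+-- with caching enabled, repeated allocate/free cycles like this one can
+-- reuse device memory instead of hitting cudaMalloc/cudaFree every time
+for i = 1, 100 do
+   local tmp = torch.CudaTensor(1024, 1024)
+   tmp = nil
+   collectgarbage()
+end
+```
+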
+### `cutorch.*` API
+- `cutorch.synchronize()` : All of the CUDA API is asynchronous (barring a few functions), which means that you can queue up operations. To wait for the operations to finish, issue `cutorch.synchronize()` in your code; it blocks until all GPU operations on the current GPU have finished. WARNING: this synchronizes the CPU host with respect to the current device (as per `cutorch.getDevice()`) only.
+- `cutorch.synchronizeAll()` : Same as `cutorch.synchronize()`, except that it synchronizes the CPU host with all visible GPU devices in the system. Equivalent to calling `cutorch.synchronize()` once per device.
+- `cutorch.setDevice(i)` : If you have multiple GPUs, you can switch the default GPU (on which CUDA tensors are allocated and operations run). GPU IDs are 1-indexed, so with 4 GPUs you can call setDevice(1), setDevice(2), setDevice(3) or setDevice(4).
+- `idx = cutorch.getDevice()` : Returns the currently set GPU device index.
+- `count = cutorch.getDeviceCount()` : Gets the number of available GPUs.
+- `freeMemory, totalMemory = cutorch.getMemoryUsage(devID)` : Gets the total and free memory in bytes for the given device ID.
+- `cutorch.seed([devID])` - Sets and returns a random seed for the current or specified device.
+- `cutorch.seedAll()` - Sets and returns a random seed for all available GPU devices.
+- `cutorch.initialSeed([devID])` - Returns the seed for the current or specified device
+- `cutorch.manualSeed(seed [, device])` - Sets a manually specified RNG seed for the current or specified device
+- `cutorch.manualSeedAll(seed)` - Sets a manually specified RNG seed for all available GPUs
+- `cutorch.getRNGState([device])` - returns the current RNG state in the form of a byte tensor, for the current or specified device.
+- `cutorch.setRNGState(state [, device])` - Sets the RNG state from a previously saved state, on the current or specified device.
+- `cutorch.getState()` - Returns the global state of the cutorch package. This state is not intended for users; it stores the raw RNG states, cuBLAS handles and other thread- and device-specific data.
+- `cutorch.withDevice(devID, f)` - A convenience function for multi-GPU code that takes a device ID as well as a function f. It switches cutorch to the given device, executes the function f, and then switches cutorch back to the original device.
+- `cutorch.createCudaHostTensor([...])` - Allocates a `torch.FloatTensor` of [host-pinned memory](https://devblogs.nvidia.com/parallelforall/how-optimize-data-transfers-cuda-cc/), where dimensions can be given as an argument list of sizes or a `torch.LongStorage`.
+
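+A brief illustrative sketch combining several of these functions (assumes at least two visible GPUs for the `withDevice` part):
+```lua
+print('GPUs visible: ' .. cutorch.getDeviceCount())
+
+cutorch.manualSeedAll(1234)       -- reproducible RNG on every device
+
+local free, total = cutorch.getMemoryUsage(cutorch.getDevice())
+print(('free %d / total %d bytes'):format(free, total))
+
+-- run some work on device 2, then automatically switch back
+cutorch.withDevice(2, function()
+   local t = torch.CudaTensor(1000):fill(1)
+   cutorch.synchronize()          -- wait for the queued work on the current GPU
+end)
+```
+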
+#### Low-level stream functions (don't use these as a regular user; it is easy to shoot yourself in the foot):
+- `cutorch.reserveStreams(n [, nonblocking])`: creates n user streams for use on every device. NOTE: stream index `s` on device 1 is a different cudaStream_t than stream `s` on device 2. Takes an optional non-blocking flag; by default, this is assumed to be false. If true, then the stream is created with cudaStreamNonBlocking.
+- `n = cutorch.getNumStreams()`: returns the number of user streams available on every device. By default, this is `0`, meaning only the default stream (stream 0) is available.
+- `cutorch.setStream(n)`: specifies that stream `n` is the active stream for the current device (and any other device). This is preserved across device switches. 1-N are user streams, `0` is the default stream.
+- `n = cutorch.getStream()`: returns the currently active stream. By default, returns `0`.
+- `cutorch.setDefaultStream()`: an alias for `cutorch.setStream(0)`.
+- `cutorch.streamWaitFor(streamWaiting, {streamsToWaitOn...})`: A 1-to-N-way barrier. `streamWaiting` will wait for the list of streams specified to finish executing all kernels/events/barriers. Does not block any of the streamsToWaitOn. Current device only.
+- `cutorch.streamWaitForMultiDevice(deviceWaiting, streamWaiting, {[device]={streamsToWaitOn...}...})`: (deviceWaiting, streamWaiting) will wait on the list of (`device`, `streams`...) pairs; handles single or multiple devices. `cutorch.streamWaitForMultiDevice(a, b, {[a]={streams...}})` is equivalent to `cutorch.setDevice(a); cutorch.streamWaitFor(b, {streams...})`.
+- `cutorch.streamBarrier({streams...})`: an N-to-N-way barrier between all the streams; all streams will wait for the completion of all other streams on the current device only. More efficient than creating the same N-to-N-way dependency via `streamWaitFor`.
+- `cutorch.streamBarrierMultiDevice({[device]={streamsToWaitOn...}...})`: As with streamBarrier but allows barriers between streams on arbitrary devices. Creates a cross-device N-to-N-way barrier between all (device, stream) values listed.
+- `cutorch.streamSynchronize(stream)`: equivalent to `cudaStreamSynchronize(stream)` for the current device. Blocks the CPU until stream completes its queued kernels/events.
+- `cutorch.setPeerToPeerAccess(dev, devToAccess, f)`: explicitly enable (`f` true) or disable (`f` false) p2p access from `dev` to memory on `devToAccess`. Affects copy efficiency (if disabled, copies will be d2d rather than p2p, i.e. staged through the CPU), and affects kernel p2p access as well. Can only be enabled if the underlying hardware supports p2p access. p2p access is enabled by default for all pairs of devices if the underlying hardware supports it.
+- `cutorch.getPeerToPeerAccess(dev, devToAccess)`: returns whether p2p access is currently enabled, as a result of a prior call to `setPeerToPeerAccess` or of underlying hardware support.
+- `cutorch.setKernelPeerToPeerAccess(f)`: by default, kernels running on one device cannot directly access memory on another device. This is a check imposed by cutorch to prevent synchronization and performance issues. To disable the check, call this with `f` true. Kernel p2p access is only allowed for a pair of devices if both this flag is true and the underlying `getPeerToPeerAccess` for the pair involved is true.
+- `cutorch.getKernelPeerToPeerAccess()`: returns whether kernel p2p checks are enabled.
+
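+An illustrative sketch of how these calls compose, following the semantics described above:
+```lua
+cutorch.reserveStreams(2)         -- create two user streams on every device
+
+cutorch.setStream(1)              -- queue subsequent work on stream 1
+local a = torch.CudaTensor(1000000):fill(1)
+
+cutorch.setStream(2)              -- queue independent work on stream 2
+local b = torch.CudaTensor(1000000):fill(2)
+
+cutorch.streamBarrier({1, 2})     -- streams 1 and 2 wait for each other
+cutorch.setDefaultStream()        -- back to the default stream (stream 0)
+```
+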
+##### Common Examples
+Transferring a FloatTensor `src` to the GPU:
+```lua
+dest = src:cuda() -- dest is on the current GPU
+```
+
+Allocating a tensor on a given GPU:
+Allocate `src` on GPU 3:
+```lua
+cutorch.setDevice(3)
+src = torch.CudaTensor(100)
+```
+
+Copying a CUDA tensor from one GPU to another:
+Given a tensor called `src` on GPU 1, if you want to create its clone on GPU 2, then:
+
+```lua
+cutorch.setDevice(2)
+local dest = src:clone()
+```
+
+OR
+
+```lua
+local dest
+cutorch.withDevice(2, function() dest = src:clone() end)
+```
+
+## API changes and Versioning
+
+Version 1.0 can be installed via `luarocks install cutorch 1.0-0`.
+Compared to version 1.0, master contains the following API changes:
+
+| operators | 1.0 | master |
+|---|---|---|
+| `lt`, `le`, `gt`, `ge`, `eq`, `ne` return type | torch.CudaTensor | torch.CudaByteTensor |
+| `min`,`max` (2nd return value)                 | torch.CudaTensor | torch.CudaLongTensor |
+| `maskedFill`, `maskedCopy` (mask input)        | torch.CudaTensor | torch.CudaByteTensor |
+| `topk`, `sort` (2nd return value)              | torch.CudaTensor | torch.CudaLongTensor |
+
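+For example, on master the comparison operators return a byte mask rather than a float tensor (an illustrative sketch based on the table above):
+```lua
+local a = torch.CudaTensor(3):fill(1)
+local b = torch.CudaTensor(3):fill(2)
+local mask = a:lt(b)
+print(torch.type(mask))  -- 'torch.CudaByteTensor' on master; 'torch.CudaTensor' in 1.0
+```
+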
+## Inconsistencies with CPU API
+
+| operators | CPU | CUDA |
+|---|---|---|
diff --git a/Storage.c b/Storage.c
new file mode 100644
index 0000000..9ffc1b5
--- /dev/null
+++ b/Storage.c
@@ -0,0 +1,19 @@
+#include "torch/utils.h"
+#include "THC.h"
+#include "THFile.h"
+#include "luaT.h"
+
+#define torch_Storage_(NAME) TH_CONCAT_4(torch_,CReal,Storage_,NAME)
+#define torch_Storage TH_CONCAT_STRING_3(torch.,CReal,Storage)
+#define cutorch_Storage_(NAME) TH_CONCAT_4(cutorch_,CReal,Storage_,NAME)
+#define cutorch_StorageCopy_(NAME) TH_CONCAT_4(cutorch_,Real,StorageCopy_,NAME)
+
+// generate the torch types -- we could also do this via THGenerateAllTypes,
+// but this allows us to be self contained.
+#define FORCE_TH_HALF
+#include "generic/CStorageCopy.c"
+#include "THCGenerateAllTypes.h"
+#undef FORCE_TH_HALF
+#include "generic/CStorage.c"
+#include "THCGenerateAllTypes.h"
+
diff --git a/Tensor.c b/Tensor.c
new file mode 100644
index 0000000..560d224
--- /dev/null
+++ b/Tensor.c
@@ -0,0 +1,20 @@
+#include "torch/utils.h"
+#include "THC.h"
+#include "THFile.h"
+#include "luaT.h"
+
+#define torch_Storage_(NAME) TH_CONCAT_4(torch_,CReal,Storage_,NAME)
+#define torch_Storage TH_CONCAT_STRING_3(torch.,CReal,Storage)
+#define torch_Tensor_(NAME) TH_CONCAT_4(torch_,CReal,Tensor_,NAME)
+#define torch_Tensor TH_CONCAT_STRING_3(torch.,CReal,Tensor)
+#define cutorch_Tensor_(NAME) TH_CONCAT_4(cutorch_,CReal,Tensor_,NAME)
+#define cutorch_TensorCopy_(NAME) TH_CONCAT_4(cutorch_,Real,TensorCopy_,NAME)
+
+// generate the torch types -- we could also do this via THGenerateAllTypes,
+// but this allows us to be self contained.
+#define FORCE_TH_HALF
+#include "generic/CTensorCopy.c"
+#include "THCGenerateAllTypes.h"
+#undef FORCE_TH_HALF
+#include "generic/CTensor.c"
+#include "THCGenerateAllTypes.h"
diff --git a/Tensor.lua b/Tensor.lua
new file mode 100644
index 0000000..0029291
--- /dev/null
+++ b/Tensor.lua
@@ -0,0 +1,91 @@
+function torch.CudaTensor.apply(self, func)
+   local x = torch.FloatTensor(self:size()):copy(self)
+   x:apply(func)
+   self:copy(x)
+   return self
+end
+
+local function Tensor__type(self,type)
+   local current = torch.typename(self)
+   if not type then return current end
+   if type ~= current then
+      local new = torch.getmetatable(type).new()
+      if self:nElement() > 0 then
+         new:resize(self:size()):copy(self)
+      end
+      return new
+   else
+      return self
+   end
+end
+local function Tensor__typeAs(self,tensor)
+   return self:type(tensor:type())
+end
+
+local TensorTypes = {
+   float  = 'torch.FloatTensor',
+   half   = 'torch.HalfTensor',
+   double = 'torch.DoubleTensor',
+   byte   = 'torch.ByteTensor',
+   char   = 'torch.CharTensor',
+   int    = 'torch.IntTensor',
+   short  = 'torch.ShortTensor',
+   long   = 'torch.LongTensor',
+   cuda       = 'torch.CudaTensor',
+   cudaDouble = 'torch.CudaDoubleTensor',
+   cudaByte   = 'torch.CudaByteTensor',
+   cudaChar   = 'torch.CudaCharTensor',
+   cudaInt    = 'torch.CudaIntTensor',
+   cudaShort  = 'torch.CudaShortTensor',
+   cudaLong   = 'torch.CudaLongTensor'
+}
+
+if cutorch.hasHalf then
+    TensorTypes['cudaHalf'] = 'torch.CudaHalfTensor'
+end
+
+local function Tensor__converter(type)
+    return function(self)
+        return self:type(type)
+    end
+end
+
+for _, SrcType in pairs(TensorTypes) do
+    for FuncName, DstType in pairs(TensorTypes) do
+        rawset(torch.getmetatable(SrcType), FuncName, Tensor__converter(DstType))
+    end
+end
+
+for _, CudaTensorType in pairs(TensorTypes) do
+    local metatable = torch.getmetatable(CudaTensorType)
+    rawset(metatable, 'type', Tensor__type)
+    rawset(metatable, 'typeAs', Tensor__typeAs)
+    rawset(metatable, 'view', torch['view'])
+    for _,func in pairs{'expand', 'expandAs', 'viewAs', 'repeatTensor',
+                        'permute', 'split', 'chunk'} do
+        rawset(metatable, func, torch[func])
+    end
+end
+
+local CudaTensorTypes = {
+   float  = 'torch.CudaTensor',
+   double = 'torch.CudaDoubleTensor',
+   byte   = 'torch.CudaByteTensor',
+   char   = 'torch.CudaCharTensor',
+   int    = 'torch.CudaIntTensor',
+   short  = 'torch.CudaShortTensor',
+   long   = 'torch.CudaLongTensor'
+}
+
+if cutorch.hasHalf then
+   CudaTensorTypes['half'] = 'torch.CudaHalfTensor'
+end
+
+for ValueType, CudaTensorType in pairs(CudaTensorTypes) do
+  local function Tensor__totable(self)
+    local host_tensor = self[ValueType](self)
+    return host_tensor:totable()
+  end
+  rawset(torch.getmetatable(CudaTensorType), 'totable', Tensor__totable)
+end
+
diff --git a/TensorMath.lua b/TensorMath.lua
new file mode 100644
index 0000000..936d897
--- /dev/null
+++ b/TensorMath.lua
@@ -0,0 +1,2117 @@
+local wrap = require 'cwrap'
+
+local interface = wrap.CInterface.new()
+local method = wrap.CInterface.new()
+local argtypes = wrap.CInterface.argtypes
+
+argtypes['ptrdiff_t'] = {
+
+  helpname = function(arg)
+                return 'ptrdiff_t'
+             end,
+
+  declare = function(arg)
+               -- if it is a number we initialize here
+               local default = tonumber(tostring(arg.default)) or 0
+               return string.format("%s arg%d = %g;", 'ptrdiff_t', arg.i, default)
+            end,
+
+  check = function(arg, idx)
+             return string.format("lua_isinteger(L, %d)", idx)
+          end,
+
+  read = function(arg, idx)
+            return string.format("arg%d = (%s)lua_tointeger(L, %d);", arg.i, 'ptrdiff_t', idx)
+         end,
+
+  init = function(arg)
+            -- otherwise do it here
+            if arg.default then
+               local default = tostring(arg.default)
+               if not tonumber(default) then
+                  return string.format("arg%d = %s;", arg.i, default)
+               end
+            end
+         end,
+
+  carg = function(arg)
+            return string.format('arg%d', arg.i)
+         end,
+
+  creturn = function(arg)
+               return string.format('arg%d', arg.i)
+            end,
+
+  precall = function(arg)
+               if arg.returned then
+                  return string.format('lua_pushinteger(L, (lua_Integer)arg%d);', arg.i)
+               end
+            end,
+
+  postcall = function(arg)
+                if arg.creturned then
+                   return string.format('lua_pushinteger(L, (lua_Integer)arg%d);', arg.i)
+                end
+             end
+}
+
+interface:print('/* WARNING: autogenerated file */')
+interface:print('')
+interface:print('#include "THC.h"')
+interface:print('#include "luaT.h"')
+interface:print('#include "torch/utils.h"')
+interface:print('')
+interface:print('')
+
+interface:print([[
+static int torch_isnonemptytable(lua_State *L, int idx)
+{
+  int empty;
+  if (!lua_istable(L, idx)) return 0;
+
+  lua_rawgeti(L, idx, 1);
+  empty = lua_isnil(L, -1);
+  lua_pop(L, 1);
+  return !empty;
+}
+]])
+
+-- Lua 5.2 compatibility
+local unpack = unpack or table.unpack
+
+-- specific to CUDA
+local typenames = {
+   'CudaByteTensor',
+   'CudaCharTensor',
+   'CudaShortTensor',
+   'CudaIntTensor',
+   'CudaLongTensor',
+   'CudaTensor',
+   'CudaDoubleTensor',
+   'CudaHalfTensor'
+}
+
+for _, typename in ipairs(typenames) do
+-- cut and paste from wrap/types.lua
+wrap.types[typename] = {
+
+   helpname = function(arg)
+      if arg.dim then
+         return string.format('%s~%dD', typename, arg.dim)
+      else
+         return typename
+      end
+   end,
+
+   declare = function(arg)
+      local txt = {}
+      table.insert(txt, string.format("TH%s *arg%d = NULL;", typename, arg.i))
+      if arg.returned then
+         table.insert(txt, string.format("int arg%d_idx = 0;", arg.i));
+      end
+      return table.concat(txt, '\n')
+   end,
+
+   check = function(arg, idx)
+      if arg.dim then
+         return string.format('(arg%d = luaT_toudata(L, %d, "torch.%s")) && (arg%d->nDimension == %d)', arg.i, idx, typename, arg.i, arg.dim)
+      else
+         return string.format('(arg%d = luaT_toudata(L, %d, "torch.%s"))', arg.i, idx, typename)
+      end
+   end,
+
+   read = function(arg, idx)
+      if arg.returned then
+         return string.format("arg%d_idx = %d;", arg.i, idx)
+      end
+   end,
+
+   init = function(arg)
+      if type(arg.default) == 'boolean' then
+         return string.format('arg%d = TH%s_new(cutorch_getstate(L));', arg.i, typename)
+      elseif type(arg.default) == 'number' then
+         return string.format('arg%d = %s;', arg.i, arg.args[arg.default]:carg())
+      else
+         error('unknown default tensor type value')
+      end
+   end,
+
+   carg = function(arg)
+      return string.format('arg%d', arg.i)
+   end,
+
+   creturn = function(arg)
+      return string.format('arg%d', arg.i)
+   end,
+
+   precall = function(arg)
+      local txt = {}
+      if arg.default and arg.returned then
+         table.insert(txt, string.format('if(arg%d_idx)', arg.i)) -- means it was passed as arg
+         table.insert(txt, string.format('lua_pushvalue(L, arg%d_idx);', arg.i))
+         table.insert(txt, string.format('else'))
+         if type(arg.default) == 'boolean' then -- boolean: we did a new()
+            table.insert(txt, string.format('luaT_pushudata(L, arg%d, "torch.%s");', arg.i, typename))
+         else  -- otherwise: point on default tensor --> retain
+            table.insert(txt, string.format('{'))
+            table.insert(txt, string.format('TH%s_retain(arg%d);', typename, arg.i)) -- so we need a retain
+            table.insert(txt, string.format('luaT_pushudata(L, arg%d, "torch.%s");', arg.i, typename))
+            table.insert(txt, string.format('}'))
+         end
+      elseif arg.default then
+         -- we would have to deallocate the beast later if we did a new
+         -- unlikely anyways, so i do not support it for now
+         if type(arg.default) == 'boolean' then
+            error('a tensor cannot be optional if not returned')
+         end
+      elseif arg.returned then
+         table.insert(txt, string.format('lua_pushvalue(L, arg%d_idx);', arg.i))
+      end
+      return table.concat(txt, '\n')
+   end,
+
+   postcall = function(arg)
+      local txt = {}
+      if arg.creturned then
+         -- if a tensor is returned by a wrapped C function, the refcount semantics
+         -- are ambiguous (transfer ownership vs. shared ownership).
+         -- We never actually do this, so lets just not allow it.
+         error('a tensor cannot be creturned')
+      end
+      return table.concat(txt, '\n')
+   end
+}
+
+wrap.types[typename .. 'Array'] = {
+
+   helpname = function(arg)
+                 return string.format('{%s+}', typename)
+            end,
+
+   declare = function(arg)
+                local txt = {}
+                table.insert(txt, string.format('TH%s **arg%d_data = NULL;', typename, arg.i))
+                table.insert(txt, string.format('long arg%d_size = 0;', arg.i))
+                table.insert(txt, string.format('int arg%d_i = 0;', arg.i))
+                return table.concat(txt, '\n')
+           end,
+
+   check = function(arg, idx)
+              return string.format('torch_isnonemptytable(L, %d)', idx)
+         end,
+
+   read = function(arg, idx)
+             local txt = {}
+             -- Iterate over the array to find its length, leave elements on stack.
+             table.insert(txt, string.format('do'))
+             table.insert(txt, string.format('{'))
+             table.insert(txt, string.format('  arg%d_size++;', arg.i))
+             table.insert(txt, string.format('  lua_checkstack(L, 1);'))
+             table.insert(txt, string.format('  lua_rawgeti(L, %d, arg%d_size);', idx, arg.i))
+             table.insert(txt, string.format('}'))
+             table.insert(txt, string.format('while (!lua_isnil(L, -1));'))
+             table.insert(txt, string.format('arg%d_size--;', arg.i))
+             -- Pop nil element from stack.
+             table.insert(txt, string.format('lua_pop(L, 1);'))
+             -- Allocate tensor pointers and read values from stack backwards.
+             table.insert(txt, string.format('arg%d_data = (TH%s**)THAlloc(arg%d_size * sizeof(TH%s*));', arg.i, typename, arg.i, typename))
+             table.insert(txt, string.format('for (arg%d_i = arg%d_size - 1; arg%d_i >= 0; arg%d_i--)', arg.i, arg.i, arg.i, arg.i))
+             table.insert(txt, string.format('{'))
+             table.insert(txt, string.format('  if (!(arg%d_data[arg%d_i] = luaT_toudata(L, -1, "torch.%s")))', arg.i, arg.i, typename))
+             table.insert(txt, string.format('    luaL_error(L, "expected %s in tensor array");', typename))
+             table.insert(txt, string.format('  lua_pop(L, 1);'))
+             table.insert(txt, string.format('}'))
+             table.insert(txt, string.format(''))
+             return table.concat(txt, '\n')
+          end,
+
+   init = function(arg)
+          end,
+
+   carg = function(arg)
+             return string.format('arg%d_data,arg%d_size', arg.i, arg.i)
+          end,
+
+   creturn = function(arg)
+                error('TensorArray cannot be returned.')
+             end,
+
+   precall = function(arg)
+             end,
+
+   postcall = function(arg)
+                 return string.format('THFree(arg%d_data);', arg.i)
+              end
+}
+end
+
+local function interpretdefaultvalue(arg)
+    local default = arg.default
+    if type(default) == 'boolean' then
+        if default then
+            return '1'
+        else
+            return '0'
+        end
+    elseif type(default) == 'number' then
+        return tostring(default)
+    elseif type(default) == 'string' then
+        return default
+    elseif type(default) == 'function' then
+        default = default(arg)
+        assert(type(default) == 'string', 'a default function must return a string')
+        return default
+    elseif type(default) == 'nil' then
+        return nil
+    else
+        error('unknown default type value')
+    end
+end
+
+wrap.types.half = {
+
+    helpname = function(arg)
+        return "half"
+    end,
+
+    declare = function(arg)
+        -- if it is a number we initialize here
+        local default = tonumber(interpretdefaultvalue(arg)) or 0
+        return string.format("half arg%d = THC_float2half((float) %d);", arg.i, tonumber(default))
+    end,
+
+    check = function(arg, idx)
+        return string.format("lua_isnumber(L, %d)", idx)
+    end,
+
+    read = function(arg, idx)
+        return string.format("arg%d = THC_float2half((float) lua_tonumber(L, %d));", arg.i, idx)
+    end,
+
+    init = function(arg)
+        -- otherwise do it here
+        if arg.default then
+            local default = interpretdefaultvalue(arg)
+            if not tonumber(default) then
+                return string.format("arg%d = THC_float2half((float) %s);", arg.i, default)
+            end
+        end
+    end,
+
+    carg = function(arg)
+        return string.format('arg%d', arg.i)
+    end,
+
+    creturn = function(arg)
+        return string.format('arg%d', arg.i)
+    end,
+
+    precall = function(arg)
+        if arg.returned then
+            return string.format('lua_pushnumber(L, (lua_Number) THC_half2float(arg%d));', arg.i)
+        end
+    end,
+
+    postcall = function(arg)
+        if arg.creturned then
+            return string.format('lua_pushnumber(L, (lua_Number) THC_half2float(arg%d));', arg.i)
+        end
+    end
+
+}
+
+wrap.types.LongArg = {
+
+   vararg = true,
+
+   helpname = function(arg)
+      return "(LongStorage | dim1 [dim2...])"
+   end,
+
+   declare = function(arg)
+      return string.format("THLongStorage *arg%d = NULL;", arg.i)
+   end,
+
+   init = function(arg)
+      if arg.default then
+         error('LongArg cannot have a default value')
+      end
+   end,
+
+   check = function(arg, idx)
+      return string.format("cutorch_islongargs(L, %d)", idx)
+   end,
+
+   read = function(arg, idx)
+      return string.format("arg%d = cutorch_checklongargs(L, %d);", arg.i, idx)
+   end,
+
+   carg = function(arg, idx)
+      return string.format('arg%d', arg.i)
+   end,
+
+   creturn = function(arg, idx)
+      return string.format('arg%d', arg.i)
+   end,
+
+   precall = function(arg)
+      local txt = {}
+      if arg.returned then
+         table.insert(txt, string.format('luaT_pushudata(L, arg%d, "torch.LongStorage");', arg.i))
+      end
+      return table.concat(txt, '\n')
+   end,
+
+   postcall = function(arg)
+      local txt = {}
+      if arg.creturned then
+         -- this next line is actually debatable
+         table.insert(txt, string.format('THLongStorage_retain(arg%d);', arg.i))
+         table.insert(txt, string.format('luaT_pushudata(L, arg%d, "torch.LongStorage");', arg.i))
+      end
+      if not arg.returned and not arg.creturned then
+         table.insert(txt, string.format('THLongStorage_free(arg%d);', arg.i))
+      end
+      return table.concat(txt, '\n')
+   end
+}
+
+wrap.types.charoption = {
+
+   helpname = function(arg)
+                 if arg.values then
+                    return "(" .. table.concat(arg.values, '|') .. ")"
+                 end
+              end,
+
+   declare = function(arg)
+                local txt = {}
+                table.insert(txt, string.format("const char *arg%d = NULL;", arg.i))
+                if arg.default then
+                   table.insert(txt, string.format("char arg%d_default = '%s';", arg.i, arg.default))
+                end
+                return table.concat(txt, '\n')
+           end,
+
+   init = function(arg)
+             return string.format("arg%d = &arg%d_default;", arg.i, arg.i)
+          end,
+
+   check = function(arg, idx)
+              local txt = {}
+              local txtv = {}
+              table.insert(txt, string.format('(arg%d = lua_tostring(L, %d)) && (', arg.i, idx))
+              for _,value in ipairs(arg.values) do
+                 table.insert(txtv, string.format("*arg%d == '%s'", arg.i, value))
+              end
+              table.insert(txt, table.concat(txtv, ' || '))
+              table.insert(txt, ')')
+              return table.concat(txt, '')
+         end,
+
+   read = function(arg, idx)
+          end,
+
+   carg = function(arg, idx)
+             return string.format('arg%d', arg.i)
+          end,
+
+   creturn = function(arg, idx)
+             end,
+
+   precall = function(arg)
+             end,
+
+   postcall = function(arg)
+              end
+}
+
+cutorch_state_code = function(varname)
+  local txt = {}
+  table.insert(txt, 'lua_getglobal(L, "cutorch");')
+  table.insert(txt, 'lua_getfield(L, -1, "_state");')
+  table.insert(txt, string.format('THCState *%s = lua_touserdata(L, -1);', varname))
+  table.insert(txt, 'lua_pop(L, 2);')
+  return table.concat(txt, '\n');
+end
+interface:registerDefaultArgument(cutorch_state_code)
+method:registerDefaultArgument(cutorch_state_code)
+
+local function wrap(...)
+   local args = {...}
+
+   -- interface
+   interface:wrap(...)
+
+   -- method: we override things possibly in method table field
+   for _,x in ipairs(args) do
+      if type(x) == 'table' then -- ok, now we have a list of args
+         for _, arg in ipairs(x) do
+            if arg.method then
+               for k,v in pairs(arg.method) do
+                  if v == 'nil' then -- special case, we erase the field
+                     arg[k] = nil
+                  else
+                     arg[k] = v
+                  end
+               end
+            end
+         end
+      end
+   end
+   method:wrap(unpack(args))
+end
+
+local Tensor
+
+-- functions to help take in arguments that are Tensor or CudaLongTensor (for backward compatibility)
+-- used in scatter / gather for example
+local function TensorToCudaLong_declare(dummy)
+      return function(arg)
+	 local txt = {}
+	 table.insert(txt, string.format("THCudaLongTensor *arg%d = NULL;", arg.i))
+	 if dummy then
+	    table.insert(txt, string.format("THCudaLongTensor *indexLongTensor = NULL;"))
+	    table.insert(txt, string.format("TH%s *dummyIndexTensor = NULL;", Tensor))
+	 end
+	 return table.concat(txt, '\n')
+      end
+end
+local function TensorToCudaLong_check(arg, idx)
+   return string.format('(dummyIndexTensor = luaT_toudata(L, %d, "torch.%s"))', idx, Tensor)
+end
+local function TensorToCudaLong_read(arg, idx)
+   local copyname = Tensor:match("(%a+)Tensor")
+   if copyname == 'Cuda' then
+      copyname = 'CudaFloat'
+   end
+   local txt = {}
+   table.insert(txt, string.format('arg%d = THCudaLongTensor_new(default_arg1);', arg.i))
+   table.insert(txt, string.format('THLongStorage *indexSize = TH%s_newSizeOf(default_arg1, dummyIndexTensor);', Tensor))
+   table.insert(txt, string.format('THCudaLongTensor_resize(default_arg1, arg%d, indexSize, NULL);', arg.i))
+   table.insert(txt, string.format('THLongStorage_free(indexSize);'))
+   table.insert(txt, string.format('THCudaLongTensor_copy%s(default_arg1, arg%d, dummyIndexTensor);', copyname, arg.i))
+   table.insert(txt, string.format('indexLongTensor = arg%d;', arg.i))
+   return table.concat(txt, '\n')
+end
+
+local function TensorToCudaLong_postcall(arg)
+   return "if (indexLongTensor != NULL) THCudaLongTensor_free(default_arg1, indexLongTensor);\n"
+end
+
+-- function to initialize the gather call
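+-- (extends the default init with a checkGPU on the CudaLongTensor index
+-- argument and resizes the result tensor to the index tensor's size)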
+local function gatherInit(arg)
+   return table.concat(
+      {
+	 arg.__metatable.init(arg),
+	 string.format("TH%s_checkGPU(cutorch_getstate(L), 1, %s);",
+		       Tensor, arg.args[4]:carg()),
+	 string.format(
+	    [[
+		  THCState *state = cutorch_getstate(L);
+		  THLongStorage *indicesSize = THCudaLongTensor_newSizeOf(state, %s);
+		  TH%s_resize(state, %s, indicesSize, NULL);
+		  THLongStorage_free(indicesSize);
+	    ]], arg.args[4]:carg(), Tensor, arg:carg()),
+      }, '\n')
+end
+
+--
+-- Non-CudaTensor type math, since these are less fully implemented than
+-- CudaTensor
+--
+
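+-- Three parallel lists: for each tensor type below, the corresponding element
+-- type ('real') and the accumulation type ('accreal') used by reductions such
+-- as sum, prod and mean.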
+local handledTypenames = {
+   'CudaByteTensor',
+   'CudaCharTensor',
+   'CudaShortTensor',
+   'CudaIntTensor',
+   'CudaLongTensor',
+   'CudaDoubleTensor',
+   'CudaHalfTensor',
+}
+local handledTypereals = {
+   'unsigned char',
+   'char',
+   'short',
+   'int',
+   'long',
+   'double',
+   'half'
+}
+local handledTypeaccreals = {
+   'long',
+   'long',
+   'long',
+   'long',
+   'long',
+   'double',
+   'float'
+}
+
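+-- Generate the full wrapper set once per remaining CUDA tensor type; the
+-- CudaHalfTensor bindings are fenced with #ifdef CUDA_HALF_TENSOR so they
+-- drop out of the build when half support is not available.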
+for k, Tensor_ in pairs(handledTypenames) do
+    Tensor = Tensor_
+    if Tensor == 'CudaHalfTensor' then
+        interface:print("#ifdef CUDA_HALF_TENSOR")
+    end
+    local real = handledTypereals[k]
+    local accreal = handledTypeaccreals[k]
+
+    function interface.luaname2wrapname(self, name)
+        return string.format('cutorch_%s_%s', Tensor, name)
+    end
+
+    function method.luaname2wrapname(self, name)
+        return string.format('m_cutorch_%s_%s', Tensor, name)
+    end
+
+    local function cname(name)
+        return string.format('TH%s_%s', Tensor, name)
+    end
+
+    local function lastdim(argn)
+        return function(arg)
+            return string.format('TH%s_nDimension(cutorch_getstate(L), %s)',
+                                 Tensor, arg.args[argn]:carg())
+        end
+    end
+
+    local function lastdimarray(argn)
+        return function(arg)
+            return string.format('TH%s_nDimension(cutorch_getstate(L), arg%d_data[0])',
+                                 Tensor, arg.args[argn].i)
+        end
+    end
+
+    wrap("fill",
+         cname("fill"),
+         {{name=Tensor, returned=true},
+             {name=real}})
+
+    wrap("zero",
+         cname("zero"),
+         {{name=Tensor, returned=true}})
+
+    wrap("zeros",
+         cname("zeros"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name="LongArg"}})
+
+    wrap("ones",
+         cname("ones"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name="LongArg"}})
+
+    wrap("reshape",
+         cname("reshape"),
+         {{name=Tensor, default=true, returned=true},
+            {name=Tensor},
+            {name="LongArg"}})
+
+    wrap("numel",
+         cname("numel"),
+         {{name=Tensor},
+            {name="ptrdiff_t", creturned=true}})
+
+    wrap("add",
+         cname("add"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real}},
+         cname("cadd"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real, default=1},
+            {name=Tensor}})
+
+    wrap("csub",
+         cname("sub"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real}},
+         cname("csub"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real, default=1},
+            {name=Tensor}})
+
+    wrap("mul",
+         cname("mul"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real}})
+
+    wrap("clamp",
+         cname("clamp"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+          {name=Tensor, method={default=1}},
+          {name=real},
+          {name=real}})
+
+    wrap("cross",
+        cname("cross"),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor},
+         {name=Tensor},
+         {name="index", default=0}})
+
+    wrap("div",
+         cname("div"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real}})
+
+    wrap("fmod",
+         cname("fmod"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real}})
+
+    wrap("remainder",
+         cname("remainder"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real}})
+
+    wrap("equal",
+         cname("equal"),
+         {{name=Tensor},
+          {name=Tensor},
+          {name="boolean", creturned=true}})
+
+    for _, name in ipairs({"cmul", "cpow", "cdiv", "cremainder", "cfmod"}) do
+       wrap(name,
+            cname(name),
+            {{name=Tensor, default=true, returned=true, method={default='nil'}},
+               {name=Tensor, method={default=1}},
+               {name=Tensor}})
+    end
+
+    wrap("addcmul",
+         cname("addcmul"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real, default=1},
+            {name=Tensor},
+            {name=Tensor}})
+
+    wrap("addcdiv",
+         cname("addcdiv"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real, default=1},
+            {name=Tensor},
+            {name=Tensor}})
+
+    for _,name in ipairs({"min", "max"}) do
+       wrap(name,
+            cname(name .. "all"),
+            {{name=Tensor},
+               {name=real, creturned=true}},
+            cname(name),
+            {{name=Tensor, default=true, returned=true},
+               {name='CudaLongTensor', default=true, returned=true},
+               {name=Tensor},
+               {name="index"}})
+    end
+
+    for _,name in ipairs({"cmin", "cmax"}) do
+       wrap(name,
+            cname(name),
+            {{name=Tensor, default=true, returned=true},
+             {name=Tensor, method={default=1}},
+             {name=Tensor}},
+            cname(name .. "Value"),
+            {{name=Tensor, default=true, returned=true},
+             {name=Tensor, method={default=1}},
+             {name=real}})
+    end
+
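+    -- all/any are only generated for CudaByteTensor; they bind to the
+    -- logicalall/logicalany C functions.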
+    if Tensor == 'CudaByteTensor' then
+       for _,name in pairs({'all', 'any'}) do
+          wrap(name,
+               cname('logical' .. name),
+               {{name=Tensor},
+                  {name="boolean", creturned=true}})
+       end
+    end
+
+   for _,name in pairs({'lt','gt','le','ge','eq','ne'}) do
+      wrap(name,
+           cname(name .. 'Value'),
+           {{name='CudaByteTensor',default=true, returned=true},
+            {name=Tensor},
+            {name=real}},
+           cname(name .. 'ValueT'),
+           {{name=Tensor, returned=true},
+            {name=Tensor},
+            {name=real}},
+           cname(name .. 'Tensor'),
+           {{name='CudaByteTensor',default=true, returned=true},
+            {name=Tensor},
+            {name=Tensor}},
+           cname(name .. 'TensorT'),
+           {{name=Tensor, returned=true},
+            {name=Tensor},
+            {name=Tensor}})
+   end
+
+    wrap("sum",
+         cname("sumall"),
+         {{name=Tensor},
+            {name=accreal, creturned=true}},
+         cname("sum"),
+         {{name=Tensor, default=true, returned=true},
+            {name=Tensor},
+            {name="index"}})
+
+    for _, name in ipairs({"cumsum", "cumprod"}) do
+        wrap(name,
+             cname(name),
+             {{name=Tensor, default=true, returned=true},
+                 {name=Tensor},
+                 {name="index", default=1}})
+    end
+
+    wrap("prod",
+         cname("prodall"),
+         {{name=Tensor},
+            {name=accreal, creturned=true}},
+         cname("prod"),
+         {{name=Tensor, default=true, returned=true},
+            {name=Tensor},
+            {name="index"}})
+
+    wrap("mean",
+         cname("meanall"),
+         {{name=Tensor},
+          {name=accreal, creturned=true}},
+         cname("mean"),
+         {{name=Tensor, default=true, returned=true},
+            {name=Tensor},
+            {name="index"}})
+
+    wrap("maskedFill",
+         cname("maskedFill"),
+         {{name=Tensor, returned=true, method={default='nil'}},
+            {name='CudaByteTensor'},
+            {name=real}})
+
+    wrap("maskedCopy",
+         cname("maskedCopy"),
+         {{name=Tensor, returned=true, method={default='nil'}},
+            {name='CudaByteTensor'},
+            {name=Tensor}})
+
+    wrap("maskedSelect",
+         cname("maskedSelect"),
+         {{name=Tensor, returned=true, default=true},
+            {name=Tensor},
+            {name='CudaByteTensor'}})
+
+    wrap("gather",
+	 cname("gather"),
+	 {{name=Tensor, default=true, returned=true, init=gatherInit},
+	    {name=Tensor},
+	    {name="index"},
+	    {name='CudaLongTensor'}},
+	 cname("gather"), -- this is for backward-compatibility, and takes in "Tensor" as the indexing tensor
+	 {{name=Tensor, default=true, returned=true, init=gatherInit},
+	    {name=Tensor},
+	    {name="index"},
+	    {name=Tensor, declare=TensorToCudaLong_declare(true), check=TensorToCudaLong_check, read=TensorToCudaLong_read, postcall=TensorToCudaLong_postcall}})
+
+    wrap("scatter",
+	 cname("scatter"),
+	 {{name=Tensor, returned=true},
+	    {name="index"},
+	    {name='CudaLongTensor'},
+	    {name=Tensor}},
+	 cname("scatter"), -- this is for backward-compatibility, and takes in "Tensor" as the indexing tensor
+	 {{name=Tensor, returned=true},
+	    {name="index"},
+	    {name=Tensor, declare=TensorToCudaLong_declare(true), check=TensorToCudaLong_check, read=TensorToCudaLong_read, postcall=TensorToCudaLong_postcall},
+	    {name=Tensor}},
+	 cname("scatterFill"),
+	 {{name=Tensor, returned=true},
+	    {name="index"},
+	    {name='CudaLongTensor'},
+	    {name=real}},
+	 cname("scatterFill"), -- this is for backward-compatibility, and takes in "Tensor" as the indexing tensor
+	 {{name=Tensor, returned=true},
+	    {name="index"},
+	    {name=Tensor, declare=TensorToCudaLong_declare(false), check=TensorToCudaLong_check, read=TensorToCudaLong_read, postcall=TensorToCudaLong_postcall},
+	    {name=real}}
+    )
+
+    wrap("sort",
+         cname("sort"),
+         {{name=Tensor, default=true, returned=true},
+             {name="CudaLongTensor", default=true, returned=true, noreadadd=true},
+             {name=Tensor},
+             {name="index", default=lastdim(3)},
+             {name="boolean", default=0}}
+    )
+
+    wrap("squeeze",
+         cname("squeeze"),
+         {{name=Tensor, default=true, returned=true, postcall=function(arg)
+              local txt = {}
+              if arg.returned then
+                 table.insert(txt, string.format('if(arg%d->nDimension == 1 && arg%d->size[0] == 1)', arg.i, arg.i)) -- number
+                 if Tensor == 'CudaHalfTensor' then
+                    table.insert(txt, string.format('lua_pushnumber(L, (lua_Number)THC_half2float(TH%s_get1d(cutorch_getstate(L), arg%d, 0)));', Tensor, arg.i))
+                 else
+                    table.insert(txt, string.format('lua_pushnumber(L, (lua_Number)(TH%s_get1d(cutorch_getstate(L), arg%d, 0)));', Tensor, arg.i))
+                 end
+              end
+              return table.concat(txt, '\n')
+          end},
+            {name=Tensor}},
+         cname("squeeze1d"),
+         {{name=Tensor, default=true, returned=true,
+           postcall=
+              function(arg)
+                 local txt = {}
+                 if arg.returned then
+                    table.insert(txt, string.format('if(!hasdims && arg%d->nDimension == 1 && arg%d->size[0] == 1)', arg.i, arg.i)) -- number
+                    if Tensor == 'CudaHalfTensor' then
+                       table.insert(txt, string.format('lua_pushnumber(L, (lua_Number)THC_half2float(TH%s_get1d(cutorch_getstate(L), arg%d, 0)));}', Tensor, arg.i))
+                    else
+                       table.insert(txt, string.format('lua_pushnumber(L, (lua_Number)(TH%s_get1d(cutorch_getstate(L), arg%d, 0)));}', Tensor, arg.i))
+                    end
+                 end
+                 return table.concat(txt, '\n')
+          end},
+
+            {name=Tensor,
+             precall=
+                function(arg)
+                   return string.format('{int hasdims = arg%d->nDimension > 1;', arg.i)
+            end},
+            {name="index"}})
+
+    wrap("abs",
+         cname("abs"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+             {name=Tensor, method={default=1}}})
+
+    wrap("sign",
+         cname("sign"),
+         {{name=Tensor, default=true, returned=true, method={default='nil'}},
+	    {name=Tensor, method={default=1}}})
+
+    wrap("cat",
+	 cname("cat"),
+	 {{name=Tensor, default=true, returned=true},
+	    {name=Tensor},
+	    {name=Tensor},
+	    {name="index", default=-1}},
+	 cname("catArray"),
+	 {{name=Tensor, default=true, returned=true},
+	    {name=Tensor .. "Array"},
+	    {name="index", default=-1}})
+
+    wrap("geometric",
+        cname("geometric"),
+        {{name=Tensor, returned=true},
+            {name='double'}})
+
+    wrap("bernoulli",
+        cname("bernoulli"),
+        {{name=Tensor, returned=true},
+            {name='double', default=0.5}},
+        cname("bernoulli_FloatTensor"),
+        {{name=Tensor, returned=true},
+            {name="CudaTensor"}},
+        cname("bernoulli_DoubleTensor"),
+        {{name=Tensor, returned=true},
+            {name="CudaDoubleTensor"}})
+
+    wrap("nonzero",
+         cname("nonzero"),
+         {{name="CudaLongTensor", default=true, returned=true},
+             {name=Tensor}})
+
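+    -- The remaining wrappers (pointwise transcendental ops, random sampling,
+    -- norms and statistics, tril/triu/diag/trace, lerp and the BLAS calls)
+    -- are generated only for floating-point element types.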
+    if real == 'float' or real == 'double' or real == 'half' then
+       for _,name in ipairs({"log", "log1p", "exp",
+                             "cos", "acos", "cosh",
+                             "sin", "asin", "sinh",
+                             "tan", "atan", "tanh",
+                             "sqrt", "rsqrt", "sigmoid",
+                             "cinv", "ceil", "floor",
+                             "neg", "round", "trunc", "frac"}) do
+
+          wrap(name,
+               cname(name),
+               {{name=Tensor, default=true, returned=true, method={default='nil'}},
+                  {name=Tensor, method={default=1}}})
+
+       end
+
+       wrap("pow",
+            cname("pow"),
+            {{name=Tensor, default=true, returned=true, method={default='nil'}},
+                {name=Tensor, method={default=1}},
+                {name=real}},
+            cname("tpow"),
+            {{name=Tensor, default=true, returned=true, method={default='nil'}},
+                {name = real},
+                {name=Tensor, method={default=1}}})
+
+     wrap("rand",
+          cname("rand"),
+          {{name=Tensor, default=true, returned=true, method={default='nil'}},
+           {name="LongArg"}})
+
+     wrap("randn",
+          cname("randn"),
+          {{name=Tensor, default=true, returned=true, method={default='nil'}},
+           {name="LongArg"}})
+
+     wrap("multinomial",
+          cname("multinomial"),
+          {{name='CudaLongTensor', default=true, returned=true, method={default='nil'}},
+           {name=Tensor},
+           {name="int"},
+           {name="boolean", default=false}})
+
+     for _,f in ipairs({{name='uniform', a=0, b=1},
+                        {name='cauchy', a=0, b=1},
+                        {name='normal', a=0, b=1},
+                        {name='logNormal', a=1, b=2}}) do
+
+        wrap(f.name,
+             cname(f.name),
+             {{name=Tensor, returned=true},
+              {name='double', default=f.a},
+              {name='double', default=f.b}})
+     end
+
+     wrap('exponential',
+          cname('exponential'),
+          {{name=Tensor, returned=true},
+           {name='double', default=nil}})
+
+      wrap("norm",
+           cname("normall"),
+           {{name=Tensor},
+            {name=real, default=2},
+            {name=accreal, creturned=true}},
+           cname("norm"),
+           {{name=Tensor, default=true, returned=true},
+            {name=Tensor},
+            {name=real},
+            {name="index"}})
+
+      wrap("renorm",
+           cname("renorm"),
+          {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real},
+            {name="index"},
+            {name=real}})
+
+      wrap("dist",
+           cname("dist"),
+           {{name=Tensor},
+               {name=Tensor},
+               {name=real, default=2},
+               {name=accreal, creturned=true}})
+
+
+      for _,name in ipairs({"var", "std"}) do
+         wrap(name,
+              cname(name .. "all"),
+              {{name=Tensor},
+               {name=accreal, creturned=true}},
+              cname(name),
+              {{name=Tensor, default=true, returned=true},
+               {name=Tensor},
+               {name="index"},
+               {name="boolean", default=false}})
+      end
+
+      wrap("tril",
+           cname("tril"),
+           {{name=Tensor, default=true, returned=true},
+               {name=Tensor},
+               {name="int", default=0}})
+
+      wrap("triu",
+           cname("triu"),
+           {{name=Tensor, default=true, returned=true},
+               {name=Tensor},
+               {name="int", default=0}})
+
+      wrap("diag",
+           cname("diag"),
+           {{name=Tensor, default=true, returned=true},
+               {name=Tensor},
+               {name="int", default=0}})
+
+      wrap("trace",
+           cname("trace"),
+           {{name=Tensor},
+               {name=accreal, creturned=true}})
+
+      wrap("lerp",
+        cname("lerp"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=Tensor},
+         {name=real}})
+
+       -- BLAS functions
+       wrap("mv",
+            cname("addmv"),
+            {{name=Tensor, default=true, returned=true, method={default='nil'},
+              init=function(arg)
+                 return table.concat(
+                    {
+                       arg.__metatable.init(arg),
+                       string.format("TH%s_checkGPU(cutorch_getstate(L), 1, %s);",
+                                     Tensor, arg.args[5]:carg()),
+                       string.format("TH%s_resize1d(cutorch_getstate(L), %s, %s->size[0]);", Tensor, arg:carg(), arg.args[5]:carg())
+                    }, '\n')
+              end,
+              precall=function(arg)
+                 return table.concat(
+                    {
+                       string.format("TH%s_zero(cutorch_getstate(L), %s);", Tensor, arg:carg()),
+                       arg.__metatable.precall(arg)
+                    }, '\n')
+              end
+             },
+               {name=real, default=1, invisible=true},
+               {name=Tensor, default=1, invisible=true},
+               {name=real, default=1, invisible=true},
+               {name=Tensor, dim=2},
+               {name=Tensor, dim=1}}
+       )
+
+       wrap("mm",
+            cname("addmm"),
+            {{name=Tensor, default=true, returned=true, method={default='nil'},
+              init=function(arg)
+                 return table.concat(
+                    {
+                       arg.__metatable.init(arg),
+                       string.format("TH%s_checkGPU(cutorch_getstate(L), 2, %s, %s);",
+                                     Tensor, arg.args[5]:carg(), arg.args[6]:carg()),
+                       string.format("TH%s_resize2d(cutorch_getstate(L), %s, %s->size[0], %s->size[1]);",
+                                     Tensor, arg:carg(), arg.args[5]:carg(), arg.args[6]:carg())
+                    }, '\n')
+              end,
+             },
+               {name=real, default=0, invisible=true},
+               {name=Tensor, default=1, invisible=true},
+               {name=real, default=1, invisible=true},
+               {name=Tensor, dim=2},
+               {name=Tensor, dim=2}}
+       )
+
+       wrap("bmm",
+            cname("baddbmm"),
+            {{name=Tensor, default=true, returned=true, method={default='nil'},
+              init=function(arg)
+                 return table.concat(
+                    {
+                       arg.__metatable.init(arg),
+                       string.format("TH%s_checkGPU(cutorch_getstate(L), 2, %s, %s);",
+                                     Tensor, arg.args[5]:carg(), arg.args[6]:carg()),
+                       string.format("TH%s_resize3d(cutorch_getstate(L), %s, %s->size[0], %s->size[1], %s->size[2]);",
+                                     Tensor, arg:carg(), arg.args[5]:carg(), arg.args[5]:carg(), arg.args[6]:carg())
+                    }, '\n')
+              end,
+             },
+               {name=real, default=0, invisible=true},
+               {name=Tensor, default=1, invisible=true},
+               {name=real, default=1, invisible=true},
+               {name=Tensor, dim=3},
+               {name=Tensor, dim=3}}
+       )
+
+       wrap("ger",
+            cname("addr"),
+            {{name=Tensor, default=true, returned=true, method={default='nil'},
+              init=function(arg)
+                 return table.concat(
+                    {
+                       arg.__metatable.init(arg),
+                       string.format("TH%s_checkGPU(cutorch_getstate(L), 2, %s, %s);",
+                                     Tensor, arg.args[5]:carg(), arg.args[6]:carg()),
+                       string.format("TH%s_resize2d(cutorch_getstate(L), %s, %s->size[0], %s->size[0]);", Tensor, arg:carg(), arg.args[5]:carg(), arg.args[6]:carg())
+                    }, '\n')
+              end,
+              precall=function(arg)
+                 return table.concat(
+                    {
+                       string.format("TH%s_zero(cutorch_getstate(L), %s);", Tensor, arg:carg()),
+                       arg.__metatable.precall(arg)
+                    }, '\n')
+              end
+             },
+               {name=real, default=1, invisible=true},
+               {name=Tensor, default=1, invisible=true},
+               {name=real, default=1, invisible=true},
+               {name=Tensor, dim=1},
+               {name=Tensor, dim=1}}
+       )
+
+       for _,f in ipairs({
+             {name="addmv",   dim1=1, dim2=2, dim3=1},
+             {name="addmm",   dim1=2, dim2=2, dim3=2},
+             {name="addr",    dim1=2, dim2=1, dim3=1},
+             {name="baddbmm", dim1=3, dim2=3, dim3=3},
+             {name="addbmm",  dim1=2, dim2=3, dim3=3},
+                         }
+       ) do
+
+          interface:wrap(f.name,
+                         cname(f.name),
+                         {{name=Tensor, default=true, returned=true},
+                            {name=real, default=1},
+                            {name=Tensor, dim=f.dim1},
+                            {name=real, default=1},
+                            {name=Tensor, dim=f.dim2},
+                            {name=Tensor, dim=f.dim3}})
+
+          -- there is an ambiguity here, hence the more complicated setup
+          method:wrap(f.name,
+                      cname(f.name),
+                      {{name=Tensor, returned=true, dim=f.dim1},
+                         {name=real, default=1, invisible=true},
+                         {name=Tensor, default=1, dim=f.dim1},
+                         {name=real, default=1},
+                         {name=Tensor, dim=f.dim2},
+                         {name=Tensor, dim=f.dim3}},
+                      cname(f.name),
+                      {{name=Tensor, returned=true, dim=f.dim1},
+                         {name=real},
+                         {name=Tensor, default=1, dim=f.dim1},
+                         {name=real},
+                         {name=Tensor, dim=f.dim2},
+                         {name=Tensor, dim=f.dim3}})
+       end
+    end
+
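+    -- LAPACK-style solvers and factorizations are generated only for float
+    -- and double.  Each one comes in two overloads: writing into
+    -- caller-supplied result tensors, or allocating invisible defaults.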
+    if real == 'float' or real == 'double' then
+
+        for _,name in ipairs({"gesv", "gels"}) do
+           wrap(name,
+                cname(name),
+                {{name=Tensor, returned=true},
+                 {name=Tensor, returned=true},
+                 {name=Tensor},
+                 {name=Tensor}},
+                cname(name),
+                {{name=Tensor, default=true, returned=true, invisible=true},
+                 {name=Tensor, default=true, returned=true, invisible=true},
+                 {name=Tensor},
+                 {name=Tensor}})
+        end
+
+        wrap("symeig",
+             cname("syev"),
+             {{name=Tensor, returned=true},
+              {name=Tensor, returned=true},
+              {name=Tensor},
+              {name='charoption', values={'N', 'V'}, default='N'},
+              {name='charoption', values={'U', 'L'}, default='U'}},
+             cname("syev"),
+             {{name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor},
+              {name='charoption', values={'N', 'V'}, default='N'},
+              {name='charoption', values={'U', 'L'}, default='U'}})
+
+        wrap("eig",
+             cname("geev"),
+             {{name=Tensor, returned=true},
+              {name=Tensor, returned=true},
+              {name=Tensor},
+              {name='charoption', values={'N', 'V'}, default='N'}},
+             cname("geev"),
+             {{name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor},
+              {name='charoption', values={'N', 'V'}, default='N'}})
+
+        wrap("svd",
+             cname("gesvd"),
+             {{name=Tensor, returned=true},
+              {name=Tensor, returned=true},
+              {name=Tensor, returned=true},
+              {name=Tensor},
+              {name='charoption', values={'A', 'S'}, default='S'}},
+             cname("gesvd"),
+             {{name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor},
+              {name='charoption', values={'A', 'S'}, default='S'}})
+
+        wrap("inverse",
+             cname("getri"),
+             {{name=Tensor, returned=true},
+              {name=Tensor}},
+             cname("getri"),
+             {{name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor}})
+
+        wrap("potri",
+             cname("potri"),
+             {{name=Tensor, returned=true},
+              {name=Tensor},
+              {name='charoption', values={'U', 'L'}, default='U'}},
+             cname("potri"),
+             {{name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor},
+              {name='charoption', values={'U', 'L'}, default='U'}})
+
+        wrap("potrf",
+             cname("potrf"),
+             {{name=Tensor, returned=true},
+              {name=Tensor},
+              {name='charoption', values={'U', 'L'}, default='U'}},
+             cname("potrf"),
+             {{name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor},
+              {name='charoption', values={'U', 'L'}, default='U'}})
+
+        wrap("potrs",
+             cname("potrs"),
+             {{name=Tensor, returned=true},
+              {name=Tensor},
+              {name=Tensor},
+              {name='charoption', values={'U', 'L'}, default='U'}},
+             cname("potrs"),
+             {{name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor},
+              {name=Tensor},
+              {name='charoption', values={'U', 'L'}, default='U'}})
+
+        wrap("qr",
+             cname("qr"),
+             {{name=Tensor, returned=true},
+              {name=Tensor, returned=true},
+              {name=Tensor}},
+             cname("qr"),
+             {{name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor, default=true, returned=true, invisible=true},
+              {name=Tensor}})
+
+    end
+
+    wrap("dot",
+         cname("dot"),
+         {{name=Tensor},
+            {name=Tensor},
+            {name=accreal, creturned=true}})
+
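+    -- Flush what has been accumulated for this type: methods go into
+    -- m_cutorch_<Tensor>Math__, free functions into cutorch_<Tensor>Math__,
+    -- and the emitted init function installs both on the torch.<Tensor>
+    -- metatable.  clearhistory resets the method wrapper, so the THCState
+    -- default argument is registered again for the next type.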
+    method:register("m_cutorch_" .. Tensor .. "Math__")
+    interface:print(method:tostring())
+    method:clearhistory()
+    method:registerDefaultArgument(cutorch_state_code)
+    interface:register("cutorch_" .. Tensor .. "Math__")
+
+    interface:print(string.format([[
+void cutorch_%sMath_init(lua_State *L)
+{
+  luaT_pushmetatable(L, "torch.%s");
+
+  /* register methods */
+  luaL_setfuncs(L, m_cutorch_%sMath__, 0);
+
+  /* register functions into the "torch" field of the tensor metaclass */
+  lua_pushstring(L, "torch");
+  lua_newtable(L);
+  luaL_setfuncs(L, cutorch_%sMath__, 0);
+  lua_rawset(L, -3);
+  lua_pop(L, 1);
+}
+]], Tensor, Tensor, Tensor, Tensor))
+
+    if Tensor == 'CudaHalfTensor' then
+        interface:print("#endif")
+    end
+end
+
+
+--
+-- CudaTensor special handling, since it is more fully implemented
+--
+
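+-- Re-run the same generation with Tensor bound to CudaTensor (real = float).
+-- This type also gets wrappers that are not emitted for the other element
+-- types above, e.g. topk and atan2.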
+Tensor = "CudaTensor"
+local real = "float"
+
+function interface.luaname2wrapname(self, name)
+   return string.format('cutorch_%s_%s', Tensor, name)
+end
+
+function method.luaname2wrapname(self, name)
+    return string.format('m_cutorch_%s_%s', Tensor, name)
+end
+
+local function cname(name)
+   return string.format('TH%s_%s', Tensor, name)
+end
+
+local function lastdim(argn)
+   return function(arg)
+       return string.format('TH%s_nDimension(cutorch_getstate(L), %s)',
+                            Tensor, arg.args[argn]:carg())
+   end
+end
+
+local function lastdimarray(argn)
+   return function(arg)
+       return string.format('TH%s_nDimension(cutorch_getstate(L), arg%d_data[0])',
+                            Tensor, arg.args[argn].i)
+   end
+end
+
+wrap("zero",
+     cname("zero"),
+     {{name=Tensor, returned=true}})
+
+wrap("fill",
+     cname("fill"),
+     {{name=Tensor, returned=true},
+         {name=real}})
+
+wrap("zeros",
+     cname("zeros"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+        {name="LongArg"}})
+
+wrap("ones",
+     cname("ones"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+        {name="LongArg"}})
+
+wrap("reshape",
+     cname("reshape"),
+     {{name=Tensor, default=true, returned=true},
+        {name=Tensor},
+        {name="LongArg"}})
+
+wrap("numel",
+     cname("numel"),
+     {{name=Tensor},
+        {name="long", creturned=true}})
+
+wrap("add",
+     cname("add"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+      {name=Tensor, method={default=1}},
+      {name=real}},
+     cname("cadd"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+        {name=Tensor, method={default=1}},
+        {name=real, default=1},
+        {name=Tensor}})
+
+
+wrap("csub",
+     cname("sub"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+      {name=Tensor, method={default=1}},
+      {name=real}},
+     cname("csub"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+        {name=Tensor, method={default=1}},
+        {name=real, default=1},
+        {name=Tensor}})
+
+wrap("mul",
+     cname("mul"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+        {name=Tensor, method={default=1}},
+        {name=real}})
+
+wrap("div",
+     cname("div"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+        {name=Tensor, method={default=1}},
+        {name=real}})
+
+wrap("fmod",
+     cname("fmod"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+        {name=Tensor, method={default=1}},
+        {name=real}})
+
+wrap("remainder",
+     cname("remainder"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+        {name=Tensor, method={default=1}},
+        {name=real}})
+
+wrap("equal",
+     cname("equal"),
+     {{name=Tensor},
+      {name=Tensor},
+      {name="boolean", creturned=true}})
+
+for _, name in ipairs({"cmul", "cpow", "cdiv", "cremainder", "cfmod"}) do
+  wrap(name,
+       cname(name),
+       {{name=Tensor, default=true, returned=true, method={default='nil'}},
+          {name=Tensor, method={default=1}},
+        {name=Tensor}})
+end
+
+wrap("addcmul",
+     cname("addcmul"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+        {name=Tensor, method={default=1}},
+        {name=real, default=1},
+        {name=Tensor},
+        {name=Tensor}})
+
+wrap("addcdiv",
+     cname("addcdiv"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+        {name=Tensor, method={default=1}},
+        {name=real, default=1},
+        {name=Tensor},
+        {name=Tensor}})
+
+wrap("maskedFill",
+     cname("maskedFill"),
+     {{name=Tensor, returned=true, method={default='nil'}},
+      {name='CudaByteTensor'},
+      {name=real}})
+
+wrap("maskedCopy",
+     cname("maskedCopy"),
+     {{name=Tensor, returned=true, method={default='nil'}},
+	{name='CudaByteTensor'},
+	{name=Tensor}})
+
+wrap("maskedSelect",
+     cname("maskedSelect"),
+     {{name=Tensor, returned=true, default=true},
+      {name=Tensor},
+      {name='CudaByteTensor'}})
+
+wrap("gather",
+     cname("gather"),
+     {{name=Tensor, default=true, returned=true, init=gatherInit},
+	{name=Tensor},
+	{name="index"},
+	{name='CudaLongTensor'}},
+     cname("gather"), -- this is for backward-compatibility, and takes in "Tensor" as the indexing tensor
+     {{name=Tensor, default=true, returned=true, init=gatherInit},
+	{name=Tensor},
+	{name="index"},
+	{name=Tensor, declare=TensorToCudaLong_declare(true), check=TensorToCudaLong_check, read=TensorToCudaLong_read, postcall=TensorToCudaLong_postcall}})
+
+wrap("scatter",
+     cname("scatter"),
+     {{name=Tensor, returned=true},
+	{name="index"},
+	{name='CudaLongTensor'},
+	{name=Tensor}},
+     cname("scatter"), -- this is for backward-compatibility, and takes in "Tensor" as the indexing tensor
+     {{name=Tensor, returned=true},
+	{name="index"},
+	{name=Tensor, declare=TensorToCudaLong_declare(true), check=TensorToCudaLong_check, read=TensorToCudaLong_read, postcall=TensorToCudaLong_postcall},
+	{name=Tensor}},
+     cname("scatterFill"),
+     {{name=Tensor, returned=true},
+	{name="index"},
+	{name='CudaLongTensor'},
+	{name=real}},
+     cname("scatterFill"), -- this is for backward-compatibility, and takes in "Tensor" as the indexing tensor
+     {{name=Tensor, returned=true},
+	{name="index"},
+	{name=Tensor, declare=TensorToCudaLong_declare(false), check=TensorToCudaLong_check, read=TensorToCudaLong_read, postcall=TensorToCudaLong_postcall},
+	{name=real}}
+)
+
+wrap("sort",
+     cname("sort"),
+     {{name=Tensor, default=true, returned=true},
+      {name="CudaLongTensor", default=true, returned=true, noreadadd=true},
+      {name=Tensor},
+      {name="index", default=lastdim(3)},
+      {name="boolean", default=0}})
+
+wrap("topk",
+     cname("topk"),
+     {{name=Tensor, default=true, returned=true},
+       {name="CudaLongTensor", default=true, returned=true, noreadadd=true},
+       {name=Tensor},
+       {name="long", default=1},
+       {name="index", default=lastdim(3)},
+       {name="boolean", default=0},
+       {name="boolean", default=0}})
+
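+-- The do-block takes local copies of Tensor and real so that the init and
+-- precall closures of the BLAS wrappers below capture these fixed values.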
+do
+   local Tensor = Tensor
+   local real = real
+   wrap("mv",
+        cname("addmv"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'},
+          init=function(arg)
+             return table.concat(
+                {
+                   arg.__metatable.init(arg),
+                   string.format("TH%s_checkGPU(cutorch_getstate(L), 1, %s);",
+                                 Tensor, arg.args[5]:carg()),
+                   string.format("TH%s_resize1d(cutorch_getstate(L), %s, %s->size[0]);", Tensor, arg:carg(), arg.args[5]:carg())
+                }, '\n')
+          end,
+          precall=function(arg)
+             return table.concat(
+                {
+                   string.format("TH%s_zero(cutorch_getstate(L), %s);", Tensor, arg:carg()),
+                   arg.__metatable.precall(arg)
+                }, '\n')
+          end
+         },
+           {name=real, default=1, invisible=true},
+           {name=Tensor, default=1, invisible=true},
+           {name=real, default=1, invisible=true},
+           {name=Tensor, dim=2},
+           {name=Tensor, dim=1}}
+   )
+
+   wrap("mm",
+        cname("addmm"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'},
+          init=function(arg)
+             return table.concat(
+                {
+                   arg.__metatable.init(arg),
+                   string.format("TH%s_checkGPU(cutorch_getstate(L), 2, %s, %s);",
+                                 Tensor, arg.args[5]:carg(), arg.args[6]:carg()),
+                   string.format("TH%s_resize2d(cutorch_getstate(L), %s, %s->size[0], %s->size[1]);",
+                                 Tensor, arg:carg(), arg.args[5]:carg(), arg.args[6]:carg())
+                }, '\n')
+          end,
+         },
+           {name=real, default=0, invisible=true},
+           {name=Tensor, default=1, invisible=true},
+           {name=real, default=1, invisible=true},
+           {name=Tensor, dim=2},
+           {name=Tensor, dim=2}}
+   )
+
+   wrap("bmm",
+        cname("baddbmm"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'},
+          init=function(arg)
+             return table.concat(
+                {
+                   arg.__metatable.init(arg),
+                   string.format("TH%s_checkGPU(cutorch_getstate(L), 2, %s, %s);",
+                                 Tensor, arg.args[5]:carg(), arg.args[6]:carg()),
+                   string.format("TH%s_resize3d(cutorch_getstate(L), %s, %s->size[0], %s->size[1], %s->size[2]);",
+                                 Tensor, arg:carg(), arg.args[5]:carg(), arg.args[5]:carg(), arg.args[6]:carg())
+                }, '\n')
+          end,
+         },
+           {name=real, default=0, invisible=true},
+           {name=Tensor, default=1, invisible=true},
+           {name=real, default=1, invisible=true},
+           {name=Tensor, dim=3},
+           {name=Tensor, dim=3}}
+   )
+
+   wrap("ger",
+        cname("addr"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'},
+          init=function(arg)
+             return table.concat(
+                {
+                   arg.__metatable.init(arg),
+                   string.format("TH%s_checkGPU(cutorch_getstate(L), 2, %s, %s);",
+                                 Tensor, arg.args[5]:carg(), arg.args[6]:carg()),
+                   string.format("TH%s_resize2d(cutorch_getstate(L), %s, %s->size[0], %s->size[0]);", Tensor, arg:carg(), arg.args[5]:carg(), arg.args[6]:carg())
+                }, '\n')
+          end,
+          precall=function(arg)
+             return table.concat(
+                {
+                   string.format("TH%s_zero(cutorch_getstate(L), %s);", Tensor, arg:carg()),
+                   arg.__metatable.precall(arg)
+                }, '\n')
+          end
+         },
+           {name=real, default=1, invisible=true},
+           {name=Tensor, default=1, invisible=true},
+           {name=real, default=1, invisible=true},
+           {name=Tensor, dim=1},
+           {name=Tensor, dim=1}}
+   )
+
+   for _,f in ipairs({
+         {name="addmv",   dim1=1, dim2=2, dim3=1},
+         {name="addmm",   dim1=2, dim2=2, dim3=2},
+         {name="addr",    dim1=2, dim2=1, dim3=1},
+         {name="baddbmm", dim1=3, dim2=3, dim3=3},
+         {name="addbmm",  dim1=2, dim2=3, dim3=3},
+                     }
+   ) do
+
+      interface:wrap(f.name,
+                     cname(f.name),
+                     {{name=Tensor, default=true, returned=true},
+                        {name=real, default=1},
+                        {name=Tensor, dim=f.dim1},
+                        {name=real, default=1},
+                        {name=Tensor, dim=f.dim2},
+                        {name=Tensor, dim=f.dim3}})
+
+      -- there is an ambiguity here, hence the more complicated setup
+      method:wrap(f.name,
+                  cname(f.name),
+                  {{name=Tensor, returned=true, dim=f.dim1},
+                     {name=real, default=1, invisible=true},
+                     {name=Tensor, default=1, dim=f.dim1},
+                     {name=real, default=1},
+                     {name=Tensor, dim=f.dim2},
+                     {name=Tensor, dim=f.dim3}},
+                  cname(f.name),
+                  {{name=Tensor, returned=true, dim=f.dim1},
+                     {name=real},
+                     {name=Tensor, default=1, dim=f.dim1},
+                     {name=real},
+                     {name=Tensor, dim=f.dim2},
+                     {name=Tensor, dim=f.dim3}})
+   end
+end
+
+wrap("dot",
+     cname("dot"),
+     {{name=Tensor},
+      {name=Tensor},
+      {name=real, creturned=true}})
+
+wrap("sum",
+     cname("sumall"),
+     {{name=Tensor},
+        {name=real, creturned=true}},
+     cname("sum"),
+     {{name=Tensor, default=true, returned=true},
+        {name=Tensor},
+        {name="index"}})
+
+for _, name in ipairs({"cumsum", "cumprod"}) do
+  wrap(name,
+       cname(name),
+       {{name=Tensor, default=true, returned=true},
+        {name=Tensor},
+        {name="index", default=1}})
+end
+
+wrap("prod",
+     cname("prodall"),
+     {{name=Tensor},
+        {name=real, creturned=true}},
+     cname("prod"),
+     {{name=Tensor, default=true, returned=true},
+        {name=Tensor},
+        {name="index"}})
+
+for _,name in ipairs({"min", "max"}) do
+   wrap(name,
+        cname(name .. "all"),
+        {{name=Tensor},
+           {name=real, creturned=true}},
+        cname(name),
+        {{name=Tensor, default=true, returned=true},
+           {name='CudaLongTensor', default=true, returned=true},
+           {name=Tensor},
+           {name="index"}})
+end
+
+for _,name in ipairs({"cmin", "cmax"}) do
+   wrap(name,
+        cname(name),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor, method={default=1}},
+         {name=Tensor}},
+        cname(name .. "Value"),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor, method={default=1}},
+         {name=real}})
+end
+
+wrap("cross",
+     cname("cross"),
+     {{name=Tensor, default=true, returned=true},
+      {name=Tensor},
+      {name=Tensor},
+      {name="index", default=0}})
+
+wrap("tril",
+     cname("tril"),
+     {{name=Tensor, default=true, returned=true},
+      {name=Tensor},
+      {name="int", default=0}})
+
+wrap("triu",
+     cname("triu"),
+     {{name=Tensor, default=true, returned=true},
+      {name=Tensor},
+      {name="int", default=0}})
+
+wrap("diag",
+     cname("diag"),
+     {{name=Tensor, default=true, returned=true},
+      {name=Tensor},
+      {name="int", default=0}})
+
+wrap("trace",
+     cname("trace"),
+     {{name=Tensor},
+      {name=real, creturned=true}})
+
+for _,name in ipairs({"log", "log1p", "exp",
+                      "cos", "acos", "cosh",
+                      "sin", "asin", "sinh",
+                      "tan", "atan", "tanh",
+                      "sqrt", "rsqrt", "sigmoid",
+                      "cinv", "ceil", "floor",
+                      "neg", "abs", "sign",
+                      "round", "trunc", "frac"}) do
+
+   wrap(name,
+        cname(name),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}}})
+
+end
+
+wrap("atan2",
+     cname("atan2"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+      {name=Tensor, method={default=1}},
+      {name=Tensor}}
+)
+
+wrap("lerp",
+     cname("lerp"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+      {name=Tensor, method={default=1}},
+      {name=Tensor},
+      {name=real}}
+)
+
+wrap("pow",
+     cname("pow"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+      {name=Tensor, method={default=1}},
+      {name=real}},
+     cname("tpow"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+      {name = real},
+      {name=Tensor, method={default=1}}})
+
+wrap("rand",
+     cname("rand"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+      {name="LongArg"}})
+
+wrap("randn",
+     cname("randn"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+      {name="LongArg"}})
+
+wrap("multinomial",
+     cname("multinomial"),
+     {{name='CudaLongTensor', default=true, returned=true, method={default='nil'}},
+        {name=Tensor},
+        {name="int"},
+        {name="boolean", default=false}})
+
+wrap("clamp",
+     cname("clamp"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+      {name=Tensor, method={default=1}},
+      {name=real},
+      {name=real}})
+
+for _,name in pairs({'lt','gt','le','ge','eq','ne'}) do
+   wrap(name,
+        cname(name .. 'Value'),
+        {{name='CudaByteTensor',default=true, returned=true},
+           {name=Tensor},
+           {name=real}},
+        cname(name .. 'ValueT'),
+        {{name=Tensor, returned=true},
+           {name=Tensor},
+           {name=real}},
+        cname(name .. 'Tensor'),
+        {{name='CudaByteTensor',default=true, returned=true},
+           {name=Tensor},
+           {name=Tensor}},
+        cname(name .. 'TensorT'),
+        {{name=Tensor, returned=true},
+           {name=Tensor},
+           {name=Tensor}})
+end
+
+wrap("cat",
+     cname("cat"),
+     {{name=Tensor, default=true, returned=true},
+      {name=Tensor},
+      {name=Tensor},
+      {name="index", default=-1}},
+     cname("catArray"),
+     {{name=Tensor, default=true, returned=true},
+      {name=Tensor .. "Array"},
+      {name="index", default=-1}})
+
+wrap("nonzero",
+     cname("nonzero"),
+     {{name="CudaLongTensor", default=true, returned=true},
+         {name=Tensor}})
+
+wrap("geometric",
+    cname("geometric"),
+    {{name=Tensor, returned=true},
+        {name='double'}})
+
+wrap("bernoulli",
+    cname("bernoulli"),
+    {{name=Tensor, returned=true},
+        {name='double', default=0.5}},
+    cname("bernoulli_FloatTensor"),
+    {{name=Tensor, returned=true},
+        {name="CudaTensor"}},
+    cname("bernoulli_DoubleTensor"),
+    {{name=Tensor, returned=true},
+        {name="CudaDoubleTensor"}})
+
+for _,f in ipairs({{name='uniform', a=0, b=1},
+                   {name='normal', a=0, b=1},
+                   {name='cauchy', a=0, b=1},
+                   {name='logNormal', a=1, b=2}}) do
+
+   wrap(f.name,
+        cname(f.name),
+        {{name=Tensor, returned=true},
+         {name=real, default=f.a},
+         {name=real, default=f.b}})
+end
+
+for _,f in ipairs({{name='exponential'}}) do
+
+   wrap(f.name,
+        cname(f.name),
+        {{name=Tensor, returned=true},
+         {name=real, default=f.a}})
+end
+
+for _,name in ipairs({"gesv","gels"}) do
+   wrap(name,
+        cname(name),
+        {{name=Tensor, returned=true},
+         {name=Tensor, returned=true},
+         {name=Tensor},
+         {name=Tensor}},
+        cname(name),
+        {{name=Tensor, default=true, returned=true, invisible=true},
+         {name=Tensor, default=true, returned=true, invisible=true},
+         {name=Tensor},
+         {name=Tensor}})
+end
+
+wrap("symeig",
+     cname("syev"),
+     {{name=Tensor, returned=true},
+      {name=Tensor, returned=true},
+      {name=Tensor},
+      {name='charoption', values={'N', 'V'}, default='N'},
+      {name='charoption', values={'U', 'L'}, default='U'}},
+     cname("syev"),
+     {{name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor},
+      {name='charoption', values={'N', 'V'}, default='N'},
+      {name='charoption', values={'U', 'L'}, default='U'}})
+
+wrap("eig",
+     cname("geev"),
+     {{name=Tensor, returned=true},
+      {name=Tensor, returned=true},
+      {name=Tensor},
+      {name='charoption', values={'N', 'V'}, default='N'}},
+     cname("geev"),
+     {{name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor},
+      {name='charoption', values={'N', 'V'}, default='N'}})
+
+wrap("svd",
+     cname("gesvd"),
+     {{name=Tensor, returned=true},
+      {name=Tensor, returned=true},
+      {name=Tensor, returned=true},
+      {name=Tensor},
+      {name='charoption', values={'A', 'S'}, default='S'}},
+     cname("gesvd"),
+     {{name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor},
+      {name='charoption', values={'A', 'S'}, default='S'}})
+
+wrap("inverse",
+     cname("getri"),
+     {{name=Tensor, returned=true},
+      {name=Tensor}},
+     cname("getri"),
+     {{name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor}})
+
+wrap("potri",
+     cname("potri"),
+     {{name=Tensor, returned=true},
+      {name=Tensor},
+      {name='charoption', values={'U', 'L'}, default='U'}},
+     cname("potri"),
+     {{name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor},
+      {name='charoption', values={'U', 'L'}, default='U'}})
+
+wrap("potrf",
+     cname("potrf"),
+     {{name=Tensor, returned=true},
+      {name=Tensor},
+      {name='charoption', values={'U', 'L'}, default='U'}},
+     cname("potrf"),
+     {{name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor},
+      {name='charoption', values={'U', 'L'}, default='U'}})
+
+wrap("potrs",
+     cname("potrs"),
+     {{name=Tensor, returned=true},
+      {name=Tensor},
+      {name=Tensor},
+      {name='charoption', values={'U', 'L'}, default='U'}},
+     cname("potrs"),
+     {{name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor},
+      {name=Tensor},
+      {name='charoption', values={'U', 'L'}, default='U'}})
+
+wrap("qr",
+     cname("qr"),
+     {{name=Tensor, returned=true},
+      {name=Tensor, returned=true},
+      {name=Tensor}},
+     cname("qr"),
+     {{name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor, default=true, returned=true, invisible=true},
+      {name=Tensor}})
+
+wrap("mean",
+     cname("meanall"),
+     {{name=Tensor},
+      {name=real, creturned=true}},
+     cname("mean"),
+     {{name=Tensor, default=true, returned=true},
+        {name=Tensor},
+        {name="index"}})
+
+for _,name in ipairs({"var", "std"}) do
+   wrap(name,
+        cname(name .. "all"),
+        {{name=Tensor},
+         {name=real, creturned=true}},
+        cname(name),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor},
+         {name="index"},
+         {name="boolean", default=false}})
+end
+
+wrap("norm",
+     cname("normall"),
+     {{name=Tensor},
+      {name=real, default=2},
+      {name=real, creturned=true}},
+     cname("norm"),
+     {{name=Tensor, default=true, returned=true},
+      {name=Tensor},
+      {name=real},
+      {name="index"}})
+
+wrap("renorm",
+     cname("renorm"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+      {name=Tensor, method={default=1}},
+      {name=real},
+      {name="index"},
+      {name=real}})
+
+wrap("dist",
+     cname("dist"),
+     {{name=Tensor},
+      {name=Tensor},
+      {name=real, default=2},
+      {name=real, creturned=true}})
+
+wrap("squeeze",
+     cname("squeeze"),
+     {{name=Tensor, default=true, returned=true, postcall=function(arg)
+          local txt = {}
+          if arg.returned then
+             table.insert(txt, string.format('if(arg%d->nDimension == 1 && arg%d->size[0] == 1)', arg.i, arg.i)) -- number
+             table.insert(txt, string.format('lua_pushnumber(L, (lua_Number)(THCudaTensor_get1d(cutorch_getstate(L), arg%d, 0)));', arg.i))
+          end
+          return table.concat(txt, '\n')
+     end},
+      {name=Tensor}},
+     cname("squeeze1d"),
+     {{name=Tensor, default=true, returned=true,
+       postcall=
+          function(arg)
+             local txt = {}
+             if arg.returned then
+                table.insert(txt, string.format('if(!hasdims && arg%d->nDimension == 1 && arg%d->size[0] == 1)', arg.i, arg.i)) -- number
+                table.insert(txt, string.format('lua_pushnumber(L, (lua_Number)(THCudaTensor_get1d(cutorch_getstate(L), arg%d, 0)));}', arg.i))
+             end
+             return table.concat(txt, '\n')
+          end},
+
+      {name=Tensor,
+       precall=
+          function(arg)
+             return string.format('{int hasdims = arg%d->nDimension > 1;', arg.i)
+          end},
+      {name="index"}})
+
+method:register("m_cutorch_" .. Tensor .. "Math__")
+interface:print(method:tostring())
+method:clearhistory()
+interface:register("cutorch_" .. Tensor .. "Math__")
+
+interface:print(string.format([[
+void cutorch_%sMath_init(lua_State *L)
+{
+  luaT_pushmetatable(L, "torch.%s");
+
+  /* register methods */
+  luaL_setfuncs(L, m_cutorch_%sMath__, 0);
+
+  /* register functions into the "torch" field of the tensor metaclass */
+  lua_pushstring(L, "torch");
+  lua_newtable(L);
+  luaL_setfuncs(L, cutorch_%sMath__, 0);
+  lua_rawset(L, -3);
+  lua_pop(L, 1);
+}
+]], Tensor, Tensor, Tensor, Tensor))
+
+interface:tofile(arg[1])
diff --git a/TensorOperator.c b/TensorOperator.c
new file mode 100644
index 0000000..ae7c2b3
--- /dev/null
+++ b/TensorOperator.c
@@ -0,0 +1,13 @@
+#include "torch/utils.h"
+#include "luaT.h"
+#include "THC.h"
+
+#include "THCTensorMath.h"
+
+#define cutorch_TensorOperator_(NAME) TH_CONCAT_4(cutorch_,CReal,TensorOperator_,NAME)
+#define torch_Tensor_(NAME) TH_CONCAT_4(torch_,CReal,Tensor_,NAME)
+#define torch_Tensor TH_CONCAT_STRING_3(torch.,CReal,Tensor)
+#define cutorch_Tensor_(NAME) TH_CONCAT_4(cutorch_,CReal,Tensor_,NAME)
+
+#include "generic/TensorOperator.c"
+#include "THCGenerateAllTypes.h"
diff --git a/generic/CStorage.c b/generic/CStorage.c
new file mode 100644
index 0000000..d3a7323
--- /dev/null
+++ b/generic/CStorage.c
@@ -0,0 +1,116 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/CStorage.c"
+#else
+
+#include "THHalf.h"
+
+/* everything is as in the generic Storage.c, except for a few things (see below) */
+
+// FixMe: Requires an unsafe conversion in that we convert from cutorch's 'half'
+// to torch's THHalf.  These types are required to be defined in the same way
+// (is there some way to enforce this?)
+#ifdef THC_REAL_IS_HALF
+#define THFILE_REAL_CAST(x) (THHalf *)x
+#else
+#define THFILE_REAL_CAST(x) x
+#endif
+
+#define THFile_readRealRaw(file, data, size)                            \
+  {                                                                     \
+    real *fdata = (real*)THAlloc(sizeof(real)*size);                    \
+    TH_CONCAT_3(THFile_read,Real,Raw)(file, THFILE_REAL_CAST(fdata), size);               \
+    THCudaCheck(cudaMemcpy(data, fdata, size * sizeof(real), cudaMemcpyHostToDevice)); \
+    THFree(fdata);                                                      \
+  }
+
+#define THFile_writeRealRaw(file, data, size)                           \
+  {                                                                     \
+    real *fdata = (real*)THAlloc(sizeof(real)*size);                    \
+    THCudaCheck(cudaMemcpy(fdata, data, size * sizeof(real), cudaMemcpyDeviceToHost)); \
+    TH_CONCAT_3(THFile_write,Real,Raw)(file, THFILE_REAL_CAST(fdata), size);              \
+    THFree(fdata);                                                      \
+  }
+
+#define TH_GENERIC_FILE "generic/Storage.c"
+#include "generic/Storage.c"
+
+#undef TH_GENERIC_FILE
+#undef THFILE_REAL_CAST
+#undef THFile_readRealRaw
+#undef THFile_writeRealRaw
+
+/* now we overwrite some methods specific to CudaStorage */
+
+static int cutorch_Storage_(copy)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  void *src;
+  if( (src = luaT_toudata(L, 2, "torch.CudaByteStorage")) )
+    THCStorage_(copyCudaByte)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaCharStorage")) )
+    THCStorage_(copyCudaChar)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaShortStorage")) )
+    THCStorage_(copyCudaShort)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaIntStorage")) )
+    THCStorage_(copyCudaInt)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaLongStorage")) )
+    THCStorage_(copyCudaLong)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaStorage")) )
+    THCStorage_(copyCudaFloat)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaDoubleStorage")) )
+    THCStorage_(copyCudaDouble)(state, storage, src);
+#ifdef CUDA_HALF_TENSOR
+  else if( (src = luaT_toudata(L, 2, "torch.CudaHalfStorage")) )
+    THCStorage_(copyCudaHalf)(state, storage, src);
+#endif
+
+  else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) )
+    THCStorage_(copyByte)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) )
+    THCStorage_(copyChar)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) )
+    THCStorage_(copyShort)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) )
+    THCStorage_(copyInt)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) )
+    THCStorage_(copyLong)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) )
+    THCStorage_(copyFloat)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
+    THCStorage_(copyDouble)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.HalfStorage")) )
+    THCStorage_(copyHalf)(state, storage, src);
+  else
+    luaL_typerror(L, 2, "torch.*Storage");
+
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int cutorch_Storage_(getDevice)(lua_State *L) {
+  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  lua_pushinteger(L, THCStorage_(getDevice)(cutorch_getstate(L), storage) + 1);
+  return 1;
+}
+
+void cutorch_Storage_(init)(lua_State* L)
+{
+  /* the standard stuff */
+  torch_Storage_(init)(L);
+
+  // Register this even though it is generated elsewhere.
+  cutorch_StorageCopy_(init)(L);
+
+  luaT_pushmetatable(L, torch_Storage);
+  lua_pushcfunction(L, cutorch_Storage_(copy));
+  lua_setfield(L, -2, "copy");
+  lua_pop(L, 1);
+
+  luaT_pushmetatable(L, torch_Storage);
+  lua_pushcfunction(L, cutorch_Storage_(getDevice));
+  lua_setfield(L, -2, "getDevice");
+  lua_pop(L, 1);
+}
+
+#endif
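+
+/* Editor's note: a hedged usage sketch of the methods registered above (not
+   part of the upstream file). From Lua, the overridden :copy() accepts any
+   torch.*Storage or torch.Cuda*Storage source, roughly:
+
+     local hs = torch.FloatStorage(4):fill(1)   -- host storage
+     local ds = torch.CudaStorage(4)            -- device storage
+     ds:copy(hs)                                 -- host -> device copy
+     print(ds:getDevice())                       -- 1-based device index
+*/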
diff --git a/generic/CStorageCopy.c b/generic/CStorageCopy.c
new file mode 100644
index 0000000..00c8fb8
--- /dev/null
+++ b/generic/CStorageCopy.c
@@ -0,0 +1,64 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/CStorageCopy.c"
+#else
+
+#include "THHalf.h"
+
+static int TH_CONCAT_3(cutorch_,Real,Storage_copy)(lua_State *L)
+{
+  THStorage *storage = luaT_checkudata(L, 1, TH_CONCAT_STRING_3(torch.,Real,Storage));
+  void *src;
+  if( (src = luaT_toudata(L, 2, TH_CONCAT_STRING_3(torch.,Real,Storage) )))
+    THStorage_(copy)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) )
+    THStorage_(copyByte)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) )
+    THStorage_(copyChar)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) )
+    THStorage_(copyShort)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) )
+    THStorage_(copyInt)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) )
+    THStorage_(copyLong)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) )
+    THStorage_(copyFloat)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
+    THStorage_(copyDouble)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.HalfStorage")) )
+    THStorage_(copyHalf)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaStorage")) )
+    THStorage_(copyCudaFloat)(cutorch_getstate(L), storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaLongStorage")) )
+    THStorage_(copyCudaLong)(cutorch_getstate(L), storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaByteStorage")) )
+    THStorage_(copyCudaByte)(cutorch_getstate(L), storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaCharStorage")) )
+    THStorage_(copyCudaChar)(cutorch_getstate(L), storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaShortStorage")) )
+    THStorage_(copyCudaShort)(cutorch_getstate(L), storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaIntStorage")) )
+    THStorage_(copyCudaInt)(cutorch_getstate(L), storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaDoubleStorage")) )
+    THStorage_(copyCudaDouble)(cutorch_getstate(L), storage, src);
+#ifdef CUDA_HALF_TENSOR
+  else if( (src = luaT_toudata(L, 2, "torch.CudaHalfStorage")) )
+    THStorage_(copyCudaHalf)(cutorch_getstate(L), storage, src);
+#endif
+  else
+    luaL_typerror(L, 2, "torch.*Storage");
+
+  lua_settop(L, 1);
+  return 1;
+}
+
+void cutorch_StorageCopy_(init)(lua_State* L)
+{
+  // The torch_Storage macro is defined in Storage.c to produce the CudaStorage
+  // type names, so we construct the plain torch type names by hand here.
+  luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Storage));
+  lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Storage_copy));
+  lua_setfield(L, -2, "copy");
+  lua_pop(L, 1);
+}
+
+#endif
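+
+/* Editor's note: a hedged usage sketch (not part of the upstream file). This
+   generic file patches the CPU storage :copy() so that it also accepts CUDA
+   storages as the source, roughly:
+
+     local ds = torch.CudaStorage(4):fill(3)   -- device storage
+     local hs = torch.FloatStorage(4)          -- host storage
+     hs:copy(ds)                               -- device -> host copy
+*/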
diff --git a/generic/CTensor.c b/generic/CTensor.c
new file mode 100644
index 0000000..50681ff
--- /dev/null
+++ b/generic/CTensor.c
@@ -0,0 +1,223 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/CTensor.c"
+#else
+
+/* everything is as in the generic Tensor.c, except a few things (see below) */
+
+#define TH_GENERIC_FILE "generic/Tensor.c"
+#include "generic/Tensor.c"
+#undef TH_GENERIC_FILE
+
+/* now we overwrite some methods specific to CudaTensor */
+static int cutorch_Tensor_(copy)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  void *src;
+  if( (src = luaT_toudata(L, 2, "torch.CudaTensor")) )
+    THCTensor_(copyCudaFloat)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaByteTensor")) )
+    THCTensor_(copyCudaByte)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaCharTensor")) )
+    THCTensor_(copyCudaChar)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaShortTensor")) )
+    THCTensor_(copyCudaShort)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaIntTensor")) )
+    THCTensor_(copyCudaInt)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaLongTensor")) )
+    THCTensor_(copyCudaLong)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaDoubleTensor")) )
+    THCTensor_(copyCudaDouble)(state, tensor, src);
+#ifdef CUDA_HALF_TENSOR
+  else if( (src = luaT_toudata(L, 2, "torch.CudaHalfTensor")) )
+    THCTensor_(copyCudaHalf)(state, tensor, src);
+#endif
+
+  else if( (src = luaT_toudata(L, 2, "torch.ByteTensor")) )
+    THCTensor_(copyByte)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CharTensor")) )
+    THCTensor_(copyChar)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ShortTensor")) )
+    THCTensor_(copyShort)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.IntTensor")) )
+    THCTensor_(copyInt)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.LongTensor")) )
+    THCTensor_(copyLong)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.FloatTensor")) )
+    THCTensor_(copyFloat)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.DoubleTensor")) )
+    THCTensor_(copyDouble)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.HalfTensor")) )
+    THCTensor_(copyHalf)(state, tensor, src);
+  else
+    luaL_typerror(L, 2, "torch.*Tensor");
+
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int cutorch_Tensor_(copyAsyncCPU)(lua_State *L)
+{
+#define STRINGIFY_TENSOR(x) TH_CONCAT_STRING_3(torch.,x,Tensor)
+  THCState *state = cutorch_getstate(L);
+  THCTensor *tensor = luaT_checkudata(L, 1, STRINGIFY_TENSOR(CReal));
+  void *src;
+  if( (src = luaT_toudata(L, 2, STRINGIFY_TENSOR(CReal))))
+    THCTensor_(copy)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, STRINGIFY_TENSOR(Real))))
+    THCTensor_(copyAsyncCPU)(state, tensor, src);
+  else
+    luaL_typerror(L, 2, STRINGIFY_TENSOR(Real) " or " STRINGIFY_TENSOR(CReal));
+
+  lua_settop(L, 1);
+  return 1;
+#undef STRINGIFY_TENSOR
+}
+
+static int TH_CONCAT_3(cutorch_,Real,Tensor_copyAsyncCuda)(lua_State *L)
+{
+#define STRINGIFY_TENSOR(x) TH_CONCAT_STRING_3(torch.,x,Tensor)
+  THTensor *tensor = luaT_checkudata(L, 1, STRINGIFY_TENSOR(Real));
+  void *src;
+  if( (src = luaT_toudata(L, 2, STRINGIFY_TENSOR(CReal))))
+    THTensor_(copyAsyncCuda)(cutorch_getstate(L), tensor, src);
+  else
+    luaL_typerror(L, 2, STRINGIFY_TENSOR(CReal));
+
+  lua_settop(L, 1);
+  return 1;
+#undef STRINGIFY_TENSOR
+}
+
+#ifdef THC_REAL_IS_FLOAT
+static void THFloatTensor_computesz(THFloatTensor *self, long **sz_, long **st_)
+{
+  long *sz, *st, *szh;
+  int i;
+
+  sz = (long*)THAlloc(sizeof(long)*self->nDimension);
+  st = (long*)THAlloc(sizeof(long)*self->nDimension);
+  szh = (long*)THAlloc(sizeof(long)*self->nDimension);
+
+  for(i = self->nDimension-1; i >= 0; i--)
+  {
+    if(i == self->nDimension-1)
+      szh[i] = 1;
+    else
+      szh[i] = szh[i+1]*self->size[i+1];
+  }
+
+  memcpy(sz, szh, self->nDimension * sizeof(long));
+  memcpy(st, self->stride, self->nDimension * sizeof(long));
+  THFree(szh);
+
+  *sz_ = sz;
+  *st_ = st;
+}
+
+void THFloatTensor_kernel_copy(float *dst,
+                                         long *dst_sz, long *dst_st, int dst_dim,
+                                         float *src,
+                                         long *src_sz, long *src_st, int src_dim,
+                                         ptrdiff_t n_elem)
+{
+  ptrdiff_t k;
+
+  for(k = 0; k < n_elem; k++)
+  {
+    ptrdiff_t src_idx = 0;
+    ptrdiff_t src_rest = k;
+    ptrdiff_t dst_idx = 0;
+    ptrdiff_t dst_rest = k;
+    int dim;
+
+    for(dim = 0; dim < dst_dim; dim++)
+    {
+      dst_idx += (dst_rest/dst_sz[dim])*dst_st[dim];
+      dst_rest = dst_rest % dst_sz[dim];
+    }
+
+    for(dim = 0; dim < src_dim; dim++)
+    {
+      src_idx += (src_rest/src_sz[dim])*src_st[dim];
+      src_rest = src_rest % src_sz[dim];
+    }
+
+    dst[dst_idx] = src[src_idx];
+  }
+}
+
+static int cuda_FloatTensor_fakecopy(lua_State *L)
+{
+  THFloatTensor *self = luaT_checkudata(L, 1, "torch.FloatTensor");
+  THFloatTensor *src = luaT_checkudata(L, 2, "torch.FloatTensor");
+  long *d_self_sz, *d_self_st, *d_src_sz, *d_src_st;
+  ptrdiff_t nElement = THFloatTensor_nElement(self);
+
+  THArgCheck(THFloatTensor_nElement(self) == THFloatTensor_nElement(src), 2, "sizes do not match");
+
+  THFloatTensor_computesz(self, &d_self_sz, &d_self_st);
+  THFloatTensor_computesz(src, &d_src_sz, &d_src_st);
+
+  THFloatTensor_kernel_copy(THFloatTensor_data(self),
+                            d_self_sz, d_self_st, self->nDimension,
+                            THFloatTensor_data(src),
+                            d_src_sz, d_src_st, src->nDimension,
+                            nElement);
+
+  THFree(d_self_sz);
+  THFree(d_self_st);
+  THFree(d_src_sz);
+  THFree(d_src_st);
+
+  lua_settop(L, 1);
+  return 1;
+}
+#endif
+
+static int cutorch_Tensor_(getDevice)(lua_State *L) {
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  lua_pushinteger(L, THCTensor_(getDevice)(cutorch_getstate(L), tensor) + 1);
+  return 1;
+}
+
+void cutorch_Tensor_(init)(lua_State* L)
+{
+  /* the standard stuff */
+  torch_Tensor_(init)(L);
+
+  /* additional methods */
+#ifdef THC_REAL_IS_FLOAT
+  luaT_pushmetatable(L, "torch.FloatTensor");
+  lua_pushcfunction(L, cuda_FloatTensor_fakecopy);
+  lua_setfield(L, -2, "fakecopy");
+  lua_pop(L, 1);
+#endif
+
+  // Register this even though it is generated elsewhere.
+  cutorch_TensorCopy_(init)(L);
+
+  // Register async copy methods.
+  luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Tensor));
+  lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Tensor_copyAsyncCuda));
+  lua_setfield(L, -2, "copyAsync");
+  lua_pop(L, 1);
+
+  luaT_pushmetatable(L, torch_Tensor);
+  lua_pushcfunction(L, cutorch_Tensor_(copyAsyncCPU));
+  lua_setfield(L, -2, "copyAsync");
+  lua_pop(L, 1);
+
+  luaT_pushmetatable(L, torch_Tensor);
+  lua_pushcfunction(L, cutorch_Tensor_(copy));
+  lua_setfield(L, -2, "copy");
+  lua_pop(L, 1);
+
+  luaT_pushmetatable(L, torch_Tensor);
+  lua_pushcfunction(L, cutorch_Tensor_(getDevice));
+  lua_setfield(L, -2, "getDevice");
+
+  lua_pop(L, 1);
+}
+
+#endif
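+
+/* Editor's note: a hedged usage sketch of the methods registered above (not
+   part of the upstream file). :copy() accepts any torch.*Tensor source, and
+   :copyAsync() is registered on both the CPU and the CUDA tensor type; the
+   asynchronous path assumes page-locked host memory and an explicit
+   synchronization before the data is reused, roughly:
+
+     local src = torch.FloatTensor(3, 3):fill(2)
+     local dst = torch.CudaTensor(3, 3)
+     dst:copy(src)            -- synchronous host -> device copy
+     dst:copyAsync(src)       -- asynchronous variant (pinned memory assumed)
+     cutorch.synchronize()    -- wait before touching src again
+     print(dst:getDevice())   -- 1-based device index
+*/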
diff --git a/generic/CTensorCopy.c b/generic/CTensorCopy.c
new file mode 100644
index 0000000..9c62fbf
--- /dev/null
+++ b/generic/CTensorCopy.c
@@ -0,0 +1,60 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/CTensorCopy.c"
+#else
+
+static int TH_CONCAT_3(cutorch_,Real,Tensor_copy)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, TH_CONCAT_STRING_3(torch.,Real,Tensor));
+  void *src;
+  if( (src = luaT_toudata(L, 2, TH_CONCAT_STRING_3(torch.,Real,Tensor)) ))
+    THTensor_(copy)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ByteTensor")) )
+    THTensor_(copyByte)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CharTensor")) )
+    THTensor_(copyChar)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ShortTensor")) )
+    THTensor_(copyShort)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.IntTensor")) )
+    THTensor_(copyInt)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.LongTensor")) )
+    THTensor_(copyLong)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.FloatTensor")) )
+    THTensor_(copyFloat)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.DoubleTensor")) )
+    THTensor_(copyDouble)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.HalfTensor")) )
+    THTensor_(copyHalf)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaByteTensor")) )
+    THTensor_(copyCudaByte)(cutorch_getstate(L), tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaCharTensor")) )
+    THTensor_(copyCudaChar)(cutorch_getstate(L), tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaShortTensor")) )
+    THTensor_(copyCudaShort)(cutorch_getstate(L), tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaIntTensor")) )
+    THTensor_(copyCudaInt)(cutorch_getstate(L), tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaLongTensor")) )
+    THTensor_(copyCudaLong)(cutorch_getstate(L), tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaTensor")) )
+    THTensor_(copyCudaFloat)(cutorch_getstate(L), tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CudaDoubleTensor")) )
+    THTensor_(copyCudaDouble)(cutorch_getstate(L), tensor, src);
+#ifdef CUDA_HALF_TENSOR
+  else if( (src = luaT_toudata(L, 2, "torch.CudaHalfTensor")) )
+    THTensor_(copyCudaHalf)(cutorch_getstate(L), tensor, src);
+#endif
+  else
+    luaL_typerror(L, 2, "torch.*Tensor");
+
+  lua_settop(L, 1);
+  return 1;
+}
+
+void cutorch_TensorCopy_(init)(lua_State* L)
+{
+  luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Tensor));
+  lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Tensor_copy));
+  lua_setfield(L, -2, "copy");
+  lua_pop(L, 1);
+}
+
+#endif
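+
+/* Editor's note: a hedged usage sketch (not part of the upstream file). This
+   generic file extends the CPU tensor :copy() to accept CUDA sources, roughly:
+
+     local gpu = torch.CudaTensor(4):fill(7)
+     local cpu = torch.FloatTensor(4)
+     cpu:copy(gpu)   -- device -> host copy through the overridden method
+*/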
diff --git a/generic/TensorOperator.c b/generic/TensorOperator.c
new file mode 100644
index 0000000..55fa0f1
--- /dev/null
+++ b/generic/TensorOperator.c
@@ -0,0 +1,262 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/TensorOperator.c"
+#else
+
+static int cutorch_TensorOperator_(__add__)(lua_State *L)
+{
+  THCTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor);
+  THCTensor *tensor2 = luaT_toudata(L, 2, torch_Tensor);
+  THCTensor *r;
+  THCState *state = cutorch_getstate(L);
+  THAssert(THCTensor_(checkGPU)(state, 2, tensor1, tensor2));
+
+  if(!tensor1 && !tensor2)
+    luaL_error(L, "expecting two Tensors or one Tensor and one number");
+  else
+  {
+    r = THCTensor_(new)(state);
+    luaT_pushudata(L, r, torch_Tensor);
+
+    if(!tensor1 && tensor2)
+    {
+      THCTensor_(resizeAs)(state, r, tensor2);
+      THCTensor_(copy)(state, r, tensor2);
+      double v = luaL_checknumber(L, 1);
+#ifdef THC_REAL_IS_HALF
+      half value = THC_float2half((float) v);
+#else
+      real value = (real) v;
+#endif
+
+      THCTensor_(add)(state, r, r, value);
+    }
+    else if(tensor1 && !tensor2)
+    {
+      THCTensor_(resizeAs)(state, r, tensor1);
+      THCTensor_(copy)(state, r, tensor1);
+
+      double v = luaL_checknumber(L, 2);
+#ifdef THC_REAL_IS_HALF
+      half value = THC_float2half((float) v);
+#else
+      real value = (real) v;
+#endif
+
+      THCTensor_(add)(state, r, r, value);
+    }
+    else
+    {
+      THCTensor_(resizeAs)(state, r, tensor1);
+      THCTensor_(copy)(state, r, tensor1);
+
+#ifdef THC_REAL_IS_HALF
+      half one = THC_float2half(1.0f);
+#else
+      real one = (real) 1;
+#endif
+
+      THCTensor_(cadd)(state, r, r, one, tensor2);
+    }
+  }
+  return 1;
+}
+
+static int cutorch_TensorOperator_(__sub__)(lua_State *L)
+{
+  THCTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor);
+  THCTensor *tensor2 = luaT_toudata(L, 2, torch_Tensor);
+  THCTensor *r;
+  THCState *state = cutorch_getstate(L);
+  THAssert(THCTensor_(checkGPU)(state, 2, tensor1, tensor2));
+
+  if(!tensor1 && !tensor2)
+    luaL_error(L, "expecting two Tensors or one Tensor and one number");
+  else
+  {
+    r = THCTensor_(new)(state);
+    luaT_pushudata(L, r, torch_Tensor);
+
+#ifdef THC_REAL_IS_HALF
+      half neg = THC_float2half(-1.0f);
+#else
+      real neg = (real) -1;
+#endif
+
+    if(!tensor1 && tensor2)
+    {
+      THCTensor_(resizeAs)(state, r, tensor2);
+
+      double v = luaL_checknumber(L, 1);
+#ifdef THC_REAL_IS_HALF
+      half value = THC_float2half((float) v);
+#else
+      real value = (real) v;
+#endif
+
+      THCTensor_(fill)(state, r, value);
+      THCTensor_(cadd)(state, r, r, neg, tensor2);
+    }
+    else if(tensor1 && !tensor2)
+    {
+      THCTensor_(resizeAs)(state, r, tensor1);
+      THCTensor_(copy)(state, r, tensor1);
+
+      double v = -luaL_checknumber(L, 2);
+#ifdef THC_REAL_IS_HALF
+      half value = THC_float2half((float) v);
+#else
+      real value = (real) v;
+#endif
+
+      THCTensor_(add)(state, r, r, value);
+    }
+    else
+    {
+      THCTensor_(resizeAs)(state, r, tensor1);
+      THCTensor_(copy)(state, r, tensor1);
+      THCTensor_(cadd)(state, r, r, neg, tensor2);
+    }
+  }
+  return 1;
+}
+
+static int cutorch_TensorOperator_(__unm__)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THCTensor *r;
+  THCState *state = cutorch_getstate(L);
+  THAssert(THCTensor_(checkGPU)(state, 1, tensor));
+
+  r = THCTensor_(new)(state);
+  luaT_pushudata(L, r, torch_Tensor);
+  THCTensor_(resizeAs)(state, r, tensor);
+  THCTensor_(copy)(state, r, tensor);
+
+#ifdef THC_REAL_IS_HALF
+      half neg = THC_float2half(-1.0f);
+#else
+      real neg = (real) -1;
+#endif
+
+  THCTensor_(mul)(state, r, r, neg);
+
+  return 1;
+}
+
+static int cutorch_TensorOperator_(__mul__)(lua_State *L)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+  THCTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor);
+  THCTensor *tensor2 = luaT_toudata(L, 2, torch_Tensor);
+  THCTensor *r;
+  THCState *state = cutorch_getstate(L);
+  THAssert(THCTensor_(checkGPU)(state, 2, tensor1, tensor2));
+
+  if(!tensor1 && !tensor2)
+    luaL_error(L, "expecting two Tensors or one Tensor and one number");
+  else
+  {
+    r = THCTensor_(new)(state);
+    luaT_pushudata(L, r, torch_Tensor);
+
+    if(!tensor1 && tensor2)
+    {
+      THCTensor_(resizeAs)(state, r, tensor2);
+      THCTensor_(copy)(state, r, tensor2);
+
+      double v = luaL_checknumber(L, 1);
+#ifdef THC_REAL_IS_HALF
+      half value = THC_float2half((float) v);
+#else
+      real value = (real) v;
+#endif
+
+      THCTensor_(mul)(state, r, r, value);
+    }
+    else if(tensor1 && !tensor2)
+    {
+      THCTensor_(resizeAs)(state, r, tensor1);
+      THCTensor_(copy)(state, r, tensor1);
+
+      double v = luaL_checknumber(L, 2);
+#ifdef THC_REAL_IS_HALF
+      half value = THC_float2half((float) v);
+#else
+      real value = (real) v;
+#endif
+
+      THCTensor_(mul)(state, r, r, value);
+    }
+    else
+    {
+      int dimt = tensor1->nDimension;
+      int dims = tensor2->nDimension;
+
+      if(dimt == 1 && dims == 1)
+        lua_pushnumber(L, THCTensor_(dot)(state, tensor1, tensor2)); /* ok, we wasted r, but who cares */
+      else if(dimt == 2 && dims == 1)
+      {
+        THCTensor_(resize1d)(state, r, tensor1->size[0]);
+        THCTensor_(zero)(state, r);
+        THCTensor_(addmv)(state, r, 1, r, 1, tensor1, tensor2);
+      }
+      else if(dimt == 2 && dims == 2)
+      {
+        THCTensor_(resize2d)(state, r, tensor1->size[0], tensor2->size[1]);
+        THCTensor_(zero)(state, r);
+        THCTensor_(addmm)(state, r, 1, r, 1, tensor1, tensor2);
+      }
+      else
+        luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension);
+    }
+  }
+#else
+  THError("unimplemented");
+#endif
+  return 1;
+}
+
+static int cutorch_TensorOperator_(__div__)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THCTensor *r;
+  THCState *state = cutorch_getstate(L);
+  THAssert(THCTensor_(checkGPU)(state, 1, tensor));
+
+  luaL_argcheck(L, lua_isnumber(L,2), 2, "number expected");
+
+  r = THCTensor_(new)(state);
+  luaT_pushudata(L, r, torch_Tensor);
+
+  THCTensor_(resizeAs)(state, r, tensor);
+  THCTensor_(copy)(state, r, tensor);
+
+  double v = luaL_checknumber(L, 2);
+#ifdef THC_REAL_IS_HALF
+  half value = THC_float2half(1.0f / (float) v);
+#else
+  real value = (real) 1 / (real) v;
+#endif
+
+  THCTensor_(mul)(state, r, r, value);
+
+  return 1;
+}
+
+static const struct luaL_Reg cutorch_TensorOperator_(_) [] = {
+  {"__add__", cutorch_TensorOperator_(__add__)},
+  {"__sub__", cutorch_TensorOperator_(__sub__)},
+  {"__unm__", cutorch_TensorOperator_(__unm__)},
+  {"__mul__", cutorch_TensorOperator_(__mul__)},
+  {"__div__", cutorch_TensorOperator_(__div__)},
+  {NULL, NULL}
+};
+
+void cutorch_TensorOperator_(init)(lua_State *L)
+{
+  luaT_pushmetatable(L, torch_Tensor);
+  luaT_setfuncs(L, cutorch_TensorOperator_(_), 0);
+  lua_pop(L, 1);
+}
+
+#endif
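+
+/* Editor's note: a hedged usage sketch of the metamethods registered above
+   (not part of the upstream file), roughly:
+
+     local a = torch.CudaTensor(2, 2):fill(1)
+     local b = torch.CudaTensor(2, 2):fill(3)
+     local c = a + b      -- __add__
+     local d = a - 2      -- __sub__ with a number
+     local e = -a         -- __unm__
+     local f = a * b      -- __mul__: 2D x 2D dispatches to addmm
+     local g = a / 2      -- __div__ accepts a number divisor only
+*/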
diff --git a/init.c b/init.c
new file mode 100644
index 0000000..894be2e
--- /dev/null
+++ b/init.c
@@ -0,0 +1,1125 @@
+#include "utils.h"
+#include "luaT.h"
+#include "THCGeneral.h"
+#include "THCCachingAllocator.h"
+#include "THCCachingHostAllocator.h"
+#include "THCSleep.h"
+#include "THCTensorRandom.h"
+#include "THCHalf.h" // for CUDA_HALF_TENSOR
+
+extern void cutorch_CudaByteStorage_init(lua_State* L);
+extern void cutorch_CudaCharStorage_init(lua_State* L);
+extern void cutorch_CudaShortStorage_init(lua_State* L);
+extern void cutorch_CudaIntStorage_init(lua_State* L);
+extern void cutorch_CudaLongStorage_init(lua_State* L);
+extern void cutorch_CudaStorage_init(lua_State* L);
+extern void cutorch_CudaDoubleStorage_init(lua_State* L);
+#ifdef CUDA_HALF_TENSOR
+extern void cutorch_CudaHalfStorage_init(lua_State* L);
+#else
+extern void cutorch_HalfStorageCopy_init(lua_State *L);
+#endif
+
+extern void cutorch_CudaByteTensor_init(lua_State* L);
+extern void cutorch_CudaCharTensor_init(lua_State* L);
+extern void cutorch_CudaShortTensor_init(lua_State* L);
+extern void cutorch_CudaIntTensor_init(lua_State* L);
+extern void cutorch_CudaLongTensor_init(lua_State* L);
+extern void cutorch_CudaTensor_init(lua_State* L);
+extern void cutorch_CudaDoubleTensor_init(lua_State* L);
+#ifdef CUDA_HALF_TENSOR
+extern void cutorch_CudaHalfTensor_init(lua_State* L);
+#else
+extern void cutorch_HalfTensorCopy_init(lua_State *L);
+#endif
+
+extern void cutorch_CudaByteTensorOperator_init(lua_State* L);
+extern void cutorch_CudaCharTensorOperator_init(lua_State* L);
+extern void cutorch_CudaShortTensorOperator_init(lua_State* L);
+extern void cutorch_CudaIntTensorOperator_init(lua_State* L);
+extern void cutorch_CudaLongTensorOperator_init(lua_State* L);
+extern void cutorch_CudaTensorOperator_init(lua_State* L);
+extern void cutorch_CudaDoubleTensorOperator_init(lua_State* L);
+#ifdef CUDA_HALF_TENSOR
+extern void cutorch_CudaHalfTensorOperator_init(lua_State* L);
+#endif
+
+extern void cutorch_CudaByteTensorMath_init(lua_State* L);
+extern void cutorch_CudaCharTensorMath_init(lua_State* L);
+extern void cutorch_CudaShortTensorMath_init(lua_State* L);
+extern void cutorch_CudaIntTensorMath_init(lua_State* L);
+extern void cutorch_CudaLongTensorMath_init(lua_State* L);
+extern void cutorch_CudaTensorMath_init(lua_State* L);
+extern void cutorch_CudaDoubleTensorMath_init(lua_State* L);
+#ifdef CUDA_HALF_TENSOR
+extern void cutorch_CudaHalfTensorMath_init(lua_State* L);
+#endif
+
+
+/*
+   Iteration utilities for lists of streams and lists of gpus with streams
+*/
+
+int checkAndCountListOfStreams(lua_State *L, THCState *state, int arg,
+                               int device)
+{
+  if (!lua_istable(L, arg)) {
+    THError("expecting array of device streams");
+  }
+
+  /* Push table to top */
+  lua_pushvalue(L, arg);
+
+  /* Check that all values in the table are numeric and in bounds */
+  int streams = 0;
+  lua_pushnil(L);
+  while (lua_next(L, -2)) {
+    if (!lua_isnumber(L, -2)) {
+      THError("expected array of streams, not table");
+    }
+    if (!lua_isnumber(L, -1)) {
+      THError("array of stream ids must contain numeric ids");
+    }
+    int streamId = (int) lua_tonumber(L, -1);
+
+    /* This will error out if the stream is not in bounds */
+    THCState_getDeviceStream(state, device, streamId);
+
+    ++streams;
+    lua_pop(L, 1);
+  }
+
+  /* Pop table from top */
+  lua_pop(L, 1);
+  return streams;
+}
+
+void checkAndCountListOfGPUStreamPairs(lua_State *L, THCState *state, int arg,
+                                       int* gpus,
+                                       int* streams)
+{
+  if (!lua_istable(L, arg)) {
+    THError("expecting table of gpu={streams...}");
+  }
+
+  /* Push table to top */
+  lua_pushvalue(L, arg);
+
+  /* Check that all values in the table are tables of numeric and in bounds */
+  *gpus = 0;
+  *streams = 0;
+
+  lua_pushnil(L);
+  while (lua_next(L, -2)) {
+    /* -2 is key (device), -1 is value, in the form device={streams...} */
+    if (!lua_isnumber(L, -2) || !lua_istable(L, -1)) {
+      THError("expecting table of gpu={streams...}");
+    }
+
+    int device = (int) lua_tonumber(L, -2) - 1;
+    /* Verify device is in range */
+    if (device < 0 || device >= THCState_getNumDevices(state)) {
+      THError("%d is not a device", device + 1);
+    }
+
+    /* Verify that the list is a list of streams */
+    *streams += checkAndCountListOfStreams(L, state, -1, device);
+    ++(*gpus);
+    lua_pop(L, 1);
+  }
+
+  /* Pop table from top */
+  lua_pop(L, 1);
+}
+
+int createSingleDeviceEvents(lua_State *L, THCState *state, int arg,
+                             int device, cudaEvent_t* event)
+{
+
+  /* Push table to top */
+  lua_pushvalue(L, arg);
+
+  /* Record events */
+  lua_pushnil(L);
+  int i = 0;
+  while (lua_next(L, -2)) {
+    int streamId = (int) lua_tonumber(L, -1);
+    cudaStream_t streamWaitingOn =
+      THCState_getDeviceStream(state, device, streamId);
+    THCudaCheck(cudaEventCreateWithFlags(&event[i], cudaEventDisableTiming));
+    THCudaCheck(cudaEventRecord(event[i], streamWaitingOn));
+    lua_pop(L, 1);
+    i++;
+  }
+  /* Pop table from top */
+  lua_pop(L, 1);
+  return i;
+}
+
+void createMultiDeviceEvents(lua_State *L, THCState *state, int arg,
+                             cudaEvent_t* events)
+{
+  /* Push {gpu={streams...}} table */
+  lua_pushvalue(L, arg);
+
+  /* Create and record events per each GPU */
+  int gpu = 0;
+  lua_pushnil(L);
+  while (lua_next(L, -2)) {
+    int device = (int) lua_tonumber(L, -2) - 1;
+    THCudaCheck(cudaSetDevice(device));
+    events += createSingleDeviceEvents(L, state, -1, device, events);
+    ++gpu;
+
+    lua_pop(L, 1);
+  }
+
+  /* Pop {gpu={streams...}} table */
+  lua_pop(L, 1);
+}
+
+void waitSingleDeviceEvents(lua_State *L, THCState *state, int arg,
+                           int device, cudaEvent_t * event, int numEvents)
+{
+  /* Push table to top */
+  lua_pushvalue(L, arg);
+
+  /* Then, wait on the events. Each stream is actually waiting on itself here
+     too, but that's harmless and isn't worth weeding out. */
+  lua_pushnil(L);
+  while (lua_next(L, -2)) {
+    int streamId = (int) lua_tonumber(L, -1);
+    cudaStream_t stream =
+      THCState_getDeviceStream(state, device, streamId);
+    for (int i = 0; i < numEvents; i++) {
+      THCudaCheck(cudaStreamWaitEvent(stream, event[i], 0));
+    }
+    lua_pop(L, 1);
+  }
+
+  /* Pop table from top */
+  lua_pop(L, 1);
+}
+
+
+void waitMultiDeviceEvents(lua_State *L, THCState *state, int arg,
+                           cudaEvent_t* events, int streams)
+{
+  /* Push {gpu={streams...}} table */
+  lua_pushvalue(L, arg);
+
+  /* Then, wait on the events. Each stream is actually waiting on itself here
+     too, but that's harmless and isn't worth weeding out. */
+  lua_pushnil(L);
+  while (lua_next(L, -2)) {
+    int device = (int) lua_tonumber(L, -2) - 1;
+    THCudaCheck(cudaSetDevice(device));
+
+    /* Push stream table */
+    lua_pushvalue(L, -1);
+    lua_pushnil(L);
+    while (lua_next(L, -2)) {
+      int streamId = (int) lua_tonumber(L, -1);
+
+      cudaStream_t stream =
+        THCState_getDeviceStream(state, device, streamId);
+
+      /* Each stream waits on all events */
+      for (int i = 0; i < streams; ++i) {
+        THCudaCheck(cudaStreamWaitEvent(stream, events[i], 0));
+      }
+
+      lua_pop(L, 1);
+    }
+
+    /* Pop stream table and GPU entry */
+    lua_pop(L, 2);
+  }
+
+  /* Pop {gpu={streams...}} table */
+  lua_pop(L, 1);
+}
+
+/* Synchronizes the host with respect to the current device */
+static int cutorch_synchronize(lua_State *L)
+{
+  THCudaCheck(cudaDeviceSynchronize());
+  return 0;
+}
+
+/* Synchronizes the host with respect to all devices */
+static int cutorch_synchronizeAll(lua_State *L)
+{
+  int prevDev = -1;
+  THCudaCheck(cudaGetDevice(&prevDev));
+
+  int devices = -1;
+  THCudaCheck(cudaGetDeviceCount(&devices));
+
+  for (int i = 0; i < devices; ++i) {
+    THCudaCheck(cudaSetDevice(i));
+    THCudaCheck(cudaDeviceSynchronize());
+  }
+
+  THCudaCheck(cudaSetDevice(prevDev));
+
+  return 0;
+}
+
+/*
+   Usage:
+   cutorch.reserveStreams(n)
+   Allocates n user streams for every device present. If fewer than
+   n streams are currently allocated, the missing ones are created.
+   If more than n streams are currently allocated, does nothing.
+   The default CUDA stream is assumed to be stream 0 and is always present;
+   the allocated streams are user streams on top of the default stream
+   (thus, reserveStreams(1) creates 1 user stream, so each device has two
+   streams available: the default stream 0 and the user stream 1).
+*/
+static int cutorch_reserveStreams(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  int numStreams = (int) luaL_checknumber(L, 1);
+  int nonBlocking = lua_toboolean(L, 2);
+  THCState_reserveStreams(state, numStreams, nonBlocking);
+
+  return 0;
+}
+
+/*
+   Usage:
+   cutorch.reserveBlasHandles(n)
+   Allocates n blasHandles for every device present. If fewer than
+   n blasHandles are currently allocated, the missing ones are created.
+   If more than n blasHandles are currently allocated, does nothing.
+   Unlike for streams, there is no default blasHandle.
+*/
+static int cutorch_reserveBlasHandles(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  int numHandles = (int) luaL_checknumber(L, 1);
+  THCState_reserveBlasHandles(state, numHandles);
+
+  return 0;
+}
+
+/*
+   Usage:
+   n = cutorch.getNumStreams()
+   Returns the number of user streams allocated for every device present.
+   By default, this is 0.
+*/
+static int cutorch_getNumStreams(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  lua_pushnumber(L, THCState_getNumStreams(state));
+
+  return 1;
+}
+
+/*
+   Usage:
+   n = cutorch.getNumBlasHandles()
+   Returns the number of user blasHandles allocated for every device present.
+   By default, this is 1.
+*/
+static int cutorch_getNumBlasHandles(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  lua_pushnumber(L, THCState_getNumBlasHandles(state));
+
+  return 1;
+}
+
+/*
+   Usage:
+   cutorch.setStream(n)
+   For all devices, sets the current user stream in use to the index
+   specified. e.g.,
+   ---
+   cutorch.setDevice(1)
+   cutorch.setStream(3)
+   -- device 1 stream 3 in use here
+   cutorch.setDevice(2)
+   -- device 2 stream 3 in use here
+   ---
+   0 is the default stream on the device.
+*/
+static int cutorch_setStream(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  int stream = (int) luaL_checknumber(L, 1);
+  THCState_setCurrentStreamIndex(state, stream);
+
+  return 0;
+}
+
+/*
+   Usage:
+   cutorch.setBlasHandle(n)
+   For all devices, sets the current blasHandle in use to the index
+   specified. e.g.,
+   ---
+   cutorch.setDevice(1)
+   cutorch.setBlasHandle(3)
+   -- device 1 blasHandle 3 in use here
+   cutorch.setDevice(2)
+   -- device 2 blasHandle 3 in use here
+   ---
+*/
+static int cutorch_setBlasHandle(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  int handle = (int) luaL_checknumber(L, 1);
+  THCState_setCurrentBlasHandleIndex(state, handle);
+
+  return 0;
+}
+
+/*
+   Usage:
+   n = cutorch.getStream()
+   Returns the current user stream for all devices in use (as previously
+   set via cutorch.setStream(n)). 0 is the default stream on the device
+   and is its initial value.
+*/
+static int cutorch_getStream(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  lua_pushnumber(L, THCState_getCurrentStreamIndex(state));
+
+  return 1;
+}
+
+/*
+   Usage:
+   n = cutorch.getBlasHandle()
+   Returns the current blasHandle for all devices in use (as previously
+   set via cutorch.setBlasHandle(n)).
+*/
+static int cutorch_getBlasHandle(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  lua_pushnumber(L, THCState_getCurrentBlasHandleIndex(state));
+
+  return 1;
+}
+
+/*
+   Usage:
+   cutorch.setDefaultStream()
+   Equivalent to cutorch.setStream(0).
+*/
+static int cutorch_setDefaultStream(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCState_setStream(state, NULL);
+  return 0;
+}
+
+/*
+   Usage:
+   cutorch.streamWaitFor(waiterStream, {waitForStream1, ..., waitForStreamN})
+   for streams on the current device. Creates a one-way barrier where
+   waiterStream waits for waitForStream1-N to reach the current point.
+*/
+static int cutorch_streamWaitFor(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+
+  int curDev = -1;
+  THCudaCheck(cudaGetDevice(&curDev));
+
+  /* Check that the waiting stream is in bounds; this will error out if not */
+  int waitingId = (int) luaL_checknumber(L, 1);
+  cudaStream_t streamWaiting =
+    THCState_getDeviceStream(state, curDev, waitingId);
+
+  /* Validate the streams that we are waiting on */
+  int streams = checkAndCountListOfStreams(L, state, 2, curDev);
+
+  if (streams < 1) {
+    /* nothing to synchronize */
+    return 0;
+  }
+  /* One-way dependency; streamWaiting will wait for the list of streams to
+     wait on to complete execution of pending scheduled kernels/events */
+  cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams);
+  createSingleDeviceEvents(L, state, 2, curDev, events);
+  /* Then, wait on them */
+  for (int i = 0; i < streams; i++) {
+    THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0));
+    THCudaCheck(cudaEventDestroy(events[i]));
+  }
+  free(events);
+  return 0;
+}
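+
+/* Editor's note: a hedged usage sketch (not part of the upstream file),
+   assuming the user streams have been reserved first:
+
+     cutorch.reserveStreams(2)
+     cutorch.streamWaitFor(1, {2})  -- stream 1 waits for work queued on stream 2
+*/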
+
+/*
+   Usage:
+   cutorch.streamWaitForMultiDevice(gpuWaiter, streamWaiter,
+                                    {[gpu1]={stream1_1, ..., stream1_N},
+                                    [gpuK]={streamK_1, ..., streamK_M}})
+   with a specified GPU per each list of streams.
+   Stream (gpuWaiter, streamWaiter) will wait on all of the other streams
+   (gpu1, stream1_1), ..., (gpu1, stream1_N), ...,
+   (gpuK, streamK_1), ..., (gpuK, streamK_M) to complete fully, as a one-way
+   barrier only (only streamWaiter is blocked).
+   The streams to wait on are bucketed per device. Equivalent to
+   streamWaitFor() if only one GPU's streams are listed.
+*/
+static int cutorch_streamWaitForMultiDevice(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+
+  int prevDev = -1;
+  THCudaCheck(cudaGetDevice(&prevDev));
+
+  /* Validate waiting (gpu, stream); this will error out if not */
+  int gpuWaiter = (int) luaL_checknumber(L, 1) - 1;
+  int streamWaiter = (int) luaL_checknumber(L, 2);
+  cudaStream_t streamWaiting =
+    THCState_getDeviceStream(state, gpuWaiter, streamWaiter);
+
+  /* Validate and count set of {gpu={streams...}} we are waiting on */
+  int gpus = 0;
+  int streams = 0;
+  checkAndCountListOfGPUStreamPairs(L, state, 3, &gpus, &streams);
+
+  if (streams < 1) {
+    /* nothing to synchronize together */
+    return 0;
+  }
+
+  /*
+     Events can only be recorded on the same device on which they are created.
+     -For each GPU, create and record an event for each stream given
+     for that GPU.
+     -For (gpuWaiter, streamWaiter), wait on all of the above events.
+  */
+  cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);
+
+  /* First, for each GPU, create and record an event on each of the streams
+     listed for that GPU */
+  createMultiDeviceEvents(L, state, 3, events);
+
+  /* Then, wait on the events */
+  THCudaCheck(cudaSetDevice(gpuWaiter));
+  for (int i = 0; i < streams; ++i) {
+    THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0));
+  }
+
+  /* Clean up events */
+  for (int i = 0; i < streams; ++i) {
+    THCudaCheck(cudaEventDestroy(events[i]));
+  }
+  free(events);
+  THCudaCheck(cudaSetDevice(prevDev));
+
+  return 0;
+}
+
+/*
+   Usage:
+   cutorch.streamBarrier({stream1, stream2, ..., streamN})
+   applies to streams for the current device. Creates an N-way barrier
+   to synchronize all of the streams given
+*/
+static int cutorch_streamBarrier(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+
+  int curDev = -1;
+  THCudaCheck(cudaGetDevice(&curDev));
+
+  int streams = checkAndCountListOfStreams(L, state, 1, curDev);
+
+  if (streams < 2) {
+    /* nothing to synchronize together */
+    return 0;
+  }
+  /* Multi-way dependency (barrier); all streams must complete execution
+     of pending scheduled kernels/events */
+  cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams);
+  /* First, create an event for each stream and record it */
+  int eventsCreated =  createSingleDeviceEvents(L, state, 1, curDev, events);
+
+  /* Then, wait on the event. Each stream is actually waiting on itself here
+     too, but that's harmless and isn't worth weeding out. */
+  waitSingleDeviceEvents(L, state, 1, curDev, events, eventsCreated);
+  for (int i = 0; i < eventsCreated; i++)
+    THCudaCheck(cudaEventDestroy(events[i]));
+
+  free(events);
+  return 0;
+}
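+
+/* Editor's note: a hedged usage sketch (not part of the upstream file):
+
+     cutorch.reserveStreams(3)
+     cutorch.streamBarrier({1, 2, 3})  -- streams 1-3 all wait for one another
+*/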
+
+/* usage:
+   cutorch.streamBarrierMultiDevice({[gpu1]={stream1_1, ..., stream1_N},
+                                     [gpuK]={streamK_1, ..., streamK_M}})
+   with a specified GPU per each list of streams.
+   Each stream (gpu1, stream1_1), ..., (gpu1, stream1_N), ...,
+               (gpuK, streamK_1), ..., (gpuK, streamK_M) will wait
+   for all others to complete fully.
+   Streams are bucketed per device. Equivalent to streamBarrier() if only
+   one GPU is specified.
+ */
+static int cutorch_streamBarrierMultiDevice(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+
+  int prevDev = -1;
+  THCudaCheck(cudaGetDevice(&prevDev));
+
+  /* Validate and count set of {gpu={streams...}} that are mutually waiting */
+  int gpus = 0;
+  int streams = 0;
+  checkAndCountListOfGPUStreamPairs(L, state, 1, &gpus, &streams);
+
+  if (streams < 2) {
+    /* nothing to synchronize together */
+    return 0;
+  }
+
+  /*
+     Events can only be recorded on the same device on which they are created.
+     -For each GPU, create an event, and record that event on each stream given
+     for that GPU.
+     -For each GPU, for each stream, wait on the event created by each other
+     GPU.
+  */
+  cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);
+
+  /* First, for each GPU, create and record an event on each of the streams
+     listed for that GPU */
+  createMultiDeviceEvents(L, state, 1, events);
+
+  /* Then, wait on the events. Each stream is actually waiting on itself here
+     too, but that's harmless and isn't worth weeding out. */
+  waitMultiDeviceEvents(L, state, 1, events, streams);
+
+  /* Clean up events */
+  for (int i = 0; i < streams; ++i) {
+    THCudaCheck(cudaEventDestroy(events[i]));
+  }
+  free(events);
+  THCudaCheck(cudaSetDevice(prevDev));
+
+  return 0;
+}
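+
+/* Editor's note: a hedged usage sketch (not part of the upstream file),
+   assuming two devices each with at least two reserved user streams:
+
+     cutorch.streamBarrierMultiDevice({[1]={1, 2}, [2]={1, 2}})
+*/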
+
+/*
+   Usage:
+   cutorch.streamSynchronize(n)
+   For the current device, synchronizes with the given stream only
+   (cudaStreamSynchronize).
+   0 is the default stream on the device.
+*/
+static int cutorch_streamSynchronize(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  int streamId = (int) luaL_checknumber(L, 1);
+
+  int curDev = -1;
+  THCudaCheck(cudaGetDevice(&curDev));
+
+  /* This also validates the stream */
+  cudaStream_t stream = THCState_getDeviceStream(state, curDev, streamId);
+  THCudaCheck(cudaStreamSynchronize(stream));
+
+  return 0;
+}
+
+static int cutorch_getDevice(lua_State *L)
+{
+  int device;
+  THCudaCheck(cudaGetDevice(&device));
+  device++;
+  lua_pushnumber(L, device);
+  return 1;
+}
+
+static int cutorch_deviceReset(lua_State *L)
+{
+  printf("WARNING: cutorch.deviceReset has been depreceated."
+	 " Just remove the call from your code.\n");
+  return 0;
+}
+
+static int cutorch_getDeviceCount(lua_State *L)
+{
+  int ndevice;
+  THCudaCheck(cudaGetDeviceCount(&ndevice));
+  lua_pushnumber(L, ndevice);
+  return 1;
+}
+
+static int cutorch_getPeerToPeerAccess(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  int dev = (int) luaL_checknumber(L, 1) - 1;
+  int devToAccess = (int) luaL_checknumber(L, 2) - 1;
+
+  /* device bounds checking is performed within */
+  int enabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);
+  lua_pushboolean(L, enabled);
+
+  return 1;
+}
+
+static int cutorch_setPeerToPeerAccess(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  int dev = (int) luaL_checknumber(L, 1) - 1;
+  int devToAccess = (int) luaL_checknumber(L, 2) - 1;
+  int enable = lua_toboolean(L, 3);
+
+  /* device bounds checking is performed within */
+  THCState_setPeerToPeerAccess(state, dev, devToAccess, enable);
+
+  return 0;
+}
+
+static int cutorch_getKernelPeerToPeerAccess(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  lua_pushboolean(L, THCState_getKernelPeerToPeerAccessEnabled(state));
+
+  return 1;
+}
+
+static int cutorch_setKernelPeerToPeerAccess(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+
+  int val = lua_toboolean(L, -1);
+  THCState_setKernelPeerToPeerAccessEnabled(state, val);
+
+  return 0;
+}
+
+static int cutorch_getMemoryUsage(lua_State *L) {
+  size_t freeBytes = 0;
+  size_t totalBytes = 0;
+  int curDevice;
+  THCudaCheck(cudaGetDevice(&curDevice));
+  THCState *state = cutorch_getstate(L);
+
+  int device = luaL_optint(L, 1, -10);
+  if (device == -10) { /* no argument passed, current device mem usage */
+    THCudaCheck(THCudaMemGetInfo(state, &freeBytes, &totalBytes));
+  } else { /* argument was given, particular device's memory usage */
+    THCudaCheck(cudaSetDevice(device-1)); /* zero indexed */
+    THCudaCheck(THCudaMemGetInfo(state, &freeBytes, &totalBytes));
+    THCudaCheck(cudaSetDevice(curDevice));
+  }
+  lua_pushnumber(L, freeBytes);
+  lua_pushnumber(L, totalBytes);
+  return 2;
+}
+
+static int cutorch_setDevice(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  int device = (int)luaL_checknumber(L, 1)-1;
+  THCudaCheck(cudaSetDevice(device));
+  return 0;
+}
+
+#define SET_DEVN_PROP(NAME) \
+  lua_pushnumber(L, prop.NAME); \
+  lua_setfield(L, -2, #NAME);
+
+static int cutorch_getDeviceProperties(lua_State *L)
+{
+  int device = (int)luaL_checknumber(L, 1)-1;
+
+  // switch context to given device so the call to cudaMemGetInfo is for the correct device
+  int oldDevice;
+  THCudaCheck(cudaGetDevice(&oldDevice));
+  THCudaCheck(cudaSetDevice(device));
+
+  struct cudaDeviceProp prop;
+  THCudaCheck(cudaGetDeviceProperties(&prop, device));
+  lua_newtable(L);
+  SET_DEVN_PROP(canMapHostMemory);
+  SET_DEVN_PROP(clockRate);
+  SET_DEVN_PROP(computeMode);
+  SET_DEVN_PROP(deviceOverlap);
+  SET_DEVN_PROP(integrated);
+  SET_DEVN_PROP(kernelExecTimeoutEnabled);
+  SET_DEVN_PROP(major);
+  SET_DEVN_PROP(maxThreadsPerBlock);
+  SET_DEVN_PROP(memPitch);
+  SET_DEVN_PROP(minor);
+  SET_DEVN_PROP(multiProcessorCount);
+  SET_DEVN_PROP(regsPerBlock);
+  SET_DEVN_PROP(sharedMemPerBlock);
+  SET_DEVN_PROP(textureAlignment);
+  SET_DEVN_PROP(totalConstMem);
+  SET_DEVN_PROP(totalGlobalMem);
+  SET_DEVN_PROP(warpSize);
+  SET_DEVN_PROP(pciBusID);
+  SET_DEVN_PROP(pciDeviceID);
+  SET_DEVN_PROP(pciDomainID);
+  SET_DEVN_PROP(maxTexture1D);
+  SET_DEVN_PROP(maxTexture1DLinear);
+
+  size_t freeMem;
+  THCudaCheck(cudaMemGetInfo (&freeMem, NULL));
+  lua_pushnumber(L, freeMem);
+  lua_setfield(L, -2, "freeGlobalMem");
+
+  lua_pushstring(L, prop.name);
+  lua_setfield(L, -2, "name");
+
+  // restore context
+  THCudaCheck(cudaSetDevice(oldDevice));
+
+  return 1;
+}
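+
+/* Editor's note: a hedged usage sketch of the two query functions above (not
+   part of the upstream file):
+
+     local props = cutorch.getDeviceProperties(cutorch.getDevice())
+     print(props.name, props.totalGlobalMem, props.multiProcessorCount)
+     local freeBytes, totalBytes = cutorch.getMemoryUsage()  -- current device
+*/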
+
+static int cutorch_seed(lua_State *L)
+{
+  unsigned long long seed = THCRandom_seed(cutorch_getstate(L));
+  lua_pushnumber(L, seed);
+  return 1;
+}
+
+static int cutorch_seedAll(lua_State *L)
+{
+  unsigned long long seed = THCRandom_seedAll(cutorch_getstate(L));
+  lua_pushnumber(L, seed);
+  return 1;
+}
+
+static int cutorch_initialSeed(lua_State *L)
+{
+  unsigned long long seed = THCRandom_initialSeed(cutorch_getstate(L));
+  lua_pushnumber(L, seed);
+  return 1;
+}
+
+static int cutorch_manualSeed(lua_State *L)
+{
+  unsigned long long seed = luaL_checknumber(L, 1);
+  THCRandom_manualSeed(cutorch_getstate(L), seed);
+  return 0;
+}
+
+static int cutorch_manualSeedAll(lua_State* L)
+{
+  unsigned long long seed = luaL_checknumber(L, 1);
+  THCRandom_manualSeedAll(cutorch_getstate(L), seed);
+  return 0;
+}
+
+static int cutorch_getRNGState(lua_State *L)
+{
+  THByteTensor* t = THByteTensor_new();
+  THCRandom_getRNGState(cutorch_getstate(L), t);
+  luaT_pushudata(L, t, "torch.ByteTensor");
+  return 1;
+}
+
+static int cutorch_setRNGState(lua_State *L)
+{
+  THByteTensor* t = luaT_checkudata(L, 1, "torch.ByteTensor");
+  THCRandom_setRNGState(cutorch_getstate(L), t);
+  return 0;
+}
+
+static int cutorch_getState(lua_State *L)
+{
+  lua_getglobal(L, "cutorch");
+  lua_getfield(L, -1, "_state");
+  lua_remove(L, -2);
+  return 1;
+}
+
+static int cutorch_Event_new(lua_State *L)
+{
+  cudaEvent_t *event = luaT_alloc(L, sizeof(cudaEvent_t));
+  THCudaCheck(cudaEventCreate(event));
+
+  THCState *state = cutorch_getstate(L);
+  THCudaCheck(cudaEventRecord(*event, THCState_getCurrentStream(state)));
+  luaT_pushudata(L, event, "cutorch.Event");
+
+  return 1;
+}
+
+static int cutorch_Event_free(lua_State *L)
+{
+  cudaEvent_t *event = luaT_checkudata(L, 1, "cutorch.Event");
+  THCudaCheck(cudaEventDestroy(*event));
+  luaT_free(L, event);
+
+  return 0;
+}
+
+static int cutorch_Event_waitOn(lua_State *L)
+{
+  cudaEvent_t *event = luaT_checkudata(L, 1, "cutorch.Event");
+  THCState *state = cutorch_getstate(L);
+  THCudaCheck(cudaStreamWaitEvent(THCState_getCurrentStream(state), *event, 0));
+
+  return 0;
+}
+
+static const struct luaL_Reg cutorch_Event__[] = {
+  {"waitOn", cutorch_Event_waitOn},
+  {NULL, NULL}
+};
+
+static void cutorch_Event_init(lua_State *L)
+{
+  luaT_newmetatable(L, "cutorch.Event", NULL, cutorch_Event_new, cutorch_Event_free, NULL);
+  luaT_setfuncs(L, cutorch_Event__, 0);
+  lua_pop(L, 1);
+}
+
+static void luaCutorchGCFunction(void *data)
+{
+  lua_State *L = data;
+  lua_gc(L, LUA_GCCOLLECT, 0);
+}
+
+static int cutorch_setHeapTracking(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  int enabled = luaT_checkboolean(L,1);
+  if(enabled) {
+    THCSetGCHandler(state, luaCutorchGCFunction, L);
+  } else {
+    THCSetGCHandler(state, NULL, NULL);
+  }
+  return 0;
+}
+
+static int cutorch_isManagedPtr(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  if(lua_type(L, 1) != LUA_TNUMBER) {
+    THError("Must receive a ptr cast as a number");
+  }
+  void* ptr = (void* )luaL_optinteger(L, 1, 0);
+  struct cudaPointerAttributes attributes;
+  cudaError_t res = cudaPointerGetAttributes(&attributes, ptr);
+  if (res == cudaErrorInvalidValue) {
+    lua_pushboolean(L, 0);
+  } else {
+    THCudaCheck(res);
+    lua_pushboolean(L, attributes.isManaged);
+  }
+  return 1;
+}
+
+static int cutorch_shutdown(lua_State *L)
+{
+  THCState **state = (THCState **) lua_topointer(L, 1);
+  THCudaShutdown(*state);
+  THCState_free(*state);
+  return 0;
+}
+
+static int cutorch_hasHalfInstructions(lua_State *L) {
+  THCState *state = cutorch_getstate(L);
+#ifdef CUDA_HALF_TENSOR
+  lua_pushboolean(L, THC_nativeHalfInstructions(state));
+#else
+  lua_pushboolean(L, 0);
+#endif
+  return 1;
+}
+
+static int cutorch_hasFastHalfInstructions(lua_State *L) {
+  THCState *state = cutorch_getstate(L);
+#ifdef CUDA_HALF_TENSOR
+  lua_pushboolean(L, THC_fastHalfInstructions(state));
+#else
+  lua_pushboolean(L, 0);
+#endif
+  return 1;
+}
+
+static int cutorch_sleep(lua_State *L) {
+  THCState *state = cutorch_getstate(L);
+  if (!luaT_checklong(L, 1)) {
+      THError("expected number 'cycles'");
+  }
+  THC_sleep(state, luaT_tolong(L, 1));
+  return 0;
+}
+
+static const struct luaL_Reg cutorch_stuff__ [] = {
+  {"synchronize", cutorch_synchronize},
+  {"synchronizeAll", cutorch_synchronizeAll},
+  {"reserveBlasHandles", cutorch_reserveBlasHandles},
+  {"getNumBlasHandles", cutorch_getNumBlasHandles},
+  {"setBlasHandle", cutorch_setBlasHandle},
+  {"getBlasHandle", cutorch_getBlasHandle},
+  {"reserveStreams", cutorch_reserveStreams},
+  {"getNumStreams", cutorch_getNumStreams},
+  {"setStream", cutorch_setStream},
+  {"getStream", cutorch_getStream},
+  {"setDefaultStream", cutorch_setDefaultStream},
+  {"streamWaitFor", cutorch_streamWaitFor},
+  {"streamWaitForMultiDevice", cutorch_streamWaitForMultiDevice},
+  {"streamBarrier", cutorch_streamBarrier},
+  {"streamBarrierMultiDevice", cutorch_streamBarrierMultiDevice},
+  {"streamSynchronize", cutorch_streamSynchronize},
+  {"getDevice", cutorch_getDevice},
+  {"deviceReset", cutorch_deviceReset},
+  {"getDeviceCount", cutorch_getDeviceCount},
+  {"getPeerToPeerAccess", cutorch_getPeerToPeerAccess},
+  {"setPeerToPeerAccess", cutorch_setPeerToPeerAccess},
+  {"setKernelPeerToPeerAccess", cutorch_setKernelPeerToPeerAccess},
+  {"getKernelPeerToPeerAccess", cutorch_getKernelPeerToPeerAccess},
+  {"getDeviceProperties", cutorch_getDeviceProperties},
+  {"getMemoryUsage", cutorch_getMemoryUsage},
+  {"hasHalfInstructions", cutorch_hasHalfInstructions},
+  {"hasFastHalfInstructions", cutorch_hasFastHalfInstructions},
+  {"setDevice", cutorch_setDevice},
+  {"seed", cutorch_seed},
+  {"seedAll", cutorch_seedAll},
+  {"initialSeed", cutorch_initialSeed},
+  {"manualSeed", cutorch_manualSeed},
+  {"manualSeedAll", cutorch_manualSeedAll},
+  {"_sleep", cutorch_sleep},
+  {"getRNGState", cutorch_getRNGState},
+  {"setRNGState", cutorch_setRNGState},
+  {"getState", cutorch_getState},
+  {"setHeapTracking", cutorch_setHeapTracking},
+  {"isManagedPtr", cutorch_isManagedPtr},
+  {NULL, NULL}
+};
+
+LUA_EXTERNC DLL_EXPORT int luaopen_libcutorch(lua_State *L);
+
+int luaopen_libcutorch(lua_State *L)
+{
+  lua_newtable(L);
+  lua_pushvalue(L, -1);
+  lua_setglobal(L, "cutorch");
+  luaL_setfuncs(L, cutorch_stuff__, 0);
+
+  THCState* state = THCState_alloc();
+
+  /* Enable the caching allocator unless THC_CACHING_ALLOCATOR=0 */
+  char* thc_caching_allocator = getenv("THC_CACHING_ALLOCATOR");
+  if (!thc_caching_allocator || strcmp(thc_caching_allocator, "0") != 0) {
+    THCState_setDeviceAllocator(state, THCCachingAllocator_get());
+    state->cudaHostAllocator = &THCCachingHostAllocator;
+  }
+
+  THCudaInit(state);
+
+  /* Register torch.CudaHostAllocator. */
+  luaT_pushudata(L, THCState_getCudaHostAllocator(state), "torch.Allocator");
+  lua_setfield(L, -2, "CudaHostAllocator");
+
+  /* Register torch.CudaUVAHostAllocator. */
+  luaT_pushudata(L, THCState_getCudaUVAAllocator(state), "torch.Allocator");
+  lua_setfield(L, -2, "CudaUVAAllocator");
+
+#ifdef USE_MAGMA
+  THCMagma_init(state);
+  lua_pushboolean(L, 1);
+  lua_setfield(L, -2, "magma");
+#endif
+
+  cutorch_CudaByteStorage_init(L);
+  cutorch_CudaCharStorage_init(L);
+  cutorch_CudaShortStorage_init(L);
+  cutorch_CudaIntStorage_init(L);
+  cutorch_CudaLongStorage_init(L);
+  cutorch_CudaStorage_init(L);
+  cutorch_CudaDoubleStorage_init(L);
+#ifdef CUDA_HALF_TENSOR
+  cutorch_CudaHalfStorage_init(L);
+#else
+  cutorch_HalfStorageCopy_init(L);
+#endif
+
+  cutorch_CudaByteTensor_init(L);
+  cutorch_CudaCharTensor_init(L);
+  cutorch_CudaShortTensor_init(L);
+  cutorch_CudaIntTensor_init(L);
+  cutorch_CudaLongTensor_init(L);
+  cutorch_CudaTensor_init(L);
+  cutorch_CudaDoubleTensor_init(L);
+#ifdef CUDA_HALF_TENSOR
+  cutorch_CudaHalfTensor_init(L);
+#else
+  cutorch_HalfTensorCopy_init(L);
+#endif
+
+  cutorch_CudaByteTensorOperator_init(L);
+  cutorch_CudaCharTensorOperator_init(L);
+  cutorch_CudaShortTensorOperator_init(L);
+  cutorch_CudaIntTensorOperator_init(L);
+  cutorch_CudaLongTensorOperator_init(L);
+  cutorch_CudaTensorOperator_init(L);
+  cutorch_CudaDoubleTensorOperator_init(L);
+#ifdef CUDA_HALF_TENSOR
+  cutorch_CudaHalfTensorOperator_init(L);
+#endif
+
+  cutorch_CudaByteTensorMath_init(L);
+  cutorch_CudaCharTensorMath_init(L);
+  cutorch_CudaShortTensorMath_init(L);
+  cutorch_CudaIntTensorMath_init(L);
+  cutorch_CudaLongTensorMath_init(L);
+  cutorch_CudaTensorMath_init(L);
+  cutorch_CudaDoubleTensorMath_init(L);
+#ifdef CUDA_HALF_TENSOR
+  cutorch_CudaHalfTensorMath_init(L);
+#endif
+
+  cutorch_Event_init(L);
+
+  /* Store state in cutorch table. */
+  lua_pushlightuserdata(L, state);
+  lua_setfield(L, -2, "_state");
+
+#ifdef CUDA_HALF_TENSOR
+  lua_pushboolean(L, 1);
+#else
+  lua_pushboolean(L, 0);
+#endif
+  lua_setfield(L, -2, "hasHalf");
+
+  /* store gpu driver version in field */
+  int driverVersion;
+  THCudaCheck(cudaDriverGetVersion(&driverVersion));
+  lua_pushinteger(L, driverVersion);
+  lua_setfield(L, -2, "driverVersion");
+
+  /* when cutorch goes out of scope, we need to make sure THCState is properly
+     shut down (so that memory does not leak). Since _state is a lightuserdata
+     we cannot associate an __gc method with it. Hence, create a userdata, and
+     associate a metatable with it, which has an __gc method which properly
+     calls THCudaShutdown.
+  */
+  /* create a new userdata type which is a pointer to a pointer */
+  THCState **thc_pointer = (THCState**)lua_newuserdata(L, sizeof(void*));
+  /* set the state pointer */
+  *thc_pointer = state;
+  /* create a table that will be used as the metatable */
+  lua_newtable(L);
+  /* push the gc function onto the stack */
+  lua_pushcfunction(L, &cutorch_shutdown);
+  /* set the __gc field in the table to the function (function is popped) */
+  lua_setfield(L, -2, "__gc");
+  /* now the table is on the top of the stack, and the userdata below it,
+     setmetatable on the userdata with the table. table is popped */
+  lua_setmetatable(L, -2);
+  /* now the userdata is on top, with the cutorch table below it,
+     set the field cutorch.__stategc to this userdata.
+     userdata is popped, leaving cutorch table on top of the stack */
+  lua_setfield(L, -2, "_stategc");
+
+  return 1;
+}
diff --git a/init.lua b/init.lua
new file mode 100644
index 0000000..fdb7b08
--- /dev/null
+++ b/init.lua
@@ -0,0 +1,153 @@
+require "torch"
+paths.require("libcutorch")
+
+torch.CudaByteStorage.__tostring__   = torch.ByteStorage.__tostring__
+torch.CudaByteTensor.__tostring__    = torch.ByteTensor.__tostring__
+torch.CudaCharStorage.__tostring__   = torch.CharStorage.__tostring__
+torch.CudaCharTensor.__tostring__    = torch.CharTensor.__tostring__
+torch.CudaShortStorage.__tostring__  = torch.ShortStorage.__tostring__
+torch.CudaShortTensor.__tostring__   = torch.ShortTensor.__tostring__
+torch.CudaIntStorage.__tostring__    = torch.IntStorage.__tostring__
+torch.CudaIntTensor.__tostring__     = torch.IntTensor.__tostring__
+torch.CudaLongStorage.__tostring__   = torch.LongStorage.__tostring__
+torch.CudaLongTensor.__tostring__    = torch.LongTensor.__tostring__
+torch.CudaStorage.__tostring__       = torch.FloatStorage.__tostring__
+torch.CudaTensor.__tostring__        = torch.FloatTensor.__tostring__
+torch.CudaDoubleStorage.__tostring__ = torch.DoubleStorage.__tostring__
+torch.CudaDoubleTensor.__tostring__  = torch.DoubleTensor.__tostring__
+if cutorch.hasHalf then
+   torch.CudaHalfStorage.__tostring__  = torch.HalfStorage.__tostring__
+   torch.CudaHalfTensor.__tostring__  = torch.HalfTensor.__tostring__
+end
+
+require('cutorch.Tensor')
+require('cutorch.FFI')
+require('cutorch.test')
+
+local unpack = unpack or table.unpack
+
+function cutorch.withDevice(newDeviceID, closure)
+    local curDeviceID = cutorch.getDevice()
+    cutorch.setDevice(newDeviceID)
+    local vals = {pcall(closure)}
+    cutorch.setDevice(curDeviceID)
+    if vals[1] then
+       return unpack(vals, 2)
+    end
+    error(unpack(vals, 2))
+end
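+
+-- A minimal usage sketch (assuming a second GPU is available; the device index
+-- and tensor size are arbitrary): withDevice restores the previous device even
+-- if the closure raises an error.
+--
+--   local t = cutorch.withDevice(2, function()
+--      return torch.CudaTensor(10):fill(1)  -- allocated on device 2
+--   end)
+--   print(cutorch.getDevice())              -- back on the original device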
+
+local function longTensorSize(...)
+   local size
+   if not ... then
+      size = torch.LongTensor{0}
+   elseif torch.isStorage(...) then
+      size = torch.LongTensor(...)
+   else
+      size = torch.LongTensor{...}
+   end
+   return size
+end
+
+-- Creates a FloatTensor using the CudaHostAllocator.
+-- Accepts either a LongStorage or a sequence of numbers.
+function cutorch.createCudaHostTensor(...)
+   local size = longTensorSize(...)
+   local storage = torch.FloatStorage(cutorch.CudaHostAllocator, size:prod())
+   return torch.FloatTensor(storage, 1, size:storage())
+end
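+
+-- A minimal usage sketch (sizes are arbitrary): tensors backed by pinned
+-- (page-locked) host memory make host<->device copies faster and allow them
+-- to overlap with computation when streams are used.
+--
+--   local pinned = cutorch.createCudaHostTensor(4, 5) -- 4x5 FloatTensor in pinned memory
+--   local gpu = torch.CudaTensor(4, 5)
+--   gpu:copy(pinned)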
+
+function cutorch.createCudaHostDoubleTensor(...)
+   local size = longTensorSize(...)
+   local storage = torch.DoubleStorage(cutorch.CudaHostAllocator, size:prod())
+   return torch.DoubleTensor(storage, 1, size:storage())
+end
+
+if cutorch.hasHalf then
+   function cutorch.createCudaHostHalfTensor(...)
+      local size = longTensorSize(...)
+      local storage = torch.HalfStorage(cutorch.CudaHostAllocator, size:prod())
+      return torch.HalfTensor(storage, 1, size:storage())
+   end
+end
+
+-- Creates a CudaTensor using the CudaUVAAllocator.
+-- Accepts either a LongStorage or a sequence of numbers.
+local function _createUVATensor(...)
+   local size = longTensorSize(...)
+   -- See CUDA_C_Programming_guide.pdf for detailed explanation about synchronization
+   -- Section J.
+   -- "It is worth a comment on the synchronization between host and device. Notice how in
+   -- the non-managed example, the synchronous cudaMemcpy() routine is used both to
+   -- synchronize the kernel (that is, to wait for it to finish running), and to transfer the data
+   -- to the host. The Unified Memory examples do not call cudaMemcpy() and so require an
+   -- explicit cudaDeviceSynchronize() before the host program can safely use the output
+   -- from the GPU."
+   -- Section J.2.2.1.
+   -- " Note that if memory is dynamically allocated with cudaMallocManaged() or
+   -- cuMemAllocManaged() while the GPU is active, the behavior of the memory is
+   -- unspecified until additional work is launched or the GPU is synchronized. Attempting
+   -- to access the memory on the CPU during this time may or may not cause a segmentation
+   -- fault."
+   cutorch.synchronize()
+   local storage = torch.FloatStorage(cutorch.CudaUVAAllocator, size:prod())
+   return torch.FloatTensor(storage)
+end
+
+function cutorch.createFloatUVATensor(...)
+   return _createUVATensor(...)
+end
+
+-- Creates a CudaTensor using the CudaUVAAllocator.
+-- Accepts either a LongStorage or a sequence of numbers.
+-- First creates a UVA backed FloatTensor and takes its pointer.
+function cutorch.createCudaUVATensor(...)
+   -- Delegate actual allocation and synchronization to CPU tensor and
+   -- take the pointer.
+   local ft = _createUVATensor(...)
+   local storage = torch.CudaStorage(
+      ft:storage():size(),
+      tonumber(torch.data(ft:storage(), true))
+   )
+   return torch.CudaTensor(storage)
+end
+
+-- UVA storage is a single memory location backed by virtual addressing.
+-- Converting between CPU / GPU tensor types is done by raw pointer passing.
+-- We only support FloatTensor and CudaTensor for now (Cuda -> float and float -> Cuda).
+function cutorch.toFloatUVATensor(t)
+   if not torch.isTensor(t) then
+      error('Must use a tensor, got ' .. torch.type(t))
+   end
+   local storage = torch.FloatStorage(
+      t:storage():size(),
+      tonumber(torch.data(t:storage(), true))
+   )
+   assert(cutorch.isManaged(storage))
+   return torch.FloatTensor(storage)
+end
+
+function cutorch.toCudaUVATensor(t)
+   if not torch.isTensor(t) then
+      error('Must use a tensor, got ' .. torch.type(t))
+   end
+   local storage = torch.CudaStorage(
+      t:storage():size(),
+      tonumber(torch.data(t:storage(), true))
+   )
+   assert(cutorch.isManaged(storage))
+   return torch.CudaTensor(storage)
+end
+
+function cutorch.isManaged(t)
+   if not torch.isTensor(t) and not torch.isStorage(t) then
+      error('Usage: cutorch.isManaged(Tensor|Storage), got ' .. torch.type(t))
+   end
+   return cutorch.isManagedPtr(tonumber(torch.data(t, true)))
+end
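+
+-- A minimal round-trip sketch for the UVA helpers above (assuming the device
+-- supports managed memory; sizes are arbitrary):
+--
+--   local cuda = cutorch.createCudaUVATensor(3, 3) -- managed allocation, CUDA view
+--   local float = cutorch.toFloatUVATensor(cuda)   -- CPU view of the same memory
+--   assert(cutorch.isManaged(float))
+--   cutorch.synchronize()                          -- sync before touching it on the CPU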
+
+-- remove this line to disable automatic cutorch heap-tracking
+-- for garbage collection
+cutorch.setHeapTracking(true)
+
+return cutorch
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
new file mode 100644
index 0000000..ebd35b9
--- /dev/null
+++ b/lib/CMakeLists.txt
@@ -0,0 +1 @@
+ADD_SUBDIRECTORY(THC)
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt
new file mode 100644
index 0000000..0e08120
--- /dev/null
+++ b/lib/THC/CMakeLists.txt
@@ -0,0 +1,327 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
+CMAKE_POLICY(VERSION 2.8)
+
+SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+
+OPTION(NDEBUG "disable asserts (WARNING: this may result in invalid memory accesses)")
+IF(NOT NDEBUG)
+  MESSAGE(STATUS "Removing -DNDEBUG from compile flags")
+  STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS "" ${CMAKE_C_FLAGS})
+  STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_DEBUG "" ${CMAKE_C_FLAGS_DEBUG})
+  STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_RELEASE "" ${CMAKE_C_FLAGS_RELEASE})
+  STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS "" ${CMAKE_CXX_FLAGS})
+  STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG})
+  STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE})
+ENDIF()
+
+IF(NOT Torch_FOUND)
+  FIND_PACKAGE(Torch)
+ENDIF()
+
+IF(NOT TH_LIBRARIES)
+  SET(TH_LIBRARIES "TH")
+ENDIF(NOT TH_LIBRARIES)
+MESSAGE(STATUS "TH_LIBRARIES: ${TH_LIBRARIES}")
+
+IF(NOT CUDA_FOUND)
+  FIND_PACKAGE(CUDA 5.5 REQUIRED)
+ENDIF()
+
+IF(NOT MAGMA_FOUND)
+  FIND_PACKAGE(MAGMA)
+ENDIF()
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3")
+    if(CUDA_VERSION VERSION_LESS "8.0")
+      MESSAGE(STATUS "Found gcc >=5 and CUDA <= 7.5, adding workaround C++ flags")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__")
+    endif(CUDA_VERSION VERSION_LESS "8.0")
+  endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3")
+endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+
+IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  IF(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.7" OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "4.7" )
+    SET(CXX_VERSION "c++11")
+  ELSE()
+    SET(CXX_VERSION "c++0x")
+  ENDIF()
+  SET_SOURCE_FILES_PROPERTIES(
+    THCTensorRandom.cpp
+    THCCachingAllocator.cpp
+    THCCachingHostAllocator.cpp
+    PROPERTIES COMPILE_FLAGS -std=${CXX_VERSION})
+ELSE()
+  SET(CMAKE_CXX_STANDARD 11)
+ENDIF()
+
+
+INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
+INCLUDE_DIRECTORIES("${CUDA_SDK_ROOT_DIR}/common/inc")
+
+IF(MAGMA_FOUND)
+  INCLUDE_DIRECTORIES(${MAGMA_INCLUDE_DIR})
+  SET(CMAKE_REQUIRED_INCLUDES "${MAGMA_INCLUDE_DIR};${CUDA_INCLUDE_DIRS}")
+  INCLUDE(CheckPrototypeDefinition)
+  check_prototype_definition(magma_get_sgeqrf_nb
+   "magma_int_t magma_get_sgeqrf_nb( magma_int_t m, magma_int_t n );"
+   "0"
+   "magma.h"
+    MAGMA_V2)
+  IF (MAGMA_V2)
+    add_definitions(-DMAGMA_V2)
+  ENDIF (MAGMA_V2)
+
+  SET(USE_MAGMA 1)
+  MESSAGE(STATUS "Compiling with MAGMA support")
+  MESSAGE(STATUS "MAGMA INCLUDE DIRECTORIES: ${MAGMA_INCLUDE_DIR}")
+  MESSAGE(STATUS "MAGMA LIBRARIES: ${MAGMA_LIBRARIES}")
+  MESSAGE(STATUS "MAGMA V2 check: ${MAGMA_V2}")
+ELSE(MAGMA_FOUND)
+  MESSAGE(STATUS "MAGMA not found. Compiling without MAGMA support")
+ENDIF(MAGMA_FOUND)
+
+IF ($ENV{TH_BINARY_BUILD})
+  MESSAGE(STATUS "TH_BINARY_BUILD detected. Statically linking libstdc++")
+  SET(CMAKE_CXX_FLAGS "-static-libstdc++ ${CMAKE_CXX_FLAGS}")
+ENDIF()
+
+IF(APPLE)
+  IF(${CUDA_VERSION} LESS 6.0)
+    # workaround for a Mac OS X bug:
+    # http://stackoverflow.com/questions/16286588/cuda-5-0-cmake-and-make-failing-on-osx-10-8-3
+    if (NOT DEFINED CUDA_HOST_COMPILER AND CMAKE_C_COMPILER_ID STREQUAL "Clang" AND EXISTS /usr/bin/gcc)
+      set(CUDA_HOST_COMPILER /usr/bin/gcc CACHE FILEPATH "Host side compiler used by NVCC")
+      message(STATUS "Setting CMAKE_HOST_COMPILER to /usr/bin/gcc instead of ${CMAKE_C_COMPILER}.")
+    endif()
+
+    # bug on Apple
+    LINK_DIRECTORIES("/usr/local/cuda/lib/")
+  ELSEIF(${CUDA_VERSION} LESS 7.0)
+    SET(CUDA_HOST_COMPILER clang)
+    LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler -stdlib=libstdc++ -Xlinker -stdlib=libstdc++")
+    IF("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++")
+    ENDIF()
+  ENDIF()
+  # CUDA 7 supports clang and libc++ so no need to change anything
+ENDIF(APPLE)
+
+# Detect CUDA architecture and get best NVCC flags
+IF(NOT COMMAND CUDA_SELECT_NVCC_ARCH_FLAGS OR MSVC)
+  INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cmake/select_compute_arch.cmake)
+ENDIF()
+LIST(APPEND CUDA_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
+CUDA_SELECT_NVCC_ARCH_FLAGS(NVCC_FLAGS_EXTRA $ENV{TORCH_CUDA_ARCH_LIST})
+LIST(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
+
+IF(NOT THC_INSTALL_BIN_SUBDIR
+    OR NOT THC_INSTALL_LIB_SUBDIR
+    OR NOT THC_INSTALL_INCLUDE_SUBDIR
+    OR NOT THC_INSTALL_CMAKE_SUBDIR)
+
+  INCLUDE_DIRECTORIES(${TH_INCLUDE_PATH} ${TH_INCLUDE_PATH}/TH)
+  LINK_DIRECTORIES(${TH_LIB_PATH})
+
+  IF(Torch_INSTALL_BIN_SUBDIR)
+    SET(THC_INSTALL_BIN_SUBDIR ${Torch_INSTALL_BIN_SUBDIR})
+    SET(THC_INSTALL_LIB_SUBDIR ${Torch_INSTALL_LIB_SUBDIR})
+    SET(THC_INSTALL_INCLUDE_SUBDIR ${Torch_INSTALL_INCLUDE_SUBDIR})
+    SET(THC_INSTALL_CMAKE_SUBDIR ${Torch_INSTALL_CMAKE_SUBDIR})
+  ELSE(Torch_INSTALL_BIN_SUBDIR)
+    # not installing in a Torch context, so Torch_INSTALL_BIN_SUBDIR is not available
+    SET(THC_INSTALL_BIN_SUBDIR "bin" CACHE PATH "THC install binary subdirectory")           
+    SET(THC_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THC install library subdirectory")                     
+    SET(THC_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THC install include subdirectory")                     
+    SET(THC_INSTALL_CMAKE_SUBDIR "share/cmake/THC" CACHE PATH "THC install cmake subdirectory")
+  ENDIF(Torch_INSTALL_BIN_SUBDIR)
+
+ENDIF()
+
+INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}")
+INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
+CONFIGURE_FILE(THCGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THCGeneral.h")
+
+IF(MSVC)
+  LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819")
+ELSE()
+  SET(CMAKE_C_FLAGS "-std=c99 ${CMAKE_C_FLAGS}")
+ENDIF()
+
+SET(src
+    THCCachingAllocator.cpp
+    THCCachingHostAllocator.cpp
+    THCGeneral.c
+    THCStorageCopy.c
+    THCStream.c
+    THCTensor.c
+    THCTensorCopy.c
+    THCTensorRandom.cpp
+    THCThreadLocal.c
+    )
+
+SET(src-cuda
+  THCReduceApplyUtils.cu
+  THCBlas.cu
+  THCSleep.cu
+  THCStorage.cu
+  THCStorageCopy.cu
+  THCTensor.cu
+  THCTensorCopy.cu
+  THCTensorMath.cu
+  THCTensorMath2.cu
+  THCTensorMathBlas.cu
+  THCTensorMathMagma.cu
+  THCTensorMathPairwise.cu
+  THCTensorMathReduce.cu
+  THCTensorMathScan.cu
+  THCTensorIndex.cu
+  THCTensorConv.cu
+  THCTensorRandom.cu
+  THCTensorScatterGather.cu
+  THCTensorTopK.cu
+  THCTensorSort.cu
+  THCTensorTypeUtils.cu
+  )
+
+# loop over all types
+foreach(THC_TYPE Byte Char Short Int Long Half Float Double)
+   # loop over files which need to be split between types (because of long compile times)
+   foreach(THC_FILE TensorSort TensorMathCompareT TensorMathPointwise TensorMathCompare TensorMathReduce TensorMasked)
+      if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu")
+         FILE(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu"
+              "#include \"../THC${THC_FILE}.cuh\"\n#include \"../generic/THC${THC_FILE}.cu\"\n#include \"../THCGenerate${THC_TYPE}Type.h\"\n")
+      endif()
+      LIST(APPEND src-cuda "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu")
+   endforeach()
+endforeach()
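+
+# For example, the first iteration of the loops above writes a stub
+# generated/THCTensorSortByte.cu containing only:
+#   #include "../THCTensorSort.cuh"
+#   #include "../generic/THCTensorSort.cu"
+#   #include "../THCGenerateByteType.h"
+# so that each (file, type) pair becomes its own translation unit and the
+# per-file compile time stays manageable.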
+
+MESSAGE(STATUS "got cuda version " ${CUDA_VERSION})
+
+IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
+  MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor")
+  LIST(APPEND src-cuda THCHalf.cu)
+  LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1")
+  SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}")
+ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
+  MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor")
+ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
+
+MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
+
+CUDA_ADD_LIBRARY(THC SHARED ${src} ${src-cuda})
+CUDA_ADD_CUBLAS_TO_TARGET(THC)
+TARGET_LINK_LIBRARIES(THC ${TH_LIBRARIES} ${CUDA_curand_LIBRARY})
+
+IF(USE_MAGMA)
+  TARGET_LINK_LIBRARIES(THC ${MAGMA_LIBRARIES} ${CUDA_cusparse_LIBRARY})
+ENDIF(USE_MAGMA)
+
+IF(NOT THC_SO_VERSION)
+  SET(THC_SO_VERSION 0)
+ENDIF(NOT THC_SO_VERSION)
+MESSAGE(STATUS "THC_SO_VERSION: ${THC_SO_VERSION}")
+SET_TARGET_PROPERTIES(THC PROPERTIES
+  VERSION   ${THC_SO_VERSION}
+  SOVERSION ${THC_SO_VERSION})
+
+
+INSTALL(TARGETS THC
+          RUNTIME DESTINATION "${THC_INSTALL_BIN_SUBDIR}"
+          LIBRARY DESTINATION "${THC_INSTALL_LIB_SUBDIR}"
+          ARCHIVE DESTINATION "${THC_INSTALL_LIB_SUBDIR}")
+
+INSTALL(FILES
+          THC.h
+          ${CMAKE_CURRENT_BINARY_DIR}/THCGeneral.h
+          THCBlas.h
+          THCSleep.h
+          THCStorage.h
+          THCStorageCopy.h
+          THCStream.h
+          THCThreadLocal.h
+          THCTensor.h
+          THCTensorCopy.h
+          THCTensorRandom.h
+          THCTensorMath.h
+          THCTensorConv.h
+          THCTensorTopK.h
+          THCApply.cuh
+          THCReduce.cuh
+          THCReduceAll.cuh
+          THCReduceApplyUtils.cuh
+          THCAsmUtils.cuh
+          THCAtomics.cuh
+          THCScanUtils.cuh
+          THCSortUtils.cuh
+          THCAllocator.h
+          THCCachingAllocator.h
+          THCCachingHostAllocator.h
+          THCDeviceUtils.cuh
+          THCDeviceTensor.cuh
+          THCDeviceTensor-inl.cuh
+          THCDeviceTensorUtils.cuh
+          THCDeviceTensorUtils-inl.cuh
+          THCGenerateAllTypes.h
+          THCGenerateByteType.h
+          THCGenerateCharType.h
+          THCGenerateShortType.h
+          THCGenerateIntType.h
+          THCGenerateLongType.h
+          THCGenerateHalfType.h
+          THCGenerateFloatType.h
+          THCGenerateFloatTypes.h
+          THCGenerateDoubleType.h
+          THCHalf.h
+          THCNumerics.cuh
+          THCTensorSort.cuh
+          THCTensorInfo.cuh
+          THCTensorTypeUtils.cuh
+          THCTensorRandom.cuh
+          THCTensorMathMagma.cuh
+          THCThrustAllocator.cuh
+          DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC")
+
+INSTALL(FILES
+          generic/THCStorage.c
+          generic/THCStorage.cu
+          generic/THCStorage.h
+          generic/THCTensor.c
+          generic/THCTensor.cu
+          generic/THCTensor.h
+          generic/THCStorageCopy.c
+          generic/THCStorageCopy.cu
+          generic/THCStorageCopy.h
+          generic/THCTensorCopy.c
+          generic/THCTensorCopy.cu
+          generic/THCTensorCopy.h
+          generic/THCTensorMasked.h
+          generic/THCTensorMasked.cu
+          generic/THCTensorMath.h
+          generic/THCTensorMath.cu
+          generic/THCTensorMathBlas.cu
+          generic/THCTensorMathBlas.h
+          generic/THCTensorMathCompare.h
+          generic/THCTensorMathCompare.cu
+          generic/THCTensorMathCompareT.h
+          generic/THCTensorMathCompareT.cu
+          generic/THCTensorMathMagma.h
+          generic/THCTensorMathMagma.cu
+          generic/THCTensorMathPairwise.h
+          generic/THCTensorMathPairwise.cu
+          generic/THCTensorMathPointwise.h
+          generic/THCTensorMathPointwise.cu
+          generic/THCTensorMathReduce.h
+          generic/THCTensorMathReduce.cu
+          generic/THCTensorMathScan.h
+          generic/THCTensorMathScan.cu
+          generic/THCTensorScatterGather.h
+          generic/THCTensorScatterGather.cu
+          generic/THCTensorIndex.h
+          generic/THCTensorIndex.cu
+          generic/THCTensorSort.h
+          generic/THCTensorSort.cu
+          generic/THCDeviceTensorUtils.cu
+          generic/THCTensorRandom.h
+          generic/THCTensorRandom.cu
+          DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC/generic")
diff --git a/lib/THC/THC.h b/lib/THC/THC.h
new file mode 100644
index 0000000..e3840dc
--- /dev/null
+++ b/lib/THC/THC.h
@@ -0,0 +1,20 @@
+#ifndef THC_INC
+#define THC_INC
+
+#include "THCGeneral.h"
+#include "THCAllocator.h"
+#include "THCBlas.h"
+#include "THCCachingAllocator.h"
+#include "THCCachingHostAllocator.h"
+#include "THCSleep.h"
+#include "THCStorage.h"
+#include "THCStorageCopy.h"
+#include "THCStream.h"
+#include "THCTensor.h"
+#include "THCTensorCopy.h"
+#include "THCTensorRandom.h"
+#include "THCTensorMath.h"
+#include "THCTensorConv.h"
+#include "THCTensorTopK.h"
+
+#endif
diff --git a/lib/THC/THCAllocator.c b/lib/THC/THCAllocator.c
new file mode 100644
index 0000000..9ff447d
--- /dev/null
+++ b/lib/THC/THCAllocator.c
@@ -0,0 +1,67 @@
+#include "THCAllocator.h"
+
+static void *THCudaHostAllocator_malloc(void* ctx, ptrdiff_t size) {
+  void* ptr;
+
+  if (size < 0) THError("Invalid memory size: %ld", size);
+
+  if (size == 0) return NULL;
+
+  THCudaCheck(cudaMallocHost(&ptr, size));
+
+  return ptr;
+}
+
+static void THCudaHostAllocator_free(void* ctx, void* ptr) {
+  if (!ptr) return;
+
+  THCudaCheck(cudaFreeHost(ptr));
+}
+
+THAllocator THCudaHostAllocator = {
+  &THCudaHostAllocator_malloc,
+  NULL,
+  &THCudaHostAllocator_free
+};
+
+static cudaError_t THCIpcAllocator_malloc(void* ctx, void** devPtr, size_t size, cudaStream_t stream)
+{
+  THError("THCIpcAllocator.malloc() not supported");
+  return cudaSuccess;
+}
+
+static cudaError_t THCIpcAllocator_free(void* ctx, void* devPtr)
+{
+  return cudaIpcCloseMemHandle(devPtr);
+}
+
+THCDeviceAllocator THCIpcAllocator = {
+  &THCIpcAllocator_malloc,
+  NULL,
+  &THCIpcAllocator_free,
+  NULL,
+  NULL
+};
+
+static void *THCUVAAllocator_alloc(void* ctx, ptrdiff_t size) {
+  if (size < 0) THError("Invalid memory size: %ld", size);
+
+  if (size == 0) return NULL;
+
+  // See J.1.1 of the CUDA_C_Programming_Guide.pdf for UVA and coherence rules
+  // on various compute capabilities.
+  void* ptr;
+  THCudaCheck(cudaMallocManaged(&ptr, size, cudaMemAttachGlobal));
+  return ptr;
+}
+
+static void THCUVAAllocator_free(void* ctx, void* ptr) {
+  if (!ptr) return;
+  THCudaCheck(cudaFree(ptr));
+}
+
+THAllocator THCUVAAllocator = {
+  &THCUVAAllocator_alloc,
+  NULL,
+  &THCUVAAllocator_free
+};
diff --git a/lib/THC/THCAllocator.h b/lib/THC/THCAllocator.h
new file mode 100644
index 0000000..d6a0a9b
--- /dev/null
+++ b/lib/THC/THCAllocator.h
@@ -0,0 +1,10 @@
+#ifndef THC_ALLOCATOR_INC
+#define THC_ALLOCATOR_INC
+
+#include "THCGeneral.h"
+
+extern THAllocator THCudaHostAllocator;
+extern THAllocator THCUVAAllocator;
+extern THCDeviceAllocator THCIpcAllocator;
+
+#endif
diff --git a/lib/THC/THCApply.cuh b/lib/THC/THCApply.cuh
new file mode 100644
index 0000000..a47e303
--- /dev/null
+++ b/lib/THC/THCApply.cuh
@@ -0,0 +1,644 @@
+#ifndef THC_APPLY_INC
+#define THC_APPLY_INC
+
+#include "THCTensorCopy.h"
+#include "THCReduceApplyUtils.cuh"
+#include "THCTensorTypeUtils.cuh"
+
+//
+// This file contains pointwise operation functions and kernels that
+// work on both contiguous and non-contiguous tensor arguments of
+// arbitrary (up to MAX_CUTORCH_DIMS) dimensioned arguments without
+// copying or temporary storage.
+//
+
+// Threads per block for our apply kernel
+// FIXME: use occupancy calculator instead
+#define THC_APPLY_THREADS_PER_BLOCK (32 * 16)
+
+template <typename Op,
+          typename Ta,
+          typename IndexType,
+          int ADims>
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(32 * 16, 4)
+#endif
+__global__ void
+kernelPointwiseApply1(TensorInfo<Ta, IndexType> a,
+                      IndexType totalElements,
+                      Op op) {
+  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+       linearIndex < totalElements;
+       linearIndex += gridDim.x * blockDim.x) {
+    // Convert `linearIndex` into an offset of `a`
+    const IndexType aOffset =
+      IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a);
+
+    op(&a.data[aOffset]);
+  }
+}
+
+template <typename Op,
+          typename Ta, typename Tb,
+          typename IndexType,
+          int ADims, int BDims>
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(32 * 16, 4)
+#endif
+__global__ void
+kernelPointwiseApply2(TensorInfo<Ta, IndexType> a,
+                      TensorInfo<Tb, IndexType> b,
+                      IndexType totalElements,
+                      Op op) {
+  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+       linearIndex < totalElements;
+       linearIndex += gridDim.x * blockDim.x) {
+    // Convert `linearIndex` into an offset of `a`
+    const IndexType aOffset =
+      IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a);
+
+    // Convert `linearIndex` into an offset of `b`
+    const IndexType bOffset =
+      IndexToOffset<Tb, IndexType, BDims>::get(linearIndex, b);
+
+    op(&a.data[aOffset], &b.data[bOffset]);
+  }
+}
+
+template <typename Op,
+          typename Ta, typename Tb, typename Tc,
+          typename IndexType,
+          int ADims, int BDims, int CDims>
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(32 * 16, 4)
+#endif
+__global__ void
+kernelPointwiseApply3(TensorInfo<Ta, IndexType> a,
+                      TensorInfo<Tb, IndexType> b,
+                      TensorInfo<Tc, IndexType> c,
+                      IndexType totalElements,
+                      Op op) {
+  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+       linearIndex < totalElements;
+       linearIndex += gridDim.x * blockDim.x) {
+    // Convert `linearIndex` into an offset of `a`
+    const IndexType aOffset =
+      IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a);
+
+    // Convert `linearIndex` into an offset of `b`
+    const IndexType bOffset =
+      IndexToOffset<Tb, IndexType, BDims>::get(linearIndex, b);
+
+    // Convert `linearIndex` into an offset of `c`
+    const IndexType cOffset =
+      IndexToOffset<Tc, IndexType, CDims>::get(linearIndex, c);
+
+    op(&a.data[aOffset], &b.data[bOffset], &c.data[cOffset]);
+  }
+}
+
+inline dim3 getApplyBlock() {
+  return dim3(THC_APPLY_THREADS_PER_BLOCK);
+}
+
+inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) {
+  int curDevice = -1;
+  cudaGetDevice(&curDevice);
+
+  if (curDevice == -1) {
+    return false;
+  }
+
+  // Assume a reasonable number of SMs if no state is available
+  int numSM =
+    state ? THCState_getCurrentDeviceProperties(state)->multiProcessorCount : 15;
+
+  // 16 warps per block * 4 per SM gives 64 warps per SM at maximum,
+  // which seems to be a good sweetspot for latency hiding
+  grid = dim3(min((long long) THCCeilDiv(totalElements,
+                                         (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK),
+                  4LL * numSM));
+  return true;
+}
+
+template <typename TensorTypeA,
+          typename Op>
+bool THC_pointwiseApply1(THCState* state,
+                         TensorTypeA* a,
+                         const Op& op,
+                         TensorArgType aType = ReadWrite) {
+  if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS) {
+    return false;
+  }
+
+  if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) {
+    // Zero-dim tensor; do nothing
+    return true;
+  }
+
+  const dim3 block = getApplyBlock();
+
+  dim3 grid;
+  ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
+
+  if (!getApplyGrid(state, totalElements, grid)) {
+    return false;
+  }
+
+  // If tensor args have overlapping indices and are read/write, then
+  // we must expand the tensor to a contiguous form first, since
+  // otherwise there are conflicting writes. Upon copying back to the
+  // non-contiguous form, there will be conflicting writes, but at
+  // least with copy, one of the updaters will win atomically. This is
+  // a sketchy property of the old system as well (writing into all
+  // indices of a tensor with overlapping indices should probably be
+  // an error, since it is unclear which one should win), but we will
+  // preserve this last-writer-wins (in arbitrary copy order) behavior.
+  TensorTypeA* oldA = NULL;
+
+  if (aType == ReadWrite &&
+      TensorUtils<TensorTypeA>::overlappingIndices(state, a)) {
+    // Must perform in contiguous space
+    oldA = a;
+    a = TensorUtils<TensorTypeA>::newContiguous(state, a);
+  }
+
+  // It is possible that the tensor dimensions can be collapsed,
+  // and thus we can reduce the actual code complexity of the copy by
+  // exploiting this knowledge statically, since the div/mod is the
+  // most expensive part of the operation, more so than memory accesses.
+  // For instance, when copying a non-contiguous to a contiguous tensor
+  // (or vice versa), the contiguous tensor can be collapsed to one
+  // dimension, and the loop to translate the linear index to the array
+  // index can be similarly collapsed. That is what this unrolling is for.
+#define HANDLE_CASE(TYPE, A)                                            \
+  kernelPointwiseApply1<Op,                                             \
+                        typename TensorUtils<TensorTypeA>::DataType,   \
+                        TYPE, A>                                        \
+    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(             \
+      aInfo, (TYPE) totalElements, op);
+
+#define HANDLE_A_CASE(TYPE, A)                  \
+  {                                             \
+    if (aInfo.isContiguous()) {                 \
+      HANDLE_CASE(TYPE, -2);                    \
+    } else {                                    \
+      switch (A) {                              \
+        case 1:                                 \
+        HANDLE_CASE(TYPE, 1);                   \
+        break;                                  \
+        case 2:                                 \
+        HANDLE_CASE(TYPE, 2);                   \
+        break;                                  \
+        default:                                \
+        HANDLE_CASE(TYPE, -1);                  \
+        break;                                  \
+      }                                         \
+    }                                           \
+  }
+
+  // Can we use 32-bit integer math in the kernel (i.e., are the linear ID for
+  // the copy and the resulting non-linear offset computable using 32-bit math)?
+  // We also use unsigned index math in the kernel, as signed div/mod has
+  // additional overhead.
+  if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a)) {
+    TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo =
+      getTensorInfo<TensorTypeA, unsigned int>(state, a);
+    aInfo.collapseDims();
+
+    HANDLE_A_CASE(unsigned int, aInfo.dims);
+  } else {
+    TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned long> aInfo =
+      getTensorInfo<TensorTypeA, unsigned long>(state, a);
+    aInfo.collapseDims();
+
+    // For large tensors, we only compile the completely contiguous
+    // version and the completely generic version, to reduce
+    // compilation time.
+    if (aInfo.isContiguous()) {
+      kernelPointwiseApply1<Op,
+                            typename TensorUtils<TensorTypeA>::DataType,
+                            unsigned long, -2>
+        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+          aInfo, (unsigned long) totalElements, op);
+    } else {
+      kernelPointwiseApply1<Op,
+                            typename TensorUtils<TensorTypeA>::DataType,
+                            unsigned long, -1>
+        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+          aInfo, (unsigned long) totalElements, op);
+    }
+  }
+#undef HANDLE_CASE
+#undef HANDLE_A_CASE
+
+  if (oldA) {
+    // Ignore overlaps when copying back; if we use THCTensor_copy
+    // instead, it will recursively try and invoke ourselves to make
+    // oldA contiguous.
+    TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a);
+    TensorUtils<TensorTypeA>::free(state, a);
+    a = oldA;
+  }
+
+  return true;
+}
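+
+// A minimal usage sketch (the functor name and constant are illustrative only):
+// callers pass a functor whose operator() mutates elements in place, e.g.
+//
+//   struct AddConstantOp {
+//     AddConstantOp(float v) : val(v) {}
+//     __device__ __forceinline__ void operator()(float* x) { *x += val; }
+//     float val;
+//   };
+//
+//   // add 1.0f to every element of a THCudaTensor `t`
+//   THC_pointwiseApply1(state, t, AddConstantOp(1.0f));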
+
+template <typename TensorTypeA,
+          typename TensorTypeB,
+          typename Op>
+bool THC_pointwiseApply2(THCState* state,
+                         TensorTypeA* a,
+                         TensorTypeB* b,
+                         const Op& op,
+                         TensorArgType aType = ReadWrite,
+                         TensorArgType bType = ReadOnly) {
+  ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
+
+  if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b)) {
+    return false;
+  }
+
+  if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS ||
+      TensorUtils<TensorTypeB>::getDims(state, b) > MAX_CUTORCH_DIMS) {
+    return false;
+  }
+
+  if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) {
+    // Zero-dim tensor; do nothing
+    return true;
+  }
+
+  const dim3 block = getApplyBlock();
+
+  dim3 grid;
+  if (!getApplyGrid(state, totalElements, grid)) {
+    return false;
+  }
+
+  // If tensor args have overlapping indices and are read/write, then
+  // we must expand the tensor to a contiguous form first, since
+  // otherwise there are conflicting writes. Upon copying back to the
+  // non-contiguous form, there will be conflicting writes, but at
+  // least with copy, one of the updaters will win atomically. This is
+  // a sketchy property of the old system as well (writing into all
+  // indices of a tensor with overlapping indices should probably be
+  // an error, since it is unclear which one should win), but we will
+  // preserve this last-writer-wins (in arbitrary copy order) behavior.
+  TensorTypeA* oldA = NULL;
+  TensorTypeB* oldB = NULL;
+
+  if (aType == ReadWrite &&
+      TensorUtils<TensorTypeA>::overlappingIndices(state, a)) {
+    // Must perform in contiguous space
+    oldA = a;
+    a = TensorUtils<TensorTypeA>::newContiguous(state, a);
+  }
+  if (bType == ReadWrite &&
+      TensorUtils<TensorTypeB>::overlappingIndices(state, b)) {
+    // Must perform in contiguous space
+    oldB = b;
+    b = TensorUtils<TensorTypeB>::newContiguous(state, b);
+  }
+
+  // It is possible that the tensor dimensions can be collapsed,
+  // and thus we can reduce the actual code complexity of the copy by
+  // exploiting this knowledge statically, since the div/mod is the
+  // most expensive part of the operation, more so than memory accesses.
+  // For instance, when copying a non-contiguous to a contiguous tensor
+  // (or vice versa), the contiguous tensor can be collapsed to one
+  // dimension, and the loop to translate the linear index to the array
+  // index can be similarly collapsed. That is what this unrolling is for.
+#define HANDLE_CASE(TYPE, A, B)                                         \
+  kernelPointwiseApply2<Op,                                             \
+                        typename TensorUtils<TensorTypeA>::DataType,    \
+                        typename TensorUtils<TensorTypeB>::DataType,    \
+                        TYPE, A, B>                                     \
+    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(             \
+      aInfo, bInfo, (TYPE) totalElements, op);
+
+#define HANDLE_B_CASE(TYPE, A, B)               \
+  {                                             \
+    if (bInfo.isContiguous()) {                 \
+      HANDLE_CASE(TYPE, A, -2);                 \
+    } else {                                    \
+      switch (B) {                              \
+        case 1:                                 \
+        HANDLE_CASE(TYPE, A, 1);                \
+        break;                                  \
+        case 2:                                 \
+        HANDLE_CASE(TYPE, A, 2);                \
+        break;                                  \
+        default:                                \
+        HANDLE_CASE(TYPE, A, -1);               \
+        break;                                  \
+      }                                         \
+    }                                           \
+  }
+
+#define HANDLE_A_CASE(TYPE, A, B)               \
+  {                                             \
+    if (aInfo.isContiguous()) {                 \
+      HANDLE_B_CASE(TYPE, -2, B);               \
+    } else {                                    \
+      switch (A) {                              \
+        case 1:                                 \
+        HANDLE_B_CASE(TYPE, 1, B);              \
+        break;                                  \
+        case 2:                                 \
+        HANDLE_B_CASE(TYPE, 2, B);              \
+        break;                                  \
+        default:                                \
+        HANDLE_B_CASE(TYPE, -1, B);             \
+        break;                                  \
+      }                                         \
+    }                                           \
+  }
+
+  if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a) &&
+      TensorUtils<TensorTypeB>::canUse32BitIndexMath(state, b)) {
+    TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo =
+      getTensorInfo<TensorTypeA, unsigned int>(state, a);
+    aInfo.collapseDims();
+
+    TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned int> bInfo =
+      getTensorInfo<TensorTypeB, unsigned int>(state, b);
+    bInfo.collapseDims();
+
+    HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims);
+  } else {
+    TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned long> aInfo =
+      getTensorInfo<TensorTypeA, unsigned long>(state, a);
+    aInfo.collapseDims();
+
+    TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned long> bInfo =
+      getTensorInfo<TensorTypeB, unsigned long>(state, b);
+    bInfo.collapseDims();
+
+    // For large tensors, we only compile the completely contiguous
+    // version and the completely generic version, to reduce
+    // compilation time.
+    if (aInfo.isContiguous() && bInfo.isContiguous()) {
+      kernelPointwiseApply2<Op,
+                            typename TensorUtils<TensorTypeA>::DataType,
+                            typename TensorUtils<TensorTypeB>::DataType,
+                            unsigned long, -2, -2>
+        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+          aInfo, bInfo, (unsigned long) totalElements, op);
+    } else {
+      kernelPointwiseApply2<Op,
+                            typename TensorUtils<TensorTypeA>::DataType,
+                            typename TensorUtils<TensorTypeB>::DataType,
+                            unsigned long, -1, -1>
+        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+          aInfo, bInfo, (unsigned long) totalElements, op);
+    }
+  }
+#undef HANDLE_CASE
+#undef HANDLE_B_CASE
+#undef HANDLE_A_CASE
+
+  if (oldA) {
+    // Ignore overlaps when copying back; if we use THCTensor_copy
+    // instead, it will recursively try and invoke ourselves to make
+    // oldA contiguous.
+    TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a);
+    TensorUtils<TensorTypeA>::free(state, a);
+    a = oldA;
+  }
+
+  if (oldB) {
+    // Ignore overlaps when copying back; if we use THCTensor_copy
+    // instead, it will recursively try and invoke ourselves to make
+    // oldB contiguous.
+    TensorUtils<TensorTypeB>::copyIgnoringOverlaps(state, oldB, b);
+    TensorUtils<TensorTypeB>::free(state, b);
+    b = oldB;
+  }
+
+  return true;
+}
+
+template <typename TensorTypeA,
+          typename TensorTypeB,
+          typename TensorTypeC,
+          typename Op>
+bool THC_pointwiseApply3(THCState* state,
+                         TensorTypeA* a,
+                         TensorTypeB* b,
+                         TensorTypeC* c,
+                         const Op& op,
+                         TensorArgType aType = ReadWrite,
+                         TensorArgType bType = ReadOnly,
+                         TensorArgType cType = ReadOnly) {
+  ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
+
+  if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b) ||
+      totalElements != TensorUtils<TensorTypeC>::getNumElements(state, c)) {
+    return false;
+  }
+
+  if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS ||
+      TensorUtils<TensorTypeB>::getDims(state, b) > MAX_CUTORCH_DIMS ||
+      TensorUtils<TensorTypeC>::getDims(state, c) > MAX_CUTORCH_DIMS) {
+    return false;
+  }
+
+  if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) {
+    // Zero-dim tensor; do nothing
+    return true;
+  }
+
+  const dim3 block = getApplyBlock();
+
+  dim3 grid;
+  if (!getApplyGrid(state, totalElements, grid)) {
+    return false;
+  }
+
+  // If tensor args have overlapping indices and are read/write, then
+  // we must expand the tensor to a contiguous form first, since
+  // otherwise there are conflicting writes. Upon copying back to the
+  // non-contiguous form, there will be conflicting writes, but at
+  // least with copy, one of the updaters will win atomically. This is
+  // a sketchy property of the old system as well (writing into all
+  // indices of a tensor with overlapping indices should probably be
+  // an error, since it is unclear which one should win), but we will
+  // preserve this last-writer-wins (in arbitrary copy order) behavior.
+  TensorTypeA* oldA = NULL;
+  TensorTypeB* oldB = NULL;
+  TensorTypeC* oldC = NULL;
+
+  if (aType == ReadWrite &&
+      TensorUtils<TensorTypeA>::overlappingIndices(state, a)) {
+    // Must perform in contiguous space
+    oldA = a;
+    a = TensorUtils<TensorTypeA>::newContiguous(state, a);
+  }
+  if (bType == ReadWrite &&
+      TensorUtils<TensorTypeB>::overlappingIndices(state, b)) {
+    // Must perform in contiguous space
+    oldB = b;
+    b = TensorUtils<TensorTypeB>::newContiguous(state, b);
+  }
+  if (cType == ReadWrite &&
+      TensorUtils<TensorTypeC>::overlappingIndices(state, c)) {
+    // Must perform in contiguous space
+    oldC = c;
+    c = TensorUtils<TensorTypeC>::newContiguous(state, c);
+  }
+
+#define HANDLE_CASE(TYPE, A, B, C)                                      \
+  kernelPointwiseApply3<Op,                                             \
+                        typename TensorUtils<TensorTypeA>::DataType,    \
+                        typename TensorUtils<TensorTypeB>::DataType,    \
+                        typename TensorUtils<TensorTypeC>::DataType,    \
+                        TYPE, A, B, C>                                  \
+    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(             \
+      aInfo, bInfo, cInfo, (TYPE) totalElements, op);
+
+#define HANDLE_C_CASE(TYPE, A, B, C)            \
+  {                                             \
+    if (cInfo.isContiguous()) {                 \
+      HANDLE_CASE(TYPE, A, B, -2);              \
+    } else {                                    \
+      switch (C) {                              \
+        case 1:                                 \
+        HANDLE_CASE(TYPE, A, B, 1);             \
+        break;                                  \
+        case 2:                                 \
+        HANDLE_CASE(TYPE, A, B, 2);             \
+        break;                                  \
+        default:                                \
+        HANDLE_CASE(TYPE, A, B, -1);            \
+        break;                                  \
+      }                                         \
+    }                                           \
+  }
+
+#define HANDLE_B_CASE(TYPE, A, B, C)            \
+  {                                             \
+    if (bInfo.isContiguous()) {                 \
+      HANDLE_C_CASE(TYPE, A, -2, C);            \
+    } else {                                    \
+      switch (B) {                              \
+        case 1:                                 \
+        HANDLE_C_CASE(TYPE, A, 1, C);           \
+        break;                                  \
+        case 2:                                 \
+        HANDLE_C_CASE(TYPE, A, 2, C);           \
+        break;                                  \
+        default:                                \
+        HANDLE_C_CASE(TYPE, A, -1, C);          \
+        break;                                  \
+      }                                         \
+    }                                           \
+  }
+
+#define HANDLE_A_CASE(TYPE, A, B, C)            \
+  {                                             \
+    if (aInfo.isContiguous()) {                 \
+      HANDLE_B_CASE(TYPE, -2, B, C);            \
+    } else {                                    \
+      switch (A) {                              \
+        case 1:                                 \
+        HANDLE_B_CASE(TYPE, 1, B, C);           \
+        break;                                  \
+        case 2:                                 \
+        HANDLE_B_CASE(TYPE, 2, B, C);           \
+        break;                                  \
+        default:                                \
+        HANDLE_B_CASE(TYPE, -1, B, C);          \
+        break;                                  \
+      }                                         \
+    }                                           \
+  }
+
+  if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a) &&
+      TensorUtils<TensorTypeB>::canUse32BitIndexMath(state, b) &&
+      TensorUtils<TensorTypeC>::canUse32BitIndexMath(state, c)) {
+    TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo =
+      getTensorInfo<TensorTypeA, unsigned int>(state, a);
+    aInfo.collapseDims();
+
+    TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned int> bInfo =
+      getTensorInfo<TensorTypeB, unsigned int>(state, b);
+    bInfo.collapseDims();
+
+    TensorInfo<typename TensorUtils<TensorTypeC>::DataType, unsigned int> cInfo =
+      getTensorInfo<TensorTypeC, unsigned int>(state, c);
+    cInfo.collapseDims();
+
+    HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims);
+  } else {
+    TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned long> aInfo =
+      getTensorInfo<TensorTypeA, unsigned long>(state, a);
+    aInfo.collapseDims();
+
+    TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned long> bInfo =
+      getTensorInfo<TensorTypeB, unsigned long>(state, b);
+    bInfo.collapseDims();
+
+    TensorInfo<typename TensorUtils<TensorTypeC>::DataType, unsigned long> cInfo =
+      getTensorInfo<TensorTypeC, unsigned long>(state, c);
+    cInfo.collapseDims();
+
+    // For large tensors, we only compile the completely contiguous
+    // version and the completely generic version, to reduce
+    // compilation time.
+    if (aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous()) {
+      kernelPointwiseApply3<Op,
+                            typename TensorUtils<TensorTypeA>::DataType,
+                            typename TensorUtils<TensorTypeB>::DataType,
+                            typename TensorUtils<TensorTypeC>::DataType,
+                            unsigned long, -2, -2, -2>
+        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+          aInfo, bInfo, cInfo, (unsigned long) totalElements, op);
+    } else {
+      kernelPointwiseApply3<Op,
+                            typename TensorUtils<TensorTypeA>::DataType,
+                            typename TensorUtils<TensorTypeB>::DataType,
+                            typename TensorUtils<TensorTypeC>::DataType,
+                            unsigned long, -1, -1, -1>
+        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+          aInfo, bInfo, cInfo, (unsigned long) totalElements, op);
+    }
+  }
+#undef HANDLE_CASE
+#undef HANDLE_C_CASE
+#undef HANDLE_B_CASE
+#undef HANDLE_A_CASE
+
+  if (oldA) {
+    // Ignore overlaps when copying back; if we use THCTensor_copy
+    // instead, it will recursively try and invoke ourselves to make
+    // oldA contiguous.
+    TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a);
+    TensorUtils<TensorTypeA>::free(state, a);
+    a = oldA;
+  }
+
+  if (oldB) {
+    // Ignore overlaps when copying back; if we use THCTensor_copy
+    // instead, it will recursively try and invoke ourselves to make
+    // oldB contiguous.
+    TensorUtils<TensorTypeB>::copyIgnoringOverlaps(state, oldB, b);
+    TensorUtils<TensorTypeB>::free(state, b);
+    b = oldB;
+  }
+
+  if (oldC) {
+    // Ignore overlaps when copying back; if we use THCTensor_copy
+    // instead, it will recursively try and invoke ourselves to make
+    // oldC contiguous.
+    TensorUtils<TensorTypeC>::copyIgnoringOverlaps(state, oldC, c);
+    TensorUtils<TensorTypeC>::free(state, c);
+    c = oldC;
+  }
+
+  return true;
+}
+
+#undef THC_APPLY_THREADS_PER_BLOCK
+
+#endif // THC_APPLY_INC
diff --git a/lib/THC/THCAsmUtils.cuh b/lib/THC/THCAsmUtils.cuh
new file mode 100644
index 0000000..7015d20
--- /dev/null
+++ b/lib/THC/THCAsmUtils.cuh
@@ -0,0 +1,52 @@
+#ifndef THC_ASM_UTILS_INC
+#define THC_ASM_UTILS_INC
+
+// Collection of direct PTX functions
+
+__device__ __forceinline__
+unsigned int getBitfield(unsigned int val, int pos, int len) {
+  unsigned int ret;
+  asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
+  return ret;
+}
+
+__device__ __forceinline__
+unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
+  unsigned int ret;
+  asm("bfi.b32 %0, %1, %2, %3, %4;" :
+      "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
+  return ret;
+}
+
+__device__ __forceinline__ int getLaneId() {
+  int laneId;
+  asm("mov.s32 %0, %laneid;" : "=r"(laneId) );
+  return laneId;
+}
+
+__device__ __forceinline__ unsigned getLaneMaskLt() {
+  unsigned mask;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
+  return mask;
+}
+
+__device__ __forceinline__ unsigned getLaneMaskLe() {
+  unsigned mask;
+  asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
+  return mask;
+}
+
+__device__ __forceinline__ unsigned getLaneMaskGt() {
+  unsigned mask;
+  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
+  return mask;
+}
+
+__device__ __forceinline__ unsigned getLaneMaskGe() {
+  unsigned mask;
+  asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
+  return mask;
+}
+
+
+#endif // THC_ASM_UTILS_INC
diff --git a/lib/THC/THCAtomics.cuh b/lib/THC/THCAtomics.cuh
new file mode 100644
index 0000000..ac0b45f
--- /dev/null
+++ b/lib/THC/THCAtomics.cuh
@@ -0,0 +1,134 @@
+#ifndef THC_ATOMICS_INC
+#define THC_ATOMICS_INC
+
+#include "THCHalf.h"
+
+template <typename T, size_t n>
+struct AtomicAddIntegerImpl;
+
+template<typename T>
+struct AtomicAddIntegerImpl<T, 1> {
+  inline __device__ void operator()(T *address, T val) {
+    unsigned int * address_as_ui =
+        (unsigned int *) (address - ((size_t)address & 3));
+    unsigned int old = *address_as_ui;
+    unsigned int shift = (((size_t)address & 3) * 8);
+    unsigned int sum;
+    unsigned int assumed;
+
+    do {
+      assumed = old;
+      sum = val + T((old >> shift) & 0xff);
+      old = (old & ~(0x000000ff << shift)) | (sum << shift);
+      old = atomicCAS(address_as_ui, assumed, old);
+    } while (assumed != old);
+  }
+};
+
+template<typename T>
+struct AtomicAddIntegerImpl<T, 2> {
+  inline __device__ void operator()(T *address, T val) {
+    unsigned int * address_as_ui =
+        (unsigned int *) ((char *)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int sum;
+    unsigned int newval;
+    unsigned int assumed;
+
+    do {
+      assumed = old;
+      sum = val + ((size_t)address & 2 ? T(old >> 16) : T(old & 0xffff));
+      newval = (size_t)address & 2 ? (old & 0xffff) | (sum << 16) : (old & 0xffff0000) | sum;
+      old = atomicCAS(address_as_ui, assumed, newval);
+    } while (assumed != old);
+  }
+};
+
+template<typename T>
+struct AtomicAddIntegerImpl<T, 4> {
+  inline __device__ void operator()(T *address, T val) {
+    unsigned int * address_as_ui = (unsigned int *) (address);
+    unsigned int old = *address_as_ui;
+    unsigned int newval;
+    unsigned int assumed;
+
+    do {
+      assumed = old;
+      newval = val +  (T)old;
+      old = atomicCAS(address_as_ui, assumed, newval);
+    } while (assumed != old);
+  }
+};
+
+template<typename T>
+struct AtomicAddIntegerImpl<T, 8> {
+  inline __device__ void operator()(T *address, T val) {
+    unsigned long long * address_as_ui = (unsigned long long *) (address);
+    unsigned long long old = *address_as_ui;
+    unsigned long long newval;
+    unsigned long long assumed;
+
+    do {
+      assumed = old;
+      newval = val +  (T)old;
+      old = atomicCAS(address_as_ui, assumed, newval);
+    } while (assumed != old);
+  }
+};
+
+static inline __device__ void atomicAdd(unsigned char *address, unsigned char val) {
+  AtomicAddIntegerImpl<unsigned char, sizeof(unsigned char)>()(address, val);
+}
+
+static inline  __device__ void atomicAdd(char *address, char val) {
+  AtomicAddIntegerImpl<char, sizeof(char)>()(address, val);
+}
+
+static inline  __device__ void atomicAdd(short *address, short val) {
+  AtomicAddIntegerImpl<short, sizeof(short)>()(address, val);
+}
+
+static inline __device__ void atomicAdd(long *address, long val) {
+  AtomicAddIntegerImpl<long, sizeof(long)>()(address, val);
+}
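+
+// A minimal usage sketch (kernel name and layout are illustrative only): the
+// overloads above let kernels accumulate into integer types that the built-in
+// CUDA atomicAdd does not cover, e.g. a 256-bin histogram of byte values.
+//
+//   __global__ void byteHistogram(long* bins, const unsigned char* data, int n) {
+//     int i = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (i < n) atomicAdd(&bins[data[i]], 1L);
+//   }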
+
+#ifdef CUDA_HALF_TENSOR
+static inline  __device__ void atomicAdd(half *address, half val) {
+  unsigned int * address_as_ui =
+      (unsigned int *) ((char *)address - ((size_t)address & 2));
+  unsigned int old = *address_as_ui;
+  unsigned int assumed;
+
+  do {
+    assumed = old;
+    half hsum;
+    hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+    hsum = THCNumerics<half>::add(hsum, val);
+    old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
+    old = atomicCAS(address_as_ui, assumed, old);
+   } while (assumed != old);
+}
+#endif
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
+// from the CUDA C Programming Guide
+static inline  __device__  void atomicAdd(double *address, double val) {
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull;
+  unsigned long long int assumed;
+
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val +
+                    __longlong_as_double(assumed)));
+
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+  } while (assumed != old);
+}
+#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000)
+// This needs to be defined for the host side pass
+static inline  __device__  void atomicAdd(double *address, double val) { }
+#endif
+
+#endif // THC_ATOMICS_INC
diff --git a/lib/THC/THCBlas.cu b/lib/THC/THCBlas.cu
new file mode 100644
index 0000000..c438ad8
--- /dev/null
+++ b/lib/THC/THCBlas.cu
@@ -0,0 +1,414 @@
+#include "THCBlas.h"
+#include "THCGeneral.h"
+#include "THCHalf.h"
+
+float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy)
+{
+  if (n == 1) {
+    incx = 1;
+    incy = 1;
+  }
+
+  if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) {
+    int i_n = (int)n;
+    int i_incx = (int)incx;
+    int i_incy = (int)incy;
+    float result;
+    cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+    cublasSetStream(handle, THCState_getCurrentStream(state));
+    THCublasCheck(cublasSdot(handle, i_n, x, i_incx, y, i_incy, &result));
+    return result;
+  }
+
+  THError("Cublas_Sdot only supports n, incx and incy "
+          "up to signed integer limits: %d", INT_MAX);
+  return 0;
+}
+
+double THCudaBlas_Ddot(THCState *state, long n, double *x, long incx, double *y, long incy)
+{
+  if (n == 1) {
+    incx = 1;
+    incy = 1;
+  }
+
+  if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) {
+    int i_n = (int)n;
+    int i_incx = (int)incx;
+    int i_incy = (int)incy;
+    double result;
+    cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+    cublasSetStream(handle, THCState_getCurrentStream(state));
+    THCublasCheck(cublasDdot(handle, i_n, x, i_incx, y, i_incy, &result));
+    return result;
+  }
+
+  THError("Cublas_Ddot only supports n, incx and incy "
+          "up to signed integer limits: %d", INT_MAX);
+  return 0;
+}
+
+#ifdef CUDA_HALF_TENSOR
+float THCudaBlas_Hdot(THCState *state, long n, half *x, long incx, half *y, long incy)
+{
+#if CUDA_VERSION >= 8000
+  if (n == 1) {
+    incx = 1;
+    incy = 1;
+  }
+
+  if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) {
+    int i_n = (int)n;
+    int i_incx = (int)incx;
+    int i_incy = (int)incy;
+    float result;
+    cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+    cublasSetStream(handle, THCState_getCurrentStream(state));
+    THCublasCheck(cublasDotEx(handle, i_n, x, CUDA_R_16F, i_incx, y, CUDA_R_16F, i_incy, &result, CUDA_R_32F, CUDA_R_32F));
+    return result;
+  }
+
+  THError("Cublas_Hdot only supports n, incx and incy "
+          "up to signed integer limits: %d", INT_MAX);
+  return 0;
+#else
+  THError("Cublas_Hdot requires CUDA 8.0+");
+  return 0;
+#endif
+}
+#endif
+
+/* Level 2 */
+void THCudaBlas_Sgemv(THCState *state, char trans, long m, long n, float alpha, float *a, long lda, float *x, long incx, float beta, float *y, long incy)
+{
+  if(n == 1)
+    lda = m;
+
+  cublasOperation_t op;
+  if (trans == 't') op = CUBLAS_OP_T;
+  else if (trans == 'n') op = CUBLAS_OP_N;
+  else if (trans == 'c') op = CUBLAS_OP_C;
+
+  if( (m <= INT_MAX) && (n <= INT_MAX) &&
+      (lda > 0) && (lda <= INT_MAX) &&
+      (incx > 0) && (incx <= INT_MAX) &&
+      (incy > 0) && (incy <= INT_MAX) )
+  {
+    int i_m = (int)m;
+    int i_n = (int)n;
+    int i_lda = (int)lda;
+    int i_incx = (int)incx;
+    int i_incy = (int)incy;
+
+    cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+    cublasSetStream(handle, THCState_getCurrentStream(state));
+    THCublasCheck(cublasSgemv(handle, op, i_m, i_n, &alpha, a, i_lda, x, i_incx, &beta, y, i_incy));
+    return;
+  }
+  THError("Cublas_Sgemv only supports m, n, lda, incx, incy"
+          "in the range 0 < [val] <= %d", INT_MAX);
+}
+
+void THCudaBlas_Dgemv(THCState *state, char trans, long m, long n, double alpha, double *a, long lda, double *x, long incx, double beta, double *y, long incy)
+{
+  if(n == 1)
+    lda = m;
+
+  cublasOperation_t op;
+  if (trans == 't') op = CUBLAS_OP_T;
+  else if (trans == 'n') op = CUBLAS_OP_N;
+  else if (trans == 'c') op = CUBLAS_OP_C;
+
+  if( (m <= INT_MAX) && (n <= INT_MAX) &&
+      (lda > 0) && (lda <= INT_MAX) &&
+      (incx > 0) && (incx <= INT_MAX) &&
+      (incy > 0) && (incy <= INT_MAX) )
+  {
+    int i_m = (int)m;
+    int i_n = (int)n;
+    int i_lda = (int)lda;
+    int i_incx = (int)incx;
+    int i_incy = (int)incy;
+
+    cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+    cublasSetStream(handle, THCState_getCurrentStream(state));
+    THCublasCheck(cublasDgemv(handle, op, i_m, i_n, &alpha, a, i_lda, x, i_incx, &beta, y, i_incy));
+    return;
+  }
+  THError("Cublas_Dgemv only supports m, n, lda, incx, incy"
+          "in the range 0 < [val] <= %d", INT_MAX);
+}
+
+void THCudaBlas_Sger(THCState *state, long m, long n, float alpha, float *x, long incx, float *y, long incy, float *a, long lda)
+{
+  if(n == 1)
+    lda = m;
+
+  if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX)  && (incx <= INT_MAX) && (incy <= INT_MAX) )
+    {
+      int i_m = (int)m;
+      int i_n = (int)n;
+      int i_lda = (int)lda;
+      int i_incx = (int)incx;
+      int i_incy = (int)incy;
+
+      cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+      cublasSetStream(handle, THCState_getCurrentStream(state));
+      THCublasCheck(cublasSger(handle, i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda));
+      return;
+    }
+  THError("Cublas_Sger only supports m, n, lda, incx, incy"
+          "with the bound [val] <= %d", INT_MAX);
+}
+
+void THCudaBlas_Dger(THCState *state, long m, long n, double alpha, double *x, long incx, double *y, long incy, double *a, long lda)
+{
+  if(n == 1)
+    lda = m;
+
+  if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX)  && (incx <= INT_MAX) && (incy <= INT_MAX) )
+    {
+      int i_m = (int)m;
+      int i_n = (int)n;
+      int i_lda = (int)lda;
+      int i_incx = (int)incx;
+      int i_incy = (int)incy;
+
+      cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+      cublasSetStream(handle, THCState_getCurrentStream(state));
+      THCublasCheck(cublasDger(handle, i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda));
+      return;
+    }
+  THError("Cublas_Dger only supports m, n, lda, incx, incy"
+          "with the bound [val] <= %d", INT_MAX);
+}
+
+
+cublasOperation_t convertTransToCublasOperation(char trans) {
+  if (trans == 't') return CUBLAS_OP_T;
+  else if (trans == 'n') return CUBLAS_OP_N;
+  else if (trans == 'c') return CUBLAS_OP_C;
+  else {
+    THError("trans must be one of: t, n, c");
+    return CUBLAS_OP_T;
+  }
+}
+
+void adjustLd(char transa, char transb, long m, long n, long k, long *lda, long *ldb, long *ldc)
+{
+  int transa_ = ((transa == 't') || (transa == 'T'));
+  int transb_ = ((transb == 't') || (transb == 'T'));
+
+  if(n == 1)
+    *ldc = m;
+
+  if(transa_)
+  {
+    if(m == 1)
+      *lda = k;
+  }
+  else
+  {
+    if(k == 1)
+      *lda = m;
+  }
+
+  if(transb_)
+  {
+    if(k == 1)
+      *ldb = n;
+  }
+  else
+  {
+    if(n == 1)
+      *ldb = k;
+  }
+}
+
+/* Level 3 */
+void THCudaBlas_Sgemm(THCState *state, char transa, char transb, long m, long n, long k, float alpha, float *a, long lda, float *b, long ldb, float beta, float *c, long ldc)
+{
+  adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc);
+  cublasOperation_t opa = convertTransToCublasOperation(transa);
+  cublasOperation_t opb = convertTransToCublasOperation(transb);
+
+  if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX)  && (ldb <= INT_MAX) && (ldc <= INT_MAX) )
+  {
+    int i_m = (int)m;
+    int i_n = (int)n;
+    int i_k = (int)k;
+    int i_lda = (int)lda;
+    int i_ldb = (int)ldb;
+    int i_ldc = (int)ldc;
+
+    cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+    cublasSetStream(handle, THCState_getCurrentStream(state));
+    THCublasCheck(cublasSgemm(handle, opa, opb, i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb, &beta, c, i_ldc));
+    return;
+  }
+  THError("Cublas_Sgemm only supports m, n, k, lda, ldb, ldc"
+          "with the bound [val] <= %d", INT_MAX);
+}
+
+#ifdef CUDA_HALF_TENSOR
+// In CUDA 8.0, the definition of the data types for sgemmEx changed
+#if CUDA_VERSION < 8000
+#  define CUDA_R_16F CUBLAS_DATA_HALF
+#endif
+
+void THCudaBlas_Hgemm(THCState *state, char transa, char transb, long m, long n, long k, half alpha, half *a, long lda, half *b, long ldb, half beta, half *c, long ldc)
+{
+  adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc);
+  cublasOperation_t opa = convertTransToCublasOperation(transa);
+  cublasOperation_t opb = convertTransToCublasOperation(transb);
+
+  if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX)  && (ldb <= INT_MAX) && (ldc <= INT_MAX) )
+  {
+    int i_m = (int)m;
+    int i_n = (int)n;
+    int i_k = (int)k;
+    int i_lda = (int)lda;
+    int i_ldb = (int)ldb;
+    int i_ldc = (int)ldc;
+
+    cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+    cublasSetStream(handle, THCState_getCurrentStream(state));
+
+    // Check for native Hgemm support
+    if (THC_fastHalfInstructions(state)) {
+      THCublasCheck(cublasHgemm(handle, opa, opb,
+				i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb,
+				&beta, c, i_ldc));
+    } else {
+      // Simulated Hgemm
+      float fAlpha = THC_half2float(alpha);
+      float fBeta = THC_half2float(beta);
+
+      THCublasCheck(cublasSgemmEx(handle, opa, opb,
+				  i_m, i_n, i_k, &fAlpha,
+                                  a, CUDA_R_16F, i_lda, b, CUDA_R_16F,
+				  i_ldb, &fBeta, c, CUDA_R_16F, i_ldc));
+    }
+
+    return;
+  }
+  THError("Cublas_Hgemm only supports m, n, k, lda, ldb, ldc"
+          "with th bound [val] <= %d", INT_MAX);
+}
+#endif
+
+void THCudaBlas_Dgemm(THCState *state, char transa, char transb, long m, long n, long k, double alpha, double *a, long lda, double *b, long ldb, double beta, double *c, long ldc)
+{
+  adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc);
+  cublasOperation_t opa = convertTransToCublasOperation(transa);
+  cublasOperation_t opb = convertTransToCublasOperation(transb);
+
+  if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX)  && (ldb <= INT_MAX) && (ldc <= INT_MAX) )
+  {
+    int i_m = (int)m;
+    int i_n = (int)n;
+    int i_k = (int)k;
+    int i_lda = (int)lda;
+    int i_ldb = (int)ldb;
+    int i_ldc = (int)ldc;
+
+    cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+    cublasSetStream(handle, THCState_getCurrentStream(state));
+    THCublasCheck(cublasDgemm(handle, opa, opb, i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb, &beta, c, i_ldc));
+    return;
+  }
+  THError("Cublas_Dgemm only supports m, n, k, lda, ldb, ldc"
+          "with the bound [val] <= %d", INT_MAX);
+}
+
+
+void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, long m, long n, long k,
+                             float alpha, const float *a[], long lda, const float *b[], long ldb,
+                             float beta, float *c[], long ldc, long batchCount)
+{
+  if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX)  || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) )
+  {
+    THError("Cublas_SgemmBatched only supports m, n, k, lda, ldb, ldc, batchCount"
+            "with the bound [val] <= %d", INT_MAX);
+  }
+
+  adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc);
+  cublasOperation_t opa = convertTransToCublasOperation(transa);
+  cublasOperation_t opb = convertTransToCublasOperation(transb);
+
+  cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+  cublasSetStream(handle, THCState_getCurrentStream(state));
+  THCublasCheck(cublasSgemmBatched(handle,
+                                   opa, opb, (int)m, (int)n, (int)k,
+                                   &alpha, a, (int)lda, b, (int)ldb, &beta, c, (int)ldc,
+                                   (int)batchCount));
+}
+
+void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, long m, long n, long k,
+                             double alpha, const double *a[], long lda, const double *b[], long ldb,
+                             double beta, double *c[], long ldc, long batchCount)
+{
+  if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX)  || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) )
+  {
+    THError("Cublas_DgemmBatched only supports m, n, k, lda, ldb, ldc, batchCount"
+            "with the bound [val] <= %d", INT_MAX);
+  }
+
+  adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc);
+  cublasOperation_t opa = convertTransToCublasOperation(transa);
+  cublasOperation_t opb = convertTransToCublasOperation(transb);
+
+  cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+  cublasSetStream(handle, THCState_getCurrentStream(state));
+  THCublasCheck(cublasDgemmBatched(handle,
+                                   opa, opb, (int)m, (int)n, (int)k,
+                                   &alpha, a, (int)lda, b, (int)ldb, &beta, c, (int)ldc,
+                                   (int)batchCount));
+}
+
+/* Inverse */
+void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize) {
+  if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) )
+  {
+    THError("Cublas_Sgetrf only supports n, lda, batchSize"
+            "with the bound [val] <= %d", INT_MAX);
+  }
+  cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+  cublasSetStream(handle, THCState_getCurrentStream(state));
+  THCublasCheck(cublasSgetrfBatched(handle, n, a, lda, pivot, info, batchSize));
+}
+
+void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize) {
+  if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) )
+  {
+    THError("Cublas_Dgetrf only supports n, lda, batchSize"
+            "with the bound [val] <= %d", INT_MAX);
+  }
+  cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+  cublasSetStream(handle, THCState_getCurrentStream(state));
+  THCublasCheck(cublasDgetrfBatched(handle, n, a, lda, pivot, info, batchSize));
+}
+
+void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize) {
+
+  if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) )
+  {
+    THError("Cublas_Sgetri only supports n, lda, ldc, batchSize"
+            "with the bound [val] <= %d", INT_MAX);
+  }
+  cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+  cublasSetStream(handle, THCState_getCurrentStream(state));
+  THCublasCheck(cublasSgetriBatched(handle, n, a, lda, pivot, c, ldc, info, batchSize));
+}
+
+void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize) {
+
+  if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) )
+  {
+    THError("Cublas_Dgetri only supports n, lda, ldc, batchSize"
+            "with the bound [val] <= %d", INT_MAX);
+  }
+  cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
+  cublasSetStream(handle, THCState_getCurrentStream(state));
+  THCublasCheck(cublasDgetriBatched(handle, n, a, lda, pivot, c, ldc, info, batchSize));
+}
diff --git a/lib/THC/THCBlas.h b/lib/THC/THCBlas.h
new file mode 100644
index 0000000..bf91f93
--- /dev/null
+++ b/lib/THC/THCBlas.h
@@ -0,0 +1,41 @@
+#ifndef THC_BLAS_INC
+#define THC_BLAS_INC
+
+#include "THCGeneral.h"
+#include "THCHalf.h"
+
+/* Level 1 */
+THC_API float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy);
+THC_API double THCudaBlas_Ddot(THCState *state, long n, double *x, long incx, double *y, long incy);
+#ifdef CUDA_HALF_TENSOR
+THC_API float THCudaBlas_Hdot(THCState *state, long n, half *x, long incx, half *y, long incy);
+#endif
+
+/* Level 2 */
+THC_API void THCudaBlas_Sgemv(THCState *state, char trans, long m, long n, float alpha, float *a, long lda, float *x, long incx, float beta, float *y, long incy);
+THC_API void THCudaBlas_Dgemv(THCState *state, char trans, long m, long n, double alpha, double *a, long lda, double *x, long incx, double beta, double *y, long incy);
+THC_API void THCudaBlas_Sger(THCState *state, long m, long n, float alpha, float *x, long incx, float *y, long incy, float *a, long lda);
+THC_API void THCudaBlas_Dger(THCState *state, long m, long n, double alpha, double *x, long incx, double *y, long incy, double *a, long lda);
+
+/* Level 3 */
+THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char transb, long m, long n, long k, float alpha, float *a, long lda, float *b, long ldb, float beta, float *c, long ldc);
+THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, long m, long n, long k, double alpha, double *a, long lda, double *b, long ldb, double beta, double *c, long ldc);
+
+#ifdef CUDA_HALF_TENSOR
+THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, long m, long n, long k, half alpha, half *a, long lda, half *b, long ldb, half beta, half *c, long ldc);
+#endif
+
+THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, long m, long n, long k,
+                                     float alpha, const float *a[], long lda, const float *b[], long ldb,
+                                     float beta, float *c[], long ldc, long batchCount);
+THC_API void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, long m, long n, long k,
+                                     double alpha, const double *a[], long lda, const double *b[], long ldb,
+                                     double beta, double *c[], long ldc, long batchCount);
+
+/* Inverse */
+THC_API void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize);
+THC_API void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize);
+THC_API void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize);
+THC_API void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize);
+
+#endif
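
(A hedged usage sketch for the Level 3 wrapper declared above — not part of the upstream file. `state` and the device pointers are assumed to already exist, and the operands follow the cuBLAS column-major convention: A is m x k, B is k x n, C is m x n, none transposed.)

    #include "THCBlas.h"

    // C = 1.0 * A * B + 0.0 * C; leading dimensions equal the column heights
    // of the untransposed, column-major operands.
    void gemmExample(THCState* state, long m, long n, long k,
                     float* d_A, float* d_B, float* d_C) {
      THCudaBlas_Sgemm(state, 'n', 'n', m, n, k,
                       1.0f, d_A, m,
                             d_B, k,
                       0.0f, d_C, m);
    }
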
diff --git a/lib/THC/THCCachingAllocator.cpp b/lib/THC/THCCachingAllocator.cpp
new file mode 100644
index 0000000..eeae04a
--- /dev/null
+++ b/lib/THC/THCCachingAllocator.cpp
@@ -0,0 +1,376 @@
+#include "THCCachingAllocator.h"
+
+#include <cuda_runtime_api.h>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <unordered_map>
+
+//
+// Yet another caching allocator for CUDA device allocations.
+//
+// - Allocations are associated with a stream. Once freed, blocks can be
+//   re-allocated on the same stream, but not on any other stream.
+// - The allocator attempts to find the smallest cached block that will fit the
+//   requested size. If the block is larger than the requested size, it may be
+//   split. If no block is found, the allocator will delegate to cudaMalloc.
+// - If the cudaMalloc fails, the allocator will free all cached blocks that
+//   are not split and retry the allocation.
+// - Large (>1MB) and small allocation requests are handled separately. Large
+//   allocation requests can be filled by a cudaMalloc call of the exact size.
+//   Small requests will allocate and split a 1MB buffer, if necessary.
+//
+// With this allocator, allocations and frees should logically be considered
+// "usages" of the memory segment associated with streams, just like kernel
+// launches. The programmer must insert the proper synchronization if memory
+// segments are used from multiple streams.
+//
+
+
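(A standalone sketch — not part of the upstream file — of the request-shaping policy the comment above describes; the constants mirror kRoundSmall/kRoundLarge/kSmallAlloc defined just below, and the helper name is illustrative only.)

    #include <cstddef>
    #include <cstdio>

    static size_t shapeRequest(size_t size, bool* isSmall) {
      const size_t roundSmall = 512;      // small requests round up to 512 B
      const size_t roundLarge = 131072;   // large requests round up to 128 KiB
      const size_t smallAlloc = 1048576;  // 1 MiB boundary between the two pools

      if (size < roundSmall) {
        size = roundSmall;
      } else if (size < smallAlloc) {
        size += roundSmall - 1 - (size - 1) % roundSmall;
      } else {
        size += roundLarge - 1 - (size - 1) % roundLarge;
      }
      *isSmall = (size <= smallAlloc);    // small requests are carved out of 1 MiB buffers
      return size;
    }

    int main(void) {
      bool small;
      printf("%zu\n", shapeRequest(1000, &small));    // 1024    (small pool)
      printf("%zu\n", shapeRequest(5000000, &small)); // 5111808 (large pool, exact cudaMalloc)
      return 0;
    }
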
+namespace {
+
+const size_t kRoundSmall = 512;     // round up small allocs to 512 bytes
+const size_t kRoundLarge = 131072;  // round up large allocs to 128 KiB
+const size_t kSmallAlloc = 1048576; // largest "small" allocation is 1 MiB
+
+struct Block {
+  int           device;     // gpu
+  cudaStream_t  stream;     // allocation stream
+  size_t        size;       // block size in bytes
+  char*         ptr;        // memory address
+  bool          allocated;  // in-use flag
+  Block*        prev;       // prev block if split from a larger allocation
+  Block*        next;       // next block if split from a larger allocation
+
+  Block(int device, cudaStream_t stream, size_t size, char* ptr=NULL) :
+      device(device), stream(stream), size(size), ptr(ptr), allocated(0),
+      prev(NULL), next(NULL) { }
+};
+
+static bool BlockComparator(const Block* a, const Block* b)
+{
+  if (a->device != b->device) {
+    return a->device < b->device;
+  }
+  if (a->stream != b->stream) {
+    return (uintptr_t)a->stream < (uintptr_t)b->stream;
+  }
+  if (a->size != b->size) {
+    return a->size < b->size;
+  }
+  return (uintptr_t)a->ptr < (uintptr_t)b->ptr;
+}
+
+} // namespace
+
+struct THCCachingAllocator
+{
+  typedef bool (*Comparison)(const Block*, const Block*);
+  typedef std::set<Block*, Comparison> FreeBlocks;
+
+  // lock around malloc and free
+  std::mutex mutex;
+
+  // cached blocks larger than 1 MB
+  FreeBlocks large_blocks;
+
+  // cached blocks 1 MB or smaller
+  FreeBlocks small_blocks;
+
+  // allocated blocks by device pointer
+  std::unordered_map<void*, Block*> allocated_blocks;
+
+  THCCachingAllocator() :
+      large_blocks(BlockComparator),
+      small_blocks(BlockComparator) {}
+
+  /** allocates a block which is safe to use from the provided stream */
+  cudaError_t malloc(void** devPtr, size_t size, cudaStream_t stream)
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+
+    int device;
+    cudaError_t err = cudaGetDevice(&device);
+    if (err != cudaSuccess) {
+      return err;
+    }
+
+    size = round_size(size);
+    bool small = size <= kSmallAlloc;
+
+    Block search_key(device, stream, size);
+    auto& free_blocks = small ? large_blocks : small_blocks;
+
+    Block* block = NULL;
+    Block* remaining = NULL;
+
+    auto it = free_blocks.lower_bound(&search_key);
+    if (it != free_blocks.end() && (*it)->device == device && (*it)->stream == stream) {
+      block = *it;
+      free_blocks.erase(it);
+    } else {
+      void* ptr;
+      size_t alloc_size = small ? kSmallAlloc : size;
+      err = cuda_malloc_retry(device, &ptr, alloc_size);
+      if (err != cudaSuccess) {
+        return err;
+      }
+      block = new Block(device, stream, alloc_size, (char*)ptr);
+    }
+
+    if (block->size - size >= (small ? kRoundSmall : kSmallAlloc + 1)) {
+      remaining = block;
+
+      block = new Block(device, stream, size, block->ptr);
+      block->prev = remaining->prev;
+      if (block->prev) {
+        block->prev->next = block;
+      }
+      block->next = remaining;
+
+      remaining->prev = block;
+      remaining->ptr += size;
+      remaining->size -= size;
+      free_blocks.insert(remaining);
+    }
+
+    block->allocated = true;
+    allocated_blocks[block->ptr] = block;
+
+    *devPtr = (void*)block->ptr;
+    return cudaSuccess;
+  }
+
+  cudaError_t free(void* ptr)
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+    if (!ptr) {
+      return cudaSuccess;
+    }
+
+    auto it = allocated_blocks.find(ptr);
+    if (it == allocated_blocks.end()) {
+      return cudaErrorInvalidDevicePointer;
+    }
+
+    Block* block = it->second;
+    allocated_blocks.erase(it);
+
+    bool small = block->size <= kSmallAlloc;
+    auto& free_blocks = small ? large_blocks : small_blocks;
+    try_merge_blocks(block, block->prev, free_blocks);
+    try_merge_blocks(block, block->next, free_blocks);
+
+    block->allocated = false;
+    free_blocks.insert(block);
+
+    return cudaSuccess;
+  }
+
+  /** returns cached blocks to the system allocator */
+  cudaError_t emptyCache()
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+    cudaError_t err = free_blocks(large_blocks, large_blocks.begin(), large_blocks.end());
+    if (err != cudaSuccess) {
+      return err;
+    }
+    err = free_blocks(small_blocks, small_blocks.begin(), small_blocks.end());
+    if (err != cudaSuccess) {
+      return err;
+    }
+    return cudaSuccess;
+  }
+
+  void* getBaseAllocation(void* ptr, size_t* outSize)
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+    Block* block = find_allocated_block(ptr);
+    if (!block) {
+      THError("invalid device pointer: %p", ptr);
+    }
+    while (block->prev) {
+      block = block->prev;
+    }
+    void *basePtr = block->ptr;
+    if (outSize) {
+      size_t size = 0;
+      while (block) {
+        size += block->size;
+        block = block->next;
+      }
+      *outSize = size;
+    }
+    return basePtr;
+  }
+
+  // Accumulates sizes of all memory blocks for given device in given free list
+  void cacheInfoAux(FreeBlocks& blocks, int dev_id, size_t* total, size_t* largest)
+  {
+    Block search_key(dev_id, 0, 0);
+    auto it = blocks.lower_bound(&search_key);
+    for (;it != blocks.end() && *it && (*it)->device == dev_id; ++it) {
+      size_t blocksize = (*it)->size;
+      *total += blocksize;
+      if (blocksize > *largest) {
+        *largest = blocksize;
+      }
+    }
+  }
+
+  void cacheInfo(int dev_id, size_t* total, size_t* largest)
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+    cacheInfoAux(large_blocks, dev_id, total, largest);
+    cacheInfoAux(small_blocks, dev_id, total, largest);
+  }
+
+  /** combine previously split blocks */
+  void try_merge_blocks(Block* dst, Block* src, FreeBlocks& free_blocks)
+  {
+    if (!src || src->allocated) {
+      return;
+    }
+    if (dst->prev == src) {
+      dst->ptr = src->ptr;
+      dst->prev = src->prev;
+      if (dst->prev) {
+        dst->prev->next = dst;
+      }
+    } else {
+      dst->next = src->next;
+      if (dst->next) {
+        dst->next->prev = dst;
+      }
+    }
+    dst->size += src->size;
+    free_blocks.erase(src);
+    delete src;
+  }
+
+  size_t round_size(size_t size)
+  {
+    if (size < kRoundSmall) {
+      size = kRoundSmall;
+    } else if (size < kSmallAlloc) {
+      size += kRoundSmall - 1 - (size - 1) % kRoundSmall;
+    } else {
+      size += kRoundLarge - 1 - (size - 1) % kRoundLarge;
+    }
+    return size;
+  }
+
+  cudaError_t cuda_malloc_retry(int device, void** devPtr, size_t size)
+  {
+    // Try cudaMalloc. If cudaMalloc fails, frees all non-split cached blocks
+    // and retries.
+    cudaError_t err = cudaMalloc(devPtr, size);
+    if (err != cudaSuccess) {
+      cudaGetLastError();
+      err = free_cached_blocks(device);
+      if (err != cudaSuccess) {
+        return err;
+      }
+      err = cudaMalloc(devPtr, size);
+      if (err != cudaSuccess) {
+        return err;
+      }
+    }
+    return cudaSuccess;
+  }
+
+  cudaError_t free_cached_blocks(int device)
+  {
+    // Free all non-split cached blocks on device
+    Block lower_bound(device, NULL, 0);
+    Block upper_bound(device + 1, NULL, 0);
+
+    cudaError_t err = free_blocks(
+        large_blocks,
+        large_blocks.lower_bound(&lower_bound),
+        large_blocks.lower_bound(&upper_bound));
+    if (err != cudaSuccess) {
+      return err;
+    }
+    err = free_blocks(
+        small_blocks,
+        small_blocks.lower_bound(&lower_bound),
+        small_blocks.lower_bound(&upper_bound));
+    return err;
+  }
+
+  cudaError_t free_blocks(FreeBlocks& blocks, FreeBlocks::iterator it, FreeBlocks::iterator end)
+  {
+    // Frees all non-split blocks between `it` and `end`
+    while (it != end) {
+      Block* block = *it;
+      if (!block->prev && !block->next) {
+        cudaError_t err = cudaFree((void*)block->ptr);
+        if (err != cudaSuccess) {
+          return err;
+        }
+        auto cur = it;
+        ++it;
+        blocks.erase(cur);
+        delete block;
+      } else {
+        ++it;
+      }
+    }
+    return cudaSuccess;
+  }
+
+  Block* find_allocated_block(void *ptr) {
+    auto it = allocated_blocks.find(ptr);
+    if (it == allocated_blocks.end()) {
+      return NULL;
+    }
+    return it->second;
+  }
+};
+
+static cudaError_t THCCachingAllocator_malloc(void* ctx, void** ptr, size_t size, cudaStream_t stream)
+{
+  THCCachingAllocator* a = (THCCachingAllocator*) ctx;
+  return a->malloc(ptr, size, stream);
+}
+
+static cudaError_t THCCachingAllocator_free(void* ctx, void* ptr)
+{
+  THCCachingAllocator* a = (THCCachingAllocator*) ctx;
+  return a->free(ptr);
+}
+
+static cudaError_t THCCachingAllocator_emptyCache(void* ctx)
+{
+  THCCachingAllocator* a = (THCCachingAllocator*) ctx;
+  return a->emptyCache();
+}
+
+static cudaError_t THCCachingAllocator_cacheInfo(void* ctx, int dev_id, size_t* cachedAndFree, size_t* largestBlock)
+{
+  THCCachingAllocator* a = (THCCachingAllocator*) ctx;
+  a->cacheInfo(dev_id, cachedAndFree, largestBlock);
+  return cudaSuccess;
+}
+
+static THCCachingAllocator caching_allocator;
+static THCDeviceAllocator device_allocator = {
+  &THCCachingAllocator_malloc,
+  NULL,
+  &THCCachingAllocator_free,
+  &THCCachingAllocator_emptyCache,
+  &THCCachingAllocator_cacheInfo,
+  &caching_allocator
+};
+
+THC_API THCDeviceAllocator* THCCachingAllocator_get(void)
+{
+  return &device_allocator;
+}
+
+THC_API void* THCCachingAllocator_getBaseAllocation(void *ptr, size_t *size)
+{
+  return caching_allocator.getBaseAllocation(ptr, size);
+}
diff --git a/lib/THC/THCCachingAllocator.h b/lib/THC/THCCachingAllocator.h
new file mode 100644
index 0000000..3eb3725
--- /dev/null
+++ b/lib/THC/THCCachingAllocator.h
@@ -0,0 +1,9 @@
+#ifndef THC_DEVICE_ALLOCATOR_INC
+#define THC_DEVICE_ALLOCATOR_INC
+
+#include "THCGeneral.h"
+
+THC_API THCDeviceAllocator* THCCachingAllocator_get(void);
+THC_API void* THCCachingAllocator_getBaseAllocation(void *ptr, size_t *size);
+
+#endif
diff --git a/lib/THC/THCCachingHostAllocator.cpp b/lib/THC/THCCachingHostAllocator.cpp
new file mode 100644
index 0000000..3cbbccb
--- /dev/null
+++ b/lib/THC/THCCachingHostAllocator.cpp
@@ -0,0 +1,249 @@
+#include "THCCachingHostAllocator.h"
+
+#include <cuda_runtime_api.h>
+#include <deque>
+#include <mutex>
+#include <set>
+#include <stdint.h>
+#include <unordered_map>
+#include <utility>
+
+
+namespace {
+
+struct BlockSize
+{
+  size_t  size; // allocation size
+  void*   ptr;  // host memory pointer
+
+  BlockSize(size_t size, void* ptr=NULL) : size(size), ptr(ptr) {}
+};
+
+struct Block : public BlockSize
+{
+  bool  allocated;    // true if the block is currently allocated
+  int   event_count;  // number of outstanding cuda events
+
+  Block(size_t size, void* ptr, bool allocated) :
+      BlockSize(size, ptr), allocated(allocated), event_count(0) { }
+};
+
+static bool BlockComparator(const BlockSize& a, const BlockSize& b)
+{
+  // sort by size, break ties with pointer
+  if (a.size != b.size) {
+    return a.size < b.size;
+  }
+  return (uintptr_t)a.ptr < (uintptr_t)b.ptr;
+}
+
+struct HostAllocator
+{
+  typedef bool (*Comparison)(const BlockSize&, const BlockSize&);
+
+  // lock around all operations
+  std::mutex mutex;
+
+  // blocks by pointer
+  std::unordered_map<void*, Block> blocks;
+
+  // pointers that are ready to be allocated (event_count=0)
+  std::set<BlockSize, Comparison> available;
+
+  // outstanding cuda events
+  std::deque<std::pair<cudaEvent_t, void*>> cuda_events;
+
+  HostAllocator() : available(BlockComparator) {}
+
+  cudaError_t malloc(void** ptr, size_t size)
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+
+    // process outstanding cuda events which may have occurred
+    cudaError_t err = processEvents();
+    if (err != cudaSuccess) {
+      return err;
+    }
+
+    // search for the smallest block which can hold this allocation
+    BlockSize search_key(size);
+    auto it = available.lower_bound(search_key);
+    if (it != available.end()) {
+      Block& block = blocks.at(it->ptr);
+      THAssert(!block.allocated && block.event_count == 0);
+      block.allocated = true;
+      *ptr = block.ptr;
+      available.erase(it);
+      return cudaSuccess;
+    }
+
+    // note that cudaHostAlloc may not touch pointer if size is 0
+    *ptr = 0;
+
+    // allocate a new block if no cached allocation is found
+    err = cudaHostAlloc(ptr, size, cudaHostAllocDefault);
+    if (err != cudaSuccess) {
+      return err;
+    }
+
+    blocks.insert({*ptr, Block(size, *ptr, true)});
+    return cudaSuccess;
+  }
+
+  cudaError_t free(void* ptr)
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+
+    if (!ptr) {
+      return cudaSuccess;
+    }
+
+    auto it = blocks.find(ptr);
+    THAssert(it != blocks.end());
+
+    Block& block = it->second;
+    THAssert(block.allocated);
+
+    block.allocated = false;
+    if (block.event_count == 0) {
+      // the block can be re-used if there are no outstanding cuda events
+      available.insert(block);
+    }
+    return cudaSuccess;
+  }
+
+  cudaError_t recordEvent(void* ptr, cudaStream_t stream)
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+    cudaError_t err;
+
+    auto it = blocks.find(ptr);
+    if (it == blocks.end()) {
+      // ignore events for untracked pointers
+      return cudaSuccess;
+    }
+
+    Block& block = it->second;
+    THAssert(block.allocated);
+
+    // process outstanding cuda events which may have occurred
+    err = processEvents();
+    if (err != cudaSuccess) {
+      return err;
+    }
+
+    // create and record an event in the given stream
+    cudaEvent_t event;
+    err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
+    if (err != cudaSuccess) {
+      return err;
+    }
+    err = cudaEventRecord(event, stream);
+    if (err != cudaSuccess) {
+      return err;
+    }
+
+    // the block will not be re-used until all associated events have occurred
+    block.event_count++;
+    cuda_events.emplace_back(event, ptr);
+    return cudaSuccess;
+  }
+
+  cudaError_t processEvents()
+  {
+    // Process outstanding cudaEvents. Events that are completed are removed
+    // from the queue, and the 'event_count' for the corresponding allocation
+    // is decremented. Stops at the first event which has not been completed.
+    // Since events on different devices or streams may occur out of order,
+    // the processing of some events may be delayed.
+    while (!cuda_events.empty()) {
+      auto& e = cuda_events.front();
+      cudaEvent_t event = e.first;
+
+      cudaError_t err = cudaEventQuery(event);
+      if (err == cudaErrorNotReady) {
+        break;
+      } else if (err != cudaSuccess) {
+        return err;
+      }
+      err = cudaEventDestroy(event);
+      if (err != cudaSuccess) {
+        return err;
+      }
+
+      Block& block = blocks.at(e.second);
+      block.event_count--;
+      if (block.event_count == 0 && !block.allocated) {
+        available.insert(block);
+      }
+      cuda_events.pop_front();
+    }
+    return cudaSuccess;
+  }
+
+  void emptyCache()
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+
+    // remove events for freed blocks
+    std::deque<std::pair<cudaEvent_t, void*>> new_events;
+    for (auto it = cuda_events.begin(); it != cuda_events.end(); ++it) {
+      cudaEvent_t event = it->first;
+      Block& block = blocks.at(it->second);
+      if (!block.allocated) {
+        THCudaCheckWarn(cudaEventDestroy(event));
+        block.event_count--;
+      } else {
+        new_events.push_back(*it);
+      }
+    }
+    cuda_events.swap(new_events);
+
+    // clear list of available blocks
+    available.clear();
+
+    // free and erase non-allocated blocks
+    for (auto it = blocks.begin(); it != blocks.end();) {
+      Block& block = it->second;
+      if (!block.allocated) {
+        THCudaCheckWarn(cudaFreeHost(block.ptr));
+        it = blocks.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+};
+
+}  // namespace
+
+static HostAllocator allocator;
+
+static void* THCCachingHostAllocator_malloc(void* ctx, ptrdiff_t size)
+{
+  THAssert(size >= 0);
+  void *ptr;
+  THCudaCheck(allocator.malloc(&ptr, size));
+  return ptr;
+}
+
+static void THCCachingHostAllocator_free(void* ctx, void* ptr)
+{
+  allocator.free(ptr);
+}
+
+cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, cudaStream_t stream)
+{
+  return allocator.recordEvent(ptr, stream);
+}
+
+void THCCachingHostAllocator_emptyCache()
+{
+  allocator.emptyCache();
+}
+
+THAllocator THCCachingHostAllocator = {
+  &THCCachingHostAllocator_malloc,
+  NULL,
+  &THCCachingHostAllocator_free,
+};
diff --git a/lib/THC/THCCachingHostAllocator.h b/lib/THC/THCCachingHostAllocator.h
new file mode 100644
index 0000000..a695565
--- /dev/null
+++ b/lib/THC/THCCachingHostAllocator.h
@@ -0,0 +1,30 @@
+#ifndef THC_CACHING_HOST_ALLOCATOR_INC
+#define THC_CACHING_HOST_ALLOCATOR_INC
+
+#include "THCGeneral.h"
+
+//
+// A caching allocator for CUDA host allocations (pinned memory).
+//
+// This provides a drop-in replacement for THCudaHostAllocator, which re-uses
+// freed pinned (page-locked) memory allocations. This avoids device
+// synchronizations due to cudaFreeHost calls.
+//
+// To ensure correct behavior, THCCachingHostAllocator_recordEvent must be
+// called anytime a pointer from this allocator is used in a cudaMemcpyAsync
+// call between host and device. The THC library implements this for storages
+// and tensors in THCTensor_(copyAsyncCPU) and THCTensor_(copyAsyncCuda).
+//
+// Note that this allocator does not split larger allocations into smaller
+// blocks, unlike the caching device allocator.
+//
+THC_API THAllocator THCCachingHostAllocator;
+
+// Records an event in the specified stream. The allocation 'ptr' will not be
+// re-used until the event has occurred.
+THC_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, cudaStream_t stream);
+
+// Releases cached pinned memory allocations via cudaFreeHost
+THC_API void THCCachingHostAllocator_emptyCache(void);
+
+#endif
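
(A sketch of the call pattern the header above requires — not part of the upstream file. The surrounding function, buffer contents, and error handling are illustrative; it assumes the THAllocator function pointers are invoked directly with a NULL context, which matches how the table in THCCachingHostAllocator.cpp is populated.)

    #include <cuda_runtime.h>
    #include "THCCachingHostAllocator.h"

    static cudaError_t copyToDeviceAsync(void* devDst, size_t nbytes, cudaStream_t stream) {
      // Pinned host staging buffer from the caching host allocator.
      void* pinned = THCCachingHostAllocator.malloc(NULL, (ptrdiff_t)nbytes);

      // ... fill `pinned` on the host ...

      cudaError_t err = cudaMemcpyAsync(devDst, pinned, nbytes,
                                        cudaMemcpyHostToDevice, stream);
      if (err != cudaSuccess) return err;

      // Record the in-flight use on `stream`; the allocator will not hand the
      // block out again until the recorded event has completed.
      err = THCCachingHostAllocator_recordEvent(pinned, stream);
      if (err != cudaSuccess) return err;

      // Safe to release immediately: the block is only recycled once the event fires.
      THCCachingHostAllocator.free(NULL, pinned);
      return cudaSuccess;
    }
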
diff --git a/lib/THC/THCDeviceTensor-inl.cuh b/lib/THC/THCDeviceTensor-inl.cuh
new file mode 100644
index 0000000..84cad81
--- /dev/null
+++ b/lib/THC/THCDeviceTensor-inl.cuh
@@ -0,0 +1,420 @@
+#include <assert.h>
+
+namespace detail {
+
+template <typename T, int N>
+__host__ __device__ void copy(T to[N], T from[N]) {
+  for (int i = 0; i < N; ++i) {
+    to[i] = from[i];
+  }
+}
+
+} // namespace detail
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor()
+    : data_(NULL) {
+  thc_static_assert(Dim > 0);
+
+  for (int i = 0; i < Dim; ++i) {
+    size_[i] = 0;
+    stride_[i] = (IndexT) 1;
+  }
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::
+#ifdef _MSC_VER
+THCDeviceTensor(DataPtrType data, const IndexT (&sizes)[Dim])
+#else
+THCDeviceTensor(DataPtrType data, const IndexT sizes[Dim])
+#endif
+    : data_(data) {
+  thc_static_assert(Dim > 0);
+
+  for (int i = 0; i < Dim; ++i) {
+    size_[i] = sizes[i];
+  }
+
+  stride_[Dim - 1] = (IndexT) 1;
+  for (int i = Dim - 2; i >= 0; --i) {
+    stride_[i] = stride_[i + 1] * sizes[i + 1];
+  }
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor(
+#ifdef _MSC_VER
+  DataPtrType data, const IndexT (&sizes)[Dim], const IndexT (&strides)[Dim])
+#else
+  DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
+#endif
+    : data_(data) {
+  thc_static_assert(Dim > 0);
+
+  for (int i = 0; i < Dim; ++i) {
+    size_[i] = sizes[i];
+    stride_[i] = strides[i];
+  }
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int OtherDim>
+__host__ __device__ bool
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isSameSizeAndStride(
+  const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const {
+  if (Dim != OtherDim) {
+    return false;
+  }
+
+  for (int i = 0; i < Dim; ++i) {
+    if (size_[i] != rhs.size_[i]) {
+      return false;
+    }
+
+    if (stride_[i] != rhs.stride_[i]) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <typename U>
+__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() {
+  thc_static_assert(sizeof(U) == sizeof(T));
+
+  return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
+    reinterpret_cast<U*>(data_), size_, stride_);
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <typename U>
+__host__ __device__ const THCDeviceTensor<U, Dim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const {
+  thc_static_assert(sizeof(U) == sizeof(T));
+
+  return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
+    reinterpret_cast<U*>(data_), size_, stride_);
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ ptrdiff_t
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const {
+  ptrdiff_t size = getSize(0);
+
+  for (int i = 1; i < Dim; ++i) {
+    size *= getSize(i);
+  }
+
+  return size;
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ bool
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguous() const {
+  long prevSize = 1;
+
+  for (int i = Dim - 1; i >= 0; --i) {
+    if (getSize(i) != (IndexT) 1) {
+      if (getStride(i) == prevSize) {
+        prevSize *= getSize(i);
+      } else {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ bool
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized(int i) const {
+  if (i == 0 && getStride(i) > 0 && getSize(i) > 0) {
+    return true;
+  } else if ((i > 0) && (i < Dim) && (getStride(i) > 0) &&
+             ((getStride(i - 1) / getStride(i)) >= getSize(i))) {
+    return true;
+  }
+
+  return false;
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ bool
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized() const {
+  for (int i = 0; i < Dim; ++i) {
+    if (!isConsistentlySized(i)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ bool
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguousDim(int i) const {
+  return (i == Dim - 1) || // just in case
+    ((i < Dim - 1) &&
+     ((getStride(i) / getStride(i + 1)) == getSize(i + 1)));
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::transpose(int dim1,
+                                                      int dim2) const {
+#ifdef __CUDA_ARCH__
+  // Device code
+  assert(dim1 >= 0 && dim1 < Dim);
+  assert(dim1 >= 0 && dim2 < Dim);
+#else
+  // Host code
+  if (dim1 < 0 || dim1 >= Dim) {
+    THError("dim1 out of bounds");
+  }
+
+  if (dim2 < 0 || dim2 >= Dim) {
+    THError("dim2 out of bounds");
+  }
+#endif
+
+  IndexT newSize[Dim];
+  IndexT newStride[Dim];
+
+  for (int i = 0; i < Dim; ++i) {
+    newSize[i] = size_[i];
+    newStride[i] = stride_[i];
+  }
+
+  IndexT tmp = newSize[dim1];
+  newSize[dim1] = newSize[dim2];
+  newSize[dim2] = tmp;
+
+  tmp = newStride[dim1];
+  newStride[dim1] = newStride[dim2];
+  newStride[dim2] = tmp;
+
+  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(data_, newSize, newStride);
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int NewDim>
+__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastOuter() {
+  // Can only create tensors of greater dimension
+  thc_static_assert(NewDim > Dim);
+
+  IndexT newSize[NewDim];
+  IndexT newStride[NewDim];
+
+  int shift = NewDim - Dim;
+
+  for (int i = 0; i < NewDim; ++i) {
+    if (i < shift) {
+      // These are the extended dimensions
+      newSize[i] = (IndexT) 1;
+      newStride[i] = size_[0] * stride_[0];
+    } else {
+      // Shift the remaining dimensions
+      newSize[i] = size_[i - shift];
+      newStride[i] = stride_[i - shift];
+    }
+  }
+
+  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
+    data_, newSize, newStride);
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int NewDim>
+__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastInner() {
+  // Can only create tensors of greater dimension
+  thc_static_assert(NewDim > Dim);
+
+  IndexT newSize[NewDim];
+  IndexT newStride[NewDim];
+
+  for (int i = 0; i < NewDim; ++i) {
+    if (i < Dim) {
+      // Existing dimensions get copied over
+      newSize[i] = size_[i];
+      newStride[i] = stride_[i];
+    } else {
+      // Extended dimensions
+      newSize[i] = (IndexT) 1;
+      newStride[i] = (IndexT) 1;
+    }
+  }
+
+  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
+    data_, newSize, newStride);
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int NewDim>
+__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastOuter() {
+  // Can only create tensors of lesser dimension
+  thc_static_assert(NewDim < Dim);
+
+  // We can't downcast non-contiguous tensors, since it leaves
+  // garbage data in the tensor. The tensor needs to be contiguous
+  // in all of the dimensions we are collapsing (no padding in
+  // them).
+  for (int i = 0; i < Dim - NewDim; ++i) {
+    bool cont = isContiguousDim(i);
+#ifdef __CUDA_ARCH__
+    // Device code
+    assert(cont);
+#else
+    // Host code
+    if (!cont) {
+      THError("Can only downcast contiguous tensors");
+    }
+#endif
+  }
+
+  IndexT newSize[NewDim];
+  IndexT newStride[NewDim];
+
+  int ignoredDims = Dim - NewDim;
+  IndexT collapsedSize = 1;
+
+  for (int i = 0; i < Dim; ++i) {
+    if (i < ignoredDims) {
+      // Collapse these dimensions
+      collapsedSize *= getSize(i);
+    } else {
+      // Non-collapsed dimensions
+      if (i == ignoredDims) {
+        // This is the first non-collapsed dimension
+        newSize[i - ignoredDims] = collapsedSize * getSize(i);
+      } else {
+        // Subsequent non-collapsed dimensions
+        newSize[i - ignoredDims] = getSize(i);
+      }
+
+      newStride[i - ignoredDims] = getStride(i);
+    }
+  }
+
+  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
+    data_, newSize, newStride);
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int NewDim>
+__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastInner() {
+  // Can only create tensors of lesser dimension
+  thc_static_assert(NewDim < Dim);
+
+  // We can't downcast non-contiguous tensors, since it leaves
+  // garbage data in the tensor. The tensor needs to be contiguous
+  // in all of the dimensions we are collapsing (no padding in
+  // them).
+  for (int i = NewDim; i < Dim; ++i) {
+    bool cont = isContiguousDim(i);
+#ifdef __CUDA_ARCH__
+    // Device code
+    assert(cont);
+#else
+    // Host code
+    if (!cont) {
+      THError("Can only downcast contiguous tensors");
+    }
+#endif
+  }
+
+  IndexT newSize[NewDim];
+  IndexT newStride[NewDim];
+
+  IndexT collapsedSize = 1;
+
+  for (int i = Dim - 1; i >= 0; --i) {
+    if (i >= NewDim) {
+      // Collapse these dimensions
+      collapsedSize *= getSize(i);
+    } else {
+      // Non-collapsed dimensions
+      if (i == NewDim - 1) {
+        // This is the first non-collapsed dimension
+        newSize[i] = collapsedSize * getSize(i);
+        newStride[i] = getStride(Dim - 1);
+      } else {
+        // Subsequent non-collapsed dimensions
+        newSize[i] = getSize(i);
+        newStride[i] = getStride(i);
+      }
+    }
+  }
+
+  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
+    data_, newSize, newStride);
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int SubDim>
+__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view(DataPtrType at) {
+  thc_static_assert(SubDim >= 1 && SubDim < Dim);
+
+  IndexT viewSizes[SubDim];
+  IndexT viewStrides[SubDim];
+
+  for (int i = 0; i < SubDim; ++i) {
+    viewSizes[i] = size_[Dim - SubDim + i];
+    viewStrides[i] = stride_[Dim - SubDim + i];
+  }
+
+  return THCDeviceTensor<T, SubDim, IndexT, PtrTraits>(
+    at, viewSizes, viewStrides);
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int SubDim>
+__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view() {
+  return view<SubDim>(data_);
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+void
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::zero(cudaStream_t stream) {
+#ifdef __CUDA_ARCH__
+  assert(isContiguous());
+#else
+  if (!isContiguous()) {
+    THError("fillAsync only works on contiguous data");
+  }
+#endif
+
+  cudaMemsetAsync(data(), 0, numElements() * sizeof(T), stream);
+}
diff --git a/lib/THC/THCDeviceTensor.cuh b/lib/THC/THCDeviceTensor.cuh
new file mode 100644
index 0000000..3122d08
--- /dev/null
+++ b/lib/THC/THCDeviceTensor.cuh
@@ -0,0 +1,513 @@
+#ifndef THC_DEVICE_TENSOR_INC
+#define THC_DEVICE_TENSOR_INC
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// A CUDA 6.5 compatible version of static_assert. Remove once on CUDA 7.0.
+template <bool>
+struct THCStaticAssert;
+
+template <>
+struct THCStaticAssert<true> {
+};
+
+#define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>())
+
+/// Our tensor type
+template <typename T,
+          int Dim,
+          typename IndexT,
+          template <typename U> class PtrTraits>
+class THCDeviceTensor;
+
+/// Type of a subspace of a tensor
+namespace detail {
+template <typename TensorType,
+          int SubDim,
+          template <typename U> class PtrTraits>
+class THCDeviceSubTensor;
+}
+
+template <typename T>
+struct RestrictPtrTraits {
+  typedef T* __restrict__ PtrType;
+};
+
+template <typename T>
+struct DefaultPtrTraits {
+  typedef T* PtrType;
+};
+
+/**
+   Templated multi-dimensional array that supports strided access of
+   elements. Main access is through `operator[]`; e.g.,
+   `tensor[x][y][z]`.
+
+- `T` is the contained type (e.g., `float`)
+- `Dim` is the tensor rank
+- `IndexT` is the integer type used for size/stride arrays, and for
+  all indexing math. Default is `int`, but for large tensors, `long`
+  can be used instead.
+- `PtrTraits` are traits applied to our data pointer (T*). By default,
+  this is just T*, but RestrictPtrTraits can be used to apply T*
+  __restrict__ for alias-free analysis.
+*/
+template <typename T,
+          int Dim,
+          typename IndexT = int,
+          template <typename U> class PtrTraits = DefaultPtrTraits>
+class THCDeviceTensor {
+ public:
+  enum { NumDim = Dim };
+  typedef T DataType;
+  typedef IndexT IndexType;
+  typedef typename PtrTraits<T>::PtrType DataPtrType;
+  typedef THCDeviceTensor<T, Dim, IndexT, PtrTraits> TensorType;
+
+  /// Default constructor
+  __host__ __device__ THCDeviceTensor();
+
+  /// Constructor that calculates strides with no padding
+  __host__ __device__ THCDeviceTensor(DataPtrType data,
+#ifdef _MSC_VER
+                                      const IndexT (&sizes)[Dim]);
+#else
+                                      const IndexT sizes[Dim]);
+#endif
+
+  /// Constructor that takes arbitrary size/stride arrays
+  __host__ __device__ THCDeviceTensor(DataPtrType data,
+#ifdef _MSC_VER
+                                      const IndexT (&sizes)[Dim],
+                                      const IndexT (&strides)[Dim]);
+#else
+                                      const IndexT sizes[Dim],
+                                      const IndexT strides[Dim]);
+#endif
+
+  /// Returns true if the two tensors are of the same dimensionality,
+  /// size and stride.
+  template <int OtherDim>
+  __host__ __device__ bool
+  isSameSizeAndStride(
+    const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const;
+
+  /// Cast to a tensor of a different type of the same size and stride
+  template <typename U>
+  __host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast();
+
+  /// Const version of `cast`
+  template <typename U>
+  __host__ __device__
+  const THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast() const;
+
+  /// Returns a raw pointer to the start of our data.
+  __host__ __device__ __forceinline__ DataPtrType data() {
+    return data_;
+  }
+
+  /// Returns a raw pointer to the start of our data (const).
+  __host__ __device__ __forceinline__
+  const DataPtrType data() const {
+    return data_;
+  }
+
+  /// Cast to a different datatype
+  template <typename U>
+  __host__ __device__ __forceinline__
+  typename PtrTraits<U>::PtrType dataAs() {
+    return reinterpret_cast<typename PtrTraits<U>::PtrType>(data_);
+  }
+
+  /// Cast to a different datatype
+  template <typename U>
+  __host__ __device__ __forceinline__
+  const typename PtrTraits<const U>::PtrType dataAs() const {
+    return reinterpret_cast<typename PtrTraits<const U>::PtrType>(data_);
+  }
+
+  /// Returns a read/write view of a portion of our tensor.
+  __host__ __device__ __forceinline__
+  detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
+    operator[](IndexT);
+
+  /// Returns a read/write view of a portion of our tensor (const).
+  __host__ __device__ __forceinline__
+  const detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
+    operator[](IndexT) const;
+
+  /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds
+  /// checking.
+  __host__ __device__ __forceinline__ int getSize(int i) const {
+    return size_[i];
+  }
+
+  /// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds
+  /// checking.
+  __host__ __device__ __forceinline__ int getStride(int i) const {
+    return stride_[i];
+  }
+
+  /// Returns the total number of elements contained within our data
+  /// (product of `getSize(i)`)
+  __host__ __device__ ptrdiff_t numElements() const;
+
+  /// Returns the size array.
+  __host__ __device__ __forceinline__ const IndexT* sizes() const {
+    return size_;
+  }
+
+  /// Returns the stride array.
+  __host__ __device__ __forceinline__ const IndexT* strides() const {
+    return stride_;
+  }
+
+  /// Returns true if there is no padding within the tensor and no
+  /// re-ordering of the dimensions.
+  /// ~~~
+  /// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 1
+  /// ~~~
+  __host__ __device__ bool isContiguous() const;
+
+  /// Returns whether a given dimension has only increasing stride
+  /// from the previous dimension. A tensor that was permuted by
+  /// exchanging size and stride only will fail this check.
+  /// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`.
+  __host__ __device__ bool isConsistentlySized(int i) const;
+
+  // Returns whether at each dimension `stride <= size`.
+  // If this is not the case then iterating once over the size space will
+  // touch the same memory locations multiple times.
+  __host__ __device__ bool isConsistentlySized() const;
+
+  /// Returns true if the given dimension index has no padding
+  __host__ __device__ bool isContiguousDim(int i) const;
+
+  /// Returns a tensor of the same dimension after transposing the two
+  /// dimensions given. Does not actually move elements; transposition
+  /// is made by permuting the size/stride arrays.
+  /// If the dimensions are not valid, asserts.
+  __host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
+  transpose(int dim1, int dim2) const;
+
+  /// Upcast a tensor of dimension `D` to some tensor of dimension
+  /// D' > D by padding the leading dimensions by 1
+  /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
+  template <int NewDim>
+  __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+  upcastOuter();
+
+  /// Upcast a tensor of dimension `D` to some tensor of dimension
+  /// D' > D by padding the lowest/most varying dimensions by 1
+  /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
+  template <int NewDim>
+  __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+  upcastInner();
+
+  /// Downcast a tensor of dimension `D` to some tensor of dimension
+  /// D' < D by collapsing the leading dimensions. Asserts if there is
+  /// padding on the leading dimensions.
+  template <int NewDim>
+  __host__ __device__
+  THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastOuter();
+
+  /// Downcast a tensor of dimension `D` to some tensor of dimension
+  /// D' < D by collapsing the innermost (most rapidly varying)
+  /// dimensions. Asserts if there is padding on those dimensions.
+  template <int NewDim>
+  __host__ __device__
+  THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastInner();
+
+  /// Returns a tensor that is a view of the `SubDim`-dimensional slice
+  /// of this tensor, starting at `at`.
+  template <int SubDim>
+  __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
+  view(DataPtrType at);
+
+  /// Returns a tensor that is a view of the `SubDim`-dimensional slice
+  /// of this tensor, starting where our data begins
+  template <int SubDim>
+  __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
+  view();
+
+  /// Zeroes out the tensor asynchronously. Asserts if the contents
+  /// in question are not contiguous.
+  void zero(cudaStream_t stream = 0);
+
+ private:
+  /// Raw pointer to where the tensor data begins
+  DataPtrType data_;
+
+  /// Array of strides (in sizeof(T) terms) per each dimension
+  IndexT stride_[Dim];
+
+  /// Size per each dimension
+  IndexT size_[Dim];
+};
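
(A hypothetical kernel illustrating the indexing interface declared above — not part of the upstream file. The kernel and launcher names are invented, and `data` is assumed to point to a dense rows x cols float buffer already resident on the device.)

    #include <cuda_runtime.h>
    #include "THCDeviceTensor.cuh"

    __global__ void scaleKernel(THCDeviceTensor<float, 2> t, float alpha) {
      int row = blockIdx.x;                              // one block per row
      for (int col = threadIdx.x; col < t.getSize(1); col += blockDim.x) {
        t[row][col] = t[row][col] * alpha;               // operator[] chains down to a 0-dim view
      }
    }

    void scaleRows(float* data, int rows, int cols, float alpha, cudaStream_t stream) {
      int sizes[2] = {rows, cols};                       // strides are derived as {cols, 1}
      THCDeviceTensor<float, 2> t(data, sizes);
      scaleKernel<<<rows, 256, 0, stream>>>(t, alpha);
    }
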
+
+namespace detail {
+
+/// Specialization for a view of a single value (0-dimensional)
+template <typename TensorType, template <typename U> class PtrTraits>
+class THCDeviceSubTensor<TensorType, 0, PtrTraits> {
+ public:
+  __host__ __device__ THCDeviceSubTensor<TensorType, 0, PtrTraits>
+  operator=(typename TensorType::DataType val) {
+    *data_ = val;
+    return *this;
+  }
+
+  // operator T&
+  __host__ __device__ operator typename TensorType::DataType&() {
+    return *data_;
+  }
+
+  // const operator T& returning const T&
+  __host__ __device__ operator const typename TensorType::DataType&() const {
+    return *data_;
+  }
+
+  // operator& returning T*
+  __host__ __device__ typename TensorType::DataType* operator&() {
+    return data_;
+  }
+
+  // const operator& returning const T*
+  __host__ __device__ const typename TensorType::DataType* operator&() const {
+    return data_;
+  }
+
+  /// Returns a raw accessor to our slice.
+  __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
+    return data_;
+  }
+
+  /// Returns a raw accessor to our slice (const).
+  __host__ __device__ __forceinline__
+  const typename TensorType::DataPtrType data() const {
+    return data_;
+  }
+
+  /// Cast to a different datatype.
+  template <typename T>
+  __host__ __device__ T& as() {
+    return *dataAs<T>();
+  }
+
+  /// Cast to a different datatype (const).
+  template <typename T>
+  __host__ __device__ const T& as() const {
+    return *dataAs<T>();
+  }
+
+  /// Cast to a different datatype
+  template <typename T>
+  __host__ __device__ __forceinline__
+  typename PtrTraits<T>::PtrType dataAs() {
+    return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
+  }
+
+  /// Cast to a different datatype (const)
+  template <typename T>
+  __host__ __device__ __forceinline__
+  typename PtrTraits<const T>::PtrType dataAs() const {
+    return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
+  }
+
+  /// Use the texture cache for reads
+  __device__ __forceinline__ typename TensorType::DataType ldg() const {
+#if __CUDA_ARCH__ >= 350
+    return __ldg(data_);
+#else
+    return *data_;
+#endif
+  }
+
+  /// Use the texture cache for reads; cast as a particular type
+  template <typename T>
+  __device__ __forceinline__ T ldgAs() const {
+#if __CUDA_ARCH__ >= 350
+    return __ldg(dataAs<T>());
+#else
+    return as<T>();
+#endif
+  }
+
+ private:
+  /// One dimension greater can create us
+  friend class THCDeviceSubTensor<TensorType, 1, PtrTraits>;
+
+  /// Our parent tensor can create us
+  friend class THCDeviceTensor<typename TensorType::DataType,
+                               1,
+                               typename TensorType::IndexType,
+                               PtrTraits>;
+
+  __host__ __device__ __forceinline__ THCDeviceSubTensor(
+    TensorType& t,
+    typename TensorType::DataPtrType data)
+      : tensor_(t),
+        data_(data) {
+  }
+
+  /// The tensor we're referencing
+  TensorType& tensor_;
+
+  /// Where our value is located
+  typename TensorType::DataPtrType const data_;
+};
+
+/// A `SubDim`-rank slice of a parent THCDeviceTensor
+template <typename TensorType,
+          int SubDim,
+          template <typename U> class PtrTraits>
+class THCDeviceSubTensor {
+ public:
+  /// Returns a view of the data located at our offset (the dimension
+  /// `SubDim` - 1 tensor).
+  __host__ __device__ __forceinline__
+  THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
+    operator[](typename TensorType::IndexType index) {
+    return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
+      tensor_,
+      data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
+  }
+
+  /// Returns a view of the data located at our offset (the dimension
+  /// `SubDim` - 1 tensor) (const).
+  __host__ __device__ __forceinline__
+  const THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
+    operator[](typename TensorType::IndexType index) const {
+    return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
+      tensor_,
+      data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
+  }
+
+  // operator& returning T*
+  __host__ __device__ typename TensorType::DataType* operator&() {
+    return data_;
+  }
+
+  // const operator& returning const T*
+  __host__ __device__ const typename TensorType::DataType* operator&() const {
+    return data_;
+  }
+
+  /// Returns a raw accessor to our slice.
+  __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
+    return data_;
+  }
+
+  /// Returns a raw accessor to our slice (const).
+  __host__ __device__ __forceinline__
+  const typename TensorType::DataPtrType data() const {
+    return data_;
+  }
+
+  /// Cast to a different datatype.
+  template <typename T>
+  __host__ __device__ T& as() {
+    return *dataAs<T>();
+  }
+
+  /// Cast to a different datatype (const).
+  template <typename T>
+  __host__ __device__ const T& as() const {
+    return *dataAs<T>();
+  }
+
+  /// Cast to a different datatype
+  template <typename T>
+  __host__ __device__ __forceinline__
+  typename PtrTraits<T>::PtrType dataAs() {
+    return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
+  }
+
+  /// Cast to a different datatype (const)
+  template <typename T>
+  __host__ __device__ __forceinline__
+  typename PtrTraits<const T>::PtrType dataAs() const {
+    return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
+  }
+
+  /// Use the texture cache for reads
+  __device__ __forceinline__ typename TensorType::DataType ldg() const {
+#if __CUDA_ARCH__ >= 350
+    return __ldg(data_);
+#else
+    return *data_;
+#endif
+  }
+
+  /// Use the texture cache for reads; cast as a particular type
+  template <typename T>
+  __device__ __forceinline__ T ldgAs() const {
+#if __CUDA_ARCH__ >= 350
+    return __ldg(dataAs<T>());
+#else
+    return as<T>();
+#endif
+  }
+
+  /// Returns a tensor that is a view of the SubDim-dimensional slice
+  /// of this tensor, starting where our data begins
+  THCDeviceTensor<typename TensorType::DataType,
+               SubDim,
+               typename TensorType::IndexType,
+               PtrTraits> view() {
+    return tensor_.template view<SubDim>(data_);
+  }
+
+ private:
+  /// One dimension greater can create us
+  friend class THCDeviceSubTensor<TensorType, SubDim + 1, PtrTraits>;
+
+  /// Our parent tensor can create us
+  friend class
+  THCDeviceTensor<typename TensorType::DataType,
+               TensorType::NumDim,
+               typename TensorType::IndexType,
+               PtrTraits>;
+
+  __host__ __device__ __forceinline__ THCDeviceSubTensor(
+    TensorType& t,
+    typename TensorType::DataPtrType data)
+      : tensor_(t),
+        data_(data) {
+  }
+
+  /// The tensor we're referencing
+  TensorType& tensor_;
+
+  /// The start of our sub-region
+  typename TensorType::DataPtrType const data_;
+};
+
+} // namespace detail
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ __forceinline__
+detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
+                        Dim - 1, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) {
+  return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
+    detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
+      *this, data_)[index]);
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ __forceinline__
+const detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
+                              Dim - 1, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) const {
+  return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
+    detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
+      const_cast<TensorType&>(*this), data_)[index]);
+}
+
+#include "THCDeviceTensor-inl.cuh"
+
+#endif // THC_DEVICE_TENSOR_INC
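To make the indexing interface above concrete, here is a minimal kernel sketch that is not part of the upstream patch; it assumes the default `int` index type and the `getSize` accessor declared earlier in this header, and leaves the launch configuration to the caller.

    // Illustrative only: element-wise update through the operator[] chain
    // (THCDeviceTensor -> THCDeviceSubTensor -> T&).
    __global__ void addOneKernel(THCDeviceTensor<float, 2> t) {
      int row = blockIdx.y * blockDim.y + threadIdx.y;
      int col = blockIdx.x * blockDim.x + threadIdx.x;
      if (row < t.getSize(0) && col < t.getSize(1)) {
        t[row][col] = t[row][col] + 1.0f;  // resolves via the 0-dim specialization
      }
    }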
diff --git a/lib/THC/THCDeviceTensorUtils-inl.cuh b/lib/THC/THCDeviceTensorUtils-inl.cuh
new file mode 100644
index 0000000..42aab34
--- /dev/null
+++ b/lib/THC/THCDeviceTensorUtils-inl.cuh
@@ -0,0 +1,118 @@
+namespace detail {
+
+// Add a layer of SFINAE to support static_assert
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim, bool B>
+struct UpcastTHCRoot {
+  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+  make(THCState* state, THCudaTensor* t);
+};
+
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim, bool B>
+struct UpcastTHC :
+      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
+};
+
+// Never instantiated; exists for SFINAE purposes only
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim>
+struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
+      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
+};
+
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim>
+struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
+      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true>  {
+  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+  make(THCState* state, THCudaTensor* t) {
+    thc_static_assert(NewDim > Dim);
+    return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
+      template upcastOuter<NewDim>();
+  }
+};
+
+// Add a layer of SFINAE to support static_assert
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim, bool B>
+struct DowncastTHCRoot {
+  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+  make(THCState* state, THCudaTensor* t);
+};
+
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim, bool B>
+struct DowncastTHC :
+      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
+};
+
+// Never instantiated; exists for SFINAE purposes only
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim>
+struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
+      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
+};
+
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim>
+struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
+      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true>  {
+  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+  make(THCState* state, THCudaTensor* t) {
+    thc_static_assert(NewDim < Dim);
+    return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
+      template downcastOuter<NewDim>();
+  }
+};
+
+} // namespace detail
+
+#define SWITCH_UNROLL_CUDA_CAST_FACTORY(i)                              \
+  case i:                                                               \
+  if (NewDim > i) {                                                     \
+    return detail::UpcastTHC<T, i, IndexT,                              \
+                             PtrTraits, NewDim, (NewDim > i)>::         \
+      make(state, t);                                                   \
+  } else if (NewDim == i) {                                             \
+    return toDeviceTensor<T, NewDim, IndexT, PtrTraits>(state, t);      \
+  } else {                                                              \
+    return detail::DowncastTHC<T, i, IndexT,                            \
+                               PtrTraits, NewDim, (NewDim < i)>::       \
+      make(state, t);                                                   \
+  }                                                                     \
+  /* break; */
+
+template <typename T, int NewDim,
+          typename IndexT, template <typename U> class PtrTraits>
+THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+toDeviceTensorCast(THCState* state, THCudaTensor* t) {
+  switch (THCudaTensor_nDimension(state, t)) {
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(1);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(2);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(3);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(4);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(5);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(6);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(7);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(8);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(9);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(10);
+    default:
+      ;
+  }
+
+  // Not implemented
+  THError("THCDeviceTensor dimension size not supported");
+  return NULL; /* unreachable; present only to silence compiler warnings */
+}
+
+#undef SWITCH_UNROLL_CUDA_CAST_FACTORY
diff --git a/lib/THC/THCDeviceTensorUtils.cuh b/lib/THC/THCDeviceTensorUtils.cuh
new file mode 100644
index 0000000..472c6e1
--- /dev/null
+++ b/lib/THC/THCDeviceTensorUtils.cuh
@@ -0,0 +1,33 @@
+#ifndef THC_DEVICE_TENSOR_UTILS_INC
+#define THC_DEVICE_TENSOR_UTILS_INC
+
+#include "THCDeviceTensor.cuh"
+#include "THCTensor.h"
+#include <limits>
+
+/// Constructs a DeviceTensor initialized from a THCudaTensor by
+/// upcasting or downcasting the tensor to that of a different
+/// dimension.
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>
+toDeviceTensorCast(THCState* state, THCudaTensor* t);
+
+template <typename T, int Dim, typename IndexT>
+THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
+toDeviceTensorCast(THCState* state, THCudaTensor* t) {
+  return toDeviceTensorCast<T, Dim, IndexT, DefaultPtrTraits>(state, t);
+}
+
+template <typename T, int Dim>
+THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
+toDeviceTensorCast(THCState* state, THCudaTensor* t) {
+  return toDeviceTensorCast<T, Dim, int, DefaultPtrTraits>(state, t);
+}
+
+#include "generic/THCDeviceTensorUtils.cu"
+#include "THCGenerateAllTypes.h"
+
+#include "THCDeviceTensorUtils-inl.cuh"
+
+#endif // THC_DEVICE_TENSOR_UTILS_INC
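As a usage sketch for the cast helpers declared above (assumptions: a caller-provided `state`, a float `THCudaTensor* t` with between 1 and 10 dimensions, and the hypothetical `addOneKernel` sketched after THCDeviceTensor.cuh; error handling is omitted):

    // Illustrative host-side wrapper, not part of the patch.
    void addOneToMatrix(THCState* state, THCudaTensor* t) {
      // Reinterpret t as a rank-2 device tensor, up/downcasting as needed.
      THCDeviceTensor<float, 2> d = toDeviceTensorCast<float, 2>(state, t);
      dim3 block(16, 16);
      dim3 grid((d.getSize(1) + 15) / 16, (d.getSize(0) + 15) / 16);
      addOneKernel<<<grid, block, 0, THCState_getCurrentStream(state)>>>(d);
    }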
diff --git a/lib/THC/THCDeviceUtils.cuh b/lib/THC/THCDeviceUtils.cuh
new file mode 100644
index 0000000..bd41055
--- /dev/null
+++ b/lib/THC/THCDeviceUtils.cuh
@@ -0,0 +1,36 @@
+#ifndef THC_DEVICE_UTILS_INC
+#define THC_DEVICE_UTILS_INC
+
+/* The largest consecutive integer representable in float32 (2^24) */
+#define FLOAT32_MAX_CONSECUTIVE_INT 16777216.0f
+
+/**
+   Computes ceil(a / b)
+*/
+template <typename T>
+__host__ __device__ __forceinline__ T THCCeilDiv(T a, T b) {
+  return (a + b - 1) / b;
+}
+
+/**
+   Computes ceil(a / b) * b; i.e., rounds up `a` to the next highest
+   multiple of b
+*/
+template <typename T>
+__host__ __device__ __forceinline__ T THCRoundUp(T a, T b) {
+  return THCCeilDiv(a, b) * b;
+}
+
+/**
+ * For CC 3.5+, perform a load using __ldg
+ */
+template <typename T>
+__device__ __forceinline__ T doLdg(const T* p) {
+#if __CUDA_ARCH__ >= 350
+  return __ldg(p);
+#else
+  return *p;
+#endif
+}
+
+#endif // THC_DEVICE_UTILS_INC
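A small worked example of the two helpers above, with illustrative values only (not part of the patch):

    // Typical launch-configuration arithmetic.
    void exampleLaunchMath(void) {
      int n = 1000;
      int threads = 128;
      int blocks = THCCeilDiv(n, threads);  // == 8: enough blocks to cover n
      int padded = THCRoundUp(n, threads);  // == 1024: n rounded up to a
                                            //    multiple of the block size
      (void)blocks;
      (void)padded;
    }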
diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c
new file mode 100644
index 0000000..c442bd8
--- /dev/null
+++ b/lib/THC/THCGeneral.c
@@ -0,0 +1,770 @@
+#include "THCGeneral.h"
+#include "TH.h"
+#include "THCAllocator.h"
+#include "THCCachingHostAllocator.h"
+#include "THCStream.h"
+#include "THCThreadLocal.h"
+#include "THCTensorRandom.h"
+#include <stdlib.h>
+#include <stdint.h>
+
+/* Size of scratch space available in global memory per SM, per stream */
+#define MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM (4 * sizeof(float))
+
+/* Minimum amount of scratch space per device. Total scratch memory per
+ * device is either this amount, or the # of SMs * the space per SM defined
+ * above, whichever is greater. */
+#define MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE (32768 * sizeof(float))
+
+THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
+  THCState *state, int device);
+
+THCState* THCState_alloc(void)
+{
+  THCState* state = (THCState*) malloc(sizeof(THCState));
+  memset(state, 0, sizeof(THCState));
+  return state;
+}
+
+void THCState_free(THCState* state)
+{
+  free(state);
+}
+
+static cudaError_t cudaMallocWrapper(void* ctx, void** devPtr, size_t size, cudaStream_t stream)
+{
+  return cudaMalloc(devPtr, size);
+}
+
+static cudaError_t cudaFreeWrapper(void* ctx, void* devPtr)
+{
+  return cudaFree(devPtr);
+}
+
+static THCDeviceAllocator defaultDeviceAllocator = {
+  &cudaMallocWrapper,
+  NULL,
+  &cudaFreeWrapper,
+  NULL,
+  NULL,
+  NULL
+};
+
+void THCudaInit(THCState* state)
+{
+  if (!state->cudaDeviceAllocator) {
+    state->cudaDeviceAllocator = &defaultDeviceAllocator;
+  }
+  if (!state->cudaHostAllocator) {
+    state->cudaHostAllocator = &THCudaHostAllocator;
+  }
+  if (!state->cudaUVAAllocator) {
+    state->cudaUVAAllocator = &THCUVAAllocator;
+  }
+
+  int numDevices = 0;
+  THCudaCheck(cudaGetDeviceCount(&numDevices));
+  state->numDevices = numDevices;
+
+  int device = 0;
+  THCudaCheck(cudaGetDevice(&device));
+
+  /* Start in the default stream on the current device */
+  state->currentStreams = (THCThreadLocal*) malloc(numDevices * sizeof(THCThreadLocal));
+  for (int i = 0; i < numDevices; ++i) {
+    state->currentStreams[i] = THCThreadLocal_alloc();
+  }
+  state->currentPerDeviceBlasHandle = THCThreadLocal_alloc();
+
+  state->resourcesPerDevice = (THCCudaResourcesPerDevice*)
+    malloc(numDevices * sizeof(THCCudaResourcesPerDevice));
+  memset(state->resourcesPerDevice, 0, numDevices * sizeof(THCCudaResourcesPerDevice));
+
+  state->deviceProperties =
+    (struct cudaDeviceProp*)malloc(numDevices * sizeof(struct cudaDeviceProp));
+
+  state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
+  THCRandom_init(state, numDevices, device);
+
+  // By default, all direct p2p kernel access (besides copy) is disallowed,
+  // since direct access without knowing whether or not a certain operation
+  // should be cross-GPU leads to synchronization errors. The user can opt in
+  // via THCState_setKernelPeerToPeerAccessEnabled, however.
+  state->p2pKernelAccessEnabled = 0;
+
+  // p2pAccessEnabled records if p2p copies are allowed between pairs of
+  // devices. Values include "1" (copy allowed), "0" (copy not allowed), and
+  // "-1" (unknown).
+  state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices);
+  for (int i = 0; i < numDevices; ++i) {
+    state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices);
+    memset(state->p2pAccessEnabled[i], -1, sizeof(int) * numDevices);
+    state->p2pAccessEnabled[i][i] = 1;
+  }
+
+  for (int i = 0; i < numDevices; ++i) {
+    THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, i);
+    THCudaCheck(cudaSetDevice(i));
+    THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i));
+
+    // Allocate space for the NULL stream
+    res->streams = (THCStream**) malloc(sizeof(THCStream*));
+    res->streams[0] = NULL;
+
+    /* The scratch space that we want to have available per device is
+       based on the number of SMs on that device. We guarantee a minimum
+       of 128 KB of space per device, but to future-proof against
+       architectures with very large SM counts, we also guarantee at
+       least 16 bytes per SM. */
+    int numSM = state->deviceProperties[i].multiProcessorCount;
+    size_t sizePerStream =
+      MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE >= numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM ?
+      MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE :
+      numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM;
+    res->scratchSpacePerStream = sizePerStream;
+  }
+
+  /* Restore to previous device */
+  THCudaCheck(cudaSetDevice(device));
+
+  // Unlike CUDA streams, there is no NULL cuBLAS handle. The default THC
+  // cuBLAS handle is the first user BLAS handle. Note that the actual BLAS
+  // handles are created lazily.
+  state->numUserBlasHandles = 1;
+
+  state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically
+  state->heapDelta = 0;
+}
+
+void THCudaShutdown(THCState* state)
+{
+  THCRandom_shutdown(state);
+
+  free(state->rngState);
+  free(state->deviceProperties);
+
+  int deviceCount = 0;
+  int prevDev = -1;
+  THCudaCheck(cudaGetDevice(&prevDev));
+  THCudaCheck(cudaGetDeviceCount(&deviceCount));
+
+  /* cleanup p2p access state */
+  for (int dev = 0; dev < deviceCount; ++dev) {
+    free(state->p2pAccessEnabled[dev]);
+  }
+  free(state->p2pAccessEnabled);
+
+  /* cleanup per-device state */
+  for (int dev = 0; dev < deviceCount; ++dev) {
+    THCudaCheck(cudaSetDevice(dev));
+    THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]);
+    /* Free user reserved streams (0 is the default stream) */
+    for (int i = 1; i <= state->numUserStreams; ++i) {
+      THCStream_free(res->streams[i]);
+    }
+    /* Free user defined BLAS handles */
+    for (int i = 0; i < res->numBlasHandles; ++i) {
+      THCublasCheck(cublasDestroy(res->blasHandles[i]));
+    }
+    /* Free per-stream scratch space; starts at 0 because there is space for
+       the default stream as well */
+    if (res->devScratchSpacePerStream) {
+      for (int stream = 0; stream <= state->numUserStreams; ++stream) {
+        THCudaCheck(THCudaFree(state, res->devScratchSpacePerStream[stream]));
+      }
+    }
+
+    free(res->streams);
+    free(res->blasHandles);
+    free(res->devScratchSpacePerStream);
+    THCStream_free((THCStream*)THCThreadLocal_get(state->currentStreams[dev]));
+    THCThreadLocal_free(state->currentStreams[dev]);
+  }
+  free(state->resourcesPerDevice);
+  if (state->cudaDeviceAllocator->emptyCache) {
+    state->cudaDeviceAllocator->emptyCache(state->cudaDeviceAllocator->state);
+  }
+  if (state->cudaHostAllocator == &THCCachingHostAllocator) {
+    THCCachingHostAllocator_emptyCache();
+  }
+  free(state->currentStreams);
+  THCThreadLocal_free(state->currentPerDeviceBlasHandle);
+
+  THCudaCheck(cudaSetDevice(prevDev));
+}
+
+int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
+{
+  if (dev < 0 || dev >= state->numDevices) {
+    THError("%d is not a device", dev);
+  }
+  if (devToAccess < 0 || devToAccess >= state->numDevices) {
+    THError("%d is not a device", devToAccess);
+  }
+  if (state->p2pAccessEnabled[dev][devToAccess] == -1) {
+    int prevDev = 0;
+    THCudaCheck(cudaGetDevice(&prevDev));
+    THCudaCheck(cudaSetDevice(dev));
+
+    int access = 0;
+    THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));
+    if (access) {
+      cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0);
+      if (err == cudaErrorPeerAccessAlreadyEnabled) {
+        // ignore and clear the error if access was already enabled
+        cudaGetLastError();
+      } else {
+        THCudaCheck(err);
+      }
+      state->p2pAccessEnabled[dev][devToAccess] = 1;
+    } else {
+      state->p2pAccessEnabled[dev][devToAccess] = 0;
+    }
+
+    THCudaCheck(cudaSetDevice(prevDev));
+  }
+  return state->p2pAccessEnabled[dev][devToAccess];
+}
+
+void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
+                                  int enable)
+{
+  /* This will perform device bounds checking for us */
+  int prevEnabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);
+
+  if (enable != prevEnabled) {
+    /* If we're attempting to enable p2p access but p2p access isn't */
+    /* supported, throw an error */
+    if (enable) {
+      int access = 0;
+      THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));
+
+      if (!access) {
+        THError("p2p access not supported for %d accessing %d",
+                dev, devToAccess);
+      }
+    }
+
+    state->p2pAccessEnabled[dev][devToAccess] = enable;
+
+    int prevDev = 0;
+    THCudaCheck(cudaGetDevice(&prevDev));
+    THCudaCheck(cudaSetDevice(dev));
+
+    /* This should be in sync with the current access state */
+    if (enable) {
+      THCudaCheck(cudaDeviceEnablePeerAccess(devToAccess, 0));
+    } else {
+      THCudaCheck(cudaDeviceDisablePeerAccess(devToAccess));
+    }
+
+    THCudaCheck(cudaSetDevice(prevDev));
+  }
+}
+
+int THCState_getKernelPeerToPeerAccessEnabled(THCState* state) {
+  return state->p2pKernelAccessEnabled;
+}
+
+void THCState_setKernelPeerToPeerAccessEnabled(THCState* state, int val) {
+  state->p2pKernelAccessEnabled = val;
+}
+
+struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state)
+{
+  int curDev = -1;
+  THCudaCheck(cudaGetDevice(&curDev));
+
+  return &(state->deviceProperties[curDev]);
+}
+
+struct THCRNGState* THCState_getRngState(THCState *state)
+{
+  return state->rngState;
+}
+
+THAllocator* THCState_getCudaHostAllocator(THCState* state)
+{
+  return state->cudaHostAllocator;
+}
+
+THAllocator* THCState_getCudaUVAAllocator(THCState* state)
+{
+  return state->cudaUVAAllocator;
+}
+
+void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator)
+{
+  state->cudaDeviceAllocator = allocator;
+}
+
+int THCState_getNumDevices(THCState *state)
+{
+  return state->numDevices;
+}
+
+static void THCState_initializeScratchSpace(THCState* state, int dev)
+{
+  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
+  if (res->devScratchSpacePerStream) {
+    return;
+  }
+  size_t size = (state->numUserStreams + 1) * sizeof(void*);
+  void** scratch = (void**)malloc(size);
+  for (int i = 0; i <= state->numUserStreams; ++i) {
+    THCudaCheck(THCudaMalloc(state, &scratch[i], res->scratchSpacePerStream));
+  }
+  res->devScratchSpacePerStream = scratch;
+}
+
+void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking)
+{
+  if (numStreams <= state->numUserStreams)
+  {
+    return;
+  }
+
+  int prevDev = -1;
+  THCudaCheck(cudaGetDevice(&prevDev));
+
+  /* Otherwise, we have to allocate a new set of streams and stream data */
+  for (int dev = 0; dev < state->numDevices; ++dev) {
+    THCudaCheck(cudaSetDevice(dev));
+    THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
+
+    /* +1 for the default stream as well */
+    THCStream** newStreams = realloc(res->streams, (numStreams + 1) * sizeof(THCStream*));
+    THAssert(newStreams);
+
+    THCState_initializeScratchSpace(state, dev);
+    void** newScratchSpace = realloc(res->devScratchSpacePerStream, (numStreams + 1) * sizeof(void*));
+    THAssert(newScratchSpace);
+
+    /* Allocate new stream resources */
+    size_t scratchSpaceSize = THCState_getDeviceScratchSpaceSize(state, dev);
+    unsigned int flags =
+      nonBlocking ? cudaStreamNonBlocking : cudaStreamDefault;
+
+    for (int stream = state->numUserStreams + 1; stream <= numStreams; ++stream) {
+      newStreams[stream] = THCStream_new(flags);
+      newScratchSpace[stream] = NULL;
+      THCudaCheck(THCudaMalloc(state, &newScratchSpace[stream], scratchSpaceSize));
+    }
+
+    res->streams = newStreams;
+    res->devScratchSpacePerStream = newScratchSpace;
+  }
+
+  state->numUserStreams = numStreams;
+
+  THCudaCheck(cudaSetDevice(prevDev));
+}
+
+void THCState_reserveDeviceBlasHandles(THCState* state, int device, int numBlasHandles)
+{
+  int prevDev = -1;
+  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+  if (numBlasHandles <= res->numBlasHandles) {
+    return;
+  }
+
+  THCudaCheck(cudaGetDevice(&prevDev));
+  THCudaCheck(cudaSetDevice(device));
+
+  size_t size = numBlasHandles * sizeof(cublasHandle_t);
+  cublasHandle_t* handles = (cublasHandle_t*) realloc(res->blasHandles, size);
+  for (int i = res->numBlasHandles; i < numBlasHandles; ++i) {
+    handles[i] = NULL;
+    THCublasCheck(cublasCreate(&handles[i]));
+  }
+  res->blasHandles = handles;
+  res->numBlasHandles = numBlasHandles;
+
+  THCudaCheck(cudaSetDevice(prevDev));
+}
+
+void THCState_reserveBlasHandles(THCState* state, int numBlasHandles)
+{
+  // cuBLAS handles are created lazily from THCState_getDeviceBlasHandle
+  // to avoid initializing unused devices
+  if (numBlasHandles > state->numUserBlasHandles)
+  {
+    state->numUserBlasHandles = numBlasHandles;
+  }
+}
+
+int THCState_getNumStreams(THCState* state)
+{
+  return state->numUserStreams;
+}
+
+int THCState_getNumBlasHandles(THCState* state)
+{
+  return state->numUserBlasHandles;
+}
+
+THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
+  THCState *state, int device)
+{
+  /* `device` is a CUDA index */
+  if (device >= state->numDevices || device < 0)
+  {
+    THError("%d is not a device", device + 1 /* back to Torch index */);
+  }
+
+  return &(state->resourcesPerDevice[device]);
+}
+
+cudaStream_t THCState_getDeviceStream(THCState *state, int device, int streamIndex)
+{
+  if (streamIndex > state->numUserStreams || streamIndex < 0)
+  {
+    THError("%d is not a stream", streamIndex);
+  }
+  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+  THCStream* stream = res->streams[streamIndex];
+  return stream ? stream->stream : NULL;
+}
+
+cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle)
+{
+  if (handle <= 0 || handle > state->numUserBlasHandles) {
+    THError("%d is not a valid handle, valid range is: (1, %d)",
+            handle, state->numUserBlasHandles);
+  }
+  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+  THCState_reserveDeviceBlasHandles(state, device, handle);
+  return res->blasHandles[handle - 1];
+}
+
+static THCStream* THCState_getStreamOnDevice(THCState* state, int device)
+{
+  return (THCStream*) THCThreadLocal_get(state->currentStreams[device]);
+}
+
+static void THCState_setStreamOnDevice(THCState *state, int device, THCStream *stream)
+{
+  if (stream) {
+    if (stream->device != device) {
+      THError("invalid stream; expected stream for device %d, but was on %d",
+          device, stream->device);
+    }
+    THCStream_retain(stream);
+  }
+  THCThreadLocal local = state->currentStreams[device];
+  THCStream_free((THCStream*)THCThreadLocal_get(local));
+  THCThreadLocal_set(local, stream);
+}
+
+cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device)
+{
+  THCStream* stream = THCState_getStreamOnDevice(state, device);
+  return stream ? stream->stream : NULL;
+}
+
+cudaStream_t THCState_getCurrentStream(THCState *state)
+{
+  /* This is called at the point of kernel execution.
+     For some debugging code or improperly instrumented kernels,
+     `state` is null */
+  if (state) {
+    int device;
+    THCudaCheck(cudaGetDevice(&device));
+    return THCState_getCurrentStreamOnDevice(state, device);
+  } else {
+    /* assume default stream */
+    return NULL;
+  }
+}
+
+cublasHandle_t THCState_getCurrentBlasHandle(THCState *state)
+{
+  /* This is called at the point of kernel execution.
+     For some debugging code or improperly instrumented kernels,
+     `state` is null */
+  if (state) {
+    int device;
+    THCudaCheck(cudaGetDevice(&device));
+
+    int handle = THCState_getCurrentBlasHandleIndex(state);
+    return THCState_getDeviceBlasHandle(state, device, handle);
+  }
+  THError("THCState and blasHandles must be set as there is no default blasHandle");
+  return NULL;
+}
+
+int THCState_getCurrentStreamIndex(THCState *state)
+{
+  THCStream* stream = THCState_getStream(state);
+  if (!stream) {
+    return 0;
+  }
+
+  int device;
+  THCudaCheck(cudaGetDevice(&device));
+  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+  for (int i = 0; i <= state->numUserStreams; ++i) {
+    if (res->streams[i] == stream) {
+      return i;
+    }
+  }
+
+  return -1;
+}
+
+int THCState_getCurrentBlasHandleIndex(THCState *state)
+{
+  void* value = THCThreadLocal_get(state->currentPerDeviceBlasHandle);
+  if (value == NULL) {
+    return 1;
+  }
+  return (int) (intptr_t) value;
+}
+
+THCStream* THCState_getStream(THCState *state)
+{
+  int device;
+  THCudaCheck(cudaGetDevice(&device));
+  return THCState_getStreamOnDevice(state, device);
+}
+
+void THCState_setStream(THCState *state, THCStream *stream)
+{
+  int device;
+  THCudaCheck(cudaGetDevice(&device));
+  THCState_setStreamOnDevice(state, device, stream);
+}
+
+void THCState_setCurrentStreamIndex(THCState *state, int streamIndex)
+{
+  if (streamIndex < 0 || streamIndex > state->numUserStreams) {
+    THError("%d is not a valid stream, valid range is: (0, %d)", streamIndex,
+        state->numUserStreams);
+  }
+
+  int device;
+  for (device = 0; device < state->numDevices; ++device) {
+    THCStream* stream = NULL;
+    if (streamIndex != 0) {
+      THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+      stream = res->streams[streamIndex];
+    }
+
+    THCState_setStreamOnDevice(state, device, stream);
+  }
+}
+
+void THCState_setCurrentBlasHandleIndex(THCState *state, int handle)
+{
+  if (handle > state->numUserBlasHandles || handle <= 0)
+  {
+    THError("%d is not a valid handle, valid range is: (1, %d)",
+            handle, state->numUserBlasHandles);
+  }
+  THCThreadLocal_set(state->currentPerDeviceBlasHandle, (void*)(intptr_t)handle);
+}
+
+void* THCState_getCurrentDeviceScratchSpace(THCState* state)
+{
+  int device = -1;
+  THCudaCheck(cudaGetDevice(&device));
+  int stream = THCState_getCurrentStreamIndex(state);
+  if (stream < 0) {
+    // new stream API
+    return NULL;
+  }
+  return THCState_getDeviceScratchSpace(state, device, stream);
+}
+
+void* THCState_getDeviceScratchSpace(THCState* state, int dev, int stream)
+{
+  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
+  if (stream > state->numUserStreams || stream < 0) {
+    THError("%d is not a stream", stream);
+  }
+  THCState_initializeScratchSpace(state, dev);
+  return res->devScratchSpacePerStream[stream];
+}
+
+size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state)
+{
+  int device = -1;
+  THCudaCheck(cudaGetDevice(&device));
+  return THCState_getDeviceScratchSpaceSize(state, device);
+}
+
+size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device)
+{
+  THCCudaResourcesPerDevice* res =
+    THCState_getDeviceResourcePtr(state, device);
+
+  return res->scratchSpacePerStream;
+}
+
+void __THCudaCheck(cudaError_t err, const char *file, const int line)
+{
+  if(err != cudaSuccess)
+  {
+    static int alreadyFailed = 0;
+    if(!alreadyFailed) {
+      fprintf(stderr, "THCudaCheck FAIL file=%s line=%i error=%i : %s\n", file, line, err, cudaGetErrorString(err));
+      alreadyFailed = 1;
+    }
+    _THError(file, line, "cuda runtime error (%d) : %s", err,
+             cudaGetErrorString(err));
+  }
+}
+
+void __THCudaCheckWarn(cudaError_t err, const char *file, const int line)
+{
+  if(err != cudaSuccess)
+  {
+    fprintf(stderr, "THCudaCheckWarn FAIL file=%s line=%i error=%i : %s\n", file, line, err, cudaGetErrorString(err));
+  }
+}
+
+void __THCublasCheck(cublasStatus_t status, const char *file, const int line)
+{
+  if(status != CUBLAS_STATUS_SUCCESS)
+  {
+    const char* errmsg = NULL;
+
+    switch(status)
+    {
+      case CUBLAS_STATUS_NOT_INITIALIZED:
+        errmsg = "library not initialized";
+        break;
+
+      case CUBLAS_STATUS_ALLOC_FAILED:
+        errmsg = "resource allocation failed";
+        break;
+
+      case CUBLAS_STATUS_INVALID_VALUE:
+        errmsg = "an invalid numeric value was used as an argument";
+        break;
+
+      case CUBLAS_STATUS_ARCH_MISMATCH:
+        errmsg = "an absent device architectural feature is required";
+        break;
+
+      case CUBLAS_STATUS_MAPPING_ERROR:
+        errmsg = "an access to GPU memory space failed";
+        break;
+
+      case CUBLAS_STATUS_EXECUTION_FAILED:
+        errmsg = "the GPU program failed to execute";
+        break;
+
+      case CUBLAS_STATUS_INTERNAL_ERROR:
+        errmsg = "an internal operation failed";
+        break;
+
+      default:
+        errmsg = "unknown error";
+        break;
+    }
+
+    _THError(file, line, "cublas runtime error : %s", errmsg);
+  }
+}
+
+static ptrdiff_t heapSize = 0; // not thread-local
+static const ptrdiff_t heapMaxDelta = (ptrdiff_t)1e6;
+static const ptrdiff_t heapMinDelta = (ptrdiff_t)-1e6;
+static const double heapSoftmaxGrowthThresh = 0.8; // grow softmax if >80% max after GC
+static const double heapSoftmaxGrowthFactor = 1.4; // grow softmax by 40%
+
+void THCSetGCHandler(THCState *state, void (*cutorchGCFunction_)(void *data), void *data )
+{
+  state->cutorchGCFunction = cutorchGCFunction_;
+  state->cutorchGCData = data;
+}
+
+cudaError_t THCudaMalloc(THCState *state, void** ptr, size_t size)
+{
+  THCudaCheck(cudaGetLastError());
+  cudaStream_t stream = THCState_getCurrentStream(state);
+  THCDeviceAllocator* allocator = state->cudaDeviceAllocator;
+  cudaError_t err = allocator->malloc(allocator->state, ptr, size, stream);
+  if (state->cutorchGCFunction != NULL && err != cudaSuccess) {
+    cudaGetLastError(); // reset OOM error
+    (state->cutorchGCFunction)(state->cutorchGCData);
+    err = allocator->malloc(allocator->state, ptr, size, stream);
+  }
+  return err;
+}
+
+cudaError_t THCudaFree(THCState *state, void *ptr)
+{
+  THCDeviceAllocator* allocator = state->cudaDeviceAllocator;
+  return allocator->free(allocator->state, ptr);
+}
+
+cudaError_t THCudaMemGetInfo(THCState *state,  size_t* freeBytes, size_t* totalBytes)
+{
+  size_t cachedBytes = 0;
+  size_t largestBlock = 0;
+  THCDeviceAllocator* allocator = state->cudaDeviceAllocator;
+
+  /* get info from CUDA first */
+  cudaError_t ret = cudaMemGetInfo(freeBytes, totalBytes);
+  if (ret != cudaSuccess)
+    return ret;
+
+  int device;
+  ret = cudaGetDevice(&device);
+  if (ret != cudaSuccess)
+    return ret;
+
+  /* not always true - our optimistic guess here */
+  largestBlock = *freeBytes;
+
+  if (allocator->cacheInfo != NULL)
+    allocator->cacheInfo(allocator->state, device, &cachedBytes, &largestBlock);
+
+  /* Adjust the resulting free byte count. largestBlock is unused for now */
+  *freeBytes += cachedBytes;
+  return cudaSuccess;
+}
+
+static ptrdiff_t applyHeapDelta(THCState *state) {
+  ptrdiff_t newHeapSize = THAtomicAddPtrdiff(&heapSize, state->heapDelta) + state->heapDelta;
+  state->heapDelta = 0;
+  return newHeapSize;
+}
+
+// Here we maintain a dynamic softmax threshold for THC-allocated storages.
+// When THC heap size goes above this softmax, the GC hook is triggered.
+// If heap size is above 80% of the softmax after GC, then the softmax is
+// increased.
+static void maybeTriggerGC(THCState *state, ptrdiff_t curHeapSize) {
+  if (state->cutorchGCFunction != NULL && curHeapSize > state->heapSoftmax) {
+    (state->cutorchGCFunction)(state->cutorchGCData);
+
+    // ensure heapSize is accurate before updating heapSoftmax
+    ptrdiff_t newHeapSize = applyHeapDelta(state);
+
+    if (newHeapSize > state->heapSoftmax * heapSoftmaxGrowthThresh) {
+      state->heapSoftmax = (ptrdiff_t)state->heapSoftmax * heapSoftmaxGrowthFactor;
+    }
+  }
+}
+
+void THCHeapUpdate(THCState *state, ptrdiff_t size) {
+  state->heapDelta += size;
+  // batch updates to global heapSize to minimize thread contention
+  if (state->heapDelta < heapMaxDelta && state->heapDelta > heapMinDelta) {
+    return;
+  }
+
+  ptrdiff_t newHeapSize = applyHeapDelta(state);
+  if (size > 0) {
+    maybeTriggerGC(state, newHeapSize);
+  }
+}
+
+#undef MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM
+#undef MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE
+
+#include "THCStorage.c"
+#include "THCAllocator.c"
diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in
new file mode 100644
index 0000000..a88bd7d
--- /dev/null
+++ b/lib/THC/THCGeneral.h.in
@@ -0,0 +1,191 @@
+#ifndef THC_GENERAL_INC
+#define THC_GENERAL_INC
+
+#include "THGeneral.h"
+#include "THAllocator.h"
+#include "THCThreadLocal.h"
+#undef log1p
+
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include "cublas_v2.h"
+
+#cmakedefine USE_MAGMA
+
+#ifdef __cplusplus
+# define THC_EXTERNC extern "C"
+#else
+# define THC_EXTERNC extern
+#endif
+
+#ifdef _WIN32
+# ifdef THC_EXPORTS
+#  define THC_API THC_EXTERNC __declspec(dllexport)
+#  define THC_CLASS __declspec(dllexport)
+# else
+#  define THC_API THC_EXTERNC __declspec(dllimport)
+#  define THC_CLASS __declspec(dllimport)
+# endif
+#else
+# define THC_API THC_EXTERNC
+# define THC_CLASS
+#endif
+
+#ifndef THAssert
+#define THAssert(exp)                                                   \
+  do {                                                                  \
+    if (!(exp)) {                                                       \
+      _THError(__FILE__, __LINE__, "assert(%s) failed", #exp);          \
+    }                                                                   \
+  } while(0)
+#endif
+
+struct THCRNGState;  /* Random number generator state. */
+typedef struct THCStream THCStream;
+typedef struct THCState THCState;
+
+typedef struct _THCDeviceAllocator {
+   cudaError_t (*malloc)( void*, void**, size_t,         cudaStream_t);
+   cudaError_t (*realloc)(void*, void**, size_t, size_t, cudaStream_t);
+   cudaError_t (*free)(void*, void*);
+   cudaError_t (*emptyCache)(void*);
+   cudaError_t  (*cacheInfo)(void*, int, size_t*, size_t*);
+   void* state;
+} THCDeviceAllocator;
+
+typedef struct _THCCudaResourcesPerDevice {
+  THCStream** streams;
+  /* Number of materialized cuBLAS handles */
+  int numBlasHandles;
+  /* cuBLAS handles are lazily initialized */
+  cublasHandle_t* blasHandles;
+  /* Size of scratch space available per stream on this device */
+  size_t scratchSpacePerStream;
+  /* Device-resident scratch space per stream, used for global memory
+     reduction kernels. Lazily initialized. */
+  void** devScratchSpacePerStream;
+} THCCudaResourcesPerDevice;
+
+
+/* Global state to be held in the cutorch table. */
+struct THCState {
+  struct THCRNGState* rngState;
+  struct cudaDeviceProp* deviceProperties;
+  /* Set of all allocated resources. resourcesPerDevice[dev].streams[0] is NULL,
+     which specifies the per-device default stream. blasHandles do not have a
+     default and must be explicitly initialized. We always initialize 1
+     blasHandle but we can use more.
+  */
+  THCCudaResourcesPerDevice* resourcesPerDevice;
+  /* Captured number of devices upon startup; convenience for bounds checking */
+  int numDevices;
+  /* Number of Torch defined resources available, indices 1 ... numStreams */
+  int numUserStreams;
+  int numUserBlasHandles;
+
+  /* Allocator using cudaMallocHost. */
+  THAllocator* cudaHostAllocator;
+  THAllocator* cudaUVAAllocator;
+  THCDeviceAllocator* cudaDeviceAllocator;
+
+  /* Index of the current selected BLAS handle. The actual BLAS handle used
+     depends on the current device. */
+  THCThreadLocal/*<int>*/ currentPerDeviceBlasHandle;
+  /* Array of thread locals containing the current stream for each device */
+  THCThreadLocal* currentStreams;
+
+  /* Table of enabled peer-to-peer access between directed pairs of GPUs.
+     If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */
+  int** p2pAccessEnabled;
+
+  /* Is direct cross-kernel p2p access allowed? Normally, only cross-GPU
+     copies are allowed via p2p if p2p access is enabled at all for
+     the pair of GPUs in question, but if this flag is true, then
+     all cross-GPU access checks are disabled, allowing kernels to
+     directly access memory on other GPUs.
+     Note that p2p access must exist and be enabled for the pair of
+     GPUs in question. */
+  int p2pKernelAccessEnabled;
+
+  void (*cutorchGCFunction)(void *data);
+  void *cutorchGCData;
+  ptrdiff_t heapSoftmax;
+  ptrdiff_t heapDelta;
+};
+
+THC_API THCState* THCState_alloc(void);
+THC_API void THCState_free(THCState* state);
+
+THC_API void THCudaInit(THCState* state);
+THC_API void THCudaShutdown(THCState* state);
+
+/* If device `dev` can access allocations on device `devToAccess`, this will return */
+/* 1; otherwise, 0. */
+THC_API int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess);
+/* Enables or disables allowed p2p access using cutorch copy. If we are */
+/* attempting to enable access, throws an error if CUDA cannot enable p2p */
+/* access. */
+THC_API void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
+                                          int enable);
+
+/* By default, direct in-kernel access to memory on remote GPUs is
+   disabled. When set, this allows direct in-kernel access to remote
+   GPUs where GPU/GPU p2p access is enabled and allowed. */
+THC_API int THCState_getKernelPeerToPeerAccessEnabled(THCState* state);
+THC_API void THCState_setKernelPeerToPeerAccessEnabled(THCState* state, int val);
+
+THC_API struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state);
+
+THC_API struct THCRNGState* THCState_getRngState(THCState* state);
+THC_API THAllocator* THCState_getCudaHostAllocator(THCState* state);
+THC_API THAllocator* THCState_getCudaUVAAllocator(THCState* state);
+THC_API void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator);
+
+THC_API void THCMagma_init(THCState *state);
+
+/* State manipulators and accessors */
+THC_API int THCState_getNumDevices(THCState* state);
+THC_API void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking);
+THC_API int THCState_getNumStreams(THCState* state);
+
+/* Stream API */
+THC_API cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device);
+THC_API cudaStream_t THCState_getCurrentStream(THCState *state);
+THC_API struct THCStream* THCState_getStream(THCState *state);
+THC_API void THCState_setStream(THCState *state, struct THCStream* stream);
+/* deprecated stream API */
+THC_API cudaStream_t THCState_getDeviceStream(THCState *state, int device, int stream);
+THC_API int THCState_getCurrentStreamIndex(THCState *state);
+THC_API void THCState_setCurrentStreamIndex(THCState *state, int stream);
+
+THC_API void THCState_reserveBlasHandles(THCState* state, int numHandles);
+THC_API int THCState_getNumBlasHandles(THCState* state);
+
+THC_API cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle);
+THC_API cublasHandle_t THCState_getCurrentBlasHandle(THCState *state);
+THC_API int THCState_getCurrentBlasHandleIndex(THCState *state);
+THC_API void THCState_setCurrentBlasHandleIndex(THCState *state, int handle);
+
+/* For the current device and stream, returns the allocated scratch space */
+THC_API void* THCState_getCurrentDeviceScratchSpace(THCState* state);
+THC_API void* THCState_getDeviceScratchSpace(THCState* state, int device, int stream);
+THC_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state);
+THC_API size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device);
+
+#define THCudaCheck(err)  __THCudaCheck(err, __FILE__, __LINE__)
+#define THCudaCheckWarn(err)  __THCudaCheckWarn(err, __FILE__, __LINE__)
+#define THCublasCheck(err)  __THCublasCheck(err,  __FILE__, __LINE__)
+
+THC_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
+THC_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
+THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
+
+THC_API cudaError_t THCudaMalloc(THCState *state, void **ptr, size_t size);
+THC_API cudaError_t THCudaFree(THCState *state, void *ptr);
+THC_API cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes);
+THC_API void THCSetGCHandler(THCState *state,
+                             void (*torchGCHandlerFunction)(void *data),
+                             void *data );
+THC_API void THCHeapUpdate(THCState *state, ptrdiff_t size);
+
+#endif
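A minimal lifecycle sketch tying the declarations above together (not part of the patch; error handling is omitted and the stream index is arbitrary):

    /* All calls below are declared in THCGeneral.h. */
    void exampleLifecycle(void)
    {
      THCState* state = THCState_alloc();
      THCudaInit(state);

      THCState_reserveStreams(state, 4, 0 /* blocking streams */);
      THCState_setCurrentStreamIndex(state, 1);  /* leave the default stream */
      cudaStream_t s = THCState_getCurrentStream(state);
      (void)s;

      THCudaShutdown(state);
      THCState_free(state);
    }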
diff --git a/lib/THC/THCGenerateAllTypes.h b/lib/THC/THCGenerateAllTypes.h
new file mode 100644
index 0000000..27a8bd2
--- /dev/null
+++ b/lib/THC/THCGenerateAllTypes.h
@@ -0,0 +1,37 @@
+#ifndef THC_GENERIC_FILE
+#error "You must define THC_GENERIC_FILE before including THGenerateAllTypes.h"
+#endif
+
+#define THCGenerateAllTypes
+
+#define THCTypeIdxByte   1
+#define THCTypeIdxChar   2
+#define THCTypeIdxShort  3
+#define THCTypeIdxInt    4
+#define THCTypeIdxLong   5
+#define THCTypeIdxFloat  6
+#define THCTypeIdxDouble 7
+#define THCTypeIdxHalf   8
+#define THCTypeIdx_(T) TH_CONCAT_2(THCTypeIdx,T)
+
+#include "THCGenerateByteType.h"
+#include "THCGenerateCharType.h"
+#include "THCGenerateShortType.h"
+#include "THCGenerateIntType.h"
+#include "THCGenerateLongType.h"
+#include "THCGenerateHalfType.h"
+#include "THCGenerateFloatType.h"
+#include "THCGenerateDoubleType.h"
+
+#undef THCTypeIdxByte
+#undef THCTypeIdxChar
+#undef THCTypeIdxShort
+#undef THCTypeIdxInt
+#undef THCTypeIdxLong
+#undef THCTypeIdxFloat
+#undef THCTypeIdxDouble
+#undef THCTypeIdxHalf
+#undef THCTypeIdx_
+
+#undef THCGenerateAllTypes
+#undef THC_GENERIC_FILE
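The generator headers are consumed in an X-macro style: a "generic" file is textually re-included once per type with `real`, `accreal`, `Real`, and `CReal` rebound on each pass. A hypothetical example follows (the file names are invented; the `THCTensor_()` name-expansion macro is assumed to come from THCTensor.h and `TH_CONCAT_4` from THGeneral.h):

    /* generic/Fill.h (hypothetical generic file) */
    #ifndef THC_GENERIC_FILE
    #define THC_GENERIC_FILE "generic/Fill.h"
    #else
    THC_API void THCTensor_(fill)(THCState *state, real value);
    #endif

    /* Fill.h (hypothetical consumer): expands the declaration once per type,
       producing THCudaTensor_fill, THCudaByteTensor_fill, and so on. */
    #include "generic/Fill.h"
    #include "THCGenerateAllTypes.h"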
diff --git a/lib/THC/THCGenerateByteType.h b/lib/THC/THCGenerateByteType.h
new file mode 100644
index 0000000..c1cb415
--- /dev/null
+++ b/lib/THC/THCGenerateByteType.h
@@ -0,0 +1,20 @@
+#ifndef THC_GENERIC_FILE
+#error "You must define THC_GENERIC_FILE before including THGenerateByteType.h"
+#endif
+
+#define real unsigned char
+#define accreal long
+#define Real Byte
+#define CReal CudaByte
+#define THC_REAL_IS_BYTE
+#line 1 THC_GENERIC_FILE
+#include THC_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef CReal
+#undef THC_REAL_IS_BYTE
+
+#ifndef THCGenerateAllTypes
+#undef THC_GENERIC_FILE
+#endif
diff --git a/lib/THC/THCGenerateCharType.h b/lib/THC/THCGenerateCharType.h
new file mode 100644
index 0000000..f16a3ca
--- /dev/null
+++ b/lib/THC/THCGenerateCharType.h
@@ -0,0 +1,20 @@
+#ifndef THC_GENERIC_FILE
+#error "You must define THC_GENERIC_FILE before including THGenerateCharType.h"
+#endif
+
+#define real char
+#define accreal long
+#define Real Char
+#define CReal CudaChar
+#define THC_REAL_IS_CHAR
+#line 1 THC_GENERIC_FILE
+#include THC_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef CReal
+#undef THC_REAL_IS_CHAR
+
+#ifndef THCGenerateAllTypes
+#undef THC_GENERIC_FILE
+#endif
diff --git a/lib/THC/THCGenerateDoubleType.h b/lib/THC/THCGenerateDoubleType.h
new file mode 100644
index 0000000..fdf6a8e
--- /dev/null
+++ b/lib/THC/THCGenerateDoubleType.h
@@ -0,0 +1,22 @@
+#ifndef THC_GENERIC_FILE
+#error "You must define THC_GENERIC_FILE before including THGenerateDoubleType.h"
+#endif
+
+#define real double
+#define accreal double
+#define Real Double
+#define CReal CudaDouble
+#define THC_REAL_IS_DOUBLE
+#line 1 THC_GENERIC_FILE
+#include THC_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef CReal
+#undef THC_REAL_IS_DOUBLE
+
+#ifndef THCGenerateAllTypes
+#ifndef THCGenerateFloatTypes
+#undef THC_GENERIC_FILE
+#endif
+#endif
diff --git a/lib/THC/THCGenerateFloatType.h b/lib/THC/THCGenerateFloatType.h
new file mode 100644
index 0000000..997988d
--- /dev/null
+++ b/lib/THC/THCGenerateFloatType.h
@@ -0,0 +1,24 @@
+#ifndef THC_GENERIC_FILE
+#error "You must define THC_GENERIC_FILE before including THGenerateFloatType.h"
+#endif
+
+#define real float
+/* FIXME: fp64 has bad performance on some platforms; avoid using it unless
+   we opt into it? */
+#define accreal float
+#define Real Float
+#define CReal Cuda
+#define THC_REAL_IS_FLOAT
+#line 1 THC_GENERIC_FILE
+#include THC_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef CReal
+#undef THC_REAL_IS_FLOAT
+
+#ifndef THCGenerateAllTypes
+#ifndef THCGenerateFloatTypes
+#undef THC_GENERIC_FILE
+#endif
+#endif
diff --git a/lib/THC/THCGenerateFloatTypes.h b/lib/THC/THCGenerateFloatTypes.h
new file mode 100644
index 0000000..11bf46d
--- /dev/null
+++ b/lib/THC/THCGenerateFloatTypes.h
@@ -0,0 +1,32 @@
+#ifndef THC_GENERIC_FILE
+#error "You must define THC_GENERIC_FILE before including THGenerateFloatTypes.h"
+#endif
+
+#define THCGenerateFloatTypes
+
+#define THCTypeIdxByte   1
+#define THCTypeIdxChar   2
+#define THCTypeIdxShort  3
+#define THCTypeIdxInt    4
+#define THCTypeIdxLong   5
+#define THCTypeIdxFloat  6
+#define THCTypeIdxDouble 7
+#define THCTypeIdxHalf   8
+#define THCTypeIdx_(T) TH_CONCAT_2(THCTypeIdx,T)
+
+#include "THCGenerateHalfType.h"
+#include "THCGenerateFloatType.h"
+#include "THCGenerateDoubleType.h"
+
+#undef THCTypeIdxByte
+#undef THCTypeIdxChar
+#undef THCTypeIdxShort
+#undef THCTypeIdxInt
+#undef THCTypeIdxLong
+#undef THCTypeIdxFloat
+#undef THCTypeIdxDouble
+#undef THCTypeIdxHalf
+#undef THCTypeIdx_
+
+#undef THCGenerateFloatTypes
+#undef THC_GENERIC_FILE
diff --git a/lib/THC/THCGenerateHalfType.h b/lib/THC/THCGenerateHalfType.h
new file mode 100644
index 0000000..77d4c0a
--- /dev/null
+++ b/lib/THC/THCGenerateHalfType.h
@@ -0,0 +1,38 @@
+#ifndef THC_GENERIC_FILE
+#error "You must define THC_GENERIC_FILE before including THGenerateHalfType.h"
+#endif
+
+#include "THCHalf.h"
+
+#if defined(CUDA_HALF_TENSOR) || defined(FORCE_TH_HALF)
+
+#define real half
+#define accreal float
+#define Real Half
+
+// if only here via FORCE_TH_HALF, don't define CReal since
+// FORCE_TH_HALF should only be used for TH types
+#ifdef CUDA_HALF_TENSOR
+#define CReal CudaHalf
+#endif
+
+#define THC_REAL_IS_HALF
+#line 1 THC_GENERIC_FILE
+#include THC_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+
+#ifdef CUDA_HALF_TENSOR
+#undef CReal
+#endif
+
+#undef THC_REAL_IS_HALF
+
+#endif // defined(CUDA_HALF_TENSOR) || defined(FORCE_TH_HALF)
+
+#ifndef THCGenerateAllTypes
+#ifndef THCGenerateFloatTypes
+#undef THC_GENERIC_FILE
+#endif
+#endif
diff --git a/lib/THC/THCGenerateIntType.h b/lib/THC/THCGenerateIntType.h
new file mode 100644
index 0000000..41ca248
--- /dev/null
+++ b/lib/THC/THCGenerateIntType.h
@@ -0,0 +1,20 @@
+#ifndef THC_GENERIC_FILE
+#error "You must define THC_GENERIC_FILE before including THGenerateIntType.h"
+#endif
+
+#define real int
+#define accreal long
+#define Real Int
+#define CReal CudaInt
+#define THC_REAL_IS_INT
+#line 1 THC_GENERIC_FILE
+#include THC_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef CReal
+#undef THC_REAL_IS_INT
+
+#ifndef THCGenerateAllTypes
+#undef THC_GENERIC_FILE
+#endif
diff --git a/lib/THC/THCGenerateLongType.h b/lib/THC/THCGenerateLongType.h
new file mode 100644
index 0000000..fb0dce4
--- /dev/null
+++ b/lib/THC/THCGenerateLongType.h
@@ -0,0 +1,20 @@
+#ifndef THC_GENERIC_FILE
+#error "You must define THC_GENERIC_FILE before including THGenerateLongType.h"
+#endif
+
+#define real long
+#define accreal long
+#define Real Long
+#define CReal CudaLong
+#define THC_REAL_IS_LONG
+#line 1 THC_GENERIC_FILE
+#include THC_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef CReal
+#undef THC_REAL_IS_LONG
+
+#ifndef THCGenerateAllTypes
+#undef THC_GENERIC_FILE
+#endif
diff --git a/lib/THC/THCGenerateShortType.h b/lib/THC/THCGenerateShortType.h
new file mode 100644
index 0000000..ae85f8c
--- /dev/null
+++ b/lib/THC/THCGenerateShortType.h
@@ -0,0 +1,20 @@
+#ifndef THC_GENERIC_FILE
+#error "You must define THC_GENERIC_FILE before including THGenerateShortType.h"
+#endif
+
+#define real short
+#define accreal long
+#define Real Short
+#define CReal CudaShort
+#define THC_REAL_IS_SHORT
+#line 1 THC_GENERIC_FILE
+#include THC_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef CReal
+#undef THC_REAL_IS_SHORT
+
+#ifndef THCGenerateAllTypes
+#undef THC_GENERIC_FILE
+#endif
diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu
new file mode 100644
index 0000000..023774e
--- /dev/null
+++ b/lib/THC/THCHalf.cu
@@ -0,0 +1,141 @@
+#include "THCHalf.h"
+#include "THCThrustAllocator.cuh"
+#include <thrust/transform.h>
+#include <thrust/execution_policy.h>
+
+struct __half2floatOp {
+  __device__ float operator()(half v) { return __half2float(v); }
+};
+
+struct __float2halfOp {
+  __device__ half operator()(float v) { return __float2half(v); }
+};
+
+void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len) {
+  THCThrustAllocator thrustAlloc(state);
+  thrust::transform(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+    thrust::device,
+#endif
+    in, in + len, out, __float2halfOp());
+}
+
+void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) {
+  THCThrustAllocator thrustAlloc(state);
+  thrust::transform(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#else
+    thrust::device,
+#endif
+    in, in + len, out, __half2floatOp());
+}
+
+// FIXME: we could call TH_half2float and convert types here,
+// but that might be slower.
+float THC_half2float(half h)
+{
+  unsigned sign = ((h.x >> 15) & 1);
+  unsigned exponent = ((h.x >> 10) & 0x1f);
+  unsigned mantissa = ((h.x & 0x3ff) << 13);
+
+  if (exponent == 0x1f) {  /* NaN or Inf */
+    mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
+    exponent = 0xff;
+  } else if (!exponent) {  /* Denorm or Zero */
+    if (mantissa) {
+      unsigned int msb;
+      exponent = 0x71;
+      do {
+        msb = (mantissa & 0x400000);
+        mantissa <<= 1;  /* normalize */
+        --exponent;
+      } while (!msb);
+      mantissa &= 0x7fffff;  /* 1.mantissa is implicit */
+    }
+  } else {
+    exponent += 0x70;
+  }
+
+  int temp = ((sign << 31) | (exponent << 23) | mantissa);
+
+  float x;
+  memcpy(&x,&temp,sizeof(float));
+  return x;
+}
+
+half THC_float2half(float f)
+{
+  half ret;
+
+  unsigned x;
+  memcpy(&x,&f,sizeof(f));
+  unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
+  unsigned sign, exponent, mantissa;
+
+  // Get rid of +NaN/-NaN case first.
+  if (u > 0x7f800000) {
+    ret.x = 0x7fffU;
+    return ret;
+  }
+
+  sign = ((x >> 16) & 0x8000);
+
+  // Get rid of +Inf/-Inf, +0/-0.
+  if (u > 0x477fefff) {
+    ret.x = sign | 0x7c00U;
+    return ret;
+  }
+  if (u < 0x33000001) {
+    ret.x = (sign | 0x0000);
+    return ret;
+  }
+
+  exponent = ((u >> 23) & 0xff);
+  mantissa = (u & 0x7fffff);
+
+  if (exponent > 0x70) {
+    shift = 13;
+    exponent -= 0x70;
+  } else {
+    shift = 0x7e - exponent;
+    exponent = 0;
+    mantissa |= 0x800000;
+  }
+  lsb = (1 << shift);
+  lsb_s1 = (lsb >> 1);
+  lsb_m1 = (lsb - 1);
+
+  // Round to nearest even.
+  remainder = (mantissa & lsb_m1);
+  mantissa >>= shift;
+  if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
+    ++mantissa;
+    if (!(mantissa & 0x3ff)) {
+      ++exponent;
+      mantissa = 0;
+    }
+  }
+
+  ret.x = (sign | (exponent << 10) | mantissa);
+  return ret;
+}
+
+THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) {
+  cudaDeviceProp* prop =
+    THCState_getCurrentDeviceProperties(state);
+
+  // CC 5.3+
+  return (prop->major > 5 ||
+          (prop->major == 5 && prop->minor == 3));
+}
+
+THC_EXTERNC int THC_fastHalfInstructions(THCState *state) {
+  cudaDeviceProp* prop =
+    THCState_getCurrentDeviceProperties(state);
+
+  // Check for CC 6.0 only (corresponds to P100)
+  return (prop->major == 6 && prop->minor == 0);
+}
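
The two host-side converters above follow the standard IEEE 754 binary16
layout (1 sign bit, 5 exponent bits with bias 15, 10 mantissa bits),
including denormal handling and round-to-nearest-even. A small host-only
sanity sketch of the round trip using well-known encodings; the helper is
illustrative, not part of THC, and assumes a toolkit where CUDA_HALF_TENSOR
ends up defined:

    #include <assert.h>
    #include "THCHalf.h"

    /* Illustrative host-only check of the bit layout used above. */
    static void check_half_roundtrip(void) {
      half one, onehalf;
      one.x     = 0x3C00;   /* 1.0 : sign 0, biased exponent 15, mantissa 0      */
      onehalf.x = 0x3E00;   /* 1.5 : mantissa 0b1000000000 -> implicit 1 + 0.5   */
      assert(THC_half2float(one)     == 1.0f);
      assert(THC_half2float(onehalf) == 1.5f);
      assert(THC_float2half(1.5f).x  == 0x3E00);
    }
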
diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
new file mode 100644
index 0000000..7c055e7
--- /dev/null
+++ b/lib/THC/THCHalf.h
@@ -0,0 +1,29 @@
+#ifndef THC_HALF_CONVERSION_INC
+#define THC_HALF_CONVERSION_INC
+
+#include "THCGeneral.h"
+
+/* We compile with CudaHalfTensor support if we have this: */
+#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
+#define CUDA_HALF_TENSOR 1
+#endif
+
+#ifdef CUDA_HALF_TENSOR
+
+#include <cuda_fp16.h>
+#include <stdint.h>
+
+THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len);
+THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len);
+THC_API half THC_float2half(float a);
+THC_API float THC_half2float(half a);
+
+/* Check for native fp16 support on the current device (CC 5.3+) */
+THC_API int THC_nativeHalfInstructions(THCState *state);
+
+/* Check for performant native fp16 support on the current device */
+THC_API int THC_fastHalfInstructions(THCState *state);
+
+#endif /* CUDA_HALF_TENSOR */
+
+#endif
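
A minimal usage sketch of this API: convert a device buffer of floats to
half and back on the current stream, then query whether the device has
native fp16 arithmetic. `state` and the device pointers are assumed to be
set up elsewhere; the function name is illustrative:

    #ifdef CUDA_HALF_TENSOR
    /* Illustrative only: d_src, d_tmp, d_dst are device buffers of length n. */
    static void roundTripHalf(THCState *state, float *d_src, half *d_tmp,
                              float *d_dst, ptrdiff_t n) {
      THCFloat2Half(state, d_tmp, d_src, n);   /* float -> half */
      THCHalf2Float(state, d_dst, d_tmp, n);   /* half  -> float */

      if (!THC_nativeHalfInstructions(state)) {
        /* Pre-CC-5.3 device: half is storage-only; math happens in float. */
      }
    }
    #endif
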
diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh
new file mode 100644
index 0000000..0944360
--- /dev/null
+++ b/lib/THC/THCNumerics.cuh
@@ -0,0 +1,631 @@
+#ifndef THC_NUMERICS_INC
+#define THC_NUMERICS_INC
+
+#include <cuda.h>
+#include <limits.h>
+#include "THCHalf.h"
+
+/// Class for numeric limits of the particular data type, which
+/// includes support for `half`.
+/// Unfortunately since `half` does not have a constructor, these have
+/// to be expressed as functions (either that or non-const statics).
+template <typename T>
+struct THCNumerics {
+};
+
+template <>
+struct THCNumerics<unsigned char> {
+  static inline __host__ __device__ unsigned char min() { return 0; }
+  static inline __host__ __device__ unsigned char max() { return UCHAR_MAX; }
+
+  static inline __host__ __device__ bool lt(unsigned char a, unsigned char b) { return a < b; }
+  static inline __host__ __device__ bool le(unsigned char a, unsigned char b) { return a <= b; }
+  static inline __host__ __device__ bool gt(unsigned char a, unsigned char b) { return a > b; }
+  static inline __host__ __device__ bool ge(unsigned char a, unsigned char b) { return a >= b; }
+  static inline __host__ __device__ bool eq(unsigned char a, unsigned char b) { return a == b; }
+  static inline __host__ __device__ bool ne(unsigned char a, unsigned char b) { return a != b; }
+
+  static inline __host__ __device__  unsigned char add(unsigned char a, unsigned char b) { return a + b; }
+  static inline __host__ __device__  unsigned char mul(unsigned char a, unsigned char b) { return a * b; }
+  static inline __host__ __device__  unsigned char sub(unsigned char a, unsigned char b) { return a - b; }
+  static inline __host__ __device__  unsigned char div(unsigned char a, unsigned char b) { return a / b; }
+  static inline __host__ __device__  unsigned char abs(unsigned char a) { return a; /* unsigned, so already non-negative */ }
+};
+
+template <>
+struct THCNumerics<char> {
+  static inline __host__ __device__ char min() { return CHAR_MIN; }
+  static inline __host__ __device__ char max() { return CHAR_MAX; }
+
+  static inline __host__ __device__ bool lt(char a, char b) { return a < b; }
+  static inline __host__ __device__ bool le(char a, char b) { return a <= b; }
+  static inline __host__ __device__ bool gt(char a, char b) { return a > b; }
+  static inline __host__ __device__ bool ge(char a, char b) { return a >= b; }
+  static inline __host__ __device__ bool eq(char a, char b) { return a == b; }
+  static inline __host__ __device__ bool ne(char a, char b) { return a != b; }
+
+  static inline __host__ __device__  char add(char a, char b) { return a + b; }
+  static inline __host__ __device__  char mul(char a, char b) { return a * b; }
+  static inline __host__ __device__  char sub(char a, char b) { return a - b; }
+  static inline __host__ __device__  char div(char a, char b) { return a / b; }
+  static inline __host__ __device__  char abs(char a) { return ::abs(a); }
+};
+
+template <>
+struct THCNumerics<short> {
+  static inline __host__ __device__ short min() { return SHRT_MIN; }
+  static inline __host__ __device__ short max() { return SHRT_MAX; }
+
+  static inline __host__ __device__ bool lt(short a, short b) { return a < b; }
+  static inline __host__ __device__ bool le(short a, short b) { return a <= b; }
+  static inline __host__ __device__ bool gt(short a, short b) { return a > b; }
+  static inline __host__ __device__ bool ge(short a, short b) { return a >= b; }
+  static inline __host__ __device__ bool eq(short a, short b) { return a == b; }
+  static inline __host__ __device__ bool ne(short a, short b) { return a != b; }
+
+  static inline __host__ __device__  short add(short a, short b) { return a + b; }
+  static inline __host__ __device__  short mul(short a, short b) { return a * b; }
+  static inline __host__ __device__  short sub(short a, short b) { return a - b; }
+  static inline __host__ __device__  short div(short a, short b) { return a / b; }
+  static inline __host__ __device__  short abs(short a) { return ::abs(a); }
+};
+
+template <>
+struct THCNumerics<int> {
+  static inline __host__ __device__ int min() { return INT_MIN; }
+  static inline __host__ __device__ int max() { return INT_MAX; }
+
+  static inline __host__ __device__ bool lt(int a, int b) { return a < b; }
+  static inline __host__ __device__ bool le(int a, int b) { return a <= b; }
+  static inline __host__ __device__ bool gt(int a, int b) { return a > b; }
+  static inline __host__ __device__ bool ge(int a, int b) { return a >= b; }
+  static inline __host__ __device__ bool eq(int a, int b) { return a == b; }
+  static inline __host__ __device__ bool ne(int a, int b) { return a != b; }
+
+  static inline __host__ __device__  int add(int a, int b) { return a + b; }
+  static inline __host__ __device__  int mul(int a, int b) { return a * b; }
+  static inline __host__ __device__  int sub(int a, int b) { return a - b; }
+  static inline __host__ __device__  int div(int a, int b) { return a / b; }
+  static inline __host__ __device__  int abs(int a) { return ::abs(a); }
+};
+
+template <>
+struct THCNumerics<long> {
+  static inline __host__ __device__ long min() { return LONG_MIN; }
+  static inline __host__ __device__ long max() { return LONG_MAX; }
+
+  static inline __host__ __device__ bool lt(long a, long b) { return a < b; }
+  static inline __host__ __device__ bool le(long a, long b) { return a <= b; }
+  static inline __host__ __device__ bool gt(long a, long b) { return a > b; }
+  static inline __host__ __device__ bool ge(long a, long b) { return a >= b; }
+  static inline __host__ __device__ bool eq(long a, long b) { return a == b; }
+  static inline __host__ __device__ bool ne(long a, long b) { return a != b; }
+
+  static inline __host__ __device__  long add(long a, long b) { return a + b; }
+  static inline __host__ __device__  long mul(long a, long b) { return a * b; }
+  static inline __host__ __device__  long sub(long a, long b) { return a - b; }
+  static inline __host__ __device__  long div(long a, long b) { return a / b; }
+  static inline __host__ __device__  long abs(long a) { return labs(a); }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct THCNumerics<half> {
+  static inline __host__ __device__ half min() { half h; h.x = 0xfbff; return h; }
+  static inline __host__ __device__ half max() { half h; h.x = 0x7bff; return h; }
+
+  static inline __host__ __device__ bool lt(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hlt(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return fa < fb;
+#endif
+#else // __CUDA_ARCH__
+    return THC_half2float(a) < THC_half2float(b);
+#endif
+  }
+
+  static inline __host__ __device__ bool le(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hle(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return fa <= fb;
+#endif
+#else // __CUDA_ARCH__
+    return THC_half2float(a) <= THC_half2float(b);
+#endif
+  }
+
+  static inline __host__ __device__ bool gt(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hgt(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return fa > fb;
+#endif
+#else // __CUDA_ARCH__
+    return THC_half2float(a) > THC_half2float(b);
+#endif
+  }
+
+  static inline __host__ __device__ bool ge(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hge(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return fa >= fb;
+#endif
+#else // __CUDA_ARCH__
+    return THC_half2float(a) >= THC_half2float(b);
+#endif
+  }
+
+  static inline __host__ __device__ bool eq(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __heq(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return fa == fb;
+#endif
+#else // __CUDA_ARCH__
+    return THC_half2float(a) == THC_half2float(b);
+#endif
+  }
+
+  static inline __host__ __device__ bool ne(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hne(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return fa != fb;
+#endif
+#else // __CUDA_ARCH__
+    return THC_half2float(a) != THC_half2float(b);
+#endif
+  }
+
+  static inline __host__ __device__ half exp(half a) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return hexp(a);
+#else
+    float fa = __half2float(a);
+    return __float2half(expf(fa));
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(expf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half log(half a) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return hlog(a);
+#else
+    float fa = __half2float(a);
+    return __float2half(logf(fa));
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(logf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half log1p(half a) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    return __float2half(log1pf(fa));
+#else // __CUDA_ARCH__
+    return THC_float2half(log1pf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half cos(half a) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return hcos(a);
+#else
+    float fa = __half2float(a);
+    return __float2half(cosf(fa));
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(cosf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half sin(half a) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return hsin(a);
+#else
+    float fa = __half2float(a);
+    return __float2half(sinf(fa));
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(sinf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half sqrt(half a) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return hsqrt(a);
+#else
+    float fa = __half2float(a);
+    return __float2half(sqrtf(fa));
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(sqrtf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half rsqrt(half a) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return hrsqrt(a);
+#else
+    float fa = __half2float(a);
+    return __float2half(rsqrtf(fa));
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(rsqrtf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half ceil(half a) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return hceil(a);
+#else
+    float fa = __half2float(a);
+    return __float2half(ceilf(fa));
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(ceilf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half floor(half a) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return hfloor(a);
+#else
+    float fa = __half2float(a);
+    return __float2half(floorf(fa));
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(floorf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half trunc(half a) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return htrunc(a);
+#else
+    float fa = __half2float(a);
+    return __float2half(truncf(fa));
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(truncf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half neg(half a) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hneg(a);
+#else
+    float fa = __half2float(a);
+    return __float2half(-fa);
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(-(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half acos(half a) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    return __float2half(acosf(fa));
+#else // __CUDA_ARCH__
+    return THC_float2half(acosf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half cosh(half a) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    return __float2half(coshf(fa));
+#else // __CUDA_ARCH__
+    return THC_float2half(coshf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half asin(half a) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    return __float2half(asinf(fa));
+#else // __CUDA_ARCH__
+    return THC_float2half(asinf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half sinh(half a) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    return __float2half(sinhf(fa));
+#else // __CUDA_ARCH__
+    return THC_float2half(sinhf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half tan(half a) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    return __float2half(tanf(fa));
+#else // __CUDA_ARCH__
+    return THC_float2half(tanf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half atan(half a) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    return __float2half(atanf(fa));
+#else // __CUDA_ARCH__
+    return THC_float2half(atanf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half tanh(half a) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    return __float2half(tanhf(fa));
+#else // __CUDA_ARCH__
+    return THC_float2half(tanhf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half abs(half a) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    return __float2half(fabs(fa));
+#else // __CUDA_ARCH__
+    return THC_float2half(fabs(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half round(half a) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    return __float2half(roundf(fa));
+#else // __CUDA_ARCH__
+    return THC_float2half(roundf(THC_half2float(a)));
+#endif
+  }
+
+  static inline __host__ __device__ half frac(half a) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    return __float2half(fa - truncf(fa));
+#else // __CUDA_ARCH__
+    float fa = THC_half2float(a);
+    return THC_float2half(fa - floorf(fa));
+#endif
+  }
+
+  static inline __host__ __device__ half cinv(half a) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    return __float2half(1.0f / fa);
+#else // __CUDA_ARCH__
+    return THC_float2half(1.0f / THC_half2float(a));
+#endif
+  }
+
+  static inline __host__ __device__ half add(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hadd(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half( fa + fb );
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(THC_half2float(a) + THC_half2float(b));
+#endif
+  }
+
+  static inline __host__ __device__ half div(half a, half b) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half( fa / fb );
+#else // __CUDA_ARCH__
+    return THC_float2half(THC_half2float(a) / THC_half2float(b));
+#endif
+  }
+
+  static inline __host__ __device__ half mul(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hmul(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half( fa * fb );
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(THC_half2float(a) * THC_half2float(b));
+#endif
+  }
+
+  static inline __host__ __device__ half sub(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hsub(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half( fa - fb );
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(THC_half2float(a) - THC_half2float(b));
+#endif
+  }
+
+  static inline __host__ __device__ half pow(half a, half b) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half(powf(fa, fb));
+#else // __CUDA_ARCH__
+    return THC_float2half(powf(THC_half2float(a), THC_half2float(b)));
+#endif
+  }
+
+};
+#endif
+
+template <>
+struct THCNumerics<float> {
+  static inline __host__ __device__ float min() { return -FLT_MAX; }
+  static inline __host__ __device__ float max() { return FLT_MAX; }
+
+  static inline __host__ __device__ bool lt(float a, float b) { return a < b; }
+  static inline __host__ __device__ bool le(float a, float b) { return a <= b; }
+  static inline __host__ __device__ bool gt(float a, float b) { return a > b; }
+  static inline __host__ __device__ bool ge(float a, float b) { return a >= b; }
+  static inline __host__ __device__ bool eq(float a, float b) { return a == b; }
+  static inline __host__ __device__ bool ne(float a, float b) { return a != b; }
+
+  static inline __host__ __device__  float exp  (float a) { return   expf(a); }
+  static inline __host__ __device__  float log  (float a) { return   logf(a); }
+  static inline __host__ __device__  float log1p(float a) { return log1pf(a); }
+  static inline __host__ __device__  float cos  (float a) { return   cosf(a); }
+  static inline __host__ __device__  float sin  (float a) { return   sinf(a); }
+  static inline __host__ __device__  float sqrt (float a) { return  sqrtf(a); }
+  static inline __host__ __device__  float rsqrt(float a) { return rsqrtf(a); }
+  static inline __host__ __device__  float ceil (float a) { return  ceilf(a); }
+  static inline __host__ __device__  float floor(float a) { return floorf(a); }
+  static inline __host__ __device__  float trunc(float a) { return truncf(a); }
+  static inline __host__ __device__  float neg  (float a) { return        -a; }
+  static inline __host__ __device__  float acos (float a) { return  acosf(a); }
+  static inline __host__ __device__  float cosh (float a) { return  coshf(a); }
+  static inline __host__ __device__  float acosh(float a) { return acoshf(a); }
+  static inline __host__ __device__  float asin (float a) { return  asinf(a); }
+  static inline __host__ __device__  float sinh (float a) { return  sinhf(a); }
+  static inline __host__ __device__  float asinh(float a) { return asinhf(a); }
+  static inline __host__ __device__  float tan  (float a) { return   tanf(a); }
+  static inline __host__ __device__  float atan (float a) { return  atanf(a); }
+  static inline __host__ __device__  float tanh (float a) { return  tanhf(a); }
+  static inline __host__ __device__  float abs  (float a) { return   fabs(a); }
+  static inline __host__ __device__  float round(float a) { return roundf(a); }
+  static inline __host__ __device__  float frac (float a) { return a - truncf(a); }
+  static inline __host__ __device__  float cinv (float a) { return 1.0f / a; }
+  static inline __host__ __device__  float add  (float a, float b) { return a + b; }
+  static inline __host__ __device__  float div  (float a, float b) { return a / b; }
+  static inline __host__ __device__  float mul  (float a, float b) { return a * b; }
+  static inline __host__ __device__  float sub  (float a, float b) { return a - b; }
+  static inline __host__ __device__  float pow  (float a, float b) { return powf(a, b); }
+};
+
+template <>
+struct THCNumerics<double> {
+  static inline __host__ __device__ double min() { return -DBL_MAX; }
+  static inline __host__ __device__ double max() { return DBL_MAX; }
+
+  static inline __host__ __device__ bool lt(double a, double b) { return a < b; }
+  static inline __host__ __device__ bool le(double a, double b) { return a <= b; }
+  static inline __host__ __device__ bool gt(double a, double b) { return a > b; }
+  static inline __host__ __device__ bool ge(double a, double b) { return a >= b; }
+  static inline __host__ __device__ bool eq(double a, double b) { return a == b; }
+  static inline __host__ __device__ bool ne(double a, double b) { return a != b; }
+
+  static inline __host__ __device__  double exp  (double a) { return   ::exp(a); }
+  static inline __host__ __device__  double log  (double a) { return   ::log(a); }
+  static inline __host__ __device__  double log1p(double a) { return ::log1p(a); }
+  static inline __host__ __device__  double cos  (double a) { return   ::cos(a); }
+  static inline __host__ __device__  double sin  (double a) { return   ::sin(a); }
+  static inline __host__ __device__  double sqrt (double a) { return  ::sqrt(a); }
+  static inline __host__ __device__  double rsqrt(double a) { return ::rsqrt(a); }
+  static inline __host__ __device__  double ceil (double a) { return  ::ceil(a); }
+  static inline __host__ __device__  double floor(double a) { return ::floor(a); }
+  static inline __host__ __device__  double trunc(double a) { return ::trunc(a); }
+  static inline __host__ __device__  double neg  (double a) { return       -a; }
+  static inline __host__ __device__  double acos (double a) { return  ::acos(a); }
+  static inline __host__ __device__  double cosh (double a) { return  ::cosh(a); }
+  static inline __host__ __device__  double acosh(double a) { return ::acosh(a); }
+  static inline __host__ __device__  double asin (double a) { return  ::asin(a); }
+  static inline __host__ __device__  double sinh (double a) { return  ::sinh(a); }
+  static inline __host__ __device__  double asinh(double a) { return ::asinh(a); }
+  static inline __host__ __device__  double tan  (double a) { return   ::tan(a); }
+  static inline __host__ __device__  double atan (double a) { return  ::atan(a); }
+  static inline __host__ __device__  double tanh (double a) { return  ::tanh(a); }
+  static inline __host__ __device__  double abs  (double a) { return   ::abs(a); }
+  static inline __host__ __device__  double round(double a) { return ::round(a); }
+  static inline __host__ __device__  double frac (double a) { return a - ::trunc(a); }
+  static inline __host__ __device__  double cinv (double a) { return 1.0 / a; }
+  static inline __host__ __device__  double add  (double a, double b) { return a + b; }
+  static inline __host__ __device__  double div  (double a, double b) { return a / b; }
+  static inline __host__ __device__  double mul  (double a, double b) { return a * b; }
+  static inline __host__ __device__  double sub  (double a, double b) { return a - b; }
+  static inline __host__ __device__  double pow  (double a, double b) { return ::pow(a, b); }
+};
+
+/// `half` has some type conversion issues associated with it, since it
+/// is a struct without a constructor/implicit conversion constructor.
+/// We use this to convert scalar values to the given type that the
+/// tensor expects.
+template <typename In, typename Out>
+struct ScalarConvert {
+  static __host__ __device__ Out to(const In v) { return (Out) v; }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <typename Out>
+struct ScalarConvert<half, Out> {
+  static __host__ __device__ Out to(const half v) {
+#ifdef __CUDA_ARCH__
+    return (Out) __half2float(v);
+#else
+    return (Out) THC_half2float(v);
+#endif
+  }
+};
+
+template <typename In>
+struct ScalarConvert<In, half> {
+  static __host__ __device__ half to(const In v) {
+#ifdef __CUDA_ARCH__
+    return __float2half((float) v);
+#else
+    return THC_float2half((float) v);
+#endif
+  }
+};
+
+template <>
+struct ScalarConvert<half, half> {
+  static __host__ __device__ half to(const half v) {
+    return v;
+  }
+};
+#endif
+
+#endif // THC_NUMERICS_INC
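
Other THC headers consume these traits to write one kernel body per
operation rather than one per element type. A sketch of the typical
pattern (the functor name is illustrative): arithmetic goes through
THCNumerics<T> so that T = half compiles even without native half
instructions, and host-provided scalars are routed through ScalarConvert:

    // Illustrative functor in the style used by THC's pointwise kernels.
    template <typename T>
    struct AddScaledOp {
      T alpha;
      __device__ __forceinline__ void operator()(T* out, T* in) {
        // *out += alpha * *in, expressed via the trait so it works for half too.
        *out = THCNumerics<T>::add(*out, THCNumerics<T>::mul(alpha, *in));
      }
    };

    // Host side: build the scalar from a double (e.g. a Lua number).
    //   AddScaledOp<half> op;
    //   op.alpha = ScalarConvert<double, half>::to(0.5);
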
diff --git a/lib/THC/THCReduce.cuh b/lib/THC/THCReduce.cuh
new file mode 100644
index 0000000..7f276a2
--- /dev/null
+++ b/lib/THC/THCReduce.cuh
@@ -0,0 +1,323 @@
+#ifndef THC_REDUCE_INC
+#define THC_REDUCE_INC
+
+//
+// This file contains dimension-reduction functions and kernels that
+// work on both contiguous and non-contiguous tensor arguments of
+// arbitrary dimension (up to MAX_CUTORCH_DIMS) without copying or
+// using temporary storage.
+//
+
+#include "THCTensorTypeUtils.cuh"
+#include "THCReduceApplyUtils.cuh"
+
+// Threads per thread block
+#define THC_NONCONTIG_REDUCE_BLOCK_SIZE (32 * 16)
+
+template <typename IndexType>
+__device__ __forceinline__ IndexType getReduceNoncontigDimSliceIndex() {
+  // Each thread handles one slice
+  return getLinearBlockId<IndexType>() * THC_NONCONTIG_REDUCE_BLOCK_SIZE + threadIdx.x;
+}
+
+// Kernel that handles an entire reduction of a slice of a tensor per each thread
+template <typename ModifyOp,
+          typename ReduceOp,
+          typename T,
+          typename IndexType,
+          int ADims, int BDims>
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(32 * 16, 4)
+#endif
+__global__ void
+kernelReduceNoncontigDim(TensorInfo<T, IndexType> out,
+                         TensorInfo<T, IndexType> in,
+                         IndexType reductionStride,
+                         IndexType reductionSize,
+                         IndexType totalSlices,
+                         T init,
+                         ModifyOp modifyOp,
+                         ReduceOp reduceOp) {
+  const IndexType sliceIndex = getReduceNoncontigDimSliceIndex<IndexType>();
+
+  if (sliceIndex >= totalSlices) {
+    return;
+  }
+
+  // Each thread picks a point in `out` and `in` for which it is
+  // producing the reduction
+  const IndexType outOffset =
+    IndexToOffset<T, IndexType, ADims>::get(sliceIndex, out);
+  const IndexType inBaseOffset =
+    IndexToOffset<T, IndexType, BDims>::get(sliceIndex, in);
+
+  // For each point in reductionSize, reduce into `r`
+  IndexType inOffset = inBaseOffset;
+  T r = init;
+
+  for (IndexType i = 0; i < reductionSize; ++i) {
+    r = reduceOp(r, modifyOp(in.data[inOffset]));
+    inOffset += reductionStride;
+  }
+
+  // Write out reduced value
+  out.data[outOffset] = r;
+}
+
+template <typename IndexType>
+__device__ __forceinline__ IndexType getReduceContigDimSliceIndex() {
+  // Each block handles one slice
+  return getLinearBlockId<IndexType>();
+}
+
+// Kernel that handles an entire reduction of a slice of a tensor per
+// each block
+template <typename ModifyOp,
+          typename ReduceOp,
+          typename T,
+          typename IndexType,
+          int ADims, int BDims>
+__global__ void
+kernelReduceContigDim(TensorInfo<T, IndexType> out,
+                      TensorInfo<T, IndexType> in,
+                      IndexType reductionSize,
+                      IndexType totalSlices,
+                      T init,
+                      ModifyOp modifyOp,
+                      ReduceOp reduceOp) {
+  const IndexType sliceIndex = getReduceContigDimSliceIndex<IndexType>();
+
+  if (sliceIndex >= totalSlices) {
+    return;
+  }
+
+  // Get the offset in `out` for the reduction
+  const IndexType outOffset =
+    IndexToOffset<T, IndexType, ADims>::get(sliceIndex, out);
+
+  // Get the base offset in `in` for this block's reduction
+  const IndexType inBaseOffset =
+    IndexToOffset<T, IndexType, BDims>::get(sliceIndex, in);
+
+  // Each thread in the block will reduce some subset of elements in
+  // the slice. The elements are guaranteed contiguous starting at
+  // `inBaseOffset`.
+  T r = init;
+  for (IndexType i = threadIdx.x; i < reductionSize; i += blockDim.x) {
+    r = reduceOp(r, modifyOp(in.data[inBaseOffset + i]));
+  }
+
+  // Reduce within the block
+  // FIXME: extern name
+  extern __shared__ char smemChar[];
+  T* smem = (T*) smemChar;
+  r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init);
+
+  if (threadIdx.x == 0) {
+    // Write out reduced value
+    out.data[outOffset] = r;
+  }
+}
+
+inline dim3 getNoncontigReduceBlock() {
+  return dim3(THC_NONCONTIG_REDUCE_BLOCK_SIZE);
+}
+
+inline dim3 getContigReduceBlock(ptrdiff_t numSlices, long reductionSize) {
+  // If the number of slices is low but the reduction dimension size
+  // is high, then we should increase block size for greater parallelism.
+  // Aim for at least 32 warps per SM (assume 15 SMs; don't bother
+  // inquiring the real number for now).
+  int maxWarps = 4; // better occupancy if many blocks are around
+  // For numSlices > 15 * 8, there are > 32 warps active per SM.
+  if (numSlices < 15 * 8) {
+    maxWarps = 8;
+    if (numSlices < 15 * 4) {
+      maxWarps = 16;
+      if (numSlices < 15 * 2) {
+        maxWarps = 32;
+      }
+    }
+  }
+
+  // Scale up block size based on the reduction dimension size
+  long warpsInReductionSize = THCCeilDiv(reductionSize, 32L);
+  int numWarps = warpsInReductionSize > (long) maxWarps ?
+    maxWarps : (int) warpsInReductionSize;
+
+  return dim3(numWarps * 32);
+}
+
+inline bool getNoncontigReduceGrid(ptrdiff_t elements, dim3& grid) {
+  // One output point per thread
+  return THC_getGridFromTiles(THCCeilDiv(elements,
+                                         (ptrdiff_t) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid);
+}
+
+inline bool getContigReduceGrid(ptrdiff_t elements, dim3& grid) {
+  // One output point per block
+  return THC_getGridFromTiles(elements, grid);
+}
+
+// Performs a reduction out[..., 0, ...] = reduce_i(modify(in[..., i, ...])) for
+// all in where i and the out's 0 are indexed at dimension `dim`
+template <typename TensorType, typename ModifyOp, typename ReduceOp>
+bool THC_reduceDim(THCState* state,
+                   TensorType* out,
+                   TensorType* in,
+                   const ModifyOp& modifyOp,
+                   const ReduceOp& reduceOp,
+                   typename TensorUtils<TensorType>::DataType init,
+                   int dim) {
+  ptrdiff_t inElements = TensorUtils<TensorType>::getNumElements(state, in);
+
+  long reductionSize = TensorUtils<TensorType>::getSize(state, in, dim);
+  long reductionStride = TensorUtils<TensorType>::getStride(state, in, dim);
+  ptrdiff_t outElements = inElements / reductionSize;
+
+  if (TensorUtils<TensorType>::getDims(state, out) > MAX_CUTORCH_DIMS ||
+      TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) {
+    return false;
+  }
+
+  if (TensorUtils<TensorType>::getDims(state, in) == 0) {
+    // Zero-dim tensor; do nothing
+    return true;
+  }
+
+  // Is the reduction dimension contiguous? If so, then we can use a
+  // shared memory reduction kernel to increase performance.
+  bool contigReduction = (reductionStride == 1);
+
+  dim3 block;
+  dim3 grid;
+  int smemSize = 0; // contiguous reduction uses smem
+  if (contigReduction) {
+    if (!getContigReduceGrid(outElements, grid)) {
+      return false;
+    }
+
+    block = getContigReduceBlock(outElements, reductionSize);
+    smemSize = sizeof(typename TensorUtils<TensorType>::DataType) * block.x;
+  } else {
+    if (!getNoncontigReduceGrid(outElements, grid)) {
+      return false;
+    }
+
+    block = getNoncontigReduceBlock();
+  }
+
+  // Resize out to correspond to the reduced size
+  THLongStorage* sizes = TensorUtils<TensorType>::newSizeOf(state, in);
+  THLongStorage_set(sizes, dim, 1);
+  TensorUtils<TensorType>::resize(state, out, sizes, NULL);
+  THLongStorage_free(sizes);
+
+  // It is possible that the tensor dimensions can be collapsed, and
+  // thus we can reduce the actual code complexity of the kernel by
+  // exploiting this knowledge statically, since the div/mod is the
+  // most expensive part of the operation, more so than memory accesses.
+  // For instance, when copying a non-contiguous to a contiguous tensor
+  // (or vice versa), the contiguous tensor can be collapsed to one
+  // dimension, and the loop to translate the linear index to the array
+  // index can be similarly collapsed. That is what this unrolling is for.
+#define HANDLE_CASE(TYPE, OUT, IN)                                      \
+  if (contigReduction) {                                                \
+    kernelReduceContigDim<ModifyOp, ReduceOp,                           \
+                          typename TensorUtils<TensorType>::DataType,   \
+                          TYPE, OUT, IN>                                \
+      <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(    \
+        outInfo, inInfo, reductionSize,                                 \
+        (TYPE) outElements, init, modifyOp, reduceOp);                  \
+  } else {                                                              \
+    kernelReduceNoncontigDim<ModifyOp, ReduceOp,                        \
+                             typename TensorUtils<TensorType>::DataType, \
+                             TYPE, OUT, IN>                             \
+      <<<grid, block, 0, THCState_getCurrentStream(state)>>>(           \
+        outInfo, inInfo, reductionStride, reductionSize,                \
+        (TYPE) outElements, init, modifyOp, reduceOp);                  \
+  }                                                                     \
+
+#define HANDLE_IN_CASE(TYPE, OUT, IN)                     \
+  {                                                       \
+    if (inInfo.isContiguous()) {                          \
+      HANDLE_CASE(TYPE, OUT, -2);                         \
+    } else {                                              \
+      switch (IN) {                                       \
+        case 1:                                           \
+          HANDLE_CASE(TYPE, OUT, 1);                      \
+          break;                                          \
+        case 2:                                           \
+          HANDLE_CASE(TYPE, OUT, 2);                      \
+          break;                                          \
+        default:                                          \
+          HANDLE_CASE(TYPE, OUT, -1);                     \
+          break;                                          \
+      }                                                   \
+    }                                                     \
+  }
+
+#define HANDLE_OUT_CASE(TYPE, OUT, IN)                 \
+  {                                                    \
+    if (outInfo.isContiguous()) {                      \
+      HANDLE_IN_CASE(TYPE, -2, IN);                    \
+    } else {                                           \
+      switch (OUT) {                                   \
+        case 1:                                        \
+          HANDLE_IN_CASE(TYPE, 1, IN);                 \
+          break;                                       \
+        case 2:                                        \
+          HANDLE_IN_CASE(TYPE, 2, IN);                 \
+          break;                                       \
+        default:                                       \
+          HANDLE_IN_CASE(TYPE, -1, IN);                \
+          break;                                       \
+      }                                                \
+    }                                                  \
+  }
+
+  if (TensorUtils<TensorType>::canUse32BitIndexMath(state, out) &&
+      TensorUtils<TensorType>::canUse32BitIndexMath(state, in)) {
+    TensorInfo<typename TensorUtils<TensorType>::DataType,
+               unsigned int> outInfo =
+      getTensorInfo<TensorType, unsigned int>(state, out);
+    outInfo.collapseDims();
+
+    TensorInfo<typename TensorUtils<TensorType>::DataType,
+               unsigned int> inInfo =
+      getTensorInfo<TensorType, unsigned int>(state, in);
+    inInfo.reduceDim(dim);
+    inInfo.collapseDims();
+
+    HANDLE_OUT_CASE(unsigned int, outInfo.dims, inInfo.dims);
+  } else {
+    TensorInfo<typename TensorUtils<TensorType>::DataType,
+               unsigned long> outInfo =
+      getTensorInfo<TensorType, unsigned long>(state, out);
+    outInfo.collapseDims();
+
+    TensorInfo<typename TensorUtils<TensorType>::DataType,
+               unsigned long> inInfo =
+      getTensorInfo<TensorType, unsigned long>(state, in);
+    inInfo.reduceDim(dim);
+    inInfo.collapseDims();
+
+    // For large tensors, we only compile the completely contiguous
+    // version and the completely generic version, to reduce
+    // compilation time.
+    if (outInfo.isContiguous() && inInfo.isContiguous()) {
+      HANDLE_CASE(unsigned long, -2, -2);
+    } else {
+      HANDLE_CASE(unsigned long, -1, -1);
+    }
+  }
+#undef HANDLE_CASE
+#undef HANDLE_IN_CASE
+#undef HANDLE_OUT_CASE
+
+  return true;
+}
+
+#undef THC_NONCONTIG_REDUCE_BLOCK_SIZE
+
+#endif // THC_REDUCE_INC
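
For reference, the contract stated above -- out[..., 0, ...] =
reduce_i(modify(in[..., i, ...])) along dimension `dim` -- is what the
following serial loop computes for a contiguous row-major matrix when
reducing dimension 1 with an identity modify op and addition as the
reduce op (purely illustrative, not part of THC):

    // Serial illustration of THC_reduceDim semantics for a 2-D case.
    static void reduceDim1Serial(const float* in, float* out,
                                 int nRows, int nCols, float init) {
      for (int r = 0; r < nRows; ++r) {
        float acc = init;                       // init, e.g. 0 for a sum
        for (int c = 0; c < nCols; ++c) {
          acc = acc + in[r * nCols + c];        // reduceOp(acc, modifyOp(x))
        }
        out[r] = acc;                           // out has size 1 along dim 1
      }
    }
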
diff --git a/lib/THC/THCReduceAll.cuh b/lib/THC/THCReduceAll.cuh
new file mode 100644
index 0000000..9a335c7
--- /dev/null
+++ b/lib/THC/THCReduceAll.cuh
@@ -0,0 +1,347 @@
+#ifndef THC_REDUCEALL_INC
+#define THC_REDUCEALL_INC
+
+//
+// This file contains reduction functions and kernels that work on
+// both contiguous and non-contiguous tensor arguments of arbitrary
+// dimension (up to MAX_CUTORCH_DIMS) without copying or using
+// temporary storage, reducing an entire tensor down to a single
+// value.
+//
+
+#include "THCReduceApplyUtils.cuh"
+
+// Size per each reduction block
+#define THC_REDUCE_ALL_BLOCK_SIZE 1024L
+
+// Cutoff size for two-pass reduction
+#define THC_TWO_PASS_REDUCTION_SIZE 2048L
+
+// Kernel that handles an entire reduction of a tensor in one pass
+template <typename ModifyOp,
+          typename ReduceOp,
+          typename ReduceAccOp,
+          typename InT,
+          typename AccT,
+          typename IndexType,
+          int ADims>
+__global__ void
+kernelReduceAll(TensorInfo<InT, IndexType> in,
+                IndexType totalElements,
+                AccT init,
+                ModifyOp modifyOp,
+                ReduceOp reduceOp,
+                ReduceAccOp reduceAccOp,
+                AccT* out) {
+  // With a block-wide stride, have each thread perform its own reduction.
+  AccT r = init;
+  for (IndexType i = threadIdx.x; i < totalElements; i += blockDim.x) {
+    const IndexType inOffset = IndexToOffset<InT, IndexType, ADims>::get(i, in);
+    r = reduceOp(r, modifyOp(in.data[inOffset]));
+  }
+
+  // Reduce within the block
+  extern __shared__ char smemChar[];
+  AccT* smem = (AccT*) smemChar;
+  r = reduceBlock<AccT, ReduceAccOp>(smem, blockDim.x, r, reduceAccOp, init);
+
+  if (threadIdx.x == 0) {
+    // Write out reduced value
+    *out = r;
+  }
+}
+
+template <typename IndexType>
+__device__ __forceinline__ IndexType getStartIndex(IndexType totalSize) {
+  IndexType sizePerBlock = THCCeilDiv(totalSize, (IndexType) gridDim.x);
+  return blockIdx.x * sizePerBlock;
+}
+
+template <typename IndexType>
+__device__ __forceinline__ IndexType getEndIndex(IndexType totalSize) {
+  IndexType sizePerBlock = THCCeilDiv(totalSize, (IndexType) gridDim.x);
+  return min((IndexType) ((blockIdx.x + 1) * sizePerBlock), totalSize);
+}
+
+// Kernel that handles an entire reduction of a tensor in two passes
+template <typename ModifyOp,
+          typename ReduceOp,
+          typename ReduceAccOp,
+          typename InT,
+          typename AccT,
+          typename IndexType,
+          int ADims>
+__global__ void
+kernelReduceAllPass1(TensorInfo<InT, IndexType> in,
+                     IndexType totalElements,
+                     AccT init,
+                     ModifyOp modifyOp,
+                     ReduceOp reduceOp,
+                     ReduceAccOp reduceAccOp,
+                     AccT* scratchSpace) {
+  const IndexType startIndex = getStartIndex<IndexType>(totalElements);
+  const IndexType endIndex = getEndIndex<IndexType>(totalElements);
+
+  // With a block-wide stride, have each thread perform its own reduction.
+  AccT r = init;
+  for (IndexType i = startIndex + threadIdx.x; i < endIndex; i += blockDim.x) {
+    const IndexType inOffset = IndexToOffset<InT, IndexType, ADims>::get(i, in);
+    r = reduceOp(r, modifyOp(in.data[inOffset]));
+  }
+
+  // Reduce within the block
+  extern __shared__ char smemChar[];
+  AccT* smem = (AccT*) smemChar;
+  r = reduceBlock<AccT, ReduceAccOp>(smem, blockDim.x, r, reduceAccOp, init);
+
+  if (threadIdx.x == 0) {
+    // Write out block-wide reduced value
+    scratchSpace[blockIdx.x] = r;
+  }
+}
+
+template <typename ReduceOp, typename T, typename IndexType>
+__global__ void
+kernelReduceAllPass2(int numPass1Blocks,
+                     T init,
+                     ReduceOp reduceOp,
+                     T* scratchSpace,
+                     T* out) {
+  T r = init;
+  if (threadIdx.x < numPass1Blocks) {
+    r = scratchSpace[threadIdx.x];
+  }
+
+  // Reduce within the block
+  extern __shared__ char smemChar[];
+  T* smem = (T*) smemChar;
+  r = reduceBlock<T, ReduceOp>(smem, numPass1Blocks, r, reduceOp, init);
+
+  if (threadIdx.x == 0) {
+    *out = r;
+  }
+}
+
+// Perform a two-pass reduction if the tensor is large enough to
+// warrant it.
+inline bool isTwoPassReductionSize(ptrdiff_t elements) {
+  return (elements > THC_TWO_PASS_REDUCTION_SIZE);
+}
+
+template <typename InT, typename AccT>
+inline ptrdiff_t getTwoPassBlocks(THCState* state, ptrdiff_t elements) {
+  ptrdiff_t numBlocks = THCCeilDiv(elements, (ptrdiff_t)THC_REDUCE_ALL_BLOCK_SIZE);
+
+  // We can only have as many blocks as there is scratch space
+  ptrdiff_t scratchSpace =
+    THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(AccT);
+  THAssert(scratchSpace > 0);
+
+  // Limit to 1024 due to dimensionality constraint
+  if (scratchSpace > 1024) {
+    scratchSpace = 1024;
+  }
+
+  if (numBlocks > scratchSpace) {
+    numBlocks = scratchSpace;
+  }
+
+  return numBlocks;
+}
+
+// Get the block/grid size that we want
+template <typename InT, typename AccT>
+inline void getPass1ReduceBlockGrid(THCState* state, ptrdiff_t elements,
+                                    dim3& grid, dim3& block) {
+  grid = dim3(getTwoPassBlocks<InT, AccT>(state, elements));
+  block = dim3(THC_REDUCE_ALL_BLOCK_SIZE);
+}
+
+template <typename InT, typename AccT>
+inline void getPass2ReduceBlockGrid(THCState* state, ptrdiff_t elements,
+                                    dim3& grid, dim3& block) {
+  grid = dim3(1);
+  // We only need as many threads as there were blocks originally
+  block = dim3(getTwoPassBlocks<InT, AccT>(state, elements));
+}
+
+template <typename InT, typename AccT>
+inline void getSinglePassReduceBlockGrid(ptrdiff_t elements,
+                                         dim3& grid, dim3& block) {
+  grid = dim3(1);
+  block = dim3(THC_REDUCE_ALL_BLOCK_SIZE);
+}
+
+template <typename ModifyOp,
+          typename ReduceOp,
+          typename ReduceAccOp,
+          typename InT,
+          typename AccT,
+          typename IndexType,
+          int ADims>
+void callReduceAll(THCState* state,
+                   const TensorInfo<InT, IndexType>& in,
+                   ptrdiff_t totalElements,
+                   AccT init,
+                   const ModifyOp& modifyOp,
+                   const ReduceOp& reduceOp,
+                   const ReduceAccOp& reduceAccOp,
+                   AccT* devOut) {
+  dim3 grid;
+  dim3 block;
+
+  if (isTwoPassReductionSize(totalElements)) {
+    bool freeScratchSpace = false;
+    void* scratchSpace = THCState_getCurrentDeviceScratchSpace(state);
+    if (!scratchSpace) {
+      THCudaCheck(THCudaMalloc(state, &scratchSpace,
+          THCState_getCurrentDeviceScratchSpaceSize(state)));
+      freeScratchSpace = true;
+    }
+
+    getPass1ReduceBlockGrid<InT, AccT>(state, totalElements, grid, block);
+    size_t smemSize = block.x * sizeof(AccT);
+
+    kernelReduceAllPass1<ModifyOp, ReduceOp, ReduceAccOp, InT, AccT, IndexType, ADims>
+      <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(
+        in, (IndexType) totalElements, init, modifyOp, reduceOp, reduceAccOp,
+        (AccT*) scratchSpace);
+
+    int numPass1Blocks = grid.x;
+    getPass2ReduceBlockGrid<InT, AccT>(state, totalElements, grid, block);
+    smemSize = block.x * sizeof(AccT);
+
+    kernelReduceAllPass2<ReduceAccOp, AccT, IndexType>
+      <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(
+        numPass1Blocks, init, reduceAccOp,
+        (AccT*) scratchSpace, devOut);
+
+    if (freeScratchSpace) {
+      THCudaCheck(THCudaFree(state, scratchSpace));
+    }
+  } else {
+    getSinglePassReduceBlockGrid<InT, AccT>(totalElements, grid, block);
+    size_t smemSize = block.x * sizeof(AccT);
+
+    kernelReduceAll<ModifyOp, ReduceOp, ReduceAccOp, InT, AccT, IndexType, ADims>
+      <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(
+        in, (IndexType) totalElements, init, modifyOp, reduceOp, reduceAccOp, devOut);
+  }
+}
+
+// Reduces the entire tensor to one value. `out` points to
+// host-resident memory.
+template <typename TensorType,
+          typename ModifyOp,
+          typename ReduceOp,
+          typename ReduceAccOp,
+          typename AccT>
+bool THC_reduceAll(THCState* state,
+                   TensorType* in,
+                   const ModifyOp& modifyOp,
+                   const ReduceOp& reduceOp,
+                   const ReduceAccOp& reduceAccOp,
+                   AccT init,
+                   AccT* out,
+                   int outOnDevice) {
+  ptrdiff_t inElements = TensorUtils<TensorType>::getNumElements(state, in);
+
+  if (TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) {
+    return false;
+  }
+
+  if (TensorUtils<TensorType>::getDims(state, in) == 0) {
+    // Zero-dim tensor; do nothing
+    *out = init;
+    return true;
+  }
+
+  bool freeDevOut = false;
+  AccT* devOut = out;
+  if (!outOnDevice) {
+    // Use the stream-specific scratch space for the reduction kernel
+    // to write out its value
+    devOut = (AccT*) THCState_getCurrentDeviceScratchSpace(state);
+    if (!devOut) {
+      THCudaCheck(THCudaMalloc(state, (void**)&devOut,
+          THCState_getCurrentDeviceScratchSpaceSize(state)));
+      freeDevOut = true;
+    }
+  }
+
+  // It is possible that the tensor dimensions can be collapsed, and
+  // thus we can reduce the actual code complexity of the kernel by
+  // exploiting this knowledge statically, since the div/mod is the
+  // most expensive part of the operation, more so than memory accesses.
+  // For instance, when copying a non-contiguous to a contiguous tensor
+  // (or vice versa), the contiguous tensor can be collapsed to one
+  // dimension, and the loop to translate the linear index to the array
+  // index can be similarly collapsed. That is what this unrolling is for.
+#define HANDLE_CASE(TYPE, IN)                                           \
+  callReduceAll<ModifyOp, ReduceOp, ReduceAccOp,                        \
+                typename TensorUtils<TensorType>::DataType,             \
+                AccT,                                                   \
+                TYPE, IN>(                                              \
+                  state, inInfo, inElements, init, modifyOp,            \
+                  reduceOp, reduceAccOp, devOut);
+
+#define HANDLE_IN_CASE(TYPE, IN)                    \
+  {                                                 \
+    if (inInfo.isContiguous()) {                    \
+      HANDLE_CASE(TYPE, -2);                        \
+    } else {                                        \
+      switch (IN) {                                 \
+        case 1:                                     \
+          HANDLE_CASE(TYPE, 1);                     \
+          break;                                    \
+        case 2:                                     \
+          HANDLE_CASE(TYPE, 2);                     \
+          break;                                    \
+        default:                                    \
+          HANDLE_CASE(TYPE, -1);                    \
+          break;                                    \
+      }                                             \
+    }                                               \
+  }
+
+  if (TensorUtils<TensorType>::canUse32BitIndexMath(state, in)) {
+    TensorInfo<typename TensorUtils<TensorType>::DataType, unsigned int> inInfo =
+      getTensorInfo<TensorType, unsigned int>(state, in);
+    inInfo.collapseDims();
+
+    HANDLE_IN_CASE(unsigned int, inInfo.dims);
+  } else {
+    TensorInfo<typename TensorUtils<TensorType>::DataType,
+               unsigned long long> inInfo =
+      getTensorInfo<TensorType, unsigned long long>(state, in);
+    inInfo.collapseDims();
+
+    // For large tensors, we only compile the completely contiguous
+    // version and the completely generic version, to reduce
+    // compilation time.
+    if (inInfo.isContiguous()) {
+      HANDLE_IN_CASE(unsigned long long, -2);
+    } else {
+      HANDLE_IN_CASE(unsigned long long, -1);
+    }
+  }
+#undef HANDLE_CASE
+#undef HANDLE_IN_CASE
+
+  // If our destination is not on the device, copy the value back to
+  // the host (synchronous!)
+  if (!outOnDevice) {
+    cudaMemcpy(out, devOut, sizeof(AccT), cudaMemcpyDeviceToHost);
+  }
+
+  if (freeDevOut) {
+    THCudaCheck(THCudaFree(state, devOut));
+  }
+
+  return true;
+}
+
+#undef THC_REDUCE_ALL_BLOCK_SIZE
+#undef THC_TWO_PASS_REDUCTION_SIZE
+
+#endif // THC_REDUCEALL_INC
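
The overall shape of the two-pass path above, stripped of the CUDA
details: pass 1 writes one partial result per block into scratch space,
and pass 2 reduces those partials with a single block. A serial sketch
(the block count and buffer size are illustrative):

    // Serial illustration of the two-pass full reduction (sum as the op).
    static float reduceAllTwoPassSerial(const float* in, ptrdiff_t n, float init) {
      enum { kBlocks = 64 };                    // stand-in for getTwoPassBlocks()
      float partial[kBlocks];
      ptrdiff_t per = (n + kBlocks - 1) / kBlocks;

      for (int b = 0; b < kBlocks; ++b) {       // "pass 1": one partial per block
        float acc = init;
        for (ptrdiff_t i = b * per; i < n && i < (b + 1) * per; ++i)
          acc = acc + in[i];
        partial[b] = acc;
      }

      float result = init;                      // "pass 2": reduce the partials
      for (int b = 0; b < kBlocks; ++b)
        result = result + partial[b];
      return result;
    }
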
diff --git a/lib/THC/THCReduceApplyUtils.cu b/lib/THC/THCReduceApplyUtils.cu
new file mode 100644
index 0000000..6d4c06e
--- /dev/null
+++ b/lib/THC/THCReduceApplyUtils.cu
@@ -0,0 +1,35 @@
+#include "THCReduceApplyUtils.cuh"
+
+#include <assert.h>
+#include <stdlib.h>
+
+// Maximum size per grid dimension that we assume (compute capability >= 2.0)
+#define MAX_GRID_SIZE 65535LL
+
+void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg) {
+  long dims = THCudaTensor_nDimension(state, tensor);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, arg, CUTORCH_DIM_WARNING);
+}
+
+bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid) {
+  if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) {
+    return false;
+  }
+
+  long gridX = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles;
+  long gridY = 1;
+  long gridZ = 1;
+
+  if (gridTiles > MAX_GRID_SIZE) {
+    gridTiles = THCCeilDiv(gridTiles, (ptrdiff_t) MAX_GRID_SIZE);
+    gridY = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles;
+
+    if (gridTiles > MAX_GRID_SIZE) {
+      gridTiles = THCCeilDiv(gridTiles, (ptrdiff_t) MAX_GRID_SIZE);
+      gridZ = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles;
+    }
+  }
+
+  grid = dim3(gridX, gridY, gridZ);
+  return true;
+}
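
A quick worked example of the mapping above, since each grid dimension is
limited to 65535 blocks on these devices: asking for 100000 one-block tiles
overflows gridDim.x alone, so the helper spills into y (and, if needed, z),
and kernels recover a flat block index with getLinearBlockId():

    dim3 grid;
    bool ok = THC_getGridFromTiles((ptrdiff_t) 100000, grid);
    // ok == true; grid = (65535, 2, 1), i.e. 131070 blocks covering 100000 tiles;
    // each kernel bounds-checks its linear block id against the real tile count.
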
diff --git a/lib/THC/THCReduceApplyUtils.cuh b/lib/THC/THCReduceApplyUtils.cuh
new file mode 100644
index 0000000..e365b3a
--- /dev/null
+++ b/lib/THC/THCReduceApplyUtils.cuh
@@ -0,0 +1,81 @@
+#ifndef THC_REDUCE_APPLY_UTILS_INC
+#define THC_REDUCE_APPLY_UTILS_INC
+
+#include <cuda.h>
+#include <assert.h>
+#include "THCGeneral.h"
+#include "THCTensor.h"
+#include "THCDeviceUtils.cuh"
+#include "THCTensorInfo.cuh"
+
+// Enum that indicates whether tensor arguments are read/write or
+// read-only
+enum TensorArgType { ReadWrite, ReadOnly };
+
+template <typename IndexType>
+__device__ __forceinline__ IndexType getLinearBlockId() {
+  return blockIdx.z * gridDim.y * gridDim.x +
+    blockIdx.y * gridDim.x +
+    blockIdx.x;
+}
+
+// Block-wide reduction in shared memory helper; only threadIdx.x == 0 will
+// return the reduced value
+template <typename T, typename ReduceOp>
+__device__ T reduceBlock(T* smem,
+                         int numVals,
+                         T threadVal,
+                         ReduceOp reduceOp,
+                         T init) {
+  if (numVals == 0) {
+    return init;
+  }
+
+  if (threadIdx.x < numVals) {
+    smem[threadIdx.x] = threadVal;
+  }
+
+  // First warp will perform reductions across warps
+  __syncthreads();
+  if ((threadIdx.x / warpSize) == 0) {
+    T r = threadIdx.x < numVals ? smem[threadIdx.x] : init;
+
+    for (int i = warpSize + threadIdx.x; i < numVals; i += warpSize) {
+      r = reduceOp(r, smem[i]);
+    }
+
+    smem[threadIdx.x] = r;
+  }
+
+  // First thread will perform reductions across the block
+  __syncthreads();
+
+  T r = init;
+  if (threadIdx.x == 0) {
+    r = smem[0];
+
+    int numLanesParticipating = min(numVals, warpSize);
+
+    if (numLanesParticipating == 32) {
+      // Unroll for warpSize == 32 and numVals >= 32
+#pragma unroll
+      for (int i = 1; i < 32; ++i) {
+        r = reduceOp(r, smem[i]);
+      }
+    } else {
+      for (int i = 1; i < numLanesParticipating; ++i) {
+        r = reduceOp(r, smem[i]);
+      }
+    }
+  }
+
+  return r;
+}
+
+// Make sure the given tensor doesn't have too many dimensions
+void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg);
+
+// Produces a grid with at least one point per tile
+THC_API bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid);
+
+#endif // THC_REDUCE_APPLY_UTILS_INC
diff --git a/lib/THC/THCScanUtils.cuh b/lib/THC/THCScanUtils.cuh
new file mode 100644
index 0000000..41a4423
--- /dev/null
+++ b/lib/THC/THCScanUtils.cuh
@@ -0,0 +1,116 @@
+#ifndef THC_SCAN_UTILS_INC
+#define THC_SCAN_UTILS_INC
+
+#include "THCAsmUtils.cuh"
+
+// Collection of in-kernel scan / prefix sum utilities
+
+// Inclusive prefix sum using shared memory
+template <typename T, bool KillWARDependency>
+__device__ void inclusivePrefixSum(T* smem, T in, T* out) {
+  // FIXME: this is a slow, simple implementation; need up/down sweep,
+  // prevent smem conflicts
+  smem[threadIdx.x] = in;
+
+  __syncthreads();
+
+  for (int offset = 1; offset < blockDim.x; offset *= 2) {
+    T val = 0;
+
+    if (threadIdx.x >= offset) {
+      val = smem[threadIdx.x - offset] + smem[threadIdx.x];
+    }
+
+    __syncthreads();
+    if (threadIdx.x >= offset) {
+      smem[threadIdx.x] = val;
+    }
+
+    __syncthreads();
+  }
+
+  *out = smem[threadIdx.x];
+
+  // Prevent write-after-read dependencies on smem usage above if necessary
+  if (KillWARDependency) {
+    __syncthreads();
+  }
+}
+
+// Exclusive prefix sum using shared memory
+template <typename T, bool KillWARDependency>
+__device__ void exclusivePrefixSum(T* smem, T in, T* out, T* carry) {
+  // FIXME: naive implementation
+  // We kill write-after-read dependencies separately below, hence the `false`
+  inclusivePrefixSum<T, false>(smem, in, out);
+
+  *out -= in;
+  *carry = smem[blockDim.x - 1];
+
+  // Prevent write-after-read dependencies on smem usage above if necessary
+  if (KillWARDependency) {
+    __syncthreads();
+  }
+}
+
+// Inclusive prefix sum for binary vars using intra-warp voting +
+// shared memory
+template <typename T, bool KillWARDependency>
+__device__ void inclusiveBinaryPrefixSum(T* smem, bool in, T* out) {
+  // Within-warp, we use warp voting.
+  T vote = __ballot(in);
+  T index = __popc(getLaneMaskLe() & vote);
+  T carry = __popc(vote);
+
+  int warp = threadIdx.x / 32;
+
+  // For each warp, write out one value
+  if (getLaneId() == 0) {
+    smem[warp] = carry;
+  }
+
+  __syncthreads();
+
+  // Sum across warps in one thread. This appears to be faster than a
+  // warp shuffle scan for CC 3.0+
+  if (threadIdx.x == 0) {
+    int current = 0;
+    for (int i = 0; i < blockDim.x / 32; ++i) {
+      T v = smem[i];
+      smem[i] += current;
+      current += v;
+    }
+  }
+
+  __syncthreads();
+
+  // load the carry from the preceding warp
+  if (warp >= 1) {
+    index += smem[warp - 1];
+  }
+
+  *out = index;
+
+  if (KillWARDependency) {
+    __syncthreads();
+  }
+}
+
+// Exclusive prefix sum for binary vars using intra-warp voting +
+// shared memory
+template <typename T, bool KillWARDependency>
+__device__ void exclusiveBinaryPrefixSum(T* smem, bool in, T* out, T* carry) {
+  inclusiveBinaryPrefixSum<T, false>(smem, in, out);
+
+  // Inclusive to exclusive
+  *out -= (T) in;
+
+  // The outgoing carry for all threads is the last warp's sum
+  *carry = smem[(blockDim.x / 32) - 1];
+
+  if (KillWARDependency) {
+    __syncthreads();
+  }
+}
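+// Illustrative use (sketch, not part of this header): the binary prefix sums
+// above are a building block for in-kernel stream compaction. Assuming a
+// shared buffer `smem` with at least blockDim.x / 32 entries of T and a
+// per-thread predicate `keep`:
+//
+//   T idx, total;
+//   exclusiveBinaryPrefixSum<T, true>(smem, keep, &idx, &total);
+//   if (keep) {
+//     out[blockOffset + idx] = val;   // idx = this thread's slot among kept values
+//   }
+//   // total = number of kept values in the block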
+
+#endif // THC_SCAN_UTILS_INC
diff --git a/lib/THC/THCSleep.cu b/lib/THC/THCSleep.cu
new file mode 100644
index 0000000..026f333
--- /dev/null
+++ b/lib/THC/THCSleep.cu
@@ -0,0 +1,21 @@
+#include "THCSleep.h"
+
+
+__global__ void spin_kernel(long long cycles)
+{
+  // see the concurrentKernels CUDA sample
+  long long start_clock = clock64();
+  long long clock_offset = 0;
+  while (clock_offset < cycles)
+  {
+    clock_offset = clock64() - start_clock;
+  }
+}
+
+THC_API void THC_sleep(THCState* state, long long cycles)
+{
+  dim3 grid(1);
+  dim3 block(1);
+  spin_kernel<<<grid, block, 0, THCState_getCurrentStream(state)>>>(cycles);
+  THCudaCheck(cudaGetLastError());
+}
diff --git a/lib/THC/THCSleep.h b/lib/THC/THCSleep.h
new file mode 100644
index 0000000..18c7be7
--- /dev/null
+++ b/lib/THC/THCSleep.h
@@ -0,0 +1,10 @@
+#ifndef THC_SPIN_INC
+#define THC_SPIN_INC
+
+#include "THCGeneral.h"
+#include <time.h>
+
+// enqueues a kernel that spins for the specified number of cycles
+THC_API void THC_sleep(THCState* state, long long cycles);
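+
+// Example (sketch only): spin the current stream for roughly one million GPU
+// clock cycles, e.g. to exercise stream synchronization in tests:
+//
+//   THC_sleep(state, 1000000LL);
+//   THCudaCheck(cudaStreamSynchronize(THCState_getCurrentStream(state)));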
+
+#endif
diff --git a/lib/THC/THCSortUtils.cuh b/lib/THC/THCSortUtils.cuh
new file mode 100644
index 0000000..ec676c0
--- /dev/null
+++ b/lib/THC/THCSortUtils.cuh
@@ -0,0 +1,171 @@
+#ifndef THC_SORT_UTILS_INC
+#define THC_SORT_UTILS_INC
+
+#include "THCReduceApplyUtils.cuh"
+#include "THCTensorTypeUtils.cuh"
+#include "THCNumerics.cuh"
+
+// Collection of kernel sort routines
+template <typename T>
+struct LTComp {
+  __device__ inline bool operator()(const T& a, const T& b) const {
+    return THCNumerics<T>::lt(a, b);
+  }
+};
+
+template <typename T>
+struct GTComp {
+  __device__ inline bool operator()(const T& a, const T& b) const {
+    return THCNumerics<T>::gt(a, b);
+  }
+};
+
+template <typename T>
+__device__ inline void swapVars(T& t1, T& t2) {
+  T tmp = t1;
+  t1 = t2;
+  t2 = tmp;
+}
+
+template <typename Comparator, typename K, typename V>
+__device__ inline void bitonicSwap(K& kA, V& vA, bool& validA,
+                                   K& kB, V& vB, bool& validB,
+                                   bool dir,
+                                   const Comparator& comp) {
+  // Invalid entries always sort to the end
+  bool swap = (comp(kA, kB) && validA) || !validB;
+  if (swap == dir) {
+    swapVars(kA, kB);
+    swapVars(vA, vB);
+    swapVars(validA, validB);
+  }
+}
+
+template <typename Comparator, typename K, typename V,
+          typename IndexType, int Power2SortSize>
+__device__ inline void bitonicSort(K keys[Power2SortSize],
+                                   V values[Power2SortSize],
+                                   bool valid[Power2SortSize],
+                                   const Comparator& comp) {
+#pragma unroll
+  for (unsigned int size = 2; size < Power2SortSize; size *= 2) {
+    bool flag = ((threadIdx.x & (size / 2)) != 0);
+
+#pragma unroll
+    for (unsigned int stride = size / 2; stride > 0; stride /= 2) {
+
+      // Single warp per slice is completely synchronous
+      if (Power2SortSize > 64) {
+        __syncthreads();
+      }
+
+      unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+      bitonicSwap<Comparator, K, V>(
+        keys[pos], values[pos], valid[pos],
+        keys[pos + stride], values[pos + stride], valid[pos + stride],
+        flag, comp);
+    }
+  }
+
+#pragma unroll
+  for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
+    // Single warp per slice is completely synchronous
+    if (Power2SortSize > 64) {
+      __syncthreads();
+    }
+
+    unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+    bitonicSwap<Comparator, K, V>(
+      keys[pos], values[pos], valid[pos],
+      keys[pos + stride], values[pos + stride], valid[pos + stride],
+      false, comp);
+  }
+
+  // Single warp per slice is completely synchronous
+  if (Power2SortSize > 64) {
+    __syncthreads();
+  }
+}
+
+// Sorts (key, value) pairs (in different tensors) in-place; i.e.,
+// modifies the input `keys` and `values`
+template <typename K, typename V,
+          int KeyDims, int ValueDims,
+          typename Comparator, typename IndexType, int Power2SortSize>
+__global__ void
+bitonicSortKVInPlace(TensorInfo<K, IndexType> keys,
+                     IndexType keySlices,
+                     IndexType keySliceSize,
+                     IndexType keySliceStride,
+                     TensorInfo<V, IndexType> values,
+                     IndexType valueSliceStride,
+                     const Comparator& comp) {
+  // Find the slice of the tensor that we are sorting
+  const IndexType linearIndex = getLinearBlockId<IndexType>();
+  // Tiling the slices could put us out of bounds if there are a
+  // lot of slices to sort
+  if (linearIndex >= keySlices) {
+    return;
+  }
+
+  __shared__ K sharedKeys[Power2SortSize];
+  __shared__ V sharedValues[Power2SortSize];
+  __shared__ bool sharedValid[Power2SortSize];
+
+  const IndexType keyStartOffset =
+    IndexToOffset<K, IndexType, KeyDims>::get(linearIndex, keys);
+  const IndexType valueStartOffset =
+    IndexToOffset<V, IndexType, ValueDims>::get(linearIndex, values);
+
+  // If the sort size is 1, the data is already sorted
+  if (Power2SortSize == 1) {
+    return;
+  } else {
+    // Otherwise, each thread is responsible for loading and storing 2
+    // elements. The sort size is guaranteed to be >= 2
+    const int elem1 = threadIdx.x;
+    const int elem2 = threadIdx.x + (Power2SortSize / 2);
+
+    bool valid1 = (elem1 < keySliceSize);
+    K k1 = valid1 ?
+      keys.data[keyStartOffset + elem1 * keySliceStride] : ScalarConvert<int, K>::to(0);
+    V v1 = valid1 ?
+      values.data[valueStartOffset + elem1 * valueSliceStride] : ScalarConvert<int, V>::to(0);
+
+    sharedKeys[elem1] = k1;
+    sharedValues[elem1] = v1;
+    sharedValid[elem1] = valid1;
+
+    bool valid2 = (elem2 < keySliceSize);
+    K k2 = valid2 ?
+      keys.data[keyStartOffset + elem2 * keySliceStride] : ScalarConvert<int, K>::to(0);
+    V v2 = valid2 ?
+      values.data[valueStartOffset + elem2 * valueSliceStride] : ScalarConvert<int, V>::to(0);
+
+    sharedKeys[elem2] = k2;
+    sharedValues[elem2] = v2;
+    sharedValid[elem2] = valid2;
+
+    // Sort!
+    bitonicSort<Comparator, K, V, IndexType, Power2SortSize>(
+      sharedKeys, sharedValues, sharedValid, comp);
+
+    // elem1 and elem2 values might be out-of-range, if the data size we are
+    // sorting is smaller than half the power2 size
+    if (valid1) {
+      keys.data[keyStartOffset + elem1 * keySliceStride] =
+        sharedKeys[elem1];
+      values.data[valueStartOffset + elem1 * valueSliceStride] =
+        sharedValues[elem1];
+    }
+
+    if (valid2) {
+      keys.data[keyStartOffset + elem2 * keySliceStride] =
+        sharedKeys[elem2];
+      values.data[valueStartOffset + elem2 * valueSliceStride] =
+        sharedValues[elem2];
+    }
+  }
+}
+
+#endif // THC_SORT_UTILS_INC
diff --git a/lib/THC/THCStorage.c b/lib/THC/THCStorage.c
new file mode 100644
index 0000000..669efa8
--- /dev/null
+++ b/lib/THC/THCStorage.c
@@ -0,0 +1,8 @@
+#include "THCStorage.h"
+#include "THCGeneral.h"
+#include "THAtomic.h"
+
+#include "THCHalf.h"
+
+#include "generic/THCStorage.c"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCStorage.cu b/lib/THC/THCStorage.cu
new file mode 100644
index 0000000..5555c6f
--- /dev/null
+++ b/lib/THC/THCStorage.cu
@@ -0,0 +1,13 @@
+#include "THCStorage.h"
+
+#include "THCThrustAllocator.cuh"
+#include <thrust/device_ptr.h>
+#include <thrust/fill.h>
+#if CUDA_VERSION >= 7000
+#include <thrust/system/cuda/execution_policy.h>
+#endif
+
+#include "THCHalf.h"
+
+#include "generic/THCStorage.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCStorage.h b/lib/THC/THCStorage.h
new file mode 100644
index 0000000..ac1cd70
--- /dev/null
+++ b/lib/THC/THCStorage.h
@@ -0,0 +1,17 @@
+#ifndef THC_STORAGE_INC
+#define THC_STORAGE_INC
+
+#include "THStorage.h"
+#include "THCGeneral.h"
+
+#define THCStorage        TH_CONCAT_3(TH,CReal,Storage)
+#define THCStorage_(NAME) TH_CONCAT_4(TH,CReal,Storage_,NAME)
+
+/* fast access methods */
+#define THC_STORAGE_GET(storage, idx) ((storage)->data[(idx)])
+#define THC_STORAGE_SET(storage, idx, value) ((storage)->data[(idx)] = (value))
+
+#include "generic/THCStorage.h"
+#include "THCGenerateAllTypes.h"
+
+#endif
diff --git a/lib/THC/THCStorageCopy.c b/lib/THC/THCStorageCopy.c
new file mode 100644
index 0000000..ee9bf81
--- /dev/null
+++ b/lib/THC/THCStorageCopy.c
@@ -0,0 +1,6 @@
+#include "THCStorageCopy.h"
+
+#include "THCTensorCopy.h"
+
+#include "generic/THCStorageCopy.c"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCStorageCopy.cu b/lib/THC/THCStorageCopy.cu
new file mode 100644
index 0000000..5664188
--- /dev/null
+++ b/lib/THC/THCStorageCopy.cu
@@ -0,0 +1,8 @@
+#include "THCStorageCopy.h"
+#include "THCGeneral.h"
+
+#include "THCHalf.h"
+#include "THCTensorCopy.h"
+
+#include "generic/THCStorageCopy.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCStorageCopy.h b/lib/THC/THCStorageCopy.h
new file mode 100644
index 0000000..837056f
--- /dev/null
+++ b/lib/THC/THCStorageCopy.h
@@ -0,0 +1,11 @@
+#ifndef THC_STORAGE_COPY_INC
+#define THC_STORAGE_COPY_INC
+
+#include "THCStorage.h"
+#include "THCGeneral.h"
+#include "THCHalf.h"
+
+#include "generic/THCStorageCopy.h"
+#include "THCGenerateAllTypes.h"
+
+#endif
diff --git a/lib/THC/THCStream.c b/lib/THC/THCStream.c
new file mode 100644
index 0000000..e261a51
--- /dev/null
+++ b/lib/THC/THCStream.c
@@ -0,0 +1,30 @@
+#include "THCStream.h"
+
+#include <cuda_runtime_api.h>
+#include "THAtomic.h"
+
+
+THCStream* THCStream_new(int flags)
+{
+  THCStream* self = (THCStream*) malloc(sizeof(THCStream));
+  self->refcount = 1;
+  THCudaCheck(cudaGetDevice(&self->device));
+  THCudaCheck(cudaStreamCreateWithFlags(&self->stream, flags));
+  return self;
+}
+
+void THCStream_free(THCStream* self)
+{
+  if (!self) {
+    return;
+  }
+  if (THAtomicDecrementRef(&self->refcount)) {
+    THCudaCheck(cudaStreamDestroy(self->stream));
+    free(self);
+  }
+}
+
+void THCStream_retain(THCStream* self)
+{
+  THAtomicIncrementRef(&self->refcount);
+}
diff --git a/lib/THC/THCStream.h b/lib/THC/THCStream.h
new file mode 100644
index 0000000..de3f64e
--- /dev/null
+++ b/lib/THC/THCStream.h
@@ -0,0 +1,19 @@
+#ifndef THC_STREAM_INC
+#define THC_STREAM_INC
+
+#include <cuda_runtime_api.h>
+#include "THCGeneral.h"
+
+struct THCStream
+{
+    cudaStream_t stream;
+    int device;
+    int refcount;
+};
+
+
+THC_API THCStream* THCStream_new(int flags);
+THC_API void THCStream_free(THCStream* self);
+THC_API void THCStream_retain(THCStream* self);
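+
+// Illustrative lifecycle sketch (not part of this header): create a
+// non-blocking stream, use it, then drop the reference.
+//
+//   THCStream* s = THCStream_new(cudaStreamNonBlocking);
+//   ...                      // launch work on s->stream
+//   THCStream_free(s);       // destroys the stream once the refcount drops to zero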
+
+#endif // THC_STREAM_INC
diff --git a/lib/THC/THCTensor.c b/lib/THC/THCTensor.c
new file mode 100644
index 0000000..3bcf69d
--- /dev/null
+++ b/lib/THC/THCTensor.c
@@ -0,0 +1,7 @@
+#include "THCGeneral.h"
+#include "THCTensor.h"
+#include "THCTensorCopy.h"
+#include "THAtomic.h"
+
+#include "generic/THCTensor.c"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensor.cu b/lib/THC/THCTensor.cu
new file mode 100644
index 0000000..1e6fc20
--- /dev/null
+++ b/lib/THC/THCTensor.cu
@@ -0,0 +1,4 @@
+#include "THCTensor.h"
+
+#include "generic/THCTensor.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensor.h b/lib/THC/THCTensor.h
new file mode 100644
index 0000000..e2521b1
--- /dev/null
+++ b/lib/THC/THCTensor.h
@@ -0,0 +1,21 @@
+#ifndef THC_TENSOR_INC
+#define THC_TENSOR_INC
+
+#include "THTensor.h"
+#include "THCStorage.h"
+#include "THCGeneral.h"
+
+#define THCTensor          TH_CONCAT_3(TH,CReal,Tensor)
+#define THCTensor_(NAME)   TH_CONCAT_4(TH,CReal,Tensor_,NAME)
+
+#define THC_DESC_BUFF_LEN 64
+
+typedef struct THC_CLASS THCDescBuff
+{
+    char str[THC_DESC_BUFF_LEN];
+} THCDescBuff;
+
+#include "generic/THCTensor.h"
+#include "THCGenerateAllTypes.h"
+
+#endif
diff --git a/lib/THC/THCTensorConv.cu b/lib/THC/THCTensorConv.cu
new file mode 100644
index 0000000..71aac03
--- /dev/null
+++ b/lib/THC/THCTensorConv.cu
@@ -0,0 +1,953 @@
+#include "THCTensorConv.h"
+#include "THCTensorMath.h"
+#include "THCTensorCopy.h"
+#include "THCGeneral.h"
+#include <stdio.h>
+
+/*
+ * Description:
+ *   This code provides convolutions and xcorrelations that are API compatible with
+ *   the ones in THLabConv.
+ *
+ * History:
+ *   Sept 11, 2011, 11:59PM  -  Clement Farabet  -  Optimized RevConv by a good x2
+ *   July 22, 2011, 8:38PM   -  Clement Farabet  -  All Valid/Full/XCORR/CONV implemented
+ *   July 22, 2011, 4:00PM   -  Clement Farabet  -  Rewrote for loop to ensure memory coalescing
+ *   July 21, 2011, 11:21PM  -  Clement Farabet  -  Creation, based on the conv2d routine
+ */
+
+#define CUDA_SHARED_MEM_SIZE (12*1024-32) // this is given by NVIDIA: max shared mem per block
+
+/*
+ * Description:
+ *   base conv2D routine: 3D input, 3D output, 4D kernel
+ *
+ *   - all chunks of data should be contiguous
+ *   - the swapkernel flag can be used to generate a conv2 instead of xcorr2
+ *   - the templated kernel size is useful to generate code that's 2x faster
+ *     but can be set to 0 to allow arbitrary kernel sizes
+ */
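+// Worked example of the 'valid' output geometry computed below (illustration
+// only): input_h = input_w = 32, kernel_h = kernel_w = 5, stride_h = stride_w = 1
+//   => output_h = output_w = (32 - 5) / 1 + 1 = 28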
+template <bool swapkernel, int T_kernel_h, int T_kernel_w>
+  __global__ void conv2generic(float *input, float *kernel, float *output,
+                               int input_n, int input_h, int input_w,
+                               int kernel_n, int kernel_h, int kernel_w,
+                               int stride_h, int stride_w)
+{
+  // output dimensions
+  int output_h = (input_h - kernel_h) / stride_h + 1;
+  int output_w = (input_w - kernel_w) / stride_w + 1;
+
+  // xcorr or conv
+  int koffset = swapkernel ? kernel_w*kernel_h-1 : 0;
+
+  // nb outputs
+  int output_n = kernel_n / input_n;
+
+  // generate offsets according to block/thread ids
+  int xx_start = threadIdx.x;
+  int xx_end = output_w;
+  int xx_step = blockDim.x;
+
+  int yy_start = blockDim.y*blockIdx.y + threadIdx.y;
+  int yy_end = output_h;
+  int yy_step = blockDim.y*gridDim.y;
+
+  int oo_start = blockIdx.x;
+  int oo_end = oo_start+1;
+
+  int ii_start = (blockIdx.x / output_n) * input_n;
+  int ii_end = ii_start + input_n;
+
+  // nb threads, unique thread id
+  int tid = blockDim.x*blockDim.y*threadIdx.z + blockDim.x * threadIdx.y + threadIdx.x;
+  int nthreads = blockDim.x * blockDim.y * blockDim.z;
+
+  // iterators
+  int oo, ii, xx, yy, kx, ky, kk;
+
+  // do the kernels fit in shared mem ?
+  if (input_n*kernel_w*kernel_h <= CUDA_SHARED_MEM_SIZE) {
+
+    // put the kernel in shared memory
+    __shared__ float shared_kernel[CUDA_SHARED_MEM_SIZE];
+
+    // first thread of each block does the copy
+    for (kk = tid; kk < kernel_w*kernel_h*input_n; kk += nthreads) {
+      shared_kernel[kk] = kernel[input_n*kernel_w*kernel_h*(oo_start % output_n) + kk];
+    }
+    __syncthreads();
+
+    // templated kernel size
+    if ((T_kernel_w > 0) && (T_kernel_h > 0)) {
+      // unrolled convolution loop
+      for(oo = oo_start; oo < oo_end; oo++) {
+        for(ii = ii_start; ii < ii_end; ii++) {
+          for(yy = yy_start; yy < yy_end; yy+=yy_step) {
+            for(xx = xx_start; xx < xx_end; xx+=xx_step) {
+              // Dot product in two dimensions... (between input image and the mask)
+              float *input_p = input + ii*input_h*input_w + yy*stride_h*input_w + xx*stride_w;
+              float *output_p = output + oo*output_h*output_w + yy*output_w + xx;
+              float *kernel_p = shared_kernel + (ii % input_n)*kernel_w*kernel_h + koffset;
+              float sum = 0;
+              if (swapkernel) {
+#pragma unroll
+                for(ky = 0; ky < T_kernel_h; ky++) {
+#pragma unroll
+                  for(kx = 0; kx < T_kernel_w; kx++) {
+                    sum += input_p[kx]*(*kernel_p--);
+                  }
+                  input_p += input_w;
+                }
+              } else {
+#pragma unroll
+                for(ky = 0; ky < T_kernel_h; ky++) {
+#pragma unroll
+                  for(kx = 0; kx < T_kernel_w; kx++) {
+                    sum += input_p[kx]*(*kernel_p++);
+                  }
+                  input_p += input_w;
+                }
+              }
+              *output_p += sum;
+            }
+          }
+        }
+      }
+    } else {
+      // default convolution loop
+      for(oo = oo_start; oo < oo_end; oo++) {
+        for(ii = ii_start; ii < ii_end; ii++) {
+          for(yy = yy_start; yy < yy_end; yy+=yy_step) {
+            for(xx = xx_start; xx < xx_end; xx+=xx_step) {
+              // Dot product in two dimensions... (between input image and the mask)
+              float *input_p = input + ii*input_h*input_w + yy*stride_h*input_w + xx*stride_w;
+              float *output_p = output + oo*output_h*output_w + yy*output_w + xx;
+              float *kernel_p = shared_kernel + (ii % input_n) * kernel_w * kernel_h + koffset;
+              float sum = 0;
+              if (swapkernel) {
+                for(ky = 0; ky < kernel_h; ky++) {
+#pragma unroll 5
+                  for(kx = 0; kx < kernel_w; kx++) {
+                    sum += input_p[kx]*(*kernel_p--);
+                  }
+                  input_p += input_w;
+                }
+              } else {
+                for(ky = 0; ky < kernel_h; ky++) {
+#pragma unroll 5
+                  for(kx = 0; kx < kernel_w; kx++) {
+                    sum += input_p[kx]*(*kernel_p++);
+                  }
+                  input_p += input_w;
+                }
+              }
+              *output_p += sum;
+            }
+          }
+        }
+      }
+    }
+
+  } else { // not enough shared mem for kernels, simply stream them
+
+    // convolution loop
+    for(oo = oo_start; oo < oo_end; oo++) {
+      for(ii = ii_start; ii < ii_end; ii++) {
+        for(yy = yy_start; yy < yy_end; yy+=yy_step) {
+          for(xx = xx_start; xx < xx_end; xx+=xx_step) {
+            // Dot product in two dimensions... (between input image and the mask)
+            float *input_p = input + ii*input_h*input_w + yy*stride_h*input_w + xx*stride_w;
+            float *output_p = output + oo*output_h*output_w + yy*output_w + xx;
+            float *kernel_p = kernel + ((oo % output_n) * input_n + (ii % input_n))*kernel_w*kernel_h + koffset;
+            float sum = 0;
+            if (swapkernel) {
+              for(ky = 0; ky < kernel_h; ky++) {
+#pragma unroll 5
+                for(kx = 0; kx < kernel_w; kx++) {
+                  sum += input_p[kx]*(*kernel_p--);
+                }
+                input_p += input_w;
+              }
+            } else {
+              for(ky = 0; ky < kernel_h; ky++) {
+#pragma unroll 5
+                for(kx = 0; kx < kernel_w; kx++) {
+                  sum += input_p[kx]*(*kernel_p++);
+                }
+                input_p += input_w;
+              }
+            }
+            *output_p += sum;
+          }
+        }
+      }
+    }
+  }
+}
+
+/*
+ * Description:
+ *   base conv2D routine with reversed stride: 3D input, 4D output, 3D kernel
+ *   this is useful for computing gradients with respect to kernels, where:
+ *   input=input, kernel=gradOutput, output=gradWeight
+ *
+ *   - all chunks of data should be contiguous
+ *   - the swapkernel flag can be used to generate a conv2 instead of xcorr2
+ */
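+// Reversed-stride output geometry used below (illustration only):
+// input_h = 32, kernel_h = 8, stride_h = 2  =>  output_h = 32 - (8 - 1) * 2 = 18
+// (and likewise for the width dimension)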
+__global__ void conv2genericrev(float *input, float *kernel, float *output,
+                                int input_n, int input_h, int input_w,
+                                int kernel_n, int kernel_h, int kernel_w,
+                                float alpha, int stride_h, int stride_w)
+{
+  // output dimensions
+  int output_h = input_h - (kernel_h - 1) * stride_h;
+  int output_w = input_w - (kernel_w - 1) * stride_w;
+
+  // this thread only processes one output, defined by the block Ids
+  int kk = blockIdx.x;
+  int ii = blockIdx.y;
+
+  // batch id
+  int batch = threadIdx.z;
+
+  // kernel id
+  int kid = threadIdx.x;
+  int nkids = blockDim.x;
+
+  // thread ID
+  int tid = kid + batch*blockDim.x;
+  int nthreads = blockDim.x * blockDim.z;
+
+  // one thread only sees one output
+  output = output + (kk * input_n + ii) * output_h*output_w;
+
+  // put the output in shared memory
+  __shared__ float shared_output[CUDA_SHARED_MEM_SIZE];
+
+  // generate tid outputs in shared memory
+  float *output_s = shared_output + tid*output_w*output_h;
+
+  // convolution loop
+  int xx, yy, kx, ky;
+  yy = threadIdx.y;
+  float *output_p = output_s + yy * output_w;
+  for(xx=0; xx<output_w; xx++) {
+    // Dot product in two dimensions... (between input image and kernel)
+    float *input_p = input + (ii + batch*input_n)*input_h*input_w + yy*stride_h*input_w + xx*stride_w;
+    float *kernel_p = kernel + (kk + batch*kernel_n)*kernel_w*kernel_h;
+    float sum = 0;
+    for(ky=0; ky<kernel_h; ky++) {
+      for(kx=kid; kx<kernel_w; kx+=nkids) {
+        sum += input_p[kx]*kernel_p[kx];
+      }
+      input_p += input_w;
+      kernel_p += kernel_w;
+    }
+    *(output_p++) = sum;
+  }
+  __syncthreads();
+
+  // reduce and write back
+  if (yy == 0) {
+    // reduce outputs
+    for (int k=1; k<nthreads; k++) {
+      for (int i=tid; i<output_w*output_h; i+=nthreads) {
+        shared_output[i] += shared_output[k*output_h*output_w + i];
+      }
+    }
+    __syncthreads();
+
+    // add existing output, and write back
+    for (int i=tid; i<output_w*output_h; i+=nthreads) {
+      output[i] += alpha*shared_output[i];
+    }
+  }
+}
+
+// A helper macro for the common pattern of checking the input
+// rows/columns for a small number of values, specializing the kernel
+// template parameters if rows/columns are equal and small, and
+// otherwise just passing zero to the kernel.
+#define FOR_KERNEL_SPECIALIZED_DIMENSION(ROWS, COLUMNS, KERNEL) \
+  if ((ROWS) == (COLUMNS)) {                                    \
+    switch ((ROWS)) {                                           \
+      case 3: { KERNEL(3); break; }                             \
+      case 4: { KERNEL(4); break; }                             \
+      case 5: { KERNEL(5); break; }                             \
+      case 6: { KERNEL(6); break; }                             \
+      case 7: { KERNEL(7); break; }                             \
+      case 8: { KERNEL(8); break; }                             \
+      case 9: { KERNEL(9); break; }                             \
+      case 10: { KERNEL(10); break; }                           \
+      case 11: { KERNEL(11); break; }                           \
+      case 12: { KERNEL(12); break; }                           \
+      case 13: { KERNEL(13); break; }                           \
+      default: { KERNEL(0); break; }                            \
+    }                                                           \
+  } else {                                                      \
+    KERNEL(0);                                                  \
+  }
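+
+// For example (illustration only), FOR_KERNEL_SPECIALIZED_DIMENSION(5, 5, K)
+// expands to K(5), picking the fully unrolled 5x5 template instantiation,
+// while FOR_KERNEL_SPECIALIZED_DIMENSION(5, 7, K) expands to K(0), the
+// generic path that handles arbitrary kernel sizes.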
+
+/*
+ * API-compatible with THRealTensor_conv2Dmv
+ * 3D input, 4D kernel, 3D output
+ * matrix vector product like: y <- Ax + beta*y
+ */
+THC_API void THCudaTensor_conv2Dmv(THCState *state, THCudaTensor *output, float beta, THCudaTensor *input,
+                                   THCudaTensor *kernel, long srow, long scol, const char *type)
+{
+  THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel));
+  long nInputPlane, nInputRows, nInputCols;
+  long nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputRows, nOutputCols;
+
+  THArgCheck(kernel->nDimension == 4 , 4, "kernel: 4D Tensor expected");
+  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
+  THArgCheck(type[0] == 'v' || type[0] == 'f', 7, "type of convolution can be 'v' or 'f'");
+  THArgCheck(type[1] == 'c' || type[1] == 'x', 7, "type of convolution can be 'x' or 'c'");
+
+  input = THCudaTensor_newContiguous(state, input);
+  kernel = THCudaTensor_newContiguous(state, kernel);
+
+  nInputPlane = input->size[0];
+  nInputRows  = input->size[1];
+  nInputCols  = input->size[2];
+
+  nKernelRows  = kernel->size[2];
+  nKernelCols  = kernel->size[3];
+  nOutputPlane = kernel->size[0];
+  THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes");
+
+  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *type == 'f', 2,
+              "conv2Dmv : Input image is smaller than kernel");
+
+  if (*type == 'f') {
+    // output dims
+    nOutputRows = (nInputRows - 1) * srow + nKernelRows;
+    nOutputCols = (nInputCols - 1) * scol + nKernelCols;
+
+    // use temp buffer
+    THCudaTensor *inputP = THCudaTensor_new(state);
+
+    // create a zero-padded input
+    long nInputRowsPadded = (nOutputRows - 1) * srow + nKernelRows;
+    long nInputColsPadded = (nOutputCols - 1) * scol + nKernelCols;
+    THCudaTensor_resize3d(state, inputP, nInputPlane, nInputRowsPadded, nInputColsPadded);
+    THCudaTensor_zero(state, inputP);
+
+    THCudaTensor *centered = THCudaTensor_new(state);
+    THCudaTensor_narrow(state, centered, inputP, 2, nKernelCols-1, nInputCols);
+    THCudaTensor_narrow(state, centered, NULL, 1, nKernelRows-1, nInputRows);
+    THCudaTensor_copy(state, centered, input);
+    THCudaTensor_free(state, centered);
+
+    // remap input to newly created tensor
+    THCudaTensor_free(state, input);
+    input = inputP;
+    nInputRows = nInputRowsPadded;
+    nInputCols = nInputColsPadded;
+
+  } else { // 'v'
+    // output dims
+    nOutputRows = (nInputRows - nKernelRows) / srow + 1;
+    nOutputCols = (nInputCols - nKernelCols) / scol + 1;
+  }
+
+  ptrdiff_t nelem = THCudaTensor_nElement(state, output);
+  THCudaTensor_resize3d(state, output, nOutputPlane, nOutputRows, nOutputCols);
+
+  if (beta == 0 || nelem != THCudaTensor_nElement(state, output)) {
+    THCudaTensor_zero(state, output);
+  } else if (beta != 1) {
+    THCudaTensor_mul(state, output, output, beta);
+  }
+
+  float *input_data = THCudaTensor_data(state, input);
+  float *weight_data = THCudaTensor_data(state, kernel);
+  float *output_data = THCudaTensor_data(state, output);
+
+  // cuda blocks & threads:
+  int yblocks = (int)(16L / nOutputPlane);
+  yblocks = yblocks < 1 ? 1 : yblocks;
+  dim3 blocks(nOutputPlane,yblocks);
+  dim3 threads(32,8);
+
+  // convolution: xcorr2 or conv2
+  if (type[1] == 'x') {
+#define X_CONV_KERNEL(dim)                                              \
+    conv2generic <false, (dim), (dim)> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( \
+        input_data, weight_data, output_data,                           \
+        nInputPlane, nInputRows, nInputCols,                            \
+        nOutputPlane*nInputPlane, nKernelRows, nKernelCols,             \
+        srow, scol);
+
+    FOR_KERNEL_SPECIALIZED_DIMENSION(nKernelRows, nKernelCols, X_CONV_KERNEL);
+#undef X_CONV_KERNEL
+  } else { // 'c'
+#define C_CONV_KERNEL(dim)                                              \
+    conv2generic <true, (dim), (dim)> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (           \
+        input_data, weight_data, output_data,                           \
+        nInputPlane, nInputRows, nInputCols,                            \
+        nOutputPlane*nInputPlane, nKernelRows, nKernelCols,             \
+        srow, scol);
+
+    FOR_KERNEL_SPECIALIZED_DIMENSION(nKernelRows, nKernelCols, C_CONV_KERNEL);
+#undef C_CONV_KERNEL
+  }
+
+  // clean
+  if (*type != 'f') THCudaTensor_free(state, input);
+  THCudaTensor_free(state, kernel);
+
+  // check for errors
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("error in conv2Dmv: %s\n", cudaGetErrorString(err));
+    THError("aborting");
+  }
+}
+
+/*
+ * API-compatible with THRealTensor_conv2Dmm
+ * 4D input, 4D kernel, 4D output
+ * matrix vector product like: y <- Ax + beta*y
+ */
+THC_API void THCudaTensor_conv2Dmm(THCState *state, THCudaTensor *output, float beta, THCudaTensor *input,
+                                   THCudaTensor *kernel, long srow, long scol, const char *type)
+{
+  THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel));
+  long nbatch, nInputPlane, nInputRows, nInputCols;
+  long nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputRows, nOutputCols;
+
+  THArgCheck(kernel->nDimension == 4 , 4, "kernel: 4D Tensor expected");
+  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
+  THArgCheck(type[0] == 'v' || type[0] == 'f', 7, "type of convolution can be 'v' or 'f'");
+  THArgCheck(type[1] == 'c' || type[1] == 'x', 7, "type of convolution can be 'x' or 'c'");
+
+  input = THCudaTensor_newContiguous(state, input);
+  kernel = THCudaTensor_newContiguous(state, kernel);
+
+  nbatch      = input->size[0];
+  nInputPlane = input->size[1];
+  nInputRows  = input->size[2];
+  nInputCols  = input->size[3];
+
+  nKernelRows  = kernel->size[2];
+  nKernelCols  = kernel->size[3];
+  nOutputPlane = kernel->size[0];
+  THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes");
+
+  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *type == 'f', 2,
+              "conv2Dmm : Input image is smaller than kernel");
+
+  if (*type == 'f') {
+    // output dims
+    nOutputRows = (nInputRows - 1) * srow + nKernelRows;
+    nOutputCols = (nInputCols - 1) * scol + nKernelCols;
+
+    // use temp buffer
+    THCudaTensor *inputP = THCudaTensor_new(state);
+
+    // create a zero-padded input
+    long nInputRowsPadded = (nOutputRows - 1) * srow + nKernelRows;
+    long nInputColsPadded = (nOutputCols - 1) * scol + nKernelCols;
+    THCudaTensor_resize4d(state, inputP, nbatch, nInputPlane, nInputRowsPadded, nInputColsPadded);
+    THCudaTensor_zero(state, inputP);
+
+    THCudaTensor *centered = THCudaTensor_new(state);
+    THCudaTensor_narrow(state, centered, inputP, 3, nKernelCols-1, nInputCols);
+    THCudaTensor_narrow(state, centered, NULL, 2, nKernelRows-1, nInputRows);
+    THCudaTensor_copy(state, centered, input);
+    THCudaTensor_free(state, centered);
+
+    // remap input to newly created tensor
+    THCudaTensor_free(state, input);
+    input = inputP;
+    nInputRows = nInputRowsPadded;
+    nInputCols = nInputColsPadded;
+
+  } else { // 'v'
+    // output dims
+    nOutputRows = (nInputRows - nKernelRows) / srow + 1;
+    nOutputCols = (nInputCols - nKernelCols) / scol + 1;
+  }
+
+  ptrdiff_t nelem = THCudaTensor_nElement(state, output);
+  THCudaTensor_resize4d(state, output, nbatch, nOutputPlane, nOutputRows, nOutputCols);
+
+  if (beta == 0 || nelem != THCudaTensor_nElement(state, output)) {
+    THCudaTensor_zero(state, output);
+  } else if (beta != 1) {
+    THCudaTensor_mul(state, output, output, beta);
+  }
+
+  float *input_data = THCudaTensor_data(state, input);
+  float *weight_data = THCudaTensor_data(state, kernel);
+  float *output_data = THCudaTensor_data(state, output);
+
+  // cuda blocks & threads:
+  int yblocks = (int)(16L / nOutputPlane);
+  yblocks = yblocks < 1 ? 1 : yblocks;
+  dim3 blocks(nOutputPlane*nbatch,yblocks);
+  dim3 threads(32,8);
+
+  // convolution: xcorr2 or conv2
+  if (type[1] == 'x') {
+#define X_CONV_KERNEL(dim)                                              \
+    conv2generic <false, (dim), (dim)> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( \
+      input_data, weight_data, output_data,                             \
+      nInputPlane, nInputRows, nInputCols,                              \
+      nOutputPlane*nInputPlane, nKernelRows, nKernelCols,               \
+      srow, scol);
+
+    FOR_KERNEL_SPECIALIZED_DIMENSION(nKernelCols, nKernelRows, X_CONV_KERNEL);
+#undef X_CONV_KERNEL
+  } else { // 'c'
+#define C_CONV_KERNEL(dim)                                              \
+    conv2generic <true, (dim), (dim)> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( \
+      input_data, weight_data, output_data,                             \
+      nInputPlane, nInputRows, nInputCols,                              \
+      nOutputPlane*nInputPlane, nKernelRows, nKernelCols,               \
+      srow, scol);
+
+    FOR_KERNEL_SPECIALIZED_DIMENSION(nKernelCols, nKernelRows, C_CONV_KERNEL);
+#undef C_CONV_KERNEL
+  }
+
+  // clean
+  if (*type != 'f') THCudaTensor_free(state, input);
+  THCudaTensor_free(state, kernel);
+
+  // check for errors
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, 0);
+    printf("error in conv2Dmm: %s\n", cudaGetErrorString(err));
+    printf("requested grid size: %dx%dx%d, max allowed: %dx%dx%d\n",
+           blocks.x, blocks.y, blocks.z,
+           deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
+    printf("requested block size: %dx%dx%d, max allowed: %dx%dx%d\n",
+           threads.x, threads.y, threads.z,
+           deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
+    THError("aborting");
+  }
+}
+
+/*
+ * API-compatible with THRealTensor_conv2DRevger
+ * 3D input, 3D kernel, 4D output
+ * like rank1 update
+ * A <- xx' + beta*A
+ * for sr,sc=1 this is equivalent to xcorr2Dger, but otherwise it is useful for
+ * calculating derivatives wrt a kernel that is applied with stride sr,sc != 1
+ */
+THC_API void THCudaTensor_conv2DRevger(THCState *state, THCudaTensor *output, float beta, float alpha,
+                                       THCudaTensor *input, THCudaTensor *kernel,
+                                       long srow, long scol)
+{
+  THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel));
+  long nInputPlane, nInputRows, nInputCols;
+  long nKernelPlane, nKernelRows, nKernelCols;
+  long nOutputRows, nOutputCols;
+
+  THArgCheck(input->nDimension == 3 , 3, "input: 3D Tensor expected");
+  THArgCheck(kernel->nDimension == 3 , 4, "kernel: 3D Tensor expected");
+  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
+
+  input = THCudaTensor_newContiguous(state, input);
+  kernel = THCudaTensor_newContiguous(state, kernel);
+
+  nInputPlane = input->size[0];
+  nInputRows  = input->size[1];
+  nInputCols  = input->size[2];
+
+  nKernelPlane = kernel->size[0];
+  nKernelRows = kernel->size[1];
+  nKernelCols = kernel->size[2];
+
+  THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2,
+             "conv2DRevger : Input image is smaller than kernel");
+
+  nOutputRows = nInputRows - (nKernelRows - 1) * srow;
+  nOutputCols = nInputCols - (nKernelCols - 1) * scol;
+
+  ptrdiff_t nelem = THCudaTensor_nElement(state, output);
+  THCudaTensor_resize4d(state, output, nKernelPlane, nInputPlane, nOutputRows, nOutputCols);
+
+  if (nelem == 0 || beta == 0 || nelem != THCudaTensor_nElement(state, output)) {
+    THCudaTensor_zero(state, output);
+  } else if (beta != 1) {
+    THCudaTensor_mul(state, output, output, beta);
+  }
+
+  float *input_data = THCudaTensor_data(state, input);
+  float *kernel_data = THCudaTensor_data(state, kernel);
+  float *output_data = THCudaTensor_data(state, output);
+
+  // auto compute nb of blocks and threads
+  dim3 blocks(nKernelPlane, nInputPlane);
+  dim3 threads(128/nOutputRows, nOutputRows);
+
+  // compute rev conv
+  conv2genericrev <<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
+    input_data, kernel_data, output_data,
+    nInputPlane, nInputRows, nInputCols,
+    nKernelPlane, nKernelRows, nKernelCols,
+    alpha, srow, scol);
+
+  // clean
+  THCudaTensor_free(state, input);
+  THCudaTensor_free(state, kernel);
+
+  // check for errors
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("error in conv2DRevger: %s\n", cudaGetErrorString(err));
+    THError("aborting");
+  }
+}
+
+/*
+ * API-compatible with THRealTensor_conv2DRevgerm
+ * 4D input, 4D kernel, 4D output
+ * conv2DRevgerm is doing the same thing as conv2DRevger, but with batch inputs
+ */
+THC_API void THCudaTensor_conv2DRevgerm(THCState *state, THCudaTensor *output, float beta, float alpha,
+                                        THCudaTensor *input, THCudaTensor *kernel,
+                                        long srow, long scol)
+{
+  long nInputPlane, nInputRows, nInputCols;
+  long nKernelPlane, nKernelRows, nKernelCols;
+  long nOutputRows, nOutputCols;
+  long nbatch;
+
+  THArgCheck(input->nDimension == 4 , 3, "input: 4D Tensor expected");
+  THArgCheck(kernel->nDimension == 4 , 4, "kernel: 4D Tensor expected");
+  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
+
+  input = THCudaTensor_newContiguous(state, input);
+  kernel = THCudaTensor_newContiguous(state, kernel);
+
+  nbatch      = input->size[0];
+  nInputPlane = input->size[1];
+  nInputRows  = input->size[2];
+  nInputCols  = input->size[3];
+
+  nKernelPlane = kernel->size[1];
+  nKernelRows = kernel->size[2];
+  nKernelCols = kernel->size[3];
+
+  THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2,
+             "conv2DRevger : Input image is smaller than kernel");
+
+  nOutputRows = nInputRows - (nKernelRows - 1) * srow;
+  nOutputCols = nInputCols - (nKernelCols - 1) * scol;
+
+  ptrdiff_t nelem = THCudaTensor_nElement(state, output);
+  THCudaTensor_resize4d(state, output, nKernelPlane, nInputPlane, nOutputRows, nOutputCols);
+
+  if (nelem == 0 || beta == 0 || nelem != THCudaTensor_nElement(state, output)) {
+    THCudaTensor_zero(state, output);
+  } else if (beta != 1) {
+    THCudaTensor_mul(state, output, output, beta);
+  }
+
+  float *input_data = THCudaTensor_data(state, input);
+  float *kernel_data = THCudaTensor_data(state, kernel);
+  float *output_data = THCudaTensor_data(state, output);
+
+  // kernel is called multiple times
+  // (the arbitrary split below is just here to make sure we don't go over 256 threads)
+  for (int sl=0; sl<nbatch; sl+=6) {
+    // auto compute nb of blocks and threads
+    dim3 blocks(nKernelPlane, nInputPlane);
+    int subbatch = 6;
+    if (sl+subbatch > nbatch) subbatch = nbatch - sl;
+    int cst = 256 / (subbatch * nOutputRows);
+    dim3 threads(cst, nOutputRows, subbatch);
+
+    // compute rev conv
+    conv2genericrev <<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
+      input_data + input->stride[0]*sl,
+      kernel_data + kernel->stride[0]*sl,
+      output_data,
+      nInputPlane, nInputRows, nInputCols,
+      nKernelPlane, nKernelRows, nKernelCols,
+      alpha, srow, scol);
+  }
+
+  // clean
+  THCudaTensor_free(state, input);
+  THCudaTensor_free(state, kernel);
+
+  // check for errors
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("error in conv2DRevger: %s\n", cudaGetErrorString(err));
+    THError("aborting");
+  }
+}
+
+
+///////////////////////////////////
+///// ConvolutionMap
+/*
+ * Description:
+ *   base conv2D routine: 3D input, 3D output, 4D kernel
+ *
+ *   - all chunks of data should be contiguous
+ *   - the swapkernel flag can be used to generate a conv2 instead of xcorr2
+ *   - the templated kernel size is useful to generate code that's 2x faster
+ *     but can be set to 0 to allow arbitrary kernel sizes
+ *   ---- the table should have the first dim with the outputs, each output
+ *   ---- should have a fanin set of inputs contiguously
+ */
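+// Connection-table layout example (illustration only), fanin = 2: entries are
+// 1-based (input plane, kernel index) pairs stored contiguously per output:
+//   output plane 0: table[0..3] = { in_a, k_a, in_b, k_b }
+//   output plane 1: table[4..7] = { in_c, k_c, in_d, k_d }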
+template <bool swapkernel, int T_kernel_h, int T_kernel_w>
+  __global__ void conv2mapgeneric(float *input, float *kernel, float *output,
+                                  int input_n, int input_h, int input_w,
+                                  int kernel_n, int kernel_h, int kernel_w,
+                                  int stride_w, int stride_h,
+                                  float *table, int fanin)
+{
+  // output dimensions
+  int output_h = (input_h - kernel_h) / stride_h + 1;
+  int output_w = (input_w - kernel_w) / stride_w + 1;
+
+  // xcorr or conv
+  int koffset = swapkernel ? kernel_w*kernel_h-1 : 0;
+
+  // nb outputs
+  // int output_n = kernel_n / fanin;
+
+  // generate offsets according to block/thread ids
+  int xx_start = threadIdx.x;
+  int xx_end = output_w;
+  int xx_step = blockDim.x;
+
+  int yy_start = blockDim.y*blockIdx.y + threadIdx.y;
+  int yy_end = output_h;
+  int yy_step = blockDim.y*gridDim.y;
+
+  int oo_start = blockIdx.x;
+  int oo_end = oo_start+1;
+
+  int table_start = blockIdx.x * (fanin * 2);
+  int table_end = table_start + (fanin * 2);
+
+  // nb threads, unique thread id
+  int tid = blockDim.x*blockDim.y*threadIdx.z
+    + blockDim.x * threadIdx.y + threadIdx.x;
+  int nthreads = blockDim.x * blockDim.y * blockDim.z;
+
+  // iterators
+  int oo, ii, xx, yy, kx, ky, kk;
+
+  // do the kernels fit in shared mem ?
+  if (kernel_w*kernel_h*kernel_n <= CUDA_SHARED_MEM_SIZE) {
+    // put the kernel in shared memory
+    __shared__ float shared_kernel[CUDA_SHARED_MEM_SIZE];
+
+    // first thread of each block does the copy
+    for (kk = tid; kk < kernel_w*kernel_h*kernel_n; kk += nthreads) {
+      shared_kernel[kk] = kernel[kk];
+    }
+    __syncthreads();
+
+    // templated kernel size
+    if ((T_kernel_w > 0) && (T_kernel_h > 0)) {
+      // unrolled convolution loop
+      for(oo = oo_start; oo < oo_end; oo++) {
+        for (ii = table_start; ii < table_end; ii = ii + 2) {
+          for(yy = yy_start; yy < yy_end; yy+=yy_step) {
+            for(xx = xx_start; xx < xx_end; xx+=xx_step) {
+              // Dot product in two dimensions... (between input image and the mask)
+              float *input_p = input + ((long)table[ii]-1)*input_h*input_w
+                + yy*stride_h*input_w + xx*stride_w;
+              float *output_p = output + oo*output_h*output_w + yy*output_w + xx;
+              float *kernel_p = shared_kernel
+                + ((long)table[ii + 1]-1) *kernel_w*kernel_h + koffset;
+              float sum = 0;
+              if (swapkernel) {
+#pragma unroll
+                for(ky = 0; ky < T_kernel_h; ky++) {
+#pragma unroll
+                  for(kx = 0; kx < T_kernel_w; kx++) {
+                    sum += input_p[kx]*(*kernel_p--);
+                  }
+                  input_p += input_w;
+                }
+              } else {
+#pragma unroll
+                for(ky = 0; ky < T_kernel_h; ky++) {
+#pragma unroll
+                  for(kx = 0; kx < T_kernel_w; kx++) {
+                    sum += input_p[kx]*(*kernel_p++);
+                  }
+                  input_p += input_w;
+                }
+              }
+              *output_p += sum;
+            }
+          }
+        }
+      }
+    } else {
+      // default convolution loop
+      for(oo = oo_start; oo < oo_end; oo++) {
+        for (ii = table_start; ii < table_end; ii = ii + 2) {
+          for(yy = yy_start; yy < yy_end; yy+=yy_step) {
+            for(xx = xx_start; xx < xx_end; xx+=xx_step) {
+              // Dot product in two dims (between input image and the mask)
+              float *input_p = input + ((long)table[ii]-1)*input_h*input_w
+                + yy*stride_h*input_w + xx*stride_w;
+              float *output_p = output + oo*output_h*output_w + yy*output_w
+                + xx;
+              float *kernel_p = shared_kernel
+                + ((long)table[ii + 1]-1) *kernel_w*kernel_h + koffset;
+              float sum = 0;
+              if (swapkernel) {
+                for(ky = 0; ky < kernel_h; ky++) {
+#pragma unroll 5
+                  for(kx = 0; kx < kernel_w; kx++) {
+                    sum += input_p[kx]*(*kernel_p--);
+                  }
+                  input_p += input_w;
+                }
+              } else {
+                for(ky = 0; ky < kernel_h; ky++) {
+#pragma unroll 5
+                  for(kx = 0; kx < kernel_w; kx++) {
+                    sum += input_p[kx]*(*kernel_p++);
+                  }
+                  input_p += input_w;
+                }
+              }
+              *output_p += sum;
+            }
+          }
+        }
+      }
+    }
+
+  } else { // not enough shared mem for kernels, simply stream them
+
+    // convolution loop
+    for(oo = oo_start; oo < oo_end; oo++) {
+      for (ii = table_start; ii < table_end; ii = ii + 2) {
+        for(yy = yy_start; yy < yy_end; yy+=yy_step) {
+          for(xx = xx_start; xx < xx_end; xx+=xx_step) {
+            // Dot product in two dimensions... (between input image and the mask)
+            float *input_p = input + ((long)table[ii]-1)*input_h*input_w
+              + yy*stride_h*input_w + xx*stride_w;
+            float *output_p = output + oo*output_h*output_w + yy*output_w + xx;
+            float *kernel_p = kernel + ((long)table[ii + 1]-1) *kernel_w*kernel_h + koffset;
+            float sum = 0;
+            if (swapkernel) {
+              for(ky = 0; ky < kernel_h; ky++) {
+#pragma unroll 5
+                for(kx = 0; kx < kernel_w; kx++) {
+                  sum += input_p[kx]*(*kernel_p--);
+                }
+                input_p += input_w;
+              }
+            } else {
+              for(ky = 0; ky < kernel_h; ky++) {
+#pragma unroll 5
+                for(kx = 0; kx < kernel_w; kx++) {
+                  sum += input_p[kx]*(*kernel_p++);
+                }
+                input_p += input_w;
+              }
+            }
+            *output_p += sum;
+          }
+        }
+      }
+    }
+  }
+}
+
+
+/*
+ * convolution with a connection table (conv2Dmap)
+ * 3D input, 3D kernel, 3D output
+ */
+THC_API void THCudaTensor_conv2Dmap(THCState *state, THCudaTensor *output, THCudaTensor *input,
+                                    THCudaTensor *kernel, long stride_x, long stride_y,
+                                    THCudaTensor *table, long fanin)
+{
+  THAssert(THCudaTensor_checkGPU(state, 4, output, input, kernel, table));
+  long nInputPlane, nInputRows, nInputCols;
+  long nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputRows, nOutputCols;
+
+  THArgCheck(kernel->nDimension == 3 , 4, "kernel: 3D Tensor expected");
+  THArgCheck(stride_x >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(stride_y >= 1, 6, "Stride should be a positive integer");
+
+  input = THCudaTensor_newContiguous(state, input);
+  kernel = THCudaTensor_newContiguous(state, kernel);
+  table = THCudaTensor_newContiguous(state, table);
+
+  nInputPlane = input->size[0];
+  nInputRows  = input->size[1];
+  nInputCols  = input->size[2];
+
+  nKernelRows  = kernel->size[1];
+  nKernelCols  = kernel->size[2];
+  nOutputPlane = kernel->size[0] / fanin;
+  // THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes");
+
+  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols), 2,
+              "conv2Dmap : Input image is smaller than kernel");
+
+  // output dims
+  nOutputRows = (nInputRows - nKernelRows) / stride_y + 1;
+  nOutputCols = (nInputCols - nKernelCols) / stride_x + 1;
+
+  // ptrdiff_t nelem = THCudaTensor_nElement(state, output);
+  THCudaTensor_resize3d(state, output, nOutputPlane, nOutputRows, nOutputCols);
+
+  float *input_data = THCudaTensor_data(state, input);
+  float *kernel_data = THCudaTensor_data(state, kernel);
+  float *output_data = THCudaTensor_data(state, output);
+  float *table_data = THCudaTensor_data(state, table);
+
+  // set the number of blocks and threads
+  int nthreads_x = 32;
+  int nthreads_y = 8;
+  int block_height = (int)(16L / nOutputPlane);
+  if (block_height < 1)
+    block_height = 1;
+  dim3 blocks(nOutputPlane,block_height);
+  dim3 threads(nthreads_x,nthreads_y);
+
+#define GENERIC_MAP_KERNEL(dim)                                         \
+  conv2mapgeneric <false, (dim), (dim)> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( \
+      input_data, kernel_data, output_data, nInputPlane, nInputRows,    \
+      nInputCols, nOutputPlane*fanin, nKernelRows, nKernelCols,         \
+      stride_x, stride_y, table_data, fanin);
+
+  FOR_KERNEL_SPECIALIZED_DIMENSION(nKernelCols, nKernelRows, GENERIC_MAP_KERNEL);
+#undef GENERIC_MAP_KERNEL
+  // clean
+  THCudaTensor_free(state, input);
+  THCudaTensor_free(state, kernel);
+  THCudaTensor_free(state, table);
+
+  // check for errors
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("error in conv2Dmap: %s\n", cudaGetErrorString(err));
+    THError("aborting");
+  }
+}
+
+#undef FOR_KERNEL_SPECIALIZED_DIMENSION
diff --git a/lib/THC/THCTensorConv.h b/lib/THC/THCTensorConv.h
new file mode 100644
index 0000000..0a7d0a6
--- /dev/null
+++ b/lib/THC/THCTensorConv.h
@@ -0,0 +1,26 @@
+#ifndef TH_CUDA_TENSOR_CONV_INC
+#define TH_CUDA_TENSOR_CONV_INC
+
+#include "THCTensor.h"
+
+struct THCState;
+
+THC_API void THCudaTensor_conv2Dmv(struct THCState *state, THCudaTensor *output,
+                                   float beta, THCudaTensor *input, THCudaTensor *kernel,
+                                   long srow, long scol, const char *type);
+THC_API void THCudaTensor_conv2Dmm(struct THCState *state, THCudaTensor *output,
+                                   float beta, THCudaTensor *input, THCudaTensor *kernel,
+                                   long srow, long scol, const char *type);
+
+THC_API void THCudaTensor_conv2DRevger(struct THCState *state, THCudaTensor *output,
+                                       float beta, float alpha, THCudaTensor *input,
+                                       THCudaTensor *kernel, long srow, long scol);
+THC_API void THCudaTensor_conv2DRevgerm(struct THCState *state, THCudaTensor *output,
+                                        float beta, float alpha, THCudaTensor *input,
+                                        THCudaTensor *kernel, long srow, long scol);
+
+THC_API void THCudaTensor_conv2Dmap(struct THCState *state, THCudaTensor *output,
+                                    THCudaTensor *input, THCudaTensor *kernel,
+                                    long stride_x, long stride_y, THCudaTensor *table, long fanin);
+
+#endif
diff --git a/lib/THC/THCTensorCopy.c b/lib/THC/THCTensorCopy.c
new file mode 100644
index 0000000..920e785
--- /dev/null
+++ b/lib/THC/THCTensorCopy.c
@@ -0,0 +1,5 @@
+#include "THCTensorCopy.h"
+#include "THCCachingHostAllocator.h"
+
+#include "generic/THCTensorCopy.c"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu
new file mode 100644
index 0000000..e6b3567
--- /dev/null
+++ b/lib/THC/THCTensorCopy.cu
@@ -0,0 +1,208 @@
+#include "THCApply.cuh"
+#include "THCHalf.h"
+#include "THCNumerics.cuh"
+
+inline int curGPU() {
+  int curDev;
+  THCudaCheck(cudaGetDevice(&curDev));
+  return curDev;
+}
+
+// Copy operator for the pointwise apply kernel
+template <typename TypeDst, typename TypeSrc>
+struct CopyOp {
+  __device__ __forceinline__ void operator()(TypeDst* dst, TypeSrc* src) {
+#if __CUDA_ARCH__ >= 350
+    *dst = ScalarConvert<TypeSrc, TypeDst>::to(__ldg(src));
+#else
+    *dst = ScalarConvert<TypeSrc, TypeDst>::to(*src);
+#endif
+  }
+};
+
+// Copy for the same type to the same type
+template <typename TensorTypeDst, typename TensorTypeSrc>
+void
+THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) {
+  ptrdiff_t totalElements = TensorUtils<TensorTypeDst>::getNumElements(state, dst);
+
+  THArgCheck(totalElements ==
+             TensorUtils<TensorTypeSrc>::getNumElements(state, src),
+             2, "sizes do not match");
+
+  if (TensorUtils<TensorTypeDst>::getDims(state, dst) == 0) {
+    // Zero-dim tensor; copy nothing
+    return;
+  }
+
+  // We can memcpy the memory if:
+  // - both tensors are contiguous; or,
+  // - there is only one element to copy; or,
+  // - FIXME: if both tensors have matching size and stride arrays, and no
+  //   holes within (in other words, there is some permutation that can be
+  //   applied to the size/strides such that the resulting tensor is
+  //   contiguous).
+  // - AND: both tensors have the same type.
+  bool sameType = isSameType<TensorTypeSrc, TensorTypeDst>();
+  bool srcContig = TensorUtils<TensorTypeSrc>::isContiguous(state, src);
+  bool dstContig = TensorUtils<TensorTypeDst>::isContiguous(state, dst);
+  bool memcpyEligible =
+    ((srcContig && dstContig) || (totalElements == 1)) && sameType;
+
+  int srcDev = TensorUtils<TensorTypeSrc>::getDevice(state, src);
+  int dstDev = TensorUtils<TensorTypeDst>::getDevice(state, dst);
+  int oldDev = curGPU();
+
+  // Try to enable p2p access. This also handles the case srcDev == dstDev.
+  bool p2pEnabled = THCState_getPeerToPeerAccess(state, srcDev, dstDev);
+
+  // We always perform the copy on the source device, using the
+  // current stream on the source device.
+  // If the copy is on the default stream, then we fully synchronize
+  // both src and dst's default streams for completion of the
+  // copy. We have to explicitly do this for non-contig copies.
+  // This mimics the behavior of cross-device cudaMemcpyAsync on
+  // the default stream.
+  // If the copy is not on the default stream, then it is up to the
+  // user to add needed synchronization on the dst device, since the
+  // stream on the dst device that wishes to synchronize may not be
+  // the same index as the one on the src device.
+  cudaStream_t copyStream = THCState_getCurrentStreamOnDevice(state, srcDev);
+  if (srcDev != dstDev && copyStream == NULL) {
+    // This is a cross-device copy on the default stream. We perform a
+    // two-way barrier between both devices' default streams before
+    // the copy. This ensures that any write-after-write and
+    // write-after-read dependencies on the destination side are
+    // handled, so that no one is operating on the dst memory when
+    // we perform the copy.
+    // src waits on dst barrier (src already waits on src)
+    cudaEvent_t dstReady;
+    THCudaCheck(cudaSetDevice(dstDev));
+    THCudaCheck(cudaEventCreateWithFlags(&dstReady, cudaEventDisableTiming));
+    THCudaCheck(cudaEventRecord(dstReady, NULL));
+
+    THCudaCheck(cudaSetDevice(srcDev));
+    THCudaCheck(cudaStreamWaitEvent(NULL, dstReady, 0));
+    THCudaCheck(cudaEventDestroy(dstReady));
+  } else if (srcDev != oldDev) {
+    THCudaCheck(cudaSetDevice(srcDev));
+  }
+
+  // We are now on srcDev
+  if (memcpyEligible) {
+    // Perform the copy
+    THCudaCheck(cudaMemcpyAsync(
+                  TensorUtils<TensorTypeDst>::getData(state, dst),
+                  TensorUtils<TensorTypeSrc>::getData(state, src),
+                  totalElements *
+                  sizeof(typename TensorUtils<TensorTypeDst>::DataType),
+                  cudaMemcpyDeviceToDevice,
+                  copyStream));
+  } else {
+    // Non-contiguous copy or a type-conversion copy
+
+    // We avoid creating temporary memory copies if possible.
+    // If both src and dst are on the same device, or if they are on
+    // different devices and p2p access is enabled, perform the copy
+    // by a pointwise copy kernel.
+    // Otherwise, we'll have to make contiguous (which will in fact
+    // invoke copy() again), and then perform the copy.
+    // FIXME: might want to consider only running the pointwise kernel
+    // if both src and dst innermost dimensions are contiguous. If
+    // they are not, then taking the hit of the memory allocation/free
+    // might be worth it to avoid non-coalesced reads or writes.
+    if (p2pEnabled) {
+      bool succ =
+        THC_pointwiseApply2(
+          state, dst, src,
+          CopyOp<typename TensorUtils<TensorTypeDst>::DataType,
+                 typename TensorUtils<TensorTypeSrc>::DataType>());
+
+      THArgCheck(succ, 2, CUTORCH_DIM_WARNING);
+    } else {
+      // GPUs can't access each other directly, and the tensors
+      // involved are non-contiguous and/or of different types.
+
+      // Make sure the src is contiguous and of the same type as dst
+      THCudaCheck(cudaSetDevice(srcDev));
+      TensorTypeDst* srcContig = NULL;
+
+      if (sameType) {
+        srcContig =
+          (TensorTypeDst*) // this is actually the same type as src
+          TensorUtils<TensorTypeSrc>::newContiguous(state, src);
+
+      } else {
+        // Types are different
+        // Copy into the new format, contiguous, on the source device
+        srcContig = TensorUtils<TensorTypeDst>::newTensor(state);
+        TensorUtils<TensorTypeDst>::resizeAs(state, srcContig, dst);
+
+        bool succ =
+          THC_pointwiseApply2(
+            state, srcContig, src,
+            CopyOp<typename TensorUtils<TensorTypeDst>::DataType,
+                   typename TensorUtils<TensorTypeSrc>::DataType>());
+
+        THArgCheck(succ, 2, CUTORCH_DIM_WARNING);
+      }
+
+      // Make sure the dst is contiguous
+      THCudaCheck(cudaSetDevice(dstDev));
+      TensorTypeDst* dstContig =
+        TensorUtils<TensorTypeDst>::newContiguous(state, dst);
+
+      // Now, we are ready for a cross-device memcpy of contiguous
+      // data, of the same layout and type
+      THCudaCheck(cudaSetDevice(srcDev));
+
+      THCudaCheck(cudaMemcpyAsync(
+                    TensorUtils<TensorTypeDst>::getData(state, dstContig),
+                    TensorUtils<TensorTypeDst>::getData(state, srcContig),
+                    totalElements *
+                    sizeof(typename TensorUtils<TensorTypeDst>::DataType),
+                    cudaMemcpyDeviceToDevice,
+                    copyStream));
+
+      // We are done with the src
+      TensorUtils<TensorTypeDst>::free(state, srcContig);
+
+      if (dst != dstContig) {
+        TensorUtils<TensorTypeDst>::freeCopyTo(state, dstContig, dst);
+      } else {
+        TensorUtils<TensorTypeDst>::free(state, dstContig);
+      }
+
+      // We're still on srcDev at this point
+    }
+  }
+
+  if (srcDev != dstDev && copyStream == NULL) {
+    // dst waits on src barrier (dst already waits on dst). We cannot
+    // operate on dst's copy until the copy is complete.
+
+    // Still on srcDev, record default stream event
+    cudaEvent_t srcReady;
+    THCudaCheck(cudaEventCreateWithFlags(&srcReady, cudaEventDisableTiming));
+    THCudaCheck(cudaEventRecord(srcReady, NULL));
+
+    THCudaCheck(cudaSetDevice(dstDev));
+    THCudaCheck(cudaStreamWaitEvent(NULL, srcReady, 0));
+    THCudaCheck(cudaEventDestroy(srcReady));
+
+    // We are now on dstDev (right above). Restore prior device from dst
+    if (dstDev != oldDev) {
+      THCudaCheck(cudaSetDevice(oldDev));
+    }
+  } else {
+    // We are still on srcDev. Restore prior device from src
+    if (srcDev != oldDev) {
+      THCudaCheck(cudaSetDevice(oldDev));
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+#include "generic/THCTensorCopy.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorCopy.h b/lib/THC/THCTensorCopy.h
new file mode 100644
index 0000000..e8bc4f4
--- /dev/null
+++ b/lib/THC/THCTensorCopy.h
@@ -0,0 +1,11 @@
+#ifndef TH_CUDA_TENSOR_COPY_INC
+#define TH_CUDA_TENSOR_COPY_INC
+
+#include "THCTensor.h"
+#include "THCGeneral.h"
+#include "THCHalf.h"
+
+#include "generic/THCTensorCopy.h"
+#include "THCGenerateAllTypes.h"
+
+#endif
diff --git a/lib/THC/THCTensorIndex.cu b/lib/THC/THCTensorIndex.cu
new file mode 100644
index 0000000..1f216b3
--- /dev/null
+++ b/lib/THC/THCTensorIndex.cu
@@ -0,0 +1,336 @@
+#include "THC.h"
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCBlas.h"
+#include "THCTensorCopy.h"
+#include "THCTensorRandom.h"
+#include "THCHalf.h"
+#include "THCApply.cuh"
+#include "THCReduce.cuh"
+#include "THCDeviceUtils.cuh"
+#include "THCNumerics.cuh"
+#include "THCAtomics.cuh"
+#include <algorithm> // for std::min
+
+// We prefer this kernel to avoid reloading index points if the number
+// of indices is a small number.
+// This kernel in fact works for all choices of problem size, but if
+// the number of indices chosen is large, then the
+// indexCopyLargeIndex kernel is a better choice to increase
+// parallelism.
+template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
+__global__ void indexCopySmallIndex(TensorInfo<T, IndexType> dst,
+                                    TensorInfo<T, IndexType> src,
+                                    TensorInfo<long, IndexType> indices,
+                                    int dstCopyDim,
+                                    int srcCopyDim,
+                                    IndexType innerSize,
+                                    long dstCopyDimSize) {
+  // In order to avoid reloading the index that we are copying, load
+  // it once to handle all of the points that are being selected, so
+  // it can be reused as much as possible. This kernel is chosen when
+  // this is a good choice (small number of chosen indices), since
+  // re-accessing indices in addition to src elements can be slow.
+  for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) {
+    // Lua indices begin at 1
+    IndexType dstIndex =
+      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(srcIndex, indices)] - TH_INDEX_BASE;
+    assert(dstIndex < dstCopyDimSize);
+
+    // We stride over the output ignoring the indexed dimension
+    // (innerSize), whose offset calculation is handled differently
+    for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+         linearIndex < innerSize;
+         linearIndex += gridDim.x * blockDim.x) {
+      IndexType dstOffset =
+        IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);
+
+      dstOffset += dstIndex * dst.strides[dstCopyDim];
+
+      IndexType srcOffset =
+        IndexToOffset<T, IndexType, SrcDim>::get(linearIndex, src);
+      srcOffset += srcIndex * src.strides[srcCopyDim];
+
+      dst.data[dstOffset] = src.data[srcOffset];
+    }
+  }
+}
+
+// We prefer this kernel to balance parallelism across index points,
+// if there are a large number of indices.
+// This kernel in fact works for all choices of problem size, but if
+// the number of indices chosen is small, then the
+// indexCopySmallIndex kernel is a better choice to reduce memory
+// accesses.
+template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
+__global__ void indexCopyLargeIndex(TensorInfo<T, IndexType> dst,
+                                    TensorInfo<T, IndexType> src,
+                                    TensorInfo<long, IndexType> indices,
+                                    int dstCopyDim,
+                                    int srcCopyDim,
+                                    IndexType innerSize,
+                                    long dstCopyDimSize) {
+  // We stride over the output including the indexed dimension
+  // (innerSize * indices.sizes[0]), and calculate the destination index
+  // point based on that
+  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+       linearIndex < innerSize * indices.sizes[0];
+       linearIndex += gridDim.x * blockDim.x) {
+    IndexType srcIndex = linearIndex / innerSize;
+    IndexType elementInSlice = linearIndex % innerSize;
+
+    // Lua indices begin at 1
+    IndexType dstIndex =
+      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(srcIndex, indices)] - TH_INDEX_BASE;
+    assert(dstIndex < dstCopyDimSize);
+
+    IndexType dstOffset =
+      IndexToOffset<T, IndexType, DstDim>::get(elementInSlice, dst);
+    dstOffset += dstIndex * dst.strides[dstCopyDim];
+
+    IndexType srcOffset =
+      IndexToOffset<T, IndexType, SrcDim>::get(elementInSlice, src);
+    srcOffset += srcIndex * src.strides[srcCopyDim];
+
+    dst.data[dstOffset] = src.data[srcOffset];
+  }
+}
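+
+// Worked example (illustrative, not from the original source): take a
+// contiguous dst of size 10x4, src of size 2x4, dstCopyDim == srcCopyDim == 0,
+// innerSize == 4 and Lua indices {4, 8}. In indexCopyLargeIndex,
+// linearIndex == 5 gives srcIndex == 1 and elementInSlice == 1; the second
+// index (Lua 8, zero-based 7) then yields dst[7][1] = src[1][1], assuming the
+// host code builds the TensorInfo structs so that IndexToOffset returns the
+// within-slice offset. The Add/Fill/Select kernels below use the same
+// arithmetic.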
+
+// We prefer this kernel to avoid reloading index points if the number
+// of indices is a small number.
+// This kernel in fact works for all choices of problem size, but if
+// the number of indices chosen is large, then the
+// indexAddLargeIndex kernel is a better choice to increase
+// parallelism.
+template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
+__global__ void indexAddSmallIndex(TensorInfo<T, IndexType> dst,
+                                   TensorInfo<T, IndexType> src,
+                                   TensorInfo<long, IndexType> indices,
+                                   int dstAddDim,
+                                   int srcAddDim,
+                                   IndexType innerSize,
+                                   long dstAddDimSize) {
+  // In order to avoid reloading the index that we are copying, load
+  // it once to handle all of the points that are being selected, so
+  // it can be reused as much as possible. This kernel is chosen when
+  // this is a good choice (small number of chosen indices), since
+  // re-accessing indices in addition to src elements can be slow.
+  for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) {
+    // Lua indices begin at 1
+    IndexType dstIndex =
+      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(srcIndex, indices)] - TH_INDEX_BASE;
+    assert(dstIndex < dstAddDimSize);
+
+    // We stride over the output ignoring the indexed dimension
+    // (innerSize), whose offset calculation is handled differently
+    for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+         linearIndex < innerSize;
+         linearIndex += gridDim.x * blockDim.x) {
+      IndexType dstOffset =
+        IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);
+      dstOffset += dstIndex * dst.strides[dstAddDim];
+
+      IndexType srcOffset =
+        IndexToOffset<T, IndexType, SrcDim>::get(linearIndex, src);
+      srcOffset += srcIndex * src.strides[srcAddDim];
+
+      atomicAdd(&dst.data[dstOffset], src.data[srcOffset]);
+    }
+  }
+}
+
+// We prefer this kernel to balance parallelism across index points,
+// if there are a large number of indices.
+// This kernel in fact works for all choices of problem size, but if
+// the number of indices chosen is small, then the
+// indexAddSmallIndex kernel is a better choice to reduce memory
+// accesses.
+template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
+__global__ void indexAddLargeIndex(TensorInfo<T, IndexType> dst,
+                                   TensorInfo<T, IndexType> src,
+                                   TensorInfo<long, IndexType> indices,
+                                   int dstAddDim,
+                                   int srcAddDim,
+                                   IndexType innerSize,
+                                   long dstAddDimSize) {
+  // We stride over the output including the indexed dimension
+  // (innerSize * indices.sizes[0]), and calculate the destination index
+  // point based on that
+  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+       linearIndex < innerSize * indices.sizes[0];
+       linearIndex += gridDim.x * blockDim.x) {
+    IndexType srcIndex = linearIndex / innerSize;
+    IndexType elementInSlice = linearIndex % innerSize;
+
+    // Lua indices begin at 1
+    IndexType dstIndex =
+      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(srcIndex, indices)] - TH_INDEX_BASE;
+    assert(dstIndex < dstAddDimSize);
+
+    IndexType dstOffset =
+      IndexToOffset<T, IndexType, DstDim>::get(elementInSlice, dst);
+    dstOffset += dstIndex * dst.strides[dstAddDim];
+
+    IndexType srcOffset =
+      IndexToOffset<T, IndexType, SrcDim>::get(elementInSlice, src);
+    srcOffset += srcIndex * src.strides[srcAddDim];
+
+    atomicAdd(&dst.data[dstOffset], src.data[srcOffset]);
+  }
+}
+
+// We prefer this kernel to avoid reloading index points if the number
+// of indices is a small number.
+// This kernel in fact works for all choices of problem size, but if
+// the number of indices chosen is large, then the
+// indexFillLargeIndex kernel is a better choice to increase
+// parallelism.
+template <typename T, typename IndexType, int DstDim, int IdxDim>
+__global__ void indexFillSmallIndex(TensorInfo<T, IndexType> dst,
+                                    TensorInfo<long, IndexType> indices,
+                                    int dstFillDim,
+                                    IndexType innerSize,
+                                    long dstFillDimSize,
+                                    T val) {
+  // In order to avoid reloading the index that we are copying, load
+  // it once to handle all of the points that are being selected, so
+  // it can be reused as much as possible. This kernel is chosen when
+  // this is a good choice (small number of chosen indices), since
+  // re-accessing indices in addition to src elements can be slow.
+  for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) {
+    // Lua indices begin at 1
+    IndexType dstIndex_ =
+      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(dstIndex, indices)] - TH_INDEX_BASE;
+    assert(dstIndex_ < dstFillDimSize);
+
+    // We stride over the output ignoring the indexed dimension
+    // (innerSize), whose offset calculation is handled differently
+    for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+         linearIndex < innerSize;
+         linearIndex += gridDim.x * blockDim.x) {
+      IndexType dstOffset =
+          IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);
+      dstOffset += dstIndex_ * dst.strides[dstFillDim];
+
+      dst.data[dstOffset] = val;
+    }
+  }
+}
+
+// We prefer this kernel to balance parallelism across index points,
+// if there are a large number of indices.
+// This kernel in fact works for all choices of problem size, but if
+// the number of indices chosen is small, then the
+// indexFillSmallIndex kernel is a better choice to reduce memory
+// accesses.
+template <typename T, typename IndexType, int DstDim, int IdxDim>
+__global__ void indexFillLargeIndex(TensorInfo<T, IndexType> dst,
+                                    TensorInfo<long, IndexType> indices,
+                                    int dstFillDim,
+                                    IndexType innerSize,
+                                    long dstFillDimSize,
+                                    T val) {
+  // We stride over the output including the indexed dimension
+  // (innerSize * indices.sizes[0]), and calculate the destination index
+  // point based on that
+  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+       linearIndex < innerSize * indices.sizes[0];
+       linearIndex += gridDim.x * blockDim.x) {
+    IndexType dstIndex = linearIndex / innerSize;
+    IndexType elementInSlice = linearIndex % innerSize;
+
+    // Lua indices begin at 1
+    IndexType dstIndex_ =
+      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(dstIndex, indices)] - TH_INDEX_BASE;
+    assert(dstIndex_ < dstFillDimSize);
+
+    IndexType dstOffset =
+      IndexToOffset<T, IndexType, DstDim>::get(elementInSlice, dst);
+    dstOffset += dstIndex_ * dst.strides[dstFillDim];
+
+    dst.data[dstOffset] = val;
+  }
+}
+
+// We prefer this kernel to avoid reloading index points if the number
+// of indices is a small number.
+// This kernel in fact works for all choices of problem size, but if
+// the number of indices chosen is large, then the
+// indexSelectLargeIndex kernel is a better choice to increase
+// parallelism.
+template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
+__global__ void indexSelectSmallIndex(TensorInfo<T, IndexType> dst,
+                                      TensorInfo<T, IndexType> src,
+                                      TensorInfo<long, IndexType> indices,
+                                      int dstSelectDim,
+                                      int srcSelectDim,
+                                      IndexType innerSize,
+                                      long srcSelectDimSize) {
+  // In order to avoid reloading the index that we are copying, load
+  // it once to handle all of the points that are being selected, so
+  // it can be reused as much as possible. This kernel is chosen when
+  // this is a good choice (small number of chosen indices), since
+  // re-accessing indices in addition to src elements can be slow.
+  for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) {
+    // Lua indices begin at 1
+    IndexType srcIndex =
+      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(dstIndex, indices)] - TH_INDEX_BASE;
+    assert(srcIndex < srcSelectDimSize);
+
+    // We stride over the output ignoring the indexed dimension
+    // (innerSize), whose offset calculation is handled differently
+    for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+         linearIndex < innerSize;
+         linearIndex += gridDim.x * blockDim.x) {
+      IndexType dstOffset =
+        IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);
+      dstOffset += dstIndex * dst.strides[dstSelectDim];
+
+      IndexType srcOffset =
+        IndexToOffset<T, IndexType, SrcDim>::get(linearIndex, src);
+      srcOffset += srcIndex * src.strides[srcSelectDim];
+
+      dst.data[dstOffset] = src.data[srcOffset];
+    }
+  }
+}
+
+// We prefer this kernel to balance parallelism across index points,
+// if there are a large number of indices.
+// This kernel in fact works for all choices of problem size, but if
+// the number of indices chosen is small, then the
+// indexSelectSmallIndex kernel is a better choice to reduce memory
+// accesses.
+template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
+__global__ void indexSelectLargeIndex(TensorInfo<T, IndexType> dst,
+                                      TensorInfo<T, IndexType> src,
+                                      TensorInfo<long, IndexType> indices,
+                                      int dstSelectDim,
+                                      int srcSelectDim,
+                                      IndexType totalSize,
+                                      IndexType innerSize,
+                                      long srcSelectDimSize) {
+  // We stride over the output including the indexed dimension
+  // (totalSize), and calculate the destination index point based on that
+  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+       linearIndex < totalSize;
+       linearIndex += gridDim.x * blockDim.x) {
+    IndexType dstIndex = linearIndex / innerSize;
+    IndexType elementInSlice = linearIndex % innerSize;
+
+    // Lua indices begin at 1
+    IndexType srcIndex =
+      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(dstIndex, indices)] - TH_INDEX_BASE;
+    assert(srcIndex < srcSelectDimSize);
+
+    IndexType dstOffset =
+      IndexToOffset<T, IndexType, DstDim>::get(elementInSlice, dst);
+    dstOffset += dstIndex * dst.strides[dstSelectDim];
+
+    IndexType srcOffset =
+      IndexToOffset<T, IndexType, SrcDim>::get(elementInSlice, src);
+    srcOffset += srcIndex * src.strides[srcSelectDim];
+
+    dst.data[dstOffset] = src.data[srcOffset];
+  }
+}
+
+#include "generic/THCTensorIndex.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorInfo.cuh b/lib/THC/THCTensorInfo.cuh
new file mode 100644
index 0000000..3389e61
--- /dev/null
+++ b/lib/THC/THCTensorInfo.cuh
@@ -0,0 +1,280 @@
+#ifndef THC_TENSOR_INFO_INC
+#define THC_TENSOR_INFO_INC
+
+#include <cuda.h>
+#include <assert.h>
+#include "THCGeneral.h"
+#include "THCTensor.h"
+
+// Maximum number of dimensions allowed for cutorch
+#define MAX_CUTORCH_DIMS 25
+
+// Warning string for tensor arguments that are too large or have too
+// many dimensions
+#define CUTORCH_STR(X) #X
+#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
+  CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"
+
+// CUDA kernel argument that defines tensor layout
+template <typename T, typename IndexType>
+struct TensorInfo {
+  TensorInfo(T* p,
+             int dim,
+             IndexType sz[MAX_CUTORCH_DIMS],
+             IndexType st[MAX_CUTORCH_DIMS]);
+
+  // Set the size of the given dimension to 1, as if it were a
+  // reduction dim (allows you to calculate offsets of the reduction
+  // slice)
+  void reduceDim(int dim);
+
+  // Collapses all runs of successive dimensions if the size/strides
+  // match up within the run and there are no holes between the
+  // dimensions.
+  // If excludeDim is set (not -1), then excludeDim will not be
+  // collapsed with any other dimension.
+  // Function returns the new dimension index that excludeDim maps to,
+  // since the collapsed dimensions are <= the input dimensions.
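+  // Illustrative example (added for clarity): sizes {5, 1, 3} with strides
+  // {3, 3, 1} collapse to sizes {15}, strides {1}; with excludeDim == 1
+  // nothing may be merged across dim 1, so the layout stays
+  // {5, 1, 3} / {3, 3, 1} and the function returns 1.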
+  int collapseDims(int excludeDim = -1);
+
+  // Contiguous tensors of more than one dimension are collapsed down
+  // to one dimension
+  __host__ __device__ inline bool isContiguous() const {
+    return (dims == 1 && strides[0] == 1);
+  }
+
+  T* data;
+  IndexType sizes[MAX_CUTORCH_DIMS];
+  IndexType strides[MAX_CUTORCH_DIMS];
+  int dims;
+};
+
+template <typename T, typename IndexType>
+TensorInfo<T, IndexType>::TensorInfo(T* p,
+                                     int dim,
+                                     IndexType sz[MAX_CUTORCH_DIMS],
+                                     IndexType st[MAX_CUTORCH_DIMS]) {
+  data = p;
+  dims = dim;
+  assert(dims > 0 && dims <= MAX_CUTORCH_DIMS);
+
+  for (int i = 0; i < dim; ++i) {
+    sizes[i] = sz[i];
+    strides[i] = st[i];
+  }
+}
+
+template <typename T, typename IndexType>
+void
+TensorInfo<T, IndexType>::reduceDim(int dim) {
+  assert(dim < dims && dim >= 0);
+  sizes[dim] = 1;
+}
+
+template <typename T, typename IndexType>
+int
+TensorInfo<T, IndexType>::collapseDims(int excludeDim) {
+  // Find the innermost dimension not of size 1, since dimensions of size 1 are
+  // collapsible.
+  int firstNonOneDim = -1;
+
+  for (int i = dims - 1; i >= 0; --i) {
+    if (i == excludeDim) {
+      // We cannot collapse this dimension, even if it is size 1
+      firstNonOneDim = i;
+      break;
+    }
+
+    if (sizes[i] != 1) {
+      firstNonOneDim = i;
+      break;
+    }
+  }
+
+  // Special case: if all dimensions are of size 1, then this is a
+  // single-point tensor that we still have to operate on. Reduce to a
+  // single point.
+  if (firstNonOneDim == -1) {
+    assert(excludeDim == -1);
+
+    dims = 1;
+    sizes[0] = 1;
+    strides[0] = 1;
+
+    // Everything effectively got collapsed into this dimension
+    return 0;
+  }
+
+  // Count the number of successive dimensions that can be collapsed, from
+  // innermost to outermost.
+  int numCollapsed = 0;
+
+  // Skip the innermost size-1 dims (everything after firstNonOneDim)
+  numCollapsed += dims - 1 - firstNonOneDim;
+
+  // We perform one pass through to determine how many dimensions we
+  // can collapse, before calculating the actual size of the collapsed
+  // dimensions.
+  // size/strideInner are the size/strides of the previous inner
+  // non-collapsible dim we encounter.
+  long sizeInner = sizes[firstNonOneDim];
+  long strideInner = strides[firstNonOneDim];
+
+  for (int i = firstNonOneDim - 1; i >= 0; --i) {
+    long sizeOuter = sizes[i];
+    long strideOuter = strides[i];
+
+    // Don't collapse this dimension if we want to exclude it from
+    // collapsing.
+    // Since this code attempts to collapse the outer dimension (i)
+    // into the inner dimension (i + 1), we can only collapse when
+    // neither of the two dimensions is excludeDim
+    if ((excludeDim != i) && (excludeDim != i + 1)) {
+      // The next outermost dimension can be skipped if size 1
+      if (sizeOuter == 1) {
+        ++numCollapsed;
+        continue;
+      }
+
+      // If the next outermost dimension is contiguous with the
+      // previous non-collapsed one, collapse it
+      if (strideOuter == strideInner * sizeInner) {
+        ++numCollapsed;
+
+        // This is the run of collapsed dimensions' size
+        sizeInner = sizeInner * sizeOuter;
+        continue;
+      }
+    }
+
+    // Otherwise, this new outer dimension at `i` cannot be collapsed
+    // because it is excluded from collapsing, or it is not contiguous
+    // with the previous inner dimension.
+    sizeInner = sizeOuter;
+    strideInner = strideOuter;
+  }
+
+  // This will be our new size/stride and dimension.
+  IndexType newSizes[MAX_CUTORCH_DIMS];
+  IndexType newStrides[MAX_CUTORCH_DIMS];
+
+  assert(numCollapsed < dims);
+  int newDims = dims - numCollapsed;
+
+  // We return the index that the excluded dimension maps to after
+  // collapsing.
+  int returnDim = -1;
+
+  // We perform a second pass through the dimensions to actually
+  // calculate the size of the collapsed dimensions.
+  int collapsedIndex = dims - numCollapsed - 1;
+  newSizes[collapsedIndex] = sizes[firstNonOneDim];
+  newStrides[collapsedIndex] = strides[firstNonOneDim];
+
+  if (firstNonOneDim == excludeDim) {
+    returnDim = collapsedIndex;
+  }
+
+  for (int i = firstNonOneDim - 1; i >= 0; --i) {
+    IndexType sizeOuter = sizes[i];
+    IndexType strideOuter = strides[i];
+
+    if ((excludeDim != i) && (excludeDim != i + 1)) {
+      if (sizeOuter == 1) {
+        // skip
+        continue;
+      }
+
+      if (strideOuter == newSizes[collapsedIndex] * newStrides[collapsedIndex]) {
+        // collapse
+        newSizes[collapsedIndex] *= sizeOuter;
+        continue;
+      }
+    }
+
+    // Otherwise, strides don't match, or dim `i` is excluded from
+    // collapsing.
+    --collapsedIndex;
+    assert(collapsedIndex >= 0);
+    assert(collapsedIndex < newDims);
+    newSizes[collapsedIndex] = sizeOuter;
+    newStrides[collapsedIndex] = strideOuter;
+
+    if (excludeDim == i) {
+      returnDim = collapsedIndex;
+    }
+  }
+
+  // We must have filled all the dimensions we're looking for
+  assert(collapsedIndex == 0);
+  assert((excludeDim == -1) || (returnDim != -1));
+
+  dims = newDims;
+
+  for (int i = 0; i < dims; ++i) {
+    sizes[i] = newSizes[i];
+    strides[i] = newStrides[i];
+  }
+
+  // After collapsing, the original `excludeDim` may have been
+  // renumbered to this new `returnDim`, since some dimensions could
+  // have been collapsed.
+  return returnDim;
+}
+
+// Translate a linear index for the apply to a T* offset;
+// specialized on `Dims` to reduce nvcc compilation time
+template <typename T, typename IndexType, int Dims>
+struct IndexToOffset {
+  static __host__ __device__ IndexType get(
+    IndexType linearId,
+    const TensorInfo<T, IndexType>& info) {
+    IndexType offset = 0;
+
+    // Use static dims
+    for (int i = Dims - 1; i >= 0; --i) {
+      IndexType curDimIndex = linearId % info.sizes[i];
+      IndexType curDimOffset = curDimIndex * info.strides[i];
+      offset += curDimOffset;
+
+      if (i > 0) {
+        linearId /= info.sizes[i];
+      }
+    }
+
+    return offset;
+  }
+};
+
+// For contiguous tensors, the offset = index
+template <typename T, typename IndexType>
+struct IndexToOffset<T, IndexType, -2> {
+  static inline __host__ __device__ IndexType
+    get(IndexType linearId, const TensorInfo<T, IndexType>& info) {
+    return linearId;
+  }
+};
+
+template <typename T, typename IndexType>
+struct IndexToOffset<T, IndexType, -1> {
+  static inline __host__ __device__ IndexType get(
+    IndexType linearId,
+    const TensorInfo<T, IndexType>& info) {
+
+    IndexType offset = 0;
+
+    // Use dynamic dims
+    for (int i = info.dims - 1; i >= 0; --i) {
+      IndexType curDimIndex = linearId % info.sizes[i];
+      IndexType curDimOffset = curDimIndex * info.strides[i];
+      offset += curDimOffset;
+
+      linearId /= info.sizes[i];
+    }
+
+    return offset;
+  }
+};
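+
+// Worked example (illustrative, not in the original header): for sizes
+// {2, 3, 4} and strides {12, 4, 1}, linearId == 17 decomposes as
+// 17 % 4 == 1, (17 / 4) % 3 == 1 and 17 / 12 == 1, so
+// offset == 1*1 + 1*4 + 1*12 == 17. For a fully contiguous tensor the offset
+// equals linearId, which is exactly what the Dims == -2 specialization above
+// short-circuits.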
+
+#endif // THC_TENSOR_INFO_INC
diff --git a/lib/THC/THCTensorMasked.cuh b/lib/THC/THCTensorMasked.cuh
new file mode 100644
index 0000000..814e263
--- /dev/null
+++ b/lib/THC/THCTensorMasked.cuh
@@ -0,0 +1,58 @@
+#ifndef THC_TENSOR_MASKED_CUH
+#define THC_TENSOR_MASKED_CUH
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCTensorCopy.h"
+#include "THCApply.cuh"
+#include "THCReduce.cuh"
+#include "THCThrustAllocator.cuh"
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+#if CUDA_VERSION >= 7000
+#include <thrust/system/cuda/execution_policy.h>
+#endif
+
+template <typename T, typename MaskT>
+struct TensorMaskedFillOp {
+  TensorMaskedFillOp(T v) : value(v) {}
+  __device__ inline void operator()(T* t, MaskT* mask) {
+    if (*mask) {
+      *t = value;
+    }
+  }
+
+  T value;
+};
+
+template <typename T, typename MaskT, typename MaskPrefixSumT>
+struct TensorMaskedCopyOp {
+  TensorMaskedCopyOp(T* s) : in(s) {}
+
+  __device__ inline void operator()(T* out,
+                                    MaskT* mask,
+                                    MaskPrefixSumT* maskPrefixSum) {
+    if (*mask) {
+      *out = in[*maskPrefixSum];
+    }
+  }
+
+  // Where we are copying from
+  T* in;
+};
+
+template <typename T, typename MaskT, typename MaskPrefixSumT>
+struct TensorMaskedSelectOp {
+  TensorMaskedSelectOp(T* t) : out(t) {}
+  __device__ inline void operator()(MaskT* mask,
+                                    MaskPrefixSumT* maskPrefixSum,
+                                    T* in) {
+    if (*mask) {
+      out[*maskPrefixSum] = *in;
+    }
+  }
+
+  T* out;
+};
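+
+// Illustrative note (added): maskPrefixSum is assumed to hold the exclusive
+// prefix sum of the mask, computed on the host side (thrust/scan.h is
+// included above for that purpose). E.g. mask {0, 1, 1, 0, 1} gives
+// maskPrefixSum {0, 0, 1, 2, 2}, so maskedSelect writes the three selected
+// inputs to out[0], out[1], out[2], and maskedCopy reads in[0], in[1], in[2]
+// for the set mask positions.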
+
+#endif // THC_TENSOR_MASKED_CUH
diff --git a/lib/THC/THCTensorMath.cu b/lib/THC/THCTensorMath.cu
new file mode 100644
index 0000000..41e6466
--- /dev/null
+++ b/lib/THC/THCTensorMath.cu
@@ -0,0 +1,112 @@
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCTensorCopy.h"
+#include "THCApply.cuh"
+#include "THCNumerics.cuh"
+#include "THCTensorMath.cuh"
+#include "THCThrustAllocator.cuh"
+
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/functional.h>
+#include <thrust/sequence.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/transform.h>
+#if CUDA_VERSION >= 7000
+#include <thrust/system/cuda/execution_policy.h>
+#endif
+#include <cfloat>
+
+template <typename T>
+struct TensorFillOp {
+  TensorFillOp(T v) : val(v) {}
+  __device__ __forceinline__ void operator()(T* v) { *v = val; }
+
+  const T val;
+};
+
+// copypasta from https://github.com/thrust/thrust/blob/master/examples/strided_range.cu
+template <typename Iterator>
+class strided_range
+{
+ public:
+
+  typedef typename thrust::iterator_difference<Iterator>::type difference_type;
+
+  struct stride_functor : public thrust::unary_function<difference_type,
+                                                        difference_type>
+  {
+    difference_type stride;
+
+    stride_functor(difference_type stride)
+        : stride(stride) {}
+
+    __host__ __device__
+    difference_type operator()(const difference_type& i) const
+      {
+        return stride * i;
+      }
+  };
+
+  typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
+  typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
+  typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;
+
+  // type of the strided_range iterator
+  typedef PermutationIterator iterator;
+
+  // construct strided_range for the range [first,last)
+  strided_range(Iterator first, Iterator last, difference_type stride)
+      : first(first), last(last), stride(stride) {}
+
+  iterator begin(void) const
+    {
+      return PermutationIterator(first,
+                                 TransformIterator(CountingIterator(0),
+                                                   stride_functor(stride)));
+    }
+
+  iterator end(void) const
+    {
+      return begin() + ((last - first) + (stride - 1)) / stride;
+    }
+
+ protected:
+  Iterator first;
+  Iterator last;
+  difference_type stride;
+};
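+
+// Illustrative usage (mirrors the Thrust example this class was adapted from;
+// added here for clarity):
+//   thrust::device_vector<int> data(8);
+//   thrust::sequence(data.begin(), data.end());             // 0 1 2 3 4 5 6 7
+//   typedef thrust::device_vector<int>::iterator Iter;
+//   strided_range<Iter> evens(data.begin(), data.end(), 2);  // visits 0 2 4 6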
+
+struct idx_functor
+{
+  long div;
+  long size;
+
+  __host__ __device__
+  idx_functor(long div, long size) : div(div), size(size) {}
+
+  __host__ __device__
+  long operator()(long val) {
+    return (val / div) % size + TH_INDEX_BASE;
+  }
+};
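+
+// Illustrative note (added): idx_functor turns a flat element offset into a
+// TH_INDEX_BASE-based coordinate along one dimension. For a contiguous 3x4
+// tensor, idx_functor(4, 3) applied to offset 9 gives (9 / 4) % 3 + 1 == 3,
+// the 1-based row of that element; the nonzero() implementation pulled in by
+// the generic include at the bottom of this file uses it this way.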
+
+template <typename T>
+struct NonZeroOp
+{
+  NonZeroOp() {}
+  __host__ __device__ bool operator()(T lhs) const {
+    if (THCNumerics<T>::ne(lhs, ScalarConvert<float, T>::to(0.0))) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+};
+
+
+#include "generic/THCTensorMath.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMath.cuh b/lib/THC/THCTensorMath.cuh
new file mode 100644
index 0000000..ae8f5db
--- /dev/null
+++ b/lib/THC/THCTensorMath.cuh
@@ -0,0 +1,102 @@
+#ifndef THC_TENSORMATH_CUH
+#define THC_TENSORMATH_CUH
+
+// Copy the kth diagonal of a matrix B to a vector A.
+template <typename T>
+__global__ void THCTensor_copyFromDiagonal(T* a, T* b, ptrdiff_t start, ptrdiff_t size, ptrdiff_t strideSum, ptrdiff_t strideA) {
+  for (ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+       linearIndex < size;
+       linearIndex += gridDim.x * blockDim.x) {
+    const ptrdiff_t bOffset = start + strideSum * linearIndex;
+    a[strideA * linearIndex] = b[bOffset];
+  }
+}
+
+// Copy vector B to the kth diagonal of a matrix A
+template <typename T>
+__global__ void THCTensor_copyToDiagonal(T* a, T* b, ptrdiff_t start, ptrdiff_t size, ptrdiff_t strideSum, ptrdiff_t strideB) {
+  for (ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+       linearIndex < size;
+       linearIndex += gridDim.x * blockDim.x) {
+    const ptrdiff_t aOffset = start + strideSum * linearIndex;
+    a[aOffset] = b[strideB * linearIndex];
+  }
+}
+
+#define CAT_ARRAY_BATCH_SIZE 1024
+#define CAT_ARRAY_MAX_INPUT_DIMS 4
+
+// Similar to any other IndexToOffset calculation for copying along a given dimension.
+template <typename IndexType, int Dims>
+struct CatArrIndexToOffset {
+  static inline __device__ IndexType compute(
+      const IndexType outputSize[Dims],
+      const IndexType outputStride[Dims],
+      const IndexType dimSize,
+      const unsigned int concatDim,
+      IndexType linearIndex) {
+    IndexType offset = 0;
+
+#pragma unroll
+    for (int i = Dims - 1; i >= 1; --i) {
+      IndexType curDimSize = i == concatDim ? dimSize : outputSize[i];
+      IndexType nextDimIndex = linearIndex / curDimSize;
+      IndexType curDimIndex = linearIndex - curDimSize * nextDimIndex;
+      IndexType curDimOffset = curDimIndex * outputStride[i];
+      offset += curDimOffset;
+      linearIndex = nextDimIndex;
+    }
+
+    return offset + linearIndex * outputStride[0];
+  }
+};
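+
+// Worked example (illustrative): concatenating along dim 1 of a contiguous
+// 2-D output with outputStride {S, 1}. For an input slice with dimSize 3,
+// linearIndex 4 splits into column 4 % 3 == 1 and row 4 / 3 == 1, so
+// compute() returns 1*1 + 1*S; the kernel below then adds
+// offset * dimStride to shift the element into this input's block along the
+// concatenation dimension.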
+
+template <typename T, typename IndexType>
+struct CatArrInputTensor {
+  T* input;
+  IndexType offset;
+  IndexType dimSize;
+  IndexType nElements;
+};
+
+template<typename IndexType, unsigned int MaxDims>
+struct OutputTensorSizeStride {
+  IndexType outputSize[MaxDims];
+  IndexType outputStride[MaxDims];
+};
+
+/**
+  * Kernel used to concatenate gridDim.y tensors into an output tensor. Uses a
+  * grid-stride loop based on blockIdx.x and threadIdx.x for each input to copy
+  * each element from that input tensor into the output.
+  *
+  * output: base pointer to the storage associated with the output tensor
+  * inputs: GPU-allocated array of input metadata for each input to concatenate in the kernel
+  * os: the size/stride vectors for the output tensor
+  * concatDim: dimension along which we are concatenating
+  * dimStride: the stride of the output tensor at the concatDim
+  *
+  * The most important assumption made is that the input tensors are contiguous.
+  */
+template <typename T, typename IndexType, int Dims>
+__global__ void CatArrayBatchedCopy(
+    T* output,
+    CatArrInputTensor<T, IndexType>* inputs,
+    OutputTensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
+    const int concatDim,
+    IndexType dimStride) {
+  T* data = inputs[blockIdx.y].input;
+  IndexType offset = inputs[blockIdx.y].offset;
+  IndexType dimSize = inputs[blockIdx.y].dimSize;
+  IndexType nElements = inputs[blockIdx.y].nElements;
+  IndexType dataOffset = offset * dimStride;
+
+  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+      linearIndex < nElements;
+      linearIndex += gridDim.x * blockDim.x) {
+    IndexType elementOffset = CatArrIndexToOffset<IndexType, Dims>::compute(
+        os.outputSize, os.outputStride, dimSize, concatDim, linearIndex);
+    output[dataOffset + elementOffset] = data[linearIndex];
+  }
+}
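+
+// Launch sketch (an assumption for illustration; the real host-side launch
+// lives in the generic code, not in this header): gridDim.y indexes the
+// inputs of one batch while (gridDim.x, blockDim.x) drive the grid-stride
+// loop over each input's elements, roughly
+//   CatArrayBatchedCopy<float, unsigned int, 2>
+//       <<<dim3(blocksPerInput, batchSize), threads, 0, stream>>>(
+//           outData, d_inputs, os, concatDim, os.outputStride[concatDim]);
+// where blocksPerInput, batchSize, threads, outData and d_inputs are
+// hypothetical names.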
+
+#endif
diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h
new file mode 100644
index 0000000..19ae679
--- /dev/null
+++ b/lib/THC/THCTensorMath.h
@@ -0,0 +1,49 @@
+#ifndef TH_CUDA_TENSOR_MATH_INC
+#define TH_CUDA_TENSOR_MATH_INC
+
+#include "THCTensor.h"
+#include "THCGeneral.h"
+
+#include "generic/THCTensorMath.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMathBlas.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMathMagma.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMathPairwise.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMathPointwise.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMathReduce.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMathCompare.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMathCompareT.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMathScan.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMasked.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorScatterGather.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorIndex.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorSort.h"
+#include "THCGenerateAllTypes.h"
+
+THC_API int THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self);
+THC_API int THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self);
+
+#endif
diff --git a/lib/THC/THCTensorMath2.cu b/lib/THC/THCTensorMath2.cu
new file mode 100644
index 0000000..7e6af9b
--- /dev/null
+++ b/lib/THC/THCTensorMath2.cu
@@ -0,0 +1,30 @@
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCBlas.h"
+#include "THCTensorCopy.h"
+#include "THCTensorRandom.h"
+#include "THCApply.cuh"
+#include "THCReduce.cuh"
+#include "THCTensorMathReduce.cuh"
+#include "THCTensorMathPointwise.cuh"
+
+struct TensorATan2Op {
+  __device__ __forceinline__ void operator()(float* out, float* a, float* b) {
+    *out = atan2f(*a, *b);
+  }
+};
+
+void THCudaTensor_atan2(THCState *state, THCudaTensor *self_, THCudaTensor *tx, THCudaTensor *ty)
+{
+  THAssert(THCudaTensor_checkGPU(state, 3, self_, tx, ty));
+  THArgCheck(THCudaTensor_nElement(state, tx) ==
+             THCudaTensor_nElement(state, ty), 3, "sizes do not match");
+  THCudaTensor_resizeAs(state, self_, tx);
+
+  if (!THC_pointwiseApply3(state, self_, tx, ty, TensorATan2Op())) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
diff --git a/lib/THC/THCTensorMathBlas.cu b/lib/THC/THCTensorMathBlas.cu
new file mode 100644
index 0000000..0804d64
--- /dev/null
+++ b/lib/THC/THCTensorMathBlas.cu
@@ -0,0 +1,8 @@
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCBlas.h"
+#include "THCTensorCopy.h"
+#include "THCNumerics.cuh"
+
+#include "generic/THCTensorMathBlas.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathCompare.cuh b/lib/THC/THCTensorMathCompare.cuh
new file mode 100644
index 0000000..e7e1bb5
--- /dev/null
+++ b/lib/THC/THCTensorMathCompare.cuh
@@ -0,0 +1,87 @@
+#ifndef THC_TENSORMATH_COMPARE_CUH
+#define THC_TENSORMATH_COMPARE_CUH
+
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCTensorCopy.h"
+#include "THCApply.cuh"
+#include "THCNumerics.cuh"
+
+template <typename T, typename TOut>
+struct TensorLTValueOp {
+  TensorLTValueOp(T v) : value(v) {}
+  __device__ __forceinline__ void operator()(TOut* out, T* in) {
+    *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::lt(*in, value));
+  }
+
+  const T value;
+};
+
+template <typename T, typename TOut>
+struct TensorGTValueOp {
+  TensorGTValueOp(T v) : value(v) {}
+  __device__ __forceinline__ void operator()(TOut* out, T* in) {
+    *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::gt(*in, value));
+  }
+
+  const T value;
+};
+
+
+template <typename T, typename TOut>
+struct TensorLEValueOp {
+  TensorLEValueOp(T v) : value(v) {}
+  __device__ __forceinline__ void operator()(TOut* out, T* in) {
+    *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::le(*in, value));
+  }
+
+  const T value;
+};
+
+template <typename T, typename TOut>
+struct TensorGEValueOp {
+  TensorGEValueOp(T v) : value(v) {}
+  __device__ __forceinline__ void operator()(TOut* out, T* in) {
+    *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::ge(*in, value));
+  }
+
+  const T value;
+};
+
+template <typename T, typename TOut>
+struct TensorEQValueOp {
+  TensorEQValueOp(T v) : value(v) {}
+  __device__ __forceinline__ void operator()(TOut* out, T* in) {
+    *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::eq(*in, value));
+  }
+
+  const T value;
+};
+
+template <typename T, typename TOut>
+struct TensorNEValueOp {
+  TensorNEValueOp(T v) : value(v) {}
+  __device__ __forceinline__ void operator()(TOut* out, T* in) {
+    *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::ne(*in, value));
+  }
+
+  const T value;
+};
+
+template<typename TensorType, typename TensorTypeOut, class Op>
+void THC_logicalValue(THCState *state,
+                      TensorTypeOut *self_,
+                      TensorType *src,
+                      Op op) {
+  THLongStorage* st = TensorUtils<TensorType>::newSizeOf(state, src);
+  TensorUtils<TensorTypeOut>::resize(state, self_, st, NULL);
+  THLongStorage_free(st);
+
+  if (!THC_pointwiseApply2(state, self_, src, op)) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif // THC_TENSORMATH_COMPARE_CUH
diff --git a/lib/THC/THCTensorMathCompareT.cuh b/lib/THC/THCTensorMathCompareT.cuh
new file mode 100644
index 0000000..dbf4cf0
--- /dev/null
+++ b/lib/THC/THCTensorMathCompareT.cuh
@@ -0,0 +1,74 @@
+#ifndef THC_TENSORMATH_COMPARET_CUH
+#define THC_TENSORMATH_COMPARET_CUH
+
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCTensorCopy.h"
+#include "THCApply.cuh"
+#include "THCNumerics.cuh"
+#include "THCReduce.cuh"
+
+template <typename T, typename TOut>
+struct TensorLTOp {
+  __device__ inline void operator()(TOut* out, T* a, T* b) {
+    *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::lt(*a, *b));
+  }
+};
+
+template <typename T, typename TOut>
+struct TensorGTOp {
+  __device__ inline void operator()(TOut* out, T* a, T* b) {
+    *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::gt(*a, *b));
+  }
+};
+
+template <typename T, typename TOut>
+struct TensorLEOp {
+  __device__ inline void operator()(TOut* out, T* a, T* b) {
+    *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::le(*a, *b));
+  }
+};
+
+template <typename T, typename TOut>
+struct TensorGEOp {
+  __device__ inline void operator()(TOut* out, T* a, T* b) {
+    *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::ge(*a, *b));
+  }
+};
+
+template <typename T, typename TOut>
+struct TensorEQOp {
+  __device__ inline void operator()(TOut* out, T* a, T* b) {
+    *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::eq(*a, *b));
+  }
+};
+
+template <typename T, typename TOut>
+struct TensorNEOp {
+  __device__ inline void operator()(TOut* out, T* a, T* b) {
+    *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::ne(*a, *b));
+  }
+};
+
+template<typename TensorType, typename TensorTypeOut, typename Op>
+void THC_logicalTensor(THCState *state,
+                       TensorTypeOut *self_,
+                       TensorType *src1,
+                       TensorType *src2,
+                       Op op) {
+  THLongStorage* st = TensorUtils<TensorType>::newSizeOf(state, src1);
+  TensorUtils<TensorTypeOut>::resize(state, self_, st, NULL);
+  THLongStorage_free(st);
+
+  THArgCheck(TensorUtils<TensorType>::getNumElements(state, src1) ==
+             TensorUtils<TensorType>::getNumElements(state, src2), 3,
+             "sizes do not match");
+
+  if (!THC_pointwiseApply3(state, self_, src1, src2, op)) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif // THC_TENSORMATH_COMPARET_CUH
diff --git a/lib/THC/THCTensorMathMagma.cu b/lib/THC/THCTensorMathMagma.cu
new file mode 100644
index 0000000..cac5d73
--- /dev/null
+++ b/lib/THC/THCTensorMathMagma.cu
@@ -0,0 +1,27 @@
+#include "THCGeneral.h"
+#include "THCTensorMath.h"
+#include "THCTensorCopy.h"
+#include "THCTensorMathMagma.cuh"
+#include <algorithm>
+
+#ifdef USE_MAGMA
+#include <magma.h>
+#else
+#include "THCBlas.h"
+#endif
+
+#ifndef DIVUP
+#define DIVUP(x, y) (((x) + (y) - 1) / (y))
+#endif
+
+#define NoMagma(name) "No CUDA implementation of '" #name "'. Install MAGMA and rebuild cutorch (http://icl.cs.utk.edu/magma/)"
+
+void THCMagma_init(THCState *state)
+{
+#ifdef USE_MAGMA
+  magma_init();
+#endif
+}
+
+#include "generic/THCTensorMathMagma.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathMagma.cuh b/lib/THC/THCTensorMathMagma.cuh
new file mode 100644
index 0000000..6495049
--- /dev/null
+++ b/lib/THC/THCTensorMathMagma.cuh
@@ -0,0 +1,22 @@
+#ifndef THC_TENSOR_MATH_MAGMA_CUH
+#define THC_TENSOR_MATH_MAGMA_CUH
+
+#ifdef USE_MAGMA
+#include <magma.h>
+#else
+#include "THCBlas.h"
+#endif
+
+#ifdef USE_MAGMA
+template <typename T>
+static inline T* th_magma_malloc_pinned(size_t n)
+{
+  void* ptr;
+  if (MAGMA_SUCCESS != magma_malloc_pinned(&ptr, n * sizeof(T)))
+    THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", n/268435456);
+  return reinterpret_cast<T*>(ptr);
+}
+
+#endif
+
+#endif // THC_TENSOR_MATH_MAGMA_CUH
diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu
new file mode 100644
index 0000000..094cf0b
--- /dev/null
+++ b/lib/THC/THCTensorMathPairwise.cu
@@ -0,0 +1,403 @@
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCHalf.h"
+#include "THCTensorCopy.h"
+#include "THCApply.cuh"
+#include "THCNumerics.cuh"
+#include "THCTensorMathCompareT.cuh"
+
+template <typename T>
+struct TensorAddConstantOp {
+  TensorAddConstantOp(T v) : val(v) {}
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = *in + val;
+  }
+
+  __device__ __forceinline__ void operator()(T* v) {
+    *v += val;
+  }
+
+  const T val;
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorAddConstantOp<half> {
+#ifdef CUDA_HALF_INSTRUCTIONS
+  TensorAddConstantOp(half v) : val(v) {}
+#else
+  TensorAddConstantOp(half v) : fval(THC_half2float(v)) {}
+#endif
+
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hadd(*in, val);
+#else
+    float fin = __half2float(*in);
+    float fout = fin + fval;
+    *out = __float2half(fout);
+#endif
+  }
+
+  __device__ __forceinline__ void operator()(half* v) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *v = __hadd(*v, val);
+#else
+    float fv = __half2float(*v);
+    fv += fval;
+    *v = __float2half(fv);
+#endif
+  }
+
+#ifdef CUDA_HALF_INSTRUCTIONS
+  const half val;
+#else
+  const float fval;
+#endif
+};
+#endif // CUDA_HALF_TENSOR
+
+
+template <typename T>
+struct TensorSubConstantOp {
+  TensorSubConstantOp(T v) : val(v) {}
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = *in - val;
+  }
+
+  __device__ __forceinline__ void operator()(T* v) {
+    *v -= val;
+  }
+
+  const T val;
+};
+
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorSubConstantOp<half> {
+#ifdef CUDA_HALF_INSTRUCTIONS
+  TensorSubConstantOp(half v): val(THC_float2half(-(THC_half2float(v)))) {}
+#else
+  TensorSubConstantOp(half v): fval(-(THC_half2float(v))) {}
+#endif
+
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hadd(*in, val);
+#else
+    float fin = __half2float(*in);
+    float fout = fin + fval;
+    *out = __float2half(fout);
+#endif
+  }
+
+  __device__ __forceinline__ void operator()(half* v) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *v = __hadd(*v, val);
+#else
+    float fv = __half2float(*v);
+    fv += fval;
+    *v = __float2half(fv);
+#endif
+  }
+
+#ifdef CUDA_HALF_INSTRUCTIONS
+  const half val;
+#else
+  const float fval;
+#endif
+};
+#endif // CUDA_HALF_TENSOR
+
+
+template <typename T>
+struct TensorMulConstantOp {
+  TensorMulConstantOp(T v) : val(v) {}
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = *in * val;
+  }
+
+  __device__ __forceinline__ void operator()(T* v) {
+    *v *= val;
+  }
+
+  const T val;
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorMulConstantOp<half> {
+#ifdef CUDA_HALF_INSTRUCTIONS
+  TensorMulConstantOp(half v) : val(v) {}
+#else
+  TensorMulConstantOp(half v) : fval(THC_half2float(v)) {}
+#endif
+
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hmul(*in, val);
+#else
+    float fin = __half2float(*in);
+    float fout = fin * fval;
+    *out = __float2half(fout);
+#endif
+  }
+
+  __device__ __forceinline__ void operator()(half* v) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *v = __hmul(*v, val);
+#else
+    float fv = __half2float(*v);
+    fv *= fval;
+    *v = __float2half(fv);
+#endif
+  }
+
+#ifdef CUDA_HALF_INSTRUCTIONS
+  const half val;
+#else
+  const float fval;
+#endif
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorDivConstantOp {
+  TensorDivConstantOp(T v) : val(v) {}
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = *in / val;
+  }
+
+  __device__ __forceinline__ void operator()(T* v) {
+    *v /= val;
+  }
+
+  const T val;
+};
+
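+// Note (added comment): the float, double and half specializations below
+// precompute the reciprocal of the divisor and multiply by it instead of
+// dividing, trading a small amount of accuracy for a cheaper per-element op.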
+template <>
+struct TensorDivConstantOp<float> {
+  TensorDivConstantOp(float v) : val(1.f / v) {}
+  __device__ __forceinline__ void operator()(float* out, float* in) {
+    *out = *in * val;
+  }
+
+  __device__ __forceinline__ void operator()(float* v) {
+    *v *= val;
+  }
+
+  const float val;
+};
+
+template <>
+struct TensorDivConstantOp<double> {
+  TensorDivConstantOp(double v) : val(1. / v) {}
+  __device__ __forceinline__ void operator()(double* out, double* in) {
+    *out = *in * val;
+  }
+
+  __device__ __forceinline__ void operator()(double* v) {
+    *v *= val;
+  }
+
+  const double val;
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorDivConstantOp<half> {
+#ifdef CUDA_HALF_INSTRUCTIONS
+  TensorDivConstantOp(half v) : val(ScalarInv<half>::to(v)) {}
+#else
+  TensorDivConstantOp(half v) : fval(1.f / THC_half2float(v)) {}
+#endif
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hmul(*in, val);
+#else
+    float fin = __half2float(*in);
+    float fout = fin * fval;
+    *out = __float2half(fout);
+#endif
+  }
+
+  __device__ __forceinline__ void operator()(half* v) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *v = __hmul(*v, val);
+#else
+    float fv = __half2float(*v);
+    fv *= fval;
+    *v = __float2half(fv);
+#endif
+  }
+
+#ifdef CUDA_HALF_INSTRUCTIONS
+  const half val;
+#else
+  const float fval;
+#endif
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorRemainderOp {
+  TensorRemainderOp(T v) : val(v) {}
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = *in - val * (*in / val);
+  }
+
+  __device__ __forceinline__ void operator()(T* v) {
+    *v = *v - val * (*v / val);
+  }
+
+  const T val;
+};
+
+template <>
+struct TensorRemainderOp<float> {
+  TensorRemainderOp(float v) : val(v) {}
+  __device__ __forceinline__ void operator()(float* out, float* in) {
+    *out = *in - val * floorf(*in / val);
+  }
+
+  __device__ __forceinline__ void operator()(float* v) {
+    *v = *v - val * floorf(*v / val);
+  }
+
+  const float val;
+};
+
+template <>
+struct TensorRemainderOp<double> {
+  TensorRemainderOp(double v) : val(v) {}
+  __device__ __forceinline__ void operator()(double* out, double* in) {
+    *out = *in - val * floor(*in / val);
+  }
+
+  __device__ __forceinline__ void operator()(double* v) {
+    *v = *v - val * floor(*v / val);
+  }
+
+  const double val;
+};
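+
+// Note (added comment): for float/double, x - val * floor(x / val) is a
+// floored modulo, so the result takes the sign of val. This differs from
+// TensorFmodOp below, which uses fmodf/fmod and keeps the sign of the
+// dividend.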
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorRemainderOp<half> {
+#ifdef CUDA_HALF_INSTRUCTIONS
+  TensorRemainderOp(half v) : val(v) {}
+#else
+  TensorRemainderOp(half v): fval(THC_half2float(v)) {}
+#endif
+
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hsub(*in,  __hmul(val, hfloor(__hdiv(*in,  val))));
+#else
+    float fin = __half2float(*in);
+    float fout = fin - fval * floorf(fin / fval);
+    *out = __float2half(fout);
+#endif
+  }
+
+  __device__ __forceinline__ void operator()(half* v) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *v = __hsub(*v, __hmul(val, hfloor(__hdiv(*v, val))));
+#else
+    float fv = __half2float(*v);
+    fv = fv - fval * floorf(fv / fval);
+    *v = __float2half(fv);
+#endif
+  }
+
+#ifdef CUDA_HALF_INSTRUCTIONS
+  const half val;
+#else
+  const float fval;
+#endif
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorFmodOp {
+  TensorFmodOp(T v) : val((float)v) {}
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = (T) fmodf((float) *in, val);
+  }
+
+  __device__ __forceinline__ void operator()(T* v) {
+    *v = (T) fmodf((float) *v, val);
+  }
+
+  const float val;
+};
+
+template <>
+struct TensorFmodOp<double> {
+  TensorFmodOp(double v) : val(v) {}
+  __device__ __forceinline__ void operator()(double* out, double* in) {
+    *out = fmod(*in, val);
+  }
+
+  __device__ __forceinline__ void operator()(double* v) {
+    *v = fmod(*v, val);
+  }
+
+  const double val;
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorFmodOp<half> {
+  TensorFmodOp(half v): fval(THC_half2float(v)) {}
+
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+    *out = __float2half(fmodf(__half2float(*in), fval));
+  }
+
+  __device__ __forceinline__ void operator()(half* v) {
+    *v = __float2half(fmodf(__half2float(*v), fval));
+  }
+
+  const float fval;
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T, int Upper>
+struct TensorTriOp {
+  TensorTriOp(T *start_, long stride0_, long stride1_, long k_)
+    : start(start_), stride0(stride0_), stride1(stride1_), k(k_) {}
+
+  __device__ __forceinline__ int mask(T *in) {
+    ptrdiff_t n = in - start;
+    long row, col;
+    if (stride0 > stride1)
+    {
+      row = (long) (n / stride0);
+      col = (long) ((n % stride0) / stride1);
+    }
+    else
+    {
+      row = (long) ((n % stride1) / stride0);
+      col = (long) (n / stride1);
+    }
+
+    return Upper ? (col - row >= k) : (col - row <= k);
+  }
+
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = mask(in) ? *in : ScalarConvert<int, T>::to(0);
+  }
+
+  __device__ __forceinline__ void operator()(T* v) {
+    if (!mask(v))
+      *v = ScalarConvert<int, T>::to(0);
+  }
+
+  const T *start;
+  const long stride0, stride1, k;
+};
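+
+// Illustrative note (added): with Upper == 1 (triu) an element at (row, col)
+// is kept when col - row >= k, so k == 0 keeps the main diagonal and above
+// and k == 1 drops the diagonal as well; with Upper == 0 (tril) elements with
+// col - row <= k are kept.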
+
+#include "generic/THCTensorMathPairwise.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh
new file mode 100644
index 0000000..de96cad
--- /dev/null
+++ b/lib/THC/THCTensorMathPointwise.cuh
@@ -0,0 +1,663 @@
+#ifndef THC_TENSORMATH_POINTWISE_CUH
+#define THC_TENSORMATH_POINTWISE_CUH
+
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCHalf.h"
+#include "THCTensorCopy.h"
+#include "THCApply.cuh"
+#include "THCNumerics.cuh"
+#include "THCReduce.cuh"
+
+template <typename T>
+struct TensorSigmoidOp {
+  __device__ __forceinline__ void operator()(T* out, T* in) const {
+    T one = (T) 1.0;
+    *out = one / (one + THCNumerics<T>::exp(- *in));
+  }
+
+  __device__ __forceinline__ void operator()(T* v) const {
+    T one = (T) 1.0;
+    *v = one / (one + THCNumerics<T>::exp(- *v));
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorSigmoidOp<half> {
+  __device__ __forceinline__ void operator()(half* out, half* in) const {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    half one = ScalarConvert<int, half>::to(1);
+    *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
+#else
+    float fin = __half2float(*in);
+    *out = __float2half(1.0f / (1.0f + expf(- fin)));
+#endif
+  }
+
+  __device__ __forceinline__ void operator()(half* v) const {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    half one = ScalarConvert<int, half>::to(1);
+    *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
+#else
+    float fv = __half2float(*v);
+    *v = __float2half(1.0f / (1.0f + expf(- fv)));
+#endif
+  }
+};
+#endif
+
+template <typename T>
+struct TensorSignOp {
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    T orig = *in;
+    *out = (orig > 0) - (orig < 0);
+  }
+
+  __device__ __forceinline__ void operator()(T* v) {
+    T orig = *v;
+    *v = (orig > 0) - (orig < 0);
+  }
+};
+
+template <>
+struct TensorSignOp<unsigned char> {
+  __device__ __forceinline__ void operator()(unsigned char* out, unsigned char* in) {
+    unsigned char orig = *in;
+    *out = (orig == 0) ? 0 : 1;
+  }
+
+  __device__ __forceinline__ void operator()(unsigned char* v) {
+    unsigned char orig = *v;
+    *v = (orig == 0) ? 0 : 1;
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorSignOp<half> {
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    half zero = ScalarConvert<int, half>::to(0);
+    half orig = *in;
+    *out = __float2half((float) __hgt(orig, zero) - (float) __hlt(orig, zero));
+#else
+    float orig = __half2float(*in);
+    *out = __float2half((orig > 0) - (orig < 0));
+#endif
+  }
+
+  __device__ __forceinline__ void operator()(half* v) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    half zero = ScalarConvert<int, half>::to(0);
+    half orig = *v;
+    *v = __float2half((float) __hgt(orig, zero) -  (float) __hlt(orig, zero));
+#else
+    float orig = __half2float(*v);
+    *v = __float2half((orig > 0) - (orig < 0));
+#endif
+  }
+};
+#endif
+
+template <typename T>
+struct TensorAddOp {
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out += *in;
+  }
+
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = *in1 + *in2;
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorAddOp<half> {
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hadd(*out, *in);
+#else
+    float fout = __half2float(*out);
+    float fin = __half2float(*in);
+    fout += fin;
+    *out = __float2half(fout);
+#endif
+  }
+
+  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hadd(*in1, *in2);
+#else
+    float fin1 = __half2float(*in1);
+    float fin2 = __half2float(*in2);
+    float fout = fin1 + fin2;
+    *out = __float2half(fout);
+#endif
+  }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorCAddOp {
+  TensorCAddOp(T v) : val(v) {}
+
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out += val * *in;
+  }
+
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = *in1 + val * *in2;
+  }
+
+  T val;
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorCAddOp<half> {
+  TensorCAddOp(half v) : val(v) {}
+
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hadd(*out, __hmul(val, *in));
+#else
+    float fout = __half2float(*out);
+    float fval = __half2float(val);
+    float fin = __half2float(*in);
+
+    fout += fval * fin;
+    *out = __float2half(fout);
+#endif
+  }
+
+  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hadd(*in1, __hmul(val, *in2));
+#else
+    float fin1 = __half2float(*in1);
+    float fin2 = __half2float(*in2);
+    float fval = __half2float(val);
+
+    float fout = fin1 + fval * fin2;
+    *out = __float2half(fout);
+#endif
+  }
+
+  half val;
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorSubOp {
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out -= *in;
+  }
+
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = *in1 - *in2;
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorSubOp<half> {
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hsub(*out, *in);
+#else
+    float fout = __half2float(*out);
+    float fin = __half2float(*in);
+    fout -= fin;
+    *out = __float2half(fout);
+#endif
+  }
+
+  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hsub(*in1, *in2);
+#else
+    float fin1 = __half2float(*in1);
+    float fin2 = __half2float(*in2);
+    float fout = fin1 - fin2;
+    *out = __float2half(fout);
+#endif
+  }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorMulOp {
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out *= *in;
+  }
+
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = *in1 * *in2;
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorMulOp<half> {
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hmul(*out, *in);
+#else
+    float fout = __half2float(*out);
+    float fin = __half2float(*in);
+    fout *= fin;
+    *out = __float2half(fout);
+#endif
+  }
+
+  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hmul(*in1, *in2);
+#else
+    float fin1 = __half2float(*in1);
+    float fin2 = __half2float(*in2);
+    float fout = fin1 * fin2;
+    *out = __float2half(fout);
+#endif
+  }
+};
+#endif // CUDA_HALF_TENSOR
+
+template<typename T>
+struct TensorPowOp {
+  TensorPowOp(T v) : val(v) {}
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = powf((float) *in, (float) val);
+  }
+
+  __device__ __forceinline__ void operator()(T* v) {
+    *v = powf((float) *v, (float) val);
+  }
+
+  const T val;
+};
+
+template <>
+struct TensorPowOp<double> {
+  TensorPowOp(double v) : val(v) {}
+
+  __device__ __forceinline__ void operator()(double* out, double* in) {
+    *out = pow(*in, val);
+  }
+
+  __device__ __forceinline__ void operator()(double* v) {
+    *v = pow(*v, val);
+  }
+
+  const double val;
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorPowOp<half> {
+  TensorPowOp(half v) : val(v) {}
+
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+    // No fp16 pow function yet
+    float fin = __half2float(*in);
+    float fval = __half2float(val);
+    float fout = powf(fin, fval);
+    *out = __float2half(fout);
+  }
+
+  __device__ __forceinline__ void operator()(half* v) {
+    // No fp16 pow function yet
+    float fv = __half2float(*v);
+    float fval = __half2float(val);
+    float fout = powf(fv, fval);
+    *v = __float2half(fout);
+  }
+
+  const half val;
+};
+#endif // CUDA_HALF_TENSOR
+
+template<typename T>
+struct TensorTPowOp {
+  TensorTPowOp(T v) : val(v) {}
+
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = THCNumerics<T>::pow(val, *in);
+  }
+
+  __device__ __forceinline__ void operator()(T* v) {
+    *v = THCNumerics<T>::pow(val, *v);
+  }
+
+  const T val;
+};
+
+template <typename T>
+struct TensorCPowOp {
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = powf((float) *out, (float) *in);
+  }
+
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = powf((float) *in1, (float) *in2);
+  }
+};
+
+template <>
+struct TensorCPowOp<double> {
+  __device__ __forceinline__ void operator()(double* out, double* in) {
+    *out = pow(*out, *in);
+  }
+
+  __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) {
+    *out = pow(*in1, *in2);
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorCPowOp<half> {
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+    // No fp16 pow function yet
+    float fout = __half2float(*out);
+    float fin = __half2float(*in);
+    fout = powf(fout, fin);
+    *out = __float2half(fout);
+  }
+
+  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
+    // No fp16 pow function yet
+    float fin1 = __half2float(*in1);
+    float fin2 = __half2float(*in2);
+    float fout = powf(fin1, fin2);
+    *out = __float2half(fout);
+  }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorDivOp {
+  __device__ __forceinline__ void
+  operator()(T* out, T* in) {
+    *out /= *in;
+  }
+
+  __device__ __forceinline__ void
+  operator()(T* out, T* in1, T* in2) {
+    *out = *in1 / *in2;
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorDivOp<half> {
+  __device__ __forceinline__ void
+  operator()(half* out, half* in) {
+    // No fp16 div instruction yet
+    float fout = __half2float(*out);
+    float fin = __half2float(*in);
+    fout /= fin;
+    *out = __float2half(fout);
+  }
+
+  __device__ __forceinline__ void
+  operator()(half* out, half* in1, half* in2) {
+    // No fp16 div instruction yet
+    float fin1 = __half2float(*in1);
+    float fin2 = __half2float(*in2);
+    float fout = fin1 / fin2;
+    *out = __float2half(fout);
+  }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorCRemainderOp {
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = *in != 0 ? *out - *in * (*out / *in) : NAN;
+  }
+
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = *in2 != 0 ? *in1 - *in2 * (*in1 / *in2) : NAN;
+  }
+};
+
+template <>
+struct TensorCRemainderOp<float> {
+  __device__ __forceinline__ void operator()(float* out, float* in) {
+    *out = *in != 0 ? *out - *in * floorf(*out / *in) : NAN;
+  }
+
+  __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) {
+    *out = *in2 != 0 ? *in1 - *in2 * floorf(*in1 / *in2) : NAN;
+  }
+};
+
+template <>
+struct TensorCRemainderOp<double> {
+  __device__ __forceinline__ void operator()(double* out, double* in) {
+    *out = *in != 0 ? *out - *in * floor(*out / *in) : NAN;
+  }
+
+  __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) {
+    *out = *in2 != 0 ? *in1 - *in2 * floor(*in1 / *in2) : NAN;
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorCRemainderOp<half> {
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hsub(*out, __hmul(*in, hfloor(__hdiv(*out, *in))));
+#else
+    float fout = __half2float(*out);
+    float fin = __half2float(*in);
+    *out = fin != 0 ? __float2half(fout - fin * floorf(fout / fin)) : __float2half(NAN);
+#endif
+  }
+
+  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    *out = __hsub(*in1, __hmul(*in2, hfloor(__hdiv(*in1, *in2))));
+#else
+    float fin1 = __half2float(*in1);
+    float fin2 = __half2float(*in2);
+    *out = fin2 != 0 ? __float2half(fin1 - fin2 * floorf(fin1 / fin2)) : __float2half(NAN);
+#endif
+  }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorCFmodOp {
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = *out % *in;
+  }
+
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = *in1 % *in2;
+  }
+};
+
+template <>
+struct TensorCFmodOp<float> {
+  __device__ __forceinline__ void operator()(float* out, float* in) {
+    *out = fmodf(*out, *in);
+  }
+
+  __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) {
+    *out = fmodf(*in1, *in2);
+  }
+};
+
+template <>
+struct TensorCFmodOp<double> {
+  __device__ __forceinline__ void operator()(double* out, double* in) {
+    *out = fmod(*out, *in);
+  }
+
+  __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) {
+    *out = fmod(*in1, *in2);
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorCFmodOp<half> {
+  __device__ __forceinline__ void operator()(half* out, half* in) {
+    *out = __float2half(fmodf(__half2float(*out), __half2float(*in)));
+  }
+
+  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
+    *out = __float2half(fmodf(__half2float(*in1), __half2float(*in2)));
+  }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorClampOp {
+  TensorClampOp(T min, T max) : minValue(min), maxValue(max) {}
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    T val = THCNumerics<T>::lt(*in, maxValue) ? *in : maxValue;
+    *out = THCNumerics<T>::gt(minValue, val) ? minValue : val;
+  }
+
+  __device__ __forceinline__ void operator()(T* v) {
+    T val = THCNumerics<T>::lt(*v, maxValue) ? *v : maxValue;
+    *v = THCNumerics<T>::gt(minValue, val) ? minValue : val;
+  }
+
+  const T minValue;
+  const T maxValue;
+};
+
+template <typename T>
+struct TensorLerpOp {
+  TensorLerpOp(T w) : w(w) {}
+
+  __device__ __forceinline__ void operator()(T *out, T *a, T *b) {
+    *out = THCNumerics<T>::add(
+      *a,
+      THCNumerics<T>::mul(
+          w,
+          THCNumerics<T>::sub(*b, *a)
+        )
+    );
+  }
+
+  const T w;
+};
+
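+// Computes one 3-component cross product per call; sx, sy and so are the
+// element strides of the x, y and output tensors along the cross dimension.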
+template <typename T>
+struct TensorCrossOp {
+  TensorCrossOp(long sx, long sy, long so) : sx(sx), sy(sy), so(so) {}
+
+  __device__ __forceinline__ void operator()(T* out, T* x, T*y) {
+    out[0 * so] = THCNumerics<T>::sub(
+        THCNumerics<T>::mul(x[1 * sx], y[2 * sy]),
+        THCNumerics<T>::mul(x[2 * sx], y[1 * sy])
+    );
+
+    out[1 * so] = THCNumerics<T>::sub(
+        THCNumerics<T>::mul(x[2 * sx], y[0 * sy]),
+        THCNumerics<T>::mul(x[0 * sx], y[2 * sy])
+    );
+
+    out[2 * so] = THCNumerics<T>::sub(
+        THCNumerics<T>::mul(x[0 * sx], y[1 * sy]),
+        THCNumerics<T>::mul(x[1 * sx], y[0 * sy])
+    );
+  }
+
+  const long sx, sy, so;
+};
+
+template <typename T>
+struct TensorMaxOp {
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = THCNumerics<T>::gt(*out, *in) ? *out : *in;
+  }
+
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = THCNumerics<T>::gt(*in1, *in2) ? *in1 : *in2;
+  }
+};
+
+template <typename T>
+struct TensorMinOp {
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = THCNumerics<T>::lt(*out, *in) ? *out : *in;
+  }
+
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = THCNumerics<T>::lt(*in1, *in2) ? *in1 : *in2;
+  }
+};
+
+template <typename T>
+struct TensorMaxValueOp {
+  TensorMaxValueOp(T v) : val(v) {}
+
+  __device__ __forceinline__ void operator()(T* out) {
+    *out = THCNumerics<T>::gt(*out, val) ? *out : val;
+  }
+
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = THCNumerics<T>::gt(*in, val) ? *in : val;
+  }
+
+  T val;
+};
+
+template <typename T>
+struct TensorMinValueOp {
+  TensorMinValueOp(T v) : val(v) {}
+
+  __device__ __forceinline__ void operator()(T* out) {
+    *out = THCNumerics<T>::lt(*out, val) ? *out : val;
+  }
+
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    *out = THCNumerics<T>::lt(*in, val) ? *in : val;
+  }
+
+  T val;
+};
+
+template <typename T>
+struct TensorAddCMulOp {
+  TensorAddCMulOp(T v) : val(v) {}
+
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = THCNumerics<T>::add(
+      *out,
+      THCNumerics<T>::mul(
+        val,
+        THCNumerics<T>::mul(*in1, *in2)
+      )
+    );
+  }
+
+  T val;
+};
+
+template <typename T>
+struct TensorAddCDivOp {
+  TensorAddCDivOp(T v) : val(v) {}
+
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = THCNumerics<T>::add(
+      *out,
+      THCNumerics<T>::mul(
+        val,
+        THCNumerics<T>::div(*in1, *in2)
+      )
+    );
+  }
+
+  T val;
+};
+
+#endif // THC_TENSORMATH_POINTWISE_CUH
diff --git a/lib/THC/THCTensorMathReduce.cu b/lib/THC/THCTensorMathReduce.cu
new file mode 100644
index 0000000..446daec
--- /dev/null
+++ b/lib/THC/THCTensorMathReduce.cu
@@ -0,0 +1,31 @@
+#include "THCTensorMathReduce.cuh"
+
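+/* logicalall returns 1 iff every element of `self` is nonzero; logicalany
+ * returns 1 iff at least one element is nonzero.  Both are whole-tensor
+ * reductions that use the identity as the per-element transform. */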
+THC_API int
+THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self) {
+  THAssert(THCudaByteTensor_checkGPU(state, 1, self));
+  unsigned char result;
+  if (!THC_reduceAll(state, self,
+                     thrust::identity<unsigned char>(),
+                     LogicalAll(),
+                     LogicalAll(),
+                     (unsigned char) 1, &result, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  return (int) result;
+}
+
+THC_API int
+THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self) {
+  THAssert(THCudaByteTensor_checkGPU(state, 1, self));
+  unsigned char result;
+  if (!THC_reduceAll(state, self,
+                     thrust::identity<unsigned char>(),
+                     LogicalAny(),
+                     LogicalAny(),
+                     (unsigned char) 0, &result, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  return (int) result;
+}
diff --git a/lib/THC/THCTensorMathReduce.cuh b/lib/THC/THCTensorMathReduce.cuh
new file mode 100644
index 0000000..5fefbab
--- /dev/null
+++ b/lib/THC/THCTensorMathReduce.cuh
@@ -0,0 +1,690 @@
+#ifndef THC_TENSORMATH_REDUCE_CUH
+#define THC_TENSORMATH_REDUCE_CUH
+
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCNumerics.cuh"
+#include "THCReduce.cuh"
+#include "THCReduceAll.cuh"
+#include "THCThrustAllocator.cuh"
+#include <thrust/functional.h>
+#include <thrust/device_ptr.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/inner_product.h>
+#if CUDA_VERSION >= 7000
+#include <thrust/system/cuda/execution_policy.h>
+#endif
+
+// Reduction operators that support `half`, which Thrust's built-in functors do not.
+template <typename InT, typename AccT>
+struct ReduceAdd {
+  inline __device__ AccT operator()(AccT a, InT b) const {
+    return a + (AccT) b;
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct ReduceAdd<half, half> {
+  inline __device__ half operator()(half a, half b) const {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hadd(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half(fa + fb);
+#endif
+  }
+};
+
+template <>
+struct ReduceAdd<half, float> {
+  inline __device__ float operator()(float a, half b) const {
+    return a + __half2float(b);
+  }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename InT, typename AccT>
+struct ReduceMultiply {
+  inline __device__ AccT operator()(AccT a, InT b) const {
+    return a * (AccT) b;
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct ReduceMultiply<half, half> {
+  inline __device__ half operator()(half a, half b) const {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hmul(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half(fa * fb);
+#endif
+  }
+};
+
+template <>
+struct ReduceMultiply<half, float> {
+  inline __device__ float operator()(float a, half b) const {
+    return a * __half2float(b);
+  }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename ResT, typename ArgT>
+struct SquareFunctor {
+    SquareFunctor(ResT mean): mean_(mean) {}
+
+    inline __device__ ResT operator()(ArgT x) const {
+      return (((ResT) x) - mean_) * (((ResT) x) - mean_);
+    }
+
+    const ResT mean_;
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <typename ResT>
+struct SquareFunctor<ResT, half> {
+    SquareFunctor(ResT mean): mean_(mean) {}
+
+    inline __device__ ResT operator()(half x) const {
+      return THCNumerics<ResT>::mul(
+        THCNumerics<ResT>::sub(mean_, ScalarConvert<half, ResT>::to(x)),
+        THCNumerics<ResT>::sub(mean_, ScalarConvert<half, ResT>::to(x))
+      );
+    }
+
+    const ResT mean_;
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct ReduceMin {
+  inline __device__ T operator()(T a, T b) const {
+    return THCNumerics<T>::lt(a, b) ? a : b;
+  }
+};
+
+template <typename T>
+struct ReduceMax {
+  inline __device__ T operator()(T a, T b) const {
+    return THCNumerics<T>::gt(a, b) ? a : b;
+  }
+};
+
+struct LogicalAll {
+  inline __device__ unsigned char operator()(unsigned char x,
+                                             unsigned char y) const {
+    return (x && y);
+  }
+};
+
+struct LogicalAny {
+  inline __device__ unsigned char operator()(unsigned char x,
+                                             unsigned char y) const {
+    return (x || y);
+  }
+};
+
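+/* Renormalizes each size-`size` row of `data` in place: one block per row
+ * computes the row's `value`-norm and, if it exceeds maxnorm, scales the row
+ * by maxnorm / (norm + 1e-7) to bring the norm back under the limit. */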
+template<typename Real>
+__global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrdiff_t size, const Real maxnorm)
+{
+  __shared__ Real buffer[32];
+  long tx = threadIdx.x;
+  long bx = blockIdx.x;
+  long step = blockDim.x;
+  Real *row = data + size*bx;
+
+  buffer[tx] = ScalarConvert<int, Real>::to(0);
+
+  // get norm of axis
+  for (ptrdiff_t i=tx; i<size; i+=step)
+  {
+    buffer[tx] = THCNumerics<Real>::add(
+      buffer[tx],
+      THCNumerics<Real>::pow(
+        THCNumerics<Real>::abs(row[i]),
+        value)
+    );
+  }
+  // add (reduce)
+  for (unsigned int stride = blockDim.x >> 1; stride > 0; stride >>= 1)
+  {
+    __syncthreads();
+    if (tx < stride)
+      buffer[tx] = THCNumerics<Real>::add(buffer[tx], buffer[tx+stride]);
+  }
+  // clip norms
+  __syncthreads();
+  Real norm = THCNumerics<Real>::pow(buffer[0], THCNumerics<Real>::cinv(value));
+  if (THCNumerics<Real>::gt(norm, maxnorm))
+  {
+    norm = THCNumerics<Real>::div(
+      maxnorm,
+      THCNumerics<Real>::add(
+        norm,
+        ScalarConvert<float, Real>::to(1e-7)
+      )
+    );
+    // renormalize
+    for (ptrdiff_t i=tx; i<size; i+=step)
+    {
+      row[i] = THCNumerics<Real>::mul(row[i], norm);
+    }
+  }
+}
+
+template <typename T>
+struct TensorNonZeroOp
+{
+  TensorNonZeroOp() {}
+  __host__ __device__ T operator()(T lhs) const {
+    if (THCNumerics<T>::eq(lhs, ScalarConvert<float, T>::to(0.0))) {
+      return ScalarConvert<int, T>::to(0);
+    } else {
+      return ScalarConvert<int, T>::to(1);
+    }
+  }
+};
+
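+// Per-element transform used when computing p-norms.  StaticExp lets the
+// common cases be resolved at compile time: p == 1 uses |x|, p == 2 uses
+// x * x, and any other exponent falls back to pow(|x|, p).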
+template <typename T, int StaticExp>
+struct TensorNormOp
+{
+  TensorNormOp(T exp) : exponent(exp) {}
+
+  __host__ __device__ T operator()(T x) const {
+    if (StaticExp == 1) {
+      return (T) fabsf((float) x);
+    } else if (StaticExp == 2) {
+      return x * x;
+    } else {
+      return (T) powf(fabsf((float) x), (float) exponent);
+    }
+  }
+
+  const T exponent;
+};
+
+template <int StaticExp>
+struct TensorNormOp<double, StaticExp>
+{
+  TensorNormOp(double exp) : exponent(exp) {}
+
+  __host__ __device__ double operator()(double x) const {
+    if (StaticExp == 1) {
+      return fabs(x);
+    } else if (StaticExp == 2) {
+      return x * x;
+    } else {
+      return pow(fabs(x), exponent);
+    }
+  }
+
+  const double exponent;
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <int StaticExp>
+struct TensorNormOp<half, StaticExp>
+{
+  TensorNormOp(half exp) : exponent(exp) {}
+
+  __host__ __device__ half operator()(half x) const {
+    if (StaticExp == 1) {
+      return THCNumerics<half>::abs(x);
+    } else if (StaticExp == 2) {
+      return THCNumerics<half>::mul(x, x);
+    } else {
+      return THCNumerics<half>::pow(THCNumerics<half>::abs(x), exponent);
+    }
+  }
+
+  const half exponent;
+};
+#endif
+
+template <typename Tacc, typename T>
+struct TensorDistOp
+{
+  TensorDistOp(Tacc exp) : exponent(exp) {}
+
+  __host__ __device__ Tacc operator()(T x, T y) const {
+    Tacc xr = ScalarConvert<T, Tacc>::to(x);
+    Tacc yr = ScalarConvert<T, Tacc>::to(y);
+    return THCNumerics<Tacc>::pow(
+      THCNumerics<Tacc>::abs(THCNumerics<Tacc>::sub(xr, yr)),
+      exponent
+    );
+  }
+
+  const Tacc exponent;
+};
+
+#include <thrust/functional.h>
+
+// Given the sum of values and the sum of squares, compute the variance or standard deviation.
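+// With flag set this is the biased estimate  sum2/n - (sum/n)^2  (normalized
+// by row_size); otherwise it is the unbiased estimate
+// (sum2 - n*mean^2) / (n - 1), with mean = sum/n and n = row_size.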
+template<typename Real, bool flag, bool apply_sqrt>
+__forceinline__ __device__ Real THCTensor_computeVar(Real sum, Real sum2, unsigned row_size) {
+  Real rs2 = ScalarConvert<unsigned, Real>::to(row_size);
+  Real rs2m = ScalarConvert<unsigned, Real>::to(row_size - 1);
+  Real zero = ScalarConvert<int, Real>::to(0);
+  if (flag) {
+    sum = THCNumerics<Real>::div(sum, rs2);
+    sum2 = THCNumerics<Real>::div(sum2, rs2);
+    sum2 = THCNumerics<Real>::sub(sum2, THCNumerics<Real>::mul(sum, sum));
+    sum2 = (THCNumerics<Real>::lt(sum2, zero) ? zero : sum2);
+  }
+  else {
+    sum = THCNumerics<Real>::div(sum, rs2);
+    sum2 = THCNumerics<Real>::div(sum2, rs2m);
+    sum2 = THCNumerics<Real>::sub(sum2,
+      THCNumerics<Real>::mul(
+        THCNumerics<Real>::div(rs2, rs2m),
+        THCNumerics<Real>::mul(sum, sum)));
+    sum2 = (THCNumerics<Real>::lt(sum2, zero) ? zero : sum2);
+  }
+  if (apply_sqrt)
+    return THCNumerics<Real>::sqrt(sum2);
+  else
+    return sum2;
+}
+
+/* Compute the variance (or standard deviation) along an outer dimension of a tensor.
+ *
+ * - num_orows is the size of the flattened outer dimensions;
+ * - num_irows is the size of the flattened inner dimensions;
+ * - row_size is the size of the dimension along which to compute the variance;
+ * - if flag is set, normalize by `row_size` instead of `row_size - 1`
+ * - if apply_sqrt is set, compute the standard deviation instead of variance
+ *
+ * The dimensions to the outside and inside of the specified dimension are considered as flattened.
+ * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened
+ * outer dimensions, which contains several "inner rows").
+ * Each thread processes a single inner row at a time.
+ */
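+// Example: for a tensor of size (a, b, c) with dimension == 1, the host code
+// below flattens it so that num_orows == a, row_size == b and num_irows == c;
+// each thread then walks the b values of one (outer row, inner row) pair.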
+template<typename Real, bool flag, bool apply_sqrt>
+__global__ void THCTensor_kernel_varOuterDim(Real *tgt, Real *src_, unsigned num_orows, unsigned num_irows, unsigned row_size)
+{
+  for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
+    for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) {
+      Real *src = src_ + orow * row_size * num_irows + irow;
+      Real sum = ScalarConvert<int, Real>::to(0), sum2 = ScalarConvert<int, Real>::to(0);
+
+      for (unsigned col = 0; col < row_size; ++col) {
+        Real val = *src;
+        sum = THCNumerics<Real>::add(sum, val);
+        sum2 = THCNumerics<Real>::add(
+          sum2,
+          THCNumerics<Real>::mul(val, val)
+        );
+
+        src += num_irows;
+      }
+
+      tgt[orow * num_irows + irow] = THCTensor_computeVar<Real, flag, apply_sqrt>(sum, sum2, row_size);
+    }
+  }
+}
+
+template<typename TensorTypeK, typename Real, bool apply_sqrt>
+__host__ void THCTensor_varOuterDim(THCState *state, TensorTypeK *tgt, TensorTypeK *src, long dimension, int flag)
+{
+  unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src);
+  // Treat all outer dimensions (i.e. dim < dimension) as one.
+  unsigned num_orows = 1;
+  for (long dim = 0; dim < dimension; dim++) {
+    num_orows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
+  }
+  unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, dimension);
+  // Treat all inner dimensions (i.e. dim > dimension) as one.
+  unsigned num_irows = 1;
+  for (unsigned dim = dimension + 1; dim < ndim; dim++) {
+    num_irows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
+  }
+
+  dim3 threads(min(512, num_irows));
+  unsigned maxGridDim = 1024;
+  dim3 grid(min(maxGridDim, num_orows), min(maxGridDim, THCCeilDiv(num_irows, threads.x)));
+
+  if (flag) {
+    THCTensor_kernel_varOuterDim<Real, true, apply_sqrt><<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
+        TensorUtils<TensorTypeK>::getData(state, tgt), TensorUtils<TensorTypeK>::getData(state, src), num_orows, num_irows, row_size);
+  } else {
+    THCTensor_kernel_varOuterDim<Real, false, apply_sqrt><<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
+        TensorUtils<TensorTypeK>::getData(state, tgt), TensorUtils<TensorTypeK>::getData(state, src), num_orows, num_irows, row_size);
+  }
+  cudaError errcode = cudaGetLastError();
+  if (errcode != cudaSuccess) {
+    THError(cudaGetErrorString(errcode));
+  }
+}
+
+/* Compute the variance (or standard deviation) of the innermost dimension of a tensor.
+ *
+ * - num_rows is the size of the flattened outer dimensions;
+ * - row_size is the size of the innermost dimension;
+ * - if flag is set, normalize by `row_size` instead of `row_size - 1`
+ * - if apply_sqrt is set, compute the standard deviation instead of variance
+ *
+ * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is
+ * considered as having 'num_rows' rows of size 'row_size'.
+ * Each thread block processes one or more sets of contiguous rows (processing multiple rows
+ * per thread block is quicker than processing a single row, especially for short rows).
+ */
+template<typename Real, bool flag, bool apply_sqrt>
+__global__ void THCTensor_kernel_varInnermostDim(Real *tgt, Real *src_, unsigned num_rows, unsigned row_size)
+{
+  __shared__ Real ssum[32][16];
+  __shared__ Real ssum2[32][16];
+
+  for (unsigned block_row = blockIdx.x * blockDim.y; block_row < num_rows; block_row += blockDim.y * gridDim.x) {
+    unsigned row = block_row + threadIdx.y;
+    Real sum = ScalarConvert<int, Real>::to(0), sum2 = ScalarConvert<int, Real>::to(0);
+    if (row < num_rows) {
+      Real *src = src_ + row * row_size;
+      // Sequential reduction within a thread.
+      for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) {
+        Real val = src[col];
+        sum = THCNumerics<Real>::add(sum, val);
+        sum2 = THCNumerics<Real>::add(sum2, THCNumerics<Real>::mul(val, val));
+      }
+    }
+    ssum[threadIdx.y][threadIdx.x] = sum;
+    ssum2[threadIdx.y][threadIdx.x] = sum2;
+    __syncthreads();
+
+    // Reduce intermediate values to single value.
+    for (unsigned s = 8; s > 1; s >>= 1) {
+      if (row < num_rows && threadIdx.x < s) {
+        ssum[threadIdx.y][threadIdx.x] =
+          THCNumerics<Real>::add(ssum[threadIdx.y][threadIdx.x], ssum[threadIdx.y][threadIdx.x + s]);
+        ssum2[threadIdx.y][threadIdx.x] =
+          THCNumerics<Real>::add(ssum2[threadIdx.y][threadIdx.x], ssum2[threadIdx.y][threadIdx.x + s]);
+      }
+      __syncthreads();
+    }
+
+    if (row < num_rows && threadIdx.x == 0) {
+      sum = THCNumerics<Real>::add(ssum[threadIdx.y][0], ssum[threadIdx.y][1]);
+      sum2 = THCNumerics<Real>::add(ssum2[threadIdx.y][0], ssum2[threadIdx.y][1]);
+      tgt[row] = THCTensor_computeVar<Real, flag, apply_sqrt>(sum, sum2, row_size);
+    }
+    __syncthreads();
+  }
+}
+
+template<typename TensorTypeK, typename Real, bool apply_sqrt>
+__host__ void THCTensor_varInnermostDim(THCState *state, TensorTypeK *tgt, TensorTypeK *src, int flag)
+{
+  unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src);
+  // Treat all outer dimensions as a single dimension.
+  unsigned num_rows = 1;
+  for (unsigned dim = 0; dim < ndim - 1; dim++) {
+    num_rows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
+  }
+  unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, ndim - 1);
+
+  // From limited testing, 16x32 seemed a good compromise for handling both long and short dimensions.
+  dim3 threads(16, 32);
+  dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y)));
+
+  if (flag) {
+    THCTensor_kernel_varInnermostDim<Real, true, apply_sqrt><<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
+        TensorUtils<TensorTypeK>::getData(state, tgt), TensorUtils<TensorTypeK>::getData(state, src), num_rows, row_size);
+  } else {
+    THCTensor_kernel_varInnermostDim<Real, false, apply_sqrt><<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
+        TensorUtils<TensorTypeK>::getData(state, tgt), TensorUtils<TensorTypeK>::getData(state, src), num_rows, row_size);
+  }
+  cudaError errcode = cudaGetLastError();
+  if (errcode != cudaSuccess) {
+    THError(cudaGetErrorString(errcode));
+  }
+}
+
+
+/* A set of reduction kernels that take in binary ops on thrust pairs (of value, index).
+   These are useful when you not only have to do a reduction, but also have to
+   preserve the index at which the winning value occurred (for example min/max operations).
+   The structure of the kernels follows the structure of the reduction kernels.
+*/
+template <typename K, typename Index, class BinaryFunction>
+__global__ void
+kernelTransformReduceOuterDimIndex(K *tgt1,
+                                   Index *tgt2,
+                                   K *src_,
+                                   unsigned num_orows,
+                                   unsigned num_irows,
+                                   unsigned row_size,
+                                   thrust::pair<K, Index> init,
+                                   BinaryFunction binary_op) {
+  for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
+    for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x;
+         irow < num_irows;
+         irow += gridDim.y * blockDim.x) {
+      K *src = src_ + orow * row_size * num_irows + irow;
+      thrust::pair<K, Index> acc = init;
+
+      for (unsigned col = 0; col < row_size; ++col) {
+        // +1 for Lua index
+        acc = binary_op(thrust::make_pair<K, Index>(*src, col + TH_INDEX_BASE),
+                        acc);
+        src += num_irows;
+      }
+
+      tgt1[orow * num_irows + irow] = acc.first;
+      tgt2[orow * num_irows + irow] = acc.second;
+    }
+  }
+}
+
+template <typename TensorTypeK,
+          typename TensorTypeIndex,
+          typename BinaryFunction>
+__host__ void
+THC_transformReduceOuterDimIndex(THCState *state,
+                                 TensorTypeK *tgt1,
+                                 TensorTypeIndex *tgt2,
+                                 TensorTypeK *src,
+                                 long rdim,
+                                 const thrust::pair<
+                                 typename TensorUtils<TensorTypeK>::DataType,
+                                 typename TensorUtils<TensorTypeIndex>::DataType>& init,
+                                 BinaryFunction binary_op) {
+  unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src);
+  unsigned num_orows = 1;
+  for (long dim = 0; dim < rdim; dim++) {
+    num_orows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
+  }
+  unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, rdim);
+  unsigned num_irows = 1;
+  for (unsigned dim = rdim + 1; dim < ndim; dim++) {
+    num_irows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
+  }
+
+  dim3 threads(min(512, num_irows));
+  unsigned maxGridDim = 1024;
+  dim3 grid(min(maxGridDim, num_orows),
+            min(maxGridDim, THCCeilDiv(num_irows, threads.x)));
+
+  kernelTransformReduceOuterDimIndex
+    <<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
+      TensorUtils<TensorTypeK>::getData(state, tgt1),
+      TensorUtils<TensorTypeIndex>::getData(state, tgt2),
+      TensorUtils<TensorTypeK>::getData(state, src),
+      num_orows, num_irows, row_size, init, binary_op);
+
+  THCudaCheck(cudaGetLastError());
+}
+
+/* Reduce the innermost dimension of a tensor (using binary ops on thrust::pair values of (value, index))
+ *
+ * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension:
+ *
+ * - block.x is the innermost dimension, i.e. dimension 0;
+ * - block.y and grid.y make up dimension 1; and
+ * - grid.x and grid.z are the remaining two outer dimensions (if any)
+ *
+ * Reduction along other dimensions is handled in a separate kernel.
+ */
+template <typename K, typename Index, class BinaryFunction>
+__global__ void
+kernelTransformReduceInnermostDimIndex(K *tgt1,
+                                       Index* tgt2,
+                                       K *src_,
+                                       unsigned num_rows,
+                                       unsigned row_size,
+                                       thrust::pair<K, Index> init,
+                                       BinaryFunction binary_op) {
+  __shared__ K sbuf[32][16 + 1]; // avoid bank conflict
+  __shared__ Index ibuf[32][16 + 1]; // avoid bank conflict
+
+  for (unsigned block_row = blockIdx.x * blockDim.y;
+       block_row < num_rows;
+       block_row += blockDim.y * gridDim.x) {
+    unsigned row = block_row + threadIdx.y;
+    thrust::pair<K, Index> acc = init;
+    if (row < num_rows) {
+      K *src = src_ + row * row_size;
+      // Sequential reduction within a thread.
+      for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) {
+        acc = binary_op(thrust::make_pair<K, Index>(src[col], col + TH_INDEX_BASE), acc);
+      }
+    }
+
+    sbuf[threadIdx.y][threadIdx.x] = acc.first;
+    ibuf[threadIdx.y][threadIdx.x] = acc.second;
+
+    __syncthreads();
+
+    // Reduce intermediate values to single value.
+    K* sline = &sbuf[threadIdx.y][0];
+    Index* iline = &ibuf[threadIdx.y][0];
+    for (unsigned s = 8; s > 0; s >>= 1) {
+      if (row < num_rows && threadIdx.x < s) {
+        thrust::pair<K, Index> arg1 =
+          thrust::make_pair<K, Index>(sline[threadIdx.x], iline[threadIdx.x]);
+        thrust::pair<K, Index> arg2 =
+          thrust::make_pair<K, Index>(sline[threadIdx.x + s], iline[threadIdx.x + s]);
+        thrust::pair<K, Index> res = binary_op(arg1, arg2);
+
+        sline[threadIdx.x] = res.first;
+        iline[threadIdx.x] = res.second;
+      }
+      __syncthreads();
+    }
+
+    if (row < num_rows && threadIdx.x == 0) {
+      tgt1[row] = sline[0];
+      tgt2[row] = iline[0];
+    }
+    __syncthreads();
+  }
+}
+
+template <typename TensorTypeK,
+          typename TensorTypeIndex,
+          typename BinaryFunction>
+__host__ void
+THC_transformReduceInnermostDimIndex(THCState *state,
+                                     TensorTypeK *tgt1,
+                                     TensorTypeIndex *tgt2,
+                                     TensorTypeK *src,
+                                     const thrust::pair<
+                                     typename TensorUtils<TensorTypeK>::DataType,
+                                     typename TensorUtils<TensorTypeIndex>::DataType>& init,
+                                     BinaryFunction binary_op) {
+  unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src);
+  unsigned num_rows = 1;
+  for (unsigned dim = 0; dim < ndim - 1; dim++) {
+    num_rows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
+  }
+  unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, ndim - 1);
+
+  dim3 threads(16, 32);
+  dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y)));
+
+  kernelTransformReduceInnermostDimIndex
+    <<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
+      TensorUtils<TensorTypeK>::getData(state, tgt1),
+      TensorUtils<TensorTypeIndex>::getData(state, tgt2),
+      TensorUtils<TensorTypeK>::getData(state, src),
+      num_rows, row_size, init, binary_op);
+
+  THCudaCheck(cudaGetLastError());
+}
+
+template <typename TensorTypeK,
+          typename TensorTypeIndex,
+          typename BinaryFunction>
+void
+THC_reduceDimIndex(THCState *state,
+                   TensorTypeK *tgt1_,
+                   TensorTypeIndex *tgt2_,
+                   TensorTypeK *src,
+                   long dimension,
+                   const thrust::pair<
+                   typename TensorUtils<TensorTypeK>::DataType,
+                   typename TensorUtils<TensorTypeIndex>::DataType>& init,
+                   BinaryFunction binary_op)
+{
+  THArgCheck(dimension >= 0 &&
+             dimension < TensorUtils<TensorTypeK>::getDims(state, src),
+             3, "dimension out of range");
+
+  THLongStorage *dim = TensorUtils<TensorTypeK>::newSizeOf(state, src);
+  THLongStorage_set(dim, dimension, 1);
+  TensorUtils<TensorTypeK>::resize(state, tgt1_, dim, NULL);
+  TensorUtils<TensorTypeIndex>::resize(state, tgt2_, dim, NULL);
+  THLongStorage_free(dim);
+
+  TensorTypeK *tgt1 = TensorUtils<TensorTypeK>::newContiguous(state, tgt1_);
+  TensorTypeIndex *tgt2 = TensorUtils<TensorTypeIndex>::newContiguous(state, tgt2_);
+  src = TensorUtils<TensorTypeK>::newContiguous(state, src);
+
+  if (dimension == TensorUtils<TensorTypeK>::getDims(state, src) - 1) {
+    THC_transformReduceInnermostDimIndex(state, tgt1, tgt2, src, init, binary_op);
+  } else {
+    THC_transformReduceOuterDimIndex(state, tgt1, tgt2, src, dimension, init, binary_op);
+  }
+
+  TensorUtils<TensorTypeK>::free(state, src);
+  TensorUtils<TensorTypeK>::freeCopyTo(state, tgt1, tgt1_);
+  TensorUtils<TensorTypeIndex>::freeCopyTo(state, tgt2, tgt2_);
+}
+
+template <typename T, typename Index>
+struct MaxValuePair {
+  __host__ __device__
+  thrust::pair<T, Index> operator()(const thrust::pair<T, Index>& a,
+                                    const thrust::pair<T, Index>& b) {
+    return THCNumerics<T>::ge(a.first, b.first) ? a : b;
+  }
+};
+
+template <typename T, typename Index>
+struct MinValuePair {
+  __host__ __device__
+  thrust::pair<T, Index> operator()(const thrust::pair<T, Index>& a,
+                                    const thrust::pair<T, Index>& b) {
+    return THCNumerics<T>::le(a.first, b.first) ? a : b;
+  }
+};
+
+template <typename T>
+struct AddOp {
+  __device__ __forceinline__ T operator()(T &lhs, T &rhs) {
+    return THCNumerics<T>::add(lhs, rhs);
+  }
+};
+
+template <typename T>
+struct MulOp {
+  __device__ __forceinline__ T operator()(T &lhs, T &rhs) {
+    return THCNumerics<T>::mul(lhs, rhs);
+  }
+};
+
+#endif // THC_TENSORMATH_REDUCE_CUH
diff --git a/lib/THC/THCTensorMathScan.cu b/lib/THC/THCTensorMathScan.cu
new file mode 100644
index 0000000..3345e25
--- /dev/null
+++ b/lib/THC/THCTensorMathScan.cu
@@ -0,0 +1,127 @@
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCBlas.h"
+#include "THCTensorCopy.h"
+#include "THCApply.cuh"
+#include "THCReduce.cuh"
+#include "THCNumerics.cuh"
+#include "THCTensorMathReduce.cuh"
+
+/* Perform an inclusive scan along an outer dimension of a tensor.
+ *
+ * - num_orows is the size of the flattened outer dimensions;
+ * - num_irows is the size of the flattened inner dimensions;
+ * - row_size is the size of the dimension along which to scan;
+ *
+ * The dimensions to the outside and inside of the specified dimension are considered as flattened.
+ * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened
+ * outer dimensions, which contains several "inner rows").
+ * Each thread processes a single inner row at a time.
+ */
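+// E.g. a cumulative sum along the dimension corresponds to init == 0 with an
+// addition binary_op, and a cumulative product to init == 1 with a
+// multiplication binary_op.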
+template<typename T, class BinaryOp>
+__global__ void THCTensor_kernel_scanOuterDim(T *tgt_, T *src_,
+                                                 unsigned num_orows, unsigned num_irows, unsigned row_size,
+                                                 T init, BinaryOp binary_op)
+{
+  for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
+    for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) {
+      T *src = src_ + orow * row_size * num_irows + irow;
+      T *tgt = tgt_ + orow * row_size * num_irows + irow;
+      T acc = init;
+
+      for (unsigned col = 0; col < row_size; ++col) {
+        acc = binary_op(acc, *src);
+        *tgt = acc;
+
+        src += num_irows;
+        tgt += num_irows;
+      }
+    }
+  }
+}
+
+/* Perform an inclusive scan along the innermost dimension of a tensor.
+ *
+ * - num_rows is the size of the flattened outer dimensions;
+ * - row_size is the size of the innermost dimension;
+ *
+ * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is
+ * considered as having 'num_rows' rows of size 'row_size'.
+ * Each thread block processes one or more sets of contiguous rows (processing multiple rows
+ * per thread block is quicker than processing a single row, especially for short rows).
+ */
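+/* Worked example of the up-sweep / down-sweep below, taking num_threads_x == 2
+ * so one block column holds 2 * num_threads_x == 4 values [a, b, c, d]:
+ *   up-sweep   (d=1): buf = [a, a+b, c, c+d]
+ *   up-sweep   (d=2): buf = [a, a+b, c, a+b+c+d]
+ *   down-sweep (d=1): buf = [a, a+b, a+b+c, a+b+c+d]
+ * i.e. the buffer ends up holding the inclusive scan of its inputs. */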
+template<typename T, int num_threads_x, int num_threads_y, class BinaryFunction>
+__global__ void THCTensor_kernel_scanInnermostDim(T *tgt_, T *src_,
+                                                     unsigned num_rows, unsigned row_size,
+                                                     T init, BinaryFunction binary_op)
+{
+  __shared__ T sbuf[num_threads_y][2 * num_threads_x];
+
+  T* row_buf = sbuf[threadIdx.y];
+
+  for (unsigned block_row = blockIdx.x * blockDim.y;
+       block_row < num_rows;
+       block_row += blockDim.y * gridDim.x) {
+    unsigned row = block_row + threadIdx.y;
+    T block_total = init;
+
+    T *row_src = src_ + row * row_size;
+    T *row_tgt = tgt_ + row * row_size;
+
+    // Perform scan on one block at a time, keeping track of the total value of
+    // all blocks processed so far.
+    for (unsigned block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) {
+      // Load data into shared memory (two values per thread).
+      unsigned col1 = block_col + threadIdx.x;
+      unsigned col2 = block_col + num_threads_x + threadIdx.x;
+      if (row < num_rows) {
+        if (col1 < row_size) {
+          row_buf[threadIdx.x] = row_src[col1];
+        } else {
+          row_buf[threadIdx.x] = init;
+        }
+
+        if (col2 < row_size) {
+          row_buf[num_threads_x + threadIdx.x] = row_src[col2];
+        } else {
+          row_buf[num_threads_x + threadIdx.x] = init;
+        }
+
+        // Add the total value of all previous blocks to the first value of this block.
+        if (threadIdx.x == 0) {
+          row_buf[0] = binary_op(row_buf[0], block_total);
+        }
+      }
+      __syncthreads();
+
+      // Parallel reduction (up-sweep).
+      for (unsigned s = num_threads_x, d = 1; s >= 1; s >>= 1, d <<= 1) {
+        if (row < num_rows && threadIdx.x < s) {
+          unsigned offset = (2 * threadIdx.x + 1) * d - 1;
+          row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]);
+        }
+        __syncthreads();
+      }
+
+      // Down-sweep.
+      for (unsigned s = 2, d = num_threads_x / 2; d >= 1; s <<= 1, d >>= 1) {
+        if (row < num_rows && threadIdx.x < s - 1) {
+          unsigned offset = 2 * (threadIdx.x + 1) * d - 1;
+          row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]);
+        }
+        __syncthreads();
+      }
+
+      // Write back to output.
+      if (row < num_rows) {
+        if (col1 < row_size) row_tgt[col1] = row_buf[threadIdx.x];
+        if (col2 < row_size) row_tgt[col2] = row_buf[num_threads_x + threadIdx.x];
+      }
+      block_total = row_buf[2 * num_threads_x - 1];
+      __syncthreads();
+    }
+  }
+}
+
+#include "generic/THCTensorMathScan.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorRandom.cpp b/lib/THC/THCTensorRandom.cpp
new file mode 100644
index 0000000..d7690b5
--- /dev/null
+++ b/lib/THC/THCTensorRandom.cpp
@@ -0,0 +1,133 @@
+#include "THCTensorRandom.h"
+
+#include <random>
+#include <curand.h>
+
+
+void initializeGenerator(THCState *state, Generator* gen);
+void createGeneratorState(Generator* gen, unsigned long long seed);
+
+
+/* Frees memory allocated during setup. */
+void destroyGenerator(THCState *state, Generator* gen)
+{
+  if (gen->gen_states)
+  {
+    THCudaCheck(THCudaFree(state, gen->gen_states));
+    gen->gen_states = NULL;
+  }
+  if (gen->kernel_params)
+  {
+    THCudaCheck(THCudaFree(state, gen->kernel_params));
+    gen->kernel_params = NULL;
+  }
+}
+
+static unsigned long long createSeed(std::random_device& rd)
+{
+  // limit to 53 bits to ensure unique representation in double
+  unsigned long long seed = (((unsigned long long)rd()) << 32) + rd();
+  return seed & 0x1FFFFFFFFFFFFF;
+}
+
+/* Initialize generator array (must be called before any other function) */
+void THCRandom_init(THCState* state, int devices, int current_device)
+{
+  THCRNGState* rng_state = THCState_getRngState(state);
+  rng_state->num_devices = devices;
+  rng_state->gen = (Generator*)malloc(rng_state->num_devices * sizeof(Generator));
+  std::random_device rd;
+  for (int i = 0; i < rng_state->num_devices; ++i)
+  {
+    rng_state->gen[i].initf = 0;
+    rng_state->gen[i].initial_seed = createSeed(rd);
+    rng_state->gen[i].gen_states = NULL;
+    rng_state->gen[i].kernel_params = NULL;
+  }
+}
+
+/* Destroy generators and free memory */
+void THCRandom_shutdown(THCState* state)
+{
+  THCRNGState* rng_state = THCState_getRngState(state);
+  if (rng_state->gen == NULL) return;
+  for (int i = 0; i < rng_state->num_devices; ++i)
+  {
+    destroyGenerator(state, &rng_state->gen[i]);
+  }
+  free(rng_state->gen);
+  rng_state->gen = NULL;
+}
+
+/* Gets the generator for the current device, but does not initialize the state */
+static Generator* THCRandom_rawGenerator(THCState* state)
+{
+  THCRNGState* rng_state = THCState_getRngState(state);
+  int device;
+  THCudaCheck(cudaGetDevice(&device));
+  if (device >= rng_state->num_devices) THError("Invalid device index.");
+  return &rng_state->gen[device];
+}
+
+/* Gets the generator for the current device and initializes it if necessary */
+Generator* THCRandom_getGenerator(THCState* state)
+{
+  Generator* gen = THCRandom_rawGenerator(state);
+  if (gen->initf == 0)
+  {
+    initializeGenerator(state, gen);
+    createGeneratorState(gen, gen->initial_seed);
+    gen->initf = 1;
+  }
+  return gen;
+}
+
+struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state)
+{
+  return THCRandom_getGenerator(state)->gen_states;
+}
+
+/* Random seed */
+unsigned long long THCRandom_seed(THCState* state)
+{
+  std::random_device rd;
+  unsigned long long s = createSeed(rd);
+  THCRandom_manualSeed(state, s);
+  return s;
+}
+
+unsigned long long THCRandom_seedAll(THCState* state)
+{
+  std::random_device rd;
+  unsigned long long s = createSeed(rd);
+  THCRandom_manualSeedAll(state, s);
+  return s;
+}
+
+/* Manually set the seed */
+void THCRandom_manualSeed(THCState* state, unsigned long long seed)
+{
+  Generator* gen = THCRandom_rawGenerator(state);
+  gen->initial_seed = seed;
+  if (gen->initf) {
+    createGeneratorState(gen, seed);
+  }
+}
+
+void THCRandom_manualSeedAll(THCState* state, unsigned long long seed)
+{
+  THCRNGState* rng_state = THCState_getRngState(state);
+  int currentDevice;
+  THCudaCheck(cudaGetDevice(&currentDevice));
+  for (int i = 0; i < rng_state->num_devices; ++i) {
+    THCudaCheck(cudaSetDevice(i));
+    THCRandom_manualSeed(state, seed);
+  }
+  THCudaCheck(cudaSetDevice(currentDevice));
+}
+
+/* Get the initial seed */
+unsigned long long THCRandom_initialSeed(THCState* state)
+{
+  return THCRandom_getGenerator(state)->initial_seed;
+}
diff --git a/lib/THC/THCTensorRandom.cu b/lib/THC/THCTensorRandom.cu
new file mode 100644
index 0000000..78f4dc9
--- /dev/null
+++ b/lib/THC/THCTensorRandom.cu
@@ -0,0 +1,156 @@
+#include "THCTensorRandom.h"
+#include "THCDeviceUtils.cuh"
+#include "THCGeneral.h"
+#include "THCTensorCopy.h"
+#include "THCTensorMath.h"
+#include "THCReduceApplyUtils.cuh"
+#include "THCTensorRandom.cuh"
+
+#include <thrust/functional.h>
+#include <curand.h>
+#include <curand_kernel.h>
+#include <curand_mtgp32_host.h>
+#include <curand_mtgp32dc_p_11213.h>
+
+#define MAX_NUM_BLOCKS 64
+#define BLOCK_SIZE 256
+
+
+Generator* THCRandom_getGenerator(THCState* state);
+
+/* Sets up generator. Allocates but does not create the generator states. */
+__host__ void initializeGenerator(THCState *state, Generator* gen)
+{
+  THCudaCheck(THCudaMalloc(state, (void**)&gen->gen_states, MAX_NUM_BLOCKS * sizeof(curandStateMtgp32)));
+  THCudaCheck(THCudaMalloc(state, (void**)&gen->kernel_params, sizeof(mtgp32_kernel_params)));
+}
+
+/* Creates a new generator state given the seed. */
+__host__ void createGeneratorState(Generator* gen, unsigned long long seed)
+{
+  if (curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, gen->kernel_params) != CURAND_STATUS_SUCCESS)
+  {
+    THError("Creating MTGP constants failed.");
+  }
+  if (curandMakeMTGP32KernelState(gen->gen_states, mtgp32dc_params_fast_11213,
+                                  gen->kernel_params, MAX_NUM_BLOCKS, seed) != CURAND_STATUS_SUCCESS)
+  {
+    THError("Creating MTGP kernel state failed.");
+  }
+}
+
+__host__ void THCRandom_getRNGState(THCState* state, THByteTensor *rng_state)
+{
+  Generator* gen = THCRandom_getGenerator(state);
+
+  // The RNG state comprises the MTGP32 states and the seed.
+  static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32);
+  static const size_t seed_size = sizeof(gen->initial_seed);
+  static const size_t total_size = states_size + seed_size;
+  THByteTensor_resize1d(rng_state, total_size);
+  THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size");
+  THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous");
+  THCudaCheck(cudaMemcpy(THByteTensor_data(rng_state), gen->gen_states,
+                         states_size, cudaMemcpyDeviceToHost));
+  memcpy(THByteTensor_data(rng_state) + states_size, &gen->initial_seed, seed_size);
+}
+
+__global__ void set_rngstate_kernel(curandStateMtgp32 *state, mtgp32_kernel_params *kernel)
+{
+  state[threadIdx.x].k = kernel;
+}
+
+__host__ void THCRandom_setRNGState(THCState* state, THByteTensor *rng_state)
+{
+  Generator* gen = THCRandom_getGenerator(state);
+
+  static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32);
+  static const size_t seed_size = sizeof(gen->initial_seed);
+  static const size_t total_size = states_size + seed_size;
+  THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size");
+  THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous");
+
+  THCudaCheck(cudaMemcpy(gen->gen_states, THByteTensor_data(rng_state),
+                         states_size, cudaMemcpyHostToDevice));
+  set_rngstate_kernel<<<1, MAX_NUM_BLOCKS, 0, THCState_getCurrentStream(state)>>>(
+      gen->gen_states, gen->kernel_params);
+  memcpy(&gen->initial_seed, THByteTensor_data(rng_state) + states_size, seed_size);
+}
+
+#define GENERATE_KERNEL1(NAME, T, ARG1, CURAND_T, CURAND_FUNC, TRANSFORM)      \
+__global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1)      \
+{                                                                              \
+  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;                             \
+  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;                \
+  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {      \
+    CURAND_T x = CURAND_FUNC(&state[blockIdx.x]);                              \
+    if (i < size) {                                                            \
+      T y = TRANSFORM;                                                         \
+      result[i] = y;                                                           \
+    }                                                                          \
+  }                                                                            \
+}
+
+#define GENERATE_KERNEL2(NAME, T, ARG1, ARG2, CURAND_T, CURAND_FUNC, TRANSFORM)      \
+__global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1, ARG2)      \
+{                                                                                    \
+  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;                                   \
+  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;                      \
+  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {            \
+    CURAND_T x = CURAND_FUNC(&state[blockIdx.x]);                                    \
+    if (i < size) {                                                                  \
+      T y = TRANSFORM;                                                               \
+      result[i] = y;                                                                 \
+    }                                                                                \
+  }                                                                                  \
+}
+
+template<typename T, typename U>
+struct is_same { static const bool value = false; };
+
+template<typename T>
+struct is_same<T, T> { static const bool value = true; };
+
+template<typename real, typename prob_type>
+__global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size,
+        real *result, prob_type *probs)
+{
+  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
+  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
+    if (is_same<prob_type, double>::value) {
+      double x = curand_uniform_double(&state[blockIdx.x]);
+      if (i < size)
+        result[i] = ScalarConvert<bool, real>::to(x <= probs[i]);
+    } else {
+      float x = curand_uniform(&state[blockIdx.x]);
+      if (i < size)
+        result[i] = ScalarConvert<bool, real>::to(x <= probs[i]);
+    }
+  }
+}
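+
+/*
+ * Note (added comment): the hand-rolled is_same above stands in for
+ * std::is_same, presumably so this file does not depend on C++11
+ * <type_traits>. Because is_same<prob_type, double>::value is a compile-time
+ * constant, the branch above is resolved at compile time and only the curand
+ * flavour matching the instantiated prob_type survives in the generated code.
+ */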
+
+GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, x * (b-a) + a)
+GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, x * (b-a) + a)
+
+GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean)
+GENERATE_KERNEL2(generate_normal, double, double mean, double stdv, double, curand_normal_double, (x * stdv) + mean)
+
+GENERATE_KERNEL1(generate_exponential, float, double lambda, float, curand_uniform, (float)(-1. / lambda * log(1-x)))
+GENERATE_KERNEL1(generate_exponential, double, double lambda, double, curand_uniform_double, (double)(-1. / lambda * log(1-x)))
+
+GENERATE_KERNEL2(generate_cauchy, float, double median, double sigma, float, curand_uniform, (float)(median + sigma * tan(M_PI*(x-0.5))))
+GENERATE_KERNEL2(generate_cauchy, double, double median, double sigma, double, curand_uniform_double, (double)(median + sigma * tan(M_PI*(x-0.5))))
+
+#ifdef CUDA_HALF_TENSOR
+GENERATE_KERNEL2(generate_uniform, half, double a, double b, float, curand_uniform, (ScalarConvert<float, half>::to(x * (b-a) + a)))
+GENERATE_KERNEL2(generate_normal, half, double mean, double stdv, float, curand_normal, (ScalarConvert<float, half>::to((x * stdv) + mean)))
+GENERATE_KERNEL1(generate_exponential, half, double lambda, float, curand_uniform, (ScalarConvert<float, half>::to((float)(-1. / lambda * log(1-x)))))
+GENERATE_KERNEL2(generate_cauchy, half, double median, double sigma, float, curand_uniform, (ScalarConvert<float, half>::to((float)(median + sigma * tan(M_PI*(x-0.5))))))
+#endif // CUDA_HALF_TENSOR
+
+#include "generic/THCTensorRandom.cu"
+#include "THCGenerateAllTypes.h"
+
+#undef GENERATE_KERNEL1
+#undef GENERATE_KERNEL2
diff --git a/lib/THC/THCTensorRandom.cuh b/lib/THC/THCTensorRandom.cuh
new file mode 100644
index 0000000..d78409f
--- /dev/null
+++ b/lib/THC/THCTensorRandom.cuh
@@ -0,0 +1,282 @@
+#ifndef THC_TENSOR_RANDOM_CUH
+#define THC_TENSOR_RANDOM_CUH
+
+#include "THCNumerics.cuh"
+#include "THCReduceApplyUtils.cuh"
+#include "THCTensorMathReduce.cuh"
+
+#include <curand_kernel.h>
+
+#define MAX_NUM_BLOCKS 64
+#define BLOCK_SIZE 256
+/* Separate kernel because curand_log_normal gets extra parameters. */
+
+template <typename T>
+__global__ void generateLogNormal(curandStateMtgp32 *state, int size, T *result, double mean, double stddev)
+{
+  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
+  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
+    float x = curand_log_normal(&state[blockIdx.x], mean, stddev);
+    if (i < size) {
+      result[i] = ScalarConvert<float, T>::to(x);
+    }
+  }
+}
+
+template <>
+__global__ void generateLogNormal<double>(curandStateMtgp32 *state, int size, double *result, double mean, double stddev)
+{
+  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
+  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
+    double x = curand_log_normal_double(&state[blockIdx.x], mean, stddev);
+    if (i < size) {
+      result[i] = x;
+    }
+  }
+}
+
+#undef MAX_NUM_BLOCKS
+#undef BLOCK_SIZE
+
+// Normalizes the L1 norm of every row to 1; used by multinomial
+template <typename T>
+__global__ void renormRowsL1(T* dist, long rows, long cols) {
+  extern __shared__ __align__(sizeof(T)) unsigned char my_smem[];
+  T *smem = reinterpret_cast<T *>(my_smem);
+
+  for (long row = blockIdx.x; row < rows; row += gridDim.x) {
+    T sum = ScalarConvert<int, T>::to(0);
+    for (long col = threadIdx.x; col < cols; col += blockDim.x) {
+      sum = THCNumerics<T>::add(sum, dist[row * cols + col]);
+    }
+
+    sum = reduceBlock(smem, blockDim.x, sum, ReduceAdd<T, T>(), ScalarConvert<int, T>::to(0));
+    if (threadIdx.x == 0) {
+      smem[0] = sum;
+    }
+    __syncthreads();
+
+    sum = smem[0];
+    if (THCNumerics<T>::gt(sum, ScalarConvert<int, T>::to(0))) {
+      for (long col = threadIdx.x; col < cols; col += blockDim.x) {
+        dist[row * cols + col] = THCNumerics<T>::div(dist[row * cols + col], sum);
+      }
+    }
+  }
+}
+
+template <typename T>
+__device__ int binarySearchForMultinomial(T* dist,
+                                          int size,
+                                          T val) {
+  int start = 0;
+  int end = size;
+
+  while (end - start > 0) {
+    int mid = start + (end - start) / 2;
+
+    T midVal = dist[mid];
+    if (THCNumerics<T>::lt(midVal, val)) {
+      start = mid + 1;
+    } else {
+      end = mid;
+    }
+  }
+
+  if (start == size) {
+    // No probability mass or precision problems; just return the
+    // first element
+    start = 0;
+  }
+
+  T curVal = dist[start];
+  while(start >= 1 && THCNumerics<T>::eq(dist[start - 1], curVal)) start--;
+
+  return start;
+}
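+
+/*
+ * Worked example (comment only): `dist` is the inclusive prefix sum (CDF) of
+ * a normalized row, e.g. dist = {0.1, 0.4, 1.0}. For val = 0.35 the search
+ * converges to start = 1 (0.1 < 0.35 <= 0.4); for val = 0.05 it returns 0.
+ * The trailing while-loop steps back over equal CDF entries (i.e. categories
+ * with zero probability), returning the smallest index with that cumulative
+ * value.
+ */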
+
+template <typename T>
+__global__ void
+sampleMultinomialOnce(long* dest,
+                      long distributions,
+                      int categories,
+                      T* sampled,
+                      T* dist) {
+  extern __shared__ __align__(sizeof(T)) unsigned char my_smem[];
+  T *smem = reinterpret_cast<T *>(my_smem);
+  T zero = ScalarConvert<int, T>::to(0);
+
+  for (long curDist = blockIdx.x;
+       curDist < distributions; curDist += gridDim.x) {
+    // Each block handles one distribution
+    // First pass, find the total sum of the distribution
+    T sum = zero;
+    for (int cat = threadIdx.x; cat < categories; cat += blockDim.x) {
+      sum = THCNumerics<T>::add(sum, dist[curDist * categories + cat]);
+    }
+
+    // After the reduction, threadIdx.x == 0 holds the block-wide sum
+    sum = reduceBlock(smem, blockDim.x, sum, ReduceAdd<T, T>(), zero);
+
+    // Broadcast sum and sample value
+    if (threadIdx.x == 0) {
+      smem[0] = sum;
+      smem[1] = sampled[curDist];
+    }
+    __syncthreads();
+
+    sum = smem[0];
+    T sample = smem[1];
+    __syncthreads();
+
+    if (THCNumerics<T>::eq(sum,  zero) || THCNumerics<T>::eq(sample, zero)) {
+      // Choose the first element
+      if (threadIdx.x == 0) {
+        dest[curDist] = 1;
+      }
+
+      continue;
+    }
+
+    int chunks = THCCeilDiv(categories, (int) blockDim.x);
+    T prevHighProb = zero;
+
+    for (int chunk = 0; chunk < chunks; ++chunk) {
+      // All threads in bounds load a value
+      int cat = chunk * blockDim.x + threadIdx.x;
+
+      T val =
+        cat < categories ? THCNumerics<T>::div(dist[curDist * categories + cat], sum) :
+        zero;
+
+      smem[threadIdx.x] = val;
+      __syncthreads();
+
+      // Perform an inclusive prefix sum of the shared memory contents
+      for (int offset = 1; offset < blockDim.x; offset *= 2) {
+        T val = zero;
+
+        if (threadIdx.x >= offset) {
+          val = THCNumerics<T>::add(smem[threadIdx.x - offset], smem[threadIdx.x]);
+        }
+
+        __syncthreads();
+        if (threadIdx.x >= offset) {
+          smem[threadIdx.x] = val;
+        }
+        __syncthreads();
+      }
+
+      // Each thread will check to see if the sample falls in its
+      // bucket
+      T curBucket = THCNumerics<T>::add(smem[threadIdx.x], prevHighProb);
+      T prevBucket =
+        threadIdx.x == 0 ? prevHighProb :
+        THCNumerics<T>::add(smem[threadIdx.x - 1], prevHighProb);
+      bool inBucket =
+        (cat < categories) &&
+        (!THCNumerics<T>::gt(sample, curBucket)) &&
+        (THCNumerics<T>::gt(sample, prevBucket));
+
+      if (inBucket) {
+        // We're done; we have the sample
+        // Torch indices are 1-based
+        // FIXME: broadcast exit flag?
+        dest[curDist] = cat + TH_INDEX_BASE;
+      }
+
+      // Store the previous scan's high value for future use
+      prevHighProb = THCNumerics<T>::add(prevHighProb, smem[blockDim.x - 1]);
+
+      __syncthreads();
+    }
+  }
+}
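+
+/*
+ * Note on the chunked scan above (descriptive comment): each chunk of
+ * blockDim.x normalized probabilities is scanned in shared memory with a
+ * Hillis-Steele inclusive prefix sum (offsets 1, 2, 4, ...), and prevHighProb
+ * carries the running total across chunks. Small example with blockDim.x = 4:
+ * smem = {0.1, 0.2, 0.3, 0.4} scans to {0.1, 0.3, 0.6, 1.0}; a sample of 0.45
+ * falls in thread 2's bucket (0.3, 0.6], so category 2 + TH_INDEX_BASE is
+ * written for that distribution.
+ */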
+
+template <typename T>
+__global__ void
+sampleMultinomialWithReplacement(curandStateMtgp32* state,
+                                 int totalSamples,
+                                 long* dest,
+                                 long distributions,
+                                 int categories,
+                                 T* normDistPrefixSum) {
+  // At the moment, each warp computes one sample value in the binary
+  // search due to divergence. It seems possible to compute multiple
+  // values and limit divergence later on, though. However, no matter
+  // what, all block threads must participate in the curand_uniform
+  // call to update the generator state.
+
+  // The block determines the distribution for which we generate a point
+  for (long curDist = blockIdx.x;
+       curDist < distributions;
+       curDist += gridDim.x) {
+    for (int sampleBase = 0;
+         sampleBase < totalSamples; sampleBase += blockDim.y) {
+      // The warp determines the sample
+      int sample = sampleBase + threadIdx.y;
+
+      // All threads participate in this
+      T r = ScalarConvert<float, T>::to(curand_uniform(&state[blockIdx.x]));
+
+      if (threadIdx.x == 0 && sample < totalSamples) {
+        // Find the bucket that a uniform sample lies in
+        int choice = binarySearchForMultinomial<T>(
+          normDistPrefixSum + curDist * categories,
+          categories,
+          r);
+
+        // Torch indices are 1-based
+        dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE;
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void
+sampleMultinomialWithoutReplacement(curandStateMtgp32* state,
+                                    int totalSamples,
+                                    int sample,
+                                    long* dest,
+                                    long distributions,
+                                    int categories,
+                                    T* origDist,
+                                    T* normDistPrefixSum) {
+  // At the moment, each warp computes one sample value in the binary
+  // search due to divergence. It seems possible to compute multiple
+  // values and limit divergence later on, though. However, no matter
+  // what, all block threads must participate in the curand_uniform
+  // call to update the generator state.
+
+  // The block and warp determine the distribution for which we
+  // generate a point
+  for (long curDistBase = blockIdx.x * blockDim.y;
+       curDistBase < distributions;
+       curDistBase += gridDim.x * blockDim.y) {
+    // The warp determines the distribution
+    long curDist = curDistBase + threadIdx.y;
+
+    // All threads must participate in this
+    T r = ScalarConvert<float, T>::to(curand_uniform(&state[blockIdx.x]));
+
+    if (threadIdx.x == 0 && curDist < distributions) {
+      // Find the bucket that a uniform sample lies in
+      int choice = binarySearchForMultinomial<T>(
+        normDistPrefixSum + curDist * categories,
+        categories,
+        r);
+
+      // Torch indices are 1-based
+      dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE;
+
+      // Without replacement, so update the original probability so it
+      // is not considered a second time
+      origDist[curDist * categories + choice] = ScalarConvert<int, T>::to(0);
+    }
+  }
+}
+
+#endif // THC_TENSOR_RANDOM_CUH
diff --git a/lib/THC/THCTensorRandom.h b/lib/THC/THCTensorRandom.h
new file mode 100644
index 0000000..197a53c
--- /dev/null
+++ b/lib/THC/THCTensorRandom.h
@@ -0,0 +1,37 @@
+#ifndef TH_CUDA_TENSOR_RANDOM_INC
+#define TH_CUDA_TENSOR_RANDOM_INC
+
+#include "THCTensor.h"
+
+#include "generic/THCTensorRandom.h"
+#include "THCGenerateAllTypes.h"
+
+/* Generator */
+typedef struct _Generator {
+  struct curandStateMtgp32* gen_states;
+  struct mtgp32_kernel_params *kernel_params;
+  int initf;
+  unsigned long long initial_seed;
+} Generator;
+
+typedef struct THCRNGState {
+  /* One generator per GPU */
+  Generator* gen;
+  int num_devices;
+} THCRNGState;
+
+struct THCState;
+
+THC_API void THCRandom_init(struct THCState *state, int num_devices, int current_device);
+THC_API void THCRandom_shutdown(struct THCState *state);
+THC_API unsigned long long THCRandom_seed(struct THCState *state);
+THC_API unsigned long long THCRandom_seedAll(struct THCState *state);
+THC_API void THCRandom_manualSeed(struct THCState *state, unsigned long long the_seed_);
+THC_API void THCRandom_manualSeedAll(struct THCState *state, unsigned long long the_seed_);
+THC_API unsigned long long THCRandom_initialSeed(struct THCState *state);
+THC_API void THCRandom_getRNGState(struct THCState *state, THByteTensor *rng_state);
+THC_API void THCRandom_setRNGState(struct THCState *state, THByteTensor *rng_state);
+
+THC_API struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state);
+
+#endif
diff --git a/lib/THC/THCTensorScatterGather.cu b/lib/THC/THCTensorScatterGather.cu
new file mode 100644
index 0000000..f3f3928
--- /dev/null
+++ b/lib/THC/THCTensorScatterGather.cu
@@ -0,0 +1,153 @@
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCApply.cuh"
+
+// Compute the offsets into the given tensors for a linear index. For the 't2'
+// tensor, dimension 'dim' is skipped. The tensors are assumed to have the same
+// size (with the exception of 't2' in dimension 'dim').
+// This version uses a static number of dimensions.
+template <typename IndexType, typename Real, int Dims>
+struct IndexToScatterGatherOffsets {
+  static __device__ void compute(
+      IndexType linearId, const int dim,
+      const TensorInfo<long, IndexType>& index, IndexType* indexOffset,
+      const TensorInfo<Real, IndexType>& t1, IndexType* t1Offset,
+      const TensorInfo<Real, IndexType>& t2, IndexType* t2Offset) {
+    for (int d = Dims - 1; d >= 0; d--) {
+      IndexType curDimIndex = linearId % index.sizes[d];
+      *indexOffset += curDimIndex * index.strides[d];
+      *t1Offset += curDimIndex * t1.strides[d];
+      if (d != dim) {
+        *t2Offset += curDimIndex * t2.strides[d];
+      }
+      linearId /= index.sizes[d];
+    }
+  }
+
+  static __device__ void compute(
+      IndexType linearId, const int dim,
+      const TensorInfo<long, IndexType>& index, IndexType* indexOffset,
+      const TensorInfo<Real, IndexType>& t2, IndexType* t2Offset) {
+    for (int d = Dims - 1; d >= 0; d--) {
+      IndexType curDimIndex = linearId % index.sizes[d];
+      *indexOffset += curDimIndex * index.strides[d];
+      if (d != dim) {
+        *t2Offset += curDimIndex * t2.strides[d];
+      }
+      linearId /= index.sizes[d];
+    }
+  }
+};
+
+// Same as above but using a dynamic number of dimensions.
+template <typename IndexType, typename Real>
+struct IndexToScatterGatherOffsets<IndexType, Real, -1> {
+  static __device__ void compute(
+      IndexType linearId, const int dim,
+      const TensorInfo<long, IndexType>& index, IndexType* indexOffset,
+      const TensorInfo<Real, IndexType>& t1, IndexType* t1Offset,
+      const TensorInfo<Real, IndexType>& t2, IndexType* t2Offset) {
+    for (int d = index.dims - 1; d >= 0; d--) {
+      IndexType curDimIndex = linearId % index.sizes[d];
+      *indexOffset += curDimIndex * index.strides[d];
+      *t1Offset += curDimIndex * t1.strides[d];
+      if (d != dim) {
+        *t2Offset += curDimIndex * t2.strides[d];
+      }
+      linearId /= index.sizes[d];
+    }
+  }
+
+  static __device__ void compute(
+      IndexType linearId, const int dim,
+      const TensorInfo<long, IndexType>& index, IndexType* indexOffset,
+      const TensorInfo<Real, IndexType>& t2, IndexType* t2Offset) {
+    for (int d = index.dims - 1; d >= 0; d--) {
+      IndexType curDimIndex = linearId % index.sizes[d];
+      *indexOffset += curDimIndex * index.strides[d];
+      if (d != dim) {
+        *t2Offset += curDimIndex * t2.strides[d];
+      }
+      linearId /= index.sizes[d];
+    }
+  }
+};
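+
+/*
+ * Worked example (comment only): for a contiguous 2-D `index` tensor with
+ * sizes {3, 4} (strides {4, 1}) and dim = 1, linearId = 7 decomposes into
+ * (row 1, col 3): indexOffset = 1*4 + 3*1 = 7. t1Offset accumulates both
+ * coordinates, while t2Offset skips dimension `dim`, so only the row
+ * contributes to it (t2Offset = 1 * t2.strides[0]).
+ */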
+
+template <typename IndexType, typename Real, int Dims>
+__global__ void THCudaTensor_gatherKernel(
+    TensorInfo<Real, IndexType> tensor,
+    TensorInfo<Real, IndexType> src,
+    TensorInfo<long, IndexType> index,
+    const int dim,
+    const IndexType totalElements) {
+  for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
+       linearId < totalElements;
+       linearId += gridDim.x * blockDim.x) {
+    IndexType tensorOffset = 0;
+    IndexType srcOffset = 0;
+    IndexType indexOffset = 0;
+
+    IndexToScatterGatherOffsets<IndexType, Real, Dims>::compute(linearId, dim,
+                                                          index, &indexOffset,
+                                                          tensor, &tensorOffset,
+                                                          src, &srcOffset);
+
+    IndexType indexValue = (IndexType)index.data[indexOffset] - TH_INDEX_BASE;
+    srcOffset += indexValue * src.strides[dim];
+
+    tensor.data[tensorOffset] = src.data[srcOffset];
+  }
+}
+
+template <typename IndexType, typename Real, int Dims>
+__global__ void THCudaTensor_scatterKernel(
+    TensorInfo<Real, IndexType> tensor,
+    TensorInfo<Real, IndexType> src,
+    TensorInfo<long, IndexType> index,
+    const int dim,
+    const IndexType totalElements) {
+  for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
+       linearId < totalElements;
+       linearId += gridDim.x * blockDim.x) {
+    IndexType tensorOffset = 0;
+    IndexType srcOffset = 0;
+    IndexType indexOffset = 0;
+
+    IndexToScatterGatherOffsets<IndexType, Real, Dims>::compute(linearId, dim,
+                                                          index, &indexOffset,
+                                                          src, &srcOffset,
+                                                          tensor, &tensorOffset);
+
+    IndexType indexValue = (IndexType)index.data[indexOffset] - TH_INDEX_BASE;
+    tensorOffset += indexValue * tensor.strides[dim];
+
+    tensor.data[tensorOffset] = src.data[srcOffset];
+  }
+}
+
+template <typename IndexType, typename Real, int Dims>
+__global__ void THCudaTensor_scatterFillKernel(
+    TensorInfo<Real, IndexType> tensor,
+    TensorInfo<long, IndexType> index,
+    Real value,
+    const int dim,
+    const IndexType totalElements) {
+  for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
+       linearId < totalElements;
+       linearId += gridDim.x * blockDim.x) {
+    IndexType tensorOffset = 0;
+    IndexType indexOffset = 0;
+
+    IndexToScatterGatherOffsets<IndexType, Real, Dims>::compute(linearId, dim,
+                                                          index, &indexOffset,
+                                                          tensor, &tensorOffset);
+
+    IndexType indexValue = (IndexType)index.data[indexOffset] - TH_INDEX_BASE;
+    tensorOffset += indexValue * tensor.strides[dim];
+
+    tensor.data[tensorOffset] = value;
+  }
+}
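+
+/*
+ * Summary of the three kernels above (added comment, 0-based here;
+ * TH_INDEX_BASE accounts for Torch's 1-based indices): with c' equal to the
+ * coordinate c except that c'[dim] = index[c],
+ *   gather:      tensor[c]  = src[c']
+ *   scatter:     tensor[c'] = src[c]
+ *   scatterFill: tensor[c'] = value
+ */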
+
+#include "generic/THCTensorScatterGather.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorSort.cu b/lib/THC/THCTensorSort.cu
new file mode 100644
index 0000000..589d3e9
--- /dev/null
+++ b/lib/THC/THCTensorSort.cu
@@ -0,0 +1,78 @@
+#include "THCTensorSort.cuh"
+
+// Returns 2^(ceil(lg(n))), from the Stanford bit twiddling hacks
+unsigned long nextHighestPowerOf2(unsigned long n) {
+  n--;
+  n |= n >> 1;
+  n |= n >> 2;
+  n |= n >> 4;
+  n |= n >> 8;
+  n |= n >> 16;
+#ifndef _MSC_VER
+  n |= n >> 32;
+#endif
+  n++;
+
+  return n;
+}
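+
+/*
+ * Examples (comment only): nextHighestPowerOf2(1) == 1,
+ * nextHighestPowerOf2(5) == 8, nextHighestPowerOf2(64) == 64. The final
+ * `n >> 32` step is skipped under MSVC, presumably because `unsigned long`
+ * is 32 bits there and a shift by the full type width would be undefined.
+ */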
+
+void THCudaLongTensor_fillSliceWithIndex(THCState* state,
+                                         THCudaLongTensor* t,
+                                         int dim) {
+  long dims = THCudaLongTensor_nDimension(state, t);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
+
+  ptrdiff_t inElements = THCudaLongTensor_nElement(state, t);
+  long sliceSize = THCudaLongTensor_size(state, t, dim);
+  ptrdiff_t numSlices = inElements / sliceSize;
+
+  dim3 grid;
+  if (!THC_getGridFromTiles(numSlices, grid)) {
+    THError("Slice to fill with indices is too large");
+  }
+
+  long maxThreads =
+    THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
+  long numThreads = sliceSize;
+  if (numThreads > maxThreads) {
+    numThreads = maxThreads;
+  }
+
+  dim3 block(numThreads);
+
+#define FILL_INDEX(T, DIM)                                       \
+  fillSliceWithIndex<T, DIM>                                     \
+    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(      \
+      info, numSlices, sliceSize, info.strides[collapseDim])
+
+  if (TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, t)) {
+    TensorInfo<long, unsigned int> info =
+      getTensorInfo<THCudaLongTensor, unsigned int>(state, t);
+    info.reduceDim(dim);
+    int collapseDim = info.collapseDims(dim);
+
+    if (info.isContiguous()) {
+      FILL_INDEX(unsigned int, -2);
+    } else {
+      if (info.dims == 1) {
+        FILL_INDEX(unsigned int, 1);
+      } else if (info.dims == 2) {
+        FILL_INDEX(unsigned int, 2);
+      } else {
+        FILL_INDEX(unsigned int, -1);
+      }
+    }
+  } else {
+    TensorInfo<long, unsigned long> info =
+      getTensorInfo<THCudaLongTensor, unsigned long>(state, t);
+    info.reduceDim(dim);
+    int collapseDim = info.collapseDims(dim);
+
+    // catch-all implementation
+    FILL_INDEX(unsigned long, -1);
+  }
+
+#undef FILL_INDEX
+
+  THCudaCheck(cudaGetLastError());
+}
diff --git a/lib/THC/THCTensorSort.cuh b/lib/THC/THCTensorSort.cuh
new file mode 100644
index 0000000..381f111
--- /dev/null
+++ b/lib/THC/THCTensorSort.cuh
@@ -0,0 +1,87 @@
+#ifndef THC_TENSORSORT_CUH
+#define THC_TENSORSORT_CUH
+
+#include "THCReduceApplyUtils.cuh"
+#include "THCSortUtils.cuh"
+#include "THCTensorCopy.h"
+#include "THCTensorTypeUtils.cuh"
+
+#include "THCThrustAllocator.cuh"
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#if CUDA_VERSION >= 7000
+#include <thrust/system/cuda/execution_policy.h>
+#endif
+
+template <typename T>
+struct ThrustGTOp {
+  __device__ bool operator()(const T& lhs, const T& rhs) const {
+    return THCNumerics<T>::gt(lhs, rhs);
+  }
+};
+
+template <typename T>
+struct ThrustLTOp {
+  __device__ bool operator()(const T& lhs, const T& rhs) const {
+    return THCNumerics<T>::lt(lhs, rhs);
+  }
+};
+
+// `base` is the base address of a tensor
+// For each slice (defined as a linear point of `out`, from 0 ->
+// (sliceSize - 1) * sliceStride), we fill that slice from `0` to
+// `sliceSize - 1`.
+template <typename IndexType, int Dim>
+__global__ void
+fillSliceWithIndex(TensorInfo<long, IndexType> out,
+                   IndexType totalSlices,
+                   IndexType sliceSize,
+                   IndexType sliceStride) {
+  IndexType slice = getLinearBlockId<IndexType>();
+
+  if (slice >= totalSlices) {
+    return;
+  }
+
+  const unsigned long offset =
+    IndexToOffset<long, IndexType, Dim>::get(slice, out);
+  long* base = &out.data[offset];
+
+  for (long i = threadIdx.x; i < sliceSize; i += blockDim.x) {
+    // Torch indices are 1-based (hence the +1)
+    base[i * sliceStride] = i + TH_INDEX_BASE;
+  }
+}
+
+// For slice sorting in Thrust; extracts a slice index from a linear
+// index and uses that for comparison
+struct SliceComp {
+  SliceComp(long size) : sliceSize(size) {}
+
+  __device__ bool operator()(const long& a, const long& b) const {
+    // Since the slices are guaranteed to be innermost, the segment index
+    // is obtained just by integer (long) division
+    long segA = a / sliceSize;
+    long segB = b / sliceSize;
+    return segA < segB;
+  }
+
+  const long sliceSize;
+};
+
+// For sorting in Thrust; extracts a within-slice index from a linear index
+struct GlobalIndexToPerSliceIndex {
+  GlobalIndexToPerSliceIndex(long size) : sliceSize(size) {}
+
+  __device__ inline void operator()(long& v) const {
+    v = v % sliceSize + TH_INDEX_BASE;
+  }
+
+  const long sliceSize;
+};
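+
+/*
+ * Example (comment only): with sliceSize = 4 and TH_INDEX_BASE = 1, the
+ * global linear indices 0..7 fall into segments {0,0,0,0,1,1,1,1} under
+ * SliceComp, and GlobalIndexToPerSliceIndex rewrites them in place to the
+ * per-slice, 1-based indices 1,2,3,4,1,2,3,4.
+ */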
+
+unsigned long nextHighestPowerOf2(unsigned long n);
+void THCudaLongTensor_fillSliceWithIndex(THCState* state,
+                                         THCudaLongTensor* t,
+                                         int dim);
+#endif // THC_TENSORSORT_CUH
diff --git a/lib/THC/THCTensorTopK.cu b/lib/THC/THCTensorTopK.cu
new file mode 100644
index 0000000..ec26178
--- /dev/null
+++ b/lib/THC/THCTensorTopK.cu
@@ -0,0 +1,535 @@
+#include "THC.h"
+#include "THCReduceApplyUtils.cuh"
+#include "THCTensorCopy.h"
+#include "THCTensorMath.h"
+#include "THCAsmUtils.cuh"
+#include "THCScanUtils.cuh"
+#include "THCTensorTypeUtils.cuh"
+#include <algorithm> // for std::min
+
+#if CUDA_VERSION >= 7000
+#include <thrust/system/cuda/execution_policy.h>
+#endif
+
+// Converts a float to an integer representation with the same
+// sorting; i.e., for floats f1, f2:
+// if f1 < f2 then convert(f1) < convert(f2)
+// We use this to enable radix selection of floating-point values.
+// This also gives a relative order for NaNs, but that's ok, as they
+// will all be adjacent
+struct FloatToSortedInt {
+  inline __host__ __device__ FloatToSortedInt() {}
+
+  inline __device__ unsigned int convert(float v) const {
+    unsigned int x = __float_as_int(v);
+    unsigned int mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
+
+    return (x ^ mask);
+  }
+
+  inline __device__ float deconvert(unsigned int v) const {
+    unsigned int mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
+
+    return __int_as_float(v ^ mask);
+  }
+};
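+
+/*
+ * Example of the order-preserving mapping (comment only):
+ *   convert(-2.0f) = 0x3fffffff  <  convert(-1.0f) = 0x407fffff
+ *                                <  convert( 0.0f) = 0x80000000
+ *                                <  convert( 1.0f) = 0xbf800000
+ * Positive floats get their sign bit flipped; negative floats get all bits
+ * flipped, so unsigned integer comparison reproduces the float ordering.
+ * deconvert() applies the inverse mapping.
+ */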
+
+// This function counts the distribution of all input values in a
+// slice we are selecting by radix digit at `radixDigitPos`, but only
+// those that pass the filter `((v & desiredMask) == desired)`.
+// This produces and broadcasts the seen counts for a single block only.
+// `smem` must have at least `RadixSize` elements.
+template <typename DataType, typename BitDataType,
+          typename IndexType, typename CountType,
+          typename RadixConverter, int RadixSize, int RadixBits>
+__device__ void countRadixUsingMask(const RadixConverter& conv,
+                                    CountType counts[RadixSize],
+                                    CountType* smem,
+                                    BitDataType desired,
+                                    BitDataType desiredMask,
+                                    int radixDigitPos,
+                                    IndexType sliceSize,
+                                    IndexType withinSliceStride,
+                                    DataType* data) {
+  // Clear out per-thread counts from a previous round
+#pragma unroll
+  for (int i = 0; i < RadixSize; ++i) {
+    counts[i] = 0;
+  }
+
+  if (threadIdx.x < RadixSize) {
+    smem[threadIdx.x] = 0;
+  }
+  __syncthreads();
+
+  // Scan over all the data. Upon a read, the warp will accumulate
+  // counts per each digit in the radix using warp voting.
+  for (IndexType i = threadIdx.x; i < sliceSize; i += blockDim.x) {
+    BitDataType val = conv.convert(doLdg(&data[i * withinSliceStride]));
+
+    bool hasVal = ((val & desiredMask) == desired);
+    unsigned int digitInRadix = getBitfield(val, radixDigitPos, RadixBits);
+
+#pragma unroll
+    for (unsigned int j = 0; j < RadixSize; ++j) {
+      bool vote = hasVal && (digitInRadix == j);
+      counts[j] += __popc(__ballot(vote));
+    }
+  }
+
+  // Now, for each warp, sum values
+  if (getLaneId() == 0) {
+#pragma unroll
+    for (unsigned int i = 0; i < RadixSize; ++i) {
+      atomicAdd(&smem[i], counts[i]);
+    }
+  }
+
+  __syncthreads();
+
+  // For each thread, read in the total counts
+#pragma unroll
+  for (unsigned int i = 0; i < RadixSize; ++i) {
+    counts[i] = smem[i];
+  }
+
+  __syncthreads();
+}
+
+// The radix over which we select values
+#define RADIX_BITS 2 // digits are base-(2 ^ RADIX_BITS)
+#define RADIX_SIZE 4 // 2 ^ RADIX_BITS
+#define RADIX_MASK (RADIX_SIZE - 1)
+
+// This finds the unique value `v` that matches the pattern
+// ((v & desiredMask) == desired) in our sorted int format
+template <typename DataType, typename IndexType, typename RadixConverter>
+__device__ float findPattern(const RadixConverter& conv,
+                             DataType* smem,
+                             DataType* data,
+                             IndexType sliceSize,
+                             IndexType withinSliceStride,
+                             unsigned int desired,
+                             unsigned int desiredMask) {
+  if (threadIdx.x < 32) {
+    smem[threadIdx.x] = (DataType) 0;
+  }
+  __syncthreads();
+
+  // All threads participate in the loop, in order to sync on the flag
+  IndexType numIterations = THCRoundUp(sliceSize, (IndexType) blockDim.x);
+  for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
+    bool inRange = (i < sliceSize);
+    DataType v = inRange ? doLdg(&data[i * withinSliceStride]) : (DataType) 0;
+
+    if (inRange && ((conv.convert(v) & desiredMask) == desired)) {
+      // There should not be conflicts if we are using findPattern,
+      // since the result is unique
+      smem[0] = (DataType) 1;
+      smem[1] = v; // can't use val as the flag, since it could be 0
+    }
+
+    __syncthreads();
+
+    DataType found = smem[0];
+    DataType val = smem[1];
+
+    __syncthreads();
+
+    // Check to see if a thread found the value
+    if (found != (DataType) 0) {
+      // all threads return this value
+      return val;
+    }
+  }
+
+  // should not get here
+  assert(false);
+  return (DataType) 0;
+}
+
+// Returns the top-Kth element found in the data using radix selection
+template <typename DataType, typename BitDataType, typename IndexType,
+          typename RadixConverter, bool Order>
+__device__ void radixSelect(const RadixConverter& conv,
+                            DataType* data,
+                            IndexType k,
+                            IndexType sliceSize,
+                            IndexType withinSliceStride,
+                            int* smem,
+                            DataType* topK) {
+  // Per-thread buckets into which we accumulate digit counts in our
+  // radix
+  int counts[RADIX_SIZE];
+
+  // We only consider elements x such that (x & desiredMask) == desired
+  // Initially, we consider all elements of the array, so the above
+  // statement is true regardless of input.
+  unsigned int desired = 0;
+  unsigned int desiredMask = 0;
+
+  // We are looking for the top kToFind-th element when iterating over
+  // digits; this count gets reduced by elimination when counting
+  // successive digits
+  int kToFind = k;
+
+  // We start at the most significant digit in our radix, scanning
+  // through to the least significant digit
+#pragma unroll
+  for (int digitPos = sizeof(BitDataType) * 8 - RADIX_BITS;
+       digitPos >= 0;
+       digitPos -= RADIX_BITS) {
+
+    // Count radix distribution for the current position and reduce
+    // across all threads
+    countRadixUsingMask<DataType, BitDataType,
+                        IndexType, int, RadixConverter,
+                        RADIX_SIZE, RADIX_BITS>(
+                          conv, counts, smem,
+                          desired, desiredMask, digitPos,
+                          sliceSize, withinSliceStride, data);
+
+    // All threads participate in the comparisons below to know the
+    // final result
+
+#define CHECK_RADIX(i)                                                  \
+    int count = counts[i];                                              \
+                                                                        \
+    /* All threads have the same value in counts here, so all */        \
+    /* threads will return from the function. */                        \
+    if (count == 1 && kToFind == 1) {                                   \
+      /* There is a unique answer. */                                   \
+      desired = setBitfield(desired, i, digitPos, RADIX_BITS);          \
+      desiredMask =                                                     \
+        setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS);     \
+                                                                        \
+      /* The answer is now the unique element v such that: */           \
+      /* (v & desiredMask) == desired */                                \
+      /* However, we do not yet know what the actual element is. We */  \
+      /* need to perform a search through the data to find the */       \
+      /* element that matches this pattern. */                          \
+      *topK = findPattern<DataType, IndexType, RadixConverter>(         \
+        conv, (float*) smem, data, sliceSize,                           \
+        withinSliceStride, desired, desiredMask);                       \
+      return;                                                           \
+    }                                                                   \
+                                                                        \
+    if (count >= kToFind) {                                             \
+      desired = setBitfield(desired, i, digitPos, RADIX_BITS);          \
+      desiredMask =                                                     \
+        setBitfield(desiredMask, RADIX_MASK, digitPos, RADIX_BITS);     \
+                                                                        \
+      /* The top-Kth element v must now be one such that: */            \
+      /* (v & desiredMask == desired) */                                \
+      /* but we haven't narrowed it down; we must check the next */     \
+      /* least-significant digit */                                     \
+      break;                                                            \
+    }                                                                   \
+                                                                        \
+    kToFind -= count                                                    \
+
+    if (Order) {
+      // Process in descending order
+#pragma unroll
+      for (int i = RADIX_SIZE - 1; i >= 0; --i) {
+        CHECK_RADIX(i);
+      }
+    } else {
+      // Process in ascending order
+#pragma unroll
+      for (int i = 0; i < RADIX_SIZE; ++i) {
+        CHECK_RADIX(i);
+      }
+    }
+#undef CHECK_RADIX
+  } // end digitPos for
+
+  // There is no unique result, but there is a non-unique result
+  // matching `desired` exactly
+  *topK = conv.deconvert(desired);
+}
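+
+/*
+ * How the selection narrows (added comment): with RADIX_BITS = 2 over 32-bit
+ * keys the outer loop makes 16 passes, from bit position 30 down to 0. In
+ * each pass every thread histograms the four possible digits among the
+ * still-candidate elements; scanning digits from the largest (Order == true)
+ * or smallest, CHECK_RADIX either (a) finds a bucket containing exactly the
+ * one remaining sought element and hands off to findPattern, (b) fixes that
+ * digit in desired/desiredMask and moves on to the next, less significant
+ * digit when the bucket holds at least kToFind elements, or (c) subtracts the
+ * bucket count from kToFind and tries the next digit. Note that the last
+ * macro line, `kToFind -= count`, deliberately has no semicolon: it is
+ * supplied by the `CHECK_RADIX(i);` invocation.
+ */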
+
+template <typename IndexType, int Dim, bool Order>
+__global__ void gatherTopK(TensorInfo<float, IndexType> input,
+                           IndexType inputSliceSize,
+                           IndexType outputSliceSize, // aka `k`
+
+                           IndexType numInputSlices,
+                           IndexType inputWithinSliceStride,
+
+                           TensorInfo<float, IndexType> topK,
+                           IndexType numTopKSlices,
+                           IndexType topKWithinSliceStride,
+
+                           TensorInfo<long, IndexType> indices,
+                           IndexType indicesWithinSliceStride) {
+  // Indices are limited to integer fp precision, so counts can fit in
+  // int32, regardless of IndexType
+  __shared__ int smem[32]; // one per warp, up to the warp limit
+
+  IndexType slice = getLinearBlockId<IndexType>();
+  if (slice >= numInputSlices) {
+    return;
+  }
+
+  // Find the start offset for our slice
+  IndexType sliceStartIndex =
+    IndexToOffset<float, IndexType, Dim>::get(slice, input);
+  IndexType topKSliceStartIndex =
+    IndexToOffset<float, IndexType, Dim>::get(slice, topK);
+  IndexType indicesSliceStartIndex =
+    IndexToOffset<long, IndexType, Dim>::get(slice, indices);
+
+  float* inputSliceStart = &input.data[sliceStartIndex];
+  float* topKSliceStart = &topK.data[topKSliceStartIndex];
+  long* indicesSliceStart = &indices.data[indicesSliceStartIndex];
+
+  // Find the k-th highest element in our input
+  float topKValue = -1.0f;
+  radixSelect<float, unsigned int, IndexType, FloatToSortedInt, Order>(
+    FloatToSortedInt(),
+    inputSliceStart, outputSliceSize,
+    inputSliceSize, inputWithinSliceStride,
+    smem, &topKValue);
+
+  // Every value that is strictly less/greater than `pattern`
+  // (depending on sort dir) in sorted int format is in the top-K.
+  // The top-K value itself might not be unique.
+  //
+  // Since there are a variable number of elements that we see that
+  // are within the top-k, we don't know at what index to write out
+  // the resulting values.
+  // In order to get this, we perform an exclusive prefix sum of
+  // `hasTopK`. This will return the resulting index into which we
+  // need to write the result, if a thread has a result.
+
+  // All threads need to participate in the loop and the prefix sum,
+  // but not necessarily in the load; hence loop bounds being rounded
+  // up to a multiple of the block dim.
+  IndexType numIterations = THCRoundUp(inputSliceSize, (IndexType) blockDim.x);
+  IndexType writeIndexStart = 0;
+
+  for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
+    bool inRange = (i < inputSliceSize);
+    float v =
+      inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : 0.0f;
+    bool hasTopK;
+    if (Order) {
+      hasTopK = inRange && (v > topKValue);
+    } else {
+      hasTopK = inRange && (v < topKValue);
+    }
+
+    int index;
+    int carry;
+    exclusiveBinaryPrefixSum<int, true>(smem, hasTopK, &index, &carry);
+
+    if (hasTopK) {
+      int writeIndex = writeIndexStart + index;
+      assert(writeIndex < outputSliceSize);
+
+      IndexType topKOffset = writeIndex * topKWithinSliceStride;
+      IndexType indexOffset = writeIndex * indicesWithinSliceStride;
+
+      topKSliceStart[topKOffset] = v;
+      indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index
+    }
+
+    writeIndexStart += carry;
+  }
+
+  // We need to fill in the rest with values equal to the top-K value.
+  // The number that we need is outputSliceSize -
+  // writeIndexStart. There might be more such values available than we
+  // need, in which case we take the first ones seen. We do this
+  // via a prefix sum to calculate the indices for writing results.
+  assert(outputSliceSize >= writeIndexStart);
+  IndexType topKRemaining = (outputSliceSize - writeIndexStart);
+
+  for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) {
+    bool inRange = (i < inputSliceSize);
+    float v =
+      inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : 0.0f;
+    bool hasTopK = inRange && (v == topKValue);
+
+    int index;
+    int carry;
+    exclusiveBinaryPrefixSum<int, true>(smem, hasTopK, &index, &carry);
+
+    if (hasTopK && index < topKRemaining) {
+      int writeIndex = writeIndexStart + index;
+      assert(writeIndex < outputSliceSize);
+
+      IndexType topKOffset = writeIndex * topKWithinSliceStride;
+      IndexType indexOffset = writeIndex * indicesWithinSliceStride;
+
+      topKSliceStart[topKOffset] = v;
+      indicesSliceStart[indexOffset] = i + TH_INDEX_BASE; // to Lua index
+    }
+
+    if (carry >= topKRemaining) {
+      break;
+    }
+
+    topKRemaining -= carry;
+    writeIndexStart += carry;
+  }
+}
+
+#undef RADIX_BITS
+#undef RADIX_SIZE
+#undef RADIX_MASK
+
+THC_API void THCudaTensor_topk(THCState* state,
+                               THCudaTensor *topK,
+                               THCudaLongTensor *indices,
+                               THCudaTensor *input,
+                               long k, int dim, int dir, int sorted) {
+  THAssert(topK != NULL && indices != NULL && input != NULL);
+  THAssert(THCudaTensor_checkGPU(state, 3, topK, indices, input));
+  THCCheckTensorDims(state, topK, 2);
+  long dims = THCudaLongTensor_nDimension(state, indices);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
+  THCCheckTensorDims(state, input, 2);
+
+  int numDims = THCudaTensor_nDimension(state, input);
+  THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range");
+
+  long sliceSize = THCudaTensor_size(state, input, dim);
+  THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension");
+
+  // Build the output size, which is the dim being selected set to
+  // size k
+  THLongStorage* topKSize = THCudaTensor_newSizeOf(state, input);
+  THLongStorage_set(topKSize, dim, k);
+  THCudaTensor_resize(state, topK, topKSize, NULL);
+  THCudaLongTensor_resize(state, indices, topKSize, NULL);
+  THLongStorage_free(topKSize);
+
+#define RUN_K(INDEX_T, DIM, DIR)                                        \
+  gatherTopK<INDEX_T, DIM, DIR>                                         \
+    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(             \
+      inputInfo,                                                        \
+      sliceSize,                                                        \
+      k,                                                                \
+      inputSlices,                                                      \
+      /* The actual dimension that the k-selection is running in */     \
+      /* may have changed from collapseDims() */                        \
+      inputInfo.strides[collapseInputDim],                              \
+      topKInfo,                                                         \
+      topKSlices,                                                       \
+      topKInfo.strides[collapseTopKDim],                                \
+      indicesInfo,                                                      \
+      indicesInfo.strides[collapseIndicesDim])
+
+#define RUN_DIR(INDEX_T, DIM)                   \
+  if (dir) {                                    \
+    RUN_K(INDEX_T, DIM, true);                  \
+  } else {                                      \
+    RUN_K(INDEX_T, DIM, false);                 \
+  }
+
+#define RUN_DIM(INDEX_T)                        \
+  if (allDims == 1) {                           \
+    RUN_DIR(INDEX_T, 1);                        \
+  } else if (allDims == 2) {                    \
+    RUN_DIR(INDEX_T, 2);                        \
+  } else if (allDims == 3) {                    \
+    RUN_DIR(INDEX_T, 3);                        \
+  } else {                                      \
+    RUN_DIR(INDEX_T, -1);                       \
+  }
+
+#define RUN_T(INDEX_T)                                                  \
+  TensorInfo<float, INDEX_T> inputInfo =                                \
+    getTensorInfo<THCudaTensor, INDEX_T>(state, input);                 \
+  TensorInfo<float, INDEX_T> topKInfo =                                 \
+    getTensorInfo<THCudaTensor, INDEX_T>(state, topK);                  \
+  TensorInfo<long, INDEX_T> indicesInfo =                               \
+    getTensorInfo<THCudaLongTensor, INDEX_T>(state, indices);           \
+                                                                        \
+  /* We use these structures solely to find the offset to */            \
+  /* each slice we are operating on */                                  \
+  inputInfo.sizes[dim] = 1;                                             \
+  topKInfo.sizes[dim] = 1;                                              \
+  indicesInfo.sizes[dim] = 1;                                           \
+                                                                        \
+  /* Collapse all other dims */                                         \
+  int collapseInputDim = inputInfo.collapseDims(dim);                   \
+  int collapseTopKDim = topKInfo.collapseDims(dim);                     \
+  int collapseIndicesDim = indicesInfo.collapseDims(dim);               \
+                                                                        \
+  long inputSlices = 1;                                                 \
+  long topKSlices = 1;                                                  \
+  for (int i = 0; i < numDims; ++i) {                                   \
+    inputSlices *= inputInfo.sizes[i];                                  \
+    topKSlices *= topKInfo.sizes[i];                                    \
+  }                                                                     \
+                                                                        \
+  dim3 grid;                                                            \
+  if (!THC_getGridFromTiles(inputSlices, grid)) {                       \
+    THError("Slice to sort is too large");                              \
+  }                                                                     \
+                                                                        \
+  dim3 block(std::min(THCRoundUp(sliceSize, 32L), 1024L));              \
+                                                                        \
+  /* This is used as a template parameter to calculate indices. */      \
+  /* We only specialize it if all collapsed dim sizes are the */        \
+  /* same; otherwise, we use -1 which is the specialization */          \
+  /* parameter for arbitrary dimensions */                              \
+  int allDims = inputInfo.dims;                                         \
+  if (topKInfo.dims != allDims || indicesInfo.dims != allDims) {        \
+    allDims = -1;                                                       \
+  }                                                                     \
+                                                                        \
+  RUN_DIM(INDEX_T);
+
+  // Based on required index size, run the algorithm with the
+  // appropriate index type
+  if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input) &&
+      TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, topK) &&
+      TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, indices)) {
+    RUN_T(unsigned int);
+  } else {
+    RUN_T(unsigned long);
+  }
+#undef RUN_T
+#undef RUN_DIM
+#undef RUN_DIR
+#undef RUN_K
+
+  // Sort the results if the user wants them sorted, since our
+  // selection routine does not ensure sorting
+  if (sorted) {
+    // FIXME: the k/v inplace sort along slice only works for size <=
+    // 2048 at the moment
+    if (sliceSize <= 2048) {
+      // This avoids any memory allocations and performs all sorting
+      // work inplace along the slice
+      THCudaTensor_sortKeyValueInplace(state, topK, indices, dim, dir);
+    } else {
+      // Depend upon the backup sort that returns indices, which we
+      // can use in conjunction with gather to produce the original
+      // indices.
+      // This is not the most efficient implementation, especially since
+      // there are memory allocations performed here. If the user desires
+      // greater performance, they should torch.gather() the results
+      // themselves using the reported indices, providing previously
+      // allocated tensors to receive the results.
+      THCudaTensor* sortedTopK = THCudaTensor_new(state);
+      THCudaLongTensor* sortedIndices = THCudaLongTensor_new(state);
+      THCudaTensor_sort(state, sortedTopK, sortedIndices, topK, dim, dir);
+
+      THCudaLongTensor* sortedTopKIndices = THCudaLongTensor_new(state);
+
+      THCudaLongTensor_resizeAs(state, sortedTopKIndices, indices);
+      THCudaLongTensor_gather(state, sortedTopKIndices, indices, dim, sortedIndices);
+
+      THCudaTensor_freeCopyTo(state, sortedTopK, topK);
+      THCudaLongTensor_freeCopyTo(state, sortedTopKIndices, indices);
+      THCudaLongTensor_free(state, sortedIndices);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
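+
+/*
+ * Minimal usage sketch (comment only; assumes an initialised THCState `state`
+ * and an existing 2-D THCudaTensor `input`; the calling sequence is
+ * illustrative, not canonical):
+ *
+ *   THCudaTensor*     topK    = THCudaTensor_new(state);
+ *   THCudaLongTensor* indices = THCudaLongTensor_new(state);
+ *   // 5 largest values per row (dim = 1), sorted:
+ *   THCudaTensor_topk(state, topK, indices, input, 5, 1, 1, 1);
+ *
+ * The RUN_* macros above pick, at runtime, a 32- or 64-bit index type, a
+ * compile-time dimensionality specialization (1, 2, 3, or the generic -1)
+ * and the direction flag, so a single gatherTopK template covers all cases.
+ */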
diff --git a/lib/THC/THCTensorTopK.h b/lib/THC/THCTensorTopK.h
new file mode 100644
index 0000000..711c047
--- /dev/null
+++ b/lib/THC/THCTensorTopK.h
@@ -0,0 +1,14 @@
+#ifndef TH_CUDA_TENSOR_TOPK_INC
+#define TH_CUDA_TENSOR_TOPK_INC
+
+#include "THCTensor.h"
+
+/* Returns the k smallest (or largest) elements of each slice along `dim`, */
+/* depending on `dir` */
+THC_API void THCudaTensor_topk(THCState* state,
+                               THCudaTensor* topK,
+                               THCudaLongTensor* indices,
+                               THCudaTensor* input,
+                               long k, int dim, int dir, int sorted);
+
+#endif
diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu
new file mode 100644
index 0000000..bdcbcbe
--- /dev/null
+++ b/lib/THC/THCTensorTypeUtils.cu
@@ -0,0 +1,263 @@
+#include "THCTensorTypeUtils.cuh"
+#include "THCTensor.h"
+#include "THCTensorCopy.h"
+#include "THCHalf.h"
+#include <stdlib.h>
+
+namespace {
+
+struct SizeAndStride {
+  long size;
+  long stride;
+};
+
+int compareSizeAndStride(const void* a, const void* b) {
+  const SizeAndStride* aS = (const SizeAndStride*) a;
+  const SizeAndStride* bS = (const SizeAndStride*) b;
+
+  return aS->stride < bS->stride;
+}
+
+}
+
+#define IMPL_TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE)                       \
+                                                                        \
+TENSOR_TYPE*                                                            \
+TensorUtils<TENSOR_TYPE>::newTensor(THCState* state) {                  \
+  return TENSOR_TYPE##_new(state);                                      \
+}                                                                       \
+                                                                        \
+TENSOR_TYPE*                                                            \
+TensorUtils<TENSOR_TYPE>::newContiguous(THCState* state,                \
+                                        TENSOR_TYPE* t) {               \
+  return TENSOR_TYPE##_newContiguous(state, t);                         \
+}                                                                       \
+                                                                        \
+THLongStorage*                                                          \
+TensorUtils<TENSOR_TYPE>::newSizeOf(THCState* state,                    \
+                                    TENSOR_TYPE* t) {                   \
+  return TENSOR_TYPE##_newSizeOf(state, t);                             \
+}                                                                       \
+                                                                        \
+void                                                                    \
+TensorUtils<TENSOR_TYPE>::retain(THCState* state,                       \
+                                 TENSOR_TYPE* t) {                      \
+  TENSOR_TYPE##_retain(state, t);                                       \
+}                                                                       \
+                                                                        \
+void                                                                    \
+TensorUtils<TENSOR_TYPE>::free(THCState* state,                         \
+                               TENSOR_TYPE* t) {                        \
+  TENSOR_TYPE##_free(state, t);                                         \
+}                                                                       \
+                                                                        \
+void                                                                    \
+TensorUtils<TENSOR_TYPE>::freeCopyTo(THCState* state,                   \
+                                     TENSOR_TYPE* src,                  \
+                                     TENSOR_TYPE* dst) {                \
+  TENSOR_TYPE##_freeCopyTo(state, src, dst);                            \
+}                                                                       \
+                                                                        \
+void                                                                    \
+TensorUtils<TENSOR_TYPE>::resize(THCState* state,                       \
+                                 TENSOR_TYPE* out,                      \
+                                 THLongStorage* sizes,                  \
+                                 THLongStorage* strides) {              \
+  TENSOR_TYPE##_resize(state, out, sizes, strides);                     \
+}                                                                       \
+                                                                        \
+void                                                                    \
+TensorUtils<TENSOR_TYPE>::resizeAs(THCState* state,                     \
+                                   TENSOR_TYPE* dst,                    \
+                                   TENSOR_TYPE* src) {                  \
+  TENSOR_TYPE##_resizeAs(state, dst, src);                              \
+}                                                                       \
+                                                                        \
+DATA_TYPE*                                                              \
+TensorUtils<TENSOR_TYPE>::getData(THCState* state,                      \
+                                  TENSOR_TYPE* t) {                     \
+  /* FIXME: no cast is required except for THCudaHalfTensor */          \
+  return (DATA_TYPE*) TENSOR_TYPE##_data(state, t);                     \
+}                                                                       \
+                                                                        \
+ptrdiff_t                                                               \
+TensorUtils<TENSOR_TYPE>::getNumElements(THCState* state,               \
+                                         TENSOR_TYPE* t) {              \
+  return TENSOR_TYPE##_nElement(state, t);                              \
+}                                                                       \
+                                                                        \
+long                                                                    \
+TensorUtils<TENSOR_TYPE>::getSize(THCState* state,                      \
+                                  TENSOR_TYPE* t,                       \
+                                  int dim) {                            \
+  return TENSOR_TYPE##_size(state, t, dim);                             \
+}                                                                       \
+                                                                        \
+long                                                                    \
+TensorUtils<TENSOR_TYPE>::getStride(THCState* state,                    \
+                                    TENSOR_TYPE* t,                     \
+                                    int dim) {                          \
+  return TENSOR_TYPE##_stride(state, t, dim);                           \
+}                                                                       \
+                                                                        \
+int                                                                     \
+TensorUtils<TENSOR_TYPE>::getDims(THCState* state,                      \
+                                  TENSOR_TYPE* t) {                     \
+  return TENSOR_TYPE##_nDimension(state, t);                            \
+}                                                                       \
+                                                                        \
+bool                                                                    \
+TensorUtils<TENSOR_TYPE>::isContiguous(THCState* state,                 \
+                                       TENSOR_TYPE* t) {                \
+  return TENSOR_TYPE##_isContiguous(state, t);                          \
+}                                                                       \
+                                                                        \
+bool                                                                    \
+TensorUtils<TENSOR_TYPE>::allContiguous(THCState* state,                \
+                                        TENSOR_TYPE** inputs,           \
+                                        int numInputs) {                \
+  THAssert(numInputs > 0);                                                \
+  for (int i = 0; i < numInputs; ++i) {                                 \
+    if (!TensorUtils<TENSOR_TYPE>::isContiguous(state, inputs[i])) {    \
+      return false;                                                     \
+    }                                                                   \
+  }                                                                     \
+  return true;                                                          \
+}                                                                       \
+                                                                        \
+int                                                                     \
+TensorUtils<TENSOR_TYPE>::getDevice(THCState* state,                    \
+                                    TENSOR_TYPE* t) {                   \
+  return TENSOR_TYPE##_getDevice(state, t);                             \
+}                                                                       \
+                                                                        \
+bool                                                                    \
+TensorUtils<TENSOR_TYPE>::allSameDevice(THCState* state,                \
+                                        TENSOR_TYPE** inputs,           \
+                                        int numInputs) {                \
+  THAssert(numInputs > 0);                                                \
+  int device = TensorUtils<TENSOR_TYPE>::getDevice(state, inputs[0]);          \
+  for (int i = 1; i < numInputs; ++i) {                                 \
+    if (TensorUtils<TENSOR_TYPE>::getDevice(state, inputs[i]) != device) {     \
+      return false;                                                     \
+    }                                                                   \
+  }                                                                     \
+  return true;                                                          \
+}                                                                       \
+                                                                        \
+void                                                                    \
+TensorUtils<TENSOR_TYPE>::copyIgnoringOverlaps(THCState* state,         \
+                                               TENSOR_TYPE* dst,        \
+                                               TENSOR_TYPE* src) {      \
+  return TENSOR_TYPE##_copyIgnoringOverlaps(state, dst, src);           \
+}                                                                       \
+                                                                        \
+bool                                                                    \
+TensorUtils<TENSOR_TYPE>::overlappingIndices(THCState* state,           \
+                                             TENSOR_TYPE* t) {          \
+  /* In this function, we don't care about permutations of the */       \
+  /* size/stride arrays (transpositions). */                            \
+  /* We order the size/stride arrays by stride, skipping dimensions of */ \
+  /* size 1. Strides of dimensions of size 1 don't matter, since there */ \
+  /* is only one addressing point in them. */                           \
+  /* In this reordered view, the tensor is contiguous if */             \
+  /* stride[dim] == size[dim + 1] * stride[dim + 1] for all `dim`. */   \
+  /* The tensor has holes if */                                         \
+  /* stride[dim] > size[dim + 1] * stride[dim + 1] for one or more */   \
+  /* `dim`. */                                                          \
+  /* The tensor has overlaps if */                                      \
+  /* stride[dim] < size[dim + 1] * stride[dim + 1] for one or more */   \
+  /* `dim`, or the innermost stride is 0. */                            \
+                                                                        \
+  /* Extract size/stride arrays; only consider size >1 dims. */         \
+  SizeAndStride info[MAX_CUTORCH_DIMS];                                 \
+                                                                        \
+  int dims = TensorUtils<TENSOR_TYPE>::getDims(state, t);               \
+  int nonSize1Dims = 0;                                                 \
+  for (int i = 0; i < dims; ++i) {                                      \
+    long size = TensorUtils<TENSOR_TYPE>::getSize(state, t, i);         \
+    if (size > 1) {                                                     \
+      info[nonSize1Dims].size = size;                                   \
+      info[nonSize1Dims].stride =                                       \
+        TensorUtils<TENSOR_TYPE>::getStride(state, t, i);               \
+      ++nonSize1Dims;                                                   \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  if (nonSize1Dims == 0) {                                              \
+    /* no overlap */                                                    \
+    return false;                                                       \
+  }                                                                     \
+                                                                        \
+  /* Ascending order (innermost dimension in sorted view is at [0]) */  \
+  qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); \
+                                                                        \
+  /* Base case: innermost dimension must have stride >= 1 */            \
+  if (info[nonSize1Dims - 1].stride < 1) {                              \
+    return true;                                                        \
+  }                                                                     \
+                                                                        \
+  /* Subsequent dimensions, if any */                                   \
+  for (int i = nonSize1Dims - 2; i >= 0; --i) {                         \
+    if (info[i].stride < info[i + 1].size * info[i + 1].stride) {       \
+      /* There are overlaps */                                          \
+      return true;                                                      \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  /* Tensor has holes or is contiguous */                               \
+  return false;                                                         \
+}                                                                       \
+                                                                        \
+bool                                                                    \
+TensorUtils<TENSOR_TYPE>::canUse32BitIndexMath(THCState* state,         \
+                                               TENSOR_TYPE* t) {        \
+  ptrdiff_t elements = TensorUtils<TENSOR_TYPE>::getNumElements(state, t);   \
+  if (elements >= UINT_MAX) {                                           \
+    return false;                                                       \
+  }                                                                     \
+                                                                        \
+  ptrdiff_t offset = 0;                                                 \
+  ptrdiff_t linearId = elements - 1;                                    \
+                                                                        \
+  for (int i = TensorUtils<TENSOR_TYPE>::getDims(state, t) - 1; i >= 0; --i) { \
+    ptrdiff_t curDimIndex =                                             \
+      linearId % TensorUtils<TENSOR_TYPE>::getSize(state, t, i);        \
+    ptrdiff_t curDimOffset = curDimIndex *                              \
+      TensorUtils<TENSOR_TYPE>::getStride(state, t, i);                 \
+    offset += curDimOffset;                                             \
+    linearId /= TensorUtils<TENSOR_TYPE>::getSize(state, t, i);         \
+  }                                                                     \
+                                                                        \
+  if (offset >= UINT_MAX) {                                             \
+    return false;                                                       \
+  }                                                                     \
+                                                                        \
+  return true;                                                          \
+}                                                                       \
+                                                                        \
+bool                                                                    \
+TensorUtils<TENSOR_TYPE>::all32BitIndexable(THCState* state,            \
+                                            TENSOR_TYPE** inputs,       \
+                                            int numInputs) {            \
+  for (int i = 0; i < numInputs; ++i) {                                 \
+    if (!TensorUtils<TENSOR_TYPE>::canUse32BitIndexMath(state, inputs[i])) { \
+      return false;                                                     \
+    }                                                                   \
+  }                                                                     \
+  return true;                                                          \
+}
+
+IMPL_TENSOR_UTILS(THCudaByteTensor, unsigned char)
+IMPL_TENSOR_UTILS(THCudaCharTensor, char)
+IMPL_TENSOR_UTILS(THCudaShortTensor, short)
+IMPL_TENSOR_UTILS(THCudaIntTensor, int)
+IMPL_TENSOR_UTILS(THCudaLongTensor, long)
+IMPL_TENSOR_UTILS(THCudaTensor, float)
+IMPL_TENSOR_UTILS(THCudaDoubleTensor, double)
+
+#ifdef CUDA_HALF_TENSOR
+IMPL_TENSOR_UTILS(THCudaHalfTensor, half);
+#endif
+
+#undef IMPL_TENSOR_UTILS
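
As a reading aid for the overlappingIndices logic above: it sorts the non-size-1 dimensions by stride and flags an overlap whenever a dimension's stride is smaller than size * stride of the next-inner dimension, or the innermost stride is below 1. The following is a minimal, self-contained host-side sketch of the same check; it is an illustration only, with hypothetical names, not part of THC:

#include <algorithm>

// Standalone sketch of the overlap test implemented by
// TensorUtils<...>::overlappingIndices above. After sorting the non-size-1
// dimensions by ascending stride (innermost first), two distinct indices can
// address the same element exactly when some stride is smaller than
// size * stride of the next-inner dimension, or the innermost stride is 0.
struct DimSketch { long size; long stride; };

static bool byStride(const DimSketch& a, const DimSketch& b) {
  return a.stride < b.stride;
}

static bool sketchHasOverlappingIndices(DimSketch* info, int nDims) {
  if (nDims == 0) {
    return false;                      /* a single point cannot overlap */
  }
  std::sort(info, info + nDims, byStride);
  if (info[0].stride < 1) {
    return true;                       /* innermost stride of 0 overlaps */
  }
  for (int i = 1; i < nDims; ++i) {
    if (info[i].stride < info[i - 1].size * info[i - 1].stride) {
      return true;                     /* outer stride too small: overlap */
    }
  }
  return false;                        /* contiguous, or has holes only */
}

For the contiguous case size = [3, 4], stride = [4, 1] this returns false; for size = [2, 2], stride = [1, 1] it returns true, since both rows address the same two elements.
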
diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh
new file mode 100644
index 0000000..273606e
--- /dev/null
+++ b/lib/THC/THCTensorTypeUtils.cuh
@@ -0,0 +1,180 @@
+#ifndef THC_TENSOR_TYPE_UTILS_INC
+#define THC_TENSOR_TYPE_UTILS_INC
+
+#include <cuda.h>
+#include <assert.h>
+#include "THCGeneral.h"
+#include "THCHalf.h"
+#include "THCTensor.h"
+#include "THCTensorInfo.cuh"
+
+/// A utility for accessing THCuda*Tensor types in a generic manner
+
+/// Equivalent to C++11's std::is_same from <type_traits>; used for comparing
+/// equality of types without assuming the availability of C++11.
+template <typename T, typename U>
+struct SameType {
+  static const bool same = false;
+};
+
+template <typename T>
+struct SameType<T, T> {
+  static const bool same = true;
+};
+
+template <typename T, typename U>
+bool isSameType() {
+  return SameType<T, U>::same;
+}
+
+template <typename TensorType>
+struct TensorUtils {
+};
+
+#define TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE, ACC_DATA_TYPE)             \
+  template <>                                                           \
+  struct THC_CLASS TensorUtils<TENSOR_TYPE> {                                     \
+    typedef DATA_TYPE DataType;                                         \
+    typedef ACC_DATA_TYPE AccDataType;                                  \
+                                                                        \
+    static TENSOR_TYPE* newTensor(THCState* state);                     \
+    static TENSOR_TYPE* newContiguous(THCState* state, TENSOR_TYPE* t); \
+    static THLongStorage* newSizeOf(THCState* state, TENSOR_TYPE* t);   \
+    static void retain(THCState* state, TENSOR_TYPE* t);                \
+    static void free(THCState* state, TENSOR_TYPE* t);                  \
+    static void freeCopyTo(THCState* state, TENSOR_TYPE* src,           \
+                           TENSOR_TYPE* dst);                           \
+    static void resize(THCState* state, TENSOR_TYPE* out,               \
+                       THLongStorage* sizes,                            \
+                       THLongStorage* strides);                         \
+    static void resizeAs(THCState* state, TENSOR_TYPE* dst,             \
+                         TENSOR_TYPE* src);                             \
+    static DATA_TYPE* getData(THCState* state, TENSOR_TYPE* t);         \
+    static ptrdiff_t getNumElements(THCState* state, TENSOR_TYPE* t);        \
+    static long getSize(THCState* state, TENSOR_TYPE* t, int dim);      \
+    static long getStride(THCState* state, TENSOR_TYPE* t, int dim);    \
+    static int getDims(THCState* state, TENSOR_TYPE* t);                \
+    static bool isContiguous(THCState* state, TENSOR_TYPE* t);          \
+    static bool allContiguous(THCState* state, TENSOR_TYPE** inputs, int numInputs); \
+    static int getDevice(THCState* state, TENSOR_TYPE* t);              \
+    static bool allSameDevice(THCState* state, TENSOR_TYPE** inputs, int numInputs); \
+    static void copyIgnoringOverlaps(THCState* state,                   \
+                                     TENSOR_TYPE* dst, TENSOR_TYPE* src); \
+    /* Determines whether the given tensor has overlapping data points */ \
+    /* (i.e., whether more than one index into the tensor refers to */    \
+    /* the same piece of data). */                                        \
+    static bool overlappingIndices(THCState* state, TENSOR_TYPE* t);    \
+    /* Can we use 32 bit math for indexing? */                          \
+    static bool canUse32BitIndexMath(THCState* state, TENSOR_TYPE* t);  \
+    /* Are all tensors 32-bit indexable? */                             \
+    static bool all32BitIndexable(THCState* state, TENSOR_TYPE** inputs, int numInputs); \
+  }
+
+TENSOR_UTILS(THCudaByteTensor, unsigned char, long);
+TENSOR_UTILS(THCudaCharTensor, char, long);
+TENSOR_UTILS(THCudaShortTensor, short, long);
+TENSOR_UTILS(THCudaIntTensor, int, long);
+TENSOR_UTILS(THCudaLongTensor, long, long);
+TENSOR_UTILS(THCudaTensor, float, float);
+TENSOR_UTILS(THCudaDoubleTensor, double, double);
+
+#ifdef CUDA_HALF_TENSOR
+TENSOR_UTILS(THCudaHalfTensor, half, float);
+#endif
+
+#undef TENSOR_UTILS
+
+// Utility function for constructing TensorInfo structs. In this case, the
+// two template parameters are:
+//
+// 1. The TensorType, e.g. THCTensor in generic functions, or THCudaTensor,
+// THCudaLongTensor etc.
+//
+// 2. The IndexType. This is always an unsigned integral type; depending on
+// the size of the Tensor you may select unsigned int, unsigned long,
+// unsigned long long, etc.
+//
+// Internally we use the TensorUtils static functions to get the necessary
+// dims, sizes, strides, etc.
+//
+// For example, suppose we have a THCudaTensor t, with dim = 2, size = [3, 4],
+// stride = [4, 1], offset = 8, and we set our index type to be unsigned int.
+// Then we yield a TensorInfo struct templatized with float, unsigned int and
+// the following fields:
+//
+// data is a float* to the underlying storage at position 8
+// dims is 2
+// sizes is a MAX_CUTORCH_DIMS element array with [3, 4] in its first two positions
+// strides is a MAX_CUTORCH_DIMS element array with [4, 1] in its first two positions
+//
+// TensorInfos can then be passed to CUDA kernels, while on the host the static
+// functions defined above can be used to perform tensor operations appropriate
+// for each TensorType.
+template <typename TensorType, typename IndexType>
+TensorInfo<typename TensorUtils<TensorType>::DataType, IndexType>
+getTensorInfo(THCState* state, TensorType* t) {
+  IndexType sz[MAX_CUTORCH_DIMS];
+  IndexType st[MAX_CUTORCH_DIMS];
+
+  int dims = TensorUtils<TensorType>::getDims(state, t);
+  for (int i = 0; i < dims; ++i) {
+    sz[i] = TensorUtils<TensorType>::getSize(state, t, i);
+    st[i] = TensorUtils<TensorType>::getStride(state, t, i);
+  }
+
+  return TensorInfo<typename TensorUtils<TensorType>::DataType, IndexType>(
+    TensorUtils<TensorType>::getData(state, t), dims, sz, st);
+}
+
+template <typename T>
+struct ScalarNegate {
+  static __host__ __device__ T to(const T v) { return -v; }
+};
+
+template <typename T>
+struct ScalarInv {
+  static __host__ __device__ T to(const T v) { return ((T) 1) / v; }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct ScalarNegate<half> {
+  static __host__ __device__ half to(const half v) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hneg(v);
+#else
+    return __float2half(-__half2float(v));
+#endif
+#else
+    half out = v;
+    out.x ^= 0x8000; // toggle sign bit
+    return out;
+#endif
+  }
+};
+
+template <>
+struct ScalarInv<half> {
+  static __host__ __device__ half to(const half v) {
+#ifdef __CUDA_ARCH__
+    return __float2half(1.0f / __half2float(v));
+#else
+    float fv = THC_half2float(v);
+    fv = 1.0f / fv;
+    return THC_float2half(fv);
+#endif
+  }
+};
+
+inline bool operator==(half a, half b) {
+  return a.x == b.x;
+}
+
+inline bool operator!=(half a, half b) {
+  return a.x != b.x;
+}
+
+#endif // CUDA_HALF_TENSOR
+
+#endif // THC_TENSOR_TYPE_UTILS_INC
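
The worked example in the comment above (dim = 2, size = [3, 4], stride = [4, 1]) also illustrates what canUse32BitIndexMath in THCTensorTypeUtils.cu computes: the element offset addressed by the largest linear index. Below is a small host-side sketch of that decomposition; the function names are hypothetical and this is an illustration, not THC code:

#include <assert.h>
#include <stddef.h>

/* Sketch of the index arithmetic used by canUse32BitIndexMath: take the
   largest linear element index (nElement - 1), peel it apart one dimension
   at a time starting from the innermost, and accumulate the element offset
   it addresses. If that offset (or nElement itself) reaches UINT_MAX, the
   tensor is not 32-bit indexable. */
static ptrdiff_t maxElementOffset(const long* sizes, const long* strides,
                                  int dims, ptrdiff_t nElement) {
  ptrdiff_t offset = 0;
  ptrdiff_t linearId = nElement - 1;
  for (int i = dims - 1; i >= 0; --i) {
    ptrdiff_t curDimIndex = linearId % sizes[i];
    offset += curDimIndex * strides[i];
    linearId /= sizes[i];
  }
  return offset;
}

static void exampleFromCommentAbove(void) {
  /* size = [3, 4], stride = [4, 1]: the last element has linear index 11,
     which decomposes into indices (2, 3) and offset 2*4 + 3*1 = 11. */
  const long sizes[2]   = {3, 4};
  const long strides[2] = {4, 1};
  assert(maxElementOffset(sizes, strides, 2, 12) == 11);
}
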
diff --git a/lib/THC/THCThreadLocal.c b/lib/THC/THCThreadLocal.c
new file mode 100644
index 0000000..3cc95c3
--- /dev/null
+++ b/lib/THC/THCThreadLocal.c
@@ -0,0 +1,46 @@
+#include "THCThreadLocal.h"
+#include "THCGeneral.h"
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+
+THCThreadLocal THCThreadLocal_alloc(void)
+{
+#ifndef _WIN32
+  pthread_key_t key;
+  THAssert(pthread_key_create(&key, NULL) == 0);
+  return key;
+#else
+  DWORD key = TlsAlloc();
+  THAssert(key != TLS_OUT_OF_INDEXES);
+  return key;
+#endif
+}
+
+void THCThreadLocal_free(THCThreadLocal local)
+{
+#ifndef _WIN32
+  THAssert(pthread_key_delete(local) == 0);
+#else
+  THAssert(TlsFree(local));
+#endif
+}
+
+void* THCThreadLocal_get(THCThreadLocal local)
+{
+#ifndef _WIN32
+  return pthread_getspecific(local);
+#else
+  return TlsGetValue(local);
+#endif
+}
+
+void THCThreadLocal_set(THCThreadLocal local, void* value)
+{
+#ifndef _WIN32
+  THAssert(pthread_setspecific(local, value) == 0);
+#else
+  THAssert(TlsSetValue(local, value));
+#endif
+}
diff --git a/lib/THC/THCThreadLocal.h b/lib/THC/THCThreadLocal.h
new file mode 100644
index 0000000..a733cac
--- /dev/null
+++ b/lib/THC/THCThreadLocal.h
@@ -0,0 +1,17 @@
+#ifndef THC_THREAD_LOCAL_INC
+#define THC_THREAD_LOCAL_INC
+
+#ifdef _WIN32
+#include <intsafe.h>
+typedef DWORD THCThreadLocal;
+#else
+#include <pthread.h>
+typedef pthread_key_t THCThreadLocal;
+#endif
+
+THCThreadLocal THCThreadLocal_alloc(void);
+void THCThreadLocal_free(THCThreadLocal local);
+void* THCThreadLocal_get(THCThreadLocal local);
+void THCThreadLocal_set(THCThreadLocal local, void* value);
+
+#endif // THC_THREAD_LOCAL_INC
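
The four functions declared above wrap a pthread_key_t on POSIX and a TLS slot (DWORD index) on Windows. The sketch below shows one possible use, assuming the cutorch headers are on the include path; the helper names are hypothetical, and the lifetime of the stored pointer remains the caller's responsibility:

#include "THCThreadLocal.h"
#include <stddef.h>

/* Sketch: keep a per-thread scratch pointer. The key itself is process-wide,
   but a value installed with THCThreadLocal_set is visible only to the
   calling thread; THCThreadLocal_get returns NULL on threads that never set
   a value. */
static THCThreadLocal scratchKey;

void scratchInit(void)      { scratchKey = THCThreadLocal_alloc(); }
void scratchSet(void* p)    { THCThreadLocal_set(scratchKey, p); }
void* scratchGet(void)      { return THCThreadLocal_get(scratchKey); }
void scratchShutdown(void)  { THCThreadLocal_free(scratchKey); }
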
diff --git a/lib/THC/THCThrustAllocator.cuh b/lib/THC/THCThrustAllocator.cuh
new file mode 100644
index 0000000..5e8a6c5
--- /dev/null
+++ b/lib/THC/THCThrustAllocator.cuh
@@ -0,0 +1,33 @@
+#ifndef THC_THRUST_ALLOCATOR_INC
+#define THC_THRUST_ALLOCATOR_INC
+
+#include <cstddef>
+
+/// Allocator for Thrust to re-route its internal device allocations
+/// to the THC allocator
+class THCThrustAllocator {
+ public:
+  typedef char value_type;
+
+  THCThrustAllocator(THCState* state)
+      : state_(state) {
+  }
+
+  ~THCThrustAllocator() {
+  }
+
+  char* allocate(std::ptrdiff_t size) {
+    char* out = NULL;
+    THCudaCheck(THCudaMalloc(state_, (void**) &out, size));
+    return out;
+  }
+
+  void deallocate(char* p, size_t size) {
+    THCudaCheck(THCudaFree(state_, p));
+  }
+
+ private:
+  THCState* state_;
+};
+
+#endif // THC_THRUST_ALLOCATOR_INC
diff --git a/lib/THC/cmake/FindMAGMA.cmake b/lib/THC/cmake/FindMAGMA.cmake
new file mode 100644
index 0000000..f9cce95
--- /dev/null
+++ b/lib/THC/cmake/FindMAGMA.cmake
@@ -0,0 +1,27 @@
+# - Find MAGMA library
+# This module finds an installed MAGMA library, a matrix algebra library
+# similar to LAPACK for GPU and multicore systems
+# (see http://icl.cs.utk.edu/magma/).
+#
+# This module sets the following variables:
+#  MAGMA_FOUND - set to true if the MAGMA library is found.
+#  MAGMA_LIBRARIES - list of libraries to link against to use MAGMA
+#  MAGMA_INCLUDE_DIR - include directory
+
+IF(NOT MAGMA_FOUND)
+
+include(FindPackageHandleStandardArgs)
+
+SET(MAGMA_LIBRARIES)
+SET(MAGMA_INCLUDE_DIR)
+
+FIND_LIBRARY(MAGMA_LIBRARIES magma /usr/local/magma/lib)
+FIND_PATH(MAGMA_INCLUDE_DIR magma.h /usr/local/magma/include)
+
+IF (MAGMA_LIBRARIES)
+  SET(MAGMA_FOUND TRUE)
+ELSE (MAGMA_LIBRARIES)
+  SET(MAGMA_FOUND FALSE)
+ENDIF (MAGMA_LIBRARIES)
+
+ENDIF(NOT MAGMA_FOUND)
diff --git a/lib/THC/cmake/select_compute_arch.cmake b/lib/THC/cmake/select_compute_arch.cmake
new file mode 100644
index 0000000..4b27441
--- /dev/null
+++ b/lib/THC/cmake/select_compute_arch.cmake
@@ -0,0 +1,200 @@
+# Synopsis:
+#   CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [target_CUDA_architectures])
+#   -- Selects GPU arch flags for nvcc based on target_CUDA_architectures
+#      target_CUDA_architectures : Auto | Common | All | LIST(ARCH_AND_PTX ...)
+#       - "Auto" detects local machine GPU compute arch at runtime.
+#       - "Common" and "All" cover common and entire subsets of architectures
+#      ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
+#      NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal
+#      NUM: Any number. Only the following pairs are currently accepted by NVCC:
+#            2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.1 6.2
+#      Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable}
+#      Additionally, sets ${out_variable}_readable to the resulting numeric list
+#      Example:
+#       CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.0 3.5+PTX 5.2(5.0) Maxwell)
+#        LIST(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS})
+#
+#      More info on CUDA architectures: https://en.wikipedia.org/wiki/CUDA
+#
+
+# This list will be used for CUDA_ARCH_NAME = All option
+set(CUDA_KNOWN_GPU_ARCHITECTURES  "Fermi" "Kepler" "Maxwell")
+
+# This list will be used for CUDA_ARCH_NAME = Common option (enabled by default)
+set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0")
+
+if (CUDA_VERSION VERSION_GREATER "6.5")
+  list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra")
+  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2")
+endif ()
+
+if (CUDA_VERSION VERSION_GREATER "7.5")
+  list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal")
+  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1" "6.1+PTX")
+else()
+  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX")
+endif ()
+
+
+
+################################################################################################
+# A function for automatic detection of the GPUs installed (if autodetection is enabled)
+# Usage:
+#   CUDA_DETECT_INSTALLED_GPUS(OUT_VARIABLE)
+#
+function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE)
+  if(NOT CUDA_GPU_DETECT_OUTPUT)
+    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
+
+    file(WRITE ${cufile} ""
+      "#include <cstdio>\n"
+      "int main()\n"
+      "{\n"
+      "  int count = 0;\n"
+      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+      "  if (count == 0) return -1;\n"
+      "  for (int device = 0; device < count; ++device)\n"
+      "  {\n"
+      "    cudaDeviceProp prop;\n"
+      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
+      "  }\n"
+      "  return 0;\n"
+      "}\n")
+
+    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${cufile}"
+                    "-ccbin" ${CMAKE_CXX_COMPILER}
+                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
+                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    if(nvcc_res EQUAL 0)
+      # only keep the last line of nvcc_out
+      STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
+      STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
+      list(GET nvcc_out -1 nvcc_out)
+      string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
+      set(CUDA_GPU_DETECT_OUTPUT ${nvcc_out} CACHE INTERNAL "Returned GPU architectures from detect_gpus tool" FORCE)
+    endif()
+  endif()
+
+  if(NOT CUDA_GPU_DETECT_OUTPUT)
+    message(STATUS "Automatic GPU detection failed. Building for common architectures.")
+    set(${OUT_VARIABLE} ${CUDA_COMMON_GPU_ARCHITECTURES} PARENT_SCOPE)
+  else()
+    set(${OUT_VARIABLE} ${CUDA_GPU_DETECT_OUTPUT} PARENT_SCOPE)
+  endif()
+endfunction()
+
+
+################################################################################################
+# Function for selecting GPU arch flags for nvcc based on CUDA architectures from parameter list
+# Usage:
+#   SELECT_NVCC_ARCH_FLAGS(out_variable [list of CUDA compute archs])
+function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
+  set(CUDA_ARCH_LIST "${ARGN}")
+
+  if("X${CUDA_ARCH_LIST}" STREQUAL "X" )
+    set(CUDA_ARCH_LIST "Auto")
+  endif()
+
+  set(cuda_arch_bin)
+  set(cuda_arch_ptx)
+
+  if("${CUDA_ARCH_LIST}" STREQUAL "All")
+    set(CUDA_ARCH_LIST ${CUDA_KNOWN_GPU_ARCHITECTURES})
+  elseif("${CUDA_ARCH_LIST}" STREQUAL "Common")
+    set(CUDA_ARCH_LIST ${CUDA_COMMON_GPU_ARCHITECTURES})
+  elseif("${CUDA_ARCH_LIST}" STREQUAL "Auto")
+    CUDA_DETECT_INSTALLED_GPUS(CUDA_ARCH_LIST)
+    message(STATUS "Autodetected CUDA architecture(s): ${CUDA_ARCH_LIST}")
+  endif()
+
+  # Now process the list and look for names
+  string(REGEX REPLACE "[ \t]+" ";" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")
+  list(REMOVE_DUPLICATES CUDA_ARCH_LIST)
+  foreach(arch_name ${CUDA_ARCH_LIST})
+    set(arch_bin)
+    set(add_ptx FALSE)
+    # Check to see if we are compiling PTX
+    if(arch_name MATCHES "(.*)\\+PTX$")
+      set(add_ptx TRUE)
+      set(arch_name ${CMAKE_MATCH_1})
+    endif()
+    if(arch_name MATCHES "(^[0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$")
+      set(arch_bin ${CMAKE_MATCH_1})
+      set(arch_ptx ${arch_bin})
+    else()
+      # Look for it in our list of known architectures
+      if(${arch_name} STREQUAL "Fermi")
+        set(arch_bin "2.0 2.1(2.0)")
+      elseif(${arch_name} STREQUAL "Kepler+Tegra")
+        set(arch_bin 3.2)
+      elseif(${arch_name} STREQUAL "Kepler+Tesla")
+        set(arch_bin 3.7)
+      elseif(${arch_name} STREQUAL "Kepler")
+        set(arch_bin 3.0 3.5)
+        set(arch_ptx 3.5)
+      elseif(${arch_name} STREQUAL "Maxwell+Tegra")
+        set(arch_bin 5.3)
+      elseif(${arch_name} STREQUAL "Maxwell")
+        set(arch_bin 5.0 5.2)
+        set(arch_ptx 5.2)
+      elseif(${arch_name} STREQUAL "Pascal")
+        set(arch_bin 6.0 6.1)
+        set(arch_ptx 6.1)
+      else()
+        message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
+      endif()
+    endif()
+    if(NOT arch_bin)
+      message(SEND_ERROR "arch_bin wasn't set for some reason")
+    endif()
+    list(APPEND cuda_arch_bin ${arch_bin})
+    if(add_ptx)
+      if (NOT arch_ptx)
+        set(arch_ptx ${arch_bin})
+      endif()
+      list(APPEND cuda_arch_ptx ${arch_ptx})
+    endif()
+  endforeach()
+
+  # remove dots and convert to lists
+  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}")
+  string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")
+
+  if(cuda_arch_bin)
+    list(REMOVE_DUPLICATES cuda_arch_bin)
+  endif()
+  if(cuda_arch_ptx)
+    list(REMOVE_DUPLICATES cuda_arch_ptx)
+  endif()
+
+  set(nvcc_flags "")
+  set(nvcc_archs_readable "")
+
+  # Tell NVCC to add binaries for the specified GPUs
+  foreach(arch ${cuda_arch_bin})
+    if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
+      # User explicitly specified ARCH for the concrete CODE
+      list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
+      list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
+    else()
+      # User didn't explicitly specify ARCH for the concrete CODE, so we assume ARCH=CODE
+      list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
+      list(APPEND nvcc_archs_readable sm_${arch})
+    endif()
+  endforeach()
+
+  # Tell NVCC to add PTX intermediate code for the specified architectures
+  foreach(arch ${cuda_arch_ptx})
+    list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
+    list(APPEND nvcc_archs_readable compute_${arch})
+  endforeach()
+
+  string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
+  set(${out_variable}          ${nvcc_flags}          PARENT_SCOPE)
+  set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
+endfunction()
diff --git a/lib/THC/generated/THCTensorMaskedByte.cu b/lib/THC/generated/THCTensorMaskedByte.cu
new file mode 100644
index 0000000..802f873
--- /dev/null
+++ b/lib/THC/generated/THCTensorMaskedByte.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMasked.cuh"
+#include "../generic/THCTensorMasked.cu"
+#include "../THCGenerateByteType.h"
diff --git a/lib/THC/generated/THCTensorMaskedChar.cu b/lib/THC/generated/THCTensorMaskedChar.cu
new file mode 100644
index 0000000..3fb9fd7
--- /dev/null
+++ b/lib/THC/generated/THCTensorMaskedChar.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMasked.cuh"
+#include "../generic/THCTensorMasked.cu"
+#include "../THCGenerateCharType.h"
diff --git a/lib/THC/generated/THCTensorMaskedDouble.cu b/lib/THC/generated/THCTensorMaskedDouble.cu
new file mode 100644
index 0000000..063de42
--- /dev/null
+++ b/lib/THC/generated/THCTensorMaskedDouble.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMasked.cuh"
+#include "../generic/THCTensorMasked.cu"
+#include "../THCGenerateDoubleType.h"
diff --git a/lib/THC/generated/THCTensorMaskedFloat.cu b/lib/THC/generated/THCTensorMaskedFloat.cu
new file mode 100644
index 0000000..08da574
--- /dev/null
+++ b/lib/THC/generated/THCTensorMaskedFloat.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMasked.cuh"
+#include "../generic/THCTensorMasked.cu"
+#include "../THCGenerateFloatType.h"
diff --git a/lib/THC/generated/THCTensorMaskedHalf.cu b/lib/THC/generated/THCTensorMaskedHalf.cu
new file mode 100644
index 0000000..caebd6c
--- /dev/null
+++ b/lib/THC/generated/THCTensorMaskedHalf.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMasked.cuh"
+#include "../generic/THCTensorMasked.cu"
+#include "../THCGenerateHalfType.h"
diff --git a/lib/THC/generated/THCTensorMaskedInt.cu b/lib/THC/generated/THCTensorMaskedInt.cu
new file mode 100644
index 0000000..1b4d1d5
--- /dev/null
+++ b/lib/THC/generated/THCTensorMaskedInt.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMasked.cuh"
+#include "../generic/THCTensorMasked.cu"
+#include "../THCGenerateIntType.h"
diff --git a/lib/THC/generated/THCTensorMaskedLong.cu b/lib/THC/generated/THCTensorMaskedLong.cu
new file mode 100644
index 0000000..5fadbba
--- /dev/null
+++ b/lib/THC/generated/THCTensorMaskedLong.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMasked.cuh"
+#include "../generic/THCTensorMasked.cu"
+#include "../THCGenerateLongType.h"
diff --git a/lib/THC/generated/THCTensorMaskedShort.cu b/lib/THC/generated/THCTensorMaskedShort.cu
new file mode 100644
index 0000000..e1f6823
--- /dev/null
+++ b/lib/THC/generated/THCTensorMaskedShort.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMasked.cuh"
+#include "../generic/THCTensorMasked.cu"
+#include "../THCGenerateShortType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareByte.cu b/lib/THC/generated/THCTensorMathCompareByte.cu
new file mode 100644
index 0000000..4312d73
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareByte.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompare.cuh"
+#include "../generic/THCTensorMathCompare.cu"
+#include "../THCGenerateByteType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareChar.cu b/lib/THC/generated/THCTensorMathCompareChar.cu
new file mode 100644
index 0000000..0356a74
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareChar.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompare.cuh"
+#include "../generic/THCTensorMathCompare.cu"
+#include "../THCGenerateCharType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareDouble.cu b/lib/THC/generated/THCTensorMathCompareDouble.cu
new file mode 100644
index 0000000..59e406c
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareDouble.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompare.cuh"
+#include "../generic/THCTensorMathCompare.cu"
+#include "../THCGenerateDoubleType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareFloat.cu b/lib/THC/generated/THCTensorMathCompareFloat.cu
new file mode 100644
index 0000000..2efa667
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareFloat.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompare.cuh"
+#include "../generic/THCTensorMathCompare.cu"
+#include "../THCGenerateFloatType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareHalf.cu b/lib/THC/generated/THCTensorMathCompareHalf.cu
new file mode 100644
index 0000000..d07e6d7
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareHalf.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompare.cuh"
+#include "../generic/THCTensorMathCompare.cu"
+#include "../THCGenerateHalfType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareInt.cu b/lib/THC/generated/THCTensorMathCompareInt.cu
new file mode 100644
index 0000000..d1a58f1
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareInt.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompare.cuh"
+#include "../generic/THCTensorMathCompare.cu"
+#include "../THCGenerateIntType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareLong.cu b/lib/THC/generated/THCTensorMathCompareLong.cu
new file mode 100644
index 0000000..ab70999
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareLong.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompare.cuh"
+#include "../generic/THCTensorMathCompare.cu"
+#include "../THCGenerateLongType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareShort.cu b/lib/THC/generated/THCTensorMathCompareShort.cu
new file mode 100644
index 0000000..e264c0c
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareShort.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompare.cuh"
+#include "../generic/THCTensorMathCompare.cu"
+#include "../THCGenerateShortType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareTByte.cu b/lib/THC/generated/THCTensorMathCompareTByte.cu
new file mode 100644
index 0000000..3069ea4
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareTByte.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompareT.cuh"
+#include "../generic/THCTensorMathCompareT.cu"
+#include "../THCGenerateByteType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareTChar.cu b/lib/THC/generated/THCTensorMathCompareTChar.cu
new file mode 100644
index 0000000..c536fa0
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareTChar.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompareT.cuh"
+#include "../generic/THCTensorMathCompareT.cu"
+#include "../THCGenerateCharType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareTDouble.cu b/lib/THC/generated/THCTensorMathCompareTDouble.cu
new file mode 100644
index 0000000..6539160
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareTDouble.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompareT.cuh"
+#include "../generic/THCTensorMathCompareT.cu"
+#include "../THCGenerateDoubleType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareTFloat.cu b/lib/THC/generated/THCTensorMathCompareTFloat.cu
new file mode 100644
index 0000000..f857260
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareTFloat.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompareT.cuh"
+#include "../generic/THCTensorMathCompareT.cu"
+#include "../THCGenerateFloatType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareTHalf.cu b/lib/THC/generated/THCTensorMathCompareTHalf.cu
new file mode 100644
index 0000000..a311831
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareTHalf.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompareT.cuh"
+#include "../generic/THCTensorMathCompareT.cu"
+#include "../THCGenerateHalfType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareTInt.cu b/lib/THC/generated/THCTensorMathCompareTInt.cu
new file mode 100644
index 0000000..3168b2b
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareTInt.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompareT.cuh"
+#include "../generic/THCTensorMathCompareT.cu"
+#include "../THCGenerateIntType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareTLong.cu b/lib/THC/generated/THCTensorMathCompareTLong.cu
new file mode 100644
index 0000000..4566960
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareTLong.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompareT.cuh"
+#include "../generic/THCTensorMathCompareT.cu"
+#include "../THCGenerateLongType.h"
diff --git a/lib/THC/generated/THCTensorMathCompareTShort.cu b/lib/THC/generated/THCTensorMathCompareTShort.cu
new file mode 100644
index 0000000..46bf67a
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathCompareTShort.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathCompareT.cuh"
+#include "../generic/THCTensorMathCompareT.cu"
+#include "../THCGenerateShortType.h"
diff --git a/lib/THC/generated/THCTensorMathPointwiseByte.cu b/lib/THC/generated/THCTensorMathPointwiseByte.cu
new file mode 100644
index 0000000..7f26e88
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathPointwiseByte.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathPointwise.cuh"
+#include "../generic/THCTensorMathPointwise.cu"
+#include "../THCGenerateByteType.h"
diff --git a/lib/THC/generated/THCTensorMathPointwiseChar.cu b/lib/THC/generated/THCTensorMathPointwiseChar.cu
new file mode 100644
index 0000000..d196948
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathPointwiseChar.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathPointwise.cuh"
+#include "../generic/THCTensorMathPointwise.cu"
+#include "../THCGenerateCharType.h"
diff --git a/lib/THC/generated/THCTensorMathPointwiseDouble.cu b/lib/THC/generated/THCTensorMathPointwiseDouble.cu
new file mode 100644
index 0000000..2e9ad72
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathPointwiseDouble.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathPointwise.cuh"
+#include "../generic/THCTensorMathPointwise.cu"
+#include "../THCGenerateDoubleType.h"
diff --git a/lib/THC/generated/THCTensorMathPointwiseFloat.cu b/lib/THC/generated/THCTensorMathPointwiseFloat.cu
new file mode 100644
index 0000000..061bd70
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathPointwiseFloat.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathPointwise.cuh"
+#include "../generic/THCTensorMathPointwise.cu"
+#include "../THCGenerateFloatType.h"
diff --git a/lib/THC/generated/THCTensorMathPointwiseHalf.cu b/lib/THC/generated/THCTensorMathPointwiseHalf.cu
new file mode 100644
index 0000000..42bef21
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathPointwiseHalf.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathPointwise.cuh"
+#include "../generic/THCTensorMathPointwise.cu"
+#include "../THCGenerateHalfType.h"
diff --git a/lib/THC/generated/THCTensorMathPointwiseInt.cu b/lib/THC/generated/THCTensorMathPointwiseInt.cu
new file mode 100644
index 0000000..daa9cae
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathPointwiseInt.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathPointwise.cuh"
+#include "../generic/THCTensorMathPointwise.cu"
+#include "../THCGenerateIntType.h"
diff --git a/lib/THC/generated/THCTensorMathPointwiseLong.cu b/lib/THC/generated/THCTensorMathPointwiseLong.cu
new file mode 100644
index 0000000..d5e38a7
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathPointwiseLong.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathPointwise.cuh"
+#include "../generic/THCTensorMathPointwise.cu"
+#include "../THCGenerateLongType.h"
diff --git a/lib/THC/generated/THCTensorMathPointwiseShort.cu b/lib/THC/generated/THCTensorMathPointwiseShort.cu
new file mode 100644
index 0000000..6867ce2
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathPointwiseShort.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathPointwise.cuh"
+#include "../generic/THCTensorMathPointwise.cu"
+#include "../THCGenerateShortType.h"
diff --git a/lib/THC/generated/THCTensorMathReduceByte.cu b/lib/THC/generated/THCTensorMathReduceByte.cu
new file mode 100644
index 0000000..3806f4e
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathReduceByte.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathReduce.cuh"
+#include "../generic/THCTensorMathReduce.cu"
+#include "../THCGenerateByteType.h"
diff --git a/lib/THC/generated/THCTensorMathReduceChar.cu b/lib/THC/generated/THCTensorMathReduceChar.cu
new file mode 100644
index 0000000..5afe076
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathReduceChar.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathReduce.cuh"
+#include "../generic/THCTensorMathReduce.cu"
+#include "../THCGenerateCharType.h"
diff --git a/lib/THC/generated/THCTensorMathReduceDouble.cu b/lib/THC/generated/THCTensorMathReduceDouble.cu
new file mode 100644
index 0000000..e1bb7c4
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathReduceDouble.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathReduce.cuh"
+#include "../generic/THCTensorMathReduce.cu"
+#include "../THCGenerateDoubleType.h"
diff --git a/lib/THC/generated/THCTensorMathReduceFloat.cu b/lib/THC/generated/THCTensorMathReduceFloat.cu
new file mode 100644
index 0000000..d0fdd5d
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathReduceFloat.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathReduce.cuh"
+#include "../generic/THCTensorMathReduce.cu"
+#include "../THCGenerateFloatType.h"
diff --git a/lib/THC/generated/THCTensorMathReduceHalf.cu b/lib/THC/generated/THCTensorMathReduceHalf.cu
new file mode 100644
index 0000000..f4d9d99
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathReduceHalf.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathReduce.cuh"
+#include "../generic/THCTensorMathReduce.cu"
+#include "../THCGenerateHalfType.h"
diff --git a/lib/THC/generated/THCTensorMathReduceInt.cu b/lib/THC/generated/THCTensorMathReduceInt.cu
new file mode 100644
index 0000000..98dd6a4
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathReduceInt.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathReduce.cuh"
+#include "../generic/THCTensorMathReduce.cu"
+#include "../THCGenerateIntType.h"
diff --git a/lib/THC/generated/THCTensorMathReduceLong.cu b/lib/THC/generated/THCTensorMathReduceLong.cu
new file mode 100644
index 0000000..6c47b5d
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathReduceLong.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathReduce.cuh"
+#include "../generic/THCTensorMathReduce.cu"
+#include "../THCGenerateLongType.h"
diff --git a/lib/THC/generated/THCTensorMathReduceShort.cu b/lib/THC/generated/THCTensorMathReduceShort.cu
new file mode 100644
index 0000000..de2117a
--- /dev/null
+++ b/lib/THC/generated/THCTensorMathReduceShort.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorMathReduce.cuh"
+#include "../generic/THCTensorMathReduce.cu"
+#include "../THCGenerateShortType.h"
diff --git a/lib/THC/generated/THCTensorSortByte.cu b/lib/THC/generated/THCTensorSortByte.cu
new file mode 100644
index 0000000..6103c48
--- /dev/null
+++ b/lib/THC/generated/THCTensorSortByte.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorSort.cuh"
+#include "../generic/THCTensorSort.cu"
+#include "../THCGenerateByteType.h"
diff --git a/lib/THC/generated/THCTensorSortChar.cu b/lib/THC/generated/THCTensorSortChar.cu
new file mode 100644
index 0000000..bf10336
--- /dev/null
+++ b/lib/THC/generated/THCTensorSortChar.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorSort.cuh"
+#include "../generic/THCTensorSort.cu"
+#include "../THCGenerateCharType.h"
diff --git a/lib/THC/generated/THCTensorSortDouble.cu b/lib/THC/generated/THCTensorSortDouble.cu
new file mode 100644
index 0000000..577af85
--- /dev/null
+++ b/lib/THC/generated/THCTensorSortDouble.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorSort.cuh"
+#include "../generic/THCTensorSort.cu"
+#include "../THCGenerateDoubleType.h"
diff --git a/lib/THC/generated/THCTensorSortFloat.cu b/lib/THC/generated/THCTensorSortFloat.cu
new file mode 100644
index 0000000..dd84b46
--- /dev/null
+++ b/lib/THC/generated/THCTensorSortFloat.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorSort.cuh"
+#include "../generic/THCTensorSort.cu"
+#include "../THCGenerateFloatType.h"
diff --git a/lib/THC/generated/THCTensorSortHalf.cu b/lib/THC/generated/THCTensorSortHalf.cu
new file mode 100644
index 0000000..e2025f2
--- /dev/null
+++ b/lib/THC/generated/THCTensorSortHalf.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorSort.cuh"
+#include "../generic/THCTensorSort.cu"
+#include "../THCGenerateHalfType.h"
diff --git a/lib/THC/generated/THCTensorSortInt.cu b/lib/THC/generated/THCTensorSortInt.cu
new file mode 100644
index 0000000..af7a153
--- /dev/null
+++ b/lib/THC/generated/THCTensorSortInt.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorSort.cuh"
+#include "../generic/THCTensorSort.cu"
+#include "../THCGenerateIntType.h"
diff --git a/lib/THC/generated/THCTensorSortLong.cu b/lib/THC/generated/THCTensorSortLong.cu
new file mode 100644
index 0000000..c65ca26
--- /dev/null
+++ b/lib/THC/generated/THCTensorSortLong.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorSort.cuh"
+#include "../generic/THCTensorSort.cu"
+#include "../THCGenerateLongType.h"
diff --git a/lib/THC/generated/THCTensorSortShort.cu b/lib/THC/generated/THCTensorSortShort.cu
new file mode 100644
index 0000000..03e1a9a
--- /dev/null
+++ b/lib/THC/generated/THCTensorSortShort.cu
@@ -0,0 +1,3 @@
+#include "../THCTensorSort.cuh"
+#include "../generic/THCTensorSort.cu"
+#include "../THCGenerateShortType.h"
diff --git a/lib/THC/generic/THCDeviceTensorUtils.cu b/lib/THC/generic/THCDeviceTensorUtils.cu
new file mode 100644
index 0000000..db6b5e7
--- /dev/null
+++ b/lib/THC/generic/THCDeviceTensorUtils.cu
@@ -0,0 +1,55 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCDeviceTensorUtils.cu"
+#else
+
+/// Constructs a THCDeviceTensor initialized from a THCudaTensor. Will
+/// error if the dimensionality does not match exactly.
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>
+toDeviceTensor(THCState* state, THCTensor* t);
+
+template <typename T, int Dim, typename IndexT>
+THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
+toDeviceTensor(THCState* state, THCTensor* t) {
+  return toDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>(state, t);
+}
+
+template <typename T, int Dim>
+THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
+toDeviceTensor(THCState* state, THCTensor* t) {
+  return toDeviceTensor<T, Dim, int, DefaultPtrTraits>(state, t);
+}
+
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>
+toDeviceTensor(THCState* state, THCTensor* t) {
+  if (Dim != THCTensor_(nDimension)(state, t)) {
+    THError("THCudaTensor dimension mismatch");
+  }
+  // Determine the maximum achievable offset into the tensor; it must fit
+  // within `IndexT` for this conversion to be usable.
+  ptrdiff_t maxOffset = 0;
+  IndexT sizes[Dim];
+  IndexT strides[Dim];
+
+  for (int i = 0; i < Dim; ++i) {
+    long size = THCTensor_(size)(state, t, i);
+    long stride = THCTensor_(stride)(state, t, i);
+
+    maxOffset += (size - 1) * stride;
+
+    sizes[i] = (IndexT) size;
+    strides[i] = (IndexT) stride;
+  }
+
+  if (maxOffset > std::numeric_limits<IndexT>::max()) {
+    THError("THCudaTensor sizes too large for THCDeviceTensor conversion");
+  }
+
+  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(
+    THCTensor_(data)(state, t), sizes, strides);
+}
+
+#endif
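
A hedged usage sketch for the specializations above, written from the perspective of a generic .cu file instantiated for float; the kernel and function names here are hypothetical and are not part of the upstream sources:

/* Hypothetical caller (float instantiation). toDeviceTensor THErrors if the
   tensor is not 2-D or if its maximum offset does not fit in the chosen
   index type (int in this sketch). */
__global__ void scaleKernel(THCDeviceTensor<float, 2> t, float alpha) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < t.getSize(0) && col < t.getSize(1)) {
    float v = t[row][col];
    t[row][col] = v * alpha;
  }
}

void scaleInPlace(THCState* state, THCudaTensor* tensor, float alpha) {
  THCDeviceTensor<float, 2> view = toDeviceTensor<float, 2>(state, tensor);
  dim3 block(16, 16);
  dim3 grid((view.getSize(1) + block.x - 1) / block.x,
            (view.getSize(0) + block.y - 1) / block.y);
  scaleKernel<<<grid, block, 0, THCState_getCurrentStream(state)>>>(view, alpha);
}
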
diff --git a/lib/THC/generic/THCStorage.c b/lib/THC/generic/THCStorage.c
new file mode 100644
index 0000000..8a6cede
--- /dev/null
+++ b/lib/THC/generic/THCStorage.c
@@ -0,0 +1,190 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCStorage.c"
+#else
+
+real* THCStorage_(data)(THCState *state, const THCStorage *self)
+{
+  return self->data;
+}
+
+ptrdiff_t THCStorage_(size)(THCState *state, const THCStorage *self)
+{
+  return self->size;
+}
+
+int THCStorage_(elementSize)(THCState *state)
+{
+  return sizeof(real);
+}
+
+void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real value)
+{
+  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
+  THCudaCheck(cudaMemcpy(self->data + index, &value, sizeof(real),
+                         cudaMemcpyHostToDevice));
+}
+
+real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index)
+{
+  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
+  real value;
+  THCudaCheck(cudaMemcpy(&value, self->data + index, sizeof(real),
+                         cudaMemcpyDeviceToHost));
+  return value;
+}
+
+THCStorage* THCStorage_(new)(THCState *state)
+{
+  return THCStorage_(newWithSize)(state, 0);
+}
+
+THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size)
+{
+  return THCStorage_(newWithAllocator)(
+    state, size,
+    state->cudaDeviceAllocator,
+    state->cudaDeviceAllocator->state);
+}
+
+THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size,
+                                          THCDeviceAllocator* allocator,
+                                          void* allocatorContext)
+{
+  THArgCheck(size >= 0, 2, "invalid size");
+  int device;
+  THCudaCheck(cudaGetDevice(&device));
+
+  THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage));
+  memset(storage, 0, sizeof(THCStorage));
+  storage->refcount = 1;
+  storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM;
+  storage->allocator = allocator;
+  storage->allocatorContext = allocatorContext;
+  storage->size = size;
+  storage->device = device;
+
+  if(size > 0)
+  {
+    // update heap *before* attempting malloc, to free space for the malloc
+    THCHeapUpdate(state, size * sizeof(real));
+    cudaError_t err =
+      (*allocator->malloc)(allocatorContext,
+                           (void**)&(storage->data),
+                           size * sizeof(real),
+                           THCState_getCurrentStream(state));
+    if(err != cudaSuccess){
+      THCHeapUpdate(state, -size * sizeof(real));
+      free(storage);
+    }
+    THCudaCheck(err);
+  } else {
+    storage->data = NULL;
+  }
+  return storage;
+}
+
+THCStorage* THCStorage_(newWithSize1)(THCState *state, real data0)
+{
+  THCStorage *self = THCStorage_(newWithSize)(state, 1);
+  THCStorage_(set)(state, self, 0, data0);
+  return self;
+}
+
+THCStorage* THCStorage_(newWithSize2)(THCState *state, real data0, real data1)
+{
+  THCStorage *self = THCStorage_(newWithSize)(state, 2);
+  THCStorage_(set)(state, self, 0, data0);
+  THCStorage_(set)(state, self, 1, data1);
+  return self;
+}
+
+THCStorage* THCStorage_(newWithSize3)(THCState *state, real data0, real data1, real data2)
+{
+  THCStorage *self = THCStorage_(newWithSize)(state, 3);
+  THCStorage_(set)(state, self, 0, data0);
+  THCStorage_(set)(state, self, 1, data1);
+  THCStorage_(set)(state, self, 2, data2);
+  return self;
+}
+
+THCStorage* THCStorage_(newWithSize4)(THCState *state, real data0, real data1, real data2, real data3)
+{
+  THCStorage *self = THCStorage_(newWithSize)(state, 4);
+  THCStorage_(set)(state, self, 0, data0);
+  THCStorage_(set)(state, self, 1, data1);
+  THCStorage_(set)(state, self, 2, data2);
+  THCStorage_(set)(state, self, 3, data3);
+  return self;
+}
+
+THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *fileName, ptrdiff_t size, int isShared)
+{
+  THError("not available yet for THCStorage");
+  return NULL;
+}
+
+THCStorage* THCStorage_(newWithData)(THCState *state, real *data, ptrdiff_t size)
+{
+  return THCStorage_(newWithDataAndAllocator)(state, data, size,
+                                              state->cudaDeviceAllocator,
+                                              state->cudaDeviceAllocator->state);
+}
+
+THCStorage* THCStorage_(newWithDataAndAllocator)(
+  THCState *state, real *data, ptrdiff_t size,
+  THCDeviceAllocator *allocator, void *allocatorContext) {
+  THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage));
+  memset(storage, 0, sizeof(THCStorage));
+  storage->data = data;
+  storage->size = size;
+  storage->refcount = 1;
+  storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM;
+  storage->allocator = allocator;
+  storage->allocatorContext = allocatorContext;
+  int device;
+  if (data) {
+    struct cudaPointerAttributes attr;
+    THCudaCheck(cudaPointerGetAttributes(&attr, data));
+    device = attr.device;
+  } else {
+    THCudaCheck(cudaGetDevice(&device));
+  }
+  storage->device = device;
+  return storage;
+}
+
+void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag)
+{
+  storage->flag |= flag;
+}
+
+void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char flag)
+{
+  storage->flag &= ~flag;
+}
+
+void THCStorage_(retain)(THCState *state, THCStorage *self)
+{
+  if(self && (self->flag & TH_STORAGE_REFCOUNTED))
+    THAtomicIncrementRef(&self->refcount);
+}
+
+void THCStorage_(free)(THCState *state, THCStorage *self)
+{
+  if(!(self->flag & TH_STORAGE_REFCOUNTED))
+    return;
+
+  if (THAtomicDecrementRef(&self->refcount))
+  {
+    if(self->flag & TH_STORAGE_FREEMEM) {
+      THCHeapUpdate(state, -self->size * sizeof(real));
+      THCudaCheck(
+        (*self->allocator->free)(self->allocatorContext, self->data));
+    }
+    if(self->flag & TH_STORAGE_VIEW) {
+      THCStorage_(free)(state, self->view);
+    }
+    THFree(self);
+  }
+}
+#endif
diff --git a/lib/THC/generic/THCStorage.cu b/lib/THC/generic/THCStorage.cu
new file mode 100644
index 0000000..f14b006
--- /dev/null
+++ b/lib/THC/generic/THCStorage.cu
@@ -0,0 +1,94 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCStorage.cu"
+#else
+
+void THCStorage_(fill)(THCState *state, THCStorage *self, real value)
+{
+  THCThrustAllocator thrustAlloc(state);
+  thrust::device_ptr<real> self_data(self->data);
+  thrust::fill(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#endif
+    self_data, self_data+self->size, value);
+}
+
+void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size)
+{
+  THArgCheck(size >= 0, 2, "invalid size");
+  THAssert(self->allocator != NULL);
+  int device;
+  THCudaCheck(cudaGetDevice(&device));
+
+  if(!(self->flag & TH_STORAGE_RESIZABLE))
+    THError("Trying to resize storage that is not resizable");
+
+  if (self->allocator->realloc) {
+    THCHeapUpdate(state, (size - self->size) * sizeof(real));
+    cudaError_t err = (*self->allocator->realloc)(
+      self->allocatorContext,
+      (void**)&(self->data),
+      self->size * sizeof(real),
+      size * sizeof(real), THCState_getCurrentStream(state));
+    if (err != cudaSuccess) {
+      THCHeapUpdate(state, (self->size - size) * sizeof(real));
+      THCudaCheck(err);
+    }
+    self->size = size;
+    self->device = device;
+    return;
+  }
+
+  if(size == 0)
+  {
+    if(self->flag & TH_STORAGE_FREEMEM) {
+      THCudaCheck(
+        (*self->allocator->free)(self->allocatorContext, self->data));
+      THCHeapUpdate(state, -self->size * sizeof(real));
+    }
+    self->data = NULL;
+    self->size = 0;
+    self->device = device;
+  }
+  else
+  {
+    real *data = NULL;
+    // update heap *before* attempting malloc, to free space for the malloc
+    THCHeapUpdate(state, size * sizeof(real));
+    cudaError_t err =
+      (*self->allocator->malloc)(self->allocatorContext,
+                                 (void**)&(data),
+                                 size * sizeof(real),
+                                 THCState_getCurrentStream(state));
+    if(err != cudaSuccess) {
+      THCHeapUpdate(state, -size * sizeof(real));
+    }
+    THCudaCheck(err);
+
+    if (self->data) {
+      // Enable p2p access when the memcpy is across devices
+      THCState_getPeerToPeerAccess(state, device, self->device);
+
+      THCudaCheck(cudaMemcpyAsync(data,
+                                  self->data,
+                                  THMin(self->size, size) * sizeof(real),
+                                  cudaMemcpyDeviceToDevice,
+                                  THCState_getCurrentStream(state)));
+      if(self->flag & TH_STORAGE_FREEMEM) {
+        THCudaCheck(
+          (*self->allocator->free)(self->allocatorContext, self->data));
+        THCHeapUpdate(state, -self->size * sizeof(real));
+      }
+    }
+
+    self->data = data;
+    self->size = size;
+    self->device = device;
+  }
+}
+
+THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) {
+  return storage->device;
+}
+
+#endif
diff --git a/lib/THC/generic/THCStorage.h b/lib/THC/generic/THCStorage.h
new file mode 100644
index 0000000..e768ec6
--- /dev/null
+++ b/lib/THC/generic/THCStorage.h
@@ -0,0 +1,60 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCStorage.h"
+#else
+
+#define TH_STORAGE_REFCOUNTED 1
+#define TH_STORAGE_RESIZABLE  2
+#define TH_STORAGE_FREEMEM    4
+
+typedef struct THCStorage
+{
+    real *data;
+    ptrdiff_t size;
+    int refcount;
+    char flag;
+    THCDeviceAllocator *allocator;
+    void *allocatorContext;
+    struct THCStorage *view;
+    int device;
+} THCStorage;
+
+
+THC_API real* THCStorage_(data)(THCState *state, const THCStorage*);
+THC_API ptrdiff_t THCStorage_(size)(THCState *state, const THCStorage*);
+THC_API int THCStorage_(elementSize)(THCState *state);
+
+/* slow access -- checks everything */
+THC_API void THCStorage_(set)(THCState *state, THCStorage*, ptrdiff_t, real);
+THC_API real THCStorage_(get)(THCState *state, const THCStorage*, ptrdiff_t);
+
+THC_API THCStorage* THCStorage_(new)(THCState *state);
+THC_API THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size);
+THC_API THCStorage* THCStorage_(newWithSize1)(THCState *state, real);
+THC_API THCStorage* THCStorage_(newWithSize2)(THCState *state, real, real);
+THC_API THCStorage* THCStorage_(newWithSize3)(THCState *state, real, real, real);
+THC_API THCStorage* THCStorage_(newWithSize4)(THCState *state, real, real, real, real);
+THC_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *filename, ptrdiff_t size, int shared);
+
+/* takes ownership of data */
+THC_API THCStorage* THCStorage_(newWithData)(THCState *state, real *data, ptrdiff_t size);
+
+THC_API THCStorage* THCStorage_(newWithAllocator)(
+  THCState *state, ptrdiff_t size,
+  THCDeviceAllocator* allocator,
+  void *allocatorContext);
+THC_API THCStorage* THCStorage_(newWithDataAndAllocator)(
+  THCState *state, real* data, ptrdiff_t size,
+  THCDeviceAllocator* allocator,
+  void *allocatorContext);
+
+THC_API void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag);
+THC_API void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char flag);
+THC_API void THCStorage_(retain)(THCState *state, THCStorage *storage);
+
+THC_API void THCStorage_(free)(THCState *state, THCStorage *storage);
+THC_API void THCStorage_(resize)(THCState *state, THCStorage *storage, ptrdiff_t size);
+THC_API void THCStorage_(fill)(THCState *state, THCStorage *storage, real value);
+
+THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage);
+
+#endif
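
The THCStorage_(NAME) declarations in this header are stamped out once per element type by the THCGenerate*Type.h headers. As a rough guide, and assuming the name substitutions visible elsewhere in this diff (THCudaStorage for float, THCudaDoubleStorage for double), two of the declarations instantiate to:

    /* float instantiation (real = float) -- assumed expansion, shown for orientation */
    THC_API THCudaStorage* THCudaStorage_newWithSize(THCState *state, ptrdiff_t size);
    THC_API void           THCudaStorage_fill(THCState *state, THCudaStorage *storage, float value);

    /* double instantiation (real = double) */
    THC_API THCudaDoubleStorage* THCudaDoubleStorage_newWithSize(THCState *state, ptrdiff_t size);
    THC_API void                 THCudaDoubleStorage_fill(THCState *state, THCudaDoubleStorage *storage, double value);
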
diff --git a/lib/THC/generic/THCStorageCopy.c b/lib/THC/generic/THCStorageCopy.c
new file mode 100644
index 0000000..1306906
--- /dev/null
+++ b/lib/THC/generic/THCStorageCopy.c
@@ -0,0 +1,60 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCStorageCopy.c"
+#else
+
+void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src)
+{
+  THArgCheck(self->size == src->size, 2, "size does not match");
+  THCudaCheck(cudaMemcpy(self->data, src->data, self->size * sizeof(real), cudaMemcpyHostToDevice));
+}
+
+#define TH_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC)                          \
+void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src)  \
+{                                                                      \
+  THCTensor* selfTensor =                                              \
+      THCTensor_(newWithStorage1d)(state, self, 0, self->size, 1);     \
+  struct TH##TYPEC##Tensor* srcTensor =                                \
+      TH##TYPEC##Tensor_newWithStorage1d(src, 0, src->size, 1);        \
+  THCTensor_(copy##TYPEC)(state, selfTensor, srcTensor);               \
+  TH##TYPEC##Tensor_free(srcTensor);                                   \
+  THCTensor_(free)(state, selfTensor);                                 \
+}
+TH_CUDA_STORAGE_IMPLEMENT_COPY(Byte)
+TH_CUDA_STORAGE_IMPLEMENT_COPY(Char)
+TH_CUDA_STORAGE_IMPLEMENT_COPY(Short)
+TH_CUDA_STORAGE_IMPLEMENT_COPY(Int)
+TH_CUDA_STORAGE_IMPLEMENT_COPY(Long)
+TH_CUDA_STORAGE_IMPLEMENT_COPY(Float)
+TH_CUDA_STORAGE_IMPLEMENT_COPY(Half)
+TH_CUDA_STORAGE_IMPLEMENT_COPY(Double)
+
+void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *src)
+{
+  THArgCheck(self->size == src->size, 2, "size does not match");
+  THCudaCheck(cudaMemcpy(self->data, src->data, self->size * sizeof(real), cudaMemcpyDeviceToHost));
+}
+
+#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC)                             \
+void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \
+{                                                                           \
+  TH##TYPEC##Tensor* selfTensor =                                           \
+      TH##TYPEC##Tensor_newWithStorage1d(self, 0, self->size, 1);           \
+  struct THCTensor* srcTensor =                                             \
+      THCTensor_(newWithStorage1d)(state, src, 0, src->size, 1);            \
+  TH_CONCAT_4(TH,TYPEC,Tensor_copyCuda,Real)(state, selfTensor, srcTensor); \
+  THCTensor_(free)(state, srcTensor);                                       \
+  TH##TYPEC##Tensor_free(selfTensor);                                   \
+}
+TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Byte)
+TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Char)
+TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Short)
+TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Int)
+TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Long)
+TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Float)
+TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Half)
+TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Double)
+
+#undef TH_CUDA_STORAGE_IMPLEMENT_COPY
+#undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO
+
+#endif
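
For orientation: TH_CUDA_STORAGE_IMPLEMENT_COPY routes every host-to-device storage copy through temporary 1-d tensor views, so the typed tensor copy kernels do the element conversion. Expanded by hand for the float instantiation with TYPEC = Byte (a sketch based on the assumed type-macro substitutions, not literal preprocessor output):

    void THCudaStorage_copyByte(THCState *state, THCudaStorage *self,
                                struct THByteStorage *src)
    {
      /* wrap both storages in 1-d tensors and reuse the tensor copy path */
      THCudaTensor *selfTensor =
          THCudaTensor_newWithStorage1d(state, self, 0, self->size, 1);
      struct THByteTensor *srcTensor =
          THByteTensor_newWithStorage1d(src, 0, src->size, 1);
      THCudaTensor_copyByte(state, selfTensor, srcTensor);
      THByteTensor_free(srcTensor);
      THCudaTensor_free(state, selfTensor);
    }
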
diff --git a/lib/THC/generic/THCStorageCopy.cu b/lib/THC/generic/THCStorageCopy.cu
new file mode 100644
index 0000000..35f27dd
--- /dev/null
+++ b/lib/THC/generic/THCStorageCopy.cu
@@ -0,0 +1,46 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCStorageCopy.cu"
+#else
+
+void THCStorage_(rawCopy)(THCState *state, THCStorage *self, real *src)
+{
+  THCudaCheck(cudaMemcpyAsync(self->data, src, self->size * sizeof(real), cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state)));
+}
+
+// conversions are delegated to THCTensor implementation
+#define THC_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC,TYPECUDA)                                 \
+void THCStorage_(copyCuda##TYPEC)(THCState *state, THCStorage *self, struct THCuda##TYPECUDA##Storage *src)  \
+{                                                                                       \
+  THArgCheck(self->size == src->size, 2, "size does not match");                        \
+  THCTensor* selfTensor = THCTensor_(newWithStorage1d)(state, self, 0, self->size, 1);  \
+  struct THCuda##TYPECUDA##Tensor* srcTensor =                                          \
+      THCuda##TYPECUDA##Tensor_newWithStorage1d(state, src, 0, src->size, 1);           \
+  THCTensor_(copyCuda##TYPEC)(state, selfTensor, srcTensor);                            \
+  THCuda##TYPECUDA##Tensor_free(state, srcTensor);                                      \
+  THCTensor_(free)(state, selfTensor);                                                  \
+}
+
+THC_CUDA_STORAGE_IMPLEMENT_COPY(Byte,Byte)
+THC_CUDA_STORAGE_IMPLEMENT_COPY(Char,Char)
+THC_CUDA_STORAGE_IMPLEMENT_COPY(Short,Short)
+THC_CUDA_STORAGE_IMPLEMENT_COPY(Int,Int)
+THC_CUDA_STORAGE_IMPLEMENT_COPY(Long,Long)
+THC_CUDA_STORAGE_IMPLEMENT_COPY(Float,)  // i.e. float
+THC_CUDA_STORAGE_IMPLEMENT_COPY(Double,Double)
+#ifdef CUDA_HALF_TENSOR
+THC_CUDA_STORAGE_IMPLEMENT_COPY(Half,Half)
+#endif
+
+#undef THC_CUDA_STORAGE_IMPLEMENT_COPY
+
+void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src)
+{
+  THCStorage_(TH_CONCAT_2(copyCuda, Real))(state, self, src);
+}
+
+void THCStorage_(copy)(THCState *state, THCStorage *self, THCStorage *src)
+{
+  THCStorage_(copyCuda)(state, self, src);
+}
+
+#endif
diff --git a/lib/THC/generic/THCStorageCopy.h b/lib/THC/generic/THCStorageCopy.h
new file mode 100644
index 0000000..7a4ef6b
--- /dev/null
+++ b/lib/THC/generic/THCStorageCopy.h
@@ -0,0 +1,42 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCStorageCopy.h"
+#else
+
+/* Support for copy between different Storage types */
+
+THC_API void THCStorage_(rawCopy)(THCState *state, THCStorage *storage, real *src);
+THC_API void THCStorage_(copy)(THCState *state, THCStorage *storage, THCStorage *src);
+THC_API void THCStorage_(copyByte)(THCState *state, THCStorage *storage, struct THByteStorage *src);
+THC_API void THCStorage_(copyChar)(THCState *state, THCStorage *storage, struct THCharStorage *src);
+THC_API void THCStorage_(copyShort)(THCState *state, THCStorage *storage, struct THShortStorage *src);
+THC_API void THCStorage_(copyInt)(THCState *state, THCStorage *storage, struct THIntStorage *src);
+THC_API void THCStorage_(copyLong)(THCState *state, THCStorage *storage, struct THLongStorage *src);
+THC_API void THCStorage_(copyFloat)(THCState *state, THCStorage *storage, struct THFloatStorage *src);
+THC_API void THCStorage_(copyDouble)(THCState *state, THCStorage *storage, struct THDoubleStorage *src);
+THC_API void THCStorage_(copyHalf)(THCState *state, THCStorage *storage, struct THHalfStorage *src);
+
+THC_API void THCStorage_(copyCudaByte)(THCState *state, THCStorage *storage, struct THCudaByteStorage *src);
+THC_API void THCStorage_(copyCudaChar)(THCState *state, THCStorage *storage, struct THCudaCharStorage *src);
+THC_API void THCStorage_(copyCudaShort)(THCState *state, THCStorage *storage, struct THCudaShortStorage *src);
+THC_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, struct THCudaIntStorage *src);
+THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src);
+THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src);
+THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src);
+#ifdef CUDA_HALF_TENSOR
+THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src);
+#endif
+
+THC_API void TH_CONCAT_2(THByteStorage_copyCuda  , Real)(THCState *state, THByteStorage *self, struct THCStorage *src);
+THC_API void TH_CONCAT_2(THCharStorage_copyCuda  , Real)(THCState *state, THCharStorage *self, struct THCStorage *src);
+THC_API void TH_CONCAT_2(THShortStorage_copyCuda , Real)(THCState *state, THShortStorage *self, struct THCStorage *src);
+THC_API void TH_CONCAT_2(THIntStorage_copyCuda   , Real)(THCState *state, THIntStorage *self, struct THCStorage *src);
+THC_API void TH_CONCAT_2(THLongStorage_copyCuda  , Real)(THCState *state, THLongStorage *self, struct THCStorage *src);
+THC_API void TH_CONCAT_2(THFloatStorage_copyCuda , Real)(THCState *state, THFloatStorage *self, struct THCStorage *src);
+THC_API void TH_CONCAT_2(THDoubleStorage_copyCuda, Real)(THCState *state, THDoubleStorage *self, struct THCStorage *src);
+THC_API void TH_CONCAT_2(THHalfStorage_copyCuda, Real)(THCState *state, THHalfStorage *self, struct THCStorage *src);
+
+THC_API void THStorage_(copyCuda)(THCState *state, THStorage *self, THCStorage *src);
+THC_API void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src);
+THC_API void THCStorage_(copyCPU)(THCState *state, THCStorage *self, THStorage *src);
+
+#endif
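
The TH_CONCAT_2 declarations near the end of this header paste the element name (Real) onto the CPU-side storage functions, so the float instantiation declares names such as the following (assumed expansion, for orientation only):

    THC_API void THByteStorage_copyCudaFloat(THCState *state, THByteStorage *self,
                                             struct THCudaStorage *src);
    THC_API void THFloatStorage_copyCudaFloat(THCState *state, THFloatStorage *self,
                                              struct THCudaStorage *src);
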
diff --git a/lib/THC/generic/THCTensor.c b/lib/THC/generic/THCTensor.c
new file mode 100644
index 0000000..1770535
--- /dev/null
+++ b/lib/THC/generic/THCTensor.c
@@ -0,0 +1,858 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensor.c"
+#else
+
+/**** access methods ****/
+THCStorage *THCTensor_(storage)(THCState *state, const THCTensor *self)
+{
+  return self->storage;
+}
+
+ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self)
+{
+  return self->storageOffset;
+}
+
+int THCTensor_(nDimension)(THCState *state, const THCTensor *self)
+{
+  return self->nDimension;
+}
+
+long THCTensor_(size)(THCState *state, const THCTensor *self, int dim)
+{
+  THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "out of range");
+  return self->size[dim];
+}
+
+long THCTensor_(stride)(THCState *state, const THCTensor *self, int dim)
+{
+  THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "out of range");
+  return self->stride[dim];
+}
+
+THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self)
+{
+  THLongStorage *size = THLongStorage_newWithSize(self->nDimension);
+  THLongStorage_rawCopy(size, self->size);
+  return size;
+}
+
+THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self)
+{
+  THLongStorage *stride = THLongStorage_newWithSize(self->nDimension);
+  THLongStorage_rawCopy(stride, self->stride);
+  return stride;
+}
+
+real *THCTensor_(data)(THCState *state, const THCTensor *self)
+{
+  if(self->storage)
+    return (self->storage->data+self->storageOffset);
+  else
+    return NULL;
+}
+
+void THCTensor_(setFlag)(THCState *state, THCTensor *self, const char flag)
+{
+  self->flag |= flag;
+}
+
+void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag)
+{
+  self->flag &= ~flag;
+}
+
+/**** creation methods ****/
+
+static void THCTensor_(rawInit)(THCState *state, THCTensor *self);
+static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride);
+
+
+/* Empty init */
+THCTensor *THCTensor_(new)(THCState *state)
+{
+  THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
+  THCTensor_(rawInit)(state, self);
+  return self;
+}
+
+/* Pointer-copy init */
+THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor)
+{
+  THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
+  THCTensor_(rawInit)(state, self);
+  THCTensor_(rawSet)(state,
+                      self,
+                      tensor->storage,
+                      tensor->storageOffset,
+                      tensor->nDimension,
+                      tensor->size,
+                      tensor->stride);
+  return self;
+}
+
+/* Storage init */
+THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, THLongStorage *size, THLongStorage *stride)
+{
+  THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
+  if(size && stride)
+    THArgCheck(size->size == stride->size, 4, "inconsistent size");
+
+  THCTensor_(rawInit)(state, self);
+  THCTensor_(rawSet)(state,
+                      self,
+                      storage,
+                      storageOffset,
+                      (size ? size->size : (stride ? stride->size : 0)),
+                      (size ? size->data : NULL),
+                      (stride ? stride->data : NULL));
+
+  return self;
+}
+THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset,
+                               long size0, long stride0)
+{
+  return THCTensor_(newWithStorage4d)(state, storage, storageOffset, size0, stride0, -1, -1,  -1, -1,  -1, -1);
+}
+
+THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset,
+                               long size0, long stride0,
+                               long size1, long stride1)
+{
+  return THCTensor_(newWithStorage4d)(state, storage, storageOffset, size0, stride0, size1, stride1,  -1, -1,  -1, -1);
+}
+
+THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset,
+                               long size0, long stride0,
+                               long size1, long stride1,
+                               long size2, long stride2)
+{
+  return THCTensor_(newWithStorage4d)(state, storage, storageOffset, size0, stride0, size1, stride1,  size2, stride2,  -1, -1);
+}
+
+THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset,
+                               long size0, long stride0,
+                               long size1, long stride1,
+                               long size2, long stride2,
+                               long size3, long stride3)
+{
+  long size[4] = {size0, size1, size2, size3};
+  long stride[4] = {stride0, stride1, stride2, stride3};
+
+  THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
+  THCTensor_(rawInit)(state, self);
+  THCTensor_(rawSet)(state, self, storage, storageOffset, 4, size, stride);
+
+  return self;
+}
+
+THCTensor *THCTensor_(newWithSize)(THCState *state, THLongStorage *size, THLongStorage *stride)
+{
+  return THCTensor_(newWithStorage)(state, NULL, 0, size, stride);
+}
+
+THCTensor *THCTensor_(newWithSize1d)(THCState *state, long size0)
+{
+  return THCTensor_(newWithSize4d)(state, size0, -1, -1, -1);
+}
+
+THCTensor *THCTensor_(newWithSize2d)(THCState *state, long size0, long size1)
+{
+  return THCTensor_(newWithSize4d)(state, size0, size1, -1, -1);
+}
+
+THCTensor *THCTensor_(newWithSize3d)(THCState *state, long size0, long size1, long size2)
+{
+  return THCTensor_(newWithSize4d)(state, size0, size1, size2, -1);
+}
+
+THCTensor *THCTensor_(newWithSize4d)(THCState *state, long size0, long size1, long size2, long size3)
+{
+  long size[4] = {size0, size1, size2, size3};
+
+  THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
+  THCTensor_(rawInit)(state, self);
+  THCTensor_(rawResize)(state, self, 4, size, NULL);
+
+  return self;
+}
+
+THCTensor *THCTensor_(newClone)(THCState *state, THCTensor *self)
+{
+  THCTensor *tensor = THCTensor_(new)(state);
+  THCTensor_(resizeAs)(state, tensor, self);
+  THCTensor_(copy)(state, tensor, self);
+  return tensor;
+}
+
+THCTensor *THCTensor_(newContiguous)(THCState *state, THCTensor *self)
+{
+  if(!THCTensor_(isContiguous)(state, self))
+    return THCTensor_(newClone)(state, self);
+  else
+  {
+    THCTensor_(retain)(state, self);
+    return self;
+  }
+}
+
+THCTensor *THCTensor_(newSelect)(THCState *state, THCTensor *tensor, int dimension_, long sliceIndex_)
+{
+  THCTensor *self = THCTensor_(newWithTensor)(state, tensor);
+  THCTensor_(select)(state, self, NULL, dimension_, sliceIndex_);
+  return self;
+}
+
+THCTensor *THCTensor_(newNarrow)(THCState *state, THCTensor *tensor, int dimension_, long firstIndex_, long size_)
+{
+  THCTensor *self = THCTensor_(newWithTensor)(state, tensor);
+  THCTensor_(narrow)(state, self, NULL, dimension_, firstIndex_, size_);
+  return self;
+}
+
+THCTensor *THCTensor_(newTranspose)(THCState *state, THCTensor *tensor, int dimension1_, int dimension2_)
+{
+  THCTensor *self = THCTensor_(newWithTensor)(state, tensor);
+  THCTensor_(transpose)(state, self, NULL, dimension1_, dimension2_);
+  return self;
+}
+
+THCTensor *THCTensor_(newUnfold)(THCState *state, THCTensor *tensor, int dimension_, long size_, long step_)
+{
+  THCTensor *self = THCTensor_(newWithTensor)(state, tensor);
+  THCTensor_(unfold)(state, self, NULL, dimension_, size_, step_);
+  return self;
+}
+
+/* Resize */
+void THCTensor_(resize)(THCState *state, THCTensor *self, THLongStorage *size, THLongStorage *stride)
+{
+  THArgCheck(size != NULL, 2, "invalid size");
+  if(stride)
+    THArgCheck(stride->size == size->size, 3, "invalid stride");
+
+  THCTensor_(rawResize)(state, self, size->size, size->data, (stride ? stride->data : NULL));
+}
+
+void THCTensor_(resizeAs)(THCState *state, THCTensor *self, THCTensor *src)
+{
+  int isSame = 0;
+  int d;
+  if(self->nDimension == src->nDimension)
+  {
+    isSame = 1;
+    for(d = 0; d < self->nDimension; d++)
+    {
+      if(self->size[d] != src->size[d])
+      {
+        isSame = 0;
+        break;
+      }
+    }
+  }
+
+  if(!isSame)
+    THCTensor_(rawResize)(state, self, src->nDimension, src->size, NULL);
+}
+
+void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, long size0)
+{
+  THCTensor_(resize4d)(state, tensor, size0, -1, -1, -1);
+}
+
+void THCTensor_(resize2d)(THCState *state, THCTensor *tensor, long size0, long size1)
+{
+  THCTensor_(resize4d)(state, tensor, size0, size1, -1, -1);
+}
+
+void THCTensor_(resize3d)(THCState *state, THCTensor *tensor, long size0, long size1, long size2)
+{
+  THCTensor_(resize4d)(state, tensor, size0, size1, size2, -1);
+}
+
+void THCTensor_(resize4d)(THCState *state, THCTensor *self, long size0, long size1, long size2, long size3)
+{
+  long size[4] = {size0, size1, size2, size3};
+
+  THCTensor_(rawResize)(state, self, 4, size, NULL);
+}
+
+void THCTensor_(resize5d)(THCState *state, THCTensor *self, long size0, long size1, long size2, long size3, long size4)
+{
+  long size[5] = {size0, size1, size2, size3, size4};

+
+  THCTensor_(rawResize)(state, self, 5, size, NULL);
+}
+
+void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src)
+{
+  if(self != src)
+    THCTensor_(rawSet)(state,
+                        self,
+                        src->storage,
+                        src->storageOffset,
+                        src->nDimension,
+                        src->size,
+                        src->stride);
+}
+
+void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_)
+{
+  if(size_ && stride_)
+    THArgCheck(size_->size == stride_->size, 5, "inconsistent size/stride sizes");
+
+  THCTensor_(rawSet)(state,
+                      self,
+                      storage_,
+                      storageOffset_,
+                      (size_ ? size_->size : (stride_ ? stride_->size : 0)),
+                      (size_ ? size_->data : NULL),
+                      (stride_ ? stride_->data : NULL));
+}
+
+void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
+                             long size0_, long stride0_)
+{
+  THCTensor_(setStorage4d)(state, self, storage_, storageOffset_,
+                            size0_, stride0_,
+                            -1, -1,
+                            -1, -1,
+                            -1, -1);
+}
+
+void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
+                             long size0_, long stride0_,
+                             long size1_, long stride1_)
+{
+  THCTensor_(setStorage4d)(state, self, storage_, storageOffset_,
+                            size0_, stride0_,
+                            size1_, stride1_,
+                            -1, -1,
+                            -1, -1);
+}
+
+void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
+                             long size0_, long stride0_,
+                             long size1_, long stride1_,
+                             long size2_, long stride2_)
+{
+  THCTensor_(setStorage4d)(state, self, storage_, storageOffset_,
+                            size0_, stride0_,
+                            size1_, stride1_,
+                            size2_, stride2_,
+                            -1, -1);
+}
+
+void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
+                             long size0_, long stride0_,
+                             long size1_, long stride1_,
+                             long size2_, long stride2_,
+                             long size3_, long stride3_)
+{
+
+  long size[4] = {size0_, size1_, size2_, size3_};
+  long stride[4] = {stride0_, stride1_, stride2_, stride3_};
+
+  THCTensor_(rawSet)(state, self, storage_, storageOffset_, 4, size, stride);
+}
+
+
+void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int dimension, long firstIndex, long size)
+{
+  if(!src)
+    src = self;
+
+  THArgCheck( (dimension >= 0) && (dimension < src->nDimension), 3, "out of range");
+  THArgCheck( (firstIndex >= 0) && (firstIndex < src->size[dimension]), 4, "out of range");
+  THArgCheck( (size > 0) && (firstIndex+size <= src->size[dimension]), 5, "out of range");
+
+  THCTensor_(set)(state, self, src);
+
+  if(firstIndex > 0)
+    self->storageOffset += firstIndex*self->stride[dimension];
+
+  self->size[dimension] = size;
+}
+
+void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int dimension, long sliceIndex)
+{
+  int d;
+
+  if(!src)
+    src = self;
+
+  THArgCheck(src->nDimension > 1, 1, "cannot select on a vector");
+  THArgCheck((dimension >= 0) && (dimension < src->nDimension), 3, "out of range");
+  THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size[dimension]), 4, "out of range");
+
+  THCTensor_(set)(state, self, src);
+  THCTensor_(narrow)(state, self, NULL, dimension, sliceIndex, 1);
+  for(d = dimension; d < self->nDimension-1; d++)
+  {
+    self->size[d] = self->size[d+1];
+    self->stride[d] = self->stride[d+1];
+  }
+  self->nDimension--;
+}
+
+void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int dimension1, int dimension2)
+{
+  long z;
+
+  if(!src)
+    src = self;
+
+  THArgCheck( (dimension1 >= 0) && (dimension1 < src->nDimension), 1, "out of range");
+  THArgCheck( (dimension2 >= 0) && (dimension2 < src->nDimension), 2, "out of range");
+
+  THCTensor_(set)(state, self, src);
+
+  if(dimension1 == dimension2)
+    return;
+
+  z = self->stride[dimension1];
+  self->stride[dimension1] = self->stride[dimension2];
+  self->stride[dimension2] = z;
+  z = self->size[dimension1];
+  self->size[dimension1] = self->size[dimension2];
+  self->size[dimension2] = z;
+}
+
+void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int dimension, long size, long step)
+{
+  long *newSize;
+  long *newStride;
+  int d;
+
+  if(!src)
+    src = self;
+
+  THArgCheck( (src->nDimension > 0), 1, "cannot unfold an empty tensor");
+  THArgCheck(dimension < src->nDimension, 2, "out of range");
+  THArgCheck(size <= src->size[dimension], 3, "out of range");
+  THArgCheck(step > 0, 4, "invalid step");
+
+  THCTensor_(set)(state, self, src);
+
+  newSize = (long*)THAlloc(sizeof(long)*(self->nDimension+1));
+  newStride = (long*)THAlloc(sizeof(long)*(self->nDimension+1));
+
+  newSize[self->nDimension] = size;
+  newStride[self->nDimension] = self->stride[dimension];
+  for(d = 0; d < self->nDimension; d++)
+  {
+    if(d == dimension)
+    {
+      newSize[d] = (self->size[d] - size) / step + 1;
+      newStride[d] = step*self->stride[d];
+    }
+    else
+    {
+      newSize[d] = self->size[d];
+      newStride[d] = self->stride[d];
+    }
+  }
+
+  THFree(self->size);
+  THFree(self->stride);
+
+  self->size = newSize;
+  self->stride = newStride;
+  self->nDimension++;
+}
+
+/* we have to handle the case where the result is a number */
+void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src)
+{
+  int ndim = 0;
+  int d;
+
+  if(!src)
+    src = self;
+
+  THCTensor_(set)(state, self, src);
+
+  for(d = 0; d < src->nDimension; d++)
+  {
+    if(src->size[d] != 1)
+    {
+      if(d != ndim)
+      {
+        self->size[ndim] = src->size[d];
+        self->stride[ndim] = src->stride[d];
+      }
+      ndim++;
+    }
+  }
+
+  /* right now, we do not handle 0-dimension tensors */
+  if(ndim == 0 && src->nDimension > 0)
+  {
+    self->size[0] = 1;
+    self->stride[0] = 1;
+    ndim = 1;
+  }
+  self->nDimension = ndim;
+}
+
+void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension)
+{
+  int d;
+
+  if(!src)
+    src = self;
+
+  THArgCheck(dimension < src->nDimension, 3, "dimension out of range");
+
+  THCTensor_(set)(state, self, src);
+
+  if(src->size[dimension] == 1 && src->nDimension > 1)
+  {
+    for(d = dimension; d < self->nDimension-1; d++)
+    {
+      self->size[d] = self->size[d+1];
+      self->stride[d] = self->stride[d+1];
+    }
+    self->nDimension--;
+  }
+}
+
+int THCTensor_(isContiguous)(THCState *state, const THCTensor *self)
+{
+  long z = 1;
+  int d;
+  for(d = self->nDimension-1; d >= 0; d--)
+  {
+    if(self->size[d] != 1)
+    {
+      if(self->stride[d] == z)
+        z *= self->size[d];
+      else
+        return 0;
+    }
+  }
+  return 1;
+}
+
+int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStorage *dims)
+{
+  int d;
+  if (self->nDimension != dims->size)
+    return 0;
+
+  for (d = 0; d < self->nDimension; ++d)
+  {
+    if (self->size[d] != dims->data[d])
+      return 0;
+  }
+  return 1;
+}
+
+int THCTensor_(isSetTo)(THCState *state, const THCTensor *self, const THCTensor *src)
+{
+  if (self->storage == src->storage &&
+      self->storageOffset == src->storageOffset &&
+      self->nDimension == src->nDimension)
+  {
+    int d;
+    for (d = 0; d < self->nDimension; ++d)
+    {
+      if (self->size[d] != src->size[d] || self->stride[d] != src->stride[d])
+        return 0;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor* src)
+{
+  int d;
+  if (self->nDimension != src->nDimension)
+    return 0;
+  for(d = 0; d < self->nDimension; ++d)
+  {
+    if(self->size[d] != src->size[d])
+      return 0;
+  }
+  return 1;
+}
+
+ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self)
+{
+  if(self->nDimension == 0)
+    return 0;
+  else
+  {
+    ptrdiff_t nElement = 1;
+    int d;
+    for(d = 0; d < self->nDimension; d++)
+      nElement *= self->size[d];
+    return nElement;
+  }
+}
+
+void THCTensor_(retain)(THCState *state, THCTensor *self)
+{
+  if(self->flag & TH_TENSOR_REFCOUNTED)
+    THAtomicIncrementRef(&self->refcount);
+}
+
+void THCTensor_(free)(THCState *state, THCTensor *self)
+{
+  if(!self)
+    return;
+
+  if(self->flag & TH_TENSOR_REFCOUNTED)
+  {
+    if(THAtomicDecrementRef(&self->refcount))
+    {
+      THFree(self->size);
+      THFree(self->stride);
+      if(self->storage)
+        THCStorage_(free)(state, self->storage);
+      THFree(self);
+    }
+  }
+}
+
+void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst)
+{
+  if(self != dst)
+    THCTensor_(copy)(state, dst, self);
+
+  THCTensor_(free)(state, self);
+}
+
+/*******************************************************************************/
+
+static void THCTensor_(rawInit)(THCState *state, THCTensor *self)
+{
+  self->refcount = 1;
+  self->storage = NULL;
+  self->storageOffset = 0;
+  self->size = NULL;
+  self->stride = NULL;
+  self->nDimension = 0;
+  self->flag = TH_TENSOR_REFCOUNTED;
+}
+
+static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
+{
+  /* storage */
+  if(self->storage != storage)
+  {
+    if(self->storage)
+      THCStorage_(free)(state, self->storage);
+
+    if(storage)
+    {
+      self->storage = storage;
+      THCStorage_(retain)(state, self->storage);
+    }
+    else
+      self->storage = NULL;
+  }
+
+  /* storageOffset */
+  if(storageOffset < 0)
+    THError("Tensor: invalid storage offset");
+  self->storageOffset = storageOffset;
+
+  /* size and stride */
+  THCTensor_(rawResize)(state, self, nDimension, size, stride);
+}
+
+void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride)
+{
+  int d;
+  int nDimension_;
+  ptrdiff_t totalSize;
+  int hascorrectsize = 1;
+
+  nDimension_ = 0;
+  for(d = 0; d < nDimension; d++)
+  {
+    if(size[d] > 0)
+    {
+      nDimension_++;
+      if((self->nDimension > d) && (size[d] != self->size[d]))
+        hascorrectsize = 0;
+
+      if((self->nDimension > d) && stride && (stride[d] >= 0) && (stride[d] != self->stride[d]))
+        hascorrectsize = 0;
+    }
+    else
+      break;
+  }
+  nDimension = nDimension_;
+
+  if(nDimension != self->nDimension)
+    hascorrectsize = 0;
+
+  if(hascorrectsize)
+    return;
+
+  if(nDimension > 0)
+  {
+    if(nDimension != self->nDimension)
+    {
+      self->size = (long*)THRealloc(self->size, sizeof(long)*nDimension);
+      self->stride = (long*)THRealloc(self->stride, sizeof(long)*nDimension);
+      self->nDimension = nDimension;
+    }
+
+    totalSize = 1;
+    for(d = self->nDimension-1; d >= 0; d--)
+    {
+      self->size[d] = size[d];
+      if(stride && (stride[d] >= 0) )
+        self->stride[d] = stride[d];
+      else
+      {
+        if(d == self->nDimension-1)
+          self->stride[d] = 1;
+        else
+          self->stride[d] = self->size[d+1]*self->stride[d+1];
+      }
+      totalSize += (self->size[d]-1)*self->stride[d];
+    }
+
+    if(totalSize+self->storageOffset > 0)
+    {
+      if(!self->storage)
+        self->storage = THCStorage_(new)(state);
+      if(totalSize+self->storageOffset > self->storage->size)
+        THCStorage_(resize)(state, self->storage, totalSize+self->storageOffset);
+    }
+  }
+  else
+    self->nDimension = 0;
+}
+
+void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, real value)
+{
+  THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension");
+  THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range");
+  THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value);
+}
+
+real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0)
+{
+  THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension");
+  THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range");
+  return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]);
+}
+
+void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, real value)
+{
+  THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions");
+  THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range");
+  THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value);
+}
+
+real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1)
+{
+  THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions");
+  THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range");
+  return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]);
+}
+
+void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, real value)
+{
+  THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions");
+  THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range");
+  THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value);
+}
+
+real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2)
+{
+  THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions");
+  THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range");
+  return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]);
+}
+
+void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, real value)
+{
+  THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions");
+  THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range");
+  THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value);
+}
+
+real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3)
+{
+  THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions");
+  THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range");
+  return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3]);
+}
+
+int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...)
+{
+  /* FIXME: remove this flag after any users stop using it since it is
+     now superseded by the runtime option */
+#ifdef DISABLE_CHECK_GPU
+  return 1;
+#else
+  int kernelP2PEnabled =
+    THCState_getKernelPeerToPeerAccessEnabled(state);
+
+  int curDev = -1;
+  THCudaCheck(cudaGetDevice(&curDev));
+  va_list(args);
+  va_start(args, nTensors);
+  int valid = 1;
+  for (unsigned int i = 0; i < nTensors; i++) {
+    THCTensor* tensor = va_arg(args, THCTensor*);
+    if (tensor == NULL) {
+      continue;
+    }
+    int tensorDev = THCTensor_(getDevice)(state, tensor);
+    if (tensorDev == -1) {
+      /* This tensor does not have GPU memory (empty) */
+      continue;
+    }
+
+    if (tensorDev != curDev) {
+      if (kernelP2PEnabled) {
+        /* Kernel p2p access is allowed */
+        /* Can `curDev` access `tensorDev` directly? */
+        if (!THCState_getPeerToPeerAccess(state, curDev, tensorDev)) {
+          valid = 0;
+          break;
+        }
+      } else {
+        /* No kernel p2p access allowed */
+        valid = 0;
+        break;
+      }
+    }
+  }
+
+  va_end(args);
+  return valid;
+#endif // DISABLE_CHECK_GPU
+}
+
+THCDescBuff THCTensor_(sizeDesc)(THCState *state, const THCTensor *tensor) {
+  const int L = THC_DESC_BUFF_LEN;
+  THCDescBuff buf;
+  char *str = buf.str;
+  int n = 0;
+  n += snprintf(str, L-n, "[");
+  int i;
+  for(i = 0; i < tensor->nDimension; i++) {
+    if(n >= L) break;
+    n += snprintf(str+n, L-n, "%ld", tensor->size[i]);
+    if(i < tensor->nDimension-1) {
+      n += snprintf(str+n, L-n, " x ");
+    }
+  }
+  if(n < L - 2) {
+    snprintf(str+n, L-n, "]");
+  } else {
+    snprintf(str+L-5, 5, "...]");
+  }
+  return buf;
+}
+
+#endif
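
THCTensor_(rawResize) above fills in row-major strides when none are supplied and grows the backing storage to the index of the last reachable element plus one. A short worked example, under the same Float-instantiation naming assumption as the earlier sketches:

    #include <stdio.h>
    #include "THC.h"

    /* Worked example: strides and storage size produced by rawResize for a
       2 x 3 x 4 float tensor. */
    static void show_strides(THCState *state)
    {
      THCudaTensor *t = THCudaTensor_newWithSize3d(state, 2, 3, 4);
      /* rawResize fills row-major strides {12, 4, 1}; the storage is grown to
         totalSize = 1 + (2-1)*12 + (3-1)*4 + (4-1)*1 = 24 elements. */
      printf("%ld %ld %ld\n",
             THCudaTensor_stride(state, t, 0),
             THCudaTensor_stride(state, t, 1),
             THCudaTensor_stride(state, t, 2));   /* prints: 12 4 1 */
      THCudaTensor_free(state, t);
    }
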
diff --git a/lib/THC/generic/THCTensor.cu b/lib/THC/generic/THCTensor.cu
new file mode 100644
index 0000000..29561ca
--- /dev/null
+++ b/lib/THC/generic/THCTensor.cu
@@ -0,0 +1,36 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensor.cu"
+#else
+
+cudaTextureObject_t THCTensor_(getTextureObject)(THCState *state, THCTensor *self)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  cudaTextureObject_t texObj;
+  struct cudaResourceDesc resDesc;
+  memset(&resDesc, 0, sizeof(resDesc));
+  resDesc.resType = cudaResourceTypeLinear;
+  resDesc.res.linear.devPtr = THCTensor_(data)(state, self);
+  resDesc.res.linear.sizeInBytes = THCTensor_(nElement)(state, self) * 4;
+  resDesc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0,
+                                                  cudaChannelFormatKindFloat);
+  struct cudaTextureDesc texDesc;
+  memset(&texDesc, 0, sizeof(texDesc));
+  cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+  cudaError errcode = cudaGetLastError();
+  if(errcode != cudaSuccess) {
+    if (THCTensor_(nElement)(state, self) > (1 << 27))
+      THError("Failed to create texture object, "
+              "nElement:%ld exceeds 27-bit addressing required for tex1Dfetch. Cuda Error: %s",
+              THCTensor_(nElement)(state, self), cudaGetErrorString(errcode));
+    else
+      THError("Failed to create texture object: %s", cudaGetErrorString(errcode));
+  }
+  return texObj;
+}
+
+THC_API int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) {
+  if (!tensor->storage) return -1;
+  return THCStorage_(getDevice)(state, tensor->storage);
+}
+
+#endif
diff --git a/lib/THC/generic/THCTensor.h b/lib/THC/generic/THCTensor.h
new file mode 100644
index 0000000..9cd4807
--- /dev/null
+++ b/lib/THC/generic/THCTensor.h
@@ -0,0 +1,133 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensor.h"
+#else
+
+#define TH_TENSOR_REFCOUNTED 1
+
+typedef struct THCTensor
+{
+    long *size;
+    long *stride;
+    int nDimension;
+
+    THCStorage *storage;
+    ptrdiff_t storageOffset;
+    int refcount;
+
+    char flag;
+
+} THCTensor;
+
+
+/**** access methods ****/
+THC_API THCStorage* THCTensor_(storage)(THCState *state, const THCTensor *self);
+THC_API ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self);
+THC_API int THCTensor_(nDimension)(THCState *state, const THCTensor *self);
+THC_API long THCTensor_(size)(THCState *state, const THCTensor *self, int dim);
+THC_API long THCTensor_(stride)(THCState *state, const THCTensor *self, int dim);
+THC_API THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self);
+THC_API THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self);
+THC_API real *THCTensor_(data)(THCState *state, const THCTensor *self);
+
+THC_API void THCTensor_(setFlag)(THCState *state, THCTensor *self, const char flag);
+THC_API void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag);
+
+
+/**** creation methods ****/
+THC_API THCTensor *THCTensor_(new)(THCState *state);
+THC_API THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor);
+/* stride might be NULL */
+THC_API THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_);
+THC_API THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_,
+                                long size0_, long stride0_);
+THC_API THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_,
+                                long size0_, long stride0_,
+                                long size1_, long stride1_);
+THC_API THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_,
+                                long size0_, long stride0_,
+                                long size1_, long stride1_,
+                                long size2_, long stride2_);
+THC_API THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_,
+                                long size0_, long stride0_,
+                                long size1_, long stride1_,
+                                long size2_, long stride2_,
+                                long size3_, long stride3_);
+
+/* stride might be NULL */
+THC_API THCTensor *THCTensor_(newWithSize)(THCState *state, THLongStorage *size_, THLongStorage *stride_);
+THC_API THCTensor *THCTensor_(newWithSize1d)(THCState *state, long size0_);
+THC_API THCTensor *THCTensor_(newWithSize2d)(THCState *state, long size0_, long size1_);
+THC_API THCTensor *THCTensor_(newWithSize3d)(THCState *state, long size0_, long size1_, long size2_);
+THC_API THCTensor *THCTensor_(newWithSize4d)(THCState *state, long size0_, long size1_, long size2_, long size3_);
+
+THC_API THCTensor *THCTensor_(newClone)(THCState *state, THCTensor *self);
+THC_API THCTensor *THCTensor_(newContiguous)(THCState *state, THCTensor *tensor);
+THC_API THCTensor *THCTensor_(newSelect)(THCState *state, THCTensor *tensor, int dimension_, long sliceIndex_);
+THC_API THCTensor *THCTensor_(newNarrow)(THCState *state, THCTensor *tensor, int dimension_, long firstIndex_, long size_);
+THC_API THCTensor *THCTensor_(newTranspose)(THCState *state, THCTensor *tensor, int dimension1_, int dimension2_);
+THC_API THCTensor *THCTensor_(newUnfold)(THCState *state, THCTensor *tensor, int dimension_, long size_, long step_);
+
+THC_API void THCTensor_(resize)(THCState *state, THCTensor *tensor, THLongStorage *size, THLongStorage *stride);
+THC_API void THCTensor_(resizeAs)(THCState *state, THCTensor *tensor, THCTensor *src);
+THC_API void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, long size0_);
+THC_API void THCTensor_(resize2d)(THCState *state, THCTensor *tensor, long size0_, long size1_);
+THC_API void THCTensor_(resize3d)(THCState *state, THCTensor *tensor, long size0_, long size1_, long size2_);
+THC_API void THCTensor_(resize4d)(THCState *state, THCTensor *tensor, long size0_, long size1_, long size2_, long size3_);
+THC_API void THCTensor_(resize5d)(THCState *state, THCTensor *tensor, long size0_, long size1_, long size2_, long size3_, long size4_);
+THC_API void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride);
+
+THC_API void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_);
+THC_API void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
+                                    long size0_, long stride0_);
+THC_API void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
+                                    long size0_, long stride0_,
+                                    long size1_, long stride1_);
+THC_API void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
+                                    long size0_, long stride0_,
+                                    long size1_, long stride1_,
+                                    long size2_, long stride2_);
+THC_API void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
+                                    long size0_, long stride0_,
+                                    long size1_, long stride1_,
+                                    long size2_, long stride2_,
+                                    long size3_, long stride3_);
+
+THC_API void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, long firstIndex_, long size_);
+THC_API void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, long sliceIndex_);
+THC_API void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int dimension1_, int dimension2_);
+THC_API void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, long size_, long step_);
+
+THC_API void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
+
+THC_API int THCTensor_(isContiguous)(THCState *state, const THCTensor *self);
+THC_API int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor *src);
+THC_API int THCTensor_(isSetTo)(THCState *state, const THCTensor *self, const THCTensor *src);
+THC_API int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStorage *dims);
+THC_API ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self);
+
+THC_API void THCTensor_(retain)(THCState *state, THCTensor *self);
+THC_API void THCTensor_(free)(THCState *state, THCTensor *self);
+THC_API void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst);
+
+/* Slow access methods [check everything] */
+THC_API void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, real value);
+THC_API void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, real value);
+THC_API void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, real value);
+THC_API void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, real value);
+
+THC_API real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0);
+THC_API real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1);
+THC_API real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2);
+THC_API real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3);
+
+/* CUDA-specific functions */
+THC_API cudaTextureObject_t THCTensor_(getTextureObject)(THCState *state, THCTensor *self);
+THC_API int THCTensor_(getDevice)(THCState *state, const THCTensor *self);
+THC_API int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...);
+
+/* debug methods */
+THC_API THCDescBuff THCTensor_(sizeDesc)(THCState *state, const THCTensor *tensor);
+
+#endif
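
narrow, select, transpose and unfold declared above never touch device memory: they retain the source storage and only rewrite offset, size and stride. A small sketch of what select() does to the metadata, assuming the Float instantiation names and an initialized THCState:

    #include "THC.h"

    /* Sketch: select() is a metadata-only view; it shares storage with the
       source and just adjusts offset/size/stride. */
    static void take_row(THCState *state)
    {
      THCudaTensor *m   = THCudaTensor_newWithSize2d(state, 4, 5);  /* strides {5, 1} */
      THCudaTensor *row = THCudaTensor_newSelect(state, m, 0, 1);
      /* row shares m's storage; storageOffset advanced by 1 * 5 = 5, and the
         selected dimension is dropped, leaving a size-5 vector. */
      THCudaTensor_free(state, row);
      THCudaTensor_free(state, m);
    }
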
diff --git a/lib/THC/generic/THCTensorCopy.c b/lib/THC/generic/THCTensorCopy.c
new file mode 100644
index 0000000..874a71e
--- /dev/null
+++ b/lib/THC/generic/THCTensorCopy.c
@@ -0,0 +1,169 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorCopy.c"
+#else
+
+/* specific methods */
+
+void THCTensor_(copyCPU)(THCState *state, THCTensor *self, struct THTensor *src)
+{
+  THArgCheck(THCTensor_(nElement)(state, self) == THTensor_(nElement)(src), 2, "sizes do not match");
+
+  {
+    THCTensor *selfc = THCTensor_(newContiguous)(state, self);
+    src = THTensor_(newContiguous)(src);
+
+    THCudaCheck(cudaMemcpy(THCTensor_(data)(state,selfc),
+                           THTensor_(data)(src),
+                           THTensor_(nElement)(src) * sizeof(real),
+                           cudaMemcpyHostToDevice));
+
+    THTensor_(free)(src);
+    THCTensor_(freeCopyTo)(state, selfc, self);
+  }
+}
+
+#define IMPLEMENT_TH_CUDA_TENSOR_COPY(TYPEC)                            \
+void THCTensor_(copy##TYPEC)(THCState *state, THCTensor *self, struct TH##TYPEC##Tensor *src)                \
+{                                                                       \
+  THArgCheck(THCTensor_(nElement)(state, self) == TH##TYPEC##Tensor_nElement(src), 2, "sizes do not match"); \
+  if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) {               \
+    THCTensor_(copyCPU)(state, self, (THTensor*) src);  /* cast just removes warnings */                     \
+  } else {                                                              \
+    THLongStorage *size = TH##TYPEC##Tensor_newSizeOf(src);             \
+    THTensor *srcf = THTensor_(newWithSize)(size, NULL);                \
+                                                                        \
+    THTensor_(copy##TYPEC)(srcf, src);                                  \
+    THCTensor_(copyCPU)(state, self, srcf);                             \
+                                                                        \
+    THLongStorage_free(size);                                           \
+    THTensor_(free)(srcf);                                              \
+  }                                                                     \
+}
+
+IMPLEMENT_TH_CUDA_TENSOR_COPY(Byte)
+IMPLEMENT_TH_CUDA_TENSOR_COPY(Char)
+IMPLEMENT_TH_CUDA_TENSOR_COPY(Short)
+IMPLEMENT_TH_CUDA_TENSOR_COPY(Int)
+IMPLEMENT_TH_CUDA_TENSOR_COPY(Long)
+IMPLEMENT_TH_CUDA_TENSOR_COPY(Float)
+IMPLEMENT_TH_CUDA_TENSOR_COPY(Double)
+IMPLEMENT_TH_CUDA_TENSOR_COPY(Half)
+
+/* copyCuda */
+
+void THTensor_(copyCuda)(THCState *state, THTensor *self, struct THCTensor *src)
+{
+  THArgCheck(THTensor_(nElement)(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match");
+
+  {
+    THTensor *selfc = THTensor_(newContiguous)(self);
+    src = THCTensor_(newContiguous)(state, src);
+
+    THCudaCheck(cudaMemcpy(THTensor_(data)(selfc),
+                           THCTensor_(data)(state, src),
+                           THCTensor_(nElement)(state, src) * sizeof(real),
+                           cudaMemcpyDeviceToHost));
+
+    THCTensor_(free)(state, src);
+    THTensor_(freeCopyTo)(selfc, self);
+  }
+}
+
+#define IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(TYPEC)                           \
+  void TH_CONCAT_4(TH,TYPEC,Tensor_copyCuda,Real)(THCState *state, TH##TYPEC##Tensor *self, struct THCTensor *src) \
+  {                                                                       \
+    THArgCheck(TH##TYPEC##Tensor_nElement(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match");       \
+    if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) {   \
+      THTensor_(copyCuda)(state, (THTensor*) self, src);  /* cast just removes compiler warning */                   \
+    } else {                                                              \
+      THLongStorage *size = THCTensor_(newSizeOf)(state, src);            \
+      THTensor *srcf = THTensor_(newWithSize)(size, NULL);                \
+                                                                          \
+      THTensor_(copyCuda)(state, srcf, src);                              \
+      TH_CONCAT_4(TH,TYPEC,Tensor_copy,Real)(self, srcf);                 \
+                                                                          \
+      THLongStorage_free(size);                                           \
+      THTensor_(free)(srcf);                                              \
+    }                                                                     \
+  }
+
+IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Byte)
+IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Char)
+IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Short)
+IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Int)
+IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Long)
+IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Float)
+IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Double)
+IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Half)
+
+void THCTensor_(copyCuda)(THCState *state, THCTensor *self, THCTensor *src)
+{
+  THCTensor_(copy)(state, self, src);
+}
+
+void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, struct THTensor *src)
+{
+  THArgCheck(THCTensor_(nElement)(state, self) == THTensor_(nElement)(src), 2, "sizes do not match");
+  THArgCheck(THCTensor_(isContiguous)(state, self), 2, "Target tensor must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(src), 3, "Source tensor must be contiguous");
+
+  if (THCTensor_(nElement)(state, self) == 0) return;
+
+  // Perform the copy wrt the current stream on the CudaTensor's device.
+  int tensorDevice = THCTensor_(getDevice)(state, self);
+  int currentDevice;
+  THCudaCheck(cudaGetDevice(&currentDevice));
+
+  if (currentDevice != tensorDevice) {
+    THCudaCheck(cudaSetDevice(tensorDevice));
+  }
+
+  cudaStream_t stream = THCState_getCurrentStream(state);
+  THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, self),
+                              THTensor_(data)(src),
+                              THTensor_(nElement)(src) * sizeof(real),
+                              cudaMemcpyHostToDevice,
+                              stream));
+
+  THCudaCheck(THCCachingHostAllocator_recordEvent(src->storage->data, stream));
+
+  if (currentDevice != tensorDevice) {
+    THCudaCheck(cudaSetDevice(currentDevice));
+  }
+}
+
+void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, struct THCTensor *src)
+{
+  THArgCheck(THTensor_(nElement)(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match");
+  THArgCheck(THTensor_(isContiguous)(self), 2, "Target tensor must be contiguous");
+  THArgCheck(THCTensor_(isContiguous)(state, src), 3, "Source tensor must be contiguous");
+
+  if (THTensor_(nElement)(self) == 0) return;
+
+  // Perform the copy wrt the current stream on the CudaTensor's device.
+  int tensorDevice = THCTensor_(getDevice)(state, src);
+  int currentDevice;
+  THCudaCheck(cudaGetDevice(&currentDevice));
+
+  if (currentDevice != tensorDevice) {
+    THCudaCheck(cudaSetDevice(tensorDevice));
+  }
+
+  cudaStream_t stream = THCState_getCurrentStream(state);
+  THCudaCheck(cudaMemcpyAsync(THTensor_(data)(self),
+                              THCTensor_(data)(state, src),
+                              THCTensor_(nElement)(state, src) * sizeof(real),
+                              cudaMemcpyDeviceToHost,
+                              stream));
+
+  THCudaCheck(THCCachingHostAllocator_recordEvent(self->storage->data, stream));
+
+  if (currentDevice != tensorDevice) {
+    THCudaCheck(cudaSetDevice(currentDevice));
+  }
+}
+
+#undef IMPLEMENT_TH_CUDA_TENSOR_COPY
+#undef IMPLEMENT_TH_CUDA_TENSOR_COPY_TO
+
+#endif
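
Both async copies above check contiguity, switch to the tensor's device so the transfer lands on that device's current stream, and record an event against the host block so the caching host allocator will not hand that memory out again before the copy finishes. Note that cudaMemcpyAsync only overlaps with other work when the host side is page-locked; with pageable memory it behaves synchronously with respect to the host. A minimal sketch of queuing a host-to-device copy and waiting for it, under the same naming assumptions:

    #include "THC.h"

    /* Sketch: queue cpu -> gpu on the current stream, then block until done.
       `gpu` and `cpu` are assumed contiguous and of matching element count. */
    static void upload_blocking(THCState *state, THCudaTensor *gpu, THFloatTensor *cpu)
    {
      THCudaTensor_copyAsyncCPU(state, gpu, cpu);
      THCudaCheck(cudaStreamSynchronize(THCState_getCurrentStream(state)));
    }
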
diff --git a/lib/THC/generic/THCTensorCopy.cu b/lib/THC/generic/THCTensorCopy.cu
new file mode 100644
index 0000000..4198025
--- /dev/null
+++ b/lib/THC/generic/THCTensorCopy.cu
@@ -0,0 +1,47 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorCopy.cu"
+#else
+
+THC_API void
+THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) {
+  THC_copyTensor<THCTensor, THCTensor>(state, dst, src);
+}
+
+THC_API void
+THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) {
+  // Called when we are copying into an overlapping index `dst`, but
+  // we don't care which writer wins. Hacky but it works.
+  // This is itself invoked by pointwiseApply2 / THCTensor_copy in
+  // case that there are write overlaps.
+  // FIXME: really, overlapping writes should be illegal/an error in Torch
+  THC_pointwiseApply2(
+    state, dst, src,
+    CopyOp<typename TensorUtils<THCTensor>::DataType,
+           typename TensorUtils<THCTensor>::DataType>(),
+    ReadOnly, /* ignore overwrites */
+    ReadOnly);
+}
+
+#define IMPLEMENT_THC_CUDA_TENSOR_COPY(TYPEC, TYPECUDA)                 \
+  THC_API void                                                          \
+  THCTensor_(copyCuda##TYPEC)(THCState *state,                          \
+                              THCTensor *self,                          \
+                              THCuda##TYPECUDA##Tensor *src) {          \
+    THC_copyTensor<THCTensor, THCuda##TYPECUDA##Tensor>(state, self, src); \
+  }
+
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Byte, Byte)
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Char, Char)
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Short, Short)
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Int, Int)
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Long, Long)
+// For Float the CUDA tensor type is plain THCudaTensor (there is no THCudaFloatTensor),
+// hence the empty TYPECUDA argument.
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Float, )
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Double, Double)
+#ifdef CUDA_HALF_TENSOR
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half)
+#endif
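+// For the float instantiation of this file, for example, the calls above expand to
+// functions such as THCudaTensor_copyCudaByte(state, self, src), i.e. element-wise
+// converting copies between CUDA tensors of different element types.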
+
+#undef IMPLEMENT_THC_CUDA_TENSOR_COPY
+
+#endif
diff --git a/lib/THC/generic/THCTensorCopy.h b/lib/THC/generic/THCTensorCopy.h
new file mode 100644
index 0000000..e549f09
--- /dev/null
+++ b/lib/THC/generic/THCTensorCopy.h
@@ -0,0 +1,43 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorCopy.h"
+#else
+
+THC_API void THCTensor_(copy)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(copyIgnoringOverlaps)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(copyByte)(THCState *state, THCTensor *self, THByteTensor *src);
+THC_API void THCTensor_(copyChar)(THCState *state, THCTensor *self, THCharTensor *src);
+THC_API void THCTensor_(copyShort)(THCState *state, THCTensor *self, THShortTensor *src);
+THC_API void THCTensor_(copyInt)(THCState *state, THCTensor *self, THIntTensor *src);
+THC_API void THCTensor_(copyLong)(THCState *state, THCTensor *self, THLongTensor *src);
+THC_API void THCTensor_(copyFloat)(THCState *state, THCTensor *self, THFloatTensor *src);
+THC_API void THCTensor_(copyDouble)(THCState *state, THCTensor *self, THDoubleTensor *src);
+THC_API void THCTensor_(copyHalf)(THCState *state, THCTensor *self, struct THHalfTensor *src);
+
+THC_API void THCTensor_(copyCudaByte)(THCState *state, THCTensor *dst, struct THCudaByteTensor *src);
+THC_API void THCTensor_(copyCudaChar)(THCState *state, THCTensor *dst, struct THCudaCharTensor *src);
+THC_API void THCTensor_(copyCudaShort)(THCState *state, THCTensor *dst, struct THCudaShortTensor *src);
+THC_API void THCTensor_(copyCudaInt)(THCState *state, THCTensor *dst, struct THCudaIntTensor *src);
+THC_API void THCTensor_(copyCudaLong)(THCState *state, THCTensor *dst, struct THCudaLongTensor *src);
+THC_API void THCTensor_(copyCudaFloat)(THCState *state, THCTensor *dst, struct THCudaTensor *src);
+THC_API void THCTensor_(copyCudaDouble)(THCState *state, THCTensor *dst, struct THCudaDoubleTensor *src);
+#ifdef CUDA_HALF_TENSOR
+THC_API void THCTensor_(copyCudaHalf)(THCState *state, THCTensor *dst, struct THCudaHalfTensor *src);
+#endif
+
+THC_API void TH_CONCAT_2(THByteTensor_copyCuda  , Real)  (THCState *state, THByteTensor *self, THCTensor *src);
+THC_API void TH_CONCAT_2(THCharTensor_copyCuda  , Real)  (THCState *state, THCharTensor *self, THCTensor *src);
+THC_API void TH_CONCAT_2(THShortTensor_copyCuda , Real)  (THCState *state, THShortTensor *self, THCTensor *src);
+THC_API void TH_CONCAT_2(THIntTensor_copyCuda   , Real)  (THCState *state, THIntTensor *self, THCTensor *src);
+THC_API void TH_CONCAT_2(THLongTensor_copyCuda  , Real)  (THCState *state, THLongTensor *self, THCTensor *src);
+THC_API void TH_CONCAT_2(THFloatTensor_copyCuda , Real)  (THCState *state, THFloatTensor *self, THCTensor *src);
+THC_API void TH_CONCAT_2(THDoubleTensor_copyCuda, Real)  (THCState *state, THDoubleTensor *self, THCTensor *src);
+THC_API void TH_CONCAT_2(THHalfTensor_copyCuda, Real)    (THCState *state, THHalfTensor *self, THCTensor *src);
+THC_API void THCTensor_(copyCuda) (THCState *state, THCTensor *self, THCTensor *src);
+
+THC_API void THTensor_(copyCuda) (THCState *state, THTensor *self, THCTensor *src);
+THC_API void THCTensor_(copyCPU) (THCState *state, THCTensor *self, THTensor *src);
+
+THC_API void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, THTensor *src);
+THC_API void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, THCTensor *src);
+
+#endif
diff --git a/lib/THC/generic/THCTensorIndex.cu b/lib/THC/generic/THCTensorIndex.cu
new file mode 100644
index 0000000..ce4c790
--- /dev/null
+++ b/lib/THC/generic/THCTensorIndex.cu
@@ -0,0 +1,507 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorIndex.cu"
+#else
+
+void THCTensor_(indexCopy_long)(THCState *state, THCTensor *dst, int dim, THLongTensor *indices, THCTensor *src)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
+
+  THCudaLongTensor *indices_ = THCudaLongTensor_newWithSize1d(state, indices->size[0]);
+  THCudaLongTensor_copyLong(state, indices_, indices);
+
+  THCTensor_(indexCopy)(state, dst, dim, indices_, src);
+
+  THCudaLongTensor_free(state, indices_);
+}
+
+void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, THCTensor *src)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
+  THAssert(THCudaLongTensor_checkGPU(state, 1, indices));
+
+  long dims = THCTensor_(nDimension)(state, dst);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
+  dims = THCTensor_(nDimension)(state, src);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 5, CUTORCH_DIM_WARNING);
+  dims = THCudaLongTensor_nDimension(state, indices);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING);
+
+  ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices);
+
+  long srcDims = THCTensor_(nDimension)(state, src);
+  cudaStream_t stream = THCState_getCurrentStream(state);
+
+  THArgCheck(THCudaLongTensor_nDimension(state, indices) == 1, 3,
+             "expecting vector of indices");
+  THArgCheck(dim < srcDims, 4, "Indexing dim is out of bounds");
+  THArgCheck(srcDims > 0, 2, "Source tensor is empty");
+  THArgCheck(numIndices == src->size[dim], 4, "src.size[dim] must equal the number of indices");
+
+  int indContig = THCudaLongTensor_isContiguous(state, indices);
+
+  // The `src` is partitioned into two parts:
+  // -the size of each slice we are indexing, which is the
+  // total size of the tensor ignoring dimension `dim`;
+  // -the number of indices we are choosing, which is the total size
+  // of the tensor `indices`.
+  ptrdiff_t srcTotalSize = THCTensor_(nElement)(state, src);
+  long dstCopyDimSize = THCTensor_(size)(state, dst, dim);
+  ptrdiff_t sliceSize = srcTotalSize / numIndices;
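+  // For example, copying a 4x5 `src` into `dst` along dim 0 with 4 indices gives
+  // sliceSize = 20 / 4 = 5, i.e. one 5-element row per selected index.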
+
+  int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount;
+
+#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
+  indexCopySmallIndex<TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM>       \
+    <<<smallIndexGrid, smallIndexBlock, 0, stream>>>(           \
+      dstInfo, srcInfo, indicesInfo,                            \
+      dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize);
+
+#define LARGE_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
+  indexCopyLargeIndex<TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM>       \
+    <<<largeIndexGrid, largeIndexBlock, 0, stream>>>(           \
+      dstInfo, srcInfo, indicesInfo,                            \
+      dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize);
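+  // The DST_DIM/SRC_DIM/IDX_DIM template arguments specialize the kernels for a fixed
+  // number of collapsed dimensions: 1-3 use unrolled offset math, -1 computes offsets
+  // from all dims at runtime, and -2 indexes a contiguous tensor linearly. Roughly, the
+  // small-index kernel loads each index once and uses all threads to copy that slice
+  // (good for few indices), while the large-index kernel spreads threads over every
+  // element being copied.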
+
+  dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128));
+
+  dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128));
+
+  if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, dst) &&
+      TensorUtils<THCTensor>::canUse32BitIndexMath(state, src) &&
+      TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, indices)) {
+    TensorInfo<real, unsigned int> dstInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, dst);
+    int dstCopyDim = dstInfo.collapseDims(dim);
+    dstInfo.reduceDim(dstCopyDim);
+
+    TensorInfo<real, unsigned int> srcInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, src);
+    int srcCopyDim = srcInfo.collapseDims(dim);
+    srcInfo.reduceDim(srcCopyDim);
+
+    TensorInfo<long, unsigned int> indicesInfo =
+      getTensorInfo<THCudaLongTensor, unsigned int>(state, indices);
+    indicesInfo.collapseDims();
+
+    // A reasonable choice for when to have each thread iterate over
+    // indices to choose
+    if (numIndices <= 16) {
+      if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) {
+        SMALL_INDEX(real, unsigned int, 1, 1, -2);
+      } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) {
+        SMALL_INDEX(real, unsigned int, 2, 2, -2);
+      } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) {
+        SMALL_INDEX(real, unsigned int, 3, 3, -2);
+      } else {
+        SMALL_INDEX(real, unsigned int, -1, -1, -1);
+      }
+    } else {
+      if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) {
+        LARGE_INDEX(real, unsigned int, 1, 1, -2);
+      } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) {
+        LARGE_INDEX(real, unsigned int, 2, 2, -2);
+      } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) {
+        LARGE_INDEX(real, unsigned int, 3, 3, -2);
+      } else {
+        LARGE_INDEX(real, unsigned int, -1, -1, -1);
+      }
+    }
+  } else {
+    TensorInfo<real, unsigned long> dstInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, dst);
+    int dstCopyDim = dstInfo.collapseDims(dim);
+    dstInfo.reduceDim(dstCopyDim);
+
+    TensorInfo<real, unsigned long> srcInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, src);
+    int srcCopyDim = srcInfo.collapseDims(dim);
+    srcInfo.reduceDim(srcCopyDim);
+
+    TensorInfo<long, unsigned long> indicesInfo =
+      getTensorInfo<THCudaLongTensor, unsigned long>(state, indices);
+    indicesInfo.collapseDims();
+
+    LARGE_INDEX(real, unsigned long, -1, -1, -1);
+  }
+
+#undef SMALL_INDEX
+#undef LARGE_INDEX
+}
+
+void THCTensor_(indexAdd_long)(THCState *state, THCTensor *dst, int dim, THLongTensor *indices, THCTensor *src)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
+
+  THCudaLongTensor *indices_ = THCudaLongTensor_newWithSize1d(state, indices->size[0]);
+  THCudaLongTensor_copyLong(state, indices_, indices);
+
+  THCTensor_(indexAdd)(state, dst, dim, indices_, src);
+
+  THCudaLongTensor_free(state, indices_);
+}
+
+void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, THCTensor *src)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
+  THAssert(THCudaLongTensor_checkGPU(state, 1, indices));
+
+  long dims = THCTensor_(nDimension)(state, dst);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
+  dims = THCTensor_(nDimension)(state, src);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 5, CUTORCH_DIM_WARNING);
+  dims = THCudaLongTensor_nDimension(state, indices);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING);
+
+  ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices);
+
+  long srcDims = THCTensor_(nDimension)(state, src);
+  cudaStream_t stream = THCState_getCurrentStream(state);
+
+  THArgCheck(THCudaLongTensor_nDimension(state, indices) == 1, 3,
+             "expecting vector of indices");
+  THArgCheck(dim < srcDims, 4, "Indexing dim is out of bounds");
+  THArgCheck(srcDims > 0, 2, "Source tensor is empty");
+  THArgCheck(numIndices == src->size[dim], 4, "src.size[dim] must equal the number of indices");
+
+  int indContig = THCudaLongTensor_isContiguous(state, indices);
+
+  // The `src` is partitioned into two parts:
+  // -the size of each slice we are indexing, which is the
+  // total size of the tensor ignoring dimension `dim`;
+  // -the number of indices we are choosing, which is the total size
+  // of the tensor `indices`.
+  ptrdiff_t srcTotalSize = THCTensor_(nElement)(state, src);
+  long dstAddDimSize = THCTensor_(size)(state, dst, dim);
+  ptrdiff_t sliceSize = srcTotalSize / numIndices;
+
+  int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount;
+
+#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
+  indexAddSmallIndex<TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM> \
+    <<<smallIndexGrid, smallIndexBlock, 0, stream>>>(   \
+      dstInfo, srcInfo, indicesInfo,                    \
+      dstAddDim, srcAddDim, sliceSize, dstAddDimSize);
+
+#define LARGE_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
+  indexAddLargeIndex<TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM> \
+    <<<largeIndexGrid, largeIndexBlock, 0, stream>>>(   \
+      dstInfo, srcInfo, indicesInfo,                    \
+      dstAddDim, srcAddDim, sliceSize, dstAddDimSize);
+
+  dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128));
+
+  dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128));
+
+  if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, dst) &&
+      TensorUtils<THCTensor>::canUse32BitIndexMath(state, src) &&
+      TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, indices)) {
+    TensorInfo<real, unsigned int> dstInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, dst);
+    int dstAddDim = dstInfo.collapseDims(dim);
+    dstInfo.reduceDim(dstAddDim);
+
+    TensorInfo<real, unsigned int> srcInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, src);
+    int srcAddDim = srcInfo.collapseDims(dim);
+    srcInfo.reduceDim(srcAddDim);
+
+    TensorInfo<long, unsigned int> indicesInfo =
+      getTensorInfo<THCudaLongTensor, unsigned int>(state, indices);
+    indicesInfo.collapseDims();
+
+    // A reasonable choice for when to have each thread iterate over
+    // indices to choose
+    if (numIndices <= 16) {
+      if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) {
+        SMALL_INDEX(real, unsigned int, 1, 1, -2);
+      } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) {
+        SMALL_INDEX(real, unsigned int, 2, 2, -2);
+      } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) {
+        SMALL_INDEX(real, unsigned int, 3, 3, -2);
+      } else {
+        SMALL_INDEX(real, unsigned int, -1, -1, -1);
+      }
+    } else {
+      if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) {
+        LARGE_INDEX(real, unsigned int, 1, 1, -2);
+      } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) {
+        LARGE_INDEX(real, unsigned int, 2, 2, -2);
+      } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) {
+        LARGE_INDEX(real, unsigned int, 3, 3, -2);
+      } else {
+        LARGE_INDEX(real, unsigned int, -1, -1, -1);
+      }
+    }
+  } else {
+    TensorInfo<real, unsigned long> dstInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, dst);
+    int dstAddDim = dstInfo.collapseDims(dim);
+    dstInfo.reduceDim(dstAddDim);
+
+    TensorInfo<real, unsigned long> srcInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, src);
+    int srcAddDim = srcInfo.collapseDims(dim);
+    srcInfo.reduceDim(srcAddDim);
+
+    TensorInfo<long, unsigned long> indicesInfo =
+      getTensorInfo<THCudaLongTensor, unsigned long>(state, indices);
+    indicesInfo.collapseDims();
+
+    LARGE_INDEX(real, unsigned long, -1, -1, -1);
+  }
+
+#undef SMALL_INDEX
+#undef LARGE_INDEX
+}
+
+void THCTensor_(indexFill_long)(THCState *state, THCTensor *dst, int dim, THLongTensor *indices, real val)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, dst));
+
+  THCudaLongTensor *indices_ = THCudaLongTensor_newWithSize1d(state, indices->size[0]);
+  THCudaLongTensor_copyLong(state, indices_, indices);
+
+  THCTensor_(indexFill)(state, dst, dim, indices_, val);
+
+  THCudaLongTensor_free(state, indices_);
+}
+
+void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongTensor *indices, real val)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, dst));
+  THAssert(THCudaLongTensor_checkGPU(state, 1, indices));
+  long dims = THCTensor_(nDimension)(state, dst);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
+  dims = THCudaLongTensor_nDimension(state, indices);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING);
+
+  ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices);
+
+  long srcDims = THCTensor_(nDimension)(state, dst);
+  cudaStream_t stream = THCState_getCurrentStream(state);
+
+  THArgCheck(THCudaLongTensor_nDimension(state, indices) == 1, 3,
+             "expecting vector of indices");
+  THArgCheck(dim < srcDims, 4, "Indexing dim is out of bounds");
+  THArgCheck(srcDims > 0, 2, "Source tensor is empty");
+
+  int indContig = THCudaLongTensor_isContiguous(state, indices);
+
+  // The `dst` is partitioned into two parts:
+  // - the size of each slice we fill, which is the
+  // total size of the tensor ignoring dimension `dim`;
+  // - the size of dimension `dim` itself, i.e. the number
+  // of slices we can choose from.
+  ptrdiff_t dstTotalSize = THCTensor_(nElement)(state, dst);
+  long dstFillDimSize = THCTensor_(size)(state, dst, dim);
+  ptrdiff_t sliceSize = dstTotalSize / dstFillDimSize;
+
+  int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount;
+
+#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, IDX_DIM)  \
+  indexFillSmallIndex<TENSOR_TYPE, TYPE, DST_DIM, IDX_DIM> \
+    <<<smallIndexGrid, smallIndexBlock, 0, stream>>>(   \
+      dstInfo, indicesInfo,                             \
+      dstFillDim, sliceSize, dstFillDimSize, val);
+
+#define LARGE_INDEX(TENSOR_TYPE, TYPE, DST_DIM, IDX_DIM)  \
+  indexFillLargeIndex<TENSOR_TYPE, TYPE, DST_DIM, IDX_DIM> \
+    <<<largeIndexGrid, largeIndexBlock, 0, stream>>>(   \
+      dstInfo, indicesInfo,                             \
+      dstFillDim, sliceSize, dstFillDimSize, val);
+
+  dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128));
+
+  dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128));
+
+  if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, dst) &&
+      TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, indices)) {
+    TensorInfo<real, unsigned int> dstInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, dst);
+    int dstFillDim = dstInfo.collapseDims(dim);
+    dstInfo.reduceDim(dstFillDim);
+
+    TensorInfo<long, unsigned int> indicesInfo =
+      getTensorInfo<THCudaLongTensor, unsigned int>(state, indices);
+    indicesInfo.collapseDims();
+
+    // A reasonable choice for when to have each thread iterate over
+    // indices to choose
+    if (numIndices <= 16) {
+      if (dstInfo.dims == 1 && indContig) {
+        SMALL_INDEX(real, unsigned int, 1, -2);
+      } else if (dstInfo.dims == 2 && indContig) {
+        SMALL_INDEX(real, unsigned int, 2, -2);
+      } else if (dstInfo.dims == 3 && indContig) {
+        SMALL_INDEX(real, unsigned int, 3, -2);
+      } else {
+        SMALL_INDEX(real, unsigned int, -1, -1);
+      }
+    } else {
+      if (dstInfo.dims == 1 && indContig) {
+        LARGE_INDEX(real, unsigned int, 1, -2);
+      } else if (dstInfo.dims == 2 && indContig) {
+        LARGE_INDEX(real, unsigned int, 2, -2);
+      } else if (dstInfo.dims == 3 && indContig) {
+        LARGE_INDEX(real, unsigned int, 3, -2);
+      } else {
+        LARGE_INDEX(real, unsigned int, -1, -1);
+      }
+    }
+  } else {
+    TensorInfo<real, unsigned long> dstInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, dst);
+    int dstFillDim = dstInfo.collapseDims(dim);
+    dstInfo.reduceDim(dstFillDim);
+
+    TensorInfo<long, unsigned long> indicesInfo =
+      getTensorInfo<THCudaLongTensor, unsigned long>(state, indices);
+    indicesInfo.collapseDims();
+
+    LARGE_INDEX(real, unsigned long, -1, -1);
+  }
+
+#undef SMALL_INDEX
+#undef LARGE_INDEX
+}
+
+
+void THCTensor_(indexSelect_long)(THCState *state, THCTensor *dst, THCTensor *src, int dim, THLongTensor *indices)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, dst, src));
+  THArgCheck(indices->nDimension == 1, 3, "Index is supposed to be a vector");
+
+  THCudaLongTensor *indices_ = THCudaLongTensor_newWithSize1d(state, indices->size[0]);
+  THCudaLongTensor_copyLong(state, indices_, indices);
+
+  THCTensor_(indexSelect)(state, dst, src, dim, indices_);
+
+  THCudaLongTensor_free(state, indices_);
+}
+
+void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, int dim, THCudaLongTensor *indices)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, dst, src, indices));
+
+  long dims = THCTensor_(nDimension)(state, dst);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
+  dims = THCTensor_(nDimension)(state, src);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING);
+  dims = THCudaLongTensor_nDimension(state, indices);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 5, CUTORCH_DIM_WARNING);
+
+  ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices);
+
+  long srcDims = THCTensor_(nDimension)(state, src);
+  cudaStream_t stream = THCState_getCurrentStream(state);
+
+  THArgCheck(THCudaLongTensor_nDimension(state, indices) == 1, 3,
+             "expecting vector of indices");
+  THArgCheck(dim < srcDims, 4, "Indexing dim is out of bounds");
+  THArgCheck(srcDims > 0, 2, "Source tensor is empty");
+
+  THLongStorage *newSize = THCTensor_(newSizeOf)(state, src);
+  THLongStorage_set(newSize, dim, numIndices);
+  THCTensor_(resize)(state, dst, newSize, NULL);
+  THLongStorage_free(newSize);
+
+  int indContig = THCudaLongTensor_isContiguous(state, indices);
+
+  // The `src` is partitioned into two parts:
+  // -the size of each slice we are indexing, which is the
+  // total size of the tensor ignoring dimension `dim`;
+  // -the number of indices we are choosing, which is the total size
+  // of the tensor `indices`.
+  ptrdiff_t dstTotalSize = THCTensor_(nElement)(state, dst);
+  long srcSelectDimSize = THCTensor_(size)(state, src, dim);
+  ptrdiff_t sliceSize = dstTotalSize / numIndices;
+
+  int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount;
+
+#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
+  indexSelectSmallIndex<TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM>     \
+    <<<smallIndexGrid, smallIndexBlock, 0, stream>>>(           \
+      dstInfo, srcInfo, indicesInfo,                            \
+      dstSelectDim, srcSelectDim, sliceSize, srcSelectDimSize);
+
+#define LARGE_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM)         \
+  indexSelectLargeIndex<TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM>     \
+    <<<largeIndexGrid, largeIndexBlock, 0, stream>>>(                   \
+      dstInfo, srcInfo, indicesInfo,                                    \
+      dstSelectDim, srcSelectDim, dstTotalSize, sliceSize, srcSelectDimSize);
+
+  dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128));
+
+  dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128));
+
+  if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, dst) &&
+      TensorUtils<THCTensor>::canUse32BitIndexMath(state, src) &&
+      TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, indices)) {
+    TensorInfo<real, unsigned int> dstInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, dst);
+    int dstSelectDim = dstInfo.collapseDims(dim);
+    dstInfo.reduceDim(dstSelectDim);
+
+    TensorInfo<real, unsigned int> srcInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, src);
+    int srcSelectDim = srcInfo.collapseDims(dim);
+    srcInfo.reduceDim(srcSelectDim);
+
+    TensorInfo<long, unsigned int> indicesInfo =
+      getTensorInfo<THCudaLongTensor, unsigned int>(state, indices);
+    indicesInfo.collapseDims();
+
+    // A reasonable choice for when to have each thread iterate over
+    // indices to choose
+    if (numIndices <= 16) {
+      if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) {
+        SMALL_INDEX(real, unsigned int, 1, 1, -2);
+      } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) {
+        SMALL_INDEX(real, unsigned int, 2, 2, -2);
+      } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) {
+        SMALL_INDEX(real, unsigned int, 3, 3, -2);
+      } else {
+        SMALL_INDEX(real, unsigned int, -1, -1, -1);
+      }
+    } else {
+      if (dstInfo.dims == 1 && srcInfo.dims == 1 && indContig) {
+        LARGE_INDEX(real, unsigned int, 1, 1, -2);
+      } else if (dstInfo.dims == 2 && srcInfo.dims == 2 && indContig) {
+        LARGE_INDEX(real, unsigned int, 2, 2, -2);
+      } else if (dstInfo.dims == 3 && srcInfo.dims == 3 && indContig) {
+        LARGE_INDEX(real, unsigned int, 3, 3, -2);
+      } else {
+        LARGE_INDEX(real, unsigned int, -1, -1, -1);
+      }
+    }
+  } else {
+    TensorInfo<real, unsigned long> dstInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, dst);
+    int dstSelectDim = dstInfo.collapseDims(dim);
+    dstInfo.reduceDim(dstSelectDim);
+
+    TensorInfo<real, unsigned long> srcInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, src);
+    int srcSelectDim = srcInfo.collapseDims(dim);
+    srcInfo.reduceDim(srcSelectDim);
+
+    TensorInfo<long, unsigned long> indicesInfo =
+      getTensorInfo<THCudaLongTensor, unsigned long>(state, indices);
+    indicesInfo.collapseDims();
+
+    LARGE_INDEX(real, unsigned long, -1, -1, -1);
+  }
+
+#undef SMALL_INDEX
+#undef LARGE_INDEX
+}
+
+#endif
diff --git a/lib/THC/generic/THCTensorIndex.h b/lib/THC/generic/THCTensorIndex.h
new file mode 100644
index 0000000..cf8a33a
--- /dev/null
+++ b/lib/THC/generic/THCTensorIndex.h
@@ -0,0 +1,15 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorIndex.h"
+#else
+
+THC_API void THCTensor_(indexCopy)(THCState *state, THCTensor *res_, int dim, THCudaLongTensor *indices, THCTensor *src);
+THC_API void THCTensor_(indexAdd)(THCState *state, THCTensor *res_, int dim, THCudaLongTensor *indices, THCTensor *src);
+THC_API void THCTensor_(indexFill)(THCState *state, THCTensor *tensor, int dim, THCudaLongTensor *index, real val);
+THC_API void THCTensor_(indexSelect)(THCState *state, THCTensor *tensor, THCTensor *src, int dim, THCudaLongTensor *index);
+
+THC_API void THCTensor_(indexCopy_long)(THCState *state, THCTensor *res_, int dim, THLongTensor *indices, THCTensor *src);
+THC_API void THCTensor_(indexAdd_long)(THCState *state, THCTensor *res_, int dim, THLongTensor *indices, THCTensor *src);
+THC_API void THCTensor_(indexFill_long)(THCState *state, THCTensor *tensor, int dim, THLongTensor *index, real val);
+THC_API void THCTensor_(indexSelect_long)(THCState *state, THCTensor *tensor, THCTensor *src, int dim, THLongTensor *index);
+
+#endif
diff --git a/lib/THC/generic/THCTensorMasked.cu b/lib/THC/generic/THCTensorMasked.cu
new file mode 100644
index 0000000..05d9360
--- /dev/null
+++ b/lib/THC/generic/THCTensorMasked.cu
@@ -0,0 +1,193 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMasked.cu"
+#else
+
+
+THC_API void
+THCTensor_(maskedFill)(THCState* state,
+                       THCTensor *tensor, THCudaByteTensor *mask, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, tensor, mask));
+  THArgCheck(THCTensor_(nElement)(state, tensor) ==
+             THCudaByteTensor_nElement(state, mask),
+             2, "sizes do not match");
+
+  if (!THC_pointwiseApply2(state, tensor, mask,
+                           TensorMaskedFillOp<real, unsigned char>(value))) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(maskedFillByte)(THCState* state,
+                           THCTensor *tensor, THByteTensor *mask, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, tensor));
+  THLongStorage* maskSizes = THByteTensor_newSizeOf(mask);
+  THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL);
+  THLongStorage_free(maskSizes);
+  THCudaByteTensor_copyByte(state, maskCuda, mask);
+  THCTensor_(maskedFill)(state, tensor, maskCuda, value);
+  THCudaByteTensor_free(state, maskCuda);
+}
+
+THC_API void
+THCTensor_(maskedCopy)(THCState* state,
+                       THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
+  ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask);
+  ptrdiff_t tensorSize = THCTensor_(nElement)(state, tensor);
+  ptrdiff_t srcSize = THCTensor_(nElement)(state, src);
+
+  // `mask` and `tensor` must have the same number of elements
+  THArgCheck(maskSize == tensorSize, 2,
+             "mask and tensor must have the same number of elements");
+
+  // Determine our output size
+  ptrdiff_t totalElements = THCudaByteTensor_sumall(state, mask);
+
+  // The number of `1` elements present in the mask must be <= the
+  // number of elements available in `src`
+  if (totalElements > srcSize) {
+    THArgCheck(false, 2, "source nElements must be == mask `1` elements");
+  }
+
+  // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed
+  // iterator prefix sums? Convert `mask` to the same datatype as what
+  // we're accumulating the prefix sum in (long) to get around it
+  THCudaLongTensor* maskLong = THCudaLongTensor_new(state);
+  THLongStorage* maskSizes = THCudaByteTensor_newSizeOf(state, mask);
+  THCudaLongTensor_resize(state, maskLong, maskSizes, NULL);
+  THCudaLongTensor_copyCudaByte(state, maskLong, mask);
+
+  // Use a prefix sum to determine the output locations of the masked elements
+  THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state);
+  THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, NULL);
+  THLongStorage_free(maskSizes);
+
+  THCThrustAllocator thrustAlloc(state);
+  thrust::device_ptr<long>
+    maskData(THCudaLongTensor_data(state, maskLong));
+  thrust::device_ptr<long>
+    maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum));
+
+  thrust::exclusive_scan(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#endif
+    maskData,
+    maskData + THCudaLongTensor_nElement(state, maskLong),
+    maskPrefixSumData);
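+  // The exclusive scan turns the mask into per-element output offsets into `src`:
+  // e.g. mask 0 1 1 0 1 -> prefix sums 0 0 1 2 2, so the three selected positions
+  // pull from src[0], src[1] and src[2] respectively.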
+
+  // We are getting elements from `src` based on an offset from
+  // `maskPrefixSum`, so `src` should be made contiguous too
+  THCTensor* contigSrc = THCTensor_(newContiguous)(state, src);
+
+  // update `tensor` where `mask` == 1 but pull from `src` at
+  // maskPrefixSum
+  bool status = THC_pointwiseApply3(
+    state, tensor, mask, maskPrefixSum,
+    TensorMaskedCopyOp<real, unsigned char, long>(
+      THCTensor_(data)(state, contigSrc)));
+
+  THCTensor_(free)(state, contigSrc);
+  THCudaLongTensor_free(state, maskLong);
+  THCudaLongTensor_free(state, maskPrefixSum);
+
+  THArgCheck(status, 2, CUTORCH_DIM_WARNING);
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(maskedCopyByte)(THCState* state,
+                           THCTensor *tensor, THByteTensor *mask, THCTensor *src) {
+  THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
+  THLongStorage* maskSizes = THByteTensor_newSizeOf(mask);
+  THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL);
+  THLongStorage_free(maskSizes);
+  THCudaByteTensor_copyByte(state, maskCuda, mask);
+  THCTensor_(maskedCopy)(state, tensor, maskCuda, src);
+  THCudaByteTensor_free(state, maskCuda);
+}
+
+THC_API void
+THCTensor_(maskedSelect)(THCState* state,
+                         THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) {
+  THAssert(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
+  THArgCheck(THCudaByteTensor_nElement(state, mask) ==
+             THCTensor_(nElement)(state, src),
+             2, "sizes do not match");
+
+  // Determine our output size
+  ptrdiff_t totalElements = THCudaByteTensor_sumall(state, mask);
+  THCTensor* tensorContig = THCTensor_(newContiguous)(state, tensor);
+
+  THCTensor_(resize1d)(state, tensorContig, totalElements);
+  if (tensor != tensorContig) {
+    THCTensor_(resize1d)(state, tensor, totalElements);
+  }
+
+  // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed
+  // iterator prefix sums? Convert `mask` to the same datatype as what
+  // we're accumulating the prefix sum in (long) to get around it
+  THCudaLongTensor* maskLong = THCudaLongTensor_new(state);
+  THLongStorage* maskSizes = THCudaByteTensor_newSizeOf(state, mask);
+  THCudaLongTensor_resize(state, maskLong, maskSizes, NULL);
+  THCudaLongTensor_copyCudaByte(state, maskLong, mask);
+
+  // Use a prefix sum to determine the output locations of the masked elements
+  THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state);
+  THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, NULL);
+  THLongStorage_free(maskSizes);
+
+  THCThrustAllocator thrustAlloc(state);
+  thrust::device_ptr<long>
+    maskData(THCudaLongTensor_data(state, maskLong));
+  thrust::device_ptr<long>
+    maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum));
+
+  thrust::exclusive_scan(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#endif
+    maskData,
+    maskData + THCudaLongTensor_nElement(state, maskLong),
+    maskPrefixSumData);
+
+  // Then copy over the masked elements at their desired output index
+  bool status = THC_pointwiseApply3(
+    state, mask, maskPrefixSum,
+    src, TensorMaskedSelectOp<real, unsigned char, long>(
+      THCTensor_(data)(state, tensor)));
+
+  THCudaLongTensor_free(state, maskLong);
+  THCudaLongTensor_free(state, maskPrefixSum);
+
+  if (tensor != tensorContig) {
+    THCTensor_(freeCopyTo)(state, tensorContig, tensor);
+  } else {
+    THCTensor_(free)(state, tensorContig);
+  }
+
+  THArgCheck(status, 2, CUTORCH_DIM_WARNING);
+  THCudaCheck(cudaGetLastError());
+}
+
+// FIXME: remove now that we have THCudaByteTensor?
+THC_API void
+THCTensor_(maskedSelectByte)(THCState* state,
+                             THCTensor *tensor, THCTensor *src, THByteTensor *mask)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
+  THLongStorage* maskSizes = THByteTensor_newSizeOf(mask);
+  THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL);
+  THLongStorage_free(maskSizes);
+  THCudaByteTensor_copyByte(state, maskCuda, mask);
+  THCTensor_(maskedSelect)(state, tensor, src, maskCuda);
+  THCudaByteTensor_free(state, maskCuda);
+}
+
+#endif
diff --git a/lib/THC/generic/THCTensorMasked.h b/lib/THC/generic/THCTensorMasked.h
new file mode 100644
index 0000000..98f5aee
--- /dev/null
+++ b/lib/THC/generic/THCTensorMasked.h
@@ -0,0 +1,38 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMasked.h"
+#else
+
+THC_API void THCTensor_(maskedFill)(THCState *state,
+                                    THCTensor *tensor,
+                                    THCudaByteTensor *mask,
+                                    real value);
+
+// FIXME: remove now that we have THCudaByteTensor?
+THC_API void THCTensor_(maskedFillByte)(THCState *state,
+                                        THCTensor *tensor,
+                                        THByteTensor *mask,
+                                        real value);
+
+THC_API void THCTensor_(maskedCopy)(THCState *state,
+                                    THCTensor *tensor,
+                                    THCudaByteTensor *mask,
+                                    THCTensor *src);
+
+// FIXME: remove now that we have THCudaByteTensor?
+THC_API void THCTensor_(maskedCopyByte)(THCState *state,
+                                        THCTensor *tensor,
+                                        THByteTensor *mask,
+                                        THCTensor *src);
+
+THC_API void THCTensor_(maskedSelect)(THCState *state,
+                                      THCTensor *tensor,
+                                      THCTensor *src,
+                                      THCudaByteTensor *mask);
+
+// FIXME: remove now that we have THCudaByteTensor?
+THC_API void THCTensor_(maskedSelectByte)(THCState *state,
+                                          THCTensor *tensor,
+                                          THCTensor *src,
+                                          THByteTensor *mask);
+
+#endif
diff --git a/lib/THC/generic/THCTensorMath.cu b/lib/THC/generic/THCTensorMath.cu
new file mode 100644
index 0000000..46746f7
--- /dev/null
+++ b/lib/THC/generic/THCTensorMath.cu
@@ -0,0 +1,394 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMath.cu"
+#else
+
+THC_API void
+THCTensor_(fill)(THCState* state, THCTensor *self_, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self_));
+
+  if (!THC_pointwiseApply1(
+        state, self_, TensorFillOp<real>(value))) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(zero)(THCState *state, THCTensor *self_)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self_));
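+  // Contiguous tensors can be zeroed with a single cudaMemsetAsync on the current
+  // stream; non-contiguous ones fall back to a pointwise fill kernel.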
+  if (THCTensor_(isContiguous)(state, self_)) {
+    THCudaCheck(cudaMemsetAsync(THCTensor_(data)(state, self_),
+                                0,
+                                sizeof(real) * THCTensor_(nElement)(state, self_),
+                                THCState_getCurrentStream(state)));
+  } else {
+    if (!THC_pointwiseApply1(
+          state, self_,
+          TensorFillOp<real>(ScalarConvert<int, real>::to(0)))) {
+      THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, r_));
+  THCTensor_(resize)(state, r_, size, NULL);
+  THCTensor_(zero)(state, r_);
+}
+
+THC_API void
+THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, r_));
+  THCTensor_(resize)(state, r_, size, NULL);
+  THCTensor_(fill)(state, r_, ScalarConvert<int, real>::to(1));
+}
+
+THC_API void
+THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage *size)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, r_, t));
+  THCTensor_(resize)(state, r_, size, NULL);
+  THCTensor_(copy)(state, r_, t);
+}
+
+ptrdiff_t
+THCTensor_(numel)(THCState *state, THCTensor *t)
+{
+  return THCTensor_(nElement)(state, t);
+}
+
+void THCTensor_(cat)(THCState *state, THCTensor *result,
+		     THCTensor *ta, THCTensor *tb, int dimension)
+{
+  THCTensor* inputs[2];
+  inputs[0] = ta;
+  inputs[1] = tb;
+  THCTensor_(catArray)(state, result, inputs, 2, dimension);
+}
+
+void THCTensor_(catArray)(THCState *state, THCTensor *result,
+			  THCTensor **inputs, int numInputs, int dimension)
+{
+  THLongStorage *size;
+  int i, j, cohortMax;
+  long offset;
+  bool hasEmptyInput = false;
+
+  // Even in the case where dimension is negative (i.e. when we want
+  // to cat along the last dimension), this logic still works, as the
+  // loop below will overwrite the value
+  int maxDim = dimension + 1;
+
+  // ldimension is the actual (0-based) dimension we cat along
+  int ldimension = dimension;
+
+  for (i = 0; i < numInputs; i++)
+  {
+    int inputDim = THCTensor_(nDimension)(state, inputs[i]);
+    hasEmptyInput |= !inputDim;
+    maxDim = THMax(maxDim, inputDim);
+  }
+
+  // In the event that the user specified -1 as the concat dimension, then
+  // we want to pick maxDim as the dimension to cat along (and thus maxDim - 1 as the
+  // value, due to 0-based indexing). If maxDim is 0 (i.e. we are catting all
+  // empty tensors), then we set ldimension to be 0
+  if (dimension + TH_INDEX_BASE == -1) {
+    ldimension = maxDim ? (maxDim - 1) : 0;
+  }
+
+  THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs);
+  THArgCheck(ldimension >= 0, 4, "invalid dimension %d", dimension + TH_INDEX_BASE);
+
+  size = THLongStorage_newWithSize(maxDim);
+  for(i = 0; i < maxDim; i++)
+  {
+    // dimSize is the size of dim i if it exists; otherwise 1 if the tensor is non-empty, else 0
+    long dimSize = i < THCTensor_(nDimension)(state, inputs[0])
+                       ? THCTensor_(size)(state, inputs[0], i)
+                       : THMin(THCTensor_(nDimension)(state, inputs[0]), 1);
+    if (i == ldimension)
+    {
+      for (j = 1; j < numInputs; j++)
+      {
+        // accumulate the size over the dimension we want to cat on.
+        // Empty tensors are allowed
+        dimSize += i < THCTensor_(nDimension)(state, inputs[j])
+                       ? THCTensor_(size)(state, inputs[j], i)
+                       : THMin(THCTensor_(nDimension)(state, inputs[j]), 1);
+      }
+    }
+    else
+    {
+      for (j = 1; j < numInputs; j++)
+      {
+        long sz = i < THCTensor_(nDimension)(state, inputs[j])
+                      ? THCTensor_(size)(state, inputs[j], i)
+                      : THMin(THCTensor_(nDimension)(state, inputs[j]), 1);
+
+        // For dimensions we are not catting on,
+        // fail if the sizes differ and both are non-zero
+        if (dimSize != sz && dimSize && sz) {
+          THLongStorage_free(size);
+          THError("inconsistent tensor sizes");
+        }
+        else if(!dimSize)
+        {
+          dimSize = sz;
+        }
+      }
+    }
+    size->data[i] = dimSize;
+  }
+
+  THCTensor_(resize)(state, result, size, NULL);
+  THLongStorage_free(size);
+
+  // We parallelize the copy if all 7 conditions pass:
+  //
+  // 1. There is more than one input tensor
+  // 2. No empty inputs
+  // 3. The result tensor is 32-bit indexable
+  // 4. The number of dimensions is <= 4
+  // 5. All input tensors are contiguous (output tensor may be non-contig)
+  // 6. All input tensors can use 32-bit indexing
+  // 7. All input tensors are on the same device
+
+  if (numInputs > 1 &&
+      !hasEmptyInput &&
+      THCTensor_(nDimension)(state, result) <= CAT_ARRAY_MAX_INPUT_DIMS &&
+      TensorUtils<THCTensor>::canUse32BitIndexMath(state, result) &&
+      TensorUtils<THCTensor>::allContiguous(state, inputs, numInputs) &&
+      TensorUtils<THCTensor>::all32BitIndexable(state, inputs, numInputs) &&
+      TensorUtils<THCTensor>::allSameDevice(state, inputs, numInputs)) {
+
+    // First, let's set up our kernel parameters. We start with a raw pointer to the storage
+    // for the output Tensor.
+    real *data = THCTensor_(data)(state, result);
+
+    // Kernel Parameter
+    CatArrInputTensor<real, unsigned int> stackInputs[CAT_ARRAY_BATCH_SIZE];
+    CatArrInputTensor<real, unsigned int> *d_inputs;
+
+    // Attempt to re-use stream's scratch space for the input metadata
+    bool usedScratch = false;
+    size_t tensorMetadataSize = sizeof(CatArrInputTensor<real, unsigned int>) * CAT_ARRAY_BATCH_SIZE;
+    if (THCState_getCurrentDeviceScratchSpaceSize(state) > tensorMetadataSize) {
+      void* space = THCState_getCurrentDeviceScratchSpace(state);
+      if (space) {
+        d_inputs = (CatArrInputTensor<real, unsigned int> *) space;
+        usedScratch = true;
+      }
+    }
+    if (!usedScratch) {
+      // Fallback to allocating GPU memory
+      THCudaCheck(THCudaMalloc(state, (void**) &d_inputs, tensorMetadataSize));
+    }
+
+    OutputTensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> param;
+
+    // Next, let's initialize the size, stride arrays for the output Tensor.
+    for (i = 0; i < maxDim; ++i) {
+      param.outputSize[i] = THCTensor_(size)(state, result, i);
+      param.outputStride[i] = THCTensor_(stride)(state, result, i);
+    }
+
+    // Template Declarations for dim = 1, 2, 3, 4
+#define HANDLE_CASE(DIMS) \
+  CatArrayBatchedCopy<real, unsigned int, DIMS><<<applyGrid, applyBlock>>>(data, d_inputs, param, ldimension, param.outputStride[dimension]);
+
+    // Now we loop
+    offset = 0;
+    for (i = 0; i < numInputs; i += CAT_ARRAY_BATCH_SIZE) {
+      cohortMax = 0;
+      for (j = 0; j < CAT_ARRAY_BATCH_SIZE && (i+j) < numInputs; ++j) {
+        long dimSize = ldimension < THCTensor_(nDimension)(state, inputs[i+j])
+          ? THCTensor_(size)(state, inputs[i+j], ldimension)
+          : 1;
+
+        stackInputs[j].input = THCTensor_(data)(state, inputs[i+j]);
+        stackInputs[j].offset = offset;
+        stackInputs[j].dimSize = dimSize;
+        stackInputs[j].nElements = THCTensor_(nElement)(state, inputs[i+j]);
+        cohortMax = cohortMax > stackInputs[j].nElements ? cohortMax : stackInputs[j].nElements;
+
+        // update offset
+        offset += dimSize;
+      }
+      cudaMemcpy(d_inputs, stackInputs, j * sizeof(CatArrInputTensor<real, unsigned int>), cudaMemcpyHostToDevice);
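+      // One host-to-device transfer ships the metadata (data pointer, offset along the
+      // cat dimension, dimSize, element count) for this whole batch of inputs, so a
+      // single kernel launch below can copy the entire batch.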
+
+      // Next, let's consider how we set our kernel launch parameters.
+      // We borrow the launch configuration from THCApply, on which the
+      // kernel's internal indexing is based.
+      dim3 applyBlock = getApplyBlock();
+
+      // We also re-use the applyGrid - but note that we use the maximum number of
+      // elements for a given tensor in this grouping to determine the count
+      dim3 applyGrid;
+      getApplyGrid(state, cohortMax, applyGrid);
+
+      // Next, we set our grid's y component to be the number of tensors in
+      // the batch. This will allow the kernel to determine which input
+      // tensor it is responsible for copying
+      applyGrid.y = j;
+
+      switch (maxDim) {
+        case 1:
+          HANDLE_CASE(1);
+          break;
+        case 2:
+          HANDLE_CASE(2);
+          break;
+        case 3:
+          HANDLE_CASE(3);
+          break;
+        case 4:
+          HANDLE_CASE(4);
+          break;
+      }
+      THCudaCheck(cudaGetLastError());
+    }
+    if (!usedScratch) {
+      THCudaCheck(THCudaFree(state, (void *)d_inputs));
+    }
+#undef HANDLE_CASE
+  } else {
+    offset = 0;
+    for (j = 0; j < numInputs; j++)
+    {
+      // No reason to copy when input is empty
+      if (!THCTensor_(nDimension)(state, inputs[j])) continue;
+
+      long dimSize = ldimension < THCTensor_(nDimension)(state, inputs[j])
+               ? THCTensor_(size)(state, inputs[j], ldimension)
+               : 1;
+
+      THCTensor *nt = THCTensor_(newWithTensor)(state, result);
+      THCTensor_(narrow)(state, nt, NULL, ldimension, offset, dimSize);
+      THCTensor_(copy)(state, nt, inputs[j]);
+      THCTensor_(free)(state, nt);
+      offset += dimSize;
+    }
+  }
+}
+
+void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor,
+                          THCTensor *self)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  THAssert(THCudaLongTensor_checkGPU(state, 1, tensor));
+
+
+  using namespace thrust::placeholders;
+  THCThrustAllocator thrustAlloc(state);
+  self = THCTensor_(newContiguous)(state, self);
+  thrust::device_ptr<real> self_data(THCTensor_(data)(state, self));
+
+  int num_dim = THCTensor_(nDimension)(state, self);
+  long N = THCTensor_(nElement)(state, self);
+
+  THCudaLongTensor_resize2d(state, tensor, N, num_dim);
+  tensor = THCudaLongTensor_newContiguous(state, tensor);
+  thrust::device_ptr<long> tensor_data(THCudaLongTensor_data(state, tensor));
+
+  thrust::counting_iterator<long> idxfirst(0);
+  thrust::counting_iterator<long> idxlast = idxfirst + N;
+
+  typedef thrust::device_ptr<long> Iter;
+  strided_range<Iter> strided_tensor(tensor_data,
+                                     tensor_data+N*num_dim, num_dim);
+
+#if CUDA_VERSION >= 7000
+  cudaStream_t stream = THCState_getCurrentStream(state);
+#endif
+
+  strided_range<Iter>::iterator dend = thrust::copy_if(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par(thrustAlloc).on(stream),
+#endif
+    idxfirst,
+    idxlast,
+    self_data,
+    strided_tensor.begin(),
+    NonZeroOp<real>()
+  );
+
+  long num_nonzeros = thrust::distance(strided_tensor.begin(), dend);
+
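+  // copy_if wrote the flat (linear) index of each non-zero element into column 0 of
+  // `tensor`; the transforms below expand those linear indices into per-dimension
+  // subscripts, one column per dimension, via div/mod (idx_functor).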
+  long div = 1;
+  for (int dim = num_dim-1; dim >= 0; dim--) {
+    strided_range<Iter> stride_dim(tensor_data+dim,
+                                   tensor_data+N*num_dim, num_dim);
+    thrust::transform(
+#if CUDA_VERSION >= 7000
+      thrust::cuda::par(thrustAlloc).on(stream),
+#endif
+      strided_tensor.begin(),
+      strided_tensor.end(),
+      stride_dim.begin(),
+      idx_functor(div, self->size[dim])
+    );
+    div *= self->size[dim];
+  }
+
+  THCudaLongTensor_resize2d(state, tensor, num_nonzeros, num_dim);
+
+  THCTensor_(free)(state, self);
+  THCudaLongTensor_free(state, tensor);
+
+  THCudaCheck(cudaGetLastError());
+}
+
+void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, long k){
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+  int nDimension = THCTensor_(nDimension)(state, src_);
+  THArgCheck((nDimension == 2) || (nDimension == 1), 1, "expected a matrix or a vector");
+  if (nDimension == 2) {
+    long stride0 = THCTensor_(stride)(state, src_, 0);
+    long stride1 = THCTensor_(stride)(state, src_, 1);
+    long size0 = THCTensor_(size)(state, src_, 0);
+    long size1 = THCTensor_(size)(state, src_, 1);
+    long size = (k > 0) ? min((long long)size0, (long long)size1 - k) : min((long long)size0 + k, (long long)size1);
+    THCTensor_(resize1d)(state, self_, size);
+    long strideSelf = THCTensor_(stride)(state, self_, 0);
+    const dim3 threads(min((long long)THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock, (long long)size));
+    dim3 grid(min((long long)1024, (long long)THCCeilDiv(size, (long)threads.x)));
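+    // For k >= 0 the diagonal starts at element (0, k), i.e. offset k * stride1;
+    // for k < 0 it starts at (-k, 0), i.e. offset -k * stride0. Consecutive diagonal
+    // elements are stride0 + stride1 apart.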
+    long start = (k >= 0 ? k * stride1 : -k * stride0);
+    THCTensor_copyFromDiagonal<real><<<grid, threads, 0, THCState_getCurrentStream(state)>>>
+    (THCTensor_(data)(state, self_), THCTensor_(data)(state, src_), start, size, stride0 + stride1, strideSelf);
+  } else {
+    ptrdiff_t totalElements = THCTensor_(nElement)(state, src_);
+    ptrdiff_t size = (k > 0) ? totalElements + k : totalElements - k;
+    long strideSrc = THCTensor_(stride)(state, src_, 0);
+    THCTensor_(resize2d)(state, self_, size, size);
+    THCTensor_(zero)(state, self_);
+    long stride0 = THCTensor_(stride)(state, self_, 0);
+    long stride1 = THCTensor_(stride)(state, self_, 1);
+    const dim3 threads(min((long long)THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock, (long long)size));
+    dim3 grid(min((long long)1024, (long long)THCCeilDiv(size, (ptrdiff_t)threads.x)));
+    ptrdiff_t start = (k >= 0 ? k * stride1 : -k * stride0);
+    THCTensor_copyToDiagonal<real><<<grid, threads, 0, THCState_getCurrentStream(state)>>>
+    (THCTensor_(data)(state, self_), THCTensor_(data)(state, src_), start, totalElements, stride0 + stride1, strideSrc);
+  }
+  THCudaCheck(cudaGetLastError());
+}
+
+accreal THCTensor_(trace)(THCState *state, THCTensor *src_) {
+  THAssert(THCTensor_(checkGPU)(state, 1, src_));
+  THArgCheck((src_->nDimension == 2), 1, "expected a matrix");
+  THCTensor *diag = THCTensor_(new)(state);
+  THCTensor_(diag)(state, diag, src_, 0);
+  accreal trace = THCTensor_(sumall)(state, diag);
+  THCTensor_(free)(state, diag);
+  return trace;
+}
+
+#endif
diff --git a/lib/THC/generic/THCTensorMath.h b/lib/THC/generic/THCTensorMath.h
new file mode 100644
index 0000000..2b8f563
--- /dev/null
+++ b/lib/THC/generic/THCTensorMath.h
@@ -0,0 +1,22 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMath.h"
+#else
+
+THC_API void THCTensor_(fill)(THCState *state, THCTensor *self, real value);
+THC_API void THCTensor_(zero)(THCState *state, THCTensor *self);
+
+THC_API void THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size);
+THC_API void THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size);
+THC_API void THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage *size);
+THC_API ptrdiff_t THCTensor_(numel)(THCState *state, THCTensor *t);
+THC_API void THCTensor_(cat)(THCState *state, THCTensor *result, THCTensor *ta, THCTensor *tb, int dimension);
+THC_API void THCTensor_(catArray)(THCState *state, THCTensor *result, THCTensor **inputs, int numInputs, int dimension);
+THC_API void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor, THCTensor *self);
+
+THC_API void THCTensor_(tril)(THCState *state, THCTensor *self, THCTensor *src, long k);
+THC_API void THCTensor_(triu)(THCState *state, THCTensor *self, THCTensor *src, long k);
+THC_API void THCTensor_(diag)(THCState *state, THCTensor *self, THCTensor *src, long k);
+THC_API accreal THCTensor_(trace)(THCState *state, THCTensor *self);
+
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathBlas.cu b/lib/THC/generic/THCTensorMathBlas.cu
new file mode 100644
index 0000000..f8d85cf
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathBlas.cu
@@ -0,0 +1,600 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathBlas.cu"
+#else
+
+THC_API accreal
+THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+  THArgCheck(THCTensor_(nElement)(state, self) ==
+             THCTensor_(nElement)(state, src), 2, "sizes do not match");
+
+  self = THCTensor_(newContiguous)(state, self);
+  src = THCTensor_(newContiguous)(state, src);
+
+#ifdef THC_REAL_IS_FLOAT
+  accreal result = THCudaBlas_Sdot(state,
+                                THCTensor_(nElement)(state, self),
+                                THCTensor_(data)(state, self), 1,
+                                THCTensor_(data)(state, src), 1);
+#elif defined(THC_REAL_IS_DOUBLE)
+  accreal result = THCudaBlas_Ddot(state,
+                                THCTensor_(nElement)(state, self),
+                                THCTensor_(data)(state, self), 1,
+                                THCTensor_(data)(state, src), 1);
+#elif defined(THC_REAL_IS_HALF)
+  accreal result = THCudaBlas_Hdot(state,
+                                THCTensor_(nElement)(state, self),
+                                THCTensor_(data)(state, self), 1,
+                                THCTensor_(data)(state, src), 1);
+#endif
+
+  THCTensor_(free)(state, src);
+  THCTensor_(free)(state, self);
+  return result;
+
+#else
+  THError("unimplemented data type");
+  return ScalarConvert<int, accreal>::to(0);
+#endif
+}
+
+THC_API void
+THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *mat, THCTensor *vec)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+  THAssert(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec));
+  if( (mat->nDimension != 2) || (vec->nDimension != 1) )
+    THError("matrix and vector expected");
+
+  if( mat->size[1] != vec->size[0] )
+    THError("size mismatch");
+
+  if(t->nDimension != 1)
+    THError("size mismatch");
+
+  if(t->size[0] != mat->size[0])
+    THError("size mismatch");
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+  if(r_ != t)
+  {
+    THCTensor_(resizeAs)(state, r_, t);
+    THCTensor_(copy)(state, r_, t);
+  }
+
+  if(mat->stride[0] == 1)
+  {
+#ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemv(state, 'n', mat->size[0], mat->size[1],
+                    alpha, THCTensor_(data)(state, mat), mat->stride[1],
+                    THCTensor_(data)(state, vec), vec->stride[0],
+                    beta, THCTensor_(data)(state, r_), r_->stride[0]);
+#elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemv(state, 'n', mat->size[0], mat->size[1],
+                    alpha, THCTensor_(data)(state, mat), mat->stride[1],
+                    THCTensor_(data)(state, vec), vec->stride[0],
+                    beta, THCTensor_(data)(state, r_), r_->stride[0]);
+#endif
+  }
+  else if(mat->stride[1] == 1)
+  {
+#ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemv(state, 't',  mat->size[1], mat->size[0],
+                    alpha, THCTensor_(data)(state, mat), mat->stride[0],
+                    THCTensor_(data)(state, vec), vec->stride[0],
+                    beta, THCTensor_(data)(state, r_), r_->stride[0]);
+#elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemv(state, 't',  mat->size[1], mat->size[0],
+                     alpha, THCTensor_(data)(state, mat), mat->stride[0],
+                     THCTensor_(data)(state, vec), vec->stride[0],
+                     beta, THCTensor_(data)(state, r_), r_->stride[0]);
+#endif
+  }
+  else
+  {
+    THCTensor *cmat = THCTensor_(newContiguous)(state, mat);
+
+#ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemv(state, 't',  mat->size[1], mat->size[0],
+                    alpha, THCTensor_(data)(state, cmat), cmat->stride[0],
+                    THCTensor_(data)(state, vec), vec->stride[0],
+                    beta, THCTensor_(data)(state, r_), r_->stride[0]);
+#elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemv(state, 't',  mat->size[1], mat->size[0],
+                    alpha, THCTensor_(data)(state, cmat), cmat->stride[0],
+                    THCTensor_(data)(state, vec), vec->stride[0],
+                    beta, THCTensor_(data)(state, r_), r_->stride[0]);
+#endif
+
+    THCTensor_(free)(state, cmat);
+  }
+
+#elif defined(THC_REAL_IS_HALF)
+  // Currently there is no Hgemv/SgemvEx in cuBLAS, so fall back to addmm.
+  THCTensor *vecAsMatrix = THCTensor_(newWithTensor)(state, vec);
+  THCTensor_(resize2d)(state, vecAsMatrix, vecAsMatrix->size[0], 1);
+
+  THCTensor *tAsMatrix = THCTensor_(newWithTensor)(state, t);
+  THCTensor_(resize2d)(state, tAsMatrix, tAsMatrix->size[0], 1);
+
+  THCTensor_(addmm)(state, r_, beta, tAsMatrix, alpha, mat, vecAsMatrix);
+
+  // r_ holds the answer as a matrix; resize it back to a vector.
+  THCTensor_(resize1d)(state, r_, r_->size[0]);
+  THCTensor_(free)(state, vecAsMatrix);
+  THCTensor_(free)(state, tAsMatrix);
+#endif
+#else
+  THError("unimplemented data type");
+#endif
+}
+
+THC_API void
+THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *vec1, THCTensor *vec2)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+  THAssert(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2));
+  if ( (vec1->nDimension != 1) || (vec2->nDimension != 1) ) {
+    THError("vector and vector expected");
+  }
+
+  if (t->nDimension != 2) {
+    THError("size mismatch");
+  }
+
+  if ( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) {
+    THError("size mismatch");
+  }
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+  if (r_ != t) {
+    THCTensor_(resizeAs)(state, r_, t);
+    THCTensor_(copy)(state, r_, t);
+  }
+
+  if(THCNumerics<real>::ne(beta, ScalarConvert<int, real>::to(1))) {
+    THCTensor_(mul)(state, r_, r_, beta);
+  }
+
+  if(r_->stride[0] == 1)
+  {
+#ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sger(state, vec1->size[0], vec2->size[0],
+                   alpha, THCTensor_(data)(state, vec1), vec1->stride[0],
+                   THCTensor_(data)(state, vec2), vec2->stride[0],
+                   THCTensor_(data)(state, r_), r_->stride[1]);
+#elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dger(state, vec1->size[0], vec2->size[0],
+                   alpha, THCTensor_(data)(state, vec1), vec1->stride[0],
+                   THCTensor_(data)(state, vec2), vec2->stride[0],
+                   THCTensor_(data)(state, r_), r_->stride[1]);
+#endif
+  }
+  else if(r_->stride[1] == 1)
+  {
+#ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sger(state, vec2->size[0], vec1->size[0],
+                   alpha, THCTensor_(data)(state, vec2), vec2->stride[0],
+                   THCTensor_(data)(state, vec1), vec1->stride[0],
+                   THCTensor_(data)(state, r_), r_->stride[0]);
+#elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dger(state, vec2->size[0], vec1->size[0],
+                   alpha, THCTensor_(data)(state, vec2), vec2->stride[0],
+                   THCTensor_(data)(state, vec1), vec1->stride[0],
+                   THCTensor_(data)(state, r_), r_->stride[0]);
+#endif
+  }
+  else
+  {
+    THCTensor *cr = THCTensor_(newClone)(state, r_);
+
+#ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sger(state, vec2->size[0], vec1->size[0],
+                   alpha, THCTensor_(data)(state, vec2), vec2->stride[0],
+                   THCTensor_(data)(state, vec1), vec1->stride[0],
+                   THCTensor_(data)(state, cr), cr->stride[0]);
+#elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dger(state, vec2->size[0], vec1->size[0],
+                   alpha, THCTensor_(data)(state, vec2), vec2->stride[0],
+                   THCTensor_(data)(state, vec1), vec1->stride[0],
+                   THCTensor_(data)(state, cr), cr->stride[0]);
+#endif
+
+    THCTensor_(freeCopyTo)(state, cr, r_);
+  }
+#elif defined(THC_REAL_IS_HALF)
+  // Currently there is no Hger/SgerEx in cuBLAS; fall back to addmm.
+  THCTensor *vec2T = THCTensor_(newWithTensor)(state, vec2);
+  THCTensor_(resize2d)(state, vec2T, vec2T->size[0], 1);
+  THCTensor_(transpose)(state, vec2T, NULL, 0, 1);
+
+  THCTensor *vec1M = THCTensor_(newWithTensor)(state, vec1);
+  THCTensor_(resize2d)(state, vec1M, vec1M->size[0], 1);
+
+  THCTensor_(addmm)(state, r_, beta, t, alpha, vec1M, vec2T);
+  THCTensor_(free)(state, vec2T);
+  THCTensor_(free)(state, vec1M);
+#endif
+#else
+  THError("unimplemented data type");
+#endif
+}
+
+THC_API void
+THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *m1, THCTensor *m2)
+{
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+
+  THAssert(THCTensor_(checkGPU)(state, 4, r_, t, m1, m2));
+  char transpose_r, transpose_m1, transpose_m2;
+  THCTensor *r__, *m1_, *m2_;
+
+  if( (m1->nDimension != 2) || (m2->nDimension != 2) )
+    THError("matrix and matrix expected");
+
+  if(t->nDimension != 2)
+    THError("size mismatch");
+
+  if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) || (m1->size[1] != m2->size[0]) )
+    THError("size mismatch");
+
+  if(t != r_)
+  {
+    THCTensor_(resizeAs)(state, r_, t);
+    THCTensor_(copy)(state, r_, t);
+  }
+
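+  /* cuBLAS gemm operates on column-major matrices.  The three blocks below
+     pick transpose flags and leading dimensions from whichever stride of
+     r_, m1 and m2 equals 1, so no copy is needed in the common cases; a
+     tensor with no unit stride is materialised contiguously instead. */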
+  /* r_ */
+  if(r_->stride[0] == 1 &&
+     r_->stride[1] != 0)
+  {
+    transpose_r = 'n';
+    r__ = r_;
+  }
+  else if(r_->stride[1] == 1 &&
+          r_->stride[0] != 0)
+  {
+    THCTensor *swap = m2;
+    m2 = m1;
+    m1 = swap;
+    transpose_r = 't';
+    r__ = r_;
+  }
+  else
+  {
+    transpose_r = 'n';
+
+    THCTensor *transp_r_ = THCTensor_(newTranspose)(state, r_, 0, 1);
+    r__ = THCTensor_(newClone)(state, transp_r_);
+    THCTensor_(free)(state, transp_r_);
+    THCTensor_(transpose)(state, r__, NULL, 0, 1);
+  }
+
+  /* m1 */
+  if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 &&
+     m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
+  {
+    transpose_m1 = 'n';
+    m1_ = m1;
+  }
+  else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 &&
+          m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
+  {
+    transpose_m1 = 't';
+    m1_ = m1;
+  }
+  else
+  {
+    transpose_m1 = (transpose_r == 'n' ? 't' : 'n');
+    m1_ = THCTensor_(newContiguous)(state, m1);
+  }
+
+  /* m2 */
+  if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 &&
+     m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
+  {
+    transpose_m2 = 'n';
+    m2_ = m2;
+  }
+  else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 &&
+          m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
+  {
+    transpose_m2 = 't';
+    m2_ = m2;
+  }
+  else
+  {
+    transpose_m2 = (transpose_r == 'n' ? 't' : 'n');
+    m2_ = THCTensor_(newContiguous)(state, m2);
+  }
+
+#ifdef THC_REAL_IS_HALF
+  THCudaBlas_Hgemm(state,
+                   transpose_m1,
+                   transpose_m2,
+                   r__->size[(transpose_r == 'n' ? 0 : 1)],
+                   r__->size[(transpose_r == 'n' ? 1 : 0)],
+                   m1_->size[(transpose_r == 'n' ? 1 : 0)],
+                   alpha,
+                   THCTensor_(data)(state, m1_),
+                   (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]),
+                   THCTensor_(data)(state, m2_),
+                   (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]),
+                   beta,
+                   THCTensor_(data)(state, r__),
+                   r__->stride[(transpose_r == 'n' ? 1 : 0)]);
+#elif defined(THC_REAL_IS_FLOAT)
+  THCudaBlas_Sgemm(state,
+                   transpose_m1,
+                   transpose_m2,
+                   r__->size[(transpose_r == 'n' ? 0 : 1)],
+                   r__->size[(transpose_r == 'n' ? 1 : 0)],
+                   m1_->size[(transpose_r == 'n' ? 1 : 0)],
+                   alpha,
+                   THCTensor_(data)(state, m1_),
+                   (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]),
+                   THCTensor_(data)(state, m2_),
+                   (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]),
+                   beta,
+                   THCTensor_(data)(state, r__),
+                   r__->stride[(transpose_r == 'n' ? 1 : 0)]);
+#elif defined(THC_REAL_IS_DOUBLE)
+  THCudaBlas_Dgemm(state,
+                   transpose_m1,
+                   transpose_m2,
+                   r__->size[(transpose_r == 'n' ? 0 : 1)],
+                   r__->size[(transpose_r == 'n' ? 1 : 0)],
+                   m1_->size[(transpose_r == 'n' ? 1 : 0)],
+                   alpha,
+                   THCTensor_(data)(state, m1_),
+                   (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]),
+                   THCTensor_(data)(state, m2_),
+                   (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]),
+                   beta,
+                   THCTensor_(data)(state, r__),
+                   r__->stride[(transpose_r == 'n' ? 1 : 0)]);
+#endif
+
+  /* free intermediate variables */
+  if(m1_ != m1) {
+    THCTensor_(free)(state, m1_);
+  }
+
+  if(m2_ != m2) {
+    THCTensor_(free)(state, m2_);
+  }
+
+  if(r__ != r_) {
+    THCTensor_(freeCopyTo)(state, r__, r_);
+  }
+#else
+  THError("unimplemented data type");
+#endif
+}
+
+THC_API void
+THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
+                   real alpha, THCTensor *batch1, THCTensor *batch2) {
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+  THAssert(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
+  THArgCheck(THCTensor_(nDimension)(state, t) == 2, 4, "expected 2D tensor");
+  THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor");
+  THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor");
+
+  long batchnum = THCTensor_(size)(state, batch1, 0);
+  long m1d1 = THCTensor_(size)(state, batch1, 1);
+  long innerdim = THCTensor_(size)(state, batch1, 2);
+  long m2d2 = THCTensor_(size)(state, batch2, 2);
+
+  THArgCheck(batchnum == THCTensor_(size)(state, batch2, 0), 7,
+      "equal number of batches expected");
+  // M is t, as listed in the docs under addbmm
+  THArgCheck(m1d1 == THCTensor_(size)(state, t, 0), 6,
+      "first dimension must match first dimension of M");
+  THArgCheck(m2d2 == THCTensor_(size)(state, t, 1), 7,
+      "second dimension must match second dimension of M");
+  THArgCheck(innerdim == THCTensor_(size)(state, batch2, 1), 6,
+      "second dimension must match first dimension of batch2");
+
+  if (t != result) {
+    THCTensor_(resizeAs)(state, result, t);
+    THCTensor_(copy)(state, result, t);
+  }
+
+  THCTensor *slice1 = THCTensor_(new)(state);
+  THCTensor *slice2 = THCTensor_(new)(state);
+  for (long i=0; i<batchnum; i++) {
+    THCTensor_(select)(state, slice1, batch1, 0, i);
+    THCTensor_(select)(state, slice2, batch2, 0, i);
+
+    THCTensor_(addmm)(state, result, beta, result, alpha, slice1, slice2);
+    beta = ScalarConvert<int, real>::to(1);
+  }
+  THCTensor_(free)(state, slice1);
+  THCTensor_(free)(state, slice2);
+#else
+  THError("unimplemented data type");
+#endif
+}
+
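+// Fills buffer[i] with the address of the i-th matrix in a batched tensor
+// (data + i * stride); cublas<S|D>gemmBatched consumes these pointer arrays.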
+__global__ void createBatchGemmBuffer(const real** buffer, real* data,
+                                      long stride, long num_batches) {
+  const long idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_batches) {
+    buffer[idx] = data + idx * stride;
+  }
+}
+
+THC_API void
+THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
+                    real alpha, THCTensor *batch1, THCTensor *batch2) {
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+  THAssert(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
+  THArgCheck(THCTensor_(nDimension)(state, t) == 3, 4, "expected 3D tensor");
+  THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor");
+  THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor");
+  THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch1, 0), 6,
+             "equal number of batches expected");
+  THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch2, 0), 7,
+             "equal number of batches expected");
+  THArgCheck(THCTensor_(size)(state, t, 1) == THCTensor_(size)(state, batch1, 1), 6,
+             "wrong matrix size");
+  THArgCheck(THCTensor_(size)(state, t, 2) == THCTensor_(size)(state, batch2, 2), 7,
+             "wrong matrix size");
+  THArgCheck(THCTensor_(size)(state, batch1, 2) == THCTensor_(size)(state, batch2, 1), 6,
+             "wrong matrix size");
+
+  if (t != result) {
+    THCTensor_(resizeAs)(state, result, t);
+    THCTensor_(copy)(state, result, t);
+  }
+
+  bool transpose_result;
+  char transpose_batch1, transpose_batch2;
+  long lda, ldb, ldc;
+  THCTensor *result_, *batch1_, *batch2_;
+  if (result->stride[1] == 1)
+  {
+    transpose_result = false;
+    result_ = result;
+    ldc = result_->stride[2];
+  }
+  else if (result->stride[2] == 1)
+  {
+    transpose_result = true;
+
+    THCTensor *swap = batch2;
+    batch2 = batch1;
+    batch1 = swap;
+
+    result_ = result;
+    ldc = result_->stride[1];
+  }
+  else
+  {
+    transpose_result = false;
+
+    THCTensor *transp_r_ = THCTensor_(newTranspose)(state, result, 1, 2);
+    result_ = THCTensor_(newClone)(state, transp_r_);
+    THCTensor_(free)(state, transp_r_);
+    THCTensor_(transpose)(state, result_, NULL, 1, 2);
+
+    ldc = result_->stride[2];
+  }
+
+  if (batch1->stride[transpose_result ? 2 : 1] == 1)
+  {
+    transpose_batch1 = 'n';
+    batch1_ = batch1;
+    lda = batch1_->stride[transpose_result ? 1 : 2];
+  }
+  else if (batch1->stride[transpose_result ? 1 : 2] == 1)
+  {
+    transpose_batch1 = 't';
+    batch1_ = batch1;
+    lda = batch1_->stride[transpose_result ? 2 : 1];
+  }
+  else
+  {
+    transpose_batch1 = transpose_result ? 'n' : 't';
+    batch1_ = THCTensor_(newContiguous)(state, batch1);
+    lda = batch1_->stride[1];
+  }
+
+  if (batch2->stride[transpose_result ? 2 : 1] == 1)
+  {
+    transpose_batch2 = 'n';
+    batch2_ = batch2;
+    ldb = batch2_->stride[transpose_result ? 1 : 2];
+  }
+  else if (batch2->stride[transpose_result ? 1 : 2] == 1)
+  {
+    transpose_batch2 = 't';
+    batch2_ = batch2;
+    ldb = batch2_->stride[transpose_result ? 2 : 1];
+  }
+  else
+  {
+    transpose_batch2 = transpose_result ? 'n' : 't';
+    batch2_ = THCTensor_(newContiguous)(state, batch2);
+    ldb = batch2_->stride[1];
+  }
+
+  // Compute pointers to matrices in each batch.
+  long num_batches = result_->size[0];
+  size_t matrices_size = num_batches * sizeof(real*);
+
+  // Allocate device-side pointer arrays; createBatchGemmBuffer fills them in below.
+  const real **d_matrices1, **d_matrices2;
+  real **d_result_matrices;
+  THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1, matrices_size));
+  THCudaCheck(THCudaMalloc(state, (void**)&d_matrices2, matrices_size));
+  THCudaCheck(THCudaMalloc(state, (void**)&d_result_matrices, matrices_size));
+
+  const long block = 512;
+  const long grid = (num_batches + block - 1) / block;
+
+  createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+    d_matrices1, THCTensor_(data)(state, batch1_), batch1_->stride[0],
+    num_batches);
+  createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+    d_matrices2, THCTensor_(data)(state, batch2_), batch2_->stride[0],
+    num_batches);
+  createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+    (const real**)d_result_matrices, THCTensor_(data)(state,result_),
+    result_->stride[0], num_batches);
+
+#ifdef THC_REAL_IS_FLOAT
+  THCudaBlas_SgemmBatched(
+      state,
+      transpose_batch1,
+      transpose_batch2,
+      result_->size[transpose_result ? 2 : 1],
+      result_->size[transpose_result ? 1 : 2],
+      batch1_->size[transpose_result ? 1 : 2],
+      alpha,
+      d_matrices1, lda,
+      d_matrices2, ldb,
+      beta,
+      d_result_matrices, ldc,
+      num_batches);
+#elif defined(THC_REAL_IS_DOUBLE)
+  THCudaBlas_DgemmBatched(
+      state,
+      transpose_batch1,
+      transpose_batch2,
+      result_->size[transpose_result ? 2 : 1],
+      result_->size[transpose_result ? 1 : 2],
+      batch1_->size[transpose_result ? 1 : 2],
+      alpha,
+      d_matrices1, lda,
+      d_matrices2, ldb,
+      beta,
+      d_result_matrices, ldc,
+      num_batches);
+#endif
+
+  THCudaFree(state, d_matrices1);
+  THCudaFree(state, d_matrices2);
+  THCudaFree(state, d_result_matrices);
+
+  if (batch1_ != batch1) {
+    THCTensor_(free)(state, batch1_);
+  }
+
+  if (batch2_ != batch2) {
+    THCTensor_(free)(state, batch2_);
+  }
+
+  if (result_ != result) {
+    THCTensor_(freeCopyTo)(state, result_, result);
+  }
+
+#else
+  THError("unimplemented data type");
+#endif
+}
+
+#endif
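
All of the routines in THCTensorMathBlas.cu reduce to the same GEMM contract,
C = beta*C + alpha*op(A)*op(B), dispatched to cuBLAS with the transpose and
leading-dimension selection shown in addmm.  As a semantics-only reference (a
minimal host-side sketch, not the upstream implementation), a column-major
gemm without transposes looks like this:

    #include <stddef.h>

    /* Reference column-major GEMM: C (m x n) = beta*C + alpha * A * B,
       where A is m x kk and B is kk x n.  lda/ldb/ldc are leading
       dimensions, i.e. the distance between consecutive columns. */
    static void ref_gemm(size_t m, size_t n, size_t kk,
                         float alpha, const float *A, size_t lda,
                         const float *B, size_t ldb,
                         float beta, float *C, size_t ldc) {
      for (size_t j = 0; j < n; ++j)
        for (size_t i = 0; i < m; ++i) {
          float acc = 0.0f;
          for (size_t p = 0; p < kk; ++p)
            acc += A[i + p * lda] * B[p + j * ldb];
          C[i + j * ldc] = beta * C[i + j * ldc] + alpha * acc;
        }
    }

addbmm accumulates this product over the batch dimension into a single 2D
result, while baddbmm keeps one result per batch via the gemmBatched path.
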
diff --git a/lib/THC/generic/THCTensorMathBlas.h b/lib/THC/generic/THCTensorMathBlas.h
new file mode 100644
index 0000000..f37910c
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathBlas.h
@@ -0,0 +1,13 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathBlas.h"
+#else
+
+THC_API accreal THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(addmv)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *mat, THCTensor *vec);
+THC_API void THCTensor_(addmm)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *mat1, THCTensor *mat2);
+THC_API void THCTensor_(addr)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *vec1, THCTensor *vec2);
+THC_API void THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2);
+THC_API void THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2);
+
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathCompare.cu b/lib/THC/generic/THCTensorMathCompare.cu
new file mode 100644
index 0000000..77f1ab5
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathCompare.cu
@@ -0,0 +1,101 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathCompare.cu"
+#else
+
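+// Scalar comparisons: each function compares every element of src against
+// `value`.  The first six write a 0/1 mask into a ByteTensor; the *ValueT
+// variants write the result in the tensor's own element type.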
+THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THC_logicalValue(state, self_, src,
+                   TensorLTValueOp<typename TensorUtils<THCTensor>::DataType,
+                   unsigned char>(value));
+}
+
+THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THC_logicalValue(state, self_, src,
+                   TensorGTValueOp<typename TensorUtils<THCTensor>::DataType,
+                   unsigned char>(value));
+}
+
+THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THC_logicalValue(state, self_, src,
+                   TensorLEValueOp<typename TensorUtils<THCTensor>::DataType,
+                   unsigned char>(value));
+}
+
+THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THC_logicalValue(state, self_, src,
+                   TensorGEValueOp<typename TensorUtils<THCTensor>::DataType,
+                   unsigned char>(value));
+}
+
+THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THC_logicalValue(state, self_, src,
+                   TensorEQValueOp<typename TensorUtils<THCTensor>::DataType,
+                   unsigned char>(value));
+}
+
+THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THC_logicalValue(state, self_, src,
+                   TensorNEValueOp<typename TensorUtils<THCTensor>::DataType,
+                   unsigned char>(value));
+}
+
+THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THC_logicalValue(state, self_, src,
+                   TensorLTValueOp<typename TensorUtils<THCTensor>::DataType,
+                   typename TensorUtils<THCTensor>::DataType>(value));
+}
+
+THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THC_logicalValue(state, self_, src,
+                   TensorGTValueOp<typename TensorUtils<THCTensor>::DataType,
+                   typename TensorUtils<THCTensor>::DataType>(value));
+}
+
+THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THC_logicalValue(state, self_, src,
+                   TensorLEValueOp<typename TensorUtils<THCTensor>::DataType,
+                   typename TensorUtils<THCTensor>::DataType>(value));
+}
+
+THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THC_logicalValue(state, self_, src,
+                   TensorGEValueOp<typename TensorUtils<THCTensor>::DataType,
+                   typename TensorUtils<THCTensor>::DataType>(value));
+}
+
+THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THC_logicalValue(state, self_, src,
+                   TensorEQValueOp<typename TensorUtils<THCTensor>::DataType,
+                   typename TensorUtils<THCTensor>::DataType>(value));
+}
+
+THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THC_logicalValue(state, self_, src,
+                   TensorNEValueOp<typename TensorUtils<THCTensor>::DataType,
+                   typename TensorUtils<THCTensor>::DataType>(value));
+}
+
+#endif
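
A minimal host-side sketch (assumed names, not the upstream functors) of what
the TensorLTValueOp-style operators plus THC_logicalValue amount to: apply a
scalar comparison to every element and store the result either as an unsigned
char mask or in the tensor's own element type.

    #include <cstddef>

    // Hypothetical host-side analogue of the *Value / *ValueT pairs above.
    template <typename T, typename Out>
    struct LTValueOp {
      explicit LTValueOp(T v) : value(v) {}
      Out operator()(T in) const { return in < value ? Out(1) : Out(0); }
      T value;
    };

    template <typename T, typename Out, typename Op>
    void logical_value(Out *out, const T *in, std::size_t n, Op op) {
      for (std::size_t i = 0; i < n; ++i)
        out[i] = op(in[i]);
    }

    // ltValue  ~ logical_value<float, unsigned char>(mask, data, n,
    //                LTValueOp<float, unsigned char>(0.5f));
    // ltValueT ~ logical_value<float, float>(outf, data, n,
    //                LTValueOp<float, float>(0.5f));
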
diff --git a/lib/THC/generic/THCTensorMathCompare.h b/lib/THC/generic/THCTensorMathCompare.h
new file mode 100644
index 0000000..7b8837c
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathCompare.h
@@ -0,0 +1,20 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathCompare.h"
+#else
+
+THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+
+THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathCompareT.cu b/lib/THC/generic/THCTensorMathCompareT.cu
new file mode 100644
index 0000000..4b59abf
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathCompareT.cu
@@ -0,0 +1,113 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathCompareT.cu"
+#else
+
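+// Element-wise tensor/tensor comparisons: the first six write a 0/1 mask
+// into a ByteTensor, the *TensorT variants write the result in the tensors'
+// own element type.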
+THC_API void
+THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorLTOp<typename TensorUtils<THCTensor>::DataType,
+                    unsigned char>());
+}
+
+THC_API void
+THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorGTOp<typename TensorUtils<THCTensor>::DataType,
+                    unsigned char>());
+}
+
+THC_API void
+THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorLEOp<typename TensorUtils<THCTensor>::DataType,
+                    unsigned char>());
+}
+
+THC_API void
+THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorGEOp<typename TensorUtils<THCTensor>::DataType,
+                    unsigned char>());
+}
+
+THC_API void
+THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorEQOp<typename TensorUtils<THCTensor>::DataType,
+                    unsigned char>());
+}
+
+THC_API void
+THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorNEOp<typename TensorUtils<THCTensor>::DataType,
+                    unsigned char>());
+}
+
+THC_API void
+THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorLTOp<typename TensorUtils<THCTensor>::DataType,
+                    typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorGTOp<typename TensorUtils<THCTensor>::DataType,
+                    typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorLEOp<typename TensorUtils<THCTensor>::DataType,
+                    typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorGEOp<typename TensorUtils<THCTensor>::DataType,
+                    typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorEQOp<typename TensorUtils<THCTensor>::DataType,
+                    typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorNEOp<typename TensorUtils<THCTensor>::DataType,
+                    typename TensorUtils<THCTensor>::DataType>());
+}
+
+#endif
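
The tensor/tensor variants follow the same pattern with a binary predicate.
The sketch below (assumed names, not the upstream code) also shows why the
same-type *TensorT outputs are convenient: the 0/1 result can be used
directly in arithmetic as a mask.

    #include <cstddef>

    // Hypothetical host-side analogue of ltTensor / ltTensorT.
    template <typename T, typename Out>
    void lt_tensor(Out *out, const T *a, const T *b, std::size_t n) {
      for (std::size_t i = 0; i < n; ++i)
        out[i] = a[i] < b[i] ? Out(1) : Out(0);
    }

    // Example: zero out entries of x that are not below the threshold t.
    //   lt_tensor<float, float>(mask, x, t, n);
    //   for (std::size_t i = 0; i < n; ++i) x[i] *= mask[i];
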
diff --git a/lib/THC/generic/THCTensorMathCompareT.h b/lib/THC/generic/THCTensorMathCompareT.h
new file mode 100644
index 0000000..0d76835
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathCompareT.h
@@ -0,0 +1,19 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathCompareT.h"
+#else
+
+THC_API void THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+
+THC_API void THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathMagma.cu b/lib/THC/generic/THCTensorMathMagma.cu
new file mode 100644
index 0000000..635834d
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathMagma.cu
@@ -0,0 +1,650 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathMagma.cu"
+#else
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+
+#ifdef USE_MAGMA
+
+static void THCTensor_(copyArray1d)(THCState *state, THCTensor *self, real *src, int k)
+{
+  long size[1] = { k };
+  long stride[1] = { 1 };
+  THCTensor_(rawResize)(state, self, 1, size, stride);
+  size_t len = k * sizeof(real);
+  THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
+}
+
+static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src, int m, int n)
+{
+  long size[2] = { m, n };
+  long stride[2] = { 1, m };
+  THCTensor_(rawResize)(state, self, 2, size, stride);
+  size_t len = m * n * sizeof(real);
+  THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
+}
+
+static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self)
+{
+  THAssert(self->nDimension == 2);
+  size_t len = THCTensor_(nElement)(state, self)*sizeof(real);
+  THCTensor *temp = THCTensor_(newTranspose)(state, self, 0, 1);
+  THCTensor *selfc = THCTensor_(newContiguous)(state, temp);
+  THCudaCheck(cudaMemcpy(dst, selfc->storage->data + selfc->storageOffset, len, cudaMemcpyDeviceToHost));
+  THCTensor_(free)(state, temp);
+  THCTensor_(free)(state, selfc);
+}
+
+#endif // USE_MAGMA
+
+static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, THCTensor *src)
+{
+  THAssert(src->nDimension == 2);
+  if (self == src && self->stride[0] == 1 && self->stride[1] == self->size[0])
+  {
+    THCTensor_(retain)(state, self);
+    return self;
+  }
+
+  if (self == src)
+    self = THCTensor_(new)(state);
+  else
+    THCTensor_(retain)(state, self);
+
+  long size[2] = { src->size[0], src->size[1] };
+  long stride[2] = { 1, src->size[0] };
+
+  THCTensor_(rawResize)(state, self, 2, size, stride);
+  THCTensor_(copy)(state, self, src);
+  return self;
+}
+
+
+THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_)
+{
+#ifdef USE_MAGMA
+  THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional");
+  THArgCheck(b_->nDimension == 2, 2, "b should be 2 dimensional");
+  THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square");
+  THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible");
+
+  int n = a_->size[0];
+  int nrhs = b_->size[1];
+
+  THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_);
+  THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_);
+  real *a_data = THCTensor_(data)(state, a);
+  real *b_data = THCTensor_(data)(state, b);
+
+  int *ipiv = th_magma_malloc_pinned<int>(n);
+
+  int info;
+#if defined(THC_REAL_IS_FLOAT)
+  magma_sgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info);
+#else
+  magma_dgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info);
+#endif
+
+  if (info < 0)
+    THError("MAGMA gesv : Argument %d : illegal value", -info);
+  else if (info > 0)
+    THError("MAGMA gesv : U(%d,%d) is zero, singular U.", info, info);
+
+  magma_free_pinned(ipiv);
+  THCTensor_(freeCopyTo)(state, a, ra_);
+  THCTensor_(freeCopyTo)(state, b, rb_);
+#else
+  THError(NoMagma(gesv));
+#endif
+}
+
+THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_)
+{
+#ifdef USE_MAGMA
+  THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional");
+  THArgCheck(b_->nDimension == 2, 1, "b should be 2 dimensional");
+  THArgCheck(a_->size[0] == b_->size[0], 2, "size incompatible A,b");
+  THArgCheck(a_->size[0] >= a_->size[1], 2, "A should have m >= n");
+
+  THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_);
+  THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_);
+  real *a_data = THCTensor_(data)(state, a);
+  real *b_data = THCTensor_(data)(state, b);
+
+  int m = a->size[0];
+  int n = a->size[1];
+  int nrhs = b->size[1];
+  real wkopt;
+
+  int info;
+#if defined(THC_REAL_IS_FLOAT)
+  magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info);
+#else
+  magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info);
+#endif
+
+  real *hwork = th_magma_malloc_pinned<real>((size_t)wkopt);
+
+#if defined(THC_REAL_IS_FLOAT)
+  magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info);
+#else
+  magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info);
+#endif
+
+  magma_free_pinned(hwork);
+
+  if (info != 0)
+    THError("MAGMA gels : Argument %d : illegal value", -info);
+
+  THCTensor_(freeCopyTo)(state, a, ra_);
+  THCTensor_(freeCopyTo)(state, b, rb_);
+#else
+  THError(NoMagma(gels));
+#endif
+}
+
+THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos)
+{
+#ifdef USE_MAGMA
+  int n = a->size[0];
+  int lda = n;
+
+  magma_uplo_t uplo = uplos[0] == 'U' ?  MagmaUpper : MagmaLower;
+  magma_vec_t jobz = jobzs[0] == 'N' ? MagmaNoVec : MagmaVec;
+
+  THCTensor *input = THCTensor_(newColumnMajor)(state, rv_, a);
+  real *input_data = THCTensor_(data)(state, input);
+
+  // eigenvalues and workspace
+  real *w = th_magma_malloc_pinned<real>(n);
+  real *wA = th_magma_malloc_pinned<real>(lda * n);  // ?syevd_gpu documents wA as size (ldwa, n)
+
+  // compute optimal size of work array
+  int info;
+  real lwork;
+  int liwork;
+
+#if defined(THC_REAL_IS_FLOAT)
+  magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info);
+#else
+  magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info);
+#endif
+
+  real *work = th_magma_malloc_pinned<real>((size_t)lwork);
+  int *iwork = th_magma_malloc_pinned<int>(liwork);
+
+  // compute eigenvalues and, optionally, eigenvectors
+#if defined(THC_REAL_IS_FLOAT)
+  magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info);
+#else
+  magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info);
+#endif
+
+  // copy eigenvalues from w to re_
+  if (info == 0)
+    THCTensor_(copyArray1d)(state, re_, w, n);
+
+  magma_free_pinned(iwork);
+  magma_free_pinned(work);
+  magma_free_pinned(wA);
+  magma_free_pinned(w);
+
+  // check error value
+  if (info > 0)
+    THError("MAGMA syev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info);
+  else if (info < 0)
+    THError("MAGMA syev : Argument %d : illegal value", -info);
+
+  THCTensor_(freeCopyTo)(state, input, rv_);
+#else
+  THError(NoMagma(syev));
+#endif
+}
+
+THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs)
+{
+#ifdef USE_MAGMA
+  THArgCheck(a_->nDimension == 2, 3, "A should be 2 dimensional");
+  THArgCheck(a_->size[0] == a_->size[1], 3, "A should be square");
+
+  magma_vec_t jobvr = jobvrs[0] == 'N' ? MagmaNoVec : MagmaVec;
+  int n = a_->size[0];
+
+  real *a_data = th_magma_malloc_pinned<real>(n * n);
+  THCTensor_(copyTensor2d)(state, a_data, a_);
+
+  real *wr = th_magma_malloc_pinned<real>(n);
+  real *wi = th_magma_malloc_pinned<real>(n);
+
+  real *vr_data = NULL;
+  int ldvr = 1;
+  if (jobvr == MagmaVec)
+  {
+    vr_data = th_magma_malloc_pinned<real>(n * n);
+    ldvr = n;
+  }
+
+  real wkopt;
+  int info;
+
+#if defined(THC_REAL_IS_FLOAT)
+  magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info);
+#else
+  magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info);
+#endif
+
+  int lwork = (int) wkopt;
+  real *work_data = th_magma_malloc_pinned<real>(lwork);
+
+#if defined(THC_REAL_IS_FLOAT)
+  magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info);
+#else
+  magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info);
+#endif
+
+  if (info > 0)
+    THError("MAGMA geev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info);
+  else if (info < 0)
+    THError("MAGMA geev : Argument %d : illegal value", -info);
+
+  {
+    THCTensor_(resize2d)(state, re_, 2, n);
+    THCTensor *re = THCTensor_(newContiguous)(state, re_);
+    THCudaCheck(cudaMemcpy(re->storage->data + re->storageOffset, wr, n*sizeof(real), cudaMemcpyHostToDevice));
+    THCudaCheck(cudaMemcpy(re->storage->data + re->storageOffset + n, wi, n*sizeof(real), cudaMemcpyHostToDevice));
+    THCTensor_(freeCopyTo)(state, re, re_);
+    THCTensor_(transpose)(state, re_, NULL, 0, 1);
+  }
+
+  if (jobvr == MagmaVec)
+    THCTensor_(copyArray2d)(state, rv_, vr_data, n, n);
+
+  magma_free_pinned(work_data);
+  magma_free_pinned(vr_data);
+  magma_free_pinned(wi);
+  magma_free_pinned(wr);
+  magma_free_pinned(a_data);
+
+#else
+  THError(NoMagma(geev));
+#endif
+}
+
+THC_API void THCTensor_(gesvd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu)
+{
+#ifdef USE_MAGMA
+  THCTensor *ra_ = THCTensor_(new)(state);
+  THCTensor_(gesvd2)(state, ru_, rs_, rv_,  ra_, a, jobu);
+  THCTensor_(free)(state, ra_);
+#else
+  THError(NoMagma(gesvd));
+#endif
+}
+
+THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus)
+{
+#ifdef USE_MAGMA
+  THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+
+  magma_vec_t jobu = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec;
+  magma_vec_t jobvt = jobu;
+
+  int m = a->size[0];
+  int n = a->size[1];
+  int k = m < n ? m : n;
+  int j = (jobu == MagmaAllVec) ? m : k;
+
+  real *a_data = th_magma_malloc_pinned<real>(m * n);
+  THCTensor_(copyTensor2d)(state, a_data, a);
+
+  real *rs_data = th_magma_malloc_pinned<real>(k);
+  real *ru_data = th_magma_malloc_pinned<real>(m * j);
+  real *rv_data = th_magma_malloc_pinned<real>(n * n);
+
+  real wkopt;
+  int info;
+
+#if defined(THC_REAL_IS_FLOAT)
+  magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info);
+#else
+  magma_dgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info);
+#endif
+
+  int lwork = (int) wkopt;
+  real *work_data = th_magma_malloc_pinned<real>(lwork);
+
+#if defined(THC_REAL_IS_FLOAT)
+  magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info);
+#else
+  magma_dgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info);
+#endif
+
+  if (info > 0)
+    THError("MAGMA gesvd : %d superdiagonals failed to converge", info);
+  else if (info < 0)
+    THError("MAGMA gesvd : Argument %d : illegal value", -info);
+
+  THCTensor_(copyArray2d)(state, rv_, rv_data, n, n);
+  THCTensor_(transpose)(state, rv_, NULL, 0, 1);
+  THCTensor_(copyArray2d)(state, ru_, ru_data, m, j);
+  THCTensor_(copyArray1d)(state, rs_, rs_data, k);
+  THCTensor_(copyArray2d)(state, ra_, a_data,  m, n);
+
+  magma_free_pinned(work_data);
+  magma_free_pinned(rv_data);
+  magma_free_pinned(ru_data);
+  magma_free_pinned(rs_data);
+  magma_free_pinned(a_data);
+#else
+  THError(NoMagma(gesvd2));
+#endif
+}
+
+THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a)
+{
+#ifdef USE_MAGMA
+  THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+
+  int info;
+  int n = a->size[0];
+  int lwork = n * magma_get_sgetri_nb(n);
+
+  THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a);
+  real *input_data = THCTensor_(data)(state, input);
+
+  int *ipiv = th_magma_malloc_pinned<int>(n);
+
+  THCTensor *work = THCTensor_(newWithSize1d)(state, lwork);
+  real *work_data = THCTensor_(data)(state, work);
+
+  // Run LU
+#if defined(THC_REAL_IS_FLOAT)
+  magma_sgetrf_gpu(n, n, input_data, n, ipiv, &info);
+#else
+  magma_dgetrf_gpu(n, n, input_data, n, ipiv, &info);
+#endif
+
+  if (info > 0)
+    THError("MAGMA getrf : U(%d,%d) is 0, U is singular", info, info);
+  else if (info < 0)
+    THError("MAGMA getrf : Argument %d : illegal value", -info);
+
+  // Inverse
+#if defined(THC_REAL_IS_FLOAT)
+  magma_sgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info);
+#else
+  magma_dgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info);
+#endif
+
+  if (info > 0)
+    THError("MAGMA getri : U(%d,%d) is 0, U is singular", info, info);
+  else if (info < 0)
+    THError("MAGMA getri : Argument %d : illegal value", -info);
+
+  THCTensor_(free)(state, work);
+  magma_free_pinned(ipiv);
+  THCTensor_(freeCopyTo)(state, input, ra_);
+#else
+  THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+
+  int n = a->size[0];
+
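+  // No MAGMA available: fall back to the cuBLAS batched LU routines
+  // (getrf/getri) with a batch of size one.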
+  // input
+  THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a);
+  // output
+  THCTensor *output = THCTensor_(newColumnMajor)(state, ra_, a);
+
+  size_t matrices_size = sizeof(real*);
+
+  real **matrices1 = (real **)THAlloc(matrices_size);
+  const real **matrices1_const = (const real **)THAlloc(matrices_size);
+  real **matrices2 = (real **)THAlloc(matrices_size);
+  matrices1[0] = THCTensor_(data)(state, input);
+  matrices1_const[0] = THCTensor_(data)(state, input);
+  matrices2[0] = THCTensor_(data)(state, output);
+
+  // Copy pointers to device.
+  real **d_matrices1, **d_matrices2;
+  const real **d_matrices1_const;
+  THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1, matrices_size));
+  THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1_const, matrices_size));
+  THCudaCheck(THCudaMalloc(state, (void**)&d_matrices2, matrices_size));
+
+  THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, matrices_size,
+                              cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
+  THCudaCheck(cudaMemcpyAsync(d_matrices1_const, matrices1_const, matrices_size,
+                              cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
+  THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, matrices_size,
+                              cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
+  int info;
+  int *info_gpu;
+  THCudaCheck(THCudaMalloc(state, (void**)&info_gpu, sizeof(int)));
+
+  int *ipiv_gpu;
+  THCudaCheck(THCudaMalloc(state, (void**)&ipiv_gpu, n * sizeof(int)));
+
+  // Run LU
+#if defined(THC_REAL_IS_FLOAT)
+  THCudaBlas_Sgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1);
+#else
+  THCudaBlas_Dgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1);
+#endif
+
+  THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost));
+
+  if (info > 0)
+    THError("CUBLAS getrf : U(%d,%d) is 0, U is singular", info, info);
+  else if (info < 0)
+    THError("CUBLAS getrf : Argument %d : illegal value", -info);
+
+  // Inverse
+#if defined(THC_REAL_IS_FLOAT)
+  THCudaBlas_Sgetri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1);
+#else
+  THCudaBlas_Dgetri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1);
+#endif
+
+  if (info > 0)
+    THError("CUBLAS getri : U(%d,%d) is 0, U is singular", info, info);
+  else if (info < 0)
+    THError("CUBLAS getri : Argument %d : illegal value", -info);
+
+  THCudaCheck(THCudaFree(state, ipiv_gpu));
+  THCudaCheck(THCudaFree(state, info_gpu));
+  THCTensor_(freeCopyTo)(state, output, input);
+#endif
+}
+
+__global__ void THCTensor_(copyUpperSymmetric)(real *input, int n, int len)
+{
+  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) {
+    const int r = idx % n;
+    const int c = idx / n;
+    if (r > c) {
+      input[idx] = input[r*n + c];
+    }
+  }
+}
+
+__global__ void THCTensor_(copyLowerSymmetric)(real *input, int n, int len)
+{
+  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) {
+    const int r = idx % n;
+    const int c = idx / n;
+    if (r < c) {
+      input[idx] = input[r*n + c];
+    }
+  }
+}
+
+THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo)
+{
+#ifdef USE_MAGMA
+  THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+
+  int n = a->size[0];
+  magma_uplo_t ul = uplo[0] == 'U' ?  MagmaUpper : MagmaLower;
+
+  THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a);
+  real *input_data = THCTensor_(data)(state, input);
+
+  int info;
+#if defined(THC_REAL_IS_FLOAT)
+  magma_spotri_gpu(ul, n, input_data, n, &info);
+#else
+  magma_dpotri_gpu(ul, n, input_data, n, &info);
+#endif
+
+  if (info > 0)
+    THError("MAGMA potri : A(%d,%d) is 0, A cannot be factorized", info, info);
+  else if (info < 0)
+    THError("MAGMA potri : Argument %d : illegal value", -info);
+
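+  // potri only writes the requested triangle; mirror it across the diagonal
+  // so that the full symmetric inverse is returned.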
+  cudaStream_t stream = THCState_getCurrentStream(state);
+  const int len = n*n;
+  dim3 blocks(std::min(DIVUP(len, 128), 65535));
+  dim3 threads(128);
+  if (uplo[0] == 'U') {
+    THCTensor_(copyUpperSymmetric)<<<blocks, threads, 0, stream>>>(input_data, n, len);
+  } else {
+    THCTensor_(copyLowerSymmetric)<<<blocks, threads, 0, stream>>>(input_data, n, len);
+  }
+
+  THCTensor_(freeCopyTo)(state, input, ra_);
+#else
+  THError(NoMagma(potri));
+#endif
+}
+
+THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo)
+{
+#ifdef USE_MAGMA
+  THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+
+  int n = a->size[0];
+  magma_uplo_t ul = uplo[0] == 'U' ?  MagmaUpper : MagmaLower;
+
+  THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a);
+  real *input_data = THCTensor_(data)(state, input);
+
+  int info;
+#if defined(THC_REAL_IS_FLOAT)
+  magma_spotrf_gpu(ul, n, input_data, n, &info);
+#else
+  magma_dpotrf_gpu(ul, n, input_data, n, &info);
+#endif
+
+  // check error value
+  if (info > 0)
+    THError("MAGMA potrf : A(%d,%d) is 0, A cannot be factorized", info, info);
+  else if (info < 0)
+    THError("MAGMA potrf : Argument %d : illegal value", -info);
+
+  if (uplo[0] == 'U') {
+    THCTensor_(triu)(state, ra_, input, 0);
+  } else {
+    THCTensor_(tril)(state, ra_, input, 0);
+  }
+  THCTensor_(free)(state, input);
+#else
+  THError(NoMagma(potrf));
+#endif
+}
+
+THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo)
+{
+#ifdef USE_MAGMA
+  THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+
+  int n = a->size[0];
+  int nrhs = b->size[1];
+  magma_uplo_t ul = uplo[0] == 'U' ?  MagmaUpper : MagmaLower;
+
+  THCTensor *b_ = THCTensor_(newColumnMajor)(state, rb_, b);
+  real *b_data = THCTensor_(data)(state, b_);
+  THCTensor *a_ = THCTensor_(newColumnMajor)(state, a, a);
+  real *a_data = THCTensor_(data)(state, a_);
+
+  int info;
+#if defined(THC_REAL_IS_FLOAT)
+  magma_spotrs_gpu(ul, n, nrhs, a_data, n, b_data, n, &info);
+#else
+  magma_dpotrs_gpu(ul, n, nrhs, a_data, n, b_data, n, &info);
+#endif
+
+  // check error value
+  if (info < 0)
+    THError("MAGMA potrs : Argument %d : illegal value", -info);
+
+  THCTensor_(freeCopyTo)(state, b_, rb_);
+  THCTensor_(free)(state, a_);
+#else
+  THError(NoMagma(potrs));
+#endif
+}
+
+THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_)
+{
+#ifdef USE_MAGMA
+  THArgCheck(a_->nDimension == 2, 2, "A should be 2 dimensional");
+
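+  // geqrf factors A into R (kept in the upper triangle) plus Householder
+  // reflectors stored in tau; orgqr then expands the reflectors into an
+  // explicit Q.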
+  THCTensor *a = THCTensor_(newColumnMajor)(state, rr_, a_);
+  int m = a->size[0];
+  int n = a->size[1];
+  int k = (m < n ? m : n);
+
+#ifdef MAGMA_V2
+  int nb = magma_get_sgeqrf_nb(m, n);
+#else
+  int nb = magma_get_sgeqrf_nb(m);
+#endif
+
+  real *a_data = THCTensor_(data)(state, a);
+  real *tau_data = th_magma_malloc_pinned<real>(n*n);
+
+  THCTensor *work = THCTensor_(newWithSize1d)(state, (2*k + ((n+31)/32)*32)*nb);
+  real *work_data = THCTensor_(data)(state, work);
+
+  int info;
+#if defined(THC_REAL_IS_FLOAT)
+  magma_sgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info);
+#else
+  magma_dgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info);
+#endif
+
+  if (info != 0)
+    THError("MAGMA geqrf : Argument %d : illegal value.", -info);
+
+  THCTensor *q = THCTensor_(newColumnMajor)(state, rq_, a);
+  real *q_data = THCTensor_(data)(state, q);
+
+  THCTensor_(narrow)(state, a, a, 0, 0, k);
+  THCTensor_(triu)(state, rr_, a, 0);
+  THCTensor_(free)(state, a);
+
+#if defined(THC_REAL_IS_FLOAT)
+  magma_sorgqr_gpu(m, k, k, q_data, m, tau_data, work_data, nb, &info);
+#else
+  magma_dorgqr_gpu(m, k, k, q_data, m, tau_data, work_data, nb, &info);
+#endif
+
+  if (info != 0)
+    THError("MAGMA orgqr : Argument %d : illegal value.", -info);
+
+  THCTensor_(free)(state, work);
+  magma_free_pinned(tau_data);
+
+  THCTensor_(narrow)(state, q, q, 1, 0, k);
+  THCTensor_(freeCopyTo)(state, q, rq_);
+#else
+  THError(NoMagma(qr));
+#endif
+}
+
+#endif
+
+#endif
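
MAGMA, like LAPACK, expects column-major storage, which is why this file
builds its inputs through newColumnMajor (strides {1, m}) and round-trips
host buffers with copyTensor2d.  A minimal sketch of the underlying index
mapping (assumed name, not the upstream helper):

    #include <cstddef>

    // Row-major (m x n) source to column-major destination, i.e. the layout
    // newColumnMajor sets up with strides {1, m}.
    static void to_column_major(float *dst, const float *src,
                                std::size_t m, std::size_t n) {
      for (std::size_t r = 0; r < m; ++r)
        for (std::size_t c = 0; c < n; ++c)
          dst[r + c * m] = src[r * n + c];
    }
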
diff --git a/lib/THC/generic/THCTensorMathMagma.h b/lib/THC/generic/THCTensorMathMagma.h
new file mode 100644
index 0000000..938daea
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathMagma.h
@@ -0,0 +1,23 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathMagma.h"
+#else
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+
+// MAGMA (a GPU implementation of LAPACK routines)
+THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_);
+THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_);
+THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobz, const char *uplo);
+THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvr);
+THC_API void THCTensor_(gesvd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu);
+THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobu);
+THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a);
+THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo);
+THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo);
+THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo);
+THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a);
+
+
+#endif // defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathPairwise.cu b/lib/THC/generic/THCTensorMathPairwise.cu
new file mode 100644
index 0000000..0b4094b
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathPairwise.cu
@@ -0,0 +1,213 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathPairwise.cu"
+#else
+
+THC_API void
+THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+  if (self_ == src_) {
+    if (!THC_pointwiseApply1(state, self_, TensorAddConstantOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src_);
+
+    if (!THC_pointwiseApply2(state, self_, src_, TensorAddConstantOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+  if (self_ == src_) {
+    if (!THC_pointwiseApply1(state, self_, TensorSubConstantOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src_);
+
+    if (!THC_pointwiseApply2(state, self_, src_, TensorSubConstantOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+  if (self_ == src_) {
+    if (!THC_pointwiseApply1(state, self_, TensorMulConstantOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src_);
+
+    if (!THC_pointwiseApply2(state, self_, src_, TensorMulConstantOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+  THArgCheck(value != ScalarConvert<int, real>::to(0), 3, "divide by zero");
+
+  if (self_ == src_) {
+    if (!THC_pointwiseApply1(state, self_, TensorDivConstantOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src_);
+
+    if (!THC_pointwiseApply2(state, self_, src_, TensorDivConstantOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+  if (self_ == src_) {
+    if (!THC_pointwiseApply1(state, self_, TensorFmodOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src_);
+
+    if (!THC_pointwiseApply2(state, self_, src_, TensorFmodOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+  if (self_ == src_) {
+    if (!THC_pointwiseApply1(state, self_, TensorRemainderOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src_);
+
+    if (!THC_pointwiseApply2(state, self_, src_, TensorRemainderOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, long k)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+  THArgCheck(src_->nDimension == 2, 1, "expected a matrix");
+
+  THCTensor *src = src_;
+  if (self_ == src_)
+    src = THCTensor_(newContiguous)(state, src_);
+
+  long stride0 = src->stride[0];
+  long stride1 = src->stride[1];
+  real *start = THCTensor_(data)(state, src) + src->storageOffset;
+
+  TensorTriOp<real, 0> op(start, stride0, stride1, k);
+
+  if (self_ == src_) {
+    if (!THC_pointwiseApply1(state, src, op)) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src);
+
+    if (!THC_pointwiseApply2(state, self_, src, op)) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  if (self_ == src_)
+    THCTensor_(freeCopyTo)(state, src, src_);
+
+  THCudaCheck(cudaGetLastError());
+}
+
+void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, long k)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+  THArgCheck(src_->nDimension == 2, 1, "expected a matrix");
+
+  THCTensor *src = src_;
+  if (self_ == src_)
+    src = THCTensor_(newContiguous)(state, src_);
+
+  long stride0 = src->stride[0];
+  long stride1 = src->stride[1];
+  real *start = THCTensor_(data)(state, src) + src->storageOffset;
+
+  TensorTriOp<real, 1> op(start, stride0, stride1, k);
+
+  if (self_ == src_) {
+    if (!THC_pointwiseApply1(state, src, op)) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src);
+
+    if (!THC_pointwiseApply2(state, self_, src, op)) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  if (self_ == src_)
+    THCTensor_(freeCopyTo)(state, src, src_);
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+  if (!THCTensor_(isSameSizeAs)(state, self_, src_)) {
+    return 0;
+  }
+
+  // This is not as efficient as TH, but the basic idea: create a buffer that stores
+  // 1 if the two tensors are equal at a position, otherwise 0. If the minimum value
+  // in this buffer is 1, the two tensors are equal, otherwise they are not
+
+  THLongStorage *size = THCTensor_(newSizeOf)(state, self_);
+  THCudaByteTensor *buf = THCudaByteTensor_newWithSize(state, size, NULL);
+
+  if (!THC_pointwiseApply3(state, buf, self_, src_, TensorEQOp<real, unsigned char>())) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+
+  unsigned char min = THCudaByteTensor_minall(state, buf);
+
+  THLongStorage_free(size);
+  THCudaByteTensor_free(state, buf);
+
+  return min != 0;
+}
+
+#endif
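
Each of these pairwise (tensor, scalar) kernels follows the same pattern: an in-place pointwiseApply1 when the destination aliases the source, otherwise a resizeAs plus pointwiseApply2. A minimal sketch of both paths through the float instantiation, with the THCState setup included; the sizes and values are made up for illustration.

    #include <stdio.h>
    #include "THC/THC.h"

    int main(void)
    {
      THCState *state = THCState_alloc();
      THCudaInit(state);

      THCudaTensor *src = THCudaTensor_newWithSize2d(state, 2, 3);
      THCudaTensor_fill(state, src, 1.0f);

      THCudaTensor *dst = THCudaTensor_new(state);
      THCudaTensor_add(state, dst, src, 2.5f);   /* out-of-place: dst resized to src, dst = src + 2.5 */
      THCudaTensor_mul(state, src, src, 0.5f);   /* in-place: single-tensor path, src *= 0.5 */

      printf("dst sum = %f\n", THCudaTensor_sumall(state, dst));

      THCudaTensor_free(state, dst);
      THCudaTensor_free(state, src);
      THCudaShutdown(state);
      THCState_free(state);
      return 0;
    }
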
diff --git a/lib/THC/generic/THCTensorMathPairwise.h b/lib/THC/generic/THCTensorMathPairwise.h
new file mode 100644
index 0000000..261c203
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathPairwise.h
@@ -0,0 +1,14 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathPairwise.h"
+#else
+
+THC_API void THCTensor_(add)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(sub)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(mul)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(div)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(fmod)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(remainder)(THCState *state, THCTensor *self, THCTensor *src, real value);
+
+THC_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src);
+
+#endif
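
equal() above builds a byte buffer of elementwise matches and reduces it with minall(). A short sketch of using it next to the scalar remainder, assuming `state`, `src` and `dst` as prepared in the sketch after THCTensorMathPairwise.cu.

    THCudaTensor_remainder(state, dst, src, 2.0f);   /* dst = src mod 2, elementwise */
    if (THCudaTensor_equal(state, dst, src)) {
      /* same sizes and identical elements */
    }
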
diff --git a/lib/THC/generic/THCTensorMathPointwise.cu b/lib/THC/generic/THCTensorMathPointwise.cu
new file mode 100644
index 0000000..b97908a
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathPointwise.cu
@@ -0,0 +1,522 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathPointwise.cu"
+#else
+
+#define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_(NAME, CFUNC, REAL)             \
+  struct Tensor_##NAME##_##REAL##_Op {                                  \
+    __device__ __forceinline__ void operator()(real* out, real* in) const { \
+      *out = CFUNC(*in);                                                \
+    }                                                                   \
+                                                                        \
+    __device__ __forceinline__ void operator()(real* v) const {         \
+      *v = CFUNC(*v);                                                   \
+    }                                                                   \
+  };                                                                    \
+                                                                        \
+  void THCTensor_(NAME)(THCState* state, THCTensor* self_, THCTensor* src) { \
+    THAssert(THCTensor_(checkGPU)(state, 2, self_, src));               \
+    if (self_ == src) {                                                 \
+      if (!THC_pointwiseApply1(state, self_, Tensor_##NAME##_##REAL##_Op())) { \
+        THArgCheck(false, 2, CUTORCH_DIM_WARNING);                      \
+      }                                                                 \
+    } else {                                                            \
+      THCTensor_(resizeAs)(state, self_, src);                          \
+                                                                        \
+      if (!THC_pointwiseApply2(state, self_, src, Tensor_##NAME##_##REAL##_Op())) { \
+        THArgCheck(false, 2, CUTORCH_DIM_WARNING);                      \
+      }                                                                 \
+    }                                                                   \
+                                                                        \
+    THCudaCheck(cudaGetLastError());                                    \
+  }
+
+#define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(NAME, CFUNC, REAL) \
+  IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_(NAME, CFUNC, REAL)
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(  log, THCNumerics<real>::log,   Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(log1p, THCNumerics<real>::log1p, Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(  exp, THCNumerics<real>::exp,   Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(  cos, THCNumerics<real>::cos,   Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(  sin, THCNumerics<real>::sin,   Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( sqrt, THCNumerics<real>::sqrt,  Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(rsqrt, THCNumerics<real>::rsqrt, Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( ceil, THCNumerics<real>::ceil,  Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(floor, THCNumerics<real>::floor, Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(trunc, THCNumerics<real>::trunc, Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(  neg, THCNumerics<real>::neg,   Real)
+
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( acos, THCNumerics<real>::acos,  Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( cosh, THCNumerics<real>::cosh,  Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( asin, THCNumerics<real>::asin,  Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( sinh, THCNumerics<real>::sinh,  Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(  tan, THCNumerics<real>::tan,   Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( atan, THCNumerics<real>::atan,  Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( tanh, THCNumerics<real>::tanh,  Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(round, THCNumerics<real>::round, Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( frac, THCNumerics<real>::frac,  Real)
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( cinv, THCNumerics<real>::cinv,  Real)
+
+#endif
+
+IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(  abs, THCNumerics<real>::abs,   Real)
+
+#undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_
+#undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC
+
+void THCTensor_(sign)(THCState* state, THCTensor* self_, THCTensor* src) {
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  if (self_ == src) {
+    if (!THC_pointwiseApply1(state, self_, TensorSignOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src);
+
+    if (!THC_pointwiseApply2(state, self_, src, TensorSignOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, real min_value,
+  real max_value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  if (self_ == src) {
+    if (!THC_pointwiseApply1(state, self_, TensorClampOp<real>(min_value, max_value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src);
+
+    if (!THC_pointwiseApply2(state, self_, src, TensorClampOp<real>(min_value, max_value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self, x, y));
+
+  int i;
+  long nd = THCTensor_(nDimension)(state, x);
+  ptrdiff_t nelem = THCTensor_(nElement)(state, x);
+  THArgCheck(nd == THCTensor_(nDimension)(state, y), 1, "tensors must have same number of dimensions");
+  for (i = 0; i < nd; i++) {
+    THArgCheck(THCTensor_(size)(state, x, i) == THCTensor_(size)(state, y, i), 1, "dimension %i of x and y does not match", i);
+    if (dimension < 0 && THCTensor_(size)(state, x, i) == 3) {
+      dimension = i;
+    }
+  }
+
+  THArgCheck(dimension >= 0 && dimension < nd, 3, "dimension %d out of range", dimension+1);
+  THArgCheck(THCTensor_(size)(state, x, dimension) == 3, 3,
+      "dimension %d does not have size 3", dimension+1);
+  THCTensor_(resizeAs)(state, self, x);
+
+  long sx = THCTensor_(stride)(state, x, dimension);
+  long sy = THCTensor_(stride)(state, y, dimension);
+  long so = THCTensor_(stride)(state, self, dimension);
+  THCTensor *nx = THCTensor_(newNarrow)(state, x, dimension, 0, 1);
+  THCTensor *ny = THCTensor_(newNarrow)(state, y, dimension, 0, 1);
+  THCTensor *nself = THCTensor_(newNarrow)(state, self, dimension, 0, 1);
+  if (!THC_pointwiseApply3(state, nself, nx, ny, TensorCrossOp<real>(sx, sy, so))) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+  THCTensor_(free)(state, nx);
+  THCTensor_(free)(state, ny);
+  THCTensor_(free)(state, nself);
+}
+
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+void THCTensor_(sigmoid)(THCState* state, THCTensor* self_, THCTensor* src) {
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  if (self_ == src) {
+    if (!THC_pointwiseApply1(state, self_, TensorSigmoidOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src);
+
+    if (!THC_pointwiseApply2(state, self_, src, TensorSigmoidOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+void THCTensor_(pow)(THCState *state, THCTensor *self_, THCTensor *src, real value) {
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  if (self_ == src) {
+    if (!THC_pointwiseApply1(state, self_, TensorPowOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src);
+
+    if (!THC_pointwiseApply2(state, self_, src, TensorPowOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+void THCTensor_(tpow)(THCState *state, THCTensor *self_, real value, THCTensor *src)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  if (self_ == src) {
+    if (!THC_pointwiseApply1(state, self_, TensorTPowOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src);
+
+    if (!THC_pointwiseApply2(state, self_, src, TensorTPowOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, real w)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, result, a, b));
+  THArgCheck(THCTensor_(nElement)(state, a) ==
+             THCTensor_(nElement)(state, b), 3, "sizes do not match");
+  THCTensor_(resizeAs)(state, result, a);
+
+  if (!THC_pointwiseApply3(state, result, a, b, TensorLerpOp<real>(w))) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
+
+THC_API void
+THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THArgCheck(THCTensor_(nElement)(state, src1) ==
+             THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+  if (self_ == src1) {
+    if (value == ScalarConvert<int, real>::to(1)) {
+      // self += src2
+      if (!THC_pointwiseApply2(state, self_, src2, TensorAddOp<real>())) {
+        THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+      }
+    } else {
+      // self += value * src2
+      if (!THC_pointwiseApply2(state, self_, src2, TensorCAddOp<real>(value))) {
+        THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+      }
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src1);
+
+    if (value == ScalarConvert<int, real>::to(1)) {
+      // self = src1 + src2
+      if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddOp<real>())) {
+        THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+      }
+    } else {
+      // self = src1 + value * src2
+      if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCAddOp<real>(value))) {
+        THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+      }
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THArgCheck(THCTensor_(nElement)(state, src1) ==
+             THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+  if (self_ == src1) {
+    if (value == ScalarConvert<int, real>::to(1)) {
+      // self -= src2
+      if (!THC_pointwiseApply2(state, self_, src2, TensorSubOp<real>())) {
+        THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+      }
+    } else {
+      // self += -value * src2
+      if (!THC_pointwiseApply2(state, self_, src2,
+                                   TensorCAddOp<real>(
+                                     ScalarNegate<real>::to(value)))) {
+        THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+      }
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src1);
+
+    if (value == ScalarConvert<int, real>::to(1)) {
+      // self = src1 - src2
+      if (!THC_pointwiseApply3(state, self_, src1, src2, TensorSubOp<real>())) {
+        THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+      }
+    } else {
+      // self = src1 - value * src2
+      if (!THC_pointwiseApply3(state, self_, src1, src2,
+                                   TensorCAddOp<real>(
+                                     ScalarNegate<real>::to(value)))) {
+        THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+      }
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THArgCheck(THCTensor_(nElement)(state, src1) ==
+             THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+  if (self_ == src1) {
+    // self *= src2
+    if (!THC_pointwiseApply2(state, self_, src2, TensorMulOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src1);
+
+    // self = src1 * src2
+    if (!THC_pointwiseApply3(state, self_, src1, src2, TensorMulOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THArgCheck(THCTensor_(nElement)(state, src1) ==
+             THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+  if (self_ == src1) {
+    // self = pow(self, src2)
+    if (!THC_pointwiseApply2(state, self_, src2, TensorCPowOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src1);
+
+    // self = pow(src1, src2)
+    if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCPowOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THArgCheck(THCTensor_(nElement)(state, src1) ==
+             THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+  if (self_ == src1) {
+    // self /= src2
+    if (!THC_pointwiseApply2(state, self_, src2, TensorDivOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self_, src1);
+
+    // self = src1 / src2
+    if (!THC_pointwiseApply3(state, self_, src1, src2, TensorDivOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self, src1, src2));
+  THArgCheck(THCTensor_(nElement)(state, src1) ==
+             THCTensor_(nElement)(state, src2), 2, "sizes do not match");
+
+  if (self == src1) {
+    if (!THC_pointwiseApply2(state, self, src2, TensorMaxOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self, src1);
+    if (!THC_pointwiseApply3(state, self, src1, src2, TensorMaxOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+}
+
+THC_API void
+THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self, src1, src2));
+  THArgCheck(THCTensor_(nElement)(state, src1) ==
+             THCTensor_(nElement)(state, src2), 2, "sizes do not match");
+
+  if (self == src1) {
+    if (!THC_pointwiseApply2(state, self, src2, TensorMinOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self, src1);
+    if (!THC_pointwiseApply3(state, self, src1, src2, TensorMinOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+}
+
+THC_API void
+THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self, src1, src2));
+  THArgCheck(THCTensor_(nElement)(state, src1) ==
+             THCTensor_(nElement)(state, src2), 2, "sizes do not match");
+
+  if (self == src1) {
+    if (!THC_pointwiseApply2(state, self, src2, TensorCRemainderOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self, src1);
+    if (!THC_pointwiseApply3(state, self, src1, src2, TensorCRemainderOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+}
+
+THC_API void
+THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self, src1, src2));
+  THArgCheck(THCTensor_(nElement)(state, src1) ==
+             THCTensor_(nElement)(state, src2), 2, "sizes do not match");
+
+  if (self == src1) {
+    if (!THC_pointwiseApply2(state, self, src2, TensorCFmodOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self, src1);
+    if (!THC_pointwiseApply3(state, self, src1, src2, TensorCFmodOp<real>())) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+}
+
+THC_API void
+THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+
+  if (self == src) {
+    if (!THC_pointwiseApply1(state, self, TensorMaxValueOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self, src);
+    if (!THC_pointwiseApply2(state, self, src, TensorMaxValueOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+}
+
+THC_API void
+THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+
+  if (self == src) {
+    if (!THC_pointwiseApply1(state, self, TensorMinValueOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  } else {
+    THCTensor_(resizeAs)(state, self, src);
+    if (!THC_pointwiseApply2(state, self, src, TensorMinValueOp<real>(value))) {
+      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+    }
+  }
+}
+
+THC_API void
+THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, real value, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2));
+  if(self_ != t)
+  {
+    THCTensor_(resizeAs)(state, self_, t);
+    THCTensor_(copy)(state, self_, t);
+  }
+  else
+  {
+    THArgCheck(THCTensor_(nElement)(state, self_) == THCTensor_(nElement)(state, src1),
+               1, "sizes do not match");
+  }
+
+  THArgCheck(THCTensor_(nElement)(state, src1) == THCTensor_(nElement)(state, src2),
+             3, "sizes do not match");
+
+  if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddCMulOp<real>(value))) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, real value, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2));
+  if(self_ != t)
+  {
+    THCTensor_(resizeAs)(state, self_, t);
+    THCTensor_(copy)(state, self_, t);
+  }
+  else
+  {
+    THArgCheck(THCTensor_(nElement)(state, self_) == THCTensor_(nElement)(state, src1),
+               1, "sizes do not match");
+  }
+  THArgCheck(THCTensor_(nElement)(state, src1) == THCTensor_(nElement)(state, src2),
+             3, "sizes do not match");
+
+  if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddCDivOp<real>(value))) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
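
The c-prefixed kernels combine two tensors elementwise, with the same aliasing logic as the pairwise file. A short sketch of cadd, addcmul and clamp for the float instantiation; it assumes `state` plus two same-sized float tensors `a` and `b` prepared as in the earlier sketches.

    THCudaTensor *y = THCudaTensor_new(state);
    THCudaTensor_cadd(state, y, a, 0.5f, b);         /* y = a + 0.5 * b */
    THCudaTensor_addcmul(state, y, y, 2.0f, a, b);   /* y += 2 * a * b, elementwise */
    THCudaTensor_clamp(state, y, y, -1.0f, 1.0f);    /* in-place clamp to [-1, 1] */
    THCudaTensor_free(state, y);
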
diff --git a/lib/THC/generic/THCTensorMathPointwise.h b/lib/THC/generic/THCTensorMathPointwise.h
new file mode 100644
index 0000000..34e594a
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathPointwise.h
@@ -0,0 +1,57 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathPointwise.h"
+#else
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+THC_API void THCTensor_(sigmoid)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(log)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(log1p)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(exp)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(cos)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(acos)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(cosh)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(sin)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(asin)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(sinh)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(tan)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(atan)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(atan2)(THCState *state, THCTensor *r_, THCTensor *tx, THCTensor *ty);
+THC_API void THCTensor_(tanh)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(pow)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(tpow)(THCState *state, THCTensor *self, real value, THCTensor *src);
+THC_API void THCTensor_(sqrt)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(rsqrt)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(ceil)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(floor)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(round)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(trunc)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(frac)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, real w);
+
+THC_API void THCTensor_(neg)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(cinv)(THCState *state, THCTensor *self, THCTensor *src);
+
+#endif
+
+THC_API void THCTensor_(abs)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(sign)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(clamp)(THCState *state, THCTensor *self, THCTensor *src, real min_value, real max_value);
+THC_API void THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2, int dimension);
+
+THC_API void THCTensor_(cadd)(THCState *state, THCTensor *self, THCTensor *src1, real value, THCTensor *src2);
+THC_API void THCTensor_(csub)(THCState *state, THCTensor *self, THCTensor *src1, real value, THCTensor *src2);
+THC_API void THCTensor_(cmul)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cpow)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cdiv)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, real value);
+
+THC_API void THCTensor_(addcmul)(THCState *state, THCTensor *self, THCTensor* t, real value, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(addcdiv)(THCState *state, THCTensor *self, THCTensor* t, real value, THCTensor *src1, THCTensor *src2);
+
+#endif
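
cross() is the one entry point here with a dimension argument: passing a negative dimension lets the loop in the .cu above pick the first dimension of size 3. A small sketch under the same assumptions as before (float instantiation, initialized `state`), with made-up shapes and fill values.

    THCudaTensor *u = THCudaTensor_newWithSize2d(state, 4, 3);
    THCudaTensor *v = THCudaTensor_newWithSize2d(state, 4, 3);
    THCudaTensor_fill(state, u, 1.0f);
    THCudaTensor_fill(state, v, 2.0f);

    THCudaTensor *w = THCudaTensor_new(state);
    THCudaTensor_cross(state, w, u, v, -1);   /* picks the size-3 dimension (dim 1) automatically */

    THCudaTensor_free(state, w);
    THCudaTensor_free(state, v);
    THCudaTensor_free(state, u);
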
diff --git a/lib/THC/generic/THCTensorMathReduce.cu b/lib/THC/generic/THCTensorMathReduce.cu
new file mode 100644
index 0000000..ed0e204
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathReduce.cu
@@ -0,0 +1,364 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathReduce.cu"
+#else
+
+THC_API void
+THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, long dimension) {
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+  if (!THC_reduceDim(state, self, src,
+                     thrust::identity<real>(),
+                     ReduceAdd<real, real>(),
+                     ScalarConvert<int, real>::to(0),
+                     dimension)) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, long dimension) {
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+  if (!THC_reduceDim(state, self, src,
+                     thrust::identity<real>(),
+                     ReduceMultiply<real, real>(),
+                     ScalarConvert<int, real>::to(1),
+                     dimension)) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, long dim)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+  THCTensor_(sum)(state, self, src, dim);
+  THCTensor_(div)(state, self, self, ScalarConvert<long, real>::to(THCTensor_(size)(state, src, dim)));
+}
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+THC_API void
+THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension, real maxnorm)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+  THCTensor *self_;
+  THCTensor *src_ = THCTensor_(newTranspose)(state, src, dimension, 0);
+  THCTensor *data = THCTensor_(newClone)(state, src_);
+  ptrdiff_t size = THCTensor_(nElement)(state, data)/data->size[0];
+
+  THArgCheck(dimension >= 0 && dimension < THCTensor_(nDimension)(state, src), 3, "invalid dimension");
+  THArgCheck(THCNumerics<real>::gt(value, ScalarConvert<int, real>::to(0)), 2, "non-positive-norm not supported");
+  THArgCheck(THCTensor_(nDimension)(state, src) > 1, 1, "need at least 2 dimensions");
+
+  dim3 grid(data->size[0]);
+  dim3 threads(32);
+
+  THCTensor_kernel_renorm<real><<<grid, threads, 0, THCState_getCurrentStream(state)>>>(THCTensor_(data)(state, data), value, size, maxnorm);
+
+  cudaError errcode = cudaGetLastError();
+  if(errcode != cudaSuccess)
+    THError(cudaGetErrorString(errcode));
+
+  THCTensor_(free)(state, src_);
+  self_ = THCTensor_(newTranspose)(state, data, dimension, 0);
+  THCTensor_(resizeAs)(state, self, self_);
+  THCTensor_(freeCopyTo)(state, self_, self);
+  THCTensor_(free)(state, data);
+}
+
+THC_API void
+THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, long dimension, int flag)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THLongStorage *dim = THCTensor_(newSizeOf)(state, src);
+  THLongStorage_set(dim, dimension, 1);
+  THCTensor_(resize)(state, self_, dim, NULL);
+  THLongStorage_free(dim);
+
+  THCTensor *self = THCTensor_(newContiguous)(state, self_);
+  src = THCTensor_(newContiguous)(state, src);
+
+  if (dimension == THCTensor_(nDimension)(state, src) - 1) {
+    THCTensor_varInnermostDim<THCTensor, real, true>(state, self, src, flag);
+  } else {
+    THCTensor_varOuterDim<THCTensor, real, true>(state, self, src, dimension, flag);
+  }
+
+  THCTensor_(free)(state, src);
+  THCTensor_(freeCopyTo)(state, self, self_);
+}
+
+THC_API void
+THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, long dimension, int flag)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+  THLongStorage *dim = THCTensor_(newSizeOf)(state, src);
+  THLongStorage_set(dim, dimension, 1);
+  THCTensor_(resize)(state, self_, dim, NULL);
+  THLongStorage_free(dim);
+
+  THCTensor *self = THCTensor_(newContiguous)(state, self_);
+  src = THCTensor_(newContiguous)(state, src);
+
+  if (dimension == THCTensor_(nDimension)(state, src) - 1) {
+    THCTensor_varInnermostDim<THCTensor, real, false>(state, self, src, flag);
+  } else {
+    THCTensor_varOuterDim<THCTensor, real, false>(state, self, src, dimension, flag);
+  }
+
+  THCTensor_(free)(state, src);
+  THCTensor_(freeCopyTo)(state, self, self_);
+}
+
+THC_API accreal
+THCTensor_(stdall)(THCState *state, THCTensor *self)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  return THCNumerics<accreal>::sqrt((THCTensor_(varall)(state, self)));
+}
+
+THC_API accreal
+THCTensor_(varall)(THCState *state, THCTensor *self)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  accreal mean = THCTensor_(meanall)(state, self);
+
+  accreal val;
+  if (!THC_reduceAll(state, self,
+                     SquareFunctor<accreal, real>(mean),
+                     ReduceAdd<accreal, accreal>(),
+                     ReduceAdd<accreal, accreal>(),
+                     ScalarConvert<int, accreal>::to(0),
+                     &val, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  val = THCNumerics<accreal>::div(
+    val,
+    ScalarConvert<ptrdiff_t, accreal>::to(THCTensor_(nElement)(state, self) - 1)
+  );
+
+  THCudaCheck(cudaGetLastError());
+  return val;
+}
+
+THC_API void
+THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+  if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(0.0))) {
+    THC_reduceDim(state, self, src,
+                  TensorNonZeroOp<real>(), ReduceAdd<real, real>(),
+                  ScalarConvert<float, real>::to(0.0), dimension);
+  } else if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(1.0))) {
+    THC_reduceDim(state, self, src,
+                  TensorNormOp<real, 1>(value), ReduceAdd<real, real>(),
+                  ScalarConvert<float, real>::to(0.0), dimension);
+
+  } else if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(2.0))) {
+    THC_reduceDim(state, self, src,
+                  TensorNormOp<real, 2>(value), ReduceAdd<real, real>(),
+                  ScalarConvert<float, real>::to(0.0), dimension);
+    THCTensor_(pow)(state, self, self, ScalarConvert<float, real>::to(0.5));
+
+  } else {
+    THC_reduceDim(state, self, src,
+                  TensorNormOp<real, -1>(value), ReduceAdd<real, real>(),
+                  ScalarConvert<float, real>::to(0.0), dimension);
+    THCTensor_(pow)(state, self, self, THCNumerics<real>::cinv(value));
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API accreal
+THCTensor_(normall)(THCState *state, THCTensor *self, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  accreal result;
+
+  if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(0.0))) {
+    THC_reduceAll(state, self,
+                  TensorNonZeroOp<real>(),
+                  ReduceAdd<real, accreal>(),
+                  ReduceAdd<accreal, accreal>(),
+                  ScalarConvert<float, accreal>::to(0.0f),
+                  &result, 0);
+  } else if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(1.0))) {
+    THC_reduceAll(state, self,
+                  TensorNormOp<real, 1>(value),
+                  ReduceAdd<real, accreal>(),
+                  ReduceAdd<accreal, accreal>(),
+                  ScalarConvert<float, accreal>::to(0.0f),
+                  &result, 0);
+  } else if (THCNumerics<real>::eq(value, ScalarConvert<float, real>::to(2.0))) {
+    THC_reduceAll(state, self,
+                  TensorNormOp<real, 2>(value),
+                  ReduceAdd<real, accreal>(),
+                  ReduceAdd<accreal, accreal>(),
+                  ScalarConvert<float, accreal>::to(0.0f),
+                  &result, 0);
+    result = THCNumerics<accreal>::sqrt(result);
+  } else {
+    THC_reduceAll(state, self,
+                  TensorNormOp<real, -1>(value),
+                  ReduceAdd<real, accreal>(),
+                  ReduceAdd<accreal, accreal>(),
+                  ScalarConvert<float, accreal>::to(0.0f),
+                  &result, 0);
+    result = THCNumerics<accreal>::pow(
+      result,
+      ScalarConvert<real, accreal>::to(THCNumerics<real>::cinv(value))
+    );
+  }
+
+  THCudaCheck(cudaGetLastError());
+  return result;
+}
+
+accreal THCTensor_(dist)(THCState *state, THCTensor *self,
+                         THCTensor *src, real value)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+  self = THCTensor_(newContiguous)(state, self);
+  ptrdiff_t size = THCTensor_(nElement)(state, self);
+  src = THCTensor_(newContiguous)(state, src);
+  thrust::device_ptr<real> self_data(THCTensor_(data)(state, self));
+  thrust::device_ptr<real> src_data(THCTensor_(data)(state, src));
+
+  THCThrustAllocator thrustAlloc(state);
+  accreal result = thrust::inner_product(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#endif
+    self_data, self_data+size, src_data, ScalarConvert<int, accreal>::to(0),
+    thrust::plus<accreal>(),
+    TensorDistOp<accreal, real>(ScalarConvert<real, accreal>::to(value)));
+
+  THCTensor_(free)(state, src);
+  THCTensor_(free)(state, self);
+
+  return THCNumerics<accreal>::pow(result, 1.0 / ScalarConvert<real, accreal>::to(value));
+}
+
+#endif
+
+THC_API accreal
+THCTensor_(sumall)(THCState *state, THCTensor *self) {
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  accreal val;
+  if (!THC_reduceAll(state, self,
+                     thrust::identity<real>(),
+                     ReduceAdd<real, accreal>(),
+                     ReduceAdd<accreal, accreal>(),
+                     ScalarConvert<int, accreal>::to(0),
+                     &val, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+  return val;
+}
+
+THC_API accreal
+THCTensor_(prodall)(THCState *state, THCTensor *self) {
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  accreal val;
+  if (!THC_reduceAll(state, self,
+                     thrust::identity<real>(),
+                     ReduceMultiply<real, accreal>(),
+                     ReduceMultiply<accreal, accreal>(),
+                     ScalarConvert<int, accreal>::to(1),
+                     &val, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+  return val;
+}
+
+THC_API accreal
+THCTensor_(meanall)(THCState *state, THCTensor *self)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  THArgCheck(self->nDimension > 0, 1, "empty Tensor");
+  return THCTensor_(sumall)(state, self)/THCTensor_(nElement)(state, self);
+}
+
+THC_API real
+THCTensor_(minall)(THCState *state, THCTensor *self) {
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  real val;
+  if (!THC_reduceAll(state, self,
+                     thrust::identity<real>(),
+                     ReduceMin<real>(),
+                     ReduceMin<real>(),
+                     THCNumerics<real>::max(), &val, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+  return val;
+}
+
+THC_API real
+THCTensor_(maxall)(THCState *state, THCTensor *self) {
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  real val;
+  if (!THC_reduceAll(state, self,
+                     thrust::identity<real>(),
+                     ReduceMax<real>(),
+                     ReduceMax<real>(),
+                     THCNumerics<real>::min(), &val, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+  return val;
+}
+
+THC_API void
+THCTensor_(max)(THCState *state,
+                THCTensor *values,
+                THCudaLongTensor *indices,
+                THCTensor *src,
+                long dimension) {
+  THAssert(THCTensor_(checkGPU)(state, 3, values, indices, src));
+
+  thrust::pair<typename TensorUtils<THCTensor>::DataType, long>
+    init =
+    thrust::make_pair<typename TensorUtils<THCTensor>::DataType, long>(
+      THCNumerics<typename TensorUtils<THCTensor>::DataType>::min(), 1);
+
+  return THC_reduceDimIndex(
+    state, values, indices, src, dimension, init,
+    MaxValuePair<typename TensorUtils<THCTensor>::DataType, long>());
+}
+
+THC_API void
+THCTensor_(min)(THCState *state,
+                THCTensor *values,
+                THCudaLongTensor *indices,
+                THCTensor *src,
+                long dimension) {
+  THAssert(THCTensor_(checkGPU)(state, 3, values, indices, src));
+
+  thrust::pair<typename TensorUtils<THCTensor>::DataType, long>
+    init =
+    thrust::make_pair<typename TensorUtils<THCTensor>::DataType, long>(
+      THCNumerics<typename TensorUtils<THCTensor>::DataType>::max(), 1);
+
+  return THC_reduceDimIndex(
+    state, values, indices, src, dimension, init,
+    MinValuePair<typename TensorUtils<THCTensor>::DataType, long>());
+}
+
+#endif
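
The *all reductions return an accreal scalar on the host (float for CudaTensor), while the dimension-wise variants keep the reduced dimension with size 1, as the std/var resize code above makes explicit. A brief sketch for a float tensor `a`, reusing the `state` setup assumed in the earlier sketches.

    float total = THCudaTensor_sumall(state, a);         /* sum over every element */
    float mean  = THCudaTensor_meanall(state, a);
    float l2    = THCudaTensor_normall(state, a, 2.0f);  /* Euclidean norm */

    THCudaTensor *rowSums = THCudaTensor_new(state);
    THCudaTensor_sum(state, rowSums, a, 1);              /* reduce along dim 1; that dim stays with size 1 */
    THCudaTensor_free(state, rowSums);
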
diff --git a/lib/THC/generic/THCTensorMathReduce.h b/lib/THC/generic/THCTensorMathReduce.h
new file mode 100644
index 0000000..dc38ed6
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathReduce.h
@@ -0,0 +1,41 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathReduce.h"
+#else
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+THC_API void THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension, real max_norm);
+THC_API void THCTensor_(std)(THCState *state, THCTensor *self, THCTensor *src, long dim, int flag);
+THC_API void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, real value, long dimension);
+THC_API void THCTensor_(var)(THCState *state, THCTensor *self, THCTensor *src, long dim, int flag);
+
+THC_API accreal THCTensor_(stdall)(THCState *state, THCTensor *self);
+THC_API accreal THCTensor_(normall)(THCState *state, THCTensor *self, real value);
+THC_API accreal THCTensor_(varall)(THCState *state, THCTensor *self);
+
+#endif
+
+THC_API void THCTensor_(sum)(THCState *state, THCTensor *self, THCTensor *src, long dim);
+THC_API void THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, long dim);
+THC_API void THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, long dim);
+
+THC_API accreal THCTensor_(sumall)(THCState *state, THCTensor *self);
+THC_API accreal THCTensor_(prodall)(THCState *state, THCTensor *self);
+THC_API accreal THCTensor_(meanall)(THCState *state, THCTensor *self);
+
+THC_API void THCTensor_(min)(THCState *state,
+                             THCTensor *values,
+                             THCudaLongTensor *indices,
+                             THCTensor *src, long dim);
+THC_API void THCTensor_(max)(THCState *state,
+                             THCTensor *values,
+                             THCudaLongTensor *indices,
+                             THCTensor *src, long dim);
+
+THC_API real THCTensor_(minall)(THCState *state, THCTensor *self);
+THC_API real THCTensor_(maxall)(THCState *state, THCTensor *self);
+
+THC_API accreal THCTensor_(dist)(THCState *state, THCTensor *self, THCTensor *src,
+                              real value);
+
+#endif
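
max() and min() with a dimension also return the winning positions in a THCudaLongTensor; the init pair in the .cu starts the index at 1, matching the 1-based convention on the Lua side. A short sketch under the same assumptions as the previous ones (`state` initialized, float tensor `a`).

    THCudaTensor *vals = THCudaTensor_new(state);
    THCudaLongTensor *idx = THCudaLongTensor_new(state);
    THCudaTensor_max(state, vals, idx, a, 1);   /* per-row maxima and their indices along dim 1 */
    THCudaLongTensor_free(state, idx);
    THCudaTensor_free(state, vals);
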
diff --git a/lib/THC/generic/THCTensorMathScan.cu b/lib/THC/generic/THCTensorMathScan.cu
new file mode 100644
index 0000000..8a8e434
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathScan.cu
@@ -0,0 +1,89 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathScan.cu"
+#else
+
+template<class BinaryOp>
+__host__ void THCTensor_(scanOuterDim)(THCState *state, THCTensor *tgt,
+                                       THCTensor *src, long dimension,
+                                       real init, BinaryOp binary_op)
+{
+  unsigned ndim = THCTensor_(nDimension)(state, src);
+  // Treat all outer dimensions (i.e. dim < dimension) as one.
+  unsigned num_orows = 1;
+  for (long dim = 0; dim < dimension; dim++) {
+    num_orows *= THCTensor_(size)(state, src, dim);
+  }
+  unsigned row_size = THCTensor_(size)(state, src, dimension);
+  // Treat all inner dimensions (i.e. dim > dimension) as one.
+  unsigned num_irows = 1;
+  for (unsigned dim = dimension + 1; dim < ndim; dim++) {
+    num_irows *= THCTensor_(size)(state, src, dim);
+  }
+
+  dim3 threads(min(512, num_irows));
+  unsigned maxGridDim = 1024;
+  dim3 grid(min(maxGridDim, num_orows), min(maxGridDim, THCCeilDiv(num_irows, threads.x)));
+
+  THCTensor_kernel_scanOuterDim<real><<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
+    THCTensor_(data)(state, tgt), THCTensor_(data)(state, src),
+    num_orows, num_irows, row_size, init, binary_op);
+
+  THCudaCheck(cudaGetLastError());
+}
+
+template<class BinaryFunction>
+__host__ void THCTensor_(scanInnermostDim)(THCState *state, THCTensor *tgt,
+                                           THCTensor *src, real init,
+                                           BinaryFunction binary_op)
+{
+  unsigned ndim = THCTensor_(nDimension)(state, src);
+  // Treat all outer dimensions as a single dimension.
+  unsigned num_rows = 1;
+  for (unsigned dim = 0; dim < ndim - 1; dim++) {
+    num_rows *= THCTensor_(size)(state, src, dim);
+  }
+  unsigned row_size = THCTensor_(size)(state, src, ndim - 1);
+
+  dim3 threads(16, 32);
+  dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y)));
+
+  THCTensor_kernel_scanInnermostDim<real, 16, 32><<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
+    THCTensor_(data)(state, tgt), THCTensor_(data)(state, src), num_rows, row_size, init, binary_op);
+
+  THCudaCheck(cudaGetLastError());
+}
+
+template<class BinaryFunction>
+void THCTensor_(scanDim)(THCState *state, THCTensor *self_, THCTensor *src,
+                         long dimension, real init, BinaryFunction binary_op)
+{
+  THCTensor_(resizeAs)(state, self_, src);
+
+  THCTensor *self = THCTensor_(newContiguous)(state, self_);
+  src = THCTensor_(newContiguous)(state, src);
+
+  if (dimension == THCTensor_(nDimension)(state, src) - 1) {
+    THCTensor_(scanInnermostDim)(state, self, src, init, binary_op);
+  } else {
+    THCTensor_(scanOuterDim)(state, self, src, dimension, init, binary_op);
+  }
+
+  THCTensor_(free)(state, src);
+  THCTensor_(freeCopyTo)(state, self, self_);
+}
+
+void THCTensor_(cumsum)(THCState *state, THCTensor *self, THCTensor *src, long dimension)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+  return THCTensor_(scanDim)(state, self, src, dimension,
+                             ScalarConvert<float, real>::to(0.0), AddOp<real>());
+}
+
+void THCTensor_(cumprod)(THCState *state, THCTensor *self, THCTensor *src, long dimension)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+  return THCTensor_(scanDim)(state, self, src, dimension,
+                             ScalarConvert<float, real>::to(1.0), MulOp<real>());
+}
+
+#endif
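
scanDim() dispatches to a specialized kernel when scanning the innermost dimension and to the generic outer-dimension kernel otherwise; the dimension argument is 0-based at this level (the Lua-facing wrappers expose it 1-based). A tiny sketch with a worked result, assuming `state` and a 1-D float tensor `t` holding {1, 2, 3}.

    /* cumsum along dim 0 yields {1, 3, 6}; cumprod yields {1, 2, 6} */
    THCudaTensor *running = THCudaTensor_new(state);
    THCudaTensor_cumsum(state, running, t, 0);
    THCudaTensor_cumprod(state, running, t, 0);
    THCudaTensor_free(state, running);
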
diff --git a/lib/THC/generic/THCTensorMathScan.h b/lib/THC/generic/THCTensorMathScan.h
new file mode 100644
index 0000000..edd825a
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathScan.h
@@ -0,0 +1,8 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathScan.h"
+#else
+
+THC_API void THCTensor_(cumsum)(THCState *state, THCTensor *self, THCTensor *src, long dim);
+THC_API void THCTensor_(cumprod)(THCState *state, THCTensor *self, THCTensor *src, long dim);
+
+#endif
diff --git a/lib/THC/generic/THCTensorRandom.cu b/lib/THC/generic/THCTensorRandom.cu
new file mode 100644
index 0000000..f6d6979
--- /dev/null
+++ b/lib/THC/generic/THCTensorRandom.cu
@@ -0,0 +1,351 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorRandom.cu"
+#else
+
+#define NUM_BLOCKS min((int)THCCeilDiv(size, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS)
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self_));
+  Generator* gen = THCRandom_getGenerator(state);
+  THCTensor *self = THCTensor_(newContiguous)(state, self_);
+  ptrdiff_t size = THCTensor_(nElement)(state, self);
+  real *data = THCTensor_(data)(state, self);
+
+  generate_uniform<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+      gen->gen_states, size, data, a, b);
+
+  THCTensor_(freeCopyTo)(state, self, self_);
+};
+
+THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self_));
+  Generator* gen = THCRandom_getGenerator(state);
+  THCTensor *self = THCTensor_(newContiguous)(state, self_);
+  ptrdiff_t size = THCTensor_(nElement)(state, self);
+  real *data = THCTensor_(data)(state, self);
+
+  generate_normal<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+      gen->gen_states, size, data, mean, stdv);
+
+  THCTensor_(freeCopyTo)(state, self, self_);
+};
+
+THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv)
+{
+
+  THAssert(THCTensor_(checkGPU)(state, 1, self_));
+  Generator* gen = THCRandom_getGenerator(state);
+
+  THCTensor *self = THCTensor_(newContiguous)(state, self_);
+  ptrdiff_t size = THCTensor_(nElement)(state, self);
+  real *data = THCTensor_(data)(state, self);
+
+  generateLogNormal<real><<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+      gen->gen_states, size, data, mean, stdv);
+
+  THCTensor_(freeCopyTo)(state, self, self_);
+};
+
+THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self_));
+  Generator* gen = THCRandom_getGenerator(state);
+
+  THCTensor *self = THCTensor_(newContiguous)(state, self_);
+  ptrdiff_t size = THCTensor_(nElement)(state, self);
+  real *data = THCTensor_(data)(state, self);
+
+  generate_exponential<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+      gen->gen_states, size, data, lambda);
+
+  THCTensor_(freeCopyTo)(state, self, self_);
+};
+
+THC_API void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self_));
+  Generator* gen = THCRandom_getGenerator(state);
+
+  THCTensor *self = THCTensor_(newContiguous)(state, self_);
+  ptrdiff_t size = THCTensor_(nElement)(state, self);
+  real *data = THCTensor_(data)(state, self);
+
+  generate_cauchy<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+      gen->gen_states, size, data, median, sigma);
+
+  THCTensor_(freeCopyTo)(state, self, self_);
+};
+
+void THCTensor_(renormRows)(struct THCState* state,
+                             THCTensor* t) {
+  THAssert(THCTensor_(nDimension)(state, t) == 2);
+  long rows = THCTensor_(size)(state, t, 0);
+  long cols = THCTensor_(size)(state, t, 1);
+
+  cudaDeviceProp* props = THCState_getCurrentDeviceProperties(state);
+  THAssert(props != NULL);
+
+  int numSM = props->multiProcessorCount;
+  int maxThreads = props->maxThreadsPerBlock;
+
+  dim3 grid(rows < numSM * 4 ? rows : numSM * 4);
+  dim3 block(cols < maxThreads ? cols : maxThreads);
+
+  renormRowsL1<real>
+    <<<grid, block, block.x * sizeof(real),
+    THCState_getCurrentStream(state)>>>(THCTensor_(data)(state, t),
+                                        rows, cols);
+}
+
+THC_API void THCTensor_(multinomial)(struct THCState *state,
+                                      THCudaLongTensor *self,
+                                      THCTensor *prob_dist,
+                                      int n_sample,
+                                      int with_replacement)
+{
+  THAssert(THCTensor_(checkGPU)(state, 2, self, prob_dist));
+  Generator* gen = THCRandom_getGenerator(state);
+
+  int inputSize = THCTensor_(nDimension)(state, prob_dist);
+  THArgCheck(inputSize > 0 && inputSize <= 2, 2,
+             "prob_dist must be 1 or 2 dim");
+
+  // Categories are in the innermost dimension
+  long numDist =
+    inputSize == 1 ? 1 : THCTensor_(size)(state, prob_dist, 0);
+  long numCategoriesLong =
+    inputSize == 1 ? THCTensor_(size)(state, prob_dist, 0) :
+    THCTensor_(size)(state, prob_dist, 1);
+
+  // Since the index tensor is float, numCategories cannot exceed max
+  // float integer precision
+  THArgCheck(numCategoriesLong <= FLOAT32_MAX_CONSECUTIVE_INT, 2,
+             "number of categories cannot exceed 2^24");
+  int numCategories = (int) numCategoriesLong;
+
+  THArgCheck(n_sample > 0, 3, "cannot sample <= 0 samples");
+
+  if (!with_replacement) {
+    THArgCheck(n_sample <= numCategories, 2,
+               "cannot sample n_sample > prob_dist:size(1) samples without "
+               "replacement");
+  }
+
+  // It is possible that prob_dist is non-contiguous
+  THCTensor* probDistContig =
+    THCTensor_(newContiguous)(state, prob_dist);
+
+  // Restructure data for 2d
+  if (inputSize == 1) {
+    THCTensor_(resize2d)(state, probDistContig, 1, numCategories);
+  }
+
+  THCudaLongTensor_resize2d(state, self, numDist, n_sample);
+
+  if (n_sample == 1) {
+    // Optimized allocation-free implementation
+    // To exploit greater parallelism for the sampling, generate the
+    // Uniform random samples in a separate kernel launch, into
+    // temporarily allocated memory. The device RNG is thread-limited
+    THCTensor *sampled = THCTensor_(newWithSize2d)(state, numDist, n_sample);
+    THCTensor_(uniform)(state, sampled, 0.0, 1.0);
+    cudaDeviceProp* props = THCState_getCurrentDeviceProperties(state);
+    THAssert(props != NULL);
+    int numSM = props->multiProcessorCount;
+    int maxThreads = props->maxThreadsPerBlock;
+    dim3 block(numCategories < maxThreads ? numCategories : maxThreads);
+    dim3 grid(numDist < numSM * 4 ? numDist : numSM * 4);
+    sampleMultinomialOnce
+      <<<grid, block, block.x * sizeof(real),
+         THCState_getCurrentStream(state)>>>(
+      THCudaLongTensor_data(state, self),
+      numDist,
+      numCategories,
+      THCTensor_(data)(state, sampled),
+      THCTensor_(data)(state, probDistContig));
+    THCTensor_(free)(state, sampled);
+  } else {
+    // Generic, slow implementation with memory allocations
+
+    // For sampling without replacement, we modify the distribution
+    // for subsequent samples in this space
+    THCTensor* origDist = THCTensor_(new)(state);
+    THCTensor_(resizeAs)(state, origDist, probDistContig);
+    THCTensor_(copy)(state, origDist, probDistContig);
+
+    THCTensor* normDist = THCTensor_(new)(state);
+    THCTensor_(resizeAs)(state, normDist, probDistContig);
+
+    THCTensor* prefixSum = THCTensor_(new)(state);
+
+    // Renorm along rows
+    THCTensor_(copy)(state, normDist, origDist);
+    THCTensor_(renormRows)(state, normDist);
+
+    // Prefix sum along rows
+    THCTensor_(cumsum)(state, prefixSum, normDist, 1);
+
+    if (with_replacement) {
+      // Sample with replacement
+
+      // Binary search is warp divergent (so effectively we're running
+      // with just a single thread), but for better utilization,
+      // we need each block to have at least 4 warps.
+      dim3 block(32, 4);
+
+      // Each warp in a block will generate a sample from one
+      // distribution concurrently.
+      dim3 grid(numDist < MAX_NUM_BLOCKS ? numDist : MAX_NUM_BLOCKS);
+
+      sampleMultinomialWithReplacement
+        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+          gen->gen_states,
+          n_sample,
+          THCudaLongTensor_data(state, self),
+          numDist, numCategories,
+          THCTensor_(data)(state, prefixSum));
+    } else {
+      // Sample without replacement
+
+      // Binary search is warp divergent (so effectively we're running
+      // with just a single thread), but for better utilization,
+      // we need each block to have at least 4 warps.
+      dim3 block(32, 4);
+
+      // Each warp in a block will generate a sample from a different
+      // distribution concurrently.
+      ptrdiff_t numBlocks = THCCeilDiv(numDist, 4L);
+      dim3 grid(numBlocks < MAX_NUM_BLOCKS ? numBlocks : MAX_NUM_BLOCKS);
+
+      for (int sample = 0; sample < n_sample; ++sample) {
+        if (sample > 0) {
+          // Update probabilities
+          // Renorm along rows
+          THCTensor_(copy)(state, normDist, origDist);
+          THCTensor_(renormRows)(state, normDist);
+
+          // Prefix sum along rows
+          THCTensor_(cumsum)(state, prefixSum, normDist, 1);
+        }
+
+        // The kernel can only draw one sample before we have to
+        // recalculate our distribution
+        sampleMultinomialWithoutReplacement
+          <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+            gen->gen_states,
+            n_sample,
+            sample,
+            THCudaLongTensor_data(state, self),
+            numDist, numCategories,
+            THCTensor_(data)(state, origDist),
+            THCTensor_(data)(state, prefixSum));
+      }
+    }
+
+    THCTensor_(free)(state, prefixSum);
+    THCTensor_(free)(state, normDist);
+    THCTensor_(free)(state, origDist);
+  }
+
+  // Revert data restructuring based on input sizes
+  if (inputSize == 1) {
+    THCudaLongTensor_resize1d(state, self, n_sample);
+
+    // Unfortunately, if prob_dist was already contiguous, newContiguous
+    // did not make a private copy, so we have to undo the restructuring
+    // here as well so that prob_dist itself is left unchanged
+    THCTensor_(resize1d)(state, probDistContig, numCategories);
+  }
+
+  THCTensor_(free)(state, probDistContig);
+}
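
For orientation, this is roughly how the sampler above is driven from Lua.
A hedged sketch only: it assumes the stock torch.multinomial wrapper that
cutorch exposes for CUDA tensors, and the shapes and variable names are
illustrative rather than taken from this diff.

    require 'cutorch'

    -- hedged sketch: 4 unnormalized distributions over 10 categories;
    -- renormRows/cumsum above normalize and prefix-sum each row
    local probs = torch.CudaTensor(4, 10):uniform(0, 1)

    -- with replacement: one warp per distribution binary-searches the
    -- row-wise prefix sums for every requested sample
    local idx = torch.multinomial(probs, 6, true)    -- 4x6 tensor of category indices

    -- without replacement: at most 10 samples per row, and the row is
    -- renormalized after each drawn sample
    local idx2 = torch.multinomial(probs, 3, false)
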
+
+THC_API void THCTensor_(rand)(THCState *state, THCTensor *r_, THLongStorage *size)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, r_));
+  THCTensor_(resize)(state, r_, size, NULL);
+  THCTensor_(uniform)(state, r_, 0, 1);
+}
+
+void THCTensor_(randn)(THCState *state, THCTensor *r_, THLongStorage *size)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, r_));
+  THCTensor_(resize)(state, r_, size, NULL);
+  THCTensor_(normal)(state, r_, 0, 1);
+}
+
+#endif
+
+#if defined(THC_REAL_IS_DOUBLE)
+GENERATE_KERNEL1(generate_bernoulli, double, double p, double, curand_uniform_double, x <= p)
+#else
+GENERATE_KERNEL1(generate_bernoulli, real, double p, float, curand_uniform, (ScalarConvert<bool, real>::to(x <= p)))
+#endif
+
+THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self_));
+  Generator* gen = THCRandom_getGenerator(state);
+  THCTensor *self = THCTensor_(newContiguous)(state, self_);
+  ptrdiff_t size = THCTensor_(nElement)(state, self);
+  real *data = THCTensor_(data)(state, self);
+
+  generate_bernoulli<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+      gen->gen_states, size, data, p);
+
+  THCTensor_(freeCopyTo)(state, self, self_);
+}
+
+#define DEFINE_BERNOULLI_TENSOR(NAME, PROB_TYPE, PROB_DATA_TYPE)               \
+THC_API void THCTensor_(NAME)(THCState* state,                                 \
+        THCTensor *self_, PROB_TYPE *probs_)                                   \
+{                                                                              \
+  THAssert(THCTensor_(checkGPU)(state, 2, self_, probs_));                     \
+  Generator* gen = THCRandom_getGenerator(state);                              \
+  THCTensor *self = THCTensor_(newContiguous)(state, self_);                   \
+  PROB_TYPE *probs = PROB_TYPE##_newContiguous(state, probs_);                 \
+  ptrdiff_t size = THCTensor_(nElement)(state, self);                          \
+  ptrdiff_t prob_size = PROB_TYPE##_nElement(state, probs);                    \
+  real *result_data = THCTensor_(data)(state, self);                           \
+  PROB_DATA_TYPE *probs_data = PROB_TYPE##_data(state, probs);                 \
+                                                                               \
+  THArgCheck(size == prob_size, 3, "inconsistent tensor size");                \
+                                                                               \
+  generate_bernoulli_tensor<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( \
+      gen->gen_states, size, result_data, probs_data);                         \
+                                                                               \
+  PROB_TYPE##_free(state, probs);                                              \
+  THCTensor_(freeCopyTo)(state, self, self_);                                  \
+}
+
+DEFINE_BERNOULLI_TENSOR(bernoulli_FloatTensor, THCudaTensor, float)
+DEFINE_BERNOULLI_TENSOR(bernoulli_DoubleTensor, THCudaDoubleTensor, double)
+
+#if defined(THC_REAL_IS_DOUBLE)
+
+GENERATE_KERNEL1(generate_geometric, double, double p, double, curand_uniform_double, ceil(log(x) / log(1-p)))
+#else
+GENERATE_KERNEL1(generate_geometric, real, double p, float, curand_uniform, (ScalarConvert<float, real>::to(ceilf(logf(x) / log(1-p)))))
+#endif
+
+THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p)
+{
+  THAssert(THCTensor_(checkGPU)(state, 1, self_));
+  Generator* gen = THCRandom_getGenerator(state);
+
+  THCTensor *self = THCTensor_(newContiguous)(state, self_);
+  ptrdiff_t size = THCTensor_(nElement)(state, self);
+  real *data = THCTensor_(data)(state, self);
+
+  generate_geometric<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+      gen->gen_states, size, data, p);
+
+  THCTensor_(freeCopyTo)(state, self, self_);
+}
+#undef NUM_BLOCKS
+
+#endif
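
The two kernels above are plain inverse-CDF transforms of a uniform draw:
bernoulli keeps x <= p, geometric computes ceil(log(x)/log(1-p)). Note that
they sit outside the float-only block, so they are generated for every CUDA
tensor type, whereas uniform/normal/multinomial above exist only for the
float-like types. A minimal, hedged Lua sketch of the expected behaviour
(assuming the usual tensor methods; not taken from this diff):

    require 'cutorch'

    local t = torch.CudaTensor(1000000)

    -- each element becomes 1 with probability p, 0 otherwise
    t:bernoulli(0.25)
    print(t:mean())    -- close to 0.25

    -- number of Bernoulli(p) trials up to and including the first success
    t:geometric(0.25)
    print(t:mean())    -- close to 1/0.25 = 4
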
diff --git a/lib/THC/generic/THCTensorRandom.h b/lib/THC/generic/THCTensorRandom.h
new file mode 100644
index 0000000..98017ab
--- /dev/null
+++ b/lib/THC/generic/THCTensorRandom.h
@@ -0,0 +1,23 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorRandom.h"
+#else
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+THC_API void THCTensor_(uniform)(struct THCState *state, THCTensor *self, double a, double b);
+THC_API void THCTensor_(rand)(THCState *state, THCTensor *r_, THLongStorage *size);
+THC_API void THCTensor_(randn)(THCState *state, THCTensor *r_, THLongStorage *size);
+THC_API void THCTensor_(normal)(struct THCState *state, THCTensor *self, double mean, double stdv);
+THC_API void THCTensor_(logNormal)(struct THCState *state, THCTensor *self, double mean, double stdv);
+THC_API void THCTensor_(exponential)(struct THCState *state, THCTensor *self, double lambda);
+THC_API void THCTensor_(cauchy)(struct THCState *state, THCTensor *self, double median, double sigma);
+THC_API void THCTensor_(multinomial)(struct THCState *state, THCudaLongTensor *self, THCTensor *prob_dist, int n_sample, int with_replacement);
+
+#endif
+
+THC_API void THCTensor_(bernoulli)(struct THCState *state, THCTensor *self, double p);
+THC_API void THCTensor_(bernoulli_FloatTensor)(struct THCState *state, THCTensor *self, THCudaTensor *p);
+THC_API void THCTensor_(bernoulli_DoubleTensor)(struct THCState *state, THCTensor *self, THCudaDoubleTensor *p);
+THC_API void THCTensor_(geometric)(struct THCState *state, THCTensor *self, double p);
+
+#endif
diff --git a/lib/THC/generic/THCTensorScatterGather.cu b/lib/THC/generic/THCTensorScatterGather.cu
new file mode 100644
index 0000000..c120f88
--- /dev/null
+++ b/lib/THC/generic/THCTensorScatterGather.cu
@@ -0,0 +1,266 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorScatterGather.cu"
+#else
+
+#define RUN(TYPE, DIMS, REAL)                                           \
+  THCudaTensor_gatherKernel<TYPE, REAL, DIMS>                                \
+  <<<grid, block, 0, THCState_getCurrentStream(state)>>>(               \
+    tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements);
+
+void THCTensor_(gather)(THCState* state, THCTensor *tensor,
+                         THCTensor *src, int dim, THCudaLongTensor *index) {
+  THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
+  THAssert(THCudaLongTensor_checkGPU(state, 1, index));
+
+  THArgCheck(THCTensor_(nDimension)(state, src) == THCTensor_(nDimension)(state, tensor), 2,
+             "Input tensor must have same dimensions as output tensor");
+  THArgCheck(dim >= 0 && dim < THCTensor_(nDimension)(state, tensor), 3,
+             "Index dimension is out of bounds");
+  THArgCheck(THCudaLongTensor_nDimension(state, index) == THCTensor_(nDimension)(state, src), 4,
+             "Index tensor must have same dimensions as input tensor");
+  THLongStorage *indexSize = THCudaLongTensor_newSizeOf(state, index);
+  THArgCheck(THCTensor_(isSize)(state, tensor, indexSize), 4,
+             "Index tensor must have the same size as output tensor.");
+  THLongStorage_free(indexSize);
+
+  for (int d = 0; d < THCTensor_(nDimension)(state, tensor); d++) {
+    if (d != dim) {
+      THArgCheck(THCTensor_(size)(state, tensor, d) == THCTensor_(size)(state, src, d), 2,
+                 "Input tensor must have same size as output tensor apart from the specified dimension");
+    }
+  }
+
+  THArgCheck(THCTensor_(nDimension)(state, tensor) <= MAX_CUTORCH_DIMS,
+             1, CUTORCH_DIM_WARNING);
+
+
+  const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index);
+  const dim3 block = getApplyBlock();
+  dim3 grid;
+  THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING);
+
+  THCTensor* oldTensor = NULL;
+  if (TensorUtils<THCTensor>::overlappingIndices(state, tensor)) {
+    oldTensor = tensor;
+    tensor = THCTensor_(newContiguous)(state, tensor);
+  }
+
+  if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, tensor) &&
+      TensorUtils<THCTensor>::canUse32BitIndexMath(state, src) &&
+      TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, index)) {
+    TensorInfo<real, unsigned int> tensorInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, tensor);
+    TensorInfo<real, unsigned int> srcInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, src);
+    TensorInfo<long, unsigned int> indexInfo =
+      getTensorInfo<THCudaLongTensor, unsigned int>(state, index);
+
+    // Specialize for a small number of dimensions.
+    switch (indexInfo.dims) {
+      case 1:
+        RUN(unsigned int, 1, real);
+        THCudaCheck(cudaGetLastError());
+        break;
+      case 2:
+        RUN(unsigned int, 2, real);
+        THCudaCheck(cudaGetLastError());
+        break;
+      case 3:
+        RUN(unsigned int, 3, real);
+        THCudaCheck(cudaGetLastError());
+        break;
+      default:
+        RUN(unsigned int, -1, real);
+        THCudaCheck(cudaGetLastError());
+        break;
+    }
+  } else {
+    TensorInfo<real, unsigned long> tensorInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, tensor);
+    TensorInfo<real, unsigned long> srcInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, src);
+    TensorInfo<long, unsigned long> indexInfo =
+      getTensorInfo<THCudaLongTensor, unsigned long>(state, index);
+    RUN(unsigned long, -1, real);
+    THCudaCheck(cudaGetLastError());
+  }
+
+  if (oldTensor) {
+    TensorUtils<THCTensor>::copyIgnoringOverlaps(state, oldTensor, tensor);
+    THCTensor_(free)(state, tensor);
+    tensor = oldTensor;
+  }
+  THCudaCheck(cudaGetLastError());
+}
+
+#undef RUN
+
+
+#define RUN(TYPE, DIMS, REAL)                                           \
+  THCudaTensor_scatterKernel<TYPE, REAL, DIMS>                               \
+  <<<grid, block, 0, THCState_getCurrentStream(state)>>>(               \
+    tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements);
+
+void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src) {
+  THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
+  THAssert(THCudaLongTensor_checkGPU(state, 1, index));
+
+  THArgCheck(dim >= 0 && dim < THCTensor_(nDimension)(state, tensor), 2,
+             "Index dimension is out of bounds");
+  THArgCheck(THCudaLongTensor_nDimension(state, index) == THCTensor_(nDimension)(state, src), 3,
+             "Index tensor must have same dimensions as input tensor");
+  THArgCheck(THCTensor_(nDimension)(state, src) == THCTensor_(nDimension)(state, tensor), 4,
+             "Input tensor must have same dimensions as output tensor");
+  THLongStorage *indexDims = THCudaLongTensor_newSizeOf(state, index);
+  THArgCheck(THCTensor_(isSize)(state, src, indexDims), 3,
+             "Index tensor must have the same size as input tensor.");
+  THLongStorage_free(indexDims);
+
+  for (int d = 0; d < THCTensor_(nDimension)(state, tensor); d++) {
+    if (d != dim) {
+      THArgCheck(THCTensor_(size)(state, tensor, d) == THCTensor_(size)(state, src, d), 4,
+                 "Input tensor must have same size as output tensor apart from the specified dimension");
+    }
+  }
+
+  THArgCheck(THCTensor_(nDimension)(state, tensor) <= MAX_CUTORCH_DIMS,
+             1, CUTORCH_DIM_WARNING);
+
+  const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index);
+  const dim3 block = getApplyBlock();
+  dim3 grid;
+  THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING);
+
+  THCTensor* oldTensor = NULL;
+  if (TensorUtils<THCTensor>::overlappingIndices(state, tensor)) {
+    oldTensor = tensor;
+    tensor = THCTensor_(newContiguous)(state, tensor);
+  }
+
+  if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, tensor) &&
+      TensorUtils<THCTensor>::canUse32BitIndexMath(state, src) &&
+      TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, index)) {
+    TensorInfo<real, unsigned int> tensorInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, tensor);
+    TensorInfo<real, unsigned int> srcInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, src);
+    TensorInfo<long, unsigned int> indexInfo =
+      getTensorInfo<THCudaLongTensor, unsigned int>(state, index);
+
+    // Specialize for a small number of dimensions.
+    switch (indexInfo.dims) {
+      case 1:
+        RUN(unsigned int, 1, real);
+        break;
+      case 2:
+        RUN(unsigned int, 2, real);
+        break;
+      case 3:
+        RUN(unsigned int, 3, real);
+        break;
+      default:
+        RUN(unsigned int, -1, real);
+        break;
+    }
+  } else {
+    TensorInfo<real, unsigned long> tensorInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, tensor);
+    TensorInfo<real, unsigned long> srcInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, src);
+    TensorInfo<long, unsigned long> indexInfo =
+      getTensorInfo<THCudaLongTensor, unsigned long>(state, index);
+
+    RUN(unsigned long, -1, real)
+  }
+
+  if (oldTensor) {
+    TensorUtils<THCTensor>::copyIgnoringOverlaps(state, oldTensor, tensor);
+    THCTensor_(free)(state, tensor);
+    tensor = oldTensor;
+  }
+  THCudaCheck(cudaGetLastError());
+}
+
+#undef RUN
+
+#define RUN(TYPE, DIMS, REAL)                                           \
+  THCudaTensor_scatterFillKernel<TYPE, REAL, DIMS>                           \
+      <<<grid, block, 0, THCState_getCurrentStream(state)>>>(      \
+          tensorInfo, indexInfo, value, dim, (TYPE)totalElements);
+
+void
+THCTensor_(scatterFill)(THCState* state, THCTensor *tensor,
+                         int dim, THCudaLongTensor *index, real value) {
+  THAssert(THCTensor_(checkGPU)(state, 1, tensor));
+  THAssert(THCudaLongTensor_checkGPU(state, 1, index));
+
+  THArgCheck(dim >= 0 && dim < THCTensor_(nDimension)(state, tensor), 2,
+             "Index dimension is out of bounds");
+  THArgCheck(THCudaLongTensor_nDimension(state, index) ==
+             THCTensor_(nDimension)(state, tensor), 3,
+             "Index tensor must have same dimensions as output tensor");
+
+  for (int d = 0; d < THCTensor_(nDimension)(state, tensor); d++) {
+    if (d != dim) {
+      THArgCheck(THCTensor_(size)(state, tensor, d) ==
+                 THCudaLongTensor_size(state, index, d), 4,
+                 "Index tensor must have same size as output tensor apart from the specified dimension");
+    }
+  }
+
+  THArgCheck(THCTensor_(nDimension)(state, tensor) <= MAX_CUTORCH_DIMS,
+             1, CUTORCH_DIM_WARNING);
+
+  const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index);
+  const dim3 block = getApplyBlock();
+  dim3 grid;
+  THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING);
+
+  THCTensor* oldTensor = NULL;
+  if (TensorUtils<THCTensor>::overlappingIndices(state, tensor)) {
+    oldTensor = tensor;
+    tensor = THCTensor_(newContiguous)(state, tensor);
+  }
+
+  if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, tensor) &&
+      TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, index)) {
+    TensorInfo<real, unsigned int> tensorInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, tensor);
+    TensorInfo<long, unsigned int> indexInfo =
+      getTensorInfo<THCudaLongTensor, unsigned int>(state, index);
+
+    // Specialize for a small number of dimensions.
+    switch (indexInfo.dims) {
+      case 1:
+        RUN(unsigned int, 1, real);
+        break;
+      case 2:
+        RUN(unsigned int, 2, real);
+        break;
+      case 3:
+        RUN(unsigned int, 3, real);
+        break;
+      default:
+        RUN(unsigned int, -1, real);
+        break;
+    }
+  } else {
+    TensorInfo<real, unsigned long> tensorInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, tensor);
+    TensorInfo<long, unsigned long> indexInfo =
+      getTensorInfo<THCudaLongTensor, unsigned long>(state, index);
+
+    RUN(unsigned long, -1, real);
+  }
+
+  if (oldTensor) {
+    TensorUtils<THCTensor>::copyIgnoringOverlaps(state, oldTensor, tensor);
+    THCTensor_(free)(state, tensor);
+    tensor = oldTensor;
+  }
+  THCudaCheck(cudaGetLastError());
+}
+
+#undef RUN
+
+#endif
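
For reference, the indexing convention these kernels implement, written as a
hedged Lua sketch (it assumes the usual gather/scatter tensor methods; the
shapes are illustrative). For dim = 1 and 1-based Lua indices, gather computes
out[i][j] = src[index[i][j]][j], and scatter/scatterFill perform the reverse
assignment:

    require 'cutorch'

    local src   = torch.CudaTensor(3, 4):uniform()
    local index = torch.CudaLongTensor(2, 4):fill(1)
    index[2]:fill(3)

    -- gather along dim 1: row 1 of out copies row 1 of src, row 2 copies row 3
    local out = src:gather(1, index)

    -- scatter writes back through the same index: dst[index[i][j]][j] = out[i][j]
    local dst = torch.CudaTensor(3, 4):zero()
    dst:scatter(1, index, out)

    -- the scalar form corresponds to scatterFill: dst[index[i][j]][j] = 42
    dst:scatter(1, index, 42)
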
diff --git a/lib/THC/generic/THCTensorScatterGather.h b/lib/THC/generic/THCTensorScatterGather.h
new file mode 100644
index 0000000..2071014
--- /dev/null
+++ b/lib/THC/generic/THCTensorScatterGather.h
@@ -0,0 +1,9 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorScatterGather.h"
+#else
+
+THC_API void THCTensor_(gather)(THCState* state, THCTensor *tensor, THCTensor *src, int dim, THCudaLongTensor *index);
+THC_API void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src);
+THC_API void THCTensor_(scatterFill)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, real value);
+
+#endif
diff --git a/lib/THC/generic/THCTensorSort.cu b/lib/THC/generic/THCTensorSort.cu
new file mode 100644
index 0000000..afef796
--- /dev/null
+++ b/lib/THC/generic/THCTensorSort.cu
@@ -0,0 +1,336 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorSort.cu"
+#else
+
+// Like the default sort on a C++ map, this function permutes the key and
+// value tensors identically, such that the 'key' tensor ends up in
+// numerical order
+THC_API void THCTensor_(sortKeyValueInplace)(THCState* state,
+                                           THCTensor* key,
+                                           THCudaLongTensor* value,
+                                           int dim, bool dir) {
+  THLongStorage *valueSize = THCudaLongTensor_newSizeOf(state, value);
+  THArgCheck(THCTensor_(isSize)(state, key, valueSize), 2,
+             "Key tensor must have same size as value tensor");
+  THLongStorage_free(valueSize);
+  long dims = THCudaLongTensor_nDimension(state, value);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING);
+  dims = THCTensor_(nDimension)(state, key);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
+
+  ptrdiff_t inElements = THCTensor_(nElement)(state, key);
+  long keySliceSize = THCTensor_(size)(state, key, dim);
+  ptrdiff_t keySlices = inElements / keySliceSize;
+
+  if (THCTensor_(nDimension)(state, key) == 0) {
+    // Zero-dim tensor; do nothing
+    return;
+  }
+
+  // The amount of shared memory and block size is based on
+  // 2^ceil(lg(n)); we choose that sorting implementation for a given
+  // size.
+  long ceilPowerOf2 = nextHighestPowerOf2(keySliceSize);
+
+  // FIXME: We'd have to find some other trick with Thrust to perform a
+  // vectorized (key, value) sort by slice segment
+  if (ceilPowerOf2 > 2048) {
+    THError("sortKeyValueInplace only works for sizes <= 2048 at present");
+  }
+
+  // The grid is based on the number of independent slices that we
+  // have to sort; one block per slice
+  dim3 grid;
+  if (!THC_getGridFromTiles(keySlices, grid)) {
+    THError("Slice to sort is too large");
+  }
+
+#define HANDLE_CASE(TYPE, A, SIZE)                                      \
+  do {                                                                  \
+    int blockSize = SIZE / 2;                                           \
+    if (blockSize < 1) {                                                \
+      blockSize = 1;                                                    \
+    }                                                                   \
+                                                                        \
+    dim3 block(blockSize);                                              \
+                                                                        \
+    if (dir) {                                                          \
+      bitonicSortKVInPlace<real, long, A, -1, GTComp<real>, TYPE, SIZE> \
+        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(         \
+          keyInfo,                                                      \
+          keySlices,                                                    \
+          (TYPE) keySliceSize,                                          \
+          (TYPE) keyInfo.strides[collapseKeyDim],                       \
+          valueInfo,                                                    \
+          (TYPE) valueInfo.strides[collapseValueDim],                   \
+          GTComp<real>());                                              \
+    } else {                                                            \
+      bitonicSortKVInPlace<real, long, A, -1, LTComp<real>, TYPE, SIZE> \
+        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(         \
+          keyInfo,                                                      \
+          keySlices,                                                    \
+          (TYPE) keySliceSize,                                          \
+          (TYPE) keyInfo.strides[collapseKeyDim],                       \
+          valueInfo,                                                    \
+          (TYPE) valueInfo.strides[collapseValueDim],                   \
+          LTComp<real>());                                              \
+    }                                                                   \
+  } while (0)
+
+#define HANDLE_SORT_CASE(TYPE, A)                       \
+  {                                                     \
+    switch (ceilPowerOf2) {                             \
+      case 2048:                                        \
+      HANDLE_CASE(TYPE, A, 2048);                       \
+      break;                                            \
+      case 1024:                                        \
+      case 512:                                         \
+      case 256:                                         \
+      HANDLE_CASE(TYPE, A, 1024);                       \
+      break;                                            \
+      case 128:                                         \
+      case 64:                                          \
+      HANDLE_CASE(TYPE, A, 128);                        \
+      break;                                            \
+      case 32:                                          \
+      case 16:                                          \
+      case 8:                                           \
+      case 4:                                           \
+      case 2:                                           \
+      HANDLE_CASE(TYPE, A, 32);                         \
+      break;                                            \
+      case 1:                                           \
+      /* Nothing to do, data already sorted */          \
+      break;                                            \
+      default:                                          \
+      assert(false);                                    \
+    }                                                   \
+  }
+
+  // The constructed key/value tensor info is used to select the slice
+  // we are sorting on a per-block basis
+  if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, key)) {
+    TensorInfo<real, unsigned int> keyInfo =
+      getTensorInfo<THCTensor, unsigned int>(state, key);
+    keyInfo.reduceDim(dim);
+    int collapseKeyDim = keyInfo.collapseDims(dim);
+
+    TensorInfo<long, unsigned int> valueInfo =
+      getTensorInfo<THCudaLongTensor, unsigned int>(state, value);
+    valueInfo.reduceDim(dim);
+    int collapseValueDim = valueInfo.collapseDims(dim);
+
+    if (keyInfo.isContiguous()) {
+      HANDLE_SORT_CASE(unsigned int, -2);
+    } else {
+      switch (keyInfo.dims) {
+        case 2:
+          HANDLE_SORT_CASE(unsigned int, 2);
+          break;
+        default:
+          HANDLE_SORT_CASE(unsigned int, -1);
+          break;
+      }
+    }
+  } else {
+    TensorInfo<real, unsigned long> keyInfo =
+      getTensorInfo<THCTensor, unsigned long>(state, key);
+    keyInfo.reduceDim(dim);
+    int collapseKeyDim = keyInfo.collapseDims(dim);
+
+    TensorInfo<long, unsigned long> valueInfo =
+      getTensorInfo<THCudaLongTensor, unsigned long>(state, value);
+    valueInfo.reduceDim(dim);
+    int collapseValueDim = valueInfo.collapseDims(dim);
+
+    // long case is rare, just instantiate the generic version
+    HANDLE_SORT_CASE(unsigned long, -1);
+  }
+#undef HANDLE_CASE
+#undef HANDLE_SORT_CASE
+#undef HANDLE_A_CASE
+
+  THCudaCheck(cudaGetLastError());
+}
+
+void sortViaThrust(THCState* state,
+                   THCTensor* sorted,
+                   THCudaLongTensor* indices,
+                   THCTensor* input,
+                   int dim, bool dir) {
+  long nDims = THCTensor_(nDimension)(state, input);
+
+  ptrdiff_t totalElements = THCTensor_(nElement)(state, input);
+  long sliceSize = THCTensor_(size)(state, input, dim);
+  long sliceStride = THCTensor_(stride)(state, input, dim);
+
+  // We perform a vectorized segmented sort in Thrust.
+  // Say we are sorting a (2, 3) tensor. We have in flattened form:
+  // values 0.4 1.2 5.3 6.2 1.3 2.3
+  // indices  0   1   2   3   4   5
+  // where indices is a global index (across all slices)
+
+  // First we sort by values, globally (descending in this example):
+  // values 6.2 5.3 2.3 1.3 1.2 0.4
+  // indices  3   2   5   4   1   0
+
+  // Then we stable sort by segment, which is index / 3:
+  // values 5.3 1.2 0.4 6.2 2.3 1.3
+  // indices  2   1   0   3   5   4
+
+  // Then we translate the global index to a per-slice Lua index
+  // (index % 3) + 1:
+  // values 5.3 1.2 0.4 6.2 2.3 1.3
+  // indices  3   2   1   1   3   2
+
+  // This method can only work if the slice we are sorting (`dim`) is
+  // innermost, and both values and indices are contiguous. We do this
+  // by re-arranging the input into this form as needed, which will
+  // unfortunately allocate memory if the request is not in this form.
+  // Vectorized sort is slower than iterated sort if the number of
+  // slices is small (since we're sorting twice, instead of invoking a
+  // smaller sort `numSlices` times), but the Thrust sort
+  // implementation here is a catch-all, so we're not looking for
+  // efficiency, but instead correctness.
+  THCTensor_(copy)(state, sorted, input);
+  THCTensor* trKeys = THCTensor_(newWithTensor)(state, sorted);
+  THCudaLongTensor* trIndices = THCudaLongTensor_newWithTensor(state, indices);
+
+  // Transpose dim to innermost
+  if (dim != nDims - 1) {
+    THCTensor_(transpose)(state, trKeys, NULL, dim, nDims - 1);
+    THCudaLongTensor_transpose(state, trIndices, NULL, dim, nDims - 1);
+  }
+
+  // Thrust must operate on a contiguous layout
+  THCTensor* trContigKey = THCTensor_(newContiguous)(state, trKeys);
+  THCudaLongTensor* trContigIndices = THCudaLongTensor_newContiguous(state, trIndices);
+
+  THCTensor_(free)(state, trKeys);
+  THCudaLongTensor_free(state, trIndices);
+
+  THCThrustAllocator thrustAlloc(state);
+
+  thrust::device_ptr<real> keyIter(THCTensor_(data)(state, trContigKey));
+
+  // Since we are composing a global index across all segments rather
+  // than a per-segment index, we treat the memory as int so we don't
+  // have problems sorting slices < 2^24 but where the entire tensor
+  // has more than 2^24 elements
+  thrust::device_ptr<long>
+    indexIter((long*) THCudaLongTensor_data(state, trContigIndices));
+
+  // Fill the indices with a global index across all slices
+  thrust::counting_iterator<long> countIter(0);
+
+  thrust::copy(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#endif
+    countIter, countIter + totalElements, indexIter);
+
+  // First, we sort globally (across all slices) according to key
+  // (the values we're sorting)
+  if (dir) {
+    thrust::stable_sort_by_key(
+#if CUDA_VERSION >= 7000
+      thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#endif
+      keyIter, keyIter + totalElements, indexIter, ThrustGTOp<real>());
+  } else {
+    thrust::stable_sort_by_key(
+#if CUDA_VERSION >= 7000
+      thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#endif
+      keyIter, keyIter + totalElements, indexIter, ThrustLTOp<real>());
+  }
+
+  // Then, re-sort according to slice that each index is
+  // in. This completes the segment sort in Thrust, since we're
+  // stably sorting here, preserving the relative order of values
+  // per each slice
+  thrust::stable_sort_by_key(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#endif
+    indexIter, indexIter + totalElements, keyIter,
+    SliceComp(sliceSize));
+
+  // Translate the global integer 0-based index to a per-slice real
+  // Lua index
+  thrust::for_each(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
+#endif
+    indexIter, indexIter + totalElements,
+    GlobalIndexToPerSliceIndex(sliceSize));
+
+  // Reverse the transposition as needed
+  if (dim != nDims - 1) {
+    THCTensor_(transpose)(state, trContigKey, NULL, dim, nDims - 1);
+    THCudaLongTensor_transpose(state, trContigIndices, NULL, dim, nDims - 1);
+  }
+
+  // Then copy back to the expected output
+  THCTensor_(freeCopyTo)(state, trContigKey, sorted);
+  THCudaLongTensor_freeCopyTo(state, trContigIndices, indices);
+}
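
The comment block above describes the two-pass trick in prose; here is a small
plain-Lua walkthrough of the same idea on the 2x3 example (purely illustrative,
with made-up local names; the real work is done by the stable_sort_by_key
calls above):

    -- values of a (2, 3) tensor in flattened form, slice size 3
    local values    = {0.4, 1.2, 5.3, 6.2, 1.3, 2.3}
    local sliceSize = 3

    -- decorate each element with its global 0-based index
    local items = {}
    for i, v in ipairs(values) do
       items[i] = {value = v, index = i - 1}
    end

    -- pass 1: sort globally by value (descending here), breaking ties by
    -- index so that table.sort behaves as if it were stable
    table.sort(items, function(a, b)
       if a.value ~= b.value then return a.value > b.value end
       return a.index < b.index
    end)

    -- remember the pass-1 rank so that pass 2 stays stable as well
    for rank, it in ipairs(items) do it.rank = rank end

    -- pass 2: stable sort by segment (global index / sliceSize)
    table.sort(items, function(a, b)
       local sa = math.floor(a.index / sliceSize)
       local sb = math.floor(b.index / sliceSize)
       if sa ~= sb then return sa < sb end
       return a.rank < b.rank
    end)

    -- translate the global 0-based index to a per-slice 1-based Lua index
    for _, it in ipairs(items) do
       io.write(string.format("%.1f/%d ", it.value, it.index % sliceSize + 1))
    end
    io.write("\n")   -- prints: 5.3/3 1.2/2 0.4/1 6.2/1 2.3/3 1.3/2
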
+
+THC_API void THCTensor_(sort)(THCState* state,
+                               THCTensor *sorted,
+                               THCudaLongTensor *indices,
+                               THCTensor *input,
+                               int dim, int order) {
+  THAssert(THCTensor_(checkGPU)(state, 2, sorted, input));
+  THAssert(THCudaLongTensor_checkGPU(state, 1, indices));
+  long dims = THCTensor_(nDimension)(state, sorted);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
+  dims = THCTensor_(nDimension)(state, input);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING);
+  dims = THCudaLongTensor_nDimension(state, indices);
+  THArgCheck(dims <= MAX_CUTORCH_DIMS, 3, CUTORCH_DIM_WARNING);
+
+  // Make sure sufficient output space is allocated
+  THCTensor_(resizeAs)(state, sorted, input);
+  THLongStorage *inputSize = THCTensor_(newSizeOf)(state, input);
+  THCudaLongTensor_resize(state, indices, inputSize, NULL);
+  THLongStorage_free(inputSize);
+
+  // How large are the slices that we are sorting?
+  long sliceSize = THCTensor_(size)(state, input, dim);
+
+  // Workaround:
+  // CUDA 8 uses more shared memory than 7.5 for bitonicSortKVInPlace,
+  // and so for the double word types,
+  // we get "too many resources requested for launch" in the 2048 case
+#if CUDA_VERSION >= 8000
+#if defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_LONG)
+  int maxSliceSize = 1024;
+#else
+  int maxSliceSize = 2048;
+#endif
+#else
+  int maxSliceSize = 2048;
+#endif
+
+  if (sliceSize <= maxSliceSize) {
+    // Fill `indices` (the values) with the
+    // slice-relative index.
+    THCudaLongTensor_fillSliceWithIndex(state, indices, dim);
+
+    // We sort k/v pairs in-place; copy unsorted input to output
+    THCTensor_(copy)(state, sorted, input);
+
+    // Sort using our in-place k/v kernel that supports arbitrary
+    // layout
+    THCTensor_(sortKeyValueInplace)(state, sorted, indices, dim, order);
+  } else {
+    // Otherwise, fall back upon Thrust, which handles all other cases
+    // (potentially slowly, with extra copies/memory allocations)
+    sortViaThrust(state, sorted, indices, input, dim, (bool) order);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
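
At the Lua level this dispatch is transparent: slices of up to 2048 elements
(1024 for double/long under CUDA >= 8) take the in-place bitonic path, larger
slices fall back to the Thrust path above. A hedged usage sketch, assuming the
standard torch.sort wrapper:

    require 'cutorch'

    local x = torch.CudaTensor(128, 1000):uniform()

    -- sort every row ascending; indices are 1-based positions within the row
    local sorted, indices = torch.sort(x, 2)

    -- descending sort along the first dimension
    local sortedDesc, descIdx = torch.sort(x, 1, true)
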
diff --git a/lib/THC/generic/THCTensorSort.h b/lib/THC/generic/THCTensorSort.h
new file mode 100644
index 0000000..009d825
--- /dev/null
+++ b/lib/THC/generic/THCTensorSort.h
@@ -0,0 +1,20 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorSort.h"
+#else
+
+/* Performs an in-place sort of (keys, values). Only works for slice sizes
+   <= 2048 at the moment (slice size == size of keys/values dim `dim`) */
+THC_API void THCTensor_(sortKeyValueInplace)(THCState* state,
+                                             THCTensor* keys,
+                                             THCudaLongTensor* values,
+                                             int dim, int order);
+
+/* Performs an out-of-place sort of `input`, returning the per-slice indices
+   in `indices` and the sorted values in `sorted` */
+THC_API void THCTensor_(sort)(THCState* state,
+                              THCTensor* sorted,
+                              THCudaLongTensor* indices,
+                              THCTensor* input,
+                              int dim, int order);
+
+#endif
diff --git a/rocks/cutorch-1.0-0.rockspec b/rocks/cutorch-1.0-0.rockspec
new file mode 100644
index 0000000..07e309e
--- /dev/null
+++ b/rocks/cutorch-1.0-0.rockspec
@@ -0,0 +1,38 @@
+package = "cutorch"
+version = "1.0-0"
+
+source = {
+   url = "git://github.com/torch/cutorch.git",
+   tag = "1.0-0"
+}
+
+description = {
+   summary = "Torch CUDA Implementation",
+   detailed = [[
+   ]],
+   homepage = "https://github.com/torch/cutorch",
+   license = "BSD"
+}
+
+dependencies = {
+   "torch >= 7.0",
+}
+
+build = {
+   type = "command",
+   build_command = [[
+
+jopts=$(getconf _NPROCESSORS_CONF)
+
+echo "Building on $jopts cores"
+cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$jopts install
+]],
+	platforms = {
+      windows = {
+   build_command = [[
+cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
+]]
+	  }
+   },
+   install_command = "cd build"
+}
diff --git a/rocks/cutorch-scm-1.rockspec b/rocks/cutorch-scm-1.rockspec
new file mode 100644
index 0000000..8314385
--- /dev/null
+++ b/rocks/cutorch-scm-1.rockspec
@@ -0,0 +1,37 @@
+package = "cutorch"
+version = "scm-1"
+
+source = {
+   url = "git://github.com/torch/cutorch.git",
+}
+
+description = {
+   summary = "Torch CUDA Implementation",
+   detailed = [[
+   ]],
+   homepage = "https://github.com/torch/cutorch",
+   license = "BSD"
+}
+
+dependencies = {
+   "torch >= 7.0",
+}
+
+build = {
+   type = "command",
+   build_command = [[
+
+jopts=$(getconf _NPROCESSORS_CONF)
+
+echo "Building on $jopts cores"
+cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$jopts install
+]],
+	platforms = {
+      windows = {
+   build_command = [[
+cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
+]]
+	  }
+   },
+   install_command = "cd build"
+}
diff --git a/rocks/version.sh b/rocks/version.sh
new file mode 100644
index 0000000..a2cd17d
--- /dev/null
+++ b/rocks/version.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+cd "$(dirname "$0")"
+fname=$(ls|grep rockspec|grep -v scm | sort -r -V|head -n1)
+echo "Last known version: $fname" 
+luarocks new_version $fname
+
+new_fname=$(ls|grep rockspec|grep -v scm | sort -r -V|head -n1)
+new_version=$(echo $new_fname | cut -f2,3,4,5 -d'-'|sed -e 's/.rockspec//g')
+echo "new rockspec: $new_fname"
+echo "new version: $new_version"
+git add $new_fname
+git commit -m "Cutting version $new_version"
+git branch $new_version
+
+git push origin master:master
+git push origin $new_version:$new_version
+
+git clone https://github.com/torch/rocks
+cp $new_fname rocks/
+cd rocks
+th make-manifest.lua
+git add $new_fname
+git commit -am "adding rockspec $new_fname"
+git push
+cd ..
+rm -rf rocks
+cd ..
+
diff --git a/test/test.lua b/test/test.lua
new file mode 100644
index 0000000..32918b1
--- /dev/null
+++ b/test/test.lua
@@ -0,0 +1,4225 @@
+local runtests = false
+if not cutorch then
+   require 'cutorch'
+   runtests = true
+end
+
+local test = {}
+local minsize = 5
+local maxsize = 10
+local minvalue = 2
+local maxvalue = 20
+local nloop = 100
+local test_tolerance = 1e-5
+local unpack = unpack or table.unpack
+local hasHalfChecked = false
+--e.g. unit test cmd: th -lcutorch -e "cutorch.test{'view','viewAs'}"
+
+local typenames = {
+    'torch.CudaByteTensor',
+    'torch.CudaCharTensor',
+    'torch.CudaShortTensor',
+    'torch.CudaIntTensor',
+    'torch.CudaLongTensor',
+    'torch.CudaTensor',
+    'torch.CudaDoubleTensor'
+}
+
+local float_typenames = {
+    'torch.CudaTensor',
+    'torch.CudaDoubleTensor'
+}
+
+local t2gpu = {
+   ['torch.ByteTensor'] = 'torch.CudaByteTensor',
+   ['torch.CharTensor'] = 'torch.CudaCharTensor',
+   ['torch.ShortTensor'] = 'torch.CudaShortTensor',
+   ['torch.IntTensor'] = 'torch.CudaIntTensor',
+   ['torch.LongTensor'] = 'torch.CudaLongTensor',
+   ['torch.FloatTensor'] = 'torch.CudaTensor',
+   ['torch.DoubleTensor'] = 'torch.CudaDoubleTensor',
+
+   ['torch.ByteStorage'] = 'torch.CudaByteStorage',
+   ['torch.CharStorage'] = 'torch.CudaCharStorage',
+   ['torch.ShortStorage'] = 'torch.CudaShortStorage',
+   ['torch.IntStorage'] = 'torch.CudaIntStorage',
+   ['torch.LongStorage'] = 'torch.CudaLongStorage',
+   ['torch.FloatStorage'] = 'torch.CudaStorage',
+   ['torch.DoubleStorage'] = 'torch.CudaDoubleStorage',
+}
+
+local t2cpu = {}
+for k,v in pairs(t2gpu) do
+   t2cpu[v] = k
+end
+
+local function checkHalf()
+   if cutorch.hasHalf and hasHalfChecked == false then
+       table.insert(typenames, 'torch.CudaHalfTensor')
+       table.insert(float_typenames, 'torch.CudaHalfTensor')
+       t2cpu['torch.CudaHalfTensor'] = 'torch.FloatTensor'
+       t2gpu['torch.HalfTensor'] = 'torch.CudaHalfTensor'
+   end
+   hasHalfChecked = true
+end
+
+local function isFloat(t)
+    for k, v in pairs(float_typenames) do
+        if t == k then
+            return true
+        end
+    end
+    return false
+end
+
+-- Picks an integer between a and b, inclusive of endpoints
+local function chooseInt(a, b)
+   return math.floor(torch.uniform(a, b + 1))
+end
+
+-- Constructs a tensor from a larger storage, with holes in each dimension
+local function createHoledTensorWithSizes(size)
+   local osize = {}
+   for i = 1, #size do osize[i] = size[i] end
+   -- randomly inflate a few dimensions in osize
+   for i = 1, 3 do
+      local dim = torch.random(1,#osize)
+      local add = torch.random(4, 15)
+      osize[dim] = osize[dim] + add
+   end
+   local input = torch.FloatTensor(torch.LongStorage(osize))
+   -- now extract the input of correct size from 'input'
+   for i = 1, #size do
+      if input:size(i) ~= size[i] then
+         local bounds = torch.random(1, input:size(i) - size[i] + 1)
+         input = input:narrow(i, bounds, size[i])
+      end
+   end
+   return input
+end
+
+-- Create a tensor of a given size, allowing for transpositions or holes
+local function createTestTensorWithSizes(allowHoles, allowTransposition, sizes)
+   local t = nil
+   if allowHoles then
+      t = createHoledTensorWithSizes(sizes)
+   else
+      t = torch.FloatTensor(unpack(sizes))
+   end
+
+   if allowTransposition then
+      local dims = t:nDimension()
+
+      local numTranspositions = chooseInt(1, dims)
+
+      for i = 1, numTranspositions do
+         local dim1 = chooseInt(1, dims)
+         local dim2 = chooseInt(1, dims)
+
+         if dim1 ~= dim2 then
+            t = t:transpose(dim1, dim2)
+         end
+      end
+   end
+
+   if allowHoles then
+      -- fill the holes with NaNs (the non-holes will be overwritten below)
+      -- this will help detect garbage usage
+      t:storage():fill(0/0)
+   end
+
+   -- The test tensor may be used for sort/selection testing, in which
+   -- case we wish to avoid duplicate elements, but might like some
+   -- randomness
+   t:copy(torch.randperm(t:nElement()))
+
+   return t
+end
+
+-- Create a test tensor bounded by total size `maxSize`
+local function createTestTensorMaxSize(allowHoles, allowTransposition, maxSize)
+   local dims = chooseInt(1, 5)
+   local maxDimSize = math.ceil(math.pow(maxSize, 1 / dims))
+   local sizes = nil
+
+   while true do
+      sizes = {}
+      local size = 1
+
+      for i = 1, dims do
+         sizes[i] = chooseInt(1, maxDimSize)
+         size = size * sizes[i]
+      end
+
+      if (size > 1) and (size < maxSize) then
+         break
+      end
+   end
+
+   return createTestTensorWithSizes(allowHoles, allowTransposition, sizes)
+end
+
+-- Create a (potentially transposed, potentially with holes) tensor of a given
+-- max size
+local function createTestTensor(maxSize)
+   -- 50/50 chance of contig/non-contig
+   local contig = chooseInt(1, 2) == 1
+   local holes = false
+   local tr = false
+   if not contig then
+      holes = chooseInt(1, 2) == 1
+      tr = chooseInt(1, 2) == 1
+   end
+
+   return createTestTensorMaxSize(holes, tr, maxSize)
+end
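
A minimal sketch of what the 'holes' produced above look like, using only
stock torch calls (the names are illustrative): a narrowed view keeps the
strides of the larger parent storage, so its elements are not laid out
contiguously and the padding keeps its NaN fill.

    require 'torch'

    -- an 8x8 parent storage of which only a 5x5 window is used; strides
    -- stay (8, 1), so the window is non-contiguous
    local big  = torch.FloatTensor(8, 8):fill(0/0)
    local view = big:narrow(1, 2, 5):narrow(2, 2, 5)
    view:copy(torch.randperm(view:nElement()))

    print(view:isContiguous())              -- false
    print(view:stride(1), view:stride(2))   -- 8   1
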
+
+local function isEqual(x, y, tolerance, ...)
+   if x == nil and y == nil then return true end
+   if x == nil and y ~= nil then return false end
+   if x ~= nil and y == nil then return false end
+
+   -- clone the tensors so we can modify the contents if necessary for testing
+   local a = x:clone()
+   local b = y:clone()
+
+   if torch.type(b) ~= torch.type(a) then
+      b = b:typeAs(a) -- TODO: remove the need for this (a-b doesn't work for bytetensor, cudatensor pairs)
+   end
+   local diff = a-b
+   tolerance = tolerance or 0.000001
+
+   if type(a) == 'number' then
+      -- NaN Check:
+      if a ~= a and b ~= b then
+          return true
+      end
+      return math.abs(diff) < tolerance
+   else
+      if torch.type(diff) ~= 'torch.FloatTensor' then
+         diff = diff:float() -- TODO: remove the need for this (byteTensor and abs)
+      end
+      -- NaN Check:
+      local hasNaN = false
+      diff:apply(function(elt) if elt ~= elt then hasNaN = true end end)
+      if hasNaN then
+         -- check if NaN in equal positions
+         local nea = torch.ne(a, a)
+         local neb = torch.ne(b, b)
+         if not nea:equal(neb) then
+            return false
+         end
+         -- check diff of all other elements less than tolerance
+         local ea = a:apply(function(elt) if elt ~= elt then return 0 else return elt end end)
+         local eb = b:apply(function(elt) if elt ~= elt then return 0 else return elt end end)
+         return (ea-eb):abs():max() < tolerance
+      else
+         return diff:abs():max() < tolerance
+      end
+   end
+end
+
+local function checkMultiDevice(x, fn, ...)
+   local device_count = cutorch.getDeviceCount()
+   if device_count >= 2 then
+      local x = x:cuda()
+      cutorch.setDevice(cutorch.getDevice() == 1 and 2 or 1)
+      local ok, err = pcall(function(...) x[fn](x, ...) end, ...)
+      tester:assert(not ok, "Multi-device checks failed for: " .. tostring(fn))
+   end
+end
+
+local function cloneExactlyToGPU(t)
+   -- keep the size/stride of original tensor, handling tensors that
+   -- potentially have holes as well
+   local tGPU = nil
+
+   if t:storage() then
+      local sGPU = torch.CudaStorage(t:storage():size()):copy(t:storage())
+      tGPU = torch.CudaTensor(sGPU, t:storageOffset(), t:size(), t:stride())
+   else
+      tGPU = torch.CudaTensor()
+   end
+
+   return tGPU
+end
+
+local function compareFloatAndCuda(x, fn, ...)
+   local args = {...}
+   args['input'] = x
+   local x_cpu = x:float()
+   local x_cuda = cloneExactlyToGPU(x_cpu)
+
+   local rcpu = {}
+   local rcuda = {}
+   if type(fn) == 'string' then
+      tester:assertne(x_cuda[fn], nil,
+		      string.format("Missing function CudaTensor.%s", fn))
+      rcpu[1], rcpu[2], rcpu[3], rcpu[4] = x_cpu[fn](x_cpu, ...)
+      rcuda[1], rcuda[2], rcuda[3], rcuda[4] = x_cuda[fn](x_cuda, ...)
+   elseif type(fn) == 'function' then
+      rcpu[1], rcpu[2], rcpu[3], rcpu[4] = fn(x_cpu, ...)
+      rcuda[1], rcuda[2], rcuda[3], rcuda[4] = fn(x_cuda, ...)
+   else
+      error("Incorrect function type")
+   end
+   local errstr = string.format("Divergent results between CPU and CUDA" ..
+				" for function '%s' (return value 1)", tostring(fn))
+   local tolerance = test_tolerance
+   tester:assert(#rcpu == #rcuda,
+		 string.format("number of return arguments for CPU and CUDA "
+			       .. "are different for function '%s'", tostring(fn)))
+   for k, _ in ipairs(rcpu) do
+      if not isEqual(rcpu[k], rcuda[k], tolerance) then
+	      print(args)
+	      tester:assert(false, errstr)
+      end
+   end
+end
+
+local function compareFloatAndCudaTensorArgs(x, fn, ...)
+   local x_cpu = x:float()
+   local x_cuda = cloneExactlyToGPU(x_cpu)
+
+   local rcpu = {}
+   local rcuda = {}
+
+   -- Transformation of args
+   local tranform_args = function(t, type)
+      for k,v in pairs(t) do
+         local v_type = torch.Tensor.type(v)
+         if v_type == 'torch.FloatTensor' or v_type == 'torch.CudaTensor'
+	 or v_type == 'torch.DoubleTensor' then
+            t[k] = v:type(type).new(v:size(), v:stride())
+            if v:storage() then t[k]:storage():copy(v:storage()) end
+         end
+      end
+      return t
+   end
+   local cpu_args = tranform_args({...}, 'torch.FloatTensor')
+   local cuda_args = tranform_args({...}, 'torch.CudaTensor')
+   if type(fn) == 'string' then
+      tester:assertne(x_cuda[fn], nil,
+         string.format("Missing function CudaTensor.%s", fn))
+      rcpu[1], rcpu[2], rcpu[3], rcpu[4]  = x_cpu[fn](x_cpu, unpack(cpu_args))
+      rcuda[1], rcuda[2], rcuda[3], rcuda[4] = x_cuda[fn](x_cuda, unpack(cuda_args))
+   elseif type(fn) == 'function' then
+      rcpu[1], rcpu[2], rcpu[3], rcpu[4] = fn(x_cpu, unpack(cpu_args))
+      rcuda[1], rcuda[2], rcuda[3], rcuda[4] = fn(x_cuda, unpack(cuda_args))
+   else
+      error("Incorrect function type")
+   end
+   local errstr = string.format("Divergent results between CPU and CUDA" ..
+				" for function '%s' (return value 1)", tostring(fn))
+   local tolerance = test_tolerance
+   tester:assert(#rcpu == #rcuda,
+		 string.format("number of return arguments for CPU and CUDA "
+			       .. "are different for function '%s'", tostring(fn)))
+   for k, _ in ipairs(rcpu) do
+      if not isEqual(rcpu[k], rcuda[k], tolerance) then
+         print(cpu_args)
+	 tester:assert(false, errstr)
+      end
+   end
+end
+
+-- converts a tensor to its exact GPU type
+local function GPU(t, gpu2cpu_map)
+   gpu2cpu_map = gpu2cpu_map or t2gpu
+   if torch.isTensor(t) or torch.isStorage(t) then
+      return torch[gpu2cpu_map[torch.type(t)]:match('torch.(%a+)')] or t
+   elseif torch.type(t) == 'string' then
+      return torch[gpu2cpu_map[t]:match('torch.(%a+)')]
+   end
+   error('not tensor or storage')
+end
+
+-- converts a tensor to its exact CPU type
+local function CPU(t)
+   if torch.isTensor(t) or torch.isStorage(t) then
+      return torch[t2cpu[torch.type(t)]:match('torch.(%a+)')] or t
+   elseif torch.type(t) == 'string' then
+      return torch[t2cpu[t]:match('torch.(%a+)')]
+   end
+   error('not tensor or storage')
+end
+
+-- exactly clone a tensor (same size / storage) to its equivalent GPU type
+-- if baseType is given, convert to the baseType's GPU type instead
+local function cloneExactlyToGPUType(t, baseType, gpu2cpu_map)
+   local type = baseType and baseType or t
+   -- keep the size/stride of original tensor, handling tensors that
+   -- potentially have holes as well
+   local tGPU = nil
+   if t:storage() then
+      local sGPU = GPU(type, gpu2cpu_map).new(1):storage().new(t:storage():size()):copy(t:storage())
+      tGPU = GPU(type, gpu2cpu_map)(sGPU, t:storageOffset(), t:size(), t:stride())
+   else
+      tGPU = GPU(type, gpu2cpu_map)()
+   end
+
+   return tGPU
+end
+
+-- baseType = the tensor type to test
+-- indexMode = true: keep indexing and masking Tensors as their CPU equivalents
+--             false: convert them to baseType when doing CUDA
+-- x = first argument tensor
+-- gpu2cpu_map = map of gpu types to cpu types
+-- fn = function name (as string), or the function itself
+-- ... = the rest of arguments to fn
+local function compareCPUAndCUDATypeTensorArgsWithConv(cudaType, gpu2cpu_map, indexMode, x, fn, ...)
+   local baseType = t2cpu[cudaType]
+   assert(baseType, 'Cannot find baseType for ' .. cudaType)
+   local x_cpu = x:type(baseType)
+   local x_cuda = cloneExactlyToGPUType(x_cpu, nil, gpu2cpu_map)
+
+   local rcpu = {}
+   local rcuda = {}
+   -- Transformation of args
+   local tranform_args = function(t, type)
+      for k,v in pairs(t) do
+	 if torch.isTensor(v) or torch.isStorage(v) then
+	    if indexMode == true then
+                t[k] = cloneExactlyToGPUType(v, nil, gpu2cpu_map)
+	    else
+                t[k] = cloneExactlyToGPUType(v, x_cpu, gpu2cpu_map)
+	    end
+         end
+      end
+      return t
+   end
+
+   local cpu_args = {...}
+   local cuda_args = tranform_args({...})
+   if type(fn) == 'string' then
+      tester:assertne(x_cuda[fn], nil,
+                     string.format("Missing function %s.%s", torch.type(x_cuda), fn))
+      rcpu[1], rcpu[2], rcpu[3], rcpu[4]  = x_cpu[fn](x_cpu, unpack(cpu_args))
+      rcuda[1], rcuda[2], rcuda[3], rcuda[4] = x_cuda[fn](x_cuda, unpack(cuda_args))
+   elseif type(fn) == 'function' then
+      rcpu[1], rcpu[2], rcpu[3], rcpu[4] = fn(x_cpu, unpack(cpu_args))
+      rcuda[1], rcuda[2], rcuda[3], rcuda[4] = fn(x_cuda, unpack(cuda_args))
+   else
+      error("Incorrect function type")
+   end
+
+   local tolerance = test_tolerance
+   local errstr = string.format("Divergent results between CPU and CUDA"
+                                .. " for function '%s.%s'", torch.type(x_cuda), fn)
+   if indexMode ~= nil then
+      errstr = errstr .. " in indexMode = " .. tostring(indexMode)
+   end
+   local errstrval = errstr .. " for return value # %d"
+   errstrval = errstrval .. ". Divergence value: %f"
+   local errstrobj = errstr .. " for object"
+   errstrobj = errstrobj .. ". Divergence value: %f"
+   local function divval(cpu, cuda)
+      return torch.isTensor(cpu) and (cpu:double() - cuda:double()):abs():max() or 0
+   end
+
+   tester:assert(#rcpu == #rcuda,
+		 string.format("number of return arguments for CPU and CUDA "
+			       .. "are different for function '%s'", tostring(fn)))
+   for k, _ in ipairs(rcpu) do
+      tester:assert(isEqual(rcpu[k], rcuda[k], tolerance),
+                    string.format(errstrval, k, divval(rcpu[k], rcuda[k])))
+   end
+   -- also test x in case function changed object
+   tester:assert(isEqual(x_cpu, x_cuda, tolerance),
+                 string.format(errstrobj, divval(x_cpu, x_cuda)))
+end
+
+-- baseType = the tensor type to test
+-- indexMode = true: keep indexing and masking Tensors as their CPU equivalents
+--             false: convert them to baseType when doing CUDA
+-- x = first argument tensor
+-- fn = function name (as string), or the function itself
+-- ... = the rest of arguments to fn
+local function compareCPUAndCUDATypeTensorArgs(cudaType, indexMode, x, fn, ...)
+   compareCPUAndCUDATypeTensorArgsWithConv(cudaType, nil, indexMode, x, fn, ...)
+end
+
+function test.squeeze()
+   local sz = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz, 1, sz, 1)
+   for k, typename in ipairs(typenames) do
+      local x = x:type(typename)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'squeeze')
+   end
+
+   local y = x:cuda():squeeze()
+   tester:assert(y:dim() == 2, "squeeze err")
+
+   x = torch.FloatTensor():rand(sz, 1, 1, sz)
+   for k, typename in ipairs(typenames) do
+      local x = x:type(typename)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'squeeze', 2)
+   end
+
+   local y = x:cuda():squeeze(2)
+   tester:assert(y:dim() == 3, "squeeze1d err")
+
+   x = torch.FloatTensor(1):normal()
+   for k, typename in ipairs(typenames) do
+      local x = x:type(typename)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'squeeze')
+   end
+end
+
+function test.expand()
+   local sz = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz, 1)
+   compareFloatAndCuda(x, 'expand', sz, sz)
+
+   x = torch.FloatTensor():rand(1, sz)
+   compareFloatAndCuda(x, 'expand', sz, sz)
+end
+
+function test.view()
+   local sz = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz, 3)
+   compareFloatAndCuda(x, 'view', sz, 3, 1)
+end
+
+function test.viewAs()
+   local sz = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz, 3)
+   local y = torch.FloatTensor():rand(sz, 3, 1)
+   compareFloatAndCudaTensorArgs(x, 'viewAs', y)
+end
+
+function test.repeatTensor()
+   local sz = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz, 3)
+   compareFloatAndCuda(x, 'repeatTensor', sz, 2)
+end
+
+function test.permute()
+   local perm = torch.randperm(7):totable()
+   local x = torch.FloatTensor():rand(1, 2, 3, 4, 5, 6, 7)
+   compareFloatAndCuda(x, 'permute', unpack(perm))
+end
+
+function test.split()
+   local sz = {chooseInt(minsize, maxsize),
+               chooseInt(minsize, maxsize),
+               chooseInt(minsize, maxsize)}
+   local x = torch.rand(unpack(sz))
+   local dim = torch.random(3)
+   local size = torch.random(sz[dim])
+   local y = x:split(size, dim)
+   local y_ref = x:float():split(size, dim)
+
+   tester:asserteq(#y, #y_ref)
+   for i = 1, math.min(#y, #y_ref) do
+      tester:assertTensorEq(y[i]:float(), y_ref[i], 0)
+   end
+end
+
+function test.chunk()
+   local sz = {chooseInt(minsize, maxsize),
+               chooseInt(minsize, maxsize),
+               chooseInt(minsize, maxsize)}
+   local x = torch.rand(unpack(sz))
+   local dim = torch.random(3)
+   local n = torch.random(sz[dim])
+   local y = x:chunk(n, dim)
+   local y_ref = x:float():chunk(n, dim)
+
+   tester:asserteq(#y, #y_ref)
+   for i = 1, math.min(#y, #y_ref) do
+      tester:assertTensorEq(y[i]:float(), y_ref[i], 0)
+   end
+end
+
+function test.copyRandomizedTest()
+   local maxSize = 1000000 -- 1M elements max
+   local ndimInput = torch.random(10)
+   local function randomSizeGenerator(ndimInput)
+      local size = {}
+      local totalSize = 1
+      for i = 1, ndimInput do
+         size[i] = torch.random(25)
+         totalSize = totalSize * size[i]
+      end
+      return size, totalSize
+   end
+   local inputSize, nElem = randomSizeGenerator(ndimInput)
+   local attemptsAtSizeGeneration = 1
+   while nElem > maxSize do
+      attemptsAtSizeGeneration = attemptsAtSizeGeneration + 1
+      -- make at most 100 attempts to generate sizes randomly.
+      -- this guarantees that even in the worst case,
+      -- this test does not run forever
+      if attemptsAtSizeGeneration == 100 then
+         inputSize = {1, 10, 100}
+         break
+      end
+      inputSize, nElem = randomSizeGenerator(ndimInput)
+   end
+
+   -- http://rosettacode.org/wiki/Prime_decomposition#Lua
+   local function IsPrime(n)
+      if n <= 1 or (n ~= 2 and n % 2 == 0) then return false end
+      for i = 3, math.sqrt(n), 2 do if n % i == 0 then return false end end
+      return true
+   end
+   local function PrimeDecomposition(n)
+      local f = {}
+      if IsPrime(n) then f[1] = n; return f end
+      local i = 2
+      repeat
+         while n % i == 0 do f[#f + 1] = i; n = n / i end
+         repeat i = i + 1 until IsPrime( i )
+      until n == 1
+      return f
+   end
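+   -- constructOutput: starting from the input size, repeatedly move a prime factor
+   -- of one dimension into another dimension, or split it off as a new dimension,
+   -- so the output shape changes while the total element count stays the same.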
+   local function constructOutput(size)
+      local outputSize = {}
+      for i = 1, #size do outputSize[i] = size[i] end
+      for i = 1, 10 do -- 10 randomizations
+         -- pick an input dim
+         local dim = torch.random(1, #size)
+         -- factor it
+         local factors = PrimeDecomposition(outputSize[dim])
+         if #factors ~= 0 then
+            -- remove one of the factors
+            local factor = factors[torch.random(#factors)]
+            local addNewDim = torch.random(1, 2)
+            if addNewDim == 1 then -- add it as a new dimension
+               outputSize[dim] = outputSize[dim] / factor
+               -- where to insert new dimension
+               local where = torch.random(1, #outputSize)
+               local o = {}
+               o[where] = factor
+               local index = 1
+               for j = 1, #outputSize + 1 do
+                  if j == where then
+                     o[j] = factor
+                  else
+                     o[j] = outputSize[index]
+                     index = index + 1
+                  end
+               end
+               outputSize = o
+            else -- or multiply the factor to another dimension
+               local where = torch.random(1, #outputSize)
+               outputSize[dim] = outputSize[dim] / factor
+               outputSize[where] = outputSize[where] * factor
+            end
+         end
+      end
+      return outputSize
+   end
+   local outputSize = constructOutput(inputSize)
+   local nelem1 = 1
+   local nelem2 = 1
+   for i = 1, #inputSize do nelem1 = nelem1 * inputSize[i] end
+   for i = 1, #outputSize do nelem2 = nelem2 * outputSize[i] end
+   tester:asserteq(nelem1, nelem2, 'input and output sizes have to be the same')
+   local input, output
+
+   -- extract a sub-cube with probability 50%
+   -- (to introduce unreachable storage locations)
+   local holedInput = torch.random(1, 2)
+   local holedOutput = torch.random(1, 2)
+   if holedInput == 1 then
+      input = createHoledTensorWithSizes(inputSize)
+   else
+      input = torch.FloatTensor(torch.LongStorage(inputSize))
+   end
+   input:storage():fill(-150)
+   input:copy(torch.linspace(1, input:nElement(), input:nElement()))
+
+   if holedOutput == 1 then
+      output = createHoledTensorWithSizes(outputSize)
+   else
+      output = torch.FloatTensor(torch.LongStorage(outputSize))
+   end
+
+   output:storage():fill(-100)
+   output:fill(-1)
+   -- function to randomly transpose a tensor
+   local function randomlyTranspose(input)
+      local d1 = torch.random(1, input:dim())
+      local d2 = torch.random(1, input:dim())
+      if d1 ~= d2 then input = input:transpose(d1, d2) end
+      return input
+   end
+   -- randomly transpose with 50% prob
+   local transposeInput = torch.random(1, 2)
+   local transposeOutput = torch.random(1, 2)
+   if transposeInput == 1 then
+      for i = 1, 10 do input = randomlyTranspose(input) end
+   end
+   if transposeOutput == 1 then
+      for i = 1, 10 do output = randomlyTranspose(output) end
+   end
+
+   local input_tensor_float = input
+   local output_tensor_float = output
+   local input_storage_float = input:storage()
+   local output_storage_float = output:storage()
+   local input_storage_cuda =
+      torch.CudaStorage(input_storage_float:size()):copy(input_storage_float)
+   local output_storage_cuda =
+      torch.CudaStorage(output_storage_float:size()):copy(output_storage_float)
+
+   -- Also test cross-device copy behavior, if multiple devices are available.
+   local input_device = chooseInt(1, cutorch.getDeviceCount())
+   local output_device = chooseInt(1, cutorch.getDeviceCount())
+
+   -- Selectively disable p2p access to test that codepath as well
+   local access_disabled = false
+   if input_device ~= output_device and chooseInt(1, 2) == 1 then
+      -- p2p access between this pair of devices might not be available at all
+      if cutorch.getPeerToPeerAccess(output_device, input_device) then
+         access_disabled = true
+         cutorch.setPeerToPeerAccess(output_device, input_device, false)
+      end
+   end
+
+   local prev_device = cutorch.getDevice()
+
+   cutorch.setDevice(input_device)
+   local input_tensor_cuda = torch.CudaTensor(input_storage_cuda,
+                                          input_tensor_float:storageOffset(),
+                                          input_tensor_float:size(),
+                                          input_tensor_float:stride())
+
+   cutorch.setDevice(output_device)
+   local output_tensor_cuda = torch.CudaTensor(output_storage_cuda,
+                                          output_tensor_float:storageOffset(),
+                                          output_tensor_float:size(),
+                                          output_tensor_float:stride())
+
+   cutorch.setDevice(prev_device)
+
+   output_tensor_float:copy(input_tensor_float)
+   output_tensor_cuda:copy(input_tensor_cuda)
+
+   if access_disabled then
+      cutorch.setPeerToPeerAccess(output_device, input_device, true)
+   end
+
+   -- now compare output_storage_cuda and output_storage_float for exactness
+   local flat_tensor_float = torch.FloatTensor(output_storage_float)
+   local flat_storage_cuda =
+      torch.FloatStorage(output_storage_cuda:size()):copy(output_storage_cuda)
+   local flat_tensor_cuda = torch.FloatTensor(flat_storage_cuda)
+
+   local err = (flat_tensor_float - flat_tensor_cuda):abs():max()
+   if err ~= 0 then
+      print('copyRandomizedTest failure input size: ', input:size())
+      print('copyRandomizedTest failure input stride: ', input:stride())
+      print('copyRandomizedTest failure output size: ', output:size())
+      print('copyRandomizedTest failure output stride: ', output:stride())
+   end
+   tester:assert(err == 0, 'diverging input and output in copy test')
+end
+
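+-- Copies involving non-contiguous views (expanded, sliced or transposed tensors),
+-- compared between the CPU and CUDA implementations.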
+function test.copyNoncontiguous()
+   local x = torch.FloatTensor():rand(1, 1)
+   local f = function(src)
+      return src.new(2, 2):copy(src:expand(2, 2))
+   end
+   compareFloatAndCuda(x, f)
+
+   local sz = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz, 1)
+   local f = function(src)
+      return src.new(sz, sz):copy(src:expand(sz, sz))
+   end
+   compareFloatAndCuda(x, f)
+
+   x = torch.FloatTensor():rand(sz, sz, 2)
+   local f = function(src)
+      return src.new(sz, sz):copy(src[{{},{},{2}}])
+   end
+   compareFloatAndCuda(x, f)
+
+   x = torch.FloatTensor():rand(2, sz, sz)
+   local f = function(src)
+      return src.new(sz, sz):copy(src[{{2},{},{}}])
+   end
+   compareFloatAndCuda(x, f)
+
+   x = torch.FloatTensor():rand(sz, 2, sz)
+   local f = function(src)
+      return src.new(sz, sz):copy(src[{{},{2},{}}])
+   end
+   compareFloatAndCuda(x, f)
+
+   x = torch.FloatTensor():rand(sz, 2, sz)
+   local f = function(src)
+      return src.new(sz, 1, sz):copy(src[{{},{2},{}}])
+   end
+   compareFloatAndCuda(x, f)
+
+   x = torch.FloatTensor():rand(sz, sz):transpose(1,2)
+   local f = function(src)
+      return src.new(sz, sz):copy(src)
+   end
+   compareFloatAndCuda(x, f)
+
+   -- case for https://github.com/torch/cutorch/issues/90
+   do
+      local val = 1
+      local ps = torch.LongStorage({4, 4, 4})
+      local cube = torch.Tensor(ps):apply(
+         function()
+            val = val + 1
+            return val
+         end
+                                     ):cuda()
+
+      local ps = torch.LongStorage({4, 12})
+      local x = torch.CudaTensor(ps):fill(-1)
+
+      local l = 2
+      local h = 1
+      local w = 2
+
+      x[{{1},{1,9}}]:copy(cube[l][{{h,h+2},{w,w+2}}])
+      tester:assert((x[{1,{1,9}}]-cube[l][{{h,h+2},{w,w+2}}]):abs():max() == 0,
+         'diverging input and output in copy test')
+   end
+end
+
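+-- copyAsync works with page-locked (pinned) host memory, which is what the
+-- cutorch.createCudaHost*Tensor helpers allocate; the stream is synchronized before
+-- the host and device contents are compared.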
+function test.copyAsync()
+   local sz = chooseInt(maxsize, 2 * maxsize)
+   local host_tensors = {
+     cutorch.createCudaHostTensor(sz),
+     cutorch.createCudaHostDoubleTensor(sz)
+   }
+   if cutorch.hasHalf then
+     table.insert(host_tensors, cutorch.createCudaHostHalfTensor(sz))
+   end
+   for k,host_tensor in ipairs(host_tensors) do
+      local device_type = t2gpu[torch.type(host_tensor)]:match(('torch.(%a+)'))
+      if torch.type(host_tensor) ~= 'torch.HalfTensor' then
+         host_tensor = host_tensor:uniform()
+      else
+         -- HalfTensor doesn't have math functions defined.
+         local copy_tensor = torch[device_type](sz):uniform()
+         host_tensor:copy(copy_tensor)
+      end
+      local device_tensor = torch[device_type](sz)
+      device_tensor:copyAsync(host_tensor)
+      cutorch.streamSynchronize(cutorch.getStream())
+      tester:assertTensorEq(host_tensor:double(), device_tensor:double(), 0,
+                            "Async copy to device failed.")
+
+      device_tensor:uniform()
+      host_tensor:copyAsync(device_tensor)
+      cutorch.streamSynchronize(cutorch.getStream())
+      tester:assertTensorEq(device_tensor:double(), host_tensor:double(), 0,
+                            "Async copy to host failed.")
+   end
+end
+
+function test.largeNoncontiguous()
+   local x = torch.FloatTensor():randn(20, 1, 60, 60)
+   local sz = chooseInt(maxsize, 2 * maxsize)
+   local f = function(src)
+      return src.new(20, sz, 60, 60):copy(src:expand(20, sz, 60, 60))
+   end
+   compareFloatAndCuda(x, f)
+end
+
+function test.zero()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   for k, typename in ipairs(typenames) do
+       local x = x:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'zero')
+   end
+   checkMultiDevice(x, 'zero')
+end
+
+function test.fill()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local v = torch.uniform()
+   for k, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'fill', v)
+   end
+   checkMultiDevice(x, 'fill', v)
+end
+
+function test.reshape()
+   local sz1 = chooseInt(minsize, maxsize)*2
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   for k, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'reshape', sz1/2, sz2*2)
+   end
+   checkMultiDevice(x, 'reshape', sz1/2, sz2*2)
+end
+
+function test.zeros()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local t = torch.getdefaulttensortype()
+   torch.setdefaulttensortype('torch.CudaTensor')
+   local x = torch.zeros(sz1, sz2)
+   assert(x:sum() == 0)
+   torch.setdefaulttensortype(t)
+end
+
+function test.ones()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local t = torch.getdefaulttensortype()
+   torch.setdefaulttensortype('torch.CudaTensor')
+   local x = torch.ones(sz1, sz2)
+   assert(x:sum() == x:nElement())
+   torch.setdefaulttensortype(t)
+end
+
+
+function test.add()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor():rand(sz1, sz2)
+   local z = torch.FloatTensor():rand(sz1, sz2)
+   local v = torch.uniform()
+   for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
+      local x, y, z = x:type(ctype), y:type(ctype), z:type(ctype)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'add', z)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'add', z, v)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'add', y, z)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'add', y, v, z)
+   end
+   checkMultiDevice(x, 'add', z)
+   checkMultiDevice(x, 'add', z, v)
+   checkMultiDevice(x, 'add', y, z)
+   checkMultiDevice(x, 'add', y, v, z)
+end
+
+function test.csub()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor():rand(sz1, sz2)
+   local z = torch.FloatTensor():rand(sz1, sz2)
+   local v = torch.uniform()
+   for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
+      local x, y, z = x:type(ctype), y:type(ctype), z:type(ctype)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'csub', z)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'csub', z, v)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'csub', y, z)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'csub', y, v, z)
+   end
+   checkMultiDevice(x, 'csub', z)
+   checkMultiDevice(x, 'csub', z, v)
+   checkMultiDevice(x, 'csub', y, z)
+   checkMultiDevice(x, 'csub', y, v, z)
+end
+
+function test.cmul()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor():rand(sz1, sz2)
+   for k, typename in ipairs(typenames) do
+       local ctype = t2cpu[typename]
+       local x, y = x:type(ctype), y:type(ctype)
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'cmul', y)
+   end
+   checkMultiDevice(x, 'cmul', y)
+end
+
+function test.cpow()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor():rand(sz1, sz2)
+   for k, typename in ipairs(typenames) do
+       local ctype = t2cpu[typename]
+       local x, y = x:type(ctype), y:type(ctype)
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'cpow', y)
+   end
+   checkMultiDevice(x, 'cpow', y)
+end
+
+function test.cremainder()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor(sz1, sz2):uniform(-50, 50)
+   local y = torch.FloatTensor(sz1, sz2):uniform(-50, 50)
+   for k, typename in ipairs(typenames) do
+       local ctype = t2cpu[typename]
+       local a, b = x:type(ctype), y:type(ctype)
+       if not isFloat(typename) then
+           b[b:eq(0)] = 1
+       end
+       compareCPUAndCUDATypeTensorArgs(typename, nil, a, 'cremainder', b)
+   end
+   checkMultiDevice(x, 'cremainder', y)
+
+   -- ensure we test divide by zero
+   local x = torch.FloatTensor(1):fill(1)
+   local y = torch.FloatTensor(1):zero()
+   for k, typename in ipairs(float_typenames) do
+       local ctype = t2cpu[typename]
+       local a, b = x:type(ctype), y:type(ctype)
+       compareCPUAndCUDATypeTensorArgs(typename, nil, a, 'cremainder', b)
+   end
+   checkMultiDevice(x, 'cremainder', y)
+end
+
+function test.cfmod()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor(sz1, sz2):uniform(-50, 50)
+   local y = torch.FloatTensor(sz1, sz2):uniform(-50, 50)
+   for k, typename in ipairs(typenames) do
+       local ctype = t2cpu[typename]
+       local a, b = x:type(ctype), y:type(ctype)
+       if not isFloat(typename) then
+           b[b:eq(0)] = 1
+       end
+       compareCPUAndCUDATypeTensorArgs(typename, nil, a, 'cfmod', b)
+   end
+   checkMultiDevice(x, 'cfmod', y)
+
+   -- ensure we test mod by zero
+   local x = torch.FloatTensor(1):fill(1)
+   local y = torch.FloatTensor(1):zero()
+   for k, typename in ipairs(float_typenames) do
+       local ctype = t2cpu[typename]
+       local a, b = x:type(ctype), y:type(ctype)
+       compareCPUAndCUDATypeTensorArgs(typename, nil, a, 'cfmod', b)
+   end
+   checkMultiDevice(x, 'cfmod', y)
+end
+
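+-- Build a tensor with a random number of dimensions (each extra dimension is added
+-- with probability 2/3), zero out roughly half of its entries with a random mask,
+-- and compare the indices returned by nonzero() on CPU and CUDA.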
+function test.nonzero()
+    local minsize = 10
+    local maxsize = 20
+    local dims = {chooseInt(minsize, maxsize)}
+    local threshold = 1 / 3
+    local flip = math.random()
+    while flip > threshold do
+        dims[#dims + 1] = chooseInt(minsize, maxsize)
+        flip = math.random()
+    end
+    local x = createTestTensorWithSizes(true, true, dims)
+    local randMask = torch.ByteTensor(unpack(dims)):bernoulli()
+    x:maskedFill(randMask, 0)
+    for k, typename in ipairs(typenames) do
+        local ctype = t2cpu[typename]
+        local x = x:type(ctype)
+        compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'nonzero')
+    end
+    checkMultiDevice(x, 'nonzero')
+end
+
+function test.cdiv()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor():rand(sz1, sz2)
+   compareFloatAndCudaTensorArgs(x, 'cdiv', y)
+   checkMultiDevice(x, 'cdiv', y)
+end
+
+function test.cdiv3()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor():rand(sz1, sz2)
+   local z = torch.FloatTensor(sz1, sz2)
+   compareFloatAndCudaTensorArgs(z, 'cdiv', x, y)
+   checkMultiDevice(z, 'cdiv', x, y)
+end
+
+function test.addcmul()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor():rand(sz1, sz2)
+   local z = torch.FloatTensor():rand(sz1, sz2)
+
+   for _, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      local y = y:type(t2cpu[typename])
+      local z = z:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'addcmul', y, z)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'addcmul', torch.uniform(), y, z)
+   end
+
+   checkMultiDevice(x, 'addcmul', y, z)
+   checkMultiDevice(x, 'addcmul', torch.uniform(), y, z)
+
+   local r = torch.zeros(sz1, sz2)
+   for _, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      local y = y:type(t2cpu[typename])
+      local z = z:type(t2cpu[typename])
+      local r = r:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, r, 'addcmul', x, y, z)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, r, 'addcmul', x, torch.uniform(), y, z)
+   end
+
+   checkMultiDevice(r, 'addcmul', x, y, z)
+   checkMultiDevice(r, 'addcmul', x, torch.uniform(), y, z)
+
+end
+
+function test.addcdiv()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   -- add so no divide by zero
+   local x = torch.FloatTensor():rand(sz1, sz2):add(torch.random(1, 5))
+   local y = torch.FloatTensor():rand(sz1, sz2):add(torch.random(1, 5))
+   local z = torch.FloatTensor():rand(sz1, sz2):add(torch.random(1, 5))
+
+   for _, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      local y = y:type(t2cpu[typename])
+      local z = z:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'addcdiv', y, z)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'addcdiv', torch.uniform(), y, z)
+   end
+
+   checkMultiDevice(x, 'addcdiv', y, z)
+   checkMultiDevice(x, 'addcdiv', torch.uniform(), y, z)
+
+   local r = torch.zeros(sz1, sz2)
+   for _, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      local y = y:type(t2cpu[typename])
+      local z = z:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, r, 'addcdiv', x, y, z)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, r, 'addcdiv', x, torch.uniform(), y, z)
+   end
+
+   checkMultiDevice(r, 'addcdiv', x, y, z)
+   checkMultiDevice(r, 'addcdiv', x, torch.uniform(), y, z)
+end
+
+function test.fmod()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():randn(sz1, sz2)
+   x:apply(function(x)
+       x = x * torch.random(1, 100)
+       return x
+   end)
+   local r = torch.normal(0, 25)
+
+   for _, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'fmod', r)
+   end
+end
+
+function test.remainder()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():randn(sz1, sz2)
+   x:apply(function(x)
+       x = x * torch.random(1, 100)
+       return x
+   end)
+   local r = torch.normal(0, 25)
+
+   for _, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'remainder', r)
+   end
+end
+
+function test.equal()
+    -- empty tensors are equal
+    local x = torch.FloatTensor()
+    local y = torch.FloatTensor()
+
+    for _, typename in ipairs(typenames) do
+        local a = x:type(typename)
+        local b = y:type(typename)
+        tester:assert(a:equal(b), 'Empty Tensors should be considered equal')
+    end
+
+    -- mismatched size tensors are not equal
+    local x = torch.FloatTensor(5):fill(1)
+    local y = torch.FloatTensor(3):fill(1)
+
+    for _, typename in ipairs(typenames) do
+        local a = x:type(typename)
+        local b = y:type(typename)
+        tester:assert(not a:equal(b), 'Tensors of different sizes not equal')
+    end
+
+    -- tensors of same size but different value are not equal
+    local sz1 = chooseInt(minsize, maxsize)
+    local sz2 = chooseInt(minsize, maxsize)
+    local x = torch.FloatTensor(sz1, sz2):apply(function() return torch.random(0, 255) end)
+    local y = torch.add(x, 1)
+
+    for _, typename in ipairs(typenames) do
+        local a = x:type(typename)
+        local b = y:type(typename)
+        tester:assert(not a:equal(b), 'Tensors should not be equal')
+    end
+
+    -- actual equality
+    for _, typename in ipairs(typenames) do
+        local a = x:type(typename)
+        local b = x:type(typename)
+        tester:assert(a:equal(b), 'Tensors should be equal')
+    end
+end
+
+function test.logicalValue()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor():rand(sz1, sz2)
+   compareFloatAndCudaTensorArgs(x, 'gt', y, 0.3)
+   compareFloatAndCuda(x, 'gt', 0.3)
+   checkMultiDevice(x, 'gt', y, 0.3)
+   checkMultiDevice(x, 'gt', 0.3)
+end
+
+function test.logicalTensor()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor():rand(sz1, sz2)
+   local z = torch.FloatTensor():rand(sz1, sz2)
+   compareFloatAndCudaTensorArgs(x, 'gt', z)
+   compareFloatAndCudaTensorArgs(x, 'gt', y, z)
+   checkMultiDevice(x, 'gt', z)
+   checkMultiDevice(x, 'gt', y, z)
+end
+
+function test.mean()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   for k, typename in ipairs(typenames) do
+     local x = x:type(t2cpu[typename])
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'mean')
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'mean', 1)
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'mean', 2)
+   end
+   checkMultiDevice(x, 'mean')
+   checkMultiDevice(x, 'mean', 1)
+end
+
+function test.max()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.randperm(sz1 * sz2):view(sz1, sz2):float()
+   for k, typename in ipairs(typenames) do
+      local x_
+      if typename == 'torch.CudaByteTensor' or typename == 'torch.CudaCharTensor'
+      or typename == 'torch.CudaShortTensor' then
+         -- keep the tensor small so its values fit in the narrow integer type;
+         -- duplicate values would make the returned argmax indices ambiguous
+         local sz1 = chooseInt(1, 10)
+         local sz2 = chooseInt(1, 10)
+         x_ = torch.randperm(sz1 * sz2):view(sz1, sz2)
+      else
+         x_ = x:type(t2cpu[typename])
+      end
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x_, 'max')
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x_, 'max', 1)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x_, 'max', 2)
+   end
+   checkMultiDevice(x, 'max')
+   checkMultiDevice(x, 'max', 1)
+end
+
+function test.min()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.randperm(sz1 * sz2):view(sz1, sz2):float()
+   for k, typename in ipairs(typenames) do
+      local x_
+      if typename == 'torch.CudaByteTensor' or typename == 'torch.CudaCharTensor'
+      or typename == 'torch.CudaShortTensor' then
+         -- keep the tensor small so its values fit in the narrow integer type;
+         -- duplicate values would make the returned argmin indices ambiguous
+         local sz1 = chooseInt(1, 10)
+         local sz2 = chooseInt(1, 10)
+         x_ = torch.randperm(sz1 * sz2):view(sz1, sz2)
+      else
+         x_ = x:type(t2cpu[typename])
+      end
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x_, 'min')
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x_, 'min', 1)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x_, 'min', 2)
+   end
+   checkMultiDevice(x, 'min')
+   checkMultiDevice(x, 'min', 1)
+end
+
+function test.cmax()
+  local sz1 = chooseInt(minsize, maxsize)
+  local sz2 = chooseInt(minsize, maxsize)
+  local a = torch.FloatTensor(sz1, sz2):uniform()
+  local b = torch.FloatTensor(sz1, sz2):uniform()
+  local c = torch.FloatTensor(sz1, sz2):zero()
+  local v = torch.uniform()
+
+  for _, typename in ipairs(typenames) do
+      local a = a:type(t2cpu[typename])
+      local b = b:type(t2cpu[typename])
+      local c = c:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, c, 'cmax', a, b)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, c, 'cmax', a, v)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, a, 'cmax', b)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, a, 'cmax', v)
+  end
+
+  checkMultiDevice(c, 'cmax', a, b)
+  checkMultiDevice(c, 'cmax', a, v)
+  checkMultiDevice(a, 'cmax', b)
+  checkMultiDevice(a, 'cmax', v)
+end
+
+function test.cmin()
+  local sz1 = chooseInt(minsize, maxsize)
+  local sz2 = chooseInt(minsize, maxsize)
+  local a = torch.FloatTensor(sz1, sz2):uniform()
+  local b = torch.FloatTensor(sz1, sz2):uniform()
+  local c = torch.FloatTensor(sz1, sz2):zero()
+  local v = torch.uniform()
+
+  for _, typename in ipairs(typenames) do
+      local a = a:type(t2cpu[typename])
+      local b = b:type(t2cpu[typename])
+      local c = c:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, c, 'cmin', a, b)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, c, 'cmin', a, v)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, a, 'cmin', b)
+      compareCPUAndCUDATypeTensorArgs(typename, nil, a, 'cmin', v)
+  end
+
+  checkMultiDevice(c, 'cmin', a, b)
+  checkMultiDevice(c, 'cmin', a, v)
+  checkMultiDevice(a, 'cmin', b)
+  checkMultiDevice(a, 'cmin', v)
+end
+
+function test.allAndAny()
+   for tries = 1, 10 do
+      local size1 = chooseInt(10, 100)
+      local t = nil
+      if torch.uniform(0, 1) > 0.5 then
+         t = torch.CudaByteTensor(size1):fill(1)
+      else
+         local size2 = chooseInt(10, 100)
+         t = torch.CudaByteTensor(size1, size2):fill(1)
+
+         if torch.uniform(0, 1) > 0.5 then
+            t = t:transpose(1, 2)
+         end
+      end
+
+      tester:assert(t:all(), 'error in all()')
+      tester:assert(t:any(), 'error in any()')
+
+      if t:dim() == 1 then
+         t[chooseInt(1, t:size()[1])] = 0
+      else
+         t[chooseInt(1, t:size()[1])][chooseInt(1, t:size()[2])] = 0
+      end
+
+      tester:assert(not t:all(), 'error in all()')
+      tester:assert(t:any(), 'error in any()')
+
+      t:zero()
+      tester:assert(not t:all(), 'error in all()')
+      tester:assert(not t:any(), 'error in any()')
+   end
+end
+
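+-- Summing many floats accumulates rounding error, so this test uses small sizes
+-- (10-20 per dimension) and temporarily relaxes the comparison tolerance.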
+function test.sum()
+   local minsize = 10
+   local maxsize = 20
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   test_tolerance = 1e-1
+   compareFloatAndCuda(x, 'sum')
+   compareFloatAndCuda(x, 'sum', 1)
+   compareFloatAndCuda(x, 'sum', 2)
+   test_tolerance = 1e-5
+   checkMultiDevice(x, 'sum')
+   checkMultiDevice(x, 'sum', 1)
+end
+
+function test.cumsum()
+   local minsize = 10
+   local maxsize = 20
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   for _, typename in ipairs(typenames) do
+       local x = x:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'cumsum');
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'cumsum', 1);
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'cumsum', 2);
+   end
+   checkMultiDevice(x, 'cumsum')
+   checkMultiDevice(x, 'cumsum', 1)
+end
+
+function test.prod()
+   local minsize = 10
+   local maxsize = 20
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   compareFloatAndCuda(x, 'prod')
+   compareFloatAndCuda(x, 'prod', 1)
+   compareFloatAndCuda(x, 'prod', 2)
+   checkMultiDevice(x, 'prod')
+   checkMultiDevice(x, 'prod', 1)
+end
+
+function test.cumprod()
+   local minsize = 10
+   local maxsize = 20
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   for _, typename in ipairs(typenames) do
+       local x = x:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'cumprod');
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'cumprod', 1);
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'cumprod', 2);
+   end
+   checkMultiDevice(x, 'cumprod')
+   checkMultiDevice(x, 'cumprod', 1)
+end
+
+function test.var()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+
+   for _, typename in ipairs(float_typenames) do
+     local x = x:type(t2cpu[typename])
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'var')
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'var', 1, true)
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'var', 1, false)
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'var', 2, true)
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'var', 2, false)
+   end
+
+   checkMultiDevice(x, 'var')
+   checkMultiDevice(x, 'var', 1)
+end
+
+function test.std()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+
+   for _, typename in ipairs(float_typenames) do
+     local x = x:type(t2cpu[typename])
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'std')
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'std', 1, true)
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'std', 1, false)
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'std', 2, true)
+     compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'std', 2, false)
+   end
+
+   checkMultiDevice(x, 'std')
+   checkMultiDevice(x, 'std', 1)
+end
+
+function test.diag()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local k = chooseInt(-minsize, minsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   for _, typename in ipairs(float_typenames) do
+       local x = x:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'diag')
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'diag', k)
+   end
+   checkMultiDevice(x, 'diag')
+   checkMultiDevice(x, 'diag', k)
+
+   local y = torch.FloatTensor():rand(sz1)
+   for _, typename in ipairs(float_typenames) do
+       local y = y:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, nil, y, 'diag')
+       compareCPUAndCUDATypeTensorArgs(typename, nil, y, 'diag', k)
+   end
+   checkMultiDevice(y, 'diag')
+   checkMultiDevice(y, 'diag', k)
+
+   -- test non-contiguous cases
+   local x1 = createTestTensorWithSizes(true, true, {sz1, sz2});
+   for _, typename in ipairs(float_typenames) do
+       local x1 = x1:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x1, 'diag')
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x1, 'diag', k)
+   end
+   checkMultiDevice(x1, 'diag')
+   checkMultiDevice(x1, 'diag', k)
+
+   local y1 = createTestTensorWithSizes(true, true, {sz1});
+   for _, typename in ipairs(float_typenames) do
+       local y1 = y1:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, nil, y1, 'diag')
+       compareCPUAndCUDATypeTensorArgs(typename, nil, y1, 'diag', k)
+   end
+   checkMultiDevice(y1, 'diag')
+   checkMultiDevice(y1, 'diag', k)
+end
+
+function test.trace()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   for _, typename in ipairs(float_typenames) do
+       local x = x:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'trace')
+   end
+   checkMultiDevice(x, 'trace')
+end
+
+function test.tril()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   for _, typename in ipairs(float_typenames) do
+       local x = x:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'tril')
+   end
+   checkMultiDevice(x, 'tril')
+end
+
+function test.triu()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   for _, typename in ipairs(float_typenames) do
+       local x = x:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'triu')
+   end
+   checkMultiDevice(x, 'triu')
+end
+
+-- Test element-wise unary operators with both one and two arguments.
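+-- Each operator is described by {name, min, max}: the method name and the uniform
+-- range its inputs are drawn from. testUnary1 checks the in-place form x:op(),
+-- testUnary2 the out-of-place form y:op(x); both return a closure that is
+-- registered in the test table below.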
+local function testUnary1(fnp, types, tensor)
+   local fn = fnp[1]
+   local min = fnp[2]
+   local max = fnp[3]
+   local function test()
+      local sz1 = chooseInt(minsize, maxsize)
+      local sz2 = chooseInt(minsize, maxsize)
+      local x = tensor or torch.DoubleTensor(sz1, sz2):uniform(min, max)
+      for k, typename in ipairs(types or float_typenames) do
+         local x = x:type(t2cpu[typename]):clone()
+         compareCPUAndCUDATypeTensorArgs(typename, nil, x, fn)
+      end
+   end
+   return test
+end
+
+local function testUnary2(fnp, types)
+   local fn = fnp[1]
+   local min = fnp[2]
+   local max = fnp[3]
+   local function test()
+      local sz1 = chooseInt(minsize, maxsize)
+      local sz2 = chooseInt(minsize, maxsize)
+      local x = torch.DoubleTensor(sz1, sz2):uniform(min, max)
+      local y = torch.DoubleTensor()
+      for k, typename in ipairs(types or float_typenames) do
+         local x = x:type(t2cpu[typename]):clone()
+         local y = y:type(t2cpu[typename]):clone()
+         compareCPUAndCUDATypeTensorArgs(typename, nil, y, fn, x)
+      end
+      checkMultiDevice(y, fn, x)
+   end
+   return test
+end
+
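+-- Register an in-place ("<op>1") and an out-of-place ("<op>2") test for each unary
+-- operator, restricting inputs to a range where the operator is well defined
+-- (e.g. acos on [-1, 1], log on [0.001, 2]).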
+for _,name in ipairs({
+      {"log", 0.001, 2},
+      {"log1p", -0.9, 2},
+      {"exp", -2, 2},
+      {"cos", -2, 2},
+      {"acos", -1, 1},
+      {"cosh", -2, 2},
+      {"sin", -2, 2},
+      {"asin", -1, 1},
+      {"sinh", -2, 2},
+      {"tan", -2, 2},
+      {"atan", -2, 2},
+      {"tanh", -2, 2},
+      {"sqrt", 0, 2},
+      {"neg", -100, 100},
+      {"sigmoid", -2, 2},
+      {"ceil", -100, 100},
+      {"floor", -100, 100},
+      {"frac", -100, 100},
+      {"trunc", -100, 100},
+      {"cinv", -2, 2},
+      {"round", -100, 100}}) do
+
+   test[name[1] .. "1"] = testUnary1(name)
+   test[name[1] .. "2"] = testUnary2(name)
+
+end
+
+test["abs1"] = testUnary1({"abs", -100, 100}, {'torch.CudaIntTensor',
+                                               'torch.CudaLongTensor'})
+test["abs2"] = testUnary2({"abs", -100, 100}, {'torch.CudaIntTensor',
+                                               'torch.CudaLongTensor'})
+
+
+test["sign1"] = testUnary1({"sign", -100, 100}, typenames)
+test["sign2"] = testUnary2({"sign", -100, 100}, typenames)
+test["sign3"] = testUnary1({"sign", -100, 100}, typenames, torch.ByteTensor(10):fill(0))
+
+function test.rsqrt()
+   local old_tolerance = test_tolerance
+   test_tolerance = 1E-1  -- max observed error with 500x500 tensors in 10000 runs was 0.01157
+   -- testUnary* build test closures from a {name, min, max} triple; keep the
+   -- inputs away from zero so rsqrt stays finite
+   testUnary1({'rsqrt', 0.1, 2})()
+   testUnary2({'rsqrt', 0.1, 2})()
+   test_tolerance = old_tolerance
+end
+
+function test.atan2()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor():rand(sz1, sz2)
+   local z = torch.FloatTensor()
+   compareFloatAndCudaTensorArgs(z, 'atan2', x, y)
+   checkMultiDevice(z, 'atan2', x, y)
+end
+
+function test.lerp()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor():rand(sz1, sz2)
+   local w = math.random()
+   local z = torch.FloatTensor()
+   for _, typename in ipairs(float_typenames) do
+       local x = x:type(t2cpu[typename])
+       local y = y:type(t2cpu[typename])
+       local z = z:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, nil, z, 'lerp', x, y, w)
+   end
+   checkMultiDevice(z, 'lerp', x, y, w)
+end
+
+function test.pow1()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local pow = torch.uniform(minvalue,maxvalue)
+   for k, typename in ipairs(float_typenames) do
+       local ctype = t2cpu[typename]
+       local x = x:type(ctype)
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'pow', pow)
+   end
+   checkMultiDevice(x, 'pow', pow)
+end
+
+function test.pow2()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor()
+   local pow = torch.uniform(minvalue,maxvalue)
+   for k, typename in ipairs(float_typenames) do
+       local ctype = t2cpu[typename]
+       local x, y = x:type(ctype), y:type(ctype)
+       compareCPUAndCUDATypeTensorArgs(typename, nil, y, 'pow', x, pow)
+   end
+   checkMultiDevice(y, 'pow', x, pow)
+end
+
+function test.powExponentTensor()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local pow = torch.uniform(minvalue,maxvalue)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor()
+   for k, typename in ipairs(float_typenames) do
+       local ctype = t2cpu[typename]
+       local x, y = x:type(ctype), y:type(ctype)
+       compareCPUAndCUDATypeTensorArgs(typename, nil, y, 'pow', pow, x)
+   end
+   checkMultiDevice(y, 'pow', pow, x)
+end
+
+function test.clamp1()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2):mul(5):add(-2.5)
+   local min_val = -1
+   local max_val = 1
+   x[1][1] = min_val - 1
+   if sz2 >= 2 then
+     x[1][2] = max_val + 1
+   end
+   for _, typename in ipairs(typenames) do
+      if typename ~= 'torch.CudaCharTensor' and typename ~= 'torch.CudaByteTensor' then
+        local x = x:type(t2cpu[typename])
+        compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'clamp', min_val, max_val);
+      end
+   end
+   checkMultiDevice(x, 'clamp', min_val, max_val)
+end
+
+function test.clamp2()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2):mul(5):add(-2.5)
+   local min_val = -1
+   local max_val = 1
+   x[1][1] = min_val - 1
+   if sz2 >= 2 then
+     x[1][2] = max_val + 1
+   end
+   local y = torch.FloatTensor():resizeAs(x)
+   for _, typename in ipairs(typenames) do
+      if typename ~= 'torch.CudaCharTensor' and typename ~= 'torch.CudaByteTensor' then
+        local x = x:type(t2cpu[typename])
+        local y = y:type(t2cpu[typename])
+        compareCPUAndCUDATypeTensorArgs(typename, nil, y, 'clamp', x, min_val, max_val);
+      end
+   end
+   checkMultiDevice(y, 'clamp', x, min_val, max_val)
+end
+
+-- same as clamp1, clamp2 but only allow positive values
+function test.clamp3()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2):mul(5);
+   local min_val = 1
+   local max_val = 3
+   x[1][1] = min_val - 1
+   if sz2 >= 2 then
+     x[1][2] = max_val + 1
+   end
+   for _, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'clamp', min_val, max_val);
+   end
+   checkMultiDevice(x, 'clamp', min_val, max_val)
+end
+
+function test.clamp4()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2):mul(5);
+   local min_val = 1
+   local max_val = 3
+   x[1][1] = min_val - 1
+   if sz2 >= 2 then
+     x[1][2] = max_val + 1
+   end
+   local y = torch.FloatTensor():resizeAs(x)
+   for _, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      local y = y:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, y, 'clamp', x, min_val, max_val);
+   end
+   checkMultiDevice(x, 'clamp', min_val, max_val)
+end
+
+function test.index()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local sz3 = chooseInt(10, 20)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+
+   local longIndex = torch.LongTensor{chooseInt(1, sz1), chooseInt(1, sz1)}
+   local index = 1
+   for k, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, true, x, 'index',
+                                      index, longIndex)
+      if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+          compareCPUAndCUDATypeTensorArgs(typename, false, x, 'index',
+                                          index, longIndex)
+      end
+   end
+
+   index = 2
+   longIndex =  torch.LongTensor{chooseInt(1, sz2), chooseInt(1, sz2)}
+   for k, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, true, x, 'index',
+                                      index, longIndex)
+      if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+          compareCPUAndCUDATypeTensorArgs(typename, false, x, 'index',
+                                          index, longIndex)
+      end
+   end
+
+   x = torch.FloatTensor():rand(sz1)
+   index = 1
+   longIndex = torch.LongTensor{chooseInt(1, sz1), chooseInt(1, sz1)}
+   for k, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, true, x, 'index',
+                                      index, longIndex)
+      if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+          compareCPUAndCUDATypeTensorArgs(typename, false, x, 'index',
+                                          index, longIndex)
+      end
+   end
+
+   x = torch.FloatTensor():rand(sz1,sz2,sz3)
+   index = 3
+   longIndex = torch.randperm(sz3):long()
+   for k, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, true, x, 'index',
+                                      index, longIndex)
+      if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+          compareCPUAndCUDATypeTensorArgs(typename, false, x, 'index',
+                                          index, longIndex)
+      end
+   end
+
+   tester:assert(isEqual(x:cuda():index(index, longIndex:cuda()), x:index(index, longIndex)),
+      "Divergent results between CPU and CUDA for function 'index'")
+
+   checkMultiDevice(x, 'index', index, longIndex)
+end
+
+function test.indexCopy()
+   local sz1 = chooseInt(minsize, maxsize) -- dim1
+   local sz2 = chooseInt(minsize, maxsize) -- dim2
+   local x = torch.FloatTensor():rand(sz1, sz2) -- input
+
+
+   -- Case 1: 2D tensor, indexCopy over first dimension, 2 indices
+   -- choose two indices from the first dimension, i.e. [1,sz1]
+   local longIndex = torch.LongTensor{chooseInt(1, sz1), chooseInt(1, sz1)}
+   local index = 1
+   local src = torch.FloatTensor(2, sz2):uniform()
+   for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
+      local x, src = x:type(ctype), src:type(ctype)
+      compareCPUAndCUDATypeTensorArgs(typename, true, x, 'indexCopy',
+                                      index, longIndex, src)
+      if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+          compareCPUAndCUDATypeTensorArgs(typename, false, x, 'indexCopy',
+                                          index, longIndex, src)
+      end
+   end
+
+   -- Case 2: 2D tensor, indexCopy over second dimension, 2 indices
+   index = 2
+   longIndex =  torch.LongTensor{chooseInt(1, sz2), chooseInt(1, sz2)}
+   src = torch.FloatTensor(sz1, 2):uniform():cuda()
+   for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
+      local x, src = x:type(ctype), src:type(ctype)
+      compareCPUAndCUDATypeTensorArgs(typename, true, x, 'indexCopy',
+                                      index, longIndex, src)
+      if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+          compareCPUAndCUDATypeTensorArgs(typename, false, x, 'indexCopy',
+                                          index, longIndex, src)
+      end
+   end
+
+   -- Case 3: 1D tensor, indexCopy over 1st dimension, 2 indices
+   x = torch.FloatTensor():rand(sz1)
+   index = 1
+   longIndex = torch.LongTensor{chooseInt(1, sz1), chooseInt(1, sz1)}
+   src = torch.FloatTensor(2):uniform()
+   for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
+      local x, src = x:type(ctype), src:type(ctype)
+      compareCPUAndCUDATypeTensorArgs(typename, true, x, 'indexCopy',
+                                      index, longIndex, src)
+      if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+          compareCPUAndCUDATypeTensorArgs(typename, false, x, 'indexCopy',
+                                          index, longIndex, src)
+      end
+   end
+
+   tester:assert(isEqual(
+      x:cuda():indexCopy(index, longIndex:cuda(), src:cuda()),
+      x:indexCopy(index, longIndex, src)),
+      "Divergent results between CPU and CUDA for function 'indexCopy'")
+
+   checkMultiDevice(x, 'indexCopy', index, longIndex, src)
+end
+
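+-- Shared body for the indexAdd tests: `types` lists the CUDA types to exercise, and
+-- `gpu2cpu_map` optionally overrides the default CPU/GPU type pairing used for the
+-- comparison (the half-precision test below checks CudaHalf results against float).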
+local function testIndexAdd(types, gpu2cpu_map)
+   local sz1 = chooseInt(minsize, maxsize) -- dim1
+   local sz2 = chooseInt(minsize, maxsize) -- dim2
+   local x = torch.FloatTensor():rand(sz1, sz2) -- input
+
+   -- Case 1: 2D tensor, indexAdd over first dimension, 2 indices
+   -- choose two indices from the first dimension, i.e. [1,sz1]
+   local longIndex = torch.LongTensor{chooseInt(1, sz1), chooseInt(1, sz1)}
+   local index = 1
+   local src = torch.FloatTensor(2, sz2):uniform()
+
+   for k, typename in ipairs(types) do
+      local ctype = t2cpu[typename]
+      local x, src = x:type(ctype), src:type(ctype)
+      compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, true, x, 'indexAdd',
+                                              index, longIndex, src)
+      if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+          compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, false, x, 'indexAdd',
+                                                  index, longIndex, src)
+      end
+   end
+
+   -- Case 2: 2D tensor, indexAdd over second dimension, 2 indices
+   index = 2
+   longIndex =  torch.LongTensor{chooseInt(1, sz2), chooseInt(1, sz2)}
+   src = torch.FloatTensor(sz1, 2):uniform():cuda()
+   for k, typename in ipairs(types) do
+      local ctype = t2cpu[typename]
+      local x, src = x:type(ctype), src:type(ctype)
+      compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, true, x, 'indexAdd',
+                                              index, longIndex, src)
+      if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+          compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, false, x, 'indexAdd',
+                                                  index, longIndex, src)
+      end
+   end
+
+   -- Case 3: 1D tensor, indexAdd over 1st dimension, 2 indices
+   x = torch.FloatTensor():rand(sz1)
+   index = 1
+   longIndex = torch.LongTensor{chooseInt(1, sz1), chooseInt(1, sz1)}
+   src = torch.FloatTensor(2):uniform()
+   for k, typename in ipairs(types) do
+      local ctype = t2cpu[typename]
+      local x, src = x:type(ctype), src:type(ctype)
+      compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, true, x, 'indexAdd',
+                                              index, longIndex, src)
+      if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+          compareCPUAndCUDATypeTensorArgsWithConv(typename, gpu2cpu_map, false, x, 'indexAdd',
+                                                  index, longIndex, src)
+      end
+   end
+
+   tester:assert(isEqual(
+      x:cuda():indexAdd(index, longIndex:cuda(), src:cuda()),
+      x:indexAdd(index, longIndex, src)),
+      "Divergent results between CPU and CUDA for function 'indexAdd'")
+
+   checkMultiDevice(x, 'indexAdd', index, longIndex, src)
+end
+
+function test.indexAdd()
+   testIndexAdd(typenames)
+end
+
+function test.indexAddHalf()
+   -- don't have cpu versions of half, so let's compare with float.
+   -- additional divergence due to float/half:
+   -- half_digits_precision = log10(2^11) ~ 3, reserve another
+   -- digit to be safe
+   if cutorch.hasHalf then
+      local old_tolerance = test_tolerance
+      test_tolerance = test_tolerance + 1e-2;
+      local halfOnly = { 'torch.CudaHalfTensor' }
+      local halft2gpu2 = {
+        ['torch.FloatTensor'] = 'torch.CudaHalfTensor',
+        ['torch.LongTensor'] = 'torch.CudaLongTensor'
+      }
+      testIndexAdd(halfOnly, halft2gpu2)
+      test_tolerance = old_tolerance
+   end
+end
+
+function test.indexFill()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+
+   local longIndex = torch.LongTensor{chooseInt(1, sz1), chooseInt(1, sz1)}
+   local index = 1
+   local val = torch.random(10)
+   for k, typename in ipairs(typenames) do
+       local x = x:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, true, x, 'indexFill',
+                                       index, longIndex, val)
+       if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+           compareCPUAndCUDATypeTensorArgs(typename, false, x, 'indexFill',
+                                           index, longIndex, val)
+       end
+   end
+   index = 2
+   longIndex =  torch.LongTensor{chooseInt(1, sz2), chooseInt(1, sz2)}
+   val = torch.random(10)
+   for k, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, true, x, 'indexFill',
+                                      index, longIndex, val)
+      if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+          compareCPUAndCUDATypeTensorArgs(typename, false, x, 'indexFill',
+                                          index, longIndex, val)
+      end
+   end
+
+   x = torch.FloatTensor():rand(sz1)
+   index = 1
+   longIndex = torch.LongTensor{chooseInt(1, sz1), chooseInt(1, sz1)}
+   val = torch.random(10)
+   for k, typename in ipairs(typenames) do
+      local x = x:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, true, x, 'indexFill',
+                                      index, longIndex, val)
+      if typename ~= 'torch.CudaByteTensor' and typename ~= 'torch.CudaCharTensor' then
+          compareCPUAndCUDATypeTensorArgs(typename, false, x, 'indexFill',
+                                          index, longIndex, val)
+      end
+   end
+
+   tester:assert(isEqual(
+      x:cuda():indexFill(index, longIndex:cuda(), val),
+      x:indexFill(index, longIndex, val)),
+      "Divergent results between CPU and CUDA for function 'indexFill'")
+
+   checkMultiDevice(x, 'indexFill', index, longIndex, val)
+end
+
+function test.norm()
+   for n = 0, 3 do
+     local cpu = torch.FloatTensor(chooseInt(20, 50), 2):uniform(-0.5, 0.5)
+     for _, typename in ipairs(float_typenames) do
+        local x = cpu:type(t2cpu[typename])
+        compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'norm', n)
+        compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'norm', n, 1)
+        compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'norm', n, 2)
+     end
+   end
+
+   for i = 1, 5 do
+      for n = 0, 3 do
+         local cpu = torch.FloatTensor(chooseInt(20, 50), 2):uniform(-0.5, 0.5)
+
+         if torch.random(1, 2) == 1 then
+            cpu = cpu:transpose(1, 2)
+         end
+
+         compareFloatAndCuda(cpu, 'norm', n)
+         compareFloatAndCuda(cpu, 'norm', n, 1)
+         compareFloatAndCuda(cpu, 'norm', n, 2)
+      end
+   end
+end
+
+function test.renorm()
+   local x = torch.randn(10,5):float()
+   local maxnorm = x:norm(2,1):mean()
+
+   for _, typename in ipairs(float_typenames) do
+      local x = x:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'renorm', 2, 2, maxnorm)
+   end
+
+   compareFloatAndCuda(x, 'renorm', 2, 2, maxnorm)
+
+   x = torch.randn(3,4,5)
+   compareFloatAndCuda(x, 'renorm', 2, 2, maxnorm)
+
+   x = torch.randn(3,4,5)
+   compareFloatAndCuda(x, 'renorm', 3, 2, maxnorm)
+
+   x = torch.randn(3,4,5,100)
+   compareFloatAndCuda(x, 'renorm', 3, 2, maxnorm)
+
+   x = torch.randn(3,4,5,100)
+   compareFloatAndCuda(x, 'renorm', 4, 2, maxnorm)
+
+   checkMultiDevice(x, 'renorm', 4, 2, maxnorm)
+end
+
+function test.dist()
+   local minsize = 5
+   local maxsize = 10
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local x = torch.FloatTensor():rand(sz1, sz2)
+   local y = torch.FloatTensor():rand(sz1, sz2)
+   for _, typename in ipairs(float_typenames) do
+       local x = x:type(t2cpu[typename])
+       local y = y:type(t2cpu[typename])
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'dist', y)
+       compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'dist', y, 3)
+   end
+   checkMultiDevice(x, 'dist', y)
+end
+
+function test.indexCopy2()
+   for tries = 1, 5 do
+      local t = createTestTensor(1000000)
+      local selectdim = chooseInt(1, t:nDimension())
+      local indices = torch.randperm(t:size(selectdim)):long()
+
+      compareFloatAndCudaTensorArgs(
+          t, 'indexCopy', selectdim, indices, t:clone())
+   end
+end
+
+function test.indexAdd2()
+   for tries = 1, 5 do
+      local t = createTestTensor(1000000)
+      local selectdim = chooseInt(1, t:nDimension())
+      local indices = torch.randperm(t:size(selectdim)):long()
+
+      compareFloatAndCudaTensorArgs(
+          t, 'indexAdd', selectdim, indices, t:clone())
+   end
+end
+
+function test.indexFill2()
+   for tries = 1, 5 do
+      local t = createTestTensor(1000000)
+      local selectdim = chooseInt(1, t:nDimension())
+      local numIndices = chooseInt(1, t:size(selectdim))
+      local indices = torch.randperm(numIndices):long()
+
+      compareFloatAndCuda(t, 'indexFill', selectdim, indices, 1)
+   end
+end
+
+function test.indexSelect2()
+   for tries = 1, 5 do
+      local t = createTestTensor(1000000)
+      local selectdim = chooseInt(1, t:nDimension())
+      local numIndices = chooseInt(1, t:size(selectdim))
+      local indices = torch.randperm(numIndices):long()
+
+      compareFloatAndCuda(t, 'index', selectdim, indices)
+   end
+end
+
+function test.cross()
+   -- Test finding the first non-zero dimension
+   local x = torch.FloatTensor():randn(4,3,2,3)
+   local y = torch.FloatTensor():randn(4,3,2,3)
+   compareFloatAndCudaTensorArgs(x, 'cross', y)
+   checkMultiDevice(x, 'cross', y)
+
+   for tries = 1, 5 do
+      local nelems = 10000000
+      local ndims = chooseInt(1, 10)
+      local crossdim = chooseInt(1, ndims)
+      local sizes = {}
+      for i = 1, ndims do
+         sizes[i] = chooseInt(1, math.min(20, math.sqrt(nelems)))
+         nelems = nelems / sizes[i]
+      end
+      sizes[crossdim] = 3
+      local x = torch.FloatTensor():randn(unpack(sizes))
+      local y = torch.FloatTensor():randn(unpack(sizes))
+      for _, typename in ipairs(typenames) do
+         local x = x:type(t2cpu[typename])
+         local y = y:type(t2cpu[typename])
+         compareCPUAndCUDATypeTensorArgs(typename, nil, x, 'cross', y, crossdim)
+         checkMultiDevice(x, 'cross', y, crossdim)
+      end
+   end
+end
+
+function test.addmv()
+   --[[ Size ]]--
+   local sizes = {
+      {2,1},
+      {1,2},
+      {1,1},
+      {3,4},
+      {3,3},
+      {15,18},
+      {19,15}
+   }
+   local multiCheck = false
+   for _, size in pairs(sizes) do
+      local n, m = unpack(size)
+      local c = torch.zeros(n)
+      local a = torch.randn(n, m)
+      local b = torch.randn(m)
+      compareFloatAndCudaTensorArgs(c, 'addmv', torch.normal(), torch.normal(), a, b)
+      if not multiCheck then -- just check multidevice once
+         checkMultiDevice(c, 'addmv', torch.normal(), torch.normal(), a, b)
+         multiCheck = true
+      end
+   end
+end
+
+function test.mv()
+   --[[ Size ]]--
+   local sizes = {
+      {2,1},
+      {1,2},
+      {1,1},
+      {3,4},
+      {3,3},
+      {15,18},
+      {19,15}
+   }
+   local multiCheck = false
+   for _, size in pairs(sizes) do
+      local n, m = unpack(size)
+      local c = torch.zeros(n)
+      local a = torch.randn(n, m)
+      local b = torch.randn(m)
+      compareFloatAndCudaTensorArgs(c, 'mv', a, b)
+      if not multiCheck then -- just check multidevice once
+         checkMultiDevice(c, 'mv', a, b)
+         multiCheck = true
+      end
+   end
+end
+
+function test.addr()
+   --[[ Size ]]--
+   local sizes = {
+      {2,1},
+      {1,2},
+      {1,1},
+      {3,4},
+      {3,3},
+      {15,18},
+      {19,15}
+   }
+   local multiCheck = false
+   for _, size in pairs(sizes) do
+      local n, m = unpack(size)
+      local c = torch.zeros(n,m)
+      local a = torch.randn(n)
+      local b = torch.randn(m)
+      compareFloatAndCudaTensorArgs(c, 'addr', torch.normal(), a, b)
+      if not multiCheck then -- just check multidevice once
+         checkMultiDevice(c, 'addr', torch.normal(), a, b)
+         multiCheck = true
+      end
+   end
+end
+
+function test.addmm()
+   --[[ Size ]]--
+   local sizes = {
+      {16, 3, 1},
+      {1, 12, 1},
+      {24, 23, 22},
+      {1, 1, 1},
+      {1, 1, 7},
+      {12, 1, 12},
+      {10, 10, 10},
+   }
+   local multiCheck = false
+   for _, size in pairs(sizes) do
+      local n, k, m = unpack(size)
+      local c = torch.zeros(n, m)
+      local a = torch.randn(n, k)
+      local b = torch.randn(k, m)
+      compareFloatAndCudaTensorArgs(c, 'addmm', torch.normal(), torch.normal(), a, b)
+      if not multiCheck then -- just check multidevice once
+         checkMultiDevice(c, 'addmm', torch.normal(), torch.normal(), a, b)
+         multiCheck = true
+      end
+   end
+
+   -- check all zero-strided cases for the inputs;
+   -- assumes that the output tensor is not zero-strided
+   local n, k, m = 10, 10, 10
+   local function generateTensor(t,idx)
+      local tensor = torch.FloatTensor()
+      local s1,s2
+      if t == 1 then
+        s1 = n
+        s2 = m
+      elseif t == 2 then
+        s1 = n
+        s2 = k
+      else
+        s1 = k
+        s2 = m
+      end
+      if idx == 1 then
+        tensor:resize(s1,s2)
+      elseif idx == 2 then
+        tensor:resize(s1,1)
+      elseif idx == 3 then
+        tensor:resize(1,s2)
+      else
+        tensor:resize(1,1)
+      end
+      if t == 1 then
+        tensor:zero()
+      else
+        tensor:uniform()
+      end
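+      -- expanding the size-1 dimensions back to (s1, s2) yields zero strides
+      -- along those dimensions, which is the zero-stride case exercised here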
+      tensor = tensor:expand(s1,s2)
+      return tensor
+   end
+
+   for i = 1, 4*4*4 do
+      local a_idx = (i-1)%4 + 1
+      local b_idx = math.floor(((i-1)%16)/4)  + 1
+      local c_idx = 1 -- math.floor((i-1)/16) + 1
+      local c = generateTensor(1,c_idx)
+      local a = generateTensor(2,a_idx)
+      local b = generateTensor(3,b_idx)
+      compareFloatAndCudaTensorArgs(c, 'addmm', torch.normal(), torch.normal(), a, b)
+   end
+end
+
+function test.mm()
+   --[[ Size ]]--
+   local sizes = {
+      {16, 3, 1},
+      {1, 12, 1},
+      {24, 23, 22},
+      {1, 1, 1},
+      {1, 1, 7},
+      {12, 1, 12},
+      {10, 10, 10},
+   }
+   local multiCheck = false
+   for _, size in pairs(sizes) do
+      local n, k, m = unpack(size)
+      local c = torch.zeros(n, m)
+      local a = torch.randn(n, k)
+      local b = torch.randn(k, m)
+      compareFloatAndCudaTensorArgs(c, 'mm', a, b)
+      if not multiCheck then -- just check multidevice once
+         checkMultiDevice(c, 'mm', a, b)
+         multiCheck = true
+      end
+   end
+
+   -- check all zero-strided cases for the inputs;
+   -- assumes that the output tensor is not zero-strided
+   local n, k, m = 10, 10, 10
+   local function generateTensor(t,idx)
+      local tensor = torch.FloatTensor()
+      local s1,s2
+      if t == 1 then
+        s1 = n
+        s2 = m
+      elseif t == 2 then
+        s1 = n
+        s2 = k
+      else
+        s1 = k
+        s2 = m
+      end
+      if idx == 1 then
+        tensor:resize(s1,s2)
+      elseif idx == 2 then
+        tensor:resize(s1,1)
+      elseif idx == 3 then
+        tensor:resize(1,s2)
+      else
+        tensor:resize(1,1)
+      end
+      if t == 1 then
+        tensor:zero()
+      else
+        tensor:uniform()
+      end
+      tensor = tensor:expand(s1,s2)
+      return tensor
+   end
+
+   for i = 1, 4*4*4 do
+      local a_idx = (i-1)%4 + 1
+      local b_idx = math.floor(((i-1)%16)/4)  + 1
+      local c_idx = 1 -- math.floor((i-1)/16) + 1
+      local c = generateTensor(1,c_idx)
+      local a = generateTensor(2,a_idx)
+      local b = generateTensor(3,b_idx)
+      compareFloatAndCudaTensorArgs(c, 'mm', a, b)
+   end
+end
+
+function test.addbmm()
+    local sizes = {
+        {16, 3, 1, 4},
+        {1, 12, 1, 7},
+        {24, 23, 22, 21},
+        {1, 1, 1, 1},
+        {1, 1, 7, 4},
+        {12, 1, 12, 1},
+        {10, 10, 10, 10},
+    }
+    local old_tt = test_tolerance
+    test_tolerance = 1e-3
+    local multiCheck = false
+    for _, size in pairs(sizes) do
+        local b, n, k, m = unpack(size)
+        local cs = torch.randn(n, m)
+        local as = torch.randn(b, n, k)
+        local bs = torch.randn(b, k, m)
+        local beta = torch.randn(1)[1]
+        local alpha = torch.randn(1)[1]
+        compareFloatAndCudaTensorArgs(cs, 'addbmm', beta, cs, alpha, as, bs)
+        if not multiCheck then -- just check multidevice once
+            checkMultiDevice(cs, 'addbmm', as, bs)
+            multiCheck = true
+        end
+    end
+    test_tolerance = old_tt
+end
+
+function test.baddbmm()
+   local sizes = {
+      {16, 3, 1, 4},
+      {1, 12, 1, 7},
+      {24, 23, 22, 21},
+      {1, 1, 1, 1},
+      {1, 1, 7, 4},
+      {12, 1, 12, 1},
+      {10, 10, 10, 10},
+   }
+   local multiCheck = false
+   for _, size in pairs(sizes) do
+      local b, n, k, m = unpack(size)
+      local cs = torch.randn(b, n, m)
+      local as = torch.randn(b, n, k)
+      local bs = torch.randn(b, k, m)
+      compareFloatAndCudaTensorArgs(cs, 'baddbmm', as, bs)
+      if not multiCheck then -- just check multidevice once
+         checkMultiDevice(cs, 'baddbmm', as, bs)
+         multiCheck = true
+      end
+   end
+end
+
+function test.baddbmmTransposed()
+   local b, n, k, m = 16, 3, 8, 4
+   -- Can't use compareFloatAndCudaTensorArgs because the transposition will be
+   -- lost when converting the tensor to a CudaTensor.
+   local c_cpu = torch.randn(m, n, b)  -- First and last dimensions will be transposed.
+   local a_cpu = torch.randn(n, b, k)  -- First two dimensions will be transposed.
+   local b_cpu = torch.randn(b, m, k)  -- Last two dimensions will be transposed.
+
+   local c_cuda = c_cpu:cuda()
+   local a_cuda = a_cpu:cuda()
+   local b_cuda = b_cpu:cuda()
+
+   c_cpu = c_cpu:transpose(1, 3)
+   c_cuda = c_cuda:transpose(1, 3)
+   a_cpu = a_cpu:transpose(1, 2)
+   a_cuda = a_cuda:transpose(1, 2)
+   b_cpu = b_cpu:transpose(2, 3)
+   b_cuda = b_cuda:transpose(2, 3)
+
+   c_cpu:baddbmm(a_cpu, b_cpu)
+   c_cuda:baddbmm(a_cuda, b_cuda)
+
+   tester:assert(isEqual(c_cpu, c_cuda, 1e-5),
+                 string.format("Divergent results between CPU and CUDA for function 'baddbmm'"))
+end
+
+function test.bmm()
+   local sizes = {
+      {16, 3, 1, 4},
+      {1, 12, 1, 7},
+      {24, 23, 22, 21},
+      {1, 1, 1, 1},
+      {1, 1, 7, 4},
+      {12, 1, 12, 1},
+      {10, 10, 10, 10},
+   }
+   local multiCheck = false
+   for _, size in pairs(sizes) do
+      local b, n, k, m = unpack(size)
+      local cs = torch.zeros(b, n, m)
+      local as = torch.randn(b, n, k)
+      local bs = torch.randn(b, k, m)
+      compareFloatAndCudaTensorArgs(cs, 'bmm', as, bs)
+      if not multiCheck then -- just check multidevice once
+         checkMultiDevice(cs, 'bmm', as, bs)
+         multiCheck = true
+      end
+   end
+end
+
+function test.bmmTransposed()
+   local b, n, k, m = 16, 3, 8, 4
+   -- Can't use compareFloatAndCudaTensorArgs because the transposition will be
+   -- lost when converting the tensor to a CudaTensor.
+   local c_cpu = torch.zeros(b, n, m)
+   local a_cpu = torch.randn(b, k, n)  -- Last two dimensions will be transposed.
+   local b_cpu = torch.randn(m, k, b)  -- First and last dimensions will be transposed.
+
+   local c_cuda = c_cpu:cuda()
+   local a_cuda = a_cpu:cuda()
+   local b_cuda = b_cpu:cuda()
+
+   a_cpu = a_cpu:transpose(2, 3)
+   a_cuda = a_cuda:transpose(2, 3)
+   b_cpu = b_cpu:transpose(1, 3)
+   b_cuda = b_cuda:transpose(1, 3)
+
+   c_cpu:bmm(a_cpu, b_cpu)
+   c_cuda:bmm(a_cuda, b_cuda)
+
+   tester:assert(isEqual(c_cpu, c_cuda, 1e-5),
+                 string.format("Divergent results between CPU and CUDA for function 'bmm'"))
+end
+
+function test.ger()
+   --[[ Size ]]--
+   local sizes = {
+      {16, 1},
+      {1, 12},
+      {24, 23},
+      {1, 1},
+      {33, 7},
+      {12, 14},
+      {10, 10},
+   }
+   local multiCheck = false
+   for _, size in pairs(sizes) do
+      local n, m = unpack(size)
+      local c = torch.zeros(n, m)
+      local a = torch.randn(n)
+      local b = torch.randn(m)
+      compareFloatAndCudaTensorArgs(c, 'ger', a, b)
+      if not multiCheck then -- just check multidevice once
+         checkMultiDevice(c, 'ger', a, b)
+         multiCheck = true
+      end
+   end
+end
+
+function test.inverse()
+   local a = torch.eye(5):add(torch.Tensor(5, 5):uniform(-0.1, 0.1))
+   for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+       local at = a:type(typename)
+       local i1 = torch.inverse(at)
+       local i2 = torch.inverse(a:cuda())
+       tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong inverse answer")
+   end
+end
+
+if cutorch.magma then
+   function test.gesv()
+      local a = torch.Tensor(5, 5):uniform(-1, 1)
+      local b = torch.Tensor(5, 3):uniform(-1, 1)
+      for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+          local at = a:type(typename)
+          local bt = b:type(typename)
+          local rb1, ra1 = torch.gesv(bt, at)
+          local rb2, ra2 = torch.gesv(bt:cuda(), at:cuda())
+          tester:assertle((rb2 - rb1:cuda()):abs():max(), 1e-5, "wrong gesv answer")
+          tester:assertle((ra2 - ra1:cuda()):abs():max(), 1e-5, "wrong gesv answer")
+      end
+   end
+
+   function test.gels()
+      local a = torch.Tensor{
+         {-0.8862, 0.8186,  0.2334,  0.8008,  0.2377},
+         { 0.6116, 0.2242,  0.2854,  0.5427,  0.5937},
+         {-0.3716,-0.7247, -0.7658, -0.1285,  0.6749},
+         {-0.5878, 0.7596, -0.7765, -0.5373,  0.6326},
+         { 0.0868,-0.4918,  0.7771, -0.7550, -0.6020},
+      }
+      local b = torch.Tensor{
+         { 0.4807, 0.1842, 0.7908},
+         {-0.0035, 0.7557, 0.1627},
+         { 0.3495,-0.0840, 0.8164},
+         { 0.5360, 0.2048, 0.2745},
+         { 0.8535,-0.3938,-0.2140},
+      }
+      for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+          local at = a:type(typename)
+          local bt = b:type(typename)
+          local rb1, ra1 = torch.gels(bt, at)
+          local rb2, ra2 = torch.gels(bt:cuda(), at:cuda())
+          tester:assertle((rb2 - rb1:cuda()):abs():max(), 5e-4, "wrong gels answer")
+          tester:assertle((ra2 - ra1:cuda()):abs():max(), 5e-4, "wrong gels answer")
+      end
+   end
+
+   function test.symeig()
+      local a = torch.Tensor({{ 1.96,  0.00,  0.00,  0.00,  0.00},
+                              {-6.49,  3.80,  0.00,  0.00,  0.00},
+                              {-0.47, -6.39,  4.17,  0.00,  0.00},
+                              {-7.20,  1.50, -1.51,  5.70,  0.00},
+                              {-0.65, -6.34,  2.67,  1.80, -7.10}}):t()
+      for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+          local at = a:type(typename)
+          local e1,v1 = torch.symeig(at, 'V')
+          local e2,v2 = torch.symeig(at:cuda(), 'V')
+          tester:assertle((e2 - e1:cuda()):abs():max(), 1e-5, "wrong symeig answer")
+          tester:assertle((v2 - v1:cuda()):abs():max(), 1e-5, "wrong symeig answer")
+      end
+   end
+
+   function test.eig()
+      local a = torch.Tensor{
+         {-0.1425, -0.4750, -0.8551, 0.6729, -0.7453},
+         {-0.2696,  0.4330,  0.5077, 0.3709, -0.6053},
+         { 0.4330,  0.6727, -0.5049, 0.4600,  0.6249},
+         { 0.5766, -0.6743,  0.6903, 0.3646, -0.4571},
+         {-0.8956, -0.4074, -0.7583, 0.1838, -0.0091},
+      }
+      for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+          local at = a:type(typename)
+          local e1,v1 = torch.eig(at, 'V')
+          local e2,v2 = torch.eig(at:cuda(), 'V')
+          tester:assertle((e2 - e1:cuda()):abs():max(), 1e-6, "wrong eig answer")
+          tester:assertle((v2:abs() - v1:abs():cuda()):abs():max(), 1e-6, "wrong eig answer")
+      end
+   end
+
+   function test.svd()
+      local a = torch.CudaTensor{
+         {8.79,  6.11, -9.15,  9.57, -3.49,  9.84},
+         {9.93,  6.91, -7.93,  1.64,  4.02,  0.15},
+         {9.83,  5.04,  4.86,  8.83,  9.80, -8.99},
+         {5.45, -0.27,  4.85,  0.74, 10.00, -6.02},
+         {3.16,  7.98,  3.01,  5.80,  4.27, -5.31}}
+
+      for _, typename in ipairs({'torch.CudaDoubleTensor', 'torch.CudaTensor'}) do
+          local at = a:type(typename)
+          local u,s,v = torch.svd(at, 'A')
+
+          local temp = torch.Tensor(at:size(2)):zero()
+          temp:narrow(1, 1, at:size(1)):copy(s)
+          local sigma = torch.diag(temp):resize(at:size(1), at:size(2)):type(typename)
+
+          local m = u * sigma * v:t()
+
+          tester:assertle((m - at):abs():max(), 1e-5, "svd: a != u * s * vT")
+          tester:assertle((u*u:t() - torch.eye(at:size(1)):type(typename)):abs():max(), 1e-6, "svd: u should be unitary")
+          tester:assertle((v*v:t() - torch.eye(at:size(2)):type(typename)):abs():max(), 1e-6, "svd: v should be unitary")
+      end
+   end
+
+
+   function test.potri()
+      local A = torch.Tensor{
+         { 0.9023,  1.5967,  0.3388, -0.0746, -0.5717},
+         {-2.0442,  2.3974, -1.0883,  0.4018, -0.3938},
+         {-0.1065, -1.3180,  0.3542,  1.3684,  0.3934},
+         {-0.2987,  1.9035, -1.4192, -0.9738,  1.4384},
+         {-0.5315,  0.4958,  0.4449, -0.4676, -0.4878},
+      }
+      A = A * A:t()
+
+      for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+          local at = A:type(typename)
+          for _, triarg in ipairs({'U', 'L'}) do
+              local chol  = torch.potrf(at, triarg)
+
+              local i1 = torch.potri(chol, triarg)
+              local i2 = torch.potri(chol:cuda(), triarg)
+              local M = at:cuda() * i2
+              tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong potri answer")
+              tester:assertle((M - torch.eye(at:size(1)):cuda()):abs():max(), 1e-5, "potri not an inverse")
+          end
+      end
+   end
+
+   function test.potrf()
+      local A = torch.Tensor{
+         { 8.7937, 0.5104, 1.5955,-0.6738,-3.3883},
+         { 0.5104, 1.4286, 0.0236, 0.4734, 0.2807},
+         { 1.5955, 0.0236, 1.4539,-1.1123, 0.8161},
+         {-0.6738, 0.4734,-1.1123, 2.4071,-1.2756},
+         {-3.3883, 0.2807, 0.8161,-1.2756, 4.3415},
+      }
+      for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+          local at = A:type(typename)
+          for _, triarg in ipairs({'U', 'L'}) do
+              local i1 = torch.potrf(at, triarg)
+              local i2 = torch.potrf(at:cuda(), triarg)
+              tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong potrf answer")
+          end
+      end
+   end
+
+   function test.potrs()
+      local A = torch.Tensor({
+        {1.2705,  0.9971,  0.4948,  0.1389,  0.2381},
+        {0.9971,  0.9966,  0.6752,  0.0686,  0.1196},
+        {0.4948,  0.6752,  1.1434,  0.0314,  0.0582},
+        {0.1389,  0.0686,  0.0314,  0.0270,  0.0526},
+        {0.2381,  0.1196,  0.0582,  0.0526,  0.3957}})
+      local B = torch.Tensor({
+        {0.6219,  0.3439,  0.0431},
+        {0.5642,  0.1756,  0.0153},
+        {0.2334,  0.8594,  0.4103},
+        {0.7556,  0.1966,  0.9637},
+        {0.1420,  0.7185,  0.7476}})
+      for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+          local at = A:type(typename)
+          local bt = B:type(typename)
+          for _, triarg in ipairs({'U', 'L'}) do
+              local chol = torch.potrf(at, triarg)
+              local solve1 = torch.potrs(bt, chol, triarg)
+              local solve2 = torch.potrs(bt:cuda(), chol:cuda(), triarg)
+              tester:assertle((solve2 - solve1:cuda()):abs():max(), 1e-4, "wrong potrs answer")
+          end
+      end
+   end
+
+   function test.qr()
+      local A = torch.Tensor{
+         { 0.9023,  1.5967,  0.3388, -0.0746, -0.5717},
+         {-2.0442,  2.3974, -1.0883,  0.4018, -0.3938},
+         {-0.1065, -1.3180,  0.3542,  1.3684,  0.3934},
+         {-0.2987,  1.9035, -1.4192, -0.9738,  1.4384},
+         {-0.5315,  0.4958,  0.4449, -0.4676, -0.4878},
+      }
+      for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+          local at = A:type(typename)
+          local q1,r1 = torch.qr(at)
+          local q2,r2 = torch.qr(at:cuda())
+          tester:assertle((q2 - q1:cuda()):abs():max(), 1e-5, "wrong qr answer")
+          tester:assertle((r2 - r1:cuda()):abs():max(), 1e-5, "wrong qr answer")
+      end
+   end
+end
+
+function test.isSameSizeAs()
+   local t1 = torch.CudaTensor(3, 4, 9, 10)
+   local t2 = torch.CudaTensor(3, 4)
+   local t3 = torch.CudaTensor(1, 9, 3, 3)
+   local t4 = torch.CudaTensor(3, 4, 9, 10)
+
+   tester:assert(t1:isSameSizeAs(t2) == false, "wrong answer ")
+   tester:assert(t1:isSameSizeAs(t3) == false, "wrong answer ")
+   tester:assert(t1:isSameSizeAs(t4) == true, "wrong answer ")
+end
+
+function test.isSetTo()
+  local t1 = torch.CudaTensor(7, 4, 9)
+  local t2 = torch.CudaTensor(7, 8, 2)
+  local t3 = t2:view(7*8*2)
+  tester:assert(t1:isSetTo(t2) == false, "t1 and t2 are not the same tensor. ")
+  tester:assert(t2:isSetTo(t3) == false, "t2 and t3 share storage but are different views. ")
+  t2:set(t1)
+  tester:assert(t1:isSetTo(t2) == true, "t1 and t2 are the same tensor now.")
+  tester:assert(t2:isSetTo(t1) == true, "by symmetry. ")
+  tester:assert(t3:isSetTo(t1) == false, "now they are completely unrelated.")
+end
+
+function test.isSize()
+   local t1 = torch.CudaTensor(3, 4, 5)
+   local s1 = torch.LongStorage({3, 4, 5})
+   local s2 = torch.LongStorage({5, 4, 3})
+
+   tester:assert(t1:isSize(s1) == true, "wrong answer ")
+   tester:assert(t1:isSize(s2) == false, "wrong answer ")
+   tester:assert(t1:isSize(t1:size()) == true, "wrong answer ")
+end
+
+function test.elementSize()
+  local float = torch.CudaStorage():elementSize()
+  tester:asserteq(float, torch.CudaTensor():elementSize())
+  tester:assertne(float, 0)
+end
+
+-- Test random number generation.
+local function checkIfUniformlyDistributed(t, min, max)
+   tester:assertge(t:min(), min - 1e-6, "values are too low")
+   tester:assertle(t:max(), max + 1e-6, "values are too high")
+   tester:assertalmosteq(t:mean(), (min + max) / 2, 0.1, "mean is wrong")
+end
+
+function test.uniform()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local min = torch.uniform()
+   local max = min + torch.uniform()
+   local t = torch.CudaTensor(sz1, sz2)
+
+   for _, typename in ipairs(float_typenames) do
+       local x = t:type(typename)
+       x:uniform(min, max)
+       checkIfUniformlyDistributed(x, min, max)
+   end
+   checkMultiDevice(t, 'uniform', min, max)
+end
+
+function test.bernoulli()
+   local minsize = 1000
+   local maxsize = 2000
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local p = torch.uniform()
+   local p_fl = torch.rand(sz1, sz2):cuda()
+   local p_dbl = torch.rand(sz1, sz2):cudaDouble()
+   local t = torch.CudaTensor(sz1, sz2)
+
+   for _, typename in ipairs(typenames) do
+       local x = t:type(typename)
+       local expected_mean
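+       -- exercise bernoulli with a scalar probability as well as with
+       -- per-element CUDA float and double probability tensors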
+       for i, p in ipairs({p, p_fl, p_dbl}) do
+          x:bernoulli(p)
+          local mean = x:sum() / (sz1 * sz2)
+          if torch.type(p) == 'number' then
+             expected_mean = p
+          else
+             expected_mean = p:mean()
+          end
+          tester:assertalmosteq(mean, expected_mean, 0.1, "mean is not equal to the expected value")
+          local f = x:float()
+          tester:assertTensorEq(f:eq(1):add(f:eq(0)):float(),
+                                torch.FloatTensor(sz1, sz2):fill(1),
+                                1e-6,
+                                "each value must be either 0 or 1")
+       end
+   end
+   checkMultiDevice(t, 'bernoulli', p)
+end
+
+function test.normal()
+   local minsize = 1000
+   local maxsize = 2000
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local mean, std = torch.uniform(), 0.1 * torch.uniform()
+   local tolerance = 0.01
+   local t = torch.CudaTensor(sz1, sz2)
+
+   for _, typename in ipairs(float_typenames) do
+       local x = t:type(t2cpu[typename])
+       x:normal(mean, std)
+       tester:assertalmosteq(x:mean(), mean, tolerance, "mean is wrong")
+       tester:assertalmosteq(x:std(), std, tolerance, "standard deviation is wrong")
+   end
+
+   checkMultiDevice(t, 'normal', mean, std)
+end
+
+function test.logNormal()
+   local minsize = 1000
+   local maxsize = 2000
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local mean, std = torch.uniform(), 0.1 * torch.uniform()
+   local tolerance = 0.01
+   local t = torch.CudaTensor(sz1, sz2)
+
+   for _, typename in ipairs(float_typenames) do
+       local x = t:type(typename)
+       x:logNormal(mean, std)
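+       -- the log of a log-normally distributed sample is normally distributed,
+       -- so check the mean and std of log(x) against the requested parameters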
+       local logt = x:log()
+       tester:assertalmosteq(logt:mean(), mean, tolerance, "mean is wrong")
+       tester:assertalmosteq(logt:std(), std, tolerance, "standard deviation is wrong")
+   end
+   checkMultiDevice(t, 'logNormal', mean, std)
+end
+
+function test.geometric()
+   local minsize = 1000
+   local maxsize = 2000
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+
+   -- unlike other tests, we pick a large p-value to lower the variance, so
+   -- that it's highly unlikely the mean falls outside the bounds of the
+   -- specified tolerance
+   local p = 0.8
+   local tolerance = 0.2
+
+   local t = torch.CudaTensor(sz1, sz2)
+   local mean = (1 / p)
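+   -- a geometric distribution with success probability p (support 1, 2, ...)
+   -- has mean 1/p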
+
+   for _, typename in ipairs(float_typenames) do
+       local x = t:type(typename)
+       x:geometric(p)
+       tester:assertalmosteq(x:mean(), mean, tolerance, "mean is wrong")
+   end
+   checkMultiDevice(t, 'geometric', p)
+end
+
+function test.exponential()
+   local minsize = 1000
+   local maxsize = 2000
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local lambda = torch.uniform()
+   local t = torch.CudaTensor(sz1, sz2)
+
+   for _, typename in ipairs(float_typenames) do
+       local x = t:type(t2cpu[typename])
+       x:exponential(lambda)
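+       -- if x ~ Exp(lambda), then 1 - exp(-lambda * x) is Uniform(0, 1), so
+       -- apply the CDF and check the result for uniformity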
+       local u = torch.FloatTensor(sz1, sz2):fill(1) -
+                     (x:float() * -lambda):exp()
+       checkIfUniformlyDistributed(u, 0, 1)
+   end
+   checkMultiDevice(t, 'exponential', lambda)
+end
+
+function test.cauchy()
+   local minsize = 1000
+   local maxsize = 2000
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local median, sigma = torch.uniform(), torch.uniform()
+   local t = torch.CudaTensor(sz1, sz2)
+
+   for _, typename in ipairs(float_typenames) do
+       local x = t:type(typename)
+       x:cauchy(median, sigma)
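+       -- the Cauchy CDF, atan((x - median) / sigma) / pi + 0.5, should map the
+       -- samples back to Uniform(0, 1)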
+       local u = ((x:float() - median) / sigma):atan() / math.pi + 0.5
+       checkIfUniformlyDistributed(u, 0, 1)
+   end
+   checkMultiDevice(t, 'cauchy', median, sigma)
+end
+
+function test.random_seed()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local mean, std = torch.uniform(), torch.uniform()
+   local tolerance = 0.01
+   local t = torch.CudaTensor(sz1, sz2)
+   local u = torch.CudaTensor(sz1, sz2)
+
+   local seed = cutorch.seed()
+   t:normal(mean, std)
+   cutorch.manualSeed(seed)
+   u:normal(mean, std)
+   tester:assertTensorEq(t:float(), u:float(), 1e-6, "values not equal after resetting the seed")
+end
+
+function test.restore_rng()
+   local sz1 = chooseInt(minsize, maxsize)
+   local sz2 = chooseInt(minsize, maxsize)
+   local mean, std = torch.uniform(), torch.uniform()
+   local tolerance = 0.01
+   local t = torch.CudaTensor(sz1, sz2)
+   local u = torch.CudaTensor(sz1, sz2)
+
+   local seed = cutorch.seed()
+   local rng = cutorch.getRNGState()
+   t:normal(mean, std)
+   -- Change the seed so we can check that restoring the RNG state also restores the seed.
+   cutorch.manualSeed(seed + 123)
+   cutorch.setRNGState(rng)
+   u:normal(mean, std)
+   tester:assertTensorEq(t:float(), u:float(), 1e-6, "values not equal after restoring the RNG state")
+   tester:asserteq(cutorch.initialSeed(), seed, "seed was not restored")
+end
+
+function test.multi_gpu_random()
+   local rs = cutorch.getRNGState()
+   cutorch.manualSeedAll(1) -- set all device seeds to be the same
+
+   -- requires at least 2 devices
+   local device_count = cutorch.getDeviceCount()
+   if device_count < 2 then
+      return
+   end
+   cutorch.setDevice(1)
+   local n = 3
+   local expected = torch.CudaTensor(n):uniform():float()
+   for i = 2, device_count do
+      cutorch.setDevice(i)
+      local actual = torch.CudaTensor(n):uniform():float()
+      tester:assert(isEqual(expected, actual), "random tensors don't seem to be equal")
+   end
+   cutorch.setRNGState(rs) -- cleanup after yourself
+   cutorch.setDevice(1) -- reset device
+end
+
+function test.multinomial_with_replacement()
+   for tries = 1, 10 do
+      local n_row = torch.random(10)
+      local n_col = 1 + torch.random(1000)
+
+      local prob_dist = torch.CudaTensor(n_row, n_col):uniform()
+      prob_dist:select(2, n_col):fill(0) --index n_col shouldn't be sampled
+      local n_sample = torch.random(n_col - 1)
+      for _, typename in ipairs(float_typenames) do
+          if typename ~= 'torch.CudaHalfTensor' then
+             local pd = prob_dist:type(typename)
+             local sample_indices = torch.multinomial(pd, n_sample, true)
+             tester:assert(sample_indices:dim() == 2, "wrong sample_indices dim")
+             tester:assert(sample_indices:size(2) == n_sample, "wrong number of samples")
+
+             for i = 1, n_row do
+                for j = 1, n_sample do
+                   local val = sample_indices[{i,j}]
+                   tester:assert(val == math.floor(val) and val >= 1 and val < n_col,
+                                 "sampled an invalid index: " .. val)
+                end
+             end
+         end
+      end
+   end
+end
+
+function test.multinomial_without_replacement()
+   for tries = 1, 10 do
+      local n_row = torch.random(1000)
+      -- choose a small number of columns to test that the 0 col is never chosen
+      local n_col = 1 + torch.random(10)
+
+      local prob_dist = torch.CudaTensor(n_row, n_col):uniform()
+      prob_dist:select(2, n_col):fill(0) --index n_col shouldn't be sampled
+      local n_sample = torch.random(n_col - 1)
+      for _, typename in ipairs(float_typenames) do
+          if typename ~= 'torch.CudaHalfTensor' then
+             local pd = prob_dist:type(typename)
+             local sample_indices = torch.multinomial(pd, n_sample, false)
+             tester:assert(sample_indices:dim() == 2, "wrong sample_indices dim")
+             tester:assert(sample_indices:size(2) == n_sample, "wrong number of samples")
+
+             sample_indices = sample_indices:float()
+
+             for i = 1, n_row do
+                local row_samples = {}
+                for j = 1, n_sample do
+                   local sample_idx = sample_indices[{i,j}]
+                   tester:assert(
+                      sample_idx ~= n_col, "sampled an index with zero probability"
+                   )
+                   tester:assert(
+                         not row_samples[sample_idx], "sampled an index twice"
+                   )
+                   row_samples[sample_idx] = true
+                end
+             end
+         end
+      end
+   end
+end
+
+function test.multinomial_without_replacement_gets_all()
+   for tries = 1, 10 do
+      local distributions = torch.random(10)
+      local distSize = 1 + torch.random(1000)
+
+      local linear = torch.linspace(1, distSize, distSize):cuda()
+      local t = torch.CudaTensor(distributions, distSize)
+      for dist = 1, distributions do
+         t[dist] = linear
+      end
+
+      local orig = t:cudaLong()
+
+      for _, typename in ipairs(float_typenames) do
+          -- Half tensors have precision errors in the binary search, causing
+          -- this test to fail frequently
+          if typename ~= 'torch.CudaHalfTensor' then
+              local x = t:type(typename)
+
+              -- Sample without replacement
+              local result = torch.multinomial(x, distSize)
+              tester:assert(result:size(1) == distributions)
+              tester:assert(result:size(2) == distSize)
+
+              -- Sort, and we should have the original results, since without replacement
+              -- sampling everything, we should have chosen every value uniquely
+              result = result:sort(2)
+              tester:assertTensorEq(orig, result, 0, "error in multinomial_without_replacement_gets_all")
+          end
+      end
+   end
+end
+
+function test.multinomial_vector()
+   local n_col = torch.random(100)
+   local prob_dist = torch.CudaTensor(n_col):uniform()
+   local n_sample = n_col
+   for _, typename in ipairs(float_typenames) do
+       if typename ~= 'torch.CudaHalfTensor' then
+           local pd = prob_dist:type(typename)
+           local sample_indices = torch.multinomial(pd, n_sample, true)
+           tester:assert(sample_indices:dim() == 1, "wrong sample_indices dim")
+           -- Multinomial resizes prob_dist to be 2d (1xn), check that the resize
+           -- was undone
+           tester:assert(prob_dist:dim() == 1, "wrong number of prob_dist dimensions")
+           tester:assert(sample_indices:size(1) == n_sample, "wrong number of samples")
+       end
+   end
+end
+
+function test.get_device()
+    local device_count = cutorch.getDeviceCount()
+    local tensors = { }
+    for i = 1,device_count do
+        table.insert(tensors, torch.Tensor():cuda())
+    end
+    -- Unallocated tensors are on device 0
+    for i = 1,device_count do
+       tester:assert(tensors[i]:getDevice() == 0, "unallocated tensor does not have deviceID 0")
+       -- Now allocate it
+       cutorch.setDevice(i)
+       tensors[i]:resize(1, 2, 3)
+       tester:assert(tensors[i]:getDevice() == i, "tensor does not have the correct deviceID")
+       tester:assert(tensors[i]:getDevice() == tensors[i]:storage():getDevice(),
+          "tensor's device id doesn't match its storage's device id")
+    end
+    cutorch.setDevice(1) -- reset device
+end
+
+function test.multi_gpu_copy_noncontig()
+   local srcDevice = 1
+   local dstDevice = cutorch.getDeviceCount()
+
+   local t1, t2
+   for transposeSrc = 0,1 do
+     for transposeDst = 0,1 do
+        cutorch.withDevice(
+           srcDevice,
+           function()
+              t1 = torch.CudaTensor(100000, 1000):fill(1)
+              cutorch.synchronize()
+        end)
+
+        cutorch.withDevice(
+           dstDevice,
+           function()
+              t2 = torch.CudaTensor(100000, 1000):fill(2)
+              cutorch.synchronize()
+        end)
+
+        if transposeSrc == 1 then -- maybe make t1 non-contiguous
+           cutorch.withDevice(srcDevice, function() t1=t1:transpose(1,2) end)
+        end
+        if transposeDst == 1 then -- maybe make t2 non-contiguous
+           cutorch.withDevice(dstDevice, function() t2=t2:transpose(1,2) end)
+        end
+
+        -- try to induce a race on t2
+        cutorch.withDevice(dstDevice, function() t2:fill(3) end)
+
+        -- perform the copy
+        -- CudaTensor:copy() should not depend on the current device
+        t2:copy(t1)
+
+        -- try to induce a race on t1
+        cutorch.withDevice(srcDevice, function() t1:fill(4) end)
+
+        local t2_max
+        cutorch.withDevice(dstDevice, function() t2_max = t2:max() end)
+        tester:assert(t2_max == 1, "bad copy, transposeSrc= " .. transposeSrc ..
+               " transposeDst= " .. transposeDst .. ". t2:max() = " .. t2_max)
+      end
+   end
+end
+
+function test.cudaTypeCopy()
+
+   local types = {
+      {'float', 'FloatTensor'},
+      {'byte',  'ByteTensor'},
+      {'char',  'CharTensor'},
+      {'short', 'ShortTensor'},
+      {'int',   'IntTensor'},
+      {'long',  'LongTensor'},
+      {'double','DoubleTensor'},
+      {'half', 'HalfTensor'},
+      {'cuda',      'CudaTensor'},
+      {'cudaByte',  'CudaByteTensor'},
+      {'cudaChar',  'CudaCharTensor'},
+      {'cudaShort', 'CudaShortTensor'},
+      {'cudaInt',   'CudaIntTensor'},
+      {'cudaLong',  'CudaLongTensor'},
+      {'cudaDouble','CudaDoubleTensor'},
+   }
+   if cutorch.hasHalf then
+      table.insert(types, {'cudaHalf', 'CudaHalfTensor'})
+   end
+
+   local N = 100
+   local t0 = torch.range(1,12):reshape(3,4)
+
+   -- t carries over from one iteration to the next
+   local t = t0:clone()
+   for i = 1, N do
+      -- convert to a random (CPU or GPU) type
+      local conversionFunc, tensorSubtype = unpack(types[torch.random(#types)])
+      local tensorType = 'torch.' .. tensorSubtype
+
+      if torch.random(0,1) ~= 0 then
+         -- this is equivalent to t = t:float()
+         t = t[conversionFunc](t)
+      else
+         -- this is equivalent to t = torch.XTensor():copy(t)
+         t = torch[tensorSubtype](3,4):copy(t)
+      end
+
+      -- check the type
+      tester:assert(t:type() == tensorType, t:type() .. ' ~= ' .. tensorType)
+
+      -- check metadata
+      tester:assert(t:isContiguous())
+      tester:assert(t:size(1) == 3 and t:size(2) == 4)
+      tester:assert(t:nDimension() == 2)
+
+      -- check data
+      tester:assertTensorEq(t:double(), t0, 0)
+
+
+      -- check indexing
+      -- FIXME: doesn't work yet
+      -- tester:assert(t[{1,1}] == 1)
+   end
+
+   -- check narrowing conversions
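+   -- (500 does not fit in 8 bits: it wraps to 244 as an unsigned byte, and the
+   -- same bit pattern read as a signed char is -12)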
+   tester:assert(torch.Tensor(1):fill(500):cudaByte():float()[1] == 244)
+   tester:assert(torch.Tensor(1):fill(500):cudaChar():float()[1] == -12)
+end
+
+
+function test.cudaStorageTypeCopy()
+
+   local types = {
+      {'float', 'FloatStorage'},
+      {'byte',  'ByteStorage'},
+      {'char',  'CharStorage'},
+      {'short', 'ShortStorage'},
+      {'int',   'IntStorage'},
+      {'long',  'LongStorage'},
+      {'double','DoubleStorage'},
+      {'half',   'HalfStorage'},
+      {'cuda',      'CudaStorage'},
+      {'cudaByte',  'CudaByteStorage'},
+      {'cudaChar',  'CudaCharStorage'},
+      {'cudaShort', 'CudaShortStorage'},
+      {'cudaInt',   'CudaIntStorage'},
+      {'cudaLong',  'CudaLongStorage'},
+      {'cudaDouble','CudaDoubleStorage'},
+   }
+   if cutorch.hasHalf then
+      table.insert(types, {'cudaHalf', 'CudaHalfStorage'})
+   end
+
+   local N = 100
+   local t0 = torch.range(1,12):reshape(3,4):storage()
+
+   -- t carries over from one iteration to the next
+   local t = torch.DoubleStorage(t0:size()):copy(t0)
+   for i = 1, N do
+      -- convert to a random (CPU or GPU) type
+      local conversionFunc, storageSubtype = unpack(types[torch.random(#types)])
+      local storageType = 'torch.' .. storageSubtype
+
+      -- this is equivalent to t = torch.XStorage():copy(t)
+      t = torch[storageSubtype](12):copy(t)
+
+      -- check the type
+      tester:assert(torch.type(t) == storageType, torch.type(t) .. ' ~= ' .. storageType)
+
+      local d = torch.DoubleStorage(12):copy(t)
+      for i = 1, t:size() do
+         tester:assert(d[i] == t0[i], storageSubtype .. ': ' .. i .. ': ' .. d[i] .. ' ~= ' .. t0[i])
+      end
+   end
+end
+
+function test.tensorToTable()
+   local types = {
+      {'CudaTensor',       'FloatTensor'},
+      {'CudaByteTensor',   'ByteTensor'},
+      {'CudaCharTensor',   'CharTensor'},
+      {'CudaShortTensor',  'ShortTensor'},
+      {'CudaIntTensor',    'IntTensor'},
+      {'CudaLongTensor',   'LongTensor'},
+      {'CudaDoubleTensor', 'DoubleTensor'},
+   }
+   if cutorch.hasHalf then
+      table.insert(types, {'CudaHalfTensor', 'HalfTensor'})
+   end
+   for _, types in ipairs(types) do
+      local cudaType, hostType = unpack(types)
+      local dim = torch.random(5)
+      local size = torch.LongTensor(dim):random(5):totable()
+      local hostTensor = nil
+      if hostType ~= 'HalfTensor' then
+          hostTensor = torch[hostType](size):random()
+      else
+          -- work around HalfTensor not having random functions and reduced range
+          local copyTensor = torch['FloatTensor'](size):random(128)
+          hostTensor = torch[hostType](size)
+          hostTensor:copy(copyTensor)
+      end
+      local cudaTensor = torch[cudaType](size):copy(hostTensor)
+      tester:assertTableEq(hostTensor:totable(), cudaTensor:totable(),
+                           'wrong result for ' .. cudaType .. ':totable()')
+   end
+end
+
+function test.storageToTable()
+   local types = {
+      {'CudaStorage',       'FloatTensor'},
+      {'CudaByteStorage',   'ByteTensor'},
+      {'CudaCharStorage',   'CharTensor'},
+      {'CudaShortStorage',  'ShortTensor'},
+      {'CudaIntStorage',    'IntTensor'},
+      {'CudaLongStorage',   'LongTensor'},
+      {'CudaDoubleStorage', 'DoubleTensor'},
+   }
+   if cutorch.hasHalf then
+     table.insert(types, {'CudaHalfStorage', 'HalfTensor'})
+   end
+
+   for _, types in ipairs(types) do
+      local cudaStorageType, hostTensorType = unpack(types)
+      local size = torch.random(10)
+      local hostTensor
+      if hostTensorType ~= 'HalfTensor' then
+         hostTensor = torch[hostTensorType](size):random()
+      else
+         -- work around HalfTensor not having random functions and reduced range
+         hostTensor = torch[hostTensorType](size):copy(torch.FloatTensor(size):random(128))
+      end
+      local cudaStorage = torch[cudaStorageType](size):copy(hostTensor:storage())
+      tester:assertTableEq(hostTensor:storage():totable(), cudaStorage:totable(),
+                           'wrong result for ' .. cudaStorageType .. ':totable()')
+   end
+end
+
+function test.maskedSelect()
+   local n_row = math.random(minsize,maxsize)
+   local n_col = math.random(minsize,maxsize)
+
+   -- contiguous, no result tensor, cuda mask
+   local x = torch.randn(n_row, n_col):float()
+   local mask = torch.ByteTensor(n_row,n_col):bernoulli()
+   local y = x:maskedSelect(mask)
+   x=x:cuda()
+   mask=mask:cudaByte()
+   local y_cuda = x:maskedSelect(mask)
+   tester:assertTensorEq(y, y_cuda:float(), 0.00001, "Error in maskedSelect")
+   checkMultiDevice(x, 'maskedSelect', mask)
+
+   -- non-contiguous, no result tensor, cuda mask
+   local x = torch.randn(n_row, n_col):float()
+   local mask = torch.ByteTensor(n_row,n_col):bernoulli()
+   local y = x:t():maskedSelect(mask)
+   x=x:cuda()
+   mask=mask:cudaByte()
+   local y_cuda = x:t():maskedSelect(mask)
+   tester:assertTensorEq(y, y_cuda:float(), 0.00001,
+                         "Error in maskedSelect non-contiguous")
+
+   -- contiguous, with result tensor, cuda mask
+   local x = torch.randn(n_row, n_col):float()
+   local mask = torch.ByteTensor(n_row,n_col):bernoulli()
+   local y = torch.FloatTensor()
+   y:maskedSelect(x, mask)
+   x=x:cuda()
+   mask=mask:cudaByte()
+   local y_cuda = torch.CudaTensor()
+   y_cuda:maskedSelect(x, mask)
+   tester:assertTensorEq(y, y_cuda:float(), 0.00001,
+                         "Error in maskedSelect (with result)")
+
+   -- non-contiguous, with result tensor, cuda mask
+   local x = torch.randn(n_row, n_col):float()
+   local mask = torch.ByteTensor(n_row,n_col):bernoulli()
+   local y = torch.FloatTensor()
+   y:maskedSelect(x:t(), mask)
+   x=x:cuda()
+   mask=mask:cudaByte()
+   local y_cuda = torch.CudaTensor()
+   y_cuda:maskedSelect(x:t(), mask)
+   tester:assertTensorEq(y, y_cuda:float(), 0.00001,
+          "Error in maskedSelect non-contiguous (with result)")
+
+   -- indexing maskedSelect a[a:gt(0.5)] for example
+   local x = torch.randn(n_row, n_col):float()
+   local y = x[x:gt(0.5)]
+   x=x:cuda()
+   local y_cuda = x[x:gt(0.5)]
+   tester:assertTensorEq(y, y_cuda:float(), 0.00001,
+                         "Error in maskedSelect indexing x[x:gt(y)]")
+
+   -- indexing maskedSelect (non-contiguous) a[a:gt(0.5)] for example
+   local x = torch.randn(n_row, n_col):float()
+   local y = x:t()[x:t():gt(0.5)]
+   x=x:cuda()
+   local y_cuda = x:t()[x:t():gt(0.5)]
+   tester:assertTensorEq(y, y_cuda:float(), 0.00001,
+          "Error in maskedSelect indexing non-contig x[x:gt(y)]")
+end
+
+function test.maskedCopy()
+   local n_row = math.random(minsize,maxsize)
+   local n_col = math.random(minsize,maxsize)
+
+   -- contiguous, cuda mask
+   local x = torch.rand(n_row, n_col):float()
+   local y = x:clone():fill(-1)
+   local mask = torch.ByteTensor(n_row,n_col):bernoulli()
+   y:maskedCopy(mask, x:clone())
+   local y_cuda=x:cuda():fill(-1)
+   mask=mask:cudaByte()
+   x=x:cuda()
+   y_cuda:maskedCopy(mask, x)
+   tester:assertTensorEq(y, y_cuda:float(), 0.00001,
+                                 "Error in maskedCopy (contiguous)")
+   checkMultiDevice(y_cuda, 'maskedCopy', mask, x)
+
+   -- non-contiguous source, cuda mask
+   local x = torch.rand(n_row, n_col):float()
+   local y = x:clone():fill(-1)
+   local mask = torch.ByteTensor(n_row,n_col):bernoulli()
+   y:maskedCopy(mask, x:t())
+   local y_cuda=x:cuda():fill(-1)
+   x=x:cuda()
+   mask=mask:cudaByte()
+   y_cuda:maskedCopy(mask, x:t())
+   tester:assertTensorEq(y, y_cuda:float(), 0.00001,
+                       "Error in maskedCopy (non-contiguous source)")
+
+   -- non-contiguous result, cuda mask
+   local x = torch.rand(n_row, n_col):float()
+   local y = x:clone():fill(-1)
+   local mask = torch.ByteTensor(n_row,n_col):bernoulli()
+   y:t():maskedCopy(mask, x:t())
+   local y_cuda=x:cuda():fill(-1)
+   x=x:cuda()
+   mask=mask:cudaByte()
+   y_cuda:t():maskedCopy(mask, x:t())
+   tester:assertTensorEq(y, y_cuda:float(), 0.00001,
+                        "Error in maskedCopy (non-contiguous dest)")
+
+   -- indexing maskedCopy a[a:gt(0.5)] for example
+   local gt = torch.rand(n_row, n_col):float()
+   local x = gt:clone()
+   local y = torch.rand(n_row, n_col):float()
+   x[x:gt(0.5)] = y
+   local x_cuda = gt:cuda()
+   y=y:cuda()
+   x_cuda[x_cuda:gt(0.5)] = y
+   tester:assertTensorEq(x, x_cuda:float(), 0.00001,
+                             "Error in maskedCopy indexing x[x:gt(y)]")
+
+   -- indexing maskedCopy non-contiguous src a[a:gt(0.5)] for example
+   local gt = torch.rand(n_row, n_col):float()
+   local x = gt:clone()
+   local y = torch.rand(n_row, n_col):float()
+   x[x:gt(0.5)] = y:t()
+   local x_cuda = gt:cuda()
+   y=y:cuda()
+   x_cuda[x_cuda:gt(0.5)] = y:t()
+   tester:assertTensorEq(x, x_cuda:float(), 0.00001,
+                            "Error in maskedCopy indexing x[x:gt(y)]")
+
+   -- indexing maskedCopy non-contiguous dst a[a:gt(0.5)] for example
+   local gt = torch.rand(n_row, n_col):float()
+   local x = gt:clone()
+   local y = torch.rand(n_row, n_col):float()
+   x:t()[x:t():gt(0.5)] = y
+   local x_cuda = gt:cuda()
+   y=y:cuda()
+   x_cuda:t()[x_cuda:t():gt(0.5)] = y
+
+   tester:assertTensorEq(x, x_cuda:float(), 0.00001,
+                         "Error in maskedCopy indexing x[x:gt(y)]")
+end
+
+function test.maskedFill()
+   local n_row = math.random(minsize,maxsize)
+   local n_col = math.random(minsize,maxsize)
+
+   -- contiguous, no result tensor, cuda mask
+   local gt = torch.randn(n_row, n_col):float()
+   local x = gt:clone()
+   local mask = torch.ByteTensor(n_row,n_col):bernoulli()
+   x:maskedFill(mask, 334)
+   local x_cuda=gt:cuda()
+   mask=mask:cudaByte()
+   x_cuda:maskedFill(mask, 334)
+   tester:assertTensorEq(x, x_cuda:float(), 0.00001, "Error in maskedFill")
+   checkMultiDevice(x_cuda, 'maskedFill', mask, 334)
+
+   -- non-contiguous, no result tensor, cuda mask
+   local x = gt:clone()
+   mask = mask:byte()
+   x:t():maskedFill(mask, 334)
+   local x_cuda = gt:cuda()
+   mask=mask:cudaByte()
+   x_cuda:t():maskedFill(mask, 334)
+   tester:assertTensorEq(x, x_cuda:float(), 0.00001,
+                         "Error in maskedFill non-contiguous")
+
+   -- indexing maskedFill a[a:gt(0.5)] for example
+   local x = gt:clone()
+   x[x:gt(0.5)] = 334
+   local x_cuda = gt:cuda()
+   x_cuda[x_cuda:gt(0.5)] = 334
+   tester:assertTensorEq(x, x_cuda:float(), 0.00001,
+                         "Error in maskedFill indexing x[x:gt(y)]")
+
+   -- indexing maskedFill a[a:gt(0.5)] for example
+   local x = gt:clone()
+   x:t()[x:t():gt(0.5)] = 334
+   local x_cuda = gt:cuda()
+   x_cuda:t()[x_cuda:t():gt(0.5)] = 334
+   tester:assertTensorEq(x, x_cuda:float(), 0.00001,
+          "Error in maskedFill non-contig indexing x[x:gt(y)]")
+
+end
+
+-- Fill idx with valid indices.
+local function fillIdx(idx, dim, dim_size, elems_per_row, m, n, o)
+   for i = 1, (dim == 1 and 1 or m) do
+      for j = 1, (dim == 2 and 1 or n) do
+         for k = 1, (dim == 3 and 1 or o) do
+            local ii = {i, j, k}
+            ii[dim] = {}
+            idx[ii] = torch.randperm(dim_size)[{{1, elems_per_row}}]
+         end
+      end
+   end
+end
+
+function test.gather()
+   local m, n, o = torch.random(10, 20), torch.random(10, 20), torch.random(10, 20)
+   local elems_per_row = torch.random(10)
+   local dim = torch.random(3)
+
+   local src = torch.randn(m, n, o):float()
+   local idx_size = {m, n, o}
+   idx_size[dim] = elems_per_row
+   local idx = torch.LongTensor():resize(unpack(idx_size))
+   fillIdx(idx, dim, src:size(dim), elems_per_row, m, n, o)
+
+   for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
+      local src = src:type(ctype)
+      compareCPUAndCUDATypeTensorArgs(typename, true, src, 'gather', dim, idx)
+      compareCPUAndCUDATypeTensorArgs(typename, false, src, 'gather', dim, idx)
+   end
+end
+
+function test.scatter()
+   local m, n, o = torch.random(10, 20), torch.random(10, 20), torch.random(10, 20)
+   local elems_per_row = torch.random(10)
+   local dim = torch.random(3)
+
+   local idx_size = {m, n, o}
+   idx_size[dim] = elems_per_row
+   local idx = torch.LongTensor():resize(unpack(idx_size))
+   fillIdx(idx, dim, ({m, n, o})[dim], elems_per_row, m, n, o)
+   local src = torch.FloatTensor():resize(unpack(idx_size)):normal()
+   local res = torch.FloatTensor(m, n, o):zero()
+
+   for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
+      local res, src = res:type(ctype), src:type(ctype)
+      compareCPUAndCUDATypeTensorArgs(typename, true, res, 'scatter', dim, idx, src)
+      compareCPUAndCUDATypeTensorArgs(typename, false, res, 'scatter', dim, idx, src)
+   end
+end
+
+function test.scatterFill()
+   local m, n, o = torch.random(10, 20), torch.random(10, 20), torch.random(10, 20)
+   local elems_per_row = torch.random(10)
+   local dim = torch.random(3)
+
+   local val = torch.uniform()
+   local idx_size = {m, n, o}
+   idx_size[dim] = elems_per_row
+   local idx = torch.LongTensor():resize(unpack(idx_size))
+   fillIdx(idx, dim, ({m, n, o})[dim], elems_per_row, m, n, o)
+
+   local res = torch.FloatTensor(m, n, o):zero()
+   for k, typename in ipairs(typenames) do
+      local res = res:type(t2cpu[typename])
+      compareCPUAndCUDATypeTensorArgs(typename, true, res, 'scatter', dim, idx, val)
+      compareCPUAndCUDATypeTensorArgs(typename, false, res, 'scatter', dim, idx, val)
+   end
+end
+
+function test.sort()
+   for tries = 1, 5 do
+      local t = createTestTensor(2 ^ 20)
+      local selectdim = chooseInt(1, t:nDimension())
+      local dir = chooseInt(1, 2) == 1
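+      -- randomly pick ascending or descending sort order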
+
+      for k, typename in ipairs(typenames) do
+          if typename ~= 'torch.CudaByteTensor'
+             and typename ~= 'torch.CudaCharTensor'
+             and typename ~= 'torch.CudaShortTensor' then
+              local ctype = t2cpu[typename]
+              local t = t:type(ctype)
+              compareCPUAndCUDATypeTensorArgs(typename, nil, t, 'sort', selectdim, dir)
+          end
+      end
+   end
+
+   -- Test a large tensor whose total size exceeds 2^24,
+   -- but whose sorting dimension is less than 2^24
+   -- Since the sorting mechanism is not guaranteed to be the
+   -- same between GPU and CPU, we have to be careful when comparing
+   -- the indices
+   local t_cpu = torch.FloatTensor(5000, 5000):uniform()
+   local t_gpu = t_cpu:cuda()
+
+   local v_cpu, i_cpu = torch.sort(t_cpu, 2)
+   local v_gpu, i_gpu = torch.sort(t_gpu, 2)
+
+   -- Values should match exactly, regardless of sorting method
+   tester:assert(isEqual(v_cpu, v_gpu), 'value mismatch')
+
+   -- Indices can differ since the sorting method can differ (stable vs. not),
+   -- but values should be equivalent after gather
+   local gather_cpu = t_cpu:gather(2, i_cpu)
+   local gather_gpu = t_gpu:gather(2, i_gpu)
+
+   tester:assert(isEqual(gather_cpu, gather_gpu), 'indices mismatch')
+
+   -- Test a large tensor whose total size exceeds 2^24
+   local t_cpu = torch.FloatTensor(2^25):uniform()
+   local t_gpu = t_cpu:cuda()
+
+   local v_cpu, i_cpu = torch.sort(t_cpu, 1)
+   local v_gpu, i_gpu = torch.sort(t_gpu, 1)
+
+   -- Values should match exactly, regardless of sorting method
+   tester:assert(isEqual(v_cpu, v_gpu), 'value mismatch')
+
+   -- Indices can differ since the sorting method can differ (stable vs. not),
+   -- but values should be equivalent after gather
+   local gather_cpu = t_cpu:gather(1, i_cpu)
+   local gather_gpu = t_gpu:gather(1, i_gpu)
+
+   tester:assert(isEqual(gather_cpu, gather_gpu), 'indices mismatch')
+end
+
+function test.topk()
+   local function runTopK(t, dim, k, dir)
+      -- FIXME: if the tensors ever contain equivalent values, then their indices
+      -- could in fact be different.
+
+      if torch.Tensor.type(t) == 'torch.CudaTensor' then
+         return t:topk(k, dim, dir, true)
+      else
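+         -- CPU reference: emulate topk by fully sorting along dim and taking
+         -- the first k values and indices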
+         local sorted, indices = t:sort(dim, dir)
+         return sorted:narrow(dim, 1, k), indices:narrow(dim, 1, k)
+      end
+   end
+
+   for tries = 1, 5 do
+      -- max size 2^20 for indexing
+      local t = createTestTensor(2 ^ 20)
+      local dim = chooseInt(1, t:nDimension())
+      local dimSize = t:size(dim)
+      local dir = chooseInt(1, 2) == 1
+
+      -- Test boundary conditions
+      local kTests = {1, dimSize}
+
+      -- and some other random ones
+      table.insert(kTests, chooseInt(1, dimSize))
+      for i = 1, 2 do
+         -- some sizes that fit in our inplace kernel range (the dimSize one
+         -- will fall back to Thrust)
+         table.insert(kTests, chooseInt(1, math.min(2048, dimSize)))
+      end
+
+      for k = 1, #kTests do
+         compareFloatAndCuda(t, runTopK, dim, kTests[k], dir)
+      end
+   end
+end
+
+function test.cat()
+   for k, typename in ipairs(typenames) do
+      for dim = 1, 3 do
+         local x = torch.Tensor(13, minsize, minsize):uniform()
+            :type(typename):transpose(1, dim)
+         local y = torch.Tensor(17, minsize, minsize):uniform()
+            :type(typename):transpose(1, dim)
+         local mx = torch.cat(x, y, dim)
+         tester:assertTensorEq(mx:narrow(dim, 1, 13), x, 0, 'torch.cat value')
+         tester:assertTensorEq(mx:narrow(dim, 14, 17), y, 0, 'torch.cat value')
+
+         local mxx = torch.Tensor():type(typename)
+         torch.cat(mxx, x, y, dim)
+         tester:assertTensorEq(mx, mxx, 0, 'torch.cat value')
+
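+         -- concatenating with an empty tensor should simply return a copy of
+         -- the non-empty operand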
+         local x = torch.CudaTensor(1, 2, 3):uniform()
+         local y = torch.CudaTensor()
+         local mx = torch.cat(x,y,dim)
+         tester:asserteq(mx:size(1),1,'torch.cat size')
+         tester:asserteq(mx:size(2),2,'torch.cat size')
+         tester:asserteq(mx:size(3),3,'torch.cat size')
+         tester:assertTensorEq(mx, x, 0, 'torch.cat value')
+
+         local x = torch.CudaTensor()
+         local y = torch.CudaTensor()
+         local mx = torch.cat(x,y,dim)
+         tester:asserteq(mx:dim(),0,'torch.cat dim')
+      end
+   end
+end
+
+function test.catArray()
+   for k, typename in ipairs(typenames) do
+      for dim = 1, 3 do
+         local x = torch.Tensor(13, minsize, minsize):uniform()
+            :type(typename):transpose(1, dim)
+         local y = torch.Tensor(17, minsize, minsize):uniform()
+            :type(typename):transpose(1, dim)
+         local z = torch.Tensor(19, minsize, minsize):uniform()
+            :type(typename):transpose(1, dim)
+
+         local mx = torch.cat({x, y, z}, dim)
+         tester:assertTensorEq(mx:narrow(dim, 1, 13), x, 0, 'torch.cat value')
+         tester:assertTensorEq(mx:narrow(dim, 14, 17), y, 0, 'torch.cat value')
+         tester:assertTensorEq(mx:narrow(dim, 31, 19), z, 0, 'torch.cat value')
+
+         local mxx = torch.Tensor():type(typename)
+         torch.cat(mxx, {x, y, z}, dim)
+         tester:assertTensorEq(mx, mxx, 0, 'torch.cat value')
+
+         local x = torch.CudaTensor(1, 2, 3):uniform()
+         local y = torch.CudaTensor()
+         local mx = torch.cat({x,y},dim)
+         tester:asserteq(mx:size(1),1,'torch.cat size')
+         tester:asserteq(mx:size(2),2,'torch.cat size')
+         tester:asserteq(mx:size(3),3,'torch.cat size')
+         tester:assertTensorEq(mx, x, 0, 'torch.cat value')
+
+         local x = torch.CudaTensor()
+         local y = torch.CudaTensor()
+         local mx = torch.cat({x,y},dim)
+         tester:asserteq(mx:dim(),0,'torch.cat dim')
+      end
+   end
+end
+
+-- designed to specifically hit the batched kernel for catArray
+function test.catArrayBatched()
+    local batchSizes = {2, 16, 128, 1024, 4096}
+    for _, batchSize in ipairs(batchSizes) do
+        -- first, batches for 1D Tensors
+        local tensors = {}
+        for i = 1, batchSize do
+            table.insert(tensors, torch.CudaTensor(1024):uniform())
+        end
+        local mx = torch.cat(tensors, 1)
+        local offset = 1
+        for i = 1, batchSize do
+            tester:assertTensorEq(mx:narrow(1, offset, tensors[i]:size(1)), tensors[i], 0, 'torch.catArrayBatched value')
+            offset = offset + tensors[i]:size(1)
+        end
+
+        -- next, 2D Tensors
+        tensors = {}
+        for i = 1, batchSize do
+            table.insert(tensors, torch.CudaTensor(1, 1024):uniform())
+        end
+        -- across dim = 1 (row-wise concatenation)
+        mx = torch.cat(tensors, 1)
+        offset = 1
+        for i = 1, batchSize do
+            tester:assertTensorEq(mx:narrow(1, offset, tensors[i]:size(1)), tensors[i], 0, 'torch.catArrayBatched value')
+            offset = offset + tensors[i]:size(1)
+        end
+        tensors = {}
+        for i = 1, batchSize do
+            table.insert(tensors, torch.CudaTensor(128, 128):uniform())
+        end
+        -- across dim = 2 (column-wise concatenation)
+        mx = torch.cat(tensors, 2)
+        offset = 1
+        for i = 1, batchSize do
+            tester:assertTensorEq(mx:narrow(2, offset, tensors[i]:size(2)), tensors[i], 0, 'torch.catArrayBatched value')
+            offset = offset + tensors[i]:size(2)
+        end
+    end
+
+    -- one giant copy
+    local a = torch.CudaTensor(4096, 4096):uniform()
+    local b = torch.CudaTensor(4096, 4096):uniform()
+    local mx = torch.cat({a, b}, 1)
+    tester:assertTensorEq(mx:narrow(1, 1, 4096), a, 0, 'torch.catArrayBatched value')
+    tester:assertTensorEq(mx:narrow(1, 4097, 4096), b, 0, 'torch.catArrayBatched value')
+
+    -- output Tensor is non-contiguous
+    local notcontig = torch.CudaTensor(5, 4):t():uniform()
+    local a = torch.CudaTensor(2, 5):uniform()
+    local b = torch.CudaTensor(1, 5):uniform()
+    local c = torch.CudaTensor(1, 5):uniform()
+
+    torch.cat(notcontig, {a, b, c}, 1)
+    tester:assertTensorEq(notcontig:narrow(1, 1, 2), a, 0, 'torch.catArrayBatched value')
+    tester:assertTensorEq(notcontig:narrow(1, 3, 1), b, 0, 'torch.catArrayBatched value')
+    tester:assertTensorEq(notcontig:narrow(1, 4, 1), c, 0, 'torch.catArrayBatched value')
+end
+
+function test.streamWaitFor()
+   local size = 2000000
+   local iter = 20 + torch.random(10)
+   local result = torch.CudaTensor(size):zero()
+   local numStreams = torch.random(10)
+
+   cutorch.reserveStreams(numStreams + 1)
+   local tensors = {}
+   local waitingFor = {}
+
+   for stream = 1, numStreams do
+      cutorch.setStream(stream)
+      table.insert(waitingFor, stream)
+      table.insert(tensors, torch.CudaTensor(size):zero())
+   end
+
+   -- Queue a bunch of work on different streams
+   for i = 1, iter do
+      for stream = numStreams, 1, -1 do
+         cutorch.setStream(stream)
+         tensors[stream]:add(1)
+      end
+   end
+
+   -- In another stream, wait on the completion of all the above.
+   -- Without the streamWaitFor, this will race with the above and won't
+   -- gather all of the additions.
+   -- Unfortunately, it would be rather hard to write a test to ensure that
+   -- we're actually executing all this asynchronously, and to write a test that
+   -- always guarantees failure with this race is equally problematic.
+   -- So, we satisfy ourselves with this.
+   cutorch.setStream(numStreams + 1)
+   cutorch.streamWaitFor(numStreams + 1, waitingFor)
+
+   for i = 1, numStreams do
+      result:add(tensors[i])
+   end
+
+   tester:asserteq(result:min(), iter * numStreams)
+
+   -- return to default stream
+   cutorch.setStream(0)
+   result = nil
+   tensors = nil
+   collectgarbage()
+   collectgarbage()
+   cutorch.synchronize()
+end
+
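+-- A minimal sketch (not registered with the tester) of the producer/consumer
+-- ordering that test.streamWaitFor checks: work is queued on stream 1, then
+-- stream 2 waits on stream 1 before consuming the result. Sizes and stream
+-- ids here are arbitrary.
+local function sketchStreamWaitFor()
+   cutorch.reserveStreams(2)
+   cutorch.setStream(1)
+   local produced = torch.CudaTensor(1024):fill(1)  -- queued on stream 1
+   cutorch.setStream(2)
+   cutorch.streamWaitFor(2, {1})                    -- stream 2 waits for stream 1
+   local consumed = produced:clone()                -- ordered after the fill
+   cutorch.setStream(0)                             -- back to the default stream
+   return consumed
+end
+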
+function test.streamWaitForMultiDevice()
+   -- This test requires multiple devices
+   local numDevices = cutorch.getDeviceCount()
+   if numDevices < 2 then
+      return
+   end
+
+   local size = 2000000
+   local iter = 80 + torch.random(10)
+   local numStreams = torch.random(10)
+   cutorch.reserveStreams(numStreams + 1)
+
+   -- Create scratch space on the last device to receive all results
+   -- `tmpResults` and `results` will be operated on in `numStreams + 1`
+   cutorch.setDevice(numDevices)
+   cutorch.setStream(numStreams + 1)
+   local tmpResults = {}
+   local results = torch.CudaTensor(size):zero()
+
+   for dev = 1, numDevices - 1 do
+      local tmpResultsPerDevice = {}
+      for stream = 1, numStreams do
+         table.insert(tmpResultsPerDevice, torch.CudaTensor(size):zero())
+      end
+
+      table.insert(tmpResults, tmpResultsPerDevice)
+   end
+
+   -- In order to test isolating the one-way barrier below, sync all the work
+   -- above so we know the `zero()` is complete.
+   cutorch.streamSynchronize(numStreams + 1)
+
+   -- Allocate data on all devices (except the last)
+   local tensors = {}
+
+   for dev = 1, numDevices - 1 do
+      cutorch.setDevice(dev)
+      local tensorsPerDevice = {}
+
+      for stream = 1, numStreams do
+         cutorch.setStream(stream)
+         table.insert(tensorsPerDevice, torch.CudaTensor(size):zero())
+      end
+
+      table.insert(tensors, tensorsPerDevice)
+   end
+
+   -- Queue work to all streams, all devices (except the last)
+   for i = 1, iter do
+      for dev = 1, numDevices - 1 do
+         cutorch.setDevice(dev)
+         for stream = 1, numStreams do
+            cutorch.setStream(stream)
+            tensors[dev][stream]:add(1)
+         end
+      end
+   end
+
+   -- Copy back to device `numDevices`
+   for dev = 1, numDevices - 1 do
+      cutorch.setDevice(dev)
+      for stream = 1, numStreams do
+         cutorch.setStream(stream)
+
+         -- These copies will be ordered in the source stream (dev, stream), but
+         -- tmpResults is on device `numDevices`.
+         tmpResults[dev][stream]:copy(tensors[dev][stream])
+
+         -- We will wait on the above copy to complete in the dest too
+         cutorch.streamWaitForMultiDevice(numDevices, numStreams + 1, {[dev]={stream}})
+
+         -- Note that because the copy is ordered in (dev, stream), we are free
+         -- to modify the value after issuing the above copy.
+         tensors[dev][stream]:zero()
+      end
+   end
+
+   -- Sum up the results
+   cutorch.setDevice(numDevices)
+   cutorch.setStream(numStreams + 1)
+
+   for dev = 1, numDevices - 1 do
+      for stream = 1, numStreams do
+         results:add(tmpResults[dev][stream])
+      end
+   end
+
+   tester:asserteq(results:min(), iter * numStreams * (numDevices - 1))
+
+   -- return to default device/stream
+   cutorch.setDevice(1)
+   cutorch.setStream(0)
+   results = nil
+   tmpResults = nil
+   tensors = nil
+   collectgarbage()
+   collectgarbage()
+   cutorch.synchronize()
+end
+
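+-- A minimal sketch (not registered with the tester) of the cross-device wait
+-- used above: a copy issued from (device 1, stream 1) into a tensor on
+-- device 2 is ordered in the source stream, so (device 2, stream 1) is told
+-- to wait on it before using the data. The third argument maps a source
+-- device index to the list of its streams to wait for.
+local function sketchStreamWaitForMultiDevice()
+   if cutorch.getDeviceCount() < 2 then return end
+   cutorch.reserveStreams(1)
+   cutorch.setDevice(2)
+   cutorch.setStream(1)
+   local dst = torch.CudaTensor(1024):zero()   -- lives on device 2
+   cutorch.setDevice(1)
+   cutorch.setStream(1)
+   local src = torch.CudaTensor(1024):fill(1)  -- lives on device 1
+   dst:copy(src)                               -- ordered in (device 1, stream 1)
+   cutorch.streamWaitForMultiDevice(2, 1, {[1] = {1}})
+   cutorch.setStream(0)
+end
+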
+function test.streamBarrier()
+   local size = 2000000
+   local iter = 20 + torch.random(10)
+   local numStreams = torch.random(10)
+
+   cutorch.reserveStreams(numStreams)
+   local tensors = {}
+   local results = {}
+   local waitingFor = {}
+
+   for stream = 1, numStreams do
+      cutorch.setStream(stream)
+      table.insert(waitingFor, stream)
+      table.insert(tensors, torch.CudaTensor(size):zero())
+      table.insert(results, torch.CudaTensor(size):zero())
+   end
+
+   -- Queue a bunch of work on different streams
+   for stream = numStreams, 1, -1 do
+      cutorch.setStream(stream)
+      for i = 1, iter do
+         tensors[stream]:add(1)
+      end
+   end
+
+   -- Create an all-way barrier
+   cutorch.streamBarrier(waitingFor)
+
+   -- In all streams, sum against all other tensors
+   for stream = 1, numStreams do
+      cutorch.setStream(stream)
+      for otherStream = 1, numStreams do
+         results[stream]:add(tensors[otherStream])
+      end
+   end
+
+   -- Validate that all streams received the full values
+   -- As above, it would be rather hard to write a test to ensure that
+   -- we're actually executing all this asynchronously, and to write a test that
+   -- always guarantees failure with this race is equally problematic.
+   -- So, we satisfy ourselves with this.
+   for stream = 1, numStreams do
+      cutorch.setStream(stream)
+      tester:asserteq(results[stream]:min(), iter * numStreams)
+   end
+
+   -- return to default stream
+   cutorch.setStream(0)
+   results = nil
+   tensors = nil
+   collectgarbage()
+   collectgarbage()
+   cutorch.synchronize()
+end
+
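+-- A minimal sketch (not registered with the tester) of the all-way barrier
+-- used above: every listed stream blocks until the work previously queued on
+-- all listed streams has completed.
+local function sketchStreamBarrier()
+   cutorch.reserveStreams(2)
+   local t = {}
+   for stream = 1, 2 do
+      cutorch.setStream(stream)
+      t[stream] = torch.CudaTensor(1024):fill(stream)
+   end
+   cutorch.streamBarrier({1, 2})   -- both streams now see both fills
+   cutorch.setStream(1)
+   local sum = t[1] + t[2]         -- safe to read the tensor filled on stream 2
+   cutorch.setStream(0)
+   return sum
+end
+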
+function test.streamBarrierMultiDevice()
+   -- This test requires multiple devices
+   local numDevices = cutorch.getDeviceCount()
+   if numDevices < 2 then
+      return
+   end
+
+   local size = 2000000
+   local iter = 50 + torch.random(10)
+   local numStreams = torch.random(10)
+   cutorch.reserveStreams(numStreams)
+
+   local tensors = {} -- per device, per stream
+   local tmpResults = {} -- per device, (per other device, per other stream)
+   local results = {} -- per device
+   local waitingFor = {}
+
+   -- Create space on all devices
+   for device = 1, numDevices do
+      cutorch.setDevice(device)
+      cutorch.setStream(1)
+      table.insert(results, torch.CudaTensor(size):zero())
+
+      -- tmpResults[our device][other device][other stream]
+      local tmpResultsPerDevice = {}
+      for otherDevice = 1, numDevices do
+         local tmpResultsPerOtherDevice = {}
+         for otherStream = 1, numStreams do
+            table.insert(tmpResultsPerOtherDevice, torch.CudaTensor(size):zero())
+         end
+         table.insert(tmpResultsPerDevice, tmpResultsPerOtherDevice)
+      end
+      table.insert(tmpResults, tmpResultsPerDevice)
+
+      -- tensors[our device][our stream]
+      local tensorsPerDevice = {}
+      local waitingForPerDevice = {}
+      for stream = 1, numStreams do
+         cutorch.setStream(stream)
+         table.insert(tensorsPerDevice, torch.CudaTensor(size):zero())
+         table.insert(waitingForPerDevice, stream)
+      end
+
+      table.insert(tensors, tensorsPerDevice)
+      table.insert(waitingFor, waitingForPerDevice)
+   end
+
+   -- Queue work to all streams, all devices
+   for i = 1, iter do
+      for dev = 1, numDevices do
+         cutorch.setDevice(dev)
+         for stream = 1, numStreams do
+            cutorch.setStream(stream)
+            tensors[dev][stream]:add(1)
+         end
+      end
+   end
+
+   -- Create an all-way barrier
+   cutorch.streamBarrierMultiDevice(waitingFor)
+
+   -- All-to-all copy (done in stream 1 on each device)
+   for dev = 1, numDevices do
+      cutorch.setDevice(dev)
+      cutorch.setStream(1)
+
+      for otherDev = 1, numDevices do
+         for otherStream = 1, numStreams do
+            -- This copy is ordered in the source (otherDev, stream 1)
+            -- which produced the value.
+            -- (dev, stream 1) on all devices is complete due to the all-way
+            -- barrier above.
+            tmpResults[dev][otherDev][otherStream]:copy(tensors[otherDev][otherStream])
+         end
+      end
+   end
+
+   -- For each device in stream 1, sum up the accumulated results from
+   -- all devices/all streams
+   for dev = 1, numDevices do
+      cutorch.setDevice(dev)
+      cutorch.setStream(1)
+
+      for otherDev = 1, numDevices do
+         for otherStream = 1, numStreams do
+            -- Since the copy above is ordered in stream (otherDev, 1),
+            -- we need to wait for its completion
+            if dev ~= otherDev then
+               cutorch.streamWaitForMultiDevice(dev, 1, {[otherDev]={1}})
+            end
+
+            results[dev]:add(tmpResults[dev][otherDev][otherStream])
+         end
+      end
+   end
+
+   -- Validate that all devices received the full values
+   -- As above, it would be rather hard to write a test to ensure that
+   -- we're actually executing all this asynchronously, and to write a test that
+   -- always guarantees failure with this race is equally problematic.
+   -- So, we satisfy ourselves with this.
+   for dev = 1, numDevices do
+      cutorch.setDevice(dev)
+      cutorch.setStream(1)
+      tester:asserteq(results[dev]:min(), iter * numStreams * numDevices)
+   end
+
+   -- return to default stream/device
+   cutorch.setDevice(1)
+   cutorch.setStream(0)
+   results = nil
+   tmpResults = nil
+   tensors = nil
+   collectgarbage()
+   collectgarbage()
+   cutorch.synchronize()
+end
+
+function test.cudaEvent()
+   cutorch.reserveStreams(2)
+   cutorch.setStream(1)
+
+   local t1 = torch.CudaTensor(100000000):zero()
+   local t2 = torch.CudaTensor(1):zero()
+
+   local t1View = t1:narrow(1, 100000000, 1)
+   t1:fill(1)
+   -- Event is created here
+   local event = cutorch.Event()
+
+   cutorch.setStream(2)
+
+   -- assert below will fail without this
+   event:waitOn()
+   t2:copy(t1View)
+   tester:asserteq(t2[1], 1)
+
+   -- revert to default stream
+   cutorch.setStream(0)
+end
+
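+-- A minimal sketch (not registered with the tester) of the event pattern
+-- above: cutorch.Event() records the point reached so far on the current
+-- stream, and waitOn() makes the new current stream wait for that point.
+local function sketchCudaEvent()
+   cutorch.reserveStreams(2)
+   cutorch.setStream(1)
+   local src = torch.CudaTensor(1024):fill(1)   -- queued on stream 1
+   local event = cutorch.Event()                -- recorded on stream 1
+   cutorch.setStream(2)
+   event:waitOn()                               -- stream 2 waits for the fill
+   local dst = torch.CudaTensor(1024):copy(src)
+   cutorch.setStream(0)
+   return dst
+end
+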
+function test.cudaHostTensor()
+  local t = cutorch.createCudaHostTensor(3, 4, 5)
+  tester:assertTableEq(t:size():totable(), {3, 4, 5})
+
+  local u = torch.Tensor(4, 5, 6)
+  local v = cutorch.createCudaHostTensor(u:size())
+  tester:assertTableEq(u:size():totable(), v:size():totable())
+
+  local w = cutorch.createCudaHostTensor()
+  tester:assert(w:storage() ~= nil, 'Empty CUDA host tensor must have a storage')
+  tester:asserteq(w:nElement(), 0, 'Expected an empty tensor')
+end
+
+function test.kernelP2PAccess()
+   -- We can only test direct kernel p2p access if we have multiple devices
+   -- and p2p enabled
+   if cutorch.getDeviceCount() < 2 then
+      return
+   end
+
+   if cutorch.getPeerToPeerAccess(1, 2) then
+      -- We should be on device 1 anyway, but just make sure
+      cutorch.setDevice(1)
+      local a = torch.CudaTensor(8):zero()
+      local b = nil
+      cutorch.withDevice(2, function() b = torch.CudaTensor(8):fill(1) end)
+
+      local expected = false
+
+      -- a is on device 1, b is on device 2, so this is a kernel p2p access
+      local function tryAdd()
+         local ok, err = pcall(function() a:add(b) end)
+         tester:assert(ok == expected)
+      end
+
+      -- By default, direct kernel p2p access should be an error
+      cutorch.setKernelPeerToPeerAccess(false)
+      cutorch.withDevice(1, tryAdd)
+      tester:asserteq(a:sum(), 0)
+
+      -- Now enable and try again
+      cutorch.setKernelPeerToPeerAccess(true)
+      expected = true
+      cutorch.withDevice(1, tryAdd)
+      tester:asserteq(a:sum(), 8)
+
+      a:zero()
+
+      -- Turn it back off and check again
+      cutorch.setKernelPeerToPeerAccess(false)
+      expected = false
+      cutorch.withDevice(1, tryAdd)
+      tester:asserteq(a:sum(), 0)
+   end
+end
+
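+-- A minimal sketch (not registered with the tester) of the kernel
+-- peer-to-peer toggle exercised above: with the flag enabled, a kernel
+-- running on device 1 may read a tensor resident on device 2; with it
+-- disabled, the same call raises an error.
+local function sketchKernelP2P()
+   if cutorch.getDeviceCount() < 2 or not cutorch.getPeerToPeerAccess(1, 2) then
+      return
+   end
+   cutorch.setDevice(1)
+   local a = torch.CudaTensor(8):zero()
+   local b
+   cutorch.withDevice(2, function() b = torch.CudaTensor(8):fill(1) end)
+   cutorch.setKernelPeerToPeerAccess(true)
+   a:add(b)                                  -- direct p2p access is now allowed
+   cutorch.setKernelPeerToPeerAccess(false)  -- restore the default
+   return a
+end
+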
+if os.getenv('THC_CACHING_ALLOCATOR') == '1' then
+   local function getCyclesPerMs()
+      cutorch.synchronize()
+      local t = torch.Timer()
+      cutorch._sleep(1e6)
+      cutorch.synchronize()
+      return 1e6 / (t:time().real * 1000)
+   end
+
+   function test.cachedPinnedMemory()
+      local cyclesPerMs = getCyclesPerMs()
+
+      -- check that allocations are re-used after deletion
+      t = cutorch.createCudaHostTensor({1})
+      ptr = t:data()
+      t = nil; collectgarbage()
+      t = cutorch.createCudaHostTensor({1})
+      tester:asserteq(t:data(), ptr, 'allocation not reused')
+
+      -- check that the allocation is not re-used if it's in-use by a copy
+      gpuTensor = torch.CudaTensor({0})
+      cutorch._sleep(50 * cyclesPerMs)  -- delay the copy
+      gpuTensor:copyAsync(t)
+      t = nil; collectgarbage()
+      t = cutorch.createCudaHostTensor({1})
+      tester:assertne(t:data(), ptr, 'allocation re-used too soon')
+   end
+end
+
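+-- A minimal sketch (not registered with the tester) of the pinned-memory
+-- transfer pattern the caching test above relies on: a CUDA host ("pinned")
+-- tensor can be copied to the GPU asynchronously, and cutorch.synchronize()
+-- fences the transfer before the host tensor is reused or freed.
+local function sketchPinnedCopy()
+   local host = cutorch.createCudaHostTensor(1024):fill(1)
+   local gpu = torch.CudaTensor(1024)
+   gpu:copyAsync(host)       -- asynchronous host-to-device copy
+   cutorch.synchronize()     -- safe to reuse or free `host` after this
+   return gpu
+end
+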
+-- Unfortunately, torch.Tester() lacks setUp and tearDown hooks, so we wrap each test manually.
+-- It would be nice to add them to torch.Tester() eventually.
+local function setUp()
+  cutorch.setDevice(1)
+  checkHalf()
+end
+
+local test_ = torch.TestSuite()
+for k,v in pairs(test) do
+  test_[k] = function()
+    setUp()
+    v()
+  end
+end
+test = test_
+
+local function initSeed(seed)
+   seed = seed or os.time()
+   -- ensure that you can reproduce a failing test
+   print('seed: ', seed)
+   math.randomseed(seed)
+   torch.manualSeed(seed)
+   cutorch.manualSeedAll(seed)
+end
+
+function cutorch.test(tests, seed)
+   initSeed(seed)
+   tester = torch.Tester()
+   tester:add(test)
+   tester:run(tests)
+end
+
+if runtests then
+   cutorch.test()
+   os.exit(#tester.errors == 0 and 0 or 1)
+end
+return test
diff --git a/test/test_shutdown.lua b/test/test_shutdown.lua
new file mode 100644
index 0000000..750df06
--- /dev/null
+++ b/test/test_shutdown.lua
@@ -0,0 +1,64 @@
+local Threads = require 'threads'
+require 'cutorch'
+
+local function test_cudaEvent()
+   cutorch.reserveStreams(2)
+   cutorch.setStream(1)
+
+   local t1 = torch.CudaTensor(10000000):zero()
+   local t2 = torch.CudaTensor(1):zero()
+
+   local t1View = t1:narrow(1, 10000000, 1)
+   t1:fill(1)
+
+   -- Event is created here
+   local event = cutorch.Event()
+
+   cutorch.setStream(2)
+
+   -- assert below will fail without this
+   event:waitOn()
+   t2:copy(t1View)
+
+   -- revert to default stream
+   cutorch.setStream(0)
+end
+
+local Gig = 1024*1024*1024
+
+local function test_getMemInfo()
+   local sz = Gig*0.1
+   local t1 = torch.CudaTensor(sz):zero()
+   print('Memory usage after 1st allocation [free memory], [total memory]')
+   local free, total = cutorch.getMemoryUsage()
+   print(free/Gig, total/Gig)
+   local t2 = torch.CudaTensor(sz*1.3):zero()
+   print('Memory usage after 2nd allocation [free memory], [total memory]')
+   local free, total = cutorch.getMemoryUsage()
+   print(free/Gig, total/Gig)
+   t1 = nil
+   collectgarbage()
+   print('Memory usage after 1st deallocation [free memory], [total memory]')
+   local free, total = cutorch.getMemoryUsage()
+   print(free/Gig, total/Gig)
+   t2 = nil
+   collectgarbage()
+   print('Memory usage after 2nd deallocation [free memory], [total memory]')
+   free, total = cutorch.getMemoryUsage()
+   print(free/Gig, total/Gig)
+end
+
+print ("cutorch.hasHalf is ", cutorch.hasHalf)
+print('Memory usage before initialization of threads [free memory], [total memory]')
+local free, total = cutorch.getMemoryUsage()
+print(free/Gig, total/Gig)
+threads = Threads(20, function() require 'cutorch'; test_getMemInfo(); test_cudaEvent(); end)
+print('Memory usage after initialization of threads [free memory], [total memory]')
+free, total = cutorch.getMemoryUsage()
+print(free/Gig, total/Gig)
+threads:terminate()
+collectgarbage()
+print('Memory usage after termination of threads [free memory], [total memory]')
+free, total = cutorch.getMemoryUsage()
+print(free/Gig, total/Gig)
+
diff --git a/torch/generic/Storage.c b/torch/generic/Storage.c
new file mode 100644
index 0000000..deb89be
--- /dev/null
+++ b/torch/generic/Storage.c
@@ -0,0 +1,280 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Storage.c"
+#else
+
+static int torch_Storage_(new)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCStorage *storage;
+  if(lua_type(L, 1) == LUA_TSTRING)
+  {
+    const char *fileName = luaL_checkstring(L, 1);
+    int isShared = luaT_optboolean(L, 2, 0);
+    ptrdiff_t size = luaL_optinteger(L, 3, 0);
+    storage = THCStorage_(newWithMapping)(state, fileName, size, isShared);
+  }
+  else if(lua_type(L, 1) == LUA_TTABLE)
+  {
+    ptrdiff_t size = lua_objlen(L, 1);
+    ptrdiff_t i;
+    storage = THCStorage_(newWithSize)(state, size);
+    for(i = 1; i <= size; i++)
+    {
+      lua_rawgeti(L, 1, i);
+      if(!lua_isnumber(L, -1))
+      {
+        THCStorage_(free)(state, storage);
+        luaL_error(L, "element at index %d is not a number", i);
+      }
+#ifdef THC_REAL_IS_HALF
+      half v = THC_float2half((float) lua_tonumber(L, -1));
+      THCStorage_(set)(state, storage, i-1, v);
+#else
+      THCStorage_(set)(state, storage, i-1, (real)lua_tonumber(L, -1));
+#endif
+      lua_pop(L, 1);
+    }
+  }
+  else if(lua_type(L, 1) == LUA_TUSERDATA)
+  {
+    THCStorage *src = luaT_checkudata(L, 1, torch_Storage);
+    real *ptr = src->data;
+    ptrdiff_t offset = luaL_optinteger(L, 2, 1) - 1;
+    if (offset < 0 || offset >= src->size) {
+      luaL_error(L, "offset out of bounds");
+    }
+    ptrdiff_t size = luaL_optinteger(L, 3, src->size - offset);
+    if (size < 1 || size > (src->size - offset)) {
+      luaL_error(L, "size out of bounds");
+    }
+    storage = THCStorage_(newWithData)(state, ptr + offset, size);
+    storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_VIEW;
+    storage->view = src;
+    THCStorage_(retain)(state, storage->view);
+  }
+  else if(lua_type(L, 2) == LUA_TNUMBER)
+  {
+    ptrdiff_t size = luaL_optinteger(L, 1, 0);
+    real *ptr = (real *)luaL_optinteger(L, 2, 0);
+    storage = THCStorage_(newWithData)(state, ptr, size);
+    storage->flag = TH_STORAGE_REFCOUNTED;
+  }
+  else
+  {
+    ptrdiff_t size = luaL_optinteger(L, 1, 0);
+    storage = THCStorage_(newWithSize)(state, size);
+  }
+  luaT_pushudata(L, storage, torch_Storage);
+  return 1;
+}
+
+static int torch_Storage_(retain)(lua_State *L)
+{
+  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  THCStorage_(retain)(cutorch_getstate(L), storage);
+  return 0;
+}
+
+static int torch_Storage_(free)(lua_State *L)
+{
+  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  THCStorage_(free)(cutorch_getstate(L), storage);
+  return 0;
+}
+
+static int torch_Storage_(resize)(lua_State *L)
+{
+  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  ptrdiff_t size = luaL_checkinteger(L, 2);
+/*  int keepContent = luaT_optboolean(L, 3, 0); */
+  THCStorage_(resize)(cutorch_getstate(L), storage, size);/*, keepContent); */
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_Storage_(copy)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  void *src;
+  if( (src = luaT_toudata(L, 2, torch_Storage)) )
+    THCStorage_(copy)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) )
+    THCStorage_(copyByte)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) )
+    THCStorage_(copyChar)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) )
+    THCStorage_(copyShort)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) )
+    THCStorage_(copyInt)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) )
+    THCStorage_(copyLong)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) )
+    THCStorage_(copyFloat)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
+    THCStorage_(copyDouble)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.HalfStorage")) )
+    THCStorage_(copyHalf)(state, storage, src);
+  else
+    luaL_typerror(L, 2, "torch.*Storage");
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_Storage_(fill)(lua_State *L)
+{
+  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+#ifdef THC_REAL_IS_HALF
+  half value = THC_float2half((float) luaL_checknumber(L, 2));
+#else
+  real value = (real) luaL_checknumber(L, 2);
+#endif
+  THCStorage_(fill)(cutorch_getstate(L), storage, value);
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_Storage_(elementSize)(lua_State *L)
+{
+  lua_pushnumber(L, THCStorage_(elementSize)(cutorch_getstate(L)));
+  return 1;
+}
+
+static int torch_Storage_(__len__)(lua_State *L)
+{
+  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  lua_pushinteger(L, storage->size);
+  return 1;
+}
+
+static int torch_Storage_(__newindex__)(lua_State *L)
+{
+  if(lua_isnumber(L, 2))
+  {
+    THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+    ptrdiff_t index = luaL_checkinteger(L, 2) - 1;
+    double number = luaL_checknumber(L, 3);
+
+#ifdef THC_REAL_IS_HALF
+    half value = THC_float2half((float) number);
+#else
+    real value = (real) number;
+#endif
+    THCStorage_(set)(cutorch_getstate(L), storage, index, value);
+    lua_pushboolean(L, 1);
+  }
+  else
+    lua_pushboolean(L, 0);
+
+  return 1;
+}
+
+static int torch_Storage_(__index__)(lua_State *L)
+{
+  if(lua_isnumber(L, 2))
+  {
+    THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+    ptrdiff_t index = luaL_checkinteger(L, 2) - 1;
+    real v = THCStorage_(get)(cutorch_getstate(L), storage, index);
+
+#ifdef THC_REAL_IS_HALF
+    double value = THC_half2float(v);
+#else
+    double value = (double) v;
+#endif
+
+    lua_pushnumber(L, value);
+    lua_pushboolean(L, 1);
+    return 2;
+  }
+  else
+  {
+    lua_pushboolean(L, 0);
+    return 1;
+  }
+}
+
+static int torch_Storage_(totable)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  ptrdiff_t i;
+
+  /* Copy storage from device to host. */
+  THStorage *host_storage =
+      THStorage_(newWithSize)(THCStorage_(size)(state, storage));
+  THStorage_(copyCuda)(state, host_storage, storage);
+
+  lua_newtable(L);
+  for(i = 0; i < storage->size; i++)
+  {
+#ifndef THC_REAL_IS_HALF
+    lua_pushnumber(L, (lua_Number)host_storage->data[i]);
+#else
+    lua_pushnumber(L, (lua_Number)TH_half2float(host_storage->data[i]));
+#endif
+    lua_rawseti(L, -2, i+1);
+  }
+  THStorage_(free)(host_storage);
+  return 1;
+}
+
+static int torch_Storage_(factory)(lua_State *L)
+{
+  THCStorage *storage = THCStorage_(new)(cutorch_getstate(L));
+  luaT_pushudata(L, storage, torch_Storage);
+  return 1;
+}
+
+static int torch_Storage_(write)(lua_State *L)
+{
+  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  THFile *file = luaT_checkudata(L, 2, "torch.File");
+
+#ifdef _MSC_VER
+  THAssert(storage->size < LONG_MAX);
+#endif
+  THFile_writeLongScalar(file, storage->size);
+  THFile_writeRealRaw(file, storage->data, storage->size);
+
+  return 0;
+}
+
+static int torch_Storage_(read)(lua_State *L)
+{
+  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  THFile *file = luaT_checkudata(L, 2, "torch.File");
+  long size = THFile_readLongScalar(file);
+
+  THCStorage_(resize)(cutorch_getstate(L), storage, size);
+  THFile_readRealRaw(file, storage->data, storage->size);
+
+  return 0;
+}
+
+static const struct luaL_Reg torch_Storage_(_) [] = {
+  {"retain", torch_Storage_(retain)},
+  {"free", torch_Storage_(free)},
+  {"size", torch_Storage_(__len__)},
+  {"elementSize", torch_Storage_(elementSize)},
+  {"__len__", torch_Storage_(__len__)},
+  {"__newindex__", torch_Storage_(__newindex__)},
+  {"__index__", torch_Storage_(__index__)},
+  {"resize", torch_Storage_(resize)},
+  {"fill", torch_Storage_(fill)},
+  {"copy", torch_Storage_(copy)},
+  {"totable", torch_Storage_(totable)},
+  {"write", torch_Storage_(write)},
+  {"read", torch_Storage_(read)},
+  {NULL, NULL}
+};
+
+void torch_Storage_(init)(lua_State *L)
+{
+  luaT_newmetatable(L, torch_Storage, NULL,
+                    torch_Storage_(new), torch_Storage_(free), torch_Storage_(factory));
+  luaL_setfuncs(L, torch_Storage_(_), 0);
+  lua_pop(L, 1);
+}
+
+#endif
diff --git a/torch/generic/Tensor.c b/torch/generic/Tensor.c
new file mode 100644
index 0000000..dbe27ab
--- /dev/null
+++ b/torch/generic/Tensor.c
@@ -0,0 +1,1440 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Tensor.c"
+#else
+
+#include "THCHalf.h"
+
+static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index, int allowNone, int allowTensor, int allowStorage, int allowStride,
+                                                         THCStorage **storage_, ptrdiff_t *storageOffset_, THLongStorage **size_, THLongStorage **stride_);
+
+static void torch_Tensor_(c_readSizeStride)(lua_State *L, int index, int allowStride, THLongStorage **size_, THLongStorage **stride_);
+
+static int torch_Tensor_(size)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  if(lua_isnumber(L,2))
+  {
+    int dim = luaL_checkint(L, 2)-1;
+    luaL_argcheck(L, dim >= 0 && dim < tensor->nDimension, 2, "out of range");
+    lua_pushnumber(L, tensor->size[dim]);
+  }
+  else
+  {
+    THLongStorage *storage = THLongStorage_newWithSize(tensor->nDimension);
+    memmove(storage->data, tensor->size, sizeof(long)*tensor->nDimension);
+    luaT_pushudata(L, storage, "torch.LongStorage");
+  }
+  return 1;
+}
+
+static int torch_Tensor_(elementSize)(lua_State *L)
+{
+  lua_pushnumber(L, THCStorage_(elementSize)(cutorch_getstate(L)));
+  return 1;
+}
+
+static int torch_Tensor_(stride)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  if(lua_isnumber(L,2))
+  {
+    int dim = luaL_checkint(L, 2)-1;
+    luaL_argcheck(L, dim >= 0 && dim < tensor->nDimension, 2, "out of range");
+    lua_pushnumber(L, tensor->stride[dim]);
+  }
+  else
+  {
+    THLongStorage *storage = THLongStorage_newWithSize(tensor->nDimension);
+    memmove(storage->data, tensor->stride, sizeof(long)*tensor->nDimension);
+    luaT_pushudata(L, storage, "torch.LongStorage");
+  }
+  return 1;
+}
+
+static int torch_Tensor_(nDimension)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  lua_pushnumber(L, tensor->nDimension);
+  return 1;
+}
+
+static int torch_Tensor_(storage)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  if(tensor->storage)
+  {
+    THCStorage_(retain)(cutorch_getstate(L), tensor->storage);
+    luaT_pushudata(L, tensor->storage, torch_Storage);
+  }
+  else
+    lua_pushnil(L);
+
+  return 1;
+}
+
+static int torch_Tensor_(storageOffset)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  lua_pushinteger(L, tensor->storageOffset+1);
+  return 1;
+}
+
+static int torch_Tensor_(new)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCTensor *tensor;
+  ptrdiff_t storageOffset;
+  THLongStorage *size, *stride;
+
+  if(lua_type(L, 1) == LUA_TTABLE)
+  {
+    ptrdiff_t i, j;
+    THLongStorage *counter;
+    ptrdiff_t si = 0;
+    int dimension = 0;
+    int is_finished = 0;
+
+    lua_settop(L, 1);
+    size = THLongStorage_new();
+
+    while( (lua_type(L, -1) == LUA_TTABLE) && (lua_objlen(L, -1) > 0) )
+    {
+      THLongStorage_resize(size, dimension+1);
+      size->data[dimension] = lua_objlen(L, -1);
+      dimension++;
+      lua_rawgeti(L, -1, 1);
+    }
+    lua_pop(L, 1);
+
+    counter = THLongStorage_newWithSize(size->size);
+    THLongStorage_fill(counter, 0);
+
+    tensor = THCTensor_(newWithSize)(state, size, NULL);
+
+    if(size->size == 0)
+      is_finished = 1;
+
+    while(!is_finished)
+    {
+      if(!lua_istable(L, -1))
+      {
+        THLongStorage_free(size);
+        THLongStorage_free(counter);
+        THCTensor_(free)(state, tensor);
+        luaL_error(L, "invalid tensor definition");
+      }
+
+      if(lua_objlen(L, -1) != size->data[size->size-1])
+      {
+        THLongStorage_free(size);
+        THLongStorage_free(counter);
+        THCTensor_(free)(state, tensor);
+        luaL_error(L, "invalid tensor sizes");
+      }
+
+      for(i = 0; i < size->data[size->size-1]; i++)
+      {
+        lua_rawgeti(L, -1, i+1);
+        if(!lua_isnumber(L, -1))
+        {
+          THLongStorage_free(size);
+          THLongStorage_free(counter);
+          THCTensor_(free)(state, tensor);
+          luaL_error(L, "invalid element (not a number)");
+        }
+
+#ifdef THC_REAL_IS_HALF
+        half value = THC_float2half((float) lua_tonumber(L, -1));
+#else
+        real value = (real) lua_tonumber(L, -1);
+#endif
+
+        THCStorage_(set)(state, THCTensor_(storage)(state, tensor), si++, value);
+        lua_pop(L, 1);
+      }
+
+      if(size->size == 1)
+        break;
+
+      for(i = size->size-2; i >= 0; i--)
+      {
+        if(++counter->data[i] == size->data[i])
+        {
+          if(i == 0)
+          {
+            is_finished = 1;
+            break;
+          }
+          else
+          {
+            counter->data[i] = 0;
+            lua_pop(L, 1);
+          }
+        }
+        else
+        {
+          lua_pop(L, 1);
+          for(j = i; j < size->size-1; j++)
+          {
+            if(!lua_istable(L, -1))
+            {
+              THLongStorage_free(size);
+              THLongStorage_free(counter);
+              THCTensor_(free)(state, tensor);
+              luaL_error(L, "invalid tensor definition");
+            }
+            if(lua_objlen(L, -1) != size->data[j])
+            {
+              THLongStorage_free(size);
+              THLongStorage_free(counter);
+              THCTensor_(free)(state, tensor);
+              luaL_error(L, "invalid tensor sizes");
+            }
+            lua_rawgeti(L, -1, counter->data[j]+1);
+          }
+          break;
+        }
+      }
+    }
+
+    THLongStorage_free(size);
+    THLongStorage_free(counter);
+  }
+  else
+  {
+    THCStorage *storage;
+
+    torch_Tensor_(c_readTensorStorageSizeStride)(L, 1, 1, 1, 1, 1,
+                                                 &storage, &storageOffset, &size, &stride);
+
+    tensor = THCTensor_(newWithStorage)(state, storage, storageOffset, size, stride);
+
+    THLongStorage_free(size);
+    THLongStorage_free(stride);
+  }
+
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+static int torch_Tensor_(set)(lua_State *L)
+{
+  THCTensor *self = luaT_checkudata(L, 1, torch_Tensor);
+  THCStorage *storage;
+  ptrdiff_t storageOffset;
+  THLongStorage *size, *stride;
+
+  torch_Tensor_(c_readTensorStorageSizeStride)(L, 2, 1, 1, 1, 1,
+                                               &storage, &storageOffset, &size, &stride);
+
+  THCTensor_(setStorage)(cutorch_getstate(L), self, storage, storageOffset, size, stride);
+
+  THLongStorage_free(size);
+  THLongStorage_free(stride);
+
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_Tensor_(clone)(lua_State *L)
+{
+  THCTensor *self = luaT_checkudata(L, 1, torch_Tensor);
+  self = THCTensor_(newClone)(cutorch_getstate(L), self);
+  luaT_pushudata(L, self, torch_Tensor);
+  return 1;
+}
+
+static int torch_Tensor_(contiguous)(lua_State *L)
+{
+  THCTensor *self = luaT_checkudata(L, 1, torch_Tensor);
+  self = THCTensor_(newContiguous)(cutorch_getstate(L), self);
+  luaT_pushudata(L, self, torch_Tensor);
+  return 1;
+}
+
+/* Resize */
+static int torch_Tensor_(resizeAs)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THCTensor *src = luaT_checkudata(L, 2, torch_Tensor);
+  THCTensor_(resizeAs)(cutorch_getstate(L), tensor, src);
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_Tensor_(resize)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THLongStorage *size, *stride;
+
+  torch_Tensor_(c_readSizeStride)(L, 2, 0, &size, &stride);
+
+  THCTensor_(resize)(cutorch_getstate(L), tensor, size, stride);
+
+  THLongStorage_free(size);
+  THLongStorage_free(stride);
+
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_Tensor_(narrow)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  int dimension = luaL_checkint(L, 2)-1;
+  long firstIndex = luaL_checklong(L, 3)-1;
+  long size = luaL_checklong(L, 4);
+
+/*  THArgCheck( (dimension >= 0) && (dimension < tensor->nDimension), 2, "out of range");
+  THArgCheck( (firstIndex >= 0) && (firstIndex < tensor->size[dimension]), 3, "out of range");
+  THArgCheck( (size > 0) && (firstIndex+size <= tensor->size[dimension]), 4, "out of range");
+*/
+  tensor = THCTensor_(newWithTensor)(state, tensor);
+  THCTensor_(narrow)(state, tensor, NULL, dimension, firstIndex, size);
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+static int torch_Tensor_(sub)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  long d0s = -1, d0e = -1, d1s = -1, d1e = -1, d2s = -1, d2e = -1, d3s = -1, d3e = -1;
+
+  d0s = luaL_checklong(L, 2)-1;
+  d0e = luaL_checklong(L, 3)-1;
+  if(d0s < 0)
+    d0s += tensor->size[0]+1;
+  if(d0e < 0)
+    d0e += tensor->size[0]+1;
+  luaL_argcheck(L, tensor->nDimension > 0, 2, "invalid dimension");
+  luaL_argcheck(L, d0s >= 0 && d0s < tensor->size[0], 2, "out of range");
+  luaL_argcheck(L, d0e >= 0 && d0e < tensor->size[0], 3, "out of range");
+  luaL_argcheck(L, d0e >= d0s, 3, "end smaller than beginning");
+
+  if(!lua_isnone(L, 4))
+  {
+    d1s = luaL_checklong(L, 4)-1;
+    d1e = luaL_checklong(L, 5)-1;
+    if(d1s < 0)
+      d1s += tensor->size[1]+1;
+    if(d1e < 0)
+      d1e += tensor->size[1]+1;
+    luaL_argcheck(L, tensor->nDimension > 1, 4, "invalid dimension");
+    luaL_argcheck(L, d1s >= 0 && d1s < tensor->size[1], 4, "out of range");
+    luaL_argcheck(L, d1e >= 0 && d1e < tensor->size[1], 5, "out of range");
+    luaL_argcheck(L, d1e >= d1s, 5, "end smaller than beginning");
+
+    if(!lua_isnone(L, 6))
+    {
+      d2s = luaL_checklong(L, 6)-1;
+      d2e = luaL_checklong(L, 7)-1;
+      if(d2s < 0)
+        d2s += tensor->size[2]+1;
+      if(d2e < 0)
+        d2e += tensor->size[2]+1;
+      luaL_argcheck(L, tensor->nDimension > 2, 6, "invalid dimension");
+      luaL_argcheck(L, d2s >= 0 && d2s < tensor->size[2], 6, "out of range");
+      luaL_argcheck(L, d2e >= 0 && d2e < tensor->size[2], 7, "out of range");
+      luaL_argcheck(L, d2e >= d2s, 7, "end smaller than beginning");
+
+      if(!lua_isnone(L, 8))
+      {
+        d3s = luaL_checklong(L, 8)-1;
+        d3e = luaL_checklong(L, 9)-1;
+        if(d3s < 0)
+          d3s += tensor->size[3]+1;
+        if(d3e < 0)
+          d3e += tensor->size[3]+1;
+        luaL_argcheck(L, tensor->nDimension > 3, 8, "invalid dimension");
+        luaL_argcheck(L, d3s >= 0 && d3s < tensor->size[3], 8, "out of range");
+        luaL_argcheck(L, d3e >= 0 && d3e < tensor->size[3], 9, "out of range");
+        luaL_argcheck(L, d3e >= d3s, 9, "end smaller than beginning");
+      }
+    }
+  }
+
+  tensor = THCTensor_(newWithTensor)(state, tensor);
+  THCTensor_(narrow)(state, tensor, NULL, 0, d0s, d0e-d0s+1);
+  if(d1s >= 0)
+    THCTensor_(narrow)(state, tensor, NULL, 1, d1s, d1e-d1s+1);
+  if(d2s >= 0)
+    THCTensor_(narrow)(state, tensor, NULL, 2, d2s, d2e-d2s+1);
+  if(d3s >= 0)
+    THCTensor_(narrow)(state, tensor, NULL, 3, d3s, d3e-d3s+1);
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+static int torch_Tensor_(select)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  int dimension = luaL_checkint(L, 2)-1;
+  long sliceIndex = luaL_checklong(L, 3)-1;
+
+  if(tensor->nDimension > 1)
+  {
+    tensor = THCTensor_(newWithTensor)(state, tensor);
+    THCTensor_(select)(state, tensor, NULL, dimension, sliceIndex);
+    luaT_pushudata(L, tensor, torch_Tensor);
+  }
+  else
+  {
+    THArgCheck(tensor->nDimension == 1, 1, "empty Tensor");
+    real v = THCTensor_(get1d)(state, tensor, sliceIndex);
+
+#ifdef THC_REAL_IS_HALF
+    double value = THC_half2float(v);
+#else
+    double value = (double) v;
+#endif
+
+    lua_pushnumber(L, value);
+  }
+
+  return 1;
+}
+
+static int torch_Tensor_(indexSelect)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  int narg = lua_gettop(L);
+  THCTensor *tensor, *src;
+  THCudaLongTensor *index;
+  THLongTensor *longIndex;
+  THCTensor *realIndex;
+  int dim;
+  if (narg == 3)
+  {
+    tensor = THCTensor_(new)(state);
+    src = luaT_checkudata(L, 1, torch_Tensor);
+    dim = luaL_checkint(L, 2) - 1;
+    index = luaT_toudata(L, 3, "torch.CudaLongTensor");
+    longIndex = luaT_toudata(L, 3, "torch.LongTensor");
+    realIndex = luaT_toudata(L, 3, torch_Tensor);
+    if (!index && !longIndex && !realIndex) luaT_typerror(L, 3, "CudaLongTensor | LongTensor | Tensor");
+    luaT_pushudata(L,tensor,torch_Tensor);
+  }
+  else if(narg == 4)
+  {
+    src = luaT_checkudata(L, 2, torch_Tensor);
+    dim = luaL_checkint(L, 3) - 1;
+    index = luaT_toudata(L, 4, "torch.CudaLongTensor");
+    longIndex = luaT_toudata(L, 4, "torch.LongTensor");
+    realIndex = luaT_toudata(L, 4, torch_Tensor);
+    if (!index && !longIndex && !realIndex) luaT_typerror(L, 4, "CudaLongTensor | LongTensor | Tensor");
+    tensor = luaT_checkudata(L,1,torch_Tensor);
+  }
+  else
+  {
+    luaL_error(L, "[Tensor,] Tensor, number, Tensor | LongTensor | CudaLongTensor expected");
+    return 0;
+  }
+
+  if (index)
+    THCTensor_(indexSelect)(state, tensor,src,dim,index);
+  else if (longIndex)
+    THCTensor_(indexSelect_long)(state, tensor,src,dim,longIndex);
+  else { // realIndex
+#if defined THC_REAL_IS_BYTE || defined THC_REAL_IS_CHAR
+    THError("indices are in a ByteTensor or CharTensor which is not allowed. "
+            "ByteTensor | CharTensor have a small range of indices to "
+            "hold [0 to 255], but Tensor indices might "
+            "usually be much larger.");
+#endif
+    THCState *state = cutorch_getstate(L);
+    index = THCudaLongTensor_new(state);
+    THLongStorage *indexSize = THCTensor_(newSizeOf)(state, realIndex);
+    THCudaLongTensor_resize(state, index, indexSize, NULL);
+    THLongStorage_free(indexSize);
+    TH_CONCAT_2(THCudaLongTensor_copyCuda, Real)(state, index, realIndex);
+    THCTensor_(indexSelect)(state, tensor, src, dim, index);
+    THCudaLongTensor_free(state, index);
+  }
+
+  return 1;
+}
+
+static int torch_Tensor_(indexCopy)(lua_State *L)
+{
+  int narg = lua_gettop(L);
+  THCTensor *tensor, *src;
+  THCudaLongTensor *index;
+  THLongTensor *longIndex;
+  THCTensor *realIndex;
+  int dim;
+  if(narg == 4)
+  {
+    dim = luaL_checkint(L, 2) - 1;
+    index = luaT_toudata(L, 3, "torch.CudaLongTensor");
+    longIndex = luaT_toudata(L, 3, "torch.LongTensor");
+    realIndex = luaT_toudata(L, 3, torch_Tensor);
+    if (!index && !longIndex && !realIndex) luaT_typerror(L, 3, "CudaLongTensor | LongTensor | Tensor");
+    src = luaT_checkudata(L, 4, torch_Tensor);
+    tensor = luaT_checkudata(L,1,torch_Tensor);
+  }
+  else
+  {
+    luaL_error(L,"Tensor, number, Tensor | LongTensor, Tensor expected");
+    return 0;
+  }
+  THCState *state = cutorch_getstate(L);
+  if (index)
+    THCTensor_(indexCopy)(state, tensor,dim,index,src);
+  else if (longIndex)
+    THCTensor_(indexCopy_long)(state, tensor,dim,longIndex,src);
+  else { // realIndex
+#if defined THC_REAL_IS_BYTE || defined THC_REAL_IS_CHAR
+    THError("indices are in a ByteTensor or CharTensor which is not allowed. "
+            "ByteTensor | CharTensor have a small range of indices to "
+            "hold [0 to 255], but Tensor indices might "
+            "usually be much larger.");
+#endif
+    index = THCudaLongTensor_new(state);
+    THLongStorage *indexSize = THCTensor_(newSizeOf)(state, realIndex);
+    THCudaLongTensor_resize(state, index, indexSize, NULL);
+    THLongStorage_free(indexSize);
+    TH_CONCAT_2(THCudaLongTensor_copyCuda, Real)(state, index, realIndex);
+    THCTensor_(indexCopy)(state, tensor,dim,index,src);
+    THCudaLongTensor_free(state, index);
+  }
+
+  return 1;
+}
+
+static int torch_Tensor_(indexAdd)(lua_State *L)
+{
+  int narg = lua_gettop(L);
+  THCTensor *tensor, *src;
+  THCudaLongTensor *index;
+  THLongTensor *longIndex;
+  THCTensor *realIndex;
+  int dim;
+  if(narg == 4)
+  {
+    dim = luaL_checkint(L, 2) - 1;
+    index = luaT_toudata(L, 3, "torch.CudaLongTensor");
+    longIndex = luaT_toudata(L, 3, "torch.LongTensor");
+    realIndex = luaT_toudata(L, 3, torch_Tensor);
+    if (!index && !longIndex && !realIndex) luaT_typerror(L, 3, "CudaLongTensor | LongTensor | Tensor");
+    src = luaT_checkudata(L, 4, torch_Tensor);
+    tensor = luaT_checkudata(L,1,torch_Tensor);
+  }
+  else
+  {
+    luaL_error(L,"Tensor, number, Tensor | LongTensor | CudaLongTensor, Tensor expected");
+    return 0;
+  }
+
+  if (index)
+    THCTensor_(indexAdd)(cutorch_getstate(L), tensor, dim, index, src);
+  else if (longIndex)
+    THCTensor_(indexAdd_long)(cutorch_getstate(L), tensor,dim,longIndex,src);
+  else { // realIndex
+#if defined THC_REAL_IS_BYTE || defined THC_REAL_IS_CHAR
+    THError("indices are in a ByteTensor or CharTensor which is not allowed. "
+            "ByteTensor | CharTensor have a small range of indices to "
+            "hold [0 to 255], but Tensor indices might "
+            "usually be much larger.");
+#endif
+    THCState *state = cutorch_getstate(L);
+    index = THCudaLongTensor_new(state);
+    THLongStorage *indexSize = THCTensor_(newSizeOf)(state, realIndex);
+    THCudaLongTensor_resize(state, index, indexSize, NULL);
+    THLongStorage_free(indexSize);
+    TH_CONCAT_2(THCudaLongTensor_copyCuda, Real)(state, index, realIndex);
+    THCTensor_(indexAdd)(state, tensor, dim, index, src);
+    THCudaLongTensor_free(state, index);
+  }
+
+  return 1;
+}
+
+static int torch_Tensor_(indexFill)(lua_State *L)
+{
+  int narg = lua_gettop(L);
+  THCTensor *tensor;
+  THCudaLongTensor *index;
+  THLongTensor *longIndex;
+  THCTensor *realIndex;
+  real val;
+  int dim;
+  if(narg == 4)
+  {
+    dim = luaL_checkint(L, 2) - 1;
+    index = luaT_toudata(L, 3, "torch.CudaLongTensor");
+    longIndex = luaT_toudata(L, 3, "torch.LongTensor");
+    realIndex = luaT_toudata(L, 3, torch_Tensor);
+    if (!index && !longIndex && !realIndex) luaT_typerror(L, 3, "CudaLongTensor | LongTensor | Tensor");
+#ifdef THC_REAL_IS_HALF
+    val = THC_float2half((float)luaL_checknumber(L, 4));
+#else
+    val = luaL_checknumber(L, 4);
+#endif
+    tensor = luaT_checkudata(L,1,torch_Tensor);
+  }
+  else
+  {
+    luaL_error(L,"Tensor, number, Tensor | LongTensor | CudaLongTensor, number expected");
+    return 0;
+  }
+
+  if (index)
+    THCTensor_(indexFill)(cutorch_getstate(L), tensor, dim, index, val);
+  else if (longIndex)
+    THCTensor_(indexFill_long)(cutorch_getstate(L), tensor, dim, longIndex, val);
+  else { // realIndex
+#if defined THC_REAL_IS_BYTE || defined THC_REAL_IS_CHAR
+    THError("indices are in a ByteTensor or CharTensor which is not allowed. "
+            "ByteTensor | CharTensor have a small range of indices to "
+            "hold [0 to 255], but Tensor indices might "
+            "usually be much larger.");
+#endif
+    THCState *state = cutorch_getstate(L);
+    index = THCudaLongTensor_new(state);
+    THLongStorage *indexSize = THCTensor_(newSizeOf)(state, realIndex);
+    THCudaLongTensor_resize(state, index, indexSize, NULL);
+    THLongStorage_free(indexSize);
+    TH_CONCAT_2(THCudaLongTensor_copyCuda, Real)(state, index, realIndex);
+    THCTensor_(indexFill)(state, tensor, dim, index, val);
+    THCudaLongTensor_free(state, index);
+  }
+
+  return 1;
+}
+
+static int torch_Tensor_(transpose)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  int dimension1 = luaL_checkint(L, 2)-1;
+  int dimension2 = luaL_checkint(L, 3)-1;
+
+/*
+  THArgCheck( (dimension1 >= 0) && (dimension1 < src->nDimension), 2, "out of range");
+  THArgCheck( (dimension2 >= 0) && (dimension2 < src->nDimension), 3, "out of range");
+*/
+
+  tensor = THCTensor_(newWithTensor)(state, tensor);
+  THCTensor_(transpose)(state, tensor, NULL, dimension1, dimension2);
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+static int torch_Tensor_(t)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+
+  luaL_argcheck(L, tensor->nDimension == 2, 1, "Tensor must have 2 dimensions");
+
+  tensor = THCTensor_(newWithTensor)(state, tensor);
+  THCTensor_(transpose)(state, tensor, NULL, 0, 1);
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+static int torch_Tensor_(unfold)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  int dimension = luaL_checkint(L, 2)-1;
+  long size = luaL_checklong(L, 3);
+  long step = luaL_checklong(L, 4);
+
+/*
+  THArgCheck( (src->nDimension > 0), 1, "cannot unfold an empty tensor");
+  THArgCheck(dimension < src->nDimension, 2, "out of range");
+  THArgCheck(size <= src->size[dimension], 3, "out of range");
+*/
+
+  tensor = THCTensor_(newWithTensor)(state, tensor);
+  THCTensor_(unfold)(state, tensor, NULL, dimension, size, step);
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+/* is contiguous? [a bit like in TnXIterator] */
+static int torch_Tensor_(isContiguous)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  lua_pushboolean(L, THCTensor_(isContiguous)(cutorch_getstate(L), tensor));
+  return 1;
+}
+
+static int torch_Tensor_(isSize)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THLongStorage *size = luaT_checkudata(L, 2, "torch.LongStorage");
+  lua_pushboolean(L, THCTensor_(isSize)(cutorch_getstate(L), tensor, size));
+  return 1;
+}
+
+static int torch_Tensor_(isSetTo)(lua_State *L)
+{
+  THCTensor *self = luaT_checkudata(L, 1, torch_Tensor);
+  THCTensor *src = luaT_checkudata(L, 2, torch_Tensor);
+  lua_pushboolean(L, THCTensor_(isSetTo)(cutorch_getstate(L), self, src));
+  return 1;
+}
+
+static int torch_Tensor_(isSameSizeAs)(lua_State *L)
+{
+  THCTensor *self = luaT_checkudata(L, 1, torch_Tensor);
+  THCTensor *src = luaT_checkudata(L, 2, torch_Tensor);
+  lua_pushboolean(L, THCTensor_(isSameSizeAs)(cutorch_getstate(L), self, src));
+  return 1;
+}
+
+static int torch_Tensor_(nElement)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  lua_pushinteger(L, THCTensor_(nElement)(cutorch_getstate(L), tensor));
+  return 1;
+}
+
+static int torch_Tensor_(copy)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  void *src;
+  if( (src = luaT_toudata(L, 2, torch_Tensor)) )
+    THCTensor_(copy)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ByteTensor")) )
+    THCTensor_(copyByte)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CharTensor")) )
+    THCTensor_(copyChar)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ShortTensor")) )
+    THCTensor_(copyShort)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.IntTensor")) )
+    THCTensor_(copyInt)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.LongTensor")) )
+    THCTensor_(copyLong)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.FloatTensor")) )
+    THCTensor_(copyFloat)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.DoubleTensor")) )
+    THCTensor_(copyDouble)(state, tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.HalfTensor")) )
+    THCTensor_(copyHalf)(state, tensor, src);
+  else
+    luaL_typerror(L, 2, "torch.*Tensor");
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_Tensor_(__newindex__)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THLongStorage *idx = NULL;
+  THByteTensor *mask;
+  THCudaByteTensor *maskCuda;
+  THCTensor *maskCudaReal;
+
+  if(lua_isnumber(L, 2))
+  {
+    void *src;
+    long index = luaL_checklong(L,2)-1;
+    luaL_argcheck(L, tensor->nDimension > 0, 1, "empty tensor");
+    if (index < 0) index = tensor->size[0] + index + 1;
+
+    if (lua_isnumber(L,3)) {
+#ifdef THC_REAL_IS_HALF
+      half value = THC_float2half(luaL_checknumber(L, 3));
+#else
+      real value = (real)luaL_checknumber(L,3);
+#endif
+
+      if (tensor->nDimension == 1) {
+        luaL_argcheck(L, index >= 0 && index < tensor->size[0], 2, "out of range");
+        THCStorage_(set)(state, tensor->storage, tensor->storageOffset+index*tensor->stride[0], value);
+      } else {
+        tensor = THCTensor_(newWithTensor)(state, tensor);
+        THCTensor_(narrow)(state, tensor, NULL, 0, index, 1);
+        THCTensor_(fill)(state, tensor, value);
+        THCTensor_(free)(state, tensor);
+      }
+    } else if( (src = luaT_toudata(L, 3, torch_Tensor)) ) {
+      tensor = THCTensor_(newWithTensor)(state, tensor);
+      THCTensor_(narrow)(state, tensor, NULL, 0, index, 1);
+      THCTensor_(copy)(state, tensor, src);
+      THCTensor_(free)(state, tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.ByteTensor")) ) {
+      tensor = THCTensor_(newWithTensor)(state, tensor);
+      THCTensor_(narrow)(state, tensor, NULL, 0, index, 1);
+      THCTensor_(copyByte)(state, tensor, src);
+      THCTensor_(free)(state, tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.CharTensor")) ) {
+      tensor = THCTensor_(newWithTensor)(state, tensor);
+      THCTensor_(narrow)(state, tensor, NULL, 0, index, 1);
+      THCTensor_(copyChar)(state, tensor, src);
+      THCTensor_(free)(state, tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.ShortTensor")) ) {
+      tensor = THCTensor_(newWithTensor)(state, tensor);
+      THCTensor_(narrow)(state, tensor, NULL, 0, index, 1);
+      THCTensor_(copyShort)(state, tensor, src);
+      THCTensor_(free)(state, tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.IntTensor")) ) {
+      tensor = THCTensor_(newWithTensor)(state, tensor);
+      THCTensor_(narrow)(state, tensor, NULL, 0, index, 1);
+      THCTensor_(copyInt)(state, tensor, src);
+      THCTensor_(free)(state, tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.LongTensor")) ) {
+      tensor = THCTensor_(newWithTensor)(state, tensor);
+      THCTensor_(narrow)(state, tensor, NULL, 0, index, 1);
+      THCTensor_(copyLong)(state, tensor, src);
+      THCTensor_(free)(state, tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.FloatTensor")) ) {
+      tensor = THCTensor_(newWithTensor)(state, tensor);
+      THCTensor_(narrow)(state, tensor, NULL, 0, index, 1);
+      THCTensor_(copyFloat)(state, tensor, src);
+      THCTensor_(free)(state, tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.DoubleTensor")) ) {
+      tensor = THCTensor_(newWithTensor)(state, tensor);
+      THCTensor_(narrow)(state, tensor, NULL, 0, index, 1);
+      THCTensor_(copyDouble)(state, tensor, src);
+      THCTensor_(free)(state, tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.HalfTensor")) ) {
+      tensor = THCTensor_(newWithTensor)(state, tensor);
+      THCTensor_(narrow)(state, tensor, NULL, 0, index, 1);
+      THCTensor_(copyHalf)(state, tensor, src);
+      THCTensor_(free)(state, tensor);
+    } else {
+      luaL_typerror(L, 3, "torch.*Tensor");
+    }
+    lua_pushboolean(L, 1);
+  }
+  else if((idx = luaT_toudata(L, 2, "torch.LongStorage")))
+  {
+    ptrdiff_t index = THCTensor_(storageOffset)(state, tensor);
+
+#ifdef THC_REAL_IS_HALF
+    real value = THC_float2half((float) luaL_checknumber(L,3));
+#else
+    real value = (real)luaL_checknumber(L,3);
+#endif
+
+    ptrdiff_t dim;
+
+    luaL_argcheck(L, idx->size == tensor->nDimension, 2, "invalid size");
+
+    for(dim = 0; dim < idx->size; dim++)
+    {
+      long z = idx->data[dim]-1;
+      if (z < 0) z = tensor->size[dim] + z + 1;
+      luaL_argcheck(L, (z >= 0) && (z < tensor->size[dim]), 2, "index out of bound");
+      index += z*tensor->stride[dim];
+    }
+
+    THCStorage_(set)(state, tensor->storage, index, value);
+    lua_pushboolean(L, 1);
+  }
+  else if(lua_istable(L, 2))
+  {
+    int dim;
+    int cdim = 0;
+    int ndims;
+    int done = 0;
+    ndims = tensor->nDimension;
+    luaL_argcheck(L, lua_objlen(L, 2) <= ndims, 2, "too many indices provided");
+    tensor = THCTensor_(newWithTensor)(state, tensor);
+    for(dim = 0; dim < ndims; dim++)
+    {
+      lua_rawgeti(L, 2, dim+1);
+      if(lua_isnumber(L, -1))
+      {
+        long z = lua_tonumber(L, -1)-1;
+        lua_pop(L, 1);
+        if (z < 0) z = tensor->size[cdim] + z + 1;
+        luaL_argcheck(L, (z >= 0) && (z < tensor->size[cdim]), 2, "index out of bound");
+        if(tensor->nDimension == 1) {
+
+#ifdef THC_REAL_IS_HALF
+          real value = THC_float2half((float) luaL_checknumber(L,3));
+#else
+          real value = (real) luaL_checknumber(L,3);
+#endif
+          done = 1;
+          THCStorage_(set)(state, tensor->storage, tensor->storageOffset+z*tensor->stride[0], value);
+        } else {
+          THCTensor_(select)(state, tensor, NULL, cdim, z);
+        }
+      }
+      else if (lua_istable(L, -1))
+      {
+        long start = 0;
+        long end = tensor->size[cdim]-1;
+        lua_rawgeti(L, -1, 1);
+        if(lua_isnumber(L, -1)) {
+          start = lua_tonumber(L, -1)-1;
+          end = start;
+        }
+        lua_pop(L, 1);
+        if (start < 0) start = tensor->size[cdim] + start + 1;
+        luaL_argcheck(L, (start >= 0) && (start < tensor->size[cdim]), 2, "start index out of bound");
+
+        lua_rawgeti(L, -1, 2);
+        if(lua_isnumber(L, -1)) {
+          end = lua_tonumber(L, -1)-1;
+        }
+        lua_pop(L, 2);
+        if (end < 0) end = tensor->size[cdim] + end + 1;
+        luaL_argcheck(L, (end >= 0) && (end < tensor->size[cdim]), 2, "end index out of bound");
+
+        luaL_argcheck(L, (end >= start), 2, "end index must be greater or equal to start index");
+
+        THCTensor_(narrow)(state, tensor, NULL, cdim++, start, end-start+1);
+      }
+      else
+      {
+        break;
+      }
+    }
+    if(!done) {
+      /* doing a copy */
+      void *src;
+      if (lua_isnumber(L,3)) {
+
+#ifdef THC_REAL_IS_HALF
+        real value = THC_float2half((float) lua_tonumber(L, 3));
+#else
+        real value = (real) lua_tonumber(L, 3);
+#endif
+
+        THCTensor_(fill)(state, tensor, value);
+      } else if( (src = luaT_toudata(L, 3, torch_Tensor)) ) {
+        THCTensor_(copy)(state, tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.ByteTensor")) ) {
+        THCTensor_(copyByte)(state, tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.CharTensor")) ) {
+        THCTensor_(copyChar)(state, tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.ShortTensor")) ) {
+        THCTensor_(copyShort)(state, tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.IntTensor")) ) {
+        THCTensor_(copyInt)(state, tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.LongTensor")) ) {
+        THCTensor_(copyLong)(state, tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.FloatTensor")) ) {
+        THCTensor_(copyFloat)(state, tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.DoubleTensor")) ) {
+        THCTensor_(copyDouble)(state, tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.HalfTensor")) ) {
+        THCTensor_(copyHalf)(state, tensor, src);
+      } else {
+        luaL_typerror(L, 3, "torch.*Tensor");
+      }
+    }
+    THCTensor_(free)(state, tensor);
+    lua_pushboolean(L, 1);
+  }
+  else if((mask = luaT_toudata(L, 2, "torch.ByteTensor")))
+  {
+    THCTensor *vals;
+    if (lua_isnumber(L, 3))
+    {
+#ifdef THC_REAL_IS_HALF
+      real value = THC_float2half((float) luaL_checknumber(L, 3));
+#else
+      real value = (real) luaL_checknumber(L, 3);
+#endif
+
+      THCTensor_(maskedFillByte)(state, tensor, mask, value);
+    }
+    else if((vals = luaT_toudata(L, 3, torch_Tensor)))
+    {
+      THCTensor_(maskedCopyByte)(state, tensor, mask, vals);
+    }
+    else
+    {
+      luaL_error(L,"number or tensor expected");
+    }
+  }
+  else if((maskCuda = luaT_toudata(L, 2, "torch.CudaByteTensor")))
+  {
+    THCTensor *vals;
+    if (lua_isnumber(L, 3))
+    {
+#ifdef THC_REAL_IS_HALF
+      real value = THC_float2half((float) luaL_checknumber(L, 3));
+#else
+      real value = (real) luaL_checknumber(L, 3);
+#endif
+
+      THCTensor_(maskedFill)(state, tensor, maskCuda, value);
+    }
+    else if((vals = luaT_toudata(L, 3, torch_Tensor)))
+    {
+      THCTensor_(maskedCopy)(state, tensor, maskCuda, vals);
+    }
+    else
+    {
+      luaL_error(L,"number or tensor expected");
+    }
+
+  }
+  else if((maskCudaReal = luaT_toudata(L, 2, torch_Tensor)))
+  {
+    maskCuda = THCudaByteTensor_new(state);
+    THLongStorage *maskCudaSize = THCTensor_(newSizeOf)(state, maskCudaReal);
+    THCudaByteTensor_resize(state, maskCuda, maskCudaSize, NULL);
+    THLongStorage_free(maskCudaSize);
+    TH_CONCAT_2(THCudaByteTensor_copyCuda, Real)(state, maskCuda, maskCudaReal);
+
+    THCTensor *vals;
+    if (lua_isnumber(L, 3))
+    {
+#ifdef THC_REAL_IS_HALF
+      real value = THC_float2half((float) luaL_checknumber(L, 3));
+#else
+      real value = (real) luaL_checknumber(L, 3);
+#endif
+
+      THCTensor_(maskedFill)(state, tensor, maskCuda, value);
+    }
+    else if((vals = luaT_toudata(L, 3, torch_Tensor)))
+    {
+      THCTensor_(maskedCopy)(state, tensor, maskCuda, vals);
+    }
+    else
+    {
+      luaL_error(L,"number or tensor expected");
+    }
+
+    THCudaByteTensor_free(state, maskCuda);
+  }
+  else
+  {
+    lua_pushboolean(L, 0);
+  }
+
+  return 1;
+}
+
+static int torch_Tensor_(__index__)(lua_State *L)
+{
+  THCState *state = cutorch_getstate(L);
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THLongStorage *idx = NULL;
+  THByteTensor *mask;
+  THCudaByteTensor *maskCuda;
+  THCTensor *maskCudaReal;
+
+  if(lua_isnumber(L, 2))
+  {
+    ptrdiff_t index = luaL_checkinteger(L,2)-1;
+
+    luaL_argcheck(L, tensor->nDimension > 0, 1, "empty tensor");
+    if (index < 0) index = tensor->size[0] + index + 1;
+    luaL_argcheck(L, index >= 0 && index < tensor->size[0], 2, "out of range");
+
+    if(tensor->nDimension == 1)
+    {
+      real v =
+        THCStorage_(get)(state, tensor->storage,
+                         tensor->storageOffset+index*tensor->stride[0]);
+
+#ifdef THC_REAL_IS_HALF
+      double value = THC_half2float(v);
+#else
+      double value = (double) v;
+#endif
+
+      lua_pushnumber(L, value);
+    }
+    else
+    {
+      tensor = THCTensor_(newWithTensor)(state, tensor);
+      THCTensor_(select)(state, tensor, NULL, 0, index);
+      luaT_pushudata(L, tensor, torch_Tensor);
+    }
+    lua_pushboolean(L, 1);
+    return 2;
+  }
+  else if((idx = luaT_toudata(L, 2, "torch.LongStorage")))
+  {
+    ptrdiff_t index = THCTensor_(storageOffset)(state, tensor);
+    ptrdiff_t dim;
+
+    luaL_argcheck(L, idx->size == tensor->nDimension, 2, "invalid size");
+
+    for(dim = 0; dim < idx->size; dim++)
+    {
+      long z = idx->data[dim]-1;
+      if (z < 0) z = tensor->size[dim] + z + 1;
+      luaL_argcheck(L, (z >= 0) && (z < tensor->size[dim]), 2, "index out of bound");
+      index += z*tensor->stride[dim];
+    }
+
+    real v =
+      THCStorage_(get)(state, THCTensor_(storage)(state, tensor), index);
+
+#ifdef THC_REAL_IS_HALF
+    double value = (double) THC_half2float(v);
+#else
+    double value = (double) v;
+#endif
+
+    lua_pushnumber(L, value);
+    lua_pushboolean(L, 1);
+    return 2;
+  }
+  else if(lua_istable(L, 2))
+  {
+    int dim;
+    int cdim = 0;
+    int ndims;
+    int done = 0;
+
+    ndims = tensor->nDimension;
+    luaL_argcheck(L, lua_objlen(L, 2) <= ndims, 2, "too many indices provided");
+    tensor = THCTensor_(newWithTensor)(state, tensor);
+
+    for(dim = 0; dim < ndims; dim++)
+    {
+      lua_rawgeti(L, 2, dim+1);
+      if(lua_isnumber(L, -1))
+      {
+        long z = lua_tonumber(L, -1)-1;
+        lua_pop(L, 1);
+        if (z < 0) z = tensor->size[cdim] + z + 1;
+        luaL_argcheck(L, (z >= 0) && (z < tensor->size[cdim]), 2, "index out of bound");
+        if(tensor->nDimension == 1) {
+          done = 1;
+
+          real v =
+            THCStorage_(get)(state, tensor->storage,
+                             tensor->storageOffset+z*tensor->stride[0]);
+#ifdef THC_REAL_IS_HALF
+          double value = (double) THC_half2float(v);
+#else
+          double value = (double) v;
+#endif
+
+          lua_pushnumber(L, value);
+        } else {
+          THCTensor_(select)(state, tensor, NULL, cdim, z);
+        }
+      }
+      else if (lua_istable(L, -1))
+      {
+        long start = 0;
+        long end = tensor->size[cdim]-1;
+        lua_rawgeti(L, -1, 1);
+        if(lua_isnumber(L, -1)) {
+          start = lua_tonumber(L, -1)-1;
+          end = start;
+        }
+        lua_pop(L, 1);
+        if (start < 0) start = tensor->size[cdim] + start + 1;
+        luaL_argcheck(L, (start >= 0) && (start < tensor->size[cdim]), 2, "start index out of bound");
+
+        lua_rawgeti(L, -1, 2);
+        if(lua_isnumber(L, -1)) {
+          end = lua_tonumber(L, -1)-1;
+        }
+        lua_pop(L, 2);
+        if (end < 0) end = tensor->size[cdim] + end + 1;
+        luaL_argcheck(L, (end >= 0) && (end < tensor->size[cdim]), 2, "end index out of bound");
+
+        luaL_argcheck(L, (end >= start), 2, "end index must be greater or equal to start index");
+
+        THCTensor_(narrow)(state, tensor, NULL, cdim++, start, end-start+1);
+      }
+      else
+      {
+        break;
+      }
+    }
+    if(!done) {
+      luaT_pushudata(L, tensor, torch_Tensor);
+    } else {
+      THCTensor_(free)(state, tensor);
+    }
+    lua_pushboolean(L, 1);
+    return 2;
+  }
+  else if((mask = luaT_toudata(L, 2, "torch.ByteTensor")))
+  {
+    THCTensor *vals = THCTensor_(new)(state);
+    THCTensor_(maskedSelectByte)(state, vals, tensor, mask);
+    luaT_pushudata(L, vals, torch_Tensor);
+    lua_pushboolean(L, 1);
+    return 2;
+  }
+  else if((maskCuda = luaT_toudata(L, 2, "torch.CudaByteTensor")))
+  {
+    THCTensor *vals = THCTensor_(new)(state);
+    THCTensor_(maskedSelect)(state, vals, tensor, maskCuda);
+    luaT_pushudata(L, vals, torch_Tensor);
+    lua_pushboolean(L, 1);
+
+    return 2;
+  }
+  else if((maskCudaReal = luaT_toudata(L, 2, torch_Tensor)))
+  {
+    maskCuda = THCudaByteTensor_new(state);
+    THLongStorage *maskCudaSize = THCTensor_(newSizeOf)(state, maskCudaReal);
+    THCudaByteTensor_resize(state, maskCuda, maskCudaSize, NULL);
+    THLongStorage_free(maskCudaSize);
+    TH_CONCAT_2(THCudaByteTensor_copyCuda, Real)(state, maskCuda, maskCudaReal);
+
+    THCTensor *vals = THCTensor_(new)(state);
+    THCTensor_(maskedSelect)(state, vals, tensor, maskCuda);
+    luaT_pushudata(L, vals, torch_Tensor);
+    lua_pushboolean(L, 1);
+
+    THCudaByteTensor_free(state, maskCuda);
+
+    return 2;
+  }
+  else
+  {
+    lua_pushboolean(L, 0);
+    return 1;
+  }
+}
+
+static int torch_Tensor_(retain)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THCTensor_(retain)(cutorch_getstate(L), tensor);
+  return 0;
+}
+
+static int torch_Tensor_(free)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THCTensor_(free)(cutorch_getstate(L), tensor);
+  return 0;
+}
+
+/* helpful functions */
+static void torch_Tensor_(c_readSizeStride)(lua_State *L, int index, int allowStride, THLongStorage **size_, THLongStorage **stride_)
+{
+  THLongStorage *size = NULL;
+  THLongStorage *stride = NULL;
+
+  if( (size = luaT_toudata(L, index, "torch.LongStorage")) )
+  {
+    if(!lua_isnoneornil(L, index+1))
+    {
+      if( (stride = luaT_toudata(L, index+1, "torch.LongStorage")) )
+        luaL_argcheck(L, stride->size == size->size, index+1, "provided stride and size are inconsistent");
+      else
+        luaL_argcheck(L, 0, index+1, "torch.LongStorage expected");
+    }
+    THLongStorage_retain(size);
+    if(stride)
+      THLongStorage_retain(stride);
+  }
+  else
+  {
+    int i;
+
+    size = THLongStorage_newWithSize(8);
+    stride = THLongStorage_newWithSize(8);
+    THLongStorage_fill(size, -1);
+    THLongStorage_fill(stride, -1);
+
+    if(allowStride)
+    {
+      for(i = 0; i < 8; i++)
+      {
+        if(lua_isnone(L, index+2*i))
+          break;
+        size->data[i] = luaL_checklong(L, index+2*i);
+
+        if(lua_isnone(L, index+2*i+1))
+          break;
+        stride->data[i] = luaL_checklong(L, index+2*i+1);
+      }
+    }
+    else
+    {
+      for(i = 0; i < 8; i++)
+      {
+        if(lua_isnone(L, index+i))
+          break;
+        size->data[i] = luaL_checklong(L, index+i);
+      }
+    }
+  }
+
+  *size_ = size;
+  *stride_ = stride;
+}
+
+static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index, int allowNone, int allowTensor, int allowStorage, int allowStride,
+                                                         THCStorage **storage_, ptrdiff_t *storageOffset_, THLongStorage **size_, THLongStorage **stride_)
+{
+  THCState *state = cutorch_getstate(L);
+  THCTensor *src = NULL;
+  THCStorage *storage = NULL;
+
+  int arg1Type = lua_type(L, index);
+
+  if( allowNone && (arg1Type == LUA_TNONE) )
+  {
+    *storage_ = NULL;
+    *storageOffset_ = 0;
+    *size_ = NULL;
+    *stride_ = NULL;
+    return;
+  }
+  else if( allowTensor && (arg1Type == LUA_TUSERDATA) && (src = luaT_toudata(L, index, torch_Tensor)) )
+  {
+    *storage_ = src->storage;
+    *storageOffset_ = src->storageOffset;
+    *size_ = THCTensor_(newSizeOf)(state, src);
+    *stride_ = THCTensor_(newStrideOf)(state, src);
+    return;
+  }
+  else if( allowStorage && (arg1Type == LUA_TUSERDATA) && (storage = luaT_toudata(L, index, torch_Storage)) )
+  {
+    *storage_ = storage;
+    if(lua_isnone(L, index+1))
+    {
+      *storageOffset_ = 0;
+      *size_ = THLongStorage_newWithSize1(storage->size);
+      *stride_ = THLongStorage_newWithSize1(1);
+    }
+    else
+    {
+      *storageOffset_ = luaL_checkinteger(L, index+1)-1;
+      torch_Tensor_(c_readSizeStride)(L, index+2, allowStride, size_, stride_);
+    }
+    return;
+  }
+  else if( (arg1Type == LUA_TNUMBER) || (luaT_toudata(L, index, "torch.LongStorage")) )
+  {
+    *storage_ = NULL;
+    *storageOffset_ = 0;
+    torch_Tensor_(c_readSizeStride)(L, index, 0, size_, stride_);
+
+    return;
+  }
+
+  *storage_ = NULL;
+  *storageOffset_ = 0;
+
+  if(allowTensor && allowStorage)
+      luaL_argcheck(L, 0, index, "expecting number or Tensor or Storage");
+  else if(allowTensor)
+      luaL_argcheck(L, 0, index, "expecting number or Tensor");
+  else if(allowStorage)
+      luaL_argcheck(L, 0, index, "expecting number or Storage");
+  else
+      luaL_argcheck(L, 0, index, "expecting number");
+}
+
+static int torch_Tensor_(factory)(lua_State *L)
+{
+  THCTensor *tensor = THCTensor_(new)(cutorch_getstate(L));
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+static int torch_Tensor_(write)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THFile *file = luaT_checkudata(L, 2, "torch.File");
+
+  THFile_writeIntScalar(file, tensor->nDimension);
+  THFile_writeLongRaw(file, tensor->size, tensor->nDimension);
+  THFile_writeLongRaw(file, tensor->stride, tensor->nDimension);
+#ifdef _MSC_VER
+  THAssert(tensor->storageOffset+1 < LONG_MAX);
+#endif
+  THFile_writeLongScalar(file, tensor->storageOffset+1); /* to respect Lua convention */
+
+  lua_getfield(L, 2, "writeObject"); /* the method */
+  lua_pushvalue(L, 2); /* the file */
+  /* the storage */
+  if(tensor->storage)
+  {
+    THCStorage_(retain)(cutorch_getstate(L), tensor->storage);
+    luaT_pushudata(L, tensor->storage, torch_Storage);
+  }
+  else
+    lua_pushnil(L);
+
+  lua_call(L, 2, 0); /* call the method */
+
+  return 0;
+}
+
+static int torch_Tensor_(read)(lua_State *L)
+{
+  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THFile *file = luaT_checkudata(L, 2, "torch.File");
+
+  tensor->nDimension = THFile_readIntScalar(file);
+  tensor->size = THAlloc(sizeof(long)*tensor->nDimension);
+  tensor->stride = THAlloc(sizeof(long)*tensor->nDimension);
+  THFile_readLongRaw(file, tensor->size, tensor->nDimension);
+  THFile_readLongRaw(file, tensor->stride, tensor->nDimension);
+  tensor->storageOffset = THFile_readLongScalar(file);
+  tensor->storageOffset--;  /* to respect Lua convention */
+
+  lua_getfield(L, 2, "readObject"); /* the method */
+  lua_pushvalue(L, 2); /* the file */
+  lua_call(L, 1, 1); /* call the method */
+
+  tensor->storage = luaT_toudata(L, -1, torch_Storage);
+  if(tensor->storage)
+    THCStorage_(retain)(cutorch_getstate(L), tensor->storage);
+
+  return 0;
+}
+
+static const struct luaL_Reg torch_Tensor_(_) [] = {
+  {"retain", torch_Tensor_(retain)},
+  {"free", torch_Tensor_(free)},
+  {"contiguous", torch_Tensor_(contiguous)},
+  {"size", torch_Tensor_(size)},
+  {"elementSize", torch_Tensor_(elementSize)},
+  {"__len__", torch_Tensor_(size)},
+  {"stride", torch_Tensor_(stride)},
+  {"dim", torch_Tensor_(nDimension)},
+  {"nDimension", torch_Tensor_(nDimension)},
+  {"set", torch_Tensor_(set)},
+  {"storage", torch_Tensor_(storage)},
+  {"storageOffset", torch_Tensor_(storageOffset)},
+  {"clone", torch_Tensor_(clone)},
+  {"contiguous", torch_Tensor_(contiguous)},
+  {"resizeAs", torch_Tensor_(resizeAs)},
+  {"resize", torch_Tensor_(resize)},
+  {"narrow", torch_Tensor_(narrow)},
+  {"sub", torch_Tensor_(sub)},
+  {"select", torch_Tensor_(select)},
+  {"index", torch_Tensor_(indexSelect)},
+  {"indexCopy", torch_Tensor_(indexCopy)},
+  {"indexAdd", torch_Tensor_(indexAdd)},
+  {"indexFill", torch_Tensor_(indexFill)},
+  {"transpose", torch_Tensor_(transpose)},
+  {"t", torch_Tensor_(t)},
+  {"unfold", torch_Tensor_(unfold)},
+  {"isContiguous", torch_Tensor_(isContiguous)},
+  {"isSize", torch_Tensor_(isSize)},
+  {"isSetTo", torch_Tensor_(isSetTo)},
+  {"isSameSizeAs", torch_Tensor_(isSameSizeAs)},
+  {"nElement", torch_Tensor_(nElement)},
+  {"copy", torch_Tensor_(copy)},
+  {"read", torch_Tensor_(read)},
+  {"write", torch_Tensor_(write)},
+  {"__index__", torch_Tensor_(__index__)},
+  {"__newindex__", torch_Tensor_(__newindex__)},
+  {NULL, NULL}
+};
+
+void torch_Tensor_(init)(lua_State *L)
+{
+  luaT_newmetatable(L, torch_Tensor, NULL,
+                    torch_Tensor_(new), torch_Tensor_(free), torch_Tensor_(factory));
+  luaL_setfuncs(L, torch_Tensor_(_), 0);
+  lua_pop(L, 1);
+}
+
+#endif
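
The __index__ / __newindex__ metamethods above back Torch's Lua-side tensor
indexing. As a rough, illustrative sketch only (assuming a built cutorch is on
the package path; the exact mask type returned by :gt() varies across cutorch
versions, though both byte and real-valued CUDA masks are handled by the
branches above):

    require 'cutorch'

    local t = torch.CudaTensor(4, 5):fill(0)

    -- number: selects along the first dimension (reads a scalar for 1-D tensors)
    local row = t[2]                          -- __index__, number branch

    -- LongStorage: a full coordinate, reads/writes a single element
    t[torch.LongStorage{1, 3}] = 7            -- __newindex__, LongStorage branch

    -- table: numbers select, {start, end} tables narrow; a number on the RHS fills
    t[{ {1, 2}, {2, 4} }] = 1                 -- __newindex__, table branch

    -- mask: ByteTensor / CudaByteTensor (or real-valued CUDA) masks
    local mask = t:gt(0)
    local picked = t[mask]                    -- __index__, maskedSelect branch
    t[mask] = -1                              -- __newindex__, maskedFill branch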
diff --git a/torch/utils.c b/torch/utils.c
new file mode 100644
index 0000000..b9bfe1f
--- /dev/null
+++ b/torch/utils.c
@@ -0,0 +1,60 @@
+#include "utils.h"
+
+THLongStorage* cutorch_checklongargs(lua_State *L, int index)
+{
+  THLongStorage *storage;
+  int i;
+  int narg = lua_gettop(L)-index+1;
+
+  if(narg == 1 && luaT_toudata(L, index, "torch.LongStorage"))
+  {
+    THLongStorage *storagesrc = luaT_toudata(L, index, "torch.LongStorage");
+    storage = THLongStorage_newWithSize(storagesrc->size);
+    THLongStorage_copy(storage, storagesrc);
+  }
+  else
+  {
+    storage = THLongStorage_newWithSize(narg);
+    for(i = index; i < index+narg; i++)
+    {
+      if(!lua_isnumber(L, i))
+      {
+        THLongStorage_free(storage);
+        luaL_argerror(L, i, "number expected");
+      }
+      THLongStorage_set(storage, i-index, lua_tonumber(L, i));
+    }
+  }
+  return storage;
+}
+
+int cutorch_islongargs(lua_State *L, int index)
+{
+  int narg = lua_gettop(L)-index+1;
+
+  if(narg == 1 && luaT_toudata(L, index, "torch.LongStorage"))
+  {
+    return 1;
+  }
+  else
+  {
+    int i;
+
+    for(i = index; i < index+narg; i++)
+    {
+      if(!lua_isnumber(L, i))
+        return 0;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+struct THCState* cutorch_getstate(lua_State* L)
+{
+  lua_getglobal(L, "cutorch");
+  lua_getfield(L, -1, "_state");
+  struct THCState *state = lua_touserdata(L, -1);
+  lua_pop(L, 2);
+  return state;
+}
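
These helpers mirror the calling conventions used throughout the bindings:
cutorch_checklongargs / cutorch_islongargs accept either a list of numbers or a
single torch.LongStorage, and cutorch_getstate reads the THCState pointer that
the cutorch module publishes as cutorch._state (the field fetched above). A
minimal, illustrative Lua sketch of the Lua-visible side of this (assuming
cutorch is loaded):

    require 'cutorch'

    -- Either form below reaches the same C constructor; cutorch_checklongargs
    -- normalizes both into a single THLongStorage of sizes.
    local a = torch.CudaTensor(3, 4)                     -- a list of numbers
    local b = torch.CudaTensor(torch.LongStorage{3, 4})  -- a LongStorage

    -- cutorch_getstate() reads the THCState* published to Lua by the module:
    print(type(cutorch._state))   --> "userdata"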
diff --git a/torch/utils.h b/torch/utils.h
new file mode 100644
index 0000000..ae959b7
--- /dev/null
+++ b/torch/utils.h
@@ -0,0 +1,64 @@
+#ifndef CUTORCH_UTILS_INC
+#define CUTORCH_UTILS_INC
+
+#include "luaT.h"
+#include "TH.h"
+
+#ifdef __cplusplus
+# define TORCH_EXTERNC extern "C"
+#else
+# define TORCH_EXTERNC extern
+#endif
+
+#ifdef __GNUC__
+# define TORCH_UNUSED __attribute__((unused))
+#else
+# define TORCH_UNUSED
+#endif
+
+#ifdef _WIN32
+# ifdef cutorch_EXPORTS
+#  define TORCH_API TORCH_EXTERNC __declspec(dllexport)
+# else
+#  define TORCH_API TORCH_EXTERNC __declspec(dllimport)
+# endif
+#else
+# define TORCH_API TORCH_EXTERNC
+#endif
+
+#if LUA_VERSION_NUM == 501
+/*
+** Adapted from Lua 5.2.0
+*/
+TORCH_UNUSED static void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) {
+  luaL_checkstack(L, nup+1, "too many upvalues");
+  for (; l->name != NULL; l++) {  /* fill the table with given functions */
+    int i;
+    lua_pushstring(L, l->name);
+    for (i = 0; i < nup; i++)  /* copy upvalues to the top */
+      lua_pushvalue(L, -(nup+1));
+    lua_pushcclosure(L, l->func, nup);  /* closure with those upvalues */
+    lua_settable(L, -(nup + 3));
+  }
+  lua_pop(L, nup);  /* remove upvalues */
+}
+#endif
+
+#if LUA_VERSION_NUM >= 503
+/* One could simply enable LUA_COMPAT_5_2 to remain backward compatible.
+However, that does not work when building against a system-installed Lua,
+hence these redefines.
+*/
+#define luaL_optlong(L,n,d)     ((long)luaL_optinteger(L, (n), (d)))
+#define luaL_optint(L,n,d)  ((int)luaL_optinteger(L, (n), (d)))
+#define luaL_checklong(L,n)     ((long)luaL_checkinteger(L, (n)))
+#define luaL_checkint(L,n)      ((int)luaL_checkinteger(L, (n)))
+#endif
+
+TORCH_API THLongStorage* cutorch_checklongargs(lua_State *L, int index);
+TORCH_API int cutorch_islongargs(lua_State *L, int index);
+
+struct THCState;
+TORCH_API struct THCState* cutorch_getstate(lua_State* L);
+
+#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-cutorch.git


